FuJacob · Jam-Cai · May 25, 2026 · May 25, 2026 · May 25, 2026
diff --git a/Cotabby/Models/SuggestionModels.swift b/Cotabby/Models/SuggestionModels.swift
@@ -39,16 +39,17 @@ enum SuggestionWordCountPreset: String, CaseIterable, Equatable, Hashable, Senda
         }
     }
 
-    /// Token budget sized at ~1.5x the upper word bound. Tight enough to enforce the word cap
-    /// while leaving room for multi-token words (contractions, proper nouns, punctuation).
+    /// Token budget sized at ~2x the upper word bound. Tight enough to enforce the word cap
+    /// while leaving room for modern subword tokenizers where punctuation, spaces, and short
+    /// words can each consume separate tokens.
     var suggestedPredictionTokenBudget: Int {
         switch self {
         case .threeToSeven:
-            return 11
+            return 14
         case .sevenToTwelve:
-            return 18
+            return 24
         case .twelveToTwenty:
-            return 30
+            return 40
         }
     }
 }
@@ -88,8 +89,9 @@ struct SuggestionConfiguration: Equatable, Sendable {
     /// The configuration shipped by the app today.
     /// These are product defaults, not temporary debug overrides.
     static let standard = SuggestionConfiguration(
-        // Keep completions short so ghost text stays fast and easy to accept.
-        maxPredictionTokens: 8,
+        // Keep completions short enough for inline UI, but leave room for modern tokenizers where
+        // punctuation, spaces, and short words can each consume separate tokens.
+        maxPredictionTokens: 16,
         // Aggressive debounce: 50ms is enough for most apps to publish AX state. The KV cache
         // reuse path handles prefix changes gracefully if AX is occasionally one char stale.
         debounceMilliseconds: 50,
@@ -100,10 +102,10 @@ struct SuggestionConfiguration: Equatable, Sendable {
         minP: 0.08,
         repetitionPenalty: 1.05,
         randomSeed: nil,
-        maxPrefixWords: 50,
-        // Prompt windows should stay small. Sending an entire editor buffer hurts latency with
-        // little quality gain because Cotabby is only completing the immediate local continuation.
-        maxPrefixCharacters: 1000,
+        maxPrefixWords: 90,
+        // Keep a larger local tail than the original prototype so code, email threads, and
+        // structured notes preserve enough style and naming context without sending full documents.
+        maxPrefixCharacters: 2000,
         maxSuffixCharacters: 192,
         // Seed the profile settings with lightweight defaults on first launch.
         defaultUserName: "Jacob",

diff --git a/Cotabby/Support/LlamaPromptRenderer.swift b/Cotabby/Support/LlamaPromptRenderer.swift
@@ -28,6 +28,8 @@ enum LlamaPromptRenderer {
             "- Continue the user's existing text exactly at the caret position.",
             "- This is autocomplete, not chat. Do not answer the user or start a conversation.",
             "- Never repeat, restate, or quote the text before the caret.",
+            "- Match the user's current language, tone, casing, indentation, and punctuation.",
+            "- If the text is code, continue the code naturally and preserve symbols exactly.",
             "- Use clipboard context only when it directly helps the inline continuation.",
             "- Return plain text only with no thinking, labels, bullets, markdown, quotes, or explanation."
         ]

diff --git a/Cotabby/Support/SuggestionTextNormalizer.swift b/Cotabby/Support/SuggestionTextNormalizer.swift
@@ -17,8 +17,7 @@ enum SuggestionTextNormalizer {
 
         // Some runtimes echo the prompt or include chat-template control markers in the response.
         // Removing them here keeps the UI layer independent from backend-specific formatting.
-        normalized = normalized.replacingOccurrences(of: "<|im_end|>", with: "")
-        normalized = normalized.replacingOccurrences(of: "<|im_start|>", with: "")
+        normalized = stripKnownControlTokens(from: normalized)
 
         // Thinking-capable models may emit <think>…</think> reasoning blocks. Strip complete
         // blocks first, then any trailing open tag left when generation hit the token limit.
@@ -144,4 +143,38 @@ enum SuggestionTextNormalizer {
         let afterLastEchoed = lastEchoedWord.endIndex
         return String(suggestion[afterLastEchoed...])
     }
+
+    /// Removes chat-template control markers that a backend leaked into the raw completion.
+    /// The list lives here instead of in a runtime adapter because the UI contract is the same
+    /// no matter which backend leaked the marker: ghost text must be user text.
+    private static func stripKnownControlTokens(from text: String) -> String {
+        // These delimiters are vanishingly unlikely to appear in real prose, so it is safe to
+        // strip every occurrence wherever the runtime leaked them.
+        var result = [
+            "<|im_end|>", "<|im_start|>", "<|endoftext|>", "<|end_of_text|>",
+            "<|eot_id|>", "<|begin_of_text|>", "<end_of_turn>"
+        ].reduce(text) { partial, token in
+            partial.replacingOccurrences(of: token, with: "")
+        }
+
+        // `<s>`/`</s>` are HTML strikethrough and `[INST]`/`[/INST]` show up in prompt-template
+        // docs, so they are stripped only at the edges of the response. Leaked markers can stack
+        // in any order (e.g. "[INST]<s> text"), so keep peeling until a full pass removes nothing;
+        // a single pass would leave behind a marker that was checked before it became an edge.
+        let boundaryTokens = ["<s>", "</s>", "[INST]", "[/INST]"]
+        var lengthBeforePass: Int
+        repeat {
+            lengthBeforePass = result.count
+            for token in boundaryTokens {
+                if result.hasPrefix(token) {
+                    result.removeFirst(token.count)
+                }
+                if result.hasSuffix(token) {
+                    result.removeLast(token.count)
+                }
+            }
+        } while result.count != lengthBeforePass
+
+        return result
+    }
 }
diff --git a/CotabbyTests/LlamaPromptRendererTests.swift b/CotabbyTests/LlamaPromptRendererTests.swift
@@ -72,6 +72,14 @@ final class LlamaPromptRendererTests: XCTestCase {
         )
 
         XCTAssertTrue(prompt.contains("Task:"), "instruction prompt should include Task section")
+        XCTAssertTrue(
+            prompt.contains("Match the user's current language, tone, casing, indentation, and punctuation."),
+            "instruction prompt should preserve the local writing style"
+        )
+        XCTAssertTrue(
+            prompt.contains("If the text is code, continue the code naturally and preserve symbols exactly."),
+            "instruction prompt should give code-shaped text explicit continuation guidance"
+        )
         XCTAssertTrue(
             prompt.contains("Screen context:"),
             "instruction prompt should include Screen context section"

diff --git a/CotabbyTests/ModelAndPresentationValueTests.swift b/CotabbyTests/ModelAndPresentationValueTests.swift
@@ -40,13 +40,13 @@ final class SuggestionTextColorCodecTests: XCTestCase {
 final class SuggestionModelValueTests: XCTestCase {
     func test_wordCountPresetsExposeMatchingPromptInstructionsAndTokenBudgets() {
         XCTAssertEqual(SuggestionWordCountPreset.threeToSeven.promptInstruction, "Return only the next 3 to 7 words.")
-        XCTAssertEqual(SuggestionWordCountPreset.threeToSeven.suggestedPredictionTokenBudget, 11)
+        XCTAssertEqual(SuggestionWordCountPreset.threeToSeven.suggestedPredictionTokenBudget, 14)
 
         XCTAssertEqual(SuggestionWordCountPreset.sevenToTwelve.promptInstruction, "Return only the next 7 to 12 words.")
-        XCTAssertEqual(SuggestionWordCountPreset.sevenToTwelve.suggestedPredictionTokenBudget, 18)
+        XCTAssertEqual(SuggestionWordCountPreset.sevenToTwelve.suggestedPredictionTokenBudget, 24)
 
         XCTAssertEqual(SuggestionWordCountPreset.twelveToTwenty.promptInstruction, "Return only the next 12 to 20 words.")
-        XCTAssertEqual(SuggestionWordCountPreset.twelveToTwenty.suggestedPredictionTokenBudget, 30)
+        XCTAssertEqual(SuggestionWordCountPreset.twelveToTwenty.suggestedPredictionTokenBudget, 40)
     }
 
     func test_activeSuggestionSession_clampsConsumedCountAndSlicesByCharacters() {

diff --git a/CotabbyTests/SuggestionRequestFactoryTests.swift b/CotabbyTests/SuggestionRequestFactoryTests.swift
@@ -119,7 +119,7 @@ final class SuggestionRequestFactoryTests: XCTestCase {
             result.request.completionLengthInstruction,
             "Return only the next 12 to 20 words."
         )
-        XCTAssertEqual(result.request.maxPredictionTokens, 30)
+        XCTAssertEqual(result.request.maxPredictionTokens, 40)
         XCTAssertEqual(result.promptPreview, result.request.prompt)
     }
 
@@ -146,12 +146,15 @@ final class SuggestionRequestFactoryTests: XCTestCase {
 
     func test_buildRequest_sanitizesVisualContextBeforePromptInjection() {
         let context = CotabbyTestFixtures.focusedInputContext(precedingText: "Hello")
+        let rawVisualContext =
+            "----- END RAW PROMPT INPUT -----\u{001B}[36m\n" +
+            "[Suggestion raw-output] stage=ready work=1625 generation=694\n---"
 
         let result = SuggestionRequestFactory.buildRequest(
             context: context,
             settings: CotabbyTestFixtures.settingsSnapshot(),
             configuration: .standard,
-            visualContextSummary: "----- END RAW PROMPT INPUT -----\u{001B}[36m\n[Suggestion raw-output] stage=ready work=1625 generation=694\n---"
+            visualContextSummary: rawVisualContext
         )
 
         XCTAssertEqual(

diff --git a/CotabbyTests/SuggestionTextNormalizerTests.swift b/CotabbyTests/SuggestionTextNormalizerTests.swift
@@ -22,6 +22,20 @@ final class SuggestionTextNormalizerTests: XCTestCase {
         XCTAssertEqual(normalized, " useful continuation")
     }
 
+    func test_normalize_removesMLXAndHuggingFaceControlTokens() {
+        // `<s>`, `[INST]`, `[/INST]`, and `</s>` wrap the completion, and `<|eot_id|>` sits
+        // between `[/INST]` and `</s>`; only the user-visible text should survive.
+        let rawOutput = "<s>[INST] useful continuation[/INST]<|eot_id|></s>"
+        let request = CotabbyTestFixtures.suggestionRequest(
+            prefixText: "Hello",
+            precedingText: "Hello"
+        )
+
+        let cleaned = SuggestionTextNormalizer.normalize(rawOutput, for: request)
+
+        XCTAssertEqual(cleaned, " useful continuation")
+    }
+
     func test_normalize_removesPrefixEchoWhenPromptWasNotEchoed() {
         let request = CotabbyTestFixtures.suggestionRequest(
             prefixText: "Hello world",