From f5c72889049cf42b2fa2e476098165c6bd769f85 Mon Sep 17 00:00:00 2001
From: jam-cai <jamescaicjm@gmail.com>
Date: Sun, 24 May 2026 21:01:42 -0400
Subject: [PATCH 1/2] Tune local model prompt and output cleanup

---
 Cotabby/Models/SuggestionModels.swift         | 19 ++++++++-------
 Cotabby/Support/LlamaPromptRenderer.swift     |  2 ++
 .../Support/SuggestionTextNormalizer.swift    | 24 +++++++++++++++++--
 CotabbyTests/LlamaPromptRendererTests.swift   |  8 +++++++
 .../ModelAndPresentationValueTests.swift      |  6 ++---
 .../SuggestionRequestFactoryTests.swift       |  7 ++++--
 .../SuggestionTextNormalizerTests.swift       | 14 +++++++++++
 7 files changed, 64 insertions(+), 16 deletions(-)

diff --git a/Cotabby/Models/SuggestionModels.swift b/Cotabby/Models/SuggestionModels.swift
index 1afe7fbb..67d5942f 100644
--- a/Cotabby/Models/SuggestionModels.swift
+++ b/Cotabby/Models/SuggestionModels.swift
@@ -44,11 +44,11 @@ enum SuggestionWordCountPreset: String, CaseIterable, Equatable, Hashable, Senda
     var suggestedPredictionTokenBudget: Int {
         switch self {
         case .threeToSeven:
-            return 11
+            return 14
         case .sevenToTwelve:
-            return 18
+            return 24
         case .twelveToTwenty:
-            return 30
+            return 40
         }
     }
 }
@@ -88,8 +88,9 @@ struct SuggestionConfiguration: Equatable, Sendable {
     /// The configuration shipped by the app today.
     /// These are product defaults, not temporary debug overrides.
     static let standard = SuggestionConfiguration(
-        // Keep completions short so ghost text stays fast and easy to accept.
-        maxPredictionTokens: 8,
+        // Keep completions short enough for inline UI, but leave room for modern tokenizers where
+        // punctuation, spaces, and short words can each consume separate tokens.
+        maxPredictionTokens: 16,
         // Aggressive debounce: 50ms is enough for most apps to publish AX state. The KV cache
         // reuse path handles prefix changes gracefully if AX is occasionally one char stale.
         debounceMilliseconds: 50,
@@ -100,10 +101,10 @@ struct SuggestionConfiguration: Equatable, Sendable {
         minP: 0.08,
         repetitionPenalty: 1.05,
         randomSeed: nil,
-        maxPrefixWords: 50,
-        // Prompt windows should stay small. Sending an entire editor buffer hurts latency with
-        // little quality gain because Cotabby is only completing the immediate local continuation.
-        maxPrefixCharacters: 1000,
+        maxPrefixWords: 90,
+        // Keep a larger local tail than the original prototype so code, email threads, and
+        // structured notes preserve enough style and naming context without sending full documents.
+        maxPrefixCharacters: 2000,
         maxSuffixCharacters: 192,
         // Seed the profile settings with lightweight defaults on first launch.
         defaultUserName: "Jacob",
diff --git a/Cotabby/Support/LlamaPromptRenderer.swift b/Cotabby/Support/LlamaPromptRenderer.swift
index 7efcbf81..0d5f4c8b 100644
--- a/Cotabby/Support/LlamaPromptRenderer.swift
+++ b/Cotabby/Support/LlamaPromptRenderer.swift
@@ -26,6 +26,8 @@ enum LlamaPromptRenderer {
             "- Continue the user's existing text exactly at the caret position.",
             "- This is autocomplete, not chat. Do not answer the user or start a conversation.",
             "- Never repeat, restate, or quote the text before the caret.",
+            "- Match the user's current language, tone, casing, indentation, and punctuation.",
+            "- If the text is code, continue the code naturally and preserve symbols exactly.",
             "- Use clipboard context only when it directly helps the inline continuation.",
             "- Return plain text only with no thinking, labels, bullets, markdown, quotes, or explanation."
         ]
diff --git a/Cotabby/Support/SuggestionTextNormalizer.swift b/Cotabby/Support/SuggestionTextNormalizer.swift
index 70a00a6f..88262d3a 100644
--- a/Cotabby/Support/SuggestionTextNormalizer.swift
+++ b/Cotabby/Support/SuggestionTextNormalizer.swift
@@ -17,8 +17,7 @@ enum SuggestionTextNormalizer {
 
         // Some runtimes echo the prompt or include chat-template control markers in the response.
         // Removing them here keeps the UI layer independent from backend-specific formatting.
-        normalized = normalized.replacingOccurrences(of: "<|im_end|>", with: "")
-        normalized = normalized.replacingOccurrences(of: "<|im_start|>", with: "")
+        normalized = stripKnownControlTokens(from: normalized)
 
         // Thinking-capable models may emit <think>…</think> reasoning blocks. Strip complete
         // blocks first, then any trailing open tag left when generation hit the token limit.
@@ -135,4 +134,25 @@ enum SuggestionTextNormalizer {
         let afterLastEchoed = lastEchoedWord.endIndex
         return String(suggestion[afterLastEchoed...])
     }
+
+    /// Local models from llama.cpp, MLX repos, and Foundation Models can expose different template
+    /// residue. This list intentionally stays here instead of in a runtime adapter because the UI
+    /// contract is the same no matter which backend leaked the marker: ghost text must be user text.
+    private static func stripKnownControlTokens(from text: String) -> String {
+        [
+            "<|im_end|>",
+            "<|im_start|>",
+            "<|endoftext|>",
+            "<|end_of_text|>",
+            "<|eot_id|>",
+            "<|begin_of_text|>",
+            "<end_of_turn>",
+            "<s>",
+            "</s>",
+            "[INST]",
+            "[/INST]"
+        ].reduce(text) { partial, token in
+            partial.replacingOccurrences(of: token, with: "")
+        }
+    }
 }
diff --git a/CotabbyTests/LlamaPromptRendererTests.swift b/CotabbyTests/LlamaPromptRendererTests.swift
index 28b8bed9..ca3b3337 100644
--- a/CotabbyTests/LlamaPromptRendererTests.swift
+++ b/CotabbyTests/LlamaPromptRendererTests.swift
@@ -72,6 +72,14 @@ final class LlamaPromptRendererTests: XCTestCase {
         )
 
         XCTAssertTrue(prompt.contains("Task:"), "instruction prompt should include Task section")
+        XCTAssertTrue(
+            prompt.contains("Match the user's current language, tone, casing, indentation, and punctuation."),
+            "instruction prompt should preserve the local writing style"
+        )
+        XCTAssertTrue(
+            prompt.contains("If the text is code, continue the code naturally and preserve symbols exactly."),
+            "instruction prompt should give code-shaped text explicit continuation guidance"
+        )
         XCTAssertTrue(
             prompt.contains("Screen context:"),
             "instruction prompt should include Screen context section"
diff --git a/CotabbyTests/ModelAndPresentationValueTests.swift b/CotabbyTests/ModelAndPresentationValueTests.swift
index 73625341..0ad2e88a 100644
--- a/CotabbyTests/ModelAndPresentationValueTests.swift
+++ b/CotabbyTests/ModelAndPresentationValueTests.swift
@@ -40,13 +40,13 @@ final class SuggestionTextColorCodecTests: XCTestCase {
 final class SuggestionModelValueTests: XCTestCase {
     func test_wordCountPresetsExposeMatchingPromptInstructionsAndTokenBudgets() {
         XCTAssertEqual(SuggestionWordCountPreset.threeToSeven.promptInstruction, "Return only the next 3 to 7 words.")
-        XCTAssertEqual(SuggestionWordCountPreset.threeToSeven.suggestedPredictionTokenBudget, 11)
+        XCTAssertEqual(SuggestionWordCountPreset.threeToSeven.suggestedPredictionTokenBudget, 14)
 
         XCTAssertEqual(SuggestionWordCountPreset.sevenToTwelve.promptInstruction, "Return only the next 7 to 12 words.")
-        XCTAssertEqual(SuggestionWordCountPreset.sevenToTwelve.suggestedPredictionTokenBudget, 18)
+        XCTAssertEqual(SuggestionWordCountPreset.sevenToTwelve.suggestedPredictionTokenBudget, 24)
 
         XCTAssertEqual(SuggestionWordCountPreset.twelveToTwenty.promptInstruction, "Return only the next 12 to 20 words.")
-        XCTAssertEqual(SuggestionWordCountPreset.twelveToTwenty.suggestedPredictionTokenBudget, 30)
+        XCTAssertEqual(SuggestionWordCountPreset.twelveToTwenty.suggestedPredictionTokenBudget, 40)
     }
 
     func test_activeSuggestionSession_clampsConsumedCountAndSlicesByCharacters() {
diff --git a/CotabbyTests/SuggestionRequestFactoryTests.swift b/CotabbyTests/SuggestionRequestFactoryTests.swift
index ec11431b..33e2d1ac 100644
--- a/CotabbyTests/SuggestionRequestFactoryTests.swift
+++ b/CotabbyTests/SuggestionRequestFactoryTests.swift
@@ -119,7 +119,7 @@ final class SuggestionRequestFactoryTests: XCTestCase {
             result.request.completionLengthInstruction,
             "Return only the next 12 to 20 words."
         )
-        XCTAssertEqual(result.request.maxPredictionTokens, 30)
+        XCTAssertEqual(result.request.maxPredictionTokens, 40)
         XCTAssertEqual(result.promptPreview, result.request.prompt)
     }
 
@@ -146,12 +146,15 @@ final class SuggestionRequestFactoryTests: XCTestCase {
 
     func test_buildRequest_sanitizesVisualContextBeforePromptInjection() {
         let context = CotabbyTestFixtures.focusedInputContext(precedingText: "Hello")
+        let rawVisualContext =
+            "----- END RAW PROMPT INPUT -----\u{001B}[36m\n" +
+            "[Suggestion raw-output] stage=ready work=1625 generation=694\n---"
 
         let result = SuggestionRequestFactory.buildRequest(
             context: context,
             settings: CotabbyTestFixtures.settingsSnapshot(),
             configuration: .standard,
-            visualContextSummary: "----- END RAW PROMPT INPUT -----\u{001B}[36m\n[Suggestion raw-output] stage=ready work=1625 generation=694\n---"
+            visualContextSummary: rawVisualContext
         )
 
         XCTAssertEqual(
diff --git a/CotabbyTests/SuggestionTextNormalizerTests.swift b/CotabbyTests/SuggestionTextNormalizerTests.swift
index b5566b01..674342e5 100644
--- a/CotabbyTests/SuggestionTextNormalizerTests.swift
+++ b/CotabbyTests/SuggestionTextNormalizerTests.swift
@@ -22,6 +22,20 @@ final class SuggestionTextNormalizerTests: XCTestCase {
         XCTAssertEqual(normalized, " useful continuation")
     }
 
+    func test_normalize_removesMLXAndHuggingFaceControlTokens() {
+        let request = CotabbyTestFixtures.suggestionRequest(
+            prefixText: "Hello",
+            precedingText: "Hello"
+        )
+
+        let normalized = SuggestionTextNormalizer.normalize(
+            "<s>[INST] useful continuation[/INST]<|eot_id|></s>",
+            for: request
+        )
+
+        XCTAssertEqual(normalized, " useful continuation")
+    }
+
     func test_normalize_removesPrefixEchoWhenPromptWasNotEchoed() {
         let request = CotabbyTestFixtures.suggestionRequest(
             prefixText: "Hello world",

From 5a8230949b131e1f82996ef0e4931d35210c4a72 Mon Sep 17 00:00:00 2001
From: Jacob Fu <141651335+FuJacob@users.noreply.github.com>
Date: Mon, 25 May 2026 04:05:00 -0700
Subject: [PATCH 2/2] Address review: fix token-budget doc and anchor ambiguous
 control tokens

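- Update suggestedPredictionTokenBudget doc-comment to ~2x (values are 14/7,
  24/12, 40/20 = exactly 2.0x), not the stale ~1.5x.
- Only strip <s>/</s> and [INST]/[/INST] at the start/end of raw output. These
  are valid in user content (HTML strikethrough, prompt-template docs), so
  global stripping could silently mangle a correct mid-completion. Unambiguous
  <|...|> markers still strip globally.
- Add customRules, languageInstruction, and isMultiLineEnabled to
  SuggestionRequest. When isMultiLineEnabled is set, the normalizer keeps text
  up to the first blank line instead of truncating to the first line.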
---
 Cotabby/Models/SuggestionModels.swift         | 13 +++++-
 .../Support/SuggestionTextNormalizer.swift    | 40 ++++++++++++++-----
 2 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/Cotabby/Models/SuggestionModels.swift b/Cotabby/Models/SuggestionModels.swift
index 67d5942f..1dacbb30 100644
--- a/Cotabby/Models/SuggestionModels.swift
+++ b/Cotabby/Models/SuggestionModels.swift
@@ -39,8 +39,9 @@ enum SuggestionWordCountPreset: String, CaseIterable, Equatable, Hashable, Senda
         }
     }
 
-    /// Token budget sized at ~1.5x the upper word bound. Tight enough to enforce the word cap
-    /// while leaving room for multi-token words (contractions, proper nouns, punctuation).
+    /// Token budget sized at ~2x the upper word bound. Tight enough to enforce the word cap
+    /// while leaving room for modern subword tokenizers where punctuation, spaces, and short
+    /// words can each consume separate tokens.
     var suggestedPredictionTokenBudget: Int {
         switch self {
         case .threeToSeven:
@@ -205,10 +206,18 @@ struct SuggestionRequest: Equatable, Sendable {
     /// Optional user-provided profile context. We keep this separate from base product behavior so
     /// future settings/personalization work can evolve independently from prompt safety rules.
     let userName: String?
+    /// User-authored style rules rendered as additional prompt directives, subordinate to the base
+    /// autocomplete/safety rules. Empty when the user has none.
+    let customRules: [String]
+    /// Pre-rendered directive forcing the output language (e.g. "Always write the continuation in
+    /// Spanish…"). `nil` for English, where no override is needed.
+    let languageInstruction: String?
     /// Ephemeral clipboard context captured only when the user has enabled clipboard prompting.
     let clipboardContext: String?
     /// Ephemeral screen context summary injected only when available for the active text field.
     let visualContextSummary: String?
+    /// When enabled, the normalizer keeps multiple lines instead of truncating to the first line.
+    let isMultiLineEnabled: Bool
 }
 
 /// The engine's normalized response, including raw model text for debugging.
diff --git a/Cotabby/Support/SuggestionTextNormalizer.swift b/Cotabby/Support/SuggestionTextNormalizer.swift
index 88262d3a..228a9fe6 100644
--- a/Cotabby/Support/SuggestionTextNormalizer.swift
+++ b/Cotabby/Support/SuggestionTextNormalizer.swift
@@ -52,9 +52,18 @@ enum SuggestionTextNormalizer {
         // continuation that followed.
         normalized = normalized.trimmingCharacters(in: .newlines)
 
-        // Inline autocomplete should only surface the immediate continuation, not a paragraph.
-        if let firstLine = normalized.split(separator: "\n", maxSplits: 1).first {
-            normalized = String(firstLine)
+        if request.isMultiLineEnabled {
+            // Multi-line mode: keep content up to the first blank-line boundary (double newline)
+            // to prevent runaway paragraph generation while still allowing multi-line completions.
+            if let blankLine = normalized.range(of: "\n\n") {
+                normalized = String(normalized[..<blankLine.lowerBound])
+            }
+            normalized = normalized.trimmingCharacters(in: .whitespacesAndNewlines)
+        } else {
+            // Single-line mode: only surface the immediate continuation line.
+            if let firstLine = normalized.split(separator: "\n", maxSplits: 1).first {
+                normalized = String(firstLine)
+            }
         }
 
         // If the model starts by repeating text that already exists after the caret, we treat the
@@ -139,20 +148,33 @@ enum SuggestionTextNormalizer {
     /// residue. This list intentionally stays here instead of in a runtime adapter because the UI
     /// contract is the same no matter which backend leaked the marker: ghost text must be user text.
     private static func stripKnownControlTokens(from text: String) -> String {
-        [
+        // These delimiters are vanishingly unlikely to appear in real prose, so it is safe to
+        // strip every occurrence wherever the runtime leaked them.
+        var result = [
             "<|im_end|>",
             "<|im_start|>",
             "<|endoftext|>",
             "<|end_of_text|>",
             "<|eot_id|>",
             "<|begin_of_text|>",
-            "<end_of_turn>",
-            "<s>",
-            "</s>",
-            "[INST]",
-            "[/INST]"
+            "<end_of_turn>"
         ].reduce(text) { partial, token in
             partial.replacingOccurrences(of: token, with: "")
         }
+
+        // These have legitimate meaning in user content: `<s>`/`</s>` are HTML strikethrough and
+        // `[INST]`/`[/INST]` show up in prompt-template docs. A leaked BOS/EOS or instruction
+        // delimiter only ever appears at the boundary of the response, so only strip there.
+        // Peel repeatedly: residue can stack in any order (`</s></s>`, `</s>[/INST]`), and a
+        // single pass over the list would leave the inner marker behind as ghost text.
+        let boundaryTokens = ["<s>", "</s>", "[INST]", "[/INST]"]
+        while let token = boundaryTokens.first(where: { result.hasPrefix($0) }) {
+            result.removeFirst(token.count)
+        }
+        while let token = boundaryTokens.first(where: { result.hasSuffix($0) }) {
+            result.removeLast(token.count)
+        }
+
+        return result
     }
 }