diff --git a/Cotabby/Models/SuggestionModels.swift b/Cotabby/Models/SuggestionModels.swift
index ba3ddfc..1dacbb3 100644
--- a/Cotabby/Models/SuggestionModels.swift
+++ b/Cotabby/Models/SuggestionModels.swift
@@ -39,16 +39,17 @@ enum SuggestionWordCountPreset: String, CaseIterable, Equatable, Hashable, Senda
}
}
- /// Token budget sized at ~1.5x the upper word bound. Tight enough to enforce the word cap
- /// while leaving room for multi-token words (contractions, proper nouns, punctuation).
+ /// Token budget sized at ~2x the upper word bound. Tight enough to enforce the word cap
+ /// while leaving room for modern subword tokenizers where punctuation, spaces, and short
+ /// words can each consume separate tokens.
var suggestedPredictionTokenBudget: Int {
switch self {
case .threeToSeven:
- return 11
+ return 14 // 2 × 7, the preset's upper word bound
case .sevenToTwelve:
- return 18
+ return 24 // 2 × 12, the preset's upper word bound
case .twelveToTwenty:
- return 30
+ return 40 // 2 × 20, the preset's upper word bound
}
}
}
@@ -88,8 +89,9 @@ struct SuggestionConfiguration: Equatable, Sendable {
/// The configuration shipped by the app today.
/// These are product defaults, not temporary debug overrides.
static let standard = SuggestionConfiguration(
- // Keep completions short so ghost text stays fast and easy to accept.
- maxPredictionTokens: 8,
+ // Keep completions short enough for inline UI, but leave room for modern tokenizers where
+ // punctuation, spaces, and short words can each consume separate tokens.
+ maxPredictionTokens: 16,
// Aggressive debounce: 50ms is enough for most apps to publish AX state. The KV cache
// reuse path handles prefix changes gracefully if AX is occasionally one char stale.
debounceMilliseconds: 50,
@@ -100,10 +102,10 @@ struct SuggestionConfiguration: Equatable, Sendable {
minP: 0.08,
repetitionPenalty: 1.05,
randomSeed: nil,
- maxPrefixWords: 50,
- // Prompt windows should stay small. Sending an entire editor buffer hurts latency with
- // little quality gain because Cotabby is only completing the immediate local continuation.
- maxPrefixCharacters: 1000,
+ maxPrefixWords: 90,
+ // Keep a larger local tail than the original prototype so code, email threads, and
+ // structured notes preserve enough style and naming context without sending full documents.
+ maxPrefixCharacters: 2000,
maxSuffixCharacters: 192,
// Seed the profile settings with lightweight defaults on first launch.
defaultUserName: "Jacob",
diff --git a/Cotabby/Support/LlamaPromptRenderer.swift b/Cotabby/Support/LlamaPromptRenderer.swift
index 8a73829..5b98db3 100644
--- a/Cotabby/Support/LlamaPromptRenderer.swift
+++ b/Cotabby/Support/LlamaPromptRenderer.swift
@@ -28,6 +28,8 @@ enum LlamaPromptRenderer {
"- Continue the user's existing text exactly at the caret position.",
"- This is autocomplete, not chat. Do not answer the user or start a conversation.",
"- Never repeat, restate, or quote the text before the caret.",
+ "- Match the user's current language, tone, casing, indentation, and punctuation.",
+ "- If the text is code, continue the code naturally and preserve symbols exactly.",
"- Use clipboard context only when it directly helps the inline continuation.",
"- Return plain text only with no thinking, labels, bullets, markdown, quotes, or explanation."
]
diff --git a/Cotabby/Support/SuggestionTextNormalizer.swift b/Cotabby/Support/SuggestionTextNormalizer.swift
index 01bef67..228a9fe 100644
--- a/Cotabby/Support/SuggestionTextNormalizer.swift
+++ b/Cotabby/Support/SuggestionTextNormalizer.swift
@@ -17,8 +17,7 @@ enum SuggestionTextNormalizer {
// Some runtimes echo the prompt or include chat-template control markers in the response.
// Removing them here keeps the UI layer independent from backend-specific formatting.
- normalized = normalized.replacingOccurrences(of: "<|im_end|>", with: "")
- normalized = normalized.replacingOccurrences(of: "<|im_start|>", with: "")
+ normalized = stripKnownControlTokens(from: normalized)
// Thinking-capable models may emit <think>…</think> reasoning blocks. Strip complete
// blocks first, then any trailing open tag left when generation hit the token limit.
@@ -144,4 +143,38 @@ enum SuggestionTextNormalizer {
let afterLastEchoed = lastEchoedWord.endIndex
return String(suggestion[afterLastEchoed...])
}
+
+ /// Local models from llama.cpp, MLX repos, and Foundation Models can expose different template
+ /// residue. This list intentionally stays here instead of in a runtime adapter because the UI
+ /// contract is the same no matter which backend leaked the marker: ghost text must be user text.
+    private static func stripKnownControlTokens(from text: String) -> String {
+        // These delimiters are vanishingly unlikely to appear in real prose, so it is safe to
+        // strip every occurrence wherever the runtime leaked them.
+        var result = [
+            "<|im_end|>",
+            "<|im_start|>",
+            "<|endoftext|>",
+            "<|end_of_text|>",
+            "<|eot_id|>",
+            "<|begin_of_text|>",
+            // NOTE(review): a no-op empty "" entry was dropped here; presumably a lost token — confirm.
+        ].reduce(text) { partial, token in
+            partial.replacingOccurrences(of: token, with: "")
+        }
+
+        // These have legitimate meaning in user content: `<s>`/`</s>` are HTML strikethrough and
+        // `[INST]`/`[/INST]` show up in prompt-template docs. A leaked BOS/EOS or instruction
+        // delimiter only ever appears at the boundary of the response, so only strip there to
+        // avoid silently mangling a correct mid-completion that happens to use these tokens.
+        for token in ["<s>", "</s>", "[INST]", "[/INST]"] {
+            if result.hasPrefix(token) {
+                result.removeFirst(token.count)
+            }
+            if result.hasSuffix(token) {
+                result.removeLast(token.count)
+            }
+        }
+
+        return result
+    }
}
diff --git a/CotabbyTests/LlamaPromptRendererTests.swift b/CotabbyTests/LlamaPromptRendererTests.swift
index a60568f..c823aca 100644
--- a/CotabbyTests/LlamaPromptRendererTests.swift
+++ b/CotabbyTests/LlamaPromptRendererTests.swift
@@ -72,6 +72,14 @@ final class LlamaPromptRendererTests: XCTestCase {
)
XCTAssertTrue(prompt.contains("Task:"), "instruction prompt should include Task section")
+ XCTAssertTrue(
+ prompt.contains("Match the user's current language, tone, casing, indentation, and punctuation."),
+ "instruction prompt should preserve the local writing style"
+ )
+ XCTAssertTrue(
+ prompt.contains("If the text is code, continue the code naturally and preserve symbols exactly."),
+ "instruction prompt should give code-shaped text explicit continuation guidance"
+ )
XCTAssertTrue(
prompt.contains("Screen context:"),
"instruction prompt should include Screen context section"
diff --git a/CotabbyTests/ModelAndPresentationValueTests.swift b/CotabbyTests/ModelAndPresentationValueTests.swift
index 7362534..0ad2e88 100644
--- a/CotabbyTests/ModelAndPresentationValueTests.swift
+++ b/CotabbyTests/ModelAndPresentationValueTests.swift
@@ -40,13 +40,13 @@ final class SuggestionTextColorCodecTests: XCTestCase {
final class SuggestionModelValueTests: XCTestCase {
func test_wordCountPresetsExposeMatchingPromptInstructionsAndTokenBudgets() {
XCTAssertEqual(SuggestionWordCountPreset.threeToSeven.promptInstruction, "Return only the next 3 to 7 words.")
- XCTAssertEqual(SuggestionWordCountPreset.threeToSeven.suggestedPredictionTokenBudget, 11)
+ XCTAssertEqual(SuggestionWordCountPreset.threeToSeven.suggestedPredictionTokenBudget, 14)
XCTAssertEqual(SuggestionWordCountPreset.sevenToTwelve.promptInstruction, "Return only the next 7 to 12 words.")
- XCTAssertEqual(SuggestionWordCountPreset.sevenToTwelve.suggestedPredictionTokenBudget, 18)
+ XCTAssertEqual(SuggestionWordCountPreset.sevenToTwelve.suggestedPredictionTokenBudget, 24)
XCTAssertEqual(SuggestionWordCountPreset.twelveToTwenty.promptInstruction, "Return only the next 12 to 20 words.")
- XCTAssertEqual(SuggestionWordCountPreset.twelveToTwenty.suggestedPredictionTokenBudget, 30)
+ XCTAssertEqual(SuggestionWordCountPreset.twelveToTwenty.suggestedPredictionTokenBudget, 40)
}
func test_activeSuggestionSession_clampsConsumedCountAndSlicesByCharacters() {
diff --git a/CotabbyTests/SuggestionRequestFactoryTests.swift b/CotabbyTests/SuggestionRequestFactoryTests.swift
index ec11431..33e2d1a 100644
--- a/CotabbyTests/SuggestionRequestFactoryTests.swift
+++ b/CotabbyTests/SuggestionRequestFactoryTests.swift
@@ -119,7 +119,7 @@ final class SuggestionRequestFactoryTests: XCTestCase {
result.request.completionLengthInstruction,
"Return only the next 12 to 20 words."
)
- XCTAssertEqual(result.request.maxPredictionTokens, 30)
+ XCTAssertEqual(result.request.maxPredictionTokens, 40)
XCTAssertEqual(result.promptPreview, result.request.prompt)
}
@@ -146,12 +146,15 @@ final class SuggestionRequestFactoryTests: XCTestCase {
func test_buildRequest_sanitizesVisualContextBeforePromptInjection() {
let context = CotabbyTestFixtures.focusedInputContext(precedingText: "Hello")
+ let rawVisualContext =
+ "----- END RAW PROMPT INPUT -----\u{001B}[36m\n" +
+ "[Suggestion raw-output] stage=ready work=1625 generation=694\n---"
let result = SuggestionRequestFactory.buildRequest(
context: context,
settings: CotabbyTestFixtures.settingsSnapshot(),
configuration: .standard,
- visualContextSummary: "----- END RAW PROMPT INPUT -----\u{001B}[36m\n[Suggestion raw-output] stage=ready work=1625 generation=694\n---"
+ visualContextSummary: rawVisualContext
)
XCTAssertEqual(
diff --git a/CotabbyTests/SuggestionTextNormalizerTests.swift b/CotabbyTests/SuggestionTextNormalizerTests.swift
index b5566b0..674342e 100644
--- a/CotabbyTests/SuggestionTextNormalizerTests.swift
+++ b/CotabbyTests/SuggestionTextNormalizerTests.swift
@@ -22,6 +22,20 @@ final class SuggestionTextNormalizerTests: XCTestCase {
XCTAssertEqual(normalized, " useful continuation")
}
+ func test_normalize_removesMLXAndHuggingFaceControlTokens() {
+ let request = CotabbyTestFixtures.suggestionRequest(
+ prefixText: "Hello",
+ precedingText: "Hello"
+ )
+ // Raw output wraps the text in [INST]/[/INST] and ends with <|eot_id|>; only the text should remain.
+ let normalized = SuggestionTextNormalizer.normalize(
+ "[INST] useful continuation[/INST]<|eot_id|>",
+ for: request
+ )
+
+ XCTAssertEqual(normalized, " useful continuation")
+ }
+
func test_normalize_removesPrefixEchoWhenPromptWasNotEchoed() {
let request = CotabbyTestFixtures.suggestionRequest(
prefixText: "Hello world",