diff --git a/Cotabby/Models/SuggestionModels.swift b/Cotabby/Models/SuggestionModels.swift index ba3ddfc..1dacbb3 100644 --- a/Cotabby/Models/SuggestionModels.swift +++ b/Cotabby/Models/SuggestionModels.swift @@ -39,16 +39,17 @@ enum SuggestionWordCountPreset: String, CaseIterable, Equatable, Hashable, Senda } } - /// Token budget sized at ~1.5x the upper word bound. Tight enough to enforce the word cap - /// while leaving room for multi-token words (contractions, proper nouns, punctuation). + /// Token budget sized at ~2x the upper word bound. Tight enough to enforce the word cap + /// while leaving room for modern subword tokenizers where punctuation, spaces, and short + /// words can each consume separate tokens. var suggestedPredictionTokenBudget: Int { switch self { case .threeToSeven: - return 11 + return 14 case .sevenToTwelve: - return 18 + return 24 case .twelveToTwenty: - return 30 + return 40 } } } @@ -88,8 +89,9 @@ struct SuggestionConfiguration: Equatable, Sendable { /// The configuration shipped by the app today. /// These are product defaults, not temporary debug overrides. static let standard = SuggestionConfiguration( - // Keep completions short so ghost text stays fast and easy to accept. - maxPredictionTokens: 8, + // Keep completions short enough for inline UI, but leave room for modern tokenizers where + // punctuation, spaces, and short words can each consume separate tokens. + maxPredictionTokens: 16, // Aggressive debounce: 50ms is enough for most apps to publish AX state. The KV cache // reuse path handles prefix changes gracefully if AX is occasionally one char stale. debounceMilliseconds: 50, @@ -100,10 +102,10 @@ struct SuggestionConfiguration: Equatable, Sendable { minP: 0.08, repetitionPenalty: 1.05, randomSeed: nil, - maxPrefixWords: 50, - // Prompt windows should stay small. Sending an entire editor buffer hurts latency with - // little quality gain because Cotabby is only completing the immediate local continuation. 
- maxPrefixCharacters: 1000, + maxPrefixWords: 90, + // Keep a larger local tail than the original prototype so code, email threads, and + // structured notes preserve enough style and naming context without sending full documents. + maxPrefixCharacters: 2000, maxSuffixCharacters: 192, // Seed the profile settings with lightweight defaults on first launch. defaultUserName: "Jacob", diff --git a/Cotabby/Support/LlamaPromptRenderer.swift b/Cotabby/Support/LlamaPromptRenderer.swift index 8a73829..5b98db3 100644 --- a/Cotabby/Support/LlamaPromptRenderer.swift +++ b/Cotabby/Support/LlamaPromptRenderer.swift @@ -28,6 +28,8 @@ enum LlamaPromptRenderer { "- Continue the user's existing text exactly at the caret position.", "- This is autocomplete, not chat. Do not answer the user or start a conversation.", "- Never repeat, restate, or quote the text before the caret.", + "- Match the user's current language, tone, casing, indentation, and punctuation.", + "- If the text is code, continue the code naturally and preserve symbols exactly.", "- Use clipboard context only when it directly helps the inline continuation.", "- Return plain text only with no thinking, labels, bullets, markdown, quotes, or explanation." ] diff --git a/Cotabby/Support/SuggestionTextNormalizer.swift b/Cotabby/Support/SuggestionTextNormalizer.swift index 01bef67..228a9fe 100644 --- a/Cotabby/Support/SuggestionTextNormalizer.swift +++ b/Cotabby/Support/SuggestionTextNormalizer.swift @@ -17,8 +17,7 @@ enum SuggestionTextNormalizer { // Some runtimes echo the prompt or include chat-template control markers in the response. // Removing them here keeps the UI layer independent from backend-specific formatting. - normalized = normalized.replacingOccurrences(of: "<|im_end|>", with: "") - normalized = normalized.replacingOccurrences(of: "<|im_start|>", with: "") + normalized = stripKnownControlTokens(from: normalized) // Thinking-capable models may emit reasoning blocks. 
Strip complete // blocks first, then any trailing open tag left when generation hit the token limit. @@ -144,4 +143,38 @@ enum SuggestionTextNormalizer { let afterLastEchoed = lastEchoedWord.endIndex return String(suggestion[afterLastEchoed...]) } + + /// Local models from llama.cpp, MLX repos, and Foundation Models can expose different template + /// residue. This list intentionally stays here instead of in a runtime adapter because the UI + /// contract is the same no matter which backend leaked the marker: ghost text must be user text. + private static func stripKnownControlTokens(from text: String) -> String { + // These delimiters are vanishingly unlikely to appear in real prose, so it is safe to + // strip every occurrence wherever the runtime leaked them. + var result = [ + "<|im_end|>", + "<|im_start|>", + "<|endoftext|>", + "<|end_of_text|>", + "<|eot_id|>", + "<|begin_of_text|>", + "" + ].reduce(text) { partial, token in + partial.replacingOccurrences(of: token, with: "") + } + + // These have legitimate meaning in user content: `<s>`/`</s>` are HTML strikethrough and + // `[INST]`/`[/INST]` show up in prompt-template docs. A leaked BOS/EOS or instruction + // delimiter only ever appears at the boundary of the response, so only strip there to + // avoid silently mangling a correct mid-completion that happens to use these tokens.
+ for token in ["<s>", "</s>", "[INST]", "[/INST]"] { + if result.hasPrefix(token) { + result.removeFirst(token.count) + } + if result.hasSuffix(token) { + result.removeLast(token.count) + } + } + + return result + } } diff --git a/CotabbyTests/LlamaPromptRendererTests.swift b/CotabbyTests/LlamaPromptRendererTests.swift index a60568f..c823aca 100644 --- a/CotabbyTests/LlamaPromptRendererTests.swift +++ b/CotabbyTests/LlamaPromptRendererTests.swift @@ -72,6 +72,14 @@ final class LlamaPromptRendererTests: XCTestCase { ) XCTAssertTrue(prompt.contains("Task:"), "instruction prompt should include Task section") + XCTAssertTrue( + prompt.contains("Match the user's current language, tone, casing, indentation, and punctuation."), + "instruction prompt should preserve the local writing style" + ) + XCTAssertTrue( + prompt.contains("If the text is code, continue the code naturally and preserve symbols exactly."), + "instruction prompt should give code-shaped text explicit continuation guidance" + ) XCTAssertTrue( prompt.contains("Screen context:"), "instruction prompt should include Screen context section" diff --git a/CotabbyTests/ModelAndPresentationValueTests.swift b/CotabbyTests/ModelAndPresentationValueTests.swift index 7362534..0ad2e88 100644 --- a/CotabbyTests/ModelAndPresentationValueTests.swift +++ b/CotabbyTests/ModelAndPresentationValueTests.swift @@ -40,13 +40,13 @@ final class SuggestionTextColorCodecTests: XCTestCase { final class SuggestionModelValueTests: XCTestCase { func test_wordCountPresetsExposeMatchingPromptInstructionsAndTokenBudgets() { XCTAssertEqual(SuggestionWordCountPreset.threeToSeven.promptInstruction, "Return only the next 3 to 7 words.") - XCTAssertEqual(SuggestionWordCountPreset.threeToSeven.suggestedPredictionTokenBudget, 11) + XCTAssertEqual(SuggestionWordCountPreset.threeToSeven.suggestedPredictionTokenBudget, 14) XCTAssertEqual(SuggestionWordCountPreset.sevenToTwelve.promptInstruction, "Return only the next 7 to 12 words.") -
XCTAssertEqual(SuggestionWordCountPreset.sevenToTwelve.suggestedPredictionTokenBudget, 18) + XCTAssertEqual(SuggestionWordCountPreset.sevenToTwelve.suggestedPredictionTokenBudget, 24) XCTAssertEqual(SuggestionWordCountPreset.twelveToTwenty.promptInstruction, "Return only the next 12 to 20 words.") - XCTAssertEqual(SuggestionWordCountPreset.twelveToTwenty.suggestedPredictionTokenBudget, 30) + XCTAssertEqual(SuggestionWordCountPreset.twelveToTwenty.suggestedPredictionTokenBudget, 40) } func test_activeSuggestionSession_clampsConsumedCountAndSlicesByCharacters() { diff --git a/CotabbyTests/SuggestionRequestFactoryTests.swift b/CotabbyTests/SuggestionRequestFactoryTests.swift index ec11431..33e2d1a 100644 --- a/CotabbyTests/SuggestionRequestFactoryTests.swift +++ b/CotabbyTests/SuggestionRequestFactoryTests.swift @@ -119,7 +119,7 @@ final class SuggestionRequestFactoryTests: XCTestCase { result.request.completionLengthInstruction, "Return only the next 12 to 20 words." ) - XCTAssertEqual(result.request.maxPredictionTokens, 30) + XCTAssertEqual(result.request.maxPredictionTokens, 40) XCTAssertEqual(result.promptPreview, result.request.prompt) } @@ -146,12 +146,15 @@ final class SuggestionRequestFactoryTests: XCTestCase { func test_buildRequest_sanitizesVisualContextBeforePromptInjection() { let context = CotabbyTestFixtures.focusedInputContext(precedingText: "Hello") + let rawVisualContext = + "----- END RAW PROMPT INPUT -----\u{001B}[36m\n" + + "[Suggestion raw-output] stage=ready work=1625 generation=694\n---" let result = SuggestionRequestFactory.buildRequest( context: context, settings: CotabbyTestFixtures.settingsSnapshot(), configuration: .standard, - visualContextSummary: "----- END RAW PROMPT INPUT -----\u{001B}[36m\n[Suggestion raw-output] stage=ready work=1625 generation=694\n---" + visualContextSummary: rawVisualContext ) XCTAssertEqual( diff --git a/CotabbyTests/SuggestionTextNormalizerTests.swift b/CotabbyTests/SuggestionTextNormalizerTests.swift index 
b5566b0..674342e 100644 --- a/CotabbyTests/SuggestionTextNormalizerTests.swift +++ b/CotabbyTests/SuggestionTextNormalizerTests.swift @@ -22,6 +22,20 @@ final class SuggestionTextNormalizerTests: XCTestCase { XCTAssertEqual(normalized, " useful continuation") } + func test_normalize_removesMLXAndHuggingFaceControlTokens() { + let request = CotabbyTestFixtures.suggestionRequest( + prefixText: "Hello", + precedingText: "Hello" + ) + + let normalized = SuggestionTextNormalizer.normalize( + "[INST] useful continuation[/INST]<|eot_id|>", + for: request + ) + + XCTAssertEqual(normalized, " useful continuation") + } + func test_normalize_removesPrefixEchoWhenPromptWasNotEchoed() { let request = CotabbyTestFixtures.suggestionRequest( prefixText: "Hello world",