From 99c82c9f79e95102f8ea1ef81655ade9ae84520d Mon Sep 17 00:00:00 2001 From: Jacob Fu <141651335+FuJacob@users.noreply.github.com> Date: Thu, 28 May 2026 01:59:14 -0700 Subject: [PATCH 1/2] Propagate Task cancellation into the llama sampling loop Today, when the suggestion work controller cancels a parent Task (new keystroke, focus change), the Task.detached call inside LlamaRuntimeManager does not inherit cancellation, so core.generate runs its full prediction budget while holding autocompleteLock. The next autocomplete then waits ~100-400ms on Metal behind a result nobody wants. Two changes: 1. core.generate now polls Task.isCancelled between sampleNext calls and breaks early. This matches what summarize already does. 2. generate and summarize in the manager wrap the Task.detached await in withTaskCancellationHandler so an outer cancel actually reaches the detached task. Engine-level cancelSequence is intentionally not called for the autocomplete path: its cancelled flag is one-way, and tripping it would require destroying and recreating the persistent sequence on every cancellation, losing KV cache reuse. The Task.isCancelled poll between samples gives us per-token (~10-15ms) granularity, which is fast enough. 
--- .../Services/Runtime/LlamaRuntimeCore.swift | 6 +++++ .../Runtime/LlamaRuntimeManager.swift | 22 +++++++++++++++---- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift index fb8eed5..abd38dc 100644 --- a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift +++ b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift @@ -150,6 +150,12 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { var generatedText = "" for _ in 0 ..< options.maxPredictionTokens { + // Cooperative cancellation: when the wrapping Task is cancelled (caller hit a new + // keystroke, focus changed, Compose started), bail before the next sampleNext call so + // we release `autocompleteLock` instead of running the full prediction budget and + // making the next autocomplete wait behind us. + if Task.isCancelled { break } + let result = engine.sampleNext(sequenceID) if result.was_cancelled || result.is_eos { diff --git a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift index a909cb1..3942458 100644 --- a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift +++ b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift @@ -97,13 +97,22 @@ final class LlamaRuntimeManager: ObservableObject { let core = self.core do { - return try await Task.detached { + // `Task.detached` does not inherit the caller's cancellation, so an outer cancel + // would otherwise leave `core.generate` running to its full prediction budget while + // holding `autocompleteLock`. The handler forwards the cancel signal, and the loop + // inside `core.generate` polls `Task.isCancelled` between sampleNext calls. 
+ let task = Task.detached { try core.generate( prompt: prompt, cachedPrefixBytes: cachedPrefixBytes, options: options ) - }.value + } + return try await withTaskCancellationHandler { + try await task.value + } onCancel: { + task.cancel() + } } catch is CancellationError { CotabbyLogger.runtime.debug("Generation cancelled") throw LlamaRuntimeError.cancelled @@ -133,12 +142,17 @@ final class LlamaRuntimeManager: ObservableObject { temperature: temperature ) do { - return try await Task.detached { + let task = Task.detached { try core.summarize( prompt: prompt, options: options ) - }.value + } + return try await withTaskCancellationHandler { + try await task.value + } onCancel: { + task.cancel() + } } catch is CancellationError { throw LlamaRuntimeError.cancelled } catch let error as LlamaRuntimeError { From 9afae9e3bc17cfb3e56f360b3057d6cfa280c638 Mon Sep 17 00:00:00 2001 From: Jacob Fu <141651335+FuJacob@users.noreply.github.com> Date: Thu, 28 May 2026 02:30:39 -0700 Subject: [PATCH 2/2] Surface cancellation as CancellationError so the catch path stays reachable --- .../Services/Runtime/LlamaRuntimeManager.swift | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift index 3942458..b4ed4f2 100644 --- a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift +++ b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift @@ -109,7 +109,16 @@ final class LlamaRuntimeManager: ObservableObject { ) } return try await withTaskCancellationHandler { - try await task.value + // `core.generate` cooperates with cancellation by returning the partial buffer it + // accumulated instead of throwing, which is the right behavior for the inference + // layer (the KV-cache trim and lock release still need to run on the way out). 
+ // The manager surfaces the cancellation as a thrown `CancellationError` so the + // `catch` below stays reachable and so callers see the same vocabulary as a + // throwing path. If the outer task was cancelled (that is when + // `onCancel` runs), `Task.checkCancellation()` throws here; otherwise it is a no-op. + let partial = try await task.value + try Task.checkCancellation() + return partial } onCancel: { task.cancel() } @@ -149,7 +158,12 @@ final class LlamaRuntimeManager: ObservableObject { ) } return try await withTaskCancellationHandler { - try await task.value + // Same pattern as `generate`: the detached task returns partial text on cancel, + // so surface the cancel here via `Task.checkCancellation()` to keep the catch + // below reachable and the runtime vocabulary consistent across both paths. + let partial = try await task.value + try Task.checkCancellation() + return partial } onCancel: { task.cancel() }