From 99c82c9f79e95102f8ea1ef81655ade9ae84520d Mon Sep 17 00:00:00 2001
From: Jacob Fu <141651335+FuJacob@users.noreply.github.com>
Date: Thu, 28 May 2026 01:59:14 -0700
Subject: [PATCH 1/2] Propagate Task cancellation into the llama sampling loop

Today, when the suggestion work controller cancels a parent Task (new
keystroke, focus change), the Task.detached call inside
LlamaRuntimeManager does not inherit cancellation, so core.generate runs
its full prediction budget while holding autocompleteLock. The next
autocomplete then waits ~100-400ms on Metal behind a result nobody
wants.

Two changes:

1. core.generate now polls Task.isCancelled between sampleNext calls and
   breaks early. This matches what summarize already does.

2. generate and summarize in the manager wrap the Task.detached await in
   withTaskCancellationHandler so an outer cancel actually reaches the
   detached task.

Engine-level cancelSequence is intentionally not called for the
autocomplete path: its cancelled flag is one-way, and tripping it would
require destroying and recreating the persistent sequence on every
cancellation, losing KV cache reuse. The Task.isCancelled poll between
samples gives us per-token (~10-15ms) granularity, which is fast enough.
---
 .../Services/Runtime/LlamaRuntimeCore.swift   |  6 +++++
 .../Runtime/LlamaRuntimeManager.swift         | 22 +++++++++++++++----
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
index fb8eed5..abd38dc 100644
--- a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
+++ b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
@@ -150,6 +150,12 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
         var generatedText = ""
 
         for _ in 0 ..< options.maxPredictionTokens {
+            // Cooperative cancellation: when the wrapping Task is cancelled (caller hit a new
+            // keystroke, focus changed, Compose started), bail before the next sampleNext call so
+            // we release `autocompleteLock` instead of running the full prediction budget and
+            // making the next autocomplete wait behind us.
+            if Task.isCancelled { break }
+
             let result = engine.sampleNext(sequenceID)
 
             if result.was_cancelled || result.is_eos {
diff --git a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift
index a909cb1..3942458 100644
--- a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift
+++ b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift
@@ -97,13 +97,22 @@ final class LlamaRuntimeManager: ObservableObject {
 
         let core = self.core
         do {
-            return try await Task.detached {
+            // `Task.detached` does not inherit the caller's cancellation, so an outer cancel
+            // would otherwise leave `core.generate` running to its full prediction budget while
+            // holding `autocompleteLock`. The handler forwards the cancel signal, and the loop
+            // inside `core.generate` polls `Task.isCancelled` between sampleNext calls.
+            let task = Task.detached {
                 try core.generate(
                     prompt: prompt,
                     cachedPrefixBytes: cachedPrefixBytes,
                     options: options
                 )
-            }.value
+            }
+            return try await withTaskCancellationHandler {
+                try await task.value
+            } onCancel: {
+                task.cancel()
+            }
         } catch is CancellationError {
             CotabbyLogger.runtime.debug("Generation cancelled")
             throw LlamaRuntimeError.cancelled
@@ -133,12 +142,17 @@ final class LlamaRuntimeManager: ObservableObject {
             temperature: temperature
         )
         do {
-            return try await Task.detached {
+            let task = Task.detached {
                 try core.summarize(
                     prompt: prompt,
                     options: options
                 )
-            }.value
+            }
+            return try await withTaskCancellationHandler {
+                try await task.value
+            } onCancel: {
+                task.cancel()
+            }
         } catch is CancellationError {
             throw LlamaRuntimeError.cancelled
         } catch let error as LlamaRuntimeError {

From 9afae9e3bc17cfb3e56f360b3057d6cfa280c638 Mon Sep 17 00:00:00 2001
From: Jacob Fu <141651335+FuJacob@users.noreply.github.com>
Date: Thu, 28 May 2026 02:30:39 -0700
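Subject: [PATCH 2/2] Surface cancellation as CancellationError so the catch
 path stays reachable

When cancelled, the detached task returns the partial text it accumulated
instead of throwing, so awaiting task.value does not by itself surface the
cancel to the caller. Call Task.checkCancellation() after the await in both
generate and summarize so a cancelled caller gets a CancellationError,
which the existing catch maps to LlamaRuntimeError.cancelled.
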
---
 .../Services/Runtime/LlamaRuntimeManager.swift | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift
index 3942458..b4ed4f2 100644
--- a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift
+++ b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift
@@ -109,7 +109,16 @@ final class LlamaRuntimeManager: ObservableObject {
                 )
             }
             return try await withTaskCancellationHandler {
-                try await task.value
+                // `core.generate` cooperates with cancellation by returning the partial buffer it
+                // accumulated instead of throwing, which is the right behavior for the inference
+                // layer (the KV-cache trim and lock release still need to run on the way out).
+                // The manager surfaces the cancellation as a thrown `CancellationError` so the
+                // `catch` below stays reachable and so callers see the same vocabulary as a
+                // throwing path. If the outer task was cancelled (the case where `onCancel` ran),
+                // `Task.checkCancellation()` throws here; otherwise it is a no-op.
+                let partial = try await task.value
+                try Task.checkCancellation()
+                return partial
             } onCancel: {
                 task.cancel()
             }
@@ -149,7 +158,12 @@ final class LlamaRuntimeManager: ObservableObject {
                 )
             }
             return try await withTaskCancellationHandler {
-                try await task.value
+                // Same pattern as `generate`: the detached task returns partial text on cancel,
+                // so surface the cancel here via `Task.checkCancellation()` to keep the catch
+                // below reachable and the runtime vocabulary consistent across both paths.
+                let partial = try await task.value
+                try Task.checkCancellation()
+                return partial
             } onCancel: {
                 task.cancel()
             }