From 7a0bb50c5a8d623c04630f1739c1c7c2467a666b Mon Sep 17 00:00:00 2001
From: Kagura
Date: Fri, 17 Apr 2026 12:43:55 +0800
Subject: [PATCH] fix(runtime): clear stale SDK session IDs on startup to prevent CLI deadlock (#25)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SDK session IDs are process-local and do not survive container restarts
or process crashes. When the SQLite database lives on a persistent
volume, stale IDs cause the runtime to attempt impossible resumes that
leave the CLI channel stuck in a deadlock.

Three-layer fix:

1. **Startup cleanup** (index.ts): clear all sdk_session_id values on
   boot — they are guaranteed stale after a restart.

2. **Broader retry** (runtime.ts): any error during a resume attempt now
   triggers a fresh-session retry, not just 'No conversation found'.
   This catches SDK errors we haven't seen yet.

3. **New method** (session-store.ts): clearAllSdkSessionIds() with tests
   covering the happy path and the no-op case. The startup cleanup in
   index.ts currently runs the same UPDATE statement inline rather than
   calling this method.

Closes #25 --- src/agent/__tests__/session-store.test.ts | 23 +++++++++++++++++++++++ src/agent/runtime.ts | 16 ++++++++++++++++ src/agent/session-store.ts | 15 +++++++++++++++ src/index.ts | 8 ++++++++ 4 files changed, 62 insertions(+) diff --git a/src/agent/__tests__/session-store.test.ts b/src/agent/__tests__/session-store.test.ts index ff53e536..4484c159 100644 --- a/src/agent/__tests__/session-store.test.ts +++ b/src/agent/__tests__/session-store.test.ts @@ -73,6 +73,29 @@ describe("SessionStore", () => { expect(session?.status).toBe("active"); }); + test("clearAllSdkSessionIds clears every stale SDK ID", () => { + store.create("cli", "conv-1"); + store.create("slack", "conv-2"); + store.create("web", "conv-3"); + + store.updateSdkSessionId("cli:conv-1", "sdk-aaa"); + store.updateSdkSessionId("slack:conv-2", "sdk-bbb"); + // web:conv-3 has no SDK session ID + + const cleared = store.clearAllSdkSessionIds(); + expect(cleared).toBe(2); + + expect(store.getByKey("cli:conv-1")?.sdk_session_id).toBeNull(); + expect(store.getByKey("slack:conv-2")?.sdk_session_id).toBeNull(); + expect(store.getByKey("web:conv-3")?.sdk_session_id).toBeNull(); + }); + + test("clearAllSdkSessionIds returns 0 when no sessions have SDK IDs", () => { + store.create("cli", "conv-1"); + const cleared = store.clearAllSdkSessionIds(); + expect(cleared).toBe(0); + }); + test("create reactivates an expired session with the same key", () => { store.create("cli", "conv-1"); store.updateSdkSessionId("cli:conv-1", "old-sdk-id"); diff --git a/src/agent/runtime.ts b/src/agent/runtime.ts index d5aa4211..c3cfc947 100644 --- a/src/agent/runtime.ts +++ b/src/agent/runtime.ts @@ -280,6 +280,22 @@ export class AgentRuntime { resultText = `Error: ${retryMsg}`; onEvent?.({ type: "error", message: retryMsg }); } + } else if (isResume) { + // Any other error during a resume attempt — the SDK session is + // likely unusable. Discard it and retry fresh. See #25. 
+ console.log(`[runtime] Resume failed (${errorMsg}), retrying without resume: ${sessionKey}`); + this.sessionStore.clearSdkSessionId(sessionKey); + sdkSessionId = ""; + resultText = ""; + cost = emptyCost(); + emittedThinking = false; + try { + await runSdkQuery(false); + } catch (retryErr: unknown) { + const retryMsg = retryErr instanceof Error ? retryErr.message : String(retryErr); + resultText = `Error: ${retryMsg}`; + onEvent?.({ type: "error", message: retryMsg }); + } } else { resultText = `Error: ${errorMsg}`; onEvent?.({ type: "error", message: errorMsg }); diff --git a/src/agent/session-store.ts b/src/agent/session-store.ts index 88df65d1..5cdb8737 100644 --- a/src/agent/session-store.ts +++ b/src/agent/session-store.ts @@ -77,6 +77,21 @@ export class SessionStore { ); } + /** + * Clear all SDK session IDs on startup. + * + * SDK session IDs are process-local and do not survive restarts. + * Without this, container recreates leave stale IDs in SQLite + * (persisted volume), causing the runtime to attempt impossible + * resumes that deadlock the CLI channel. See #25. + */ + clearAllSdkSessionIds(): number { + const result = this.db.run( + "UPDATE sessions SET sdk_session_id = NULL WHERE sdk_session_id IS NOT NULL", + ); + return result.changes; + } + touch(sessionKey: string): void { this.db.run("UPDATE sessions SET last_active_at = datetime('now') WHERE session_key = ?", [sessionKey]); } diff --git a/src/index.ts b/src/index.ts index 50aed7a6..c9ce0643 100644 --- a/src/index.ts +++ b/src/index.ts @@ -122,6 +122,14 @@ async function main(): Promise { // agent, which means a single auth path and a single provider switch. const runtime = new AgentRuntime(config, db); + // SDK session IDs are process-local and never survive restarts. + // Clear them so the runtime does not attempt impossible resumes + // that deadlock CLI or other persistent channels. See #25. 
+ { + const result = db.run("UPDATE sessions SET sdk_session_id = NULL WHERE sdk_session_id IS NOT NULL"); + if (result.changes > 0) console.log(`[phantom] Cleared ${result.changes} stale SDK session ID(s)`); + } + let evolution: EvolutionEngine | null = null; let evolutionCadence: EvolutionCadence | null = null; try {