From b7f43b75e12d26d036c928a91e64f57dd59c95ac Mon Sep 17 00:00:00 2001
From: Evan Nadeau <1878498+evannadeau@users.noreply.github.com>
Date: Wed, 13 May 2026 19:21:58 -0700
Subject: [PATCH 1/2] feat(orchestrator): reap stale per-PID active-session files at startup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per-PID `active-session-<pid>` files (introduced in 0.30.19+) make session_id lookup race-free under concurrent sessions, but nothing has been reaping them when the owning claude process exits. On a developer machine with many short-lived sessions per day, they accumulate indefinitely — 8 stale files observed in one project on 2026-05-13, from claude PIDs long since dead. The files are cosmetic in the sense that the legacy single `active-session` file remains the primary lookup, but a slow directory listing eventually becomes a real cost on a hot-spot workstation.

This patch adds a startup sweep that walks `<project-dir>/.orchestrator-state/`, matches files of shape `active-session-<pid>`, probes liveness via `process.kill(pid, 0)`, and unlinks dead-PID entries. The probe is cross-platform via Node's API. Cheap, idempotent, race-safe (we only unlink files whose PID is verifiably gone). Lost races with concurrent sessions are tolerated — next startup retries.

Runs once at MCP startup, unconditionally (even when the no-claude-ancestor branch is about to exit, so future startups benefit).

Tested: bun run typecheck clean, bun test 516 pass / 0 fail.
Co-Authored-By: Claude Opus 4.7 (1M context) --- plugins/orchestrator/dist/server.js | 51 ++++++++++++++++++--- plugins/orchestrator/mcp/server.ts | 71 ++++++++++++++++++++++++++++- 2 files changed, 114 insertions(+), 8 deletions(-) diff --git a/plugins/orchestrator/dist/server.js b/plugins/orchestrator/dist/server.js index 6b6141f..d6c379d 100644 --- a/plugins/orchestrator/dist/server.js +++ b/plugins/orchestrator/dist/server.js @@ -6519,7 +6519,7 @@ var require_dist = __commonJS((exports, module) => { // mcp/server.ts import { resolve, join as join5 } from "path"; -import { existsSync as existsSync6, readFileSync as readFileSync3, writeFileSync } from "fs"; +import { existsSync as existsSync6, readFileSync as readFileSync3, readdirSync as readdirSync2, unlinkSync as unlinkSync2, writeFileSync } from "fs"; import { execSync } from "child_process"; // node_modules/zod/v3/external.js @@ -24629,22 +24629,22 @@ async function startSidecar() { } } catch {} try { - const { unlinkSync: unlinkSync2 } = await import("fs"); - unlinkSync2(portFile); + const { unlinkSync: unlinkSync3 } = await import("fs"); + unlinkSync3(portFile); } catch {} const baseArgs = ["--port", "0", "--port-file", portFile]; let result = await trySpawn(["uvx", "--with-requirements", requirementsPath, "python", sidecarPath, ...baseArgs], portFile, "uvx", 60000); if (!result) { try { - const { unlinkSync: unlinkSync2 } = await import("fs"); - unlinkSync2(portFile); + const { unlinkSync: unlinkSync3 } = await import("fs"); + unlinkSync3(portFile); } catch {} result = await trySpawn(["python", sidecarPath, ...baseArgs], portFile, "python", 30000); } if (!result) { try { - const { unlinkSync: unlinkSync2 } = await import("fs"); - unlinkSync2(portFile); + const { unlinkSync: unlinkSync3 } = await import("fs"); + unlinkSync3(portFile); } catch {} result = await trySpawn(["python3", sidecarPath, ...baseArgs], portFile, "python3", 30000); } @@ -26357,6 +26357,43 @@ foreach ($s in $siblings) { `); } } 
/**
 * Startup hygiene: delete per-PID `active-session-<pid>` files in
 * `stateDir` whose owning process no longer exists.
 *
 * According to the change that introduced this sweep, nothing else
 * removes these files when their owner exits, so they accumulate across
 * sessions. The sweep is strictly best-effort: every filesystem error is
 * swallowed so that a failed reap can never block startup.
 *
 * Liveness is probed with `process.kill(pid, 0)`. A file is removed only
 * when the probe fails with `ESRCH` (no such process). Any other failure
 * - notably `EPERM`, which means the process exists but we may not
 * signal it - is treated as "alive", because wrongly deleting a live
 * session's file is worse than leaving one stale file for the next
 * startup to retry.
 *
 * @param stateDir Directory to sweep. A missing or unreadable directory
 *   is a no-op.
 */
function reapStaleActiveSessionFiles(stateDir: string): void {
  if (!existsSync(stateDir)) return;

  let entries: string[];
  try {
    entries = readdirSync(stateDir);
  } catch {
    // Directory vanished or is unreadable - nothing we can reap.
    return;
  }

  // A value beyond the signed 32-bit range cannot be a real pid, so such
  // a file is stale by construction and needs no probe.
  const maxPid = 0x7fffffff;
  let reaped = 0;

  for (const entry of entries) {
    const match = /^active-session-(\d+)$/.exec(entry);
    if (!match) continue;
    const pid = Number(match[1]);
    if (!Number.isFinite(pid) || pid <= 0) continue;

    let ownerGone = pid > maxPid;
    if (!ownerGone) {
      try {
        // Signal 0 delivers nothing; it only reports whether the pid
        // could be signalled.
        process.kill(pid, 0);
      } catch (err: unknown) {
        // Only ESRCH proves the owner is gone. EPERM and anything
        // unexpected must not lead to a delete.
        ownerGone =
          typeof err === "object" &&
          err !== null &&
          (err as { code?: unknown }).code === "ESRCH";
      }
    }
    if (!ownerGone) continue;

    try {
      unlinkSync(join(stateDir, entry));
      reaped++;
    } catch {
      // Lost a race with another session, or a permission problem.
      // Non-fatal; the next startup will retry.
    }
  }

  if (reaped > 0) {
    process.stderr.write(
      `[orchestrator] startup hygiene: reaped ${reaped} stale active-session-<pid> file(s) in ${stateDir}\n`,
    );
  }
}

// Startup hygiene runs unconditionally - it doesn't depend on parent
// claude resolution and benefits future startups even if THIS one is
// about to exit.
{
  const startupProjectDir =
    process.env.ORCHESTRATOR_PROJECT_ROOT ||
    process.env.CLAUDE_PROJECT_DIR ||
    process.cwd();
  reapStaleActiveSessionFiles(join(startupProjectDir, ".orchestrator-state"));
}
/**
 * Startup hygiene: report sibling orchestrator MCP processes that have
 * no `claude` process among their ancestors, which suggests they
 * outlived their owning session.
 *
 * Rationale (from the change that introduced this scan): the in-process
 * orphan watchdog only protects processes that loaded it, so an older
 * MCP process running stale bytecode can survive its parent's death.
 * This scan makes such processes visible at the next session's startup.
 *
 * The function only logs; it never kills. The original change notes
 * that sibling MCPs may co-own infrastructure shared across live
 * sessions (the python sidecar behind `.sidecar-port`), so cleanup is
 * left to the operator.
 *
 * Detection rules:
 * - A process is a sibling candidate when its `cmdline` contains
 *   `orchestrator/dist/server.js` (a path suffix, so it works wherever
 *   the plugin is installed). Our own pid is excluded.
 * - Its parent chain is walked for at most 8 hops via the `comm` and
 *   `ppid` fields of `<procRoot>/<pid>/stat`. Finding `claude` (or
 *   `claude.exe`) clears the candidate.
 * - PID 1 is inspected like any other ancestor, so a `claude` running
 *   as PID 1 (e.g. a container entrypoint) is recognised.
 * - A candidate whose own `stat` cannot be read has exited since its
 *   `cmdline` was read and is skipped rather than reported.
 *
 * Linux only: returns immediately on every other platform.
 *
 * @param procRoot Root of the procfs tree to scan. Defaults to `/proc`;
 *   overridable so the scan can be pointed at a fixture directory.
 */
function warnAboutLikelyOrphanSiblings(procRoot: string = "/proc"): void {
  if (process.platform !== "linux") return;

  const distMarker = "orchestrator/dist/server.js";
  const maxHops = 8;

  let pidDirs: string[];
  try {
    pidDirs = readdirSync(procRoot).filter((n) => /^\d+$/.test(n));
  } catch {
    return;
  }

  const orphanPids: number[] = [];
  for (const pidStr of pidDirs) {
    const pid = Number(pidStr);
    if (pid === process.pid) continue;

    let cmdline: string;
    try {
      cmdline = readFileSync(join(procRoot, pidStr, "cmdline"), "utf8");
    } catch {
      continue;
    }
    if (!cmdline.includes(distMarker)) continue;

    let walk = pid;
    let foundClaude = false;
    let siblingVanished = false;
    for (let depth = 0; depth < maxHops; depth++) {
      let stat: string;
      try {
        stat = readFileSync(join(procRoot, String(walk), "stat"), "utf8");
      } catch {
        // depth 0 is the sibling itself: it exited after we read its
        // cmdline, so there is nothing left to report. An unreadable
        // ancestor keeps the original "no claude found" outcome.
        siblingVanished = depth === 0;
        break;
      }
      // comm is wrapped in parentheses and may itself contain ')', so
      // anchor on the LAST ')' to find where the fixed fields begin.
      const rparen = stat.lastIndexOf(")");
      if (rparen < 0) break;
      const comm = stat.slice(stat.indexOf("(") + 1, rparen).toLowerCase();
      if (comm === "claude" || comm === "claude.exe") {
        foundClaude = true;
        break;
      }
      // After ") " come the state field and then the parent pid.
      const fields = stat.slice(rparen + 2).split(/\s+/);
      const ppid = parseInt(fields[1] ?? "0", 10);
      // Stop at the top of the tree (ppid 0 / unparsable) or on a
      // self-loop. PID 1 is deliberately NOT a stop condition: it is
      // visited so that a claude running as PID 1 is detected.
      if (!ppid || ppid === walk) break;
      walk = ppid;
    }
    if (!foundClaude && !siblingVanished) orphanPids.push(pid);
  }

  if (orphanPids.length > 0) {
    process.stderr.write(
      `[orchestrator] startup hygiene: detected ${orphanPids.length} likely-orphan sibling MCP process(es): pid=${orphanPids.join(",")}. ` +
        `Their parent claude is no longer in the process tree, suggesting they outlived their owning session and may be running stale bytecode whose watchdog never fired. ` +
        `Diagnose with 'pstree -ps <pid>'; clean up with 'kill -9 <pid>' if confirmed orphan.\n`,
    );
  }
}

// Startup hygiene runs unconditionally - it doesn't depend on parent
// claude resolution.
warnAboutLikelyOrphanSiblings();