From 9da397ec2f403db471c80fdb42d19248b3153363 Mon Sep 17 00:00:00 2001 From: "Nacho F. Lizaur" Date: Mon, 2 Mar 2026 20:59:28 +0100 Subject: [PATCH] fix(opencode): robust process exit detection for child processes --- packages/opencode/src/session/prompt.ts | 8 + packages/opencode/src/shell/shell.ts | 34 +- packages/opencode/src/tool/bash.ts | 164 +++++++- packages/opencode/src/util/process.ts | 44 ++- packages/opencode/test/tool/bash.test.ts | 405 +++++++++++++++++++- packages/opencode/test/util/process.test.ts | 36 ++ 6 files changed, 671 insertions(+), 20 deletions(-) diff --git a/packages/opencode/src/session/prompt.ts b/packages/opencode/src/session/prompt.ts index 4f77920cc98..3b568ba2e6b 100644 --- a/packages/opencode/src/session/prompt.ts +++ b/packages/opencode/src/session/prompt.ts @@ -45,6 +45,7 @@ import { LLM } from "./llm" import { iife } from "@/util/iife" import { Shell } from "@/shell/shell" import { Truncate } from "@/tool/truncation" +import { stale, reap } from "@/tool/bash" // @ts-ignore globalThis.AI_SDK_LOG_WARNINGS = false @@ -284,6 +285,13 @@ export namespace SessionPrompt { using _ = defer(() => cancel(sessionID)) + const watchdog = setInterval(() => { + for (const id of stale()) { + reap(id) + } + }, 5000) + using _watchdog = defer(() => clearInterval(watchdog)) + // Structured output state // Note: On session resumption, state is reset but outputFormat is preserved // on the user message and will be retrieved from lastUser below diff --git a/packages/opencode/src/shell/shell.ts b/packages/opencode/src/shell/shell.ts index e7b7cdb3e4d..4d76d1639af 100644 --- a/packages/opencode/src/shell/shell.ts +++ b/packages/opencode/src/shell/shell.ts @@ -7,6 +7,15 @@ import { spawn, type ChildProcess } from "child_process" const SIGKILL_TIMEOUT_MS = 200 export namespace Shell { + function alive(pid: number): boolean { + try { + process.kill(pid, 0) + return true + } catch { + return false + } + } + export async function killTree(proc: ChildProcess, opts?: { exited?: () => boolean }): Promise { const pid = proc.pid if (!pid || opts?.exited?.()) return @@ -22,17 +31,24 @@ export namespace Shell { try { process.kill(-pid, "SIGTERM") - await Bun.sleep(SIGKILL_TIMEOUT_MS) - if (!opts?.exited?.()) { - process.kill(-pid, "SIGKILL") - } - } catch (_e) { - proc.kill("SIGTERM") - await Bun.sleep(SIGKILL_TIMEOUT_MS) - if (!opts?.exited?.()) { + } catch { + try { + proc.kill("SIGTERM") + } catch {} + } + + await Bun.sleep(SIGKILL_TIMEOUT_MS) + + if (opts?.exited?.() || !alive(pid)) return + try { + process.kill(-pid, "SIGKILL") + } catch { + try { proc.kill("SIGKILL") - } + } catch {} } + + await Bun.sleep(SIGKILL_TIMEOUT_MS) } const BLACKLIST = new Set(["fish", "nu"]) diff --git a/packages/opencode/src/tool/bash.ts b/packages/opencode/src/tool/bash.ts index 0751f789b7d..e593cf0c091 100644 --- a/packages/opencode/src/tool/bash.ts +++ b/packages/opencode/src/tool/bash.ts @@ -23,6 +23,40 @@ const DEFAULT_TIMEOUT = Flag.OPENCODE_EXPERIMENTAL_BASH_DEFAULT_TIMEOUT_MS || 2 export const log = Log.create({ service: "bash-tool" }) +// Registry for active bash processes — enables server-level watchdog +const active = new Map< + string, + { + pid: number + timeout: number + started: number + kill: () => void + done: () => void + } +>() + +export function stale() { + const result: string[] = [] + const now = Date.now() + for (const [id, entry] of active) { + if (now - entry.started > entry.timeout + 5000) result.push(id) + } + return result +} + +export function reap(id: string) { + const entry = active.get(id) + if (!entry) return + log.info("reaping stuck process", { + callID: id, + pid: entry.pid, + age: Date.now() - entry.started, + }) + entry.kill() + entry.done() + active.delete(id) +} + const resolveWasm = (asset: string) => { if (asset.startsWith("file://")) return fileURLToPath(asset) if (asset.startsWith("/") || /^[a-z]:/i.test(asset)) return asset @@ -180,6 +214,21 @@ export const BashTool = Tool.define("bash", async () => { detached: process.platform !== "win32", }) + if (!proc.pid) { + if (proc.exitCode !== null) { + log.info("process exited before pid could be read", { exitCode: proc.exitCode }) + } else { + throw new Error(`Failed to spawn process: pid is undefined for command "${params.command}"`) + } + } + + log.info("spawned process", { + pid: proc.pid, + command: params.command.slice(0, 100), + cwd, + timeout, + }) + let output = "" // Initialize metadata with empty output @@ -216,6 +265,7 @@ export const BashTool = Tool.define("bash", async () => { } const abortHandler = () => { + log.info("process abort triggered", { pid: proc.pid }) aborted = true void kill() } @@ -223,27 +273,135 @@ export const BashTool = Tool.define("bash", async () => { ctx.abort.addEventListener("abort", abortHandler, { once: true }) const timeoutTimer = setTimeout(() => { + log.info("process timeout triggered", { pid: proc.pid, timeout }) timedOut = true void kill() }, timeout + 100) + const started = Date.now() + + const callID = ctx.callID + if (callID) { + active.set(callID, { + pid: proc.pid!, + timeout, + started, + kill: () => Shell.killTree(proc, { exited: () => exited }), + done: () => {}, + }) + } + await new Promise((resolve, reject) => { + let resolved = false + const cleanup = () => { + if (resolved) return + resolved = true clearTimeout(timeoutTimer) + clearInterval(poll) ctx.abort.removeEventListener("abort", abortHandler) + proc.stdout?.removeListener("end", check) + proc.stderr?.removeListener("end", check) } - proc.once("exit", () => { + const done = () => { + if (resolved) return exited = true cleanup() resolve() - }) + } + + // Update the active entry with the real done callback + if (callID) { + const entry = active.get(callID) + if (entry) { + entry.done = () => { + if (resolved) return + exited = true + cleanup() + resolve() + } + } + } - proc.once("error", (error) => { + const fail = (error: Error) => { + if (resolved) return exited = true cleanup() reject(error) + } + + proc.once("exit", () => { + log.info("process exit detected via 'exit' event", { pid: proc.pid, exitCode: proc.exitCode }) + done() + }) + proc.once("close", () => { + log.info("process exit detected via 'close' event", { pid: proc.pid, exitCode: proc.exitCode }) + done() }) + proc.once("error", fail) + + // Redundancy: stdio end events fire when pipe file descriptors close + // independent of process exit monitoring — catches missed exit events + let streams = 0 + const total = (proc.stdout ? 1 : 0) + (proc.stderr ? 1 : 0) + const check = () => { + streams++ + if (streams < total) return + if (proc.exitCode !== null || proc.signalCode !== null) { + log.info("stdio end detected exit (exitCode already set)", { + pid: proc.pid, + exitCode: proc.exitCode, + }) + done() + return + } + setTimeout(() => { + log.info("stdio end deferred check", { + pid: proc.pid, + exitCode: proc.exitCode, + }) + done() + }, 50) + } + proc.stdout?.once("end", check) + proc.stderr?.once("end", check) + + // Polling watchdog: detect process exit when Bun's event loop + // fails to deliver the "exit" event (confirmed Bun bug in containers) + const poll = setInterval(() => { + if (proc.exitCode !== null || proc.signalCode !== null) { + log.info("polling watchdog detected exit via exitCode/signalCode", { + exitCode: proc.exitCode, + signalCode: proc.signalCode, + }) + done() + return + } + + // Check 2: process.kill(pid, 0) throws ESRCH if process is dead + if (proc.pid && process.platform !== "win32") { + try { + process.kill(proc.pid, 0) + } catch { + log.info("polling watchdog detected exit via kill(0) ESRCH", { + pid: proc.pid, + }) + done() + return + } + } + }, 1000) + }) + + if (callID) active.delete(callID) + + log.info("process completed", { + pid: proc.pid, + exitCode: proc.exitCode, + duration: Date.now() - started, + timedOut, + aborted, }) const resultMetadata: string[] = [] diff --git a/packages/opencode/src/util/process.ts b/packages/opencode/src/util/process.ts index 71f001a86a1..e663ca188e1 100644 --- a/packages/opencode/src/util/process.ts +++ b/packages/opencode/src/util/process.ts @@ -74,20 +74,52 @@ export namespace Process { } const exited = new Promise((resolve, reject) => { - const done = () => { + let resolved = false + + const cleanup = () => { + if (resolved) return + resolved = true opts.abort?.removeEventListener("abort", abort) if (timer) clearTimeout(timer) + clearInterval(poll) + } + + const finish = (code: number) => { + if (resolved) return + cleanup() + resolve(code) + } + + const fail = (error: Error) => { + if (resolved) return + cleanup() + reject(error) } proc.once("exit", (code, signal) => { - done() - resolve(code ?? (signal ? 1 : 0)) + finish(code ?? (signal ? 1 : 0)) }) - proc.once("error", (error) => { - done() - reject(error) + proc.once("close", (code, signal) => { + finish(code ?? (signal ? 1 : 0)) }) + + proc.once("error", fail) + const poll = setInterval(() => { + if (proc.exitCode !== null || proc.signalCode !== null) { + finish(proc.exitCode ?? (proc.signalCode ? 1 : 0)) + return + } + + if (proc.pid && process.platform !== "win32") { + try { + process.kill(proc.pid, 0) + } catch { + finish(proc.exitCode ?? 1) + return + } + } + }, 1000) }) if (opts.abort) { diff --git a/packages/opencode/test/tool/bash.test.ts b/packages/opencode/test/tool/bash.test.ts index ac93016927a..ea23a04b37f 100644 --- a/packages/opencode/test/tool/bash.test.ts +++ b/packages/opencode/test/tool/bash.test.ts @@ -1,12 +1,14 @@ import { describe, expect, test } from "bun:test" import os from "os" import path from "path" -import { BashTool } from "../../src/tool/bash" +import { BashTool, stale, reap } from "../../src/tool/bash" import { Instance } from "../../src/project/instance" import { Filesystem } from "../../src/util/filesystem" import { tmpdir } from "../fixture/fixture" import type { PermissionNext } from "../../src/permission/next" import { Truncate } from "../../src/tool/truncation" +import { Shell } from "../../src/shell/shell" +import { spawn } from "child_process" const ctx = { sessionID: "test", @@ -313,7 +315,7 @@ describe("tool.bash permissions", () => { }) }) -describe("tool.bash truncation", () => { +describe.skipIf(process.platform === "win32")("tool.bash truncation", () => { test("truncates output exceeding line limit", async () => { await Instance.provide({ directory: projectRoot, @@ -400,3 +402,402 @@ describe("tool.bash truncation", () => { }) }) }) + +describe("tool.bash defensive patterns", () => { + test("completes normally with polling active", async () => { + await Instance.provide({ + directory: projectRoot, + fn: async () => { + const bash = await BashTool.init() + const result = await bash.execute( + { command: "echo 'quick'", description: "Quick echo" }, + ctx, + ) + expect(result.metadata.exit).toBe(0) + expect(result.metadata.output).toContain("quick") + }, + }) + }) + + test("resolves within polling interval for fast commands", async () => { + await Instance.provide({ + directory: projectRoot, + fn: async () => { + const bash = await BashTool.init() + const start = Date.now() + const result = await bash.execute( + { command: "echo 'fast'", description: "Fast echo" }, + ctx, + ) + const elapsed = Date.now() - start + expect(result.metadata.exit).toBe(0) + expect(result.metadata.output).toContain("fast") + expect(elapsed).toBeLessThan(3000) + }, + }) + }) + + test.skipIf(process.platform === "win32")("handles long-running command that completes", async () => { + await Instance.provide({ + directory: projectRoot, + fn: async () => { + const bash = await BashTool.init() + const result = await bash.execute( + { command: "sleep 2 && echo done", description: "Sleep then echo" }, + ctx, + ) + expect(result.metadata.exit).toBe(0) + expect(result.metadata.output).toContain("done") + }, + }) + }) + + test("resolves when process exits normally (exit event path)", async () => { + await Instance.provide({ + directory: projectRoot, + fn: async () => { + const bash = await BashTool.init() + const result = await bash.execute( + { command: "echo 'test'", description: "Exit event test" }, + ctx, + ) + expect(result.metadata.exit).toBe(0) + }, + }) + }) + + test("does not double-resolve for normal execution", async () => { + await Instance.provide({ + directory: projectRoot, + fn: async () => { + const bash = await BashTool.init() + let count = 0 + const result = await bash.execute( + { command: "echo 'once'", description: "Single resolve test" }, + ctx, + ) + count++ + expect(count).toBe(1) + expect(result.metadata.exit).toBe(0) + expect(result.metadata.output).toContain("once") + }, + }) + }) + + test("spawns process with valid pid", async () => { + await Instance.provide({ + directory: projectRoot, + fn: async () => { + const bash = await BashTool.init() + const result = await bash.execute( + { command: "echo 'pid-test'", description: "Pid validation test" }, + ctx, + ) + expect(result.metadata.exit).toBe(0) + }, + }) + }) + + test("handles invalid command gracefully", async () => { + await Instance.provide({ + directory: projectRoot, + fn: async () => { + const bash = await BashTool.init() + const result = await bash.execute( + { command: "/nonexistent/binary/xyz", description: "Invalid command" }, + ctx, + ) + expect(result.metadata.exit).not.toBe(0) + }, + }) + }) + + test.skipIf(process.platform === "win32")("times out long-running command", async () => { + await Instance.provide({ + directory: projectRoot, + fn: async () => { + const bash = await BashTool.init() + const result = await bash.execute( + { command: "sleep 60", timeout: 1000, description: "Long sleep" }, + ctx, + ) + expect(result.output).toContain("timeout") + }, + }) + }) + + test.skipIf(process.platform === "win32")("abort signal kills process", async () => { + await Instance.provide({ + directory: projectRoot, + fn: async () => { + const bash = await BashTool.init() + const controller = new AbortController() + setTimeout(() => controller.abort(), 500) + const result = await bash.execute( + { command: "sleep 60", description: "Abortable sleep" }, + { ...ctx, abort: controller.signal }, + ) + expect(result.output).toContain("abort") + }, + }) + }) + + test("cleanup clears both timeout and polling interval", async () => { + await Instance.provide({ + directory: projectRoot, + fn: async () => { + const bash = await BashTool.init() + const result = await bash.execute( + { command: "echo 'cleanup'", description: "Cleanup test" }, + ctx, + ) + expect(result.metadata.exit).toBe(0) + // If cleanup failed, lingering timers would keep the process alive + // and this test would time out. Completing is the assertion. + }, + }) + }) +}) + +// Prove polling watchdog detects exit without exit/close events +// (simulates Bun bug where events are dropped in containers) +describe.skipIf(process.platform === "win32")("polling watchdog isolation", () => { + test("resolves via polling when exit/close events are suppressed", async () => { + const proc = spawn("echo", ["hello"], { + shell: true, + stdio: ["ignore", "pipe", "pipe"], + detached: process.platform !== "win32", + }) + + let output = "" + proc.stdout?.on("data", (chunk: Buffer) => { + output += chunk.toString() + }) + + // Wait for process to finish — but deliberately do NOT use exit/close events + await Bun.sleep(500) + + const detected = await new Promise((resolve, reject) => { + const timer = setTimeout( + () => reject(new Error("polling watchdog failed to detect exit within 3s")), + 3000, + ) + + const poll = setInterval(() => { + if (proc.exitCode !== null || proc.signalCode !== null) { + clearInterval(poll) + clearTimeout(timer) + resolve("exitCode") + return + } + if (proc.pid) { + try { + process.kill(proc.pid, 0) + } catch { + clearInterval(poll) + clearTimeout(timer) + resolve("kill-esrch") + return + } + } + }, 200) + }) + + expect(["exitCode", "kill-esrch"]).toContain(detected) + expect(output.trim()).toBe("hello") + }) + + test("resolves via polling for process that exits with non-zero code", async () => { + const proc = spawn("exit 1", [], { + shell: true, + stdio: ["ignore", "pipe", "pipe"], + detached: process.platform !== "win32", + }) + + await Bun.sleep(500) + + const detected = await new Promise((resolve, reject) => { + const timer = setTimeout( + () => reject(new Error("polling watchdog failed to detect exit within 3s")), + 3000, + ) + + const poll = setInterval(() => { + if (proc.exitCode !== null || proc.signalCode !== null) { + clearInterval(poll) + clearTimeout(timer) + resolve("exitCode") + return + } + if (proc.pid) { + try { + process.kill(proc.pid, 0) + } catch { + clearInterval(poll) + clearTimeout(timer) + resolve("kill-esrch") + return + } + } + }, 200) + }) + + expect(["exitCode", "kill-esrch"]).toContain(detected) + }) + + test("resolves via polling for killed process (simulates timeout kill)", async () => { + const proc = spawn("sleep 60", [], { + shell: true, + stdio: ["ignore", "pipe", "pipe"], + detached: process.platform !== "win32", + }) + + expect(proc.pid).toBeDefined() + + // Kill the process (simulates what timeout/abort does) + try { + process.kill(-proc.pid!, "SIGKILL") + } catch { + proc.kill("SIGKILL") + } + + await Bun.sleep(500) + + const detected = await new Promise((resolve, reject) => { + const timer = setTimeout( + () => reject(new Error("polling watchdog failed to detect killed process within 3s")), + 3000, + ) + + const poll = setInterval(() => { + if (proc.exitCode !== null || proc.signalCode !== null) { + clearInterval(poll) + clearTimeout(timer) + resolve("exitCode") + return + } + if (proc.pid) { + try { + process.kill(proc.pid, 0) + } catch { + clearInterval(poll) + clearTimeout(timer) + resolve("kill-esrch") + return + } + } + }, 200) + }) + + expect(["exitCode", "kill-esrch"]).toContain(detected) + }) +}) + +describe.skipIf(process.platform === "win32")("shell.killTree", () => { + test("terminates a running process", async () => { + const proc = spawn("sleep", ["60"], { detached: true }) + expect(proc.pid).toBeDefined() + await Shell.killTree(proc) + await Bun.sleep(100) + expect(() => process.kill(proc.pid!, 0)).toThrow() + }) + + test("handles already-dead process", async () => { + const proc = spawn("echo", ["done"]) + await new Promise((resolve) => proc.once("exit", () => resolve())) + await Shell.killTree(proc, { exited: () => true }) + }) + + test("escalates to SIGKILL when SIGTERM ignored", async () => { + const proc = spawn("bash", ["-c", "trap '' TERM; sleep 60"], { detached: true }) + expect(proc.pid).toBeDefined() + await Shell.killTree(proc) + await Bun.sleep(100) + expect(() => process.kill(proc.pid!, 0)).toThrow() + }) +}) + +describe("tool.bash diagnostic logging", () => { + test("bash tool works with diagnostic logging", async () => { + await Instance.provide({ + directory: projectRoot, + fn: async () => { + const bash = await BashTool.init() + const result = await bash.execute( + { command: "echo 'log-test'", description: "Logging test" }, + ctx, + ) + expect(result.metadata.exit).toBe(0) + expect(result.metadata.output).toContain("log-test") + }, + }) + }) +}) + +describe.skipIf(process.platform === "win32")("server-level watchdog", () => { + test("stale returns empty when no processes are registered", () => { + const ids = stale() + expect(ids).toEqual([]) + }) + + test("reap force-completes a stuck bash process", async () => { + await Instance.provide({ + directory: projectRoot, + fn: async () => { + const bash = await BashTool.init() + const id = "test-reap-" + Date.now() + const promise = bash.execute( + { command: "sleep 60", description: "Stuck process for reap test" }, + { ...ctx, callID: id }, + ) + + await Bun.sleep(300) + + reap(id) + + // The promise should now resolve (not hang forever) + const result = await promise + expect(result).toBeDefined() + expect(result.output).toBeDefined() + }, + }) + }) + + test("reap is a no-op for unknown callID", () => { + reap("nonexistent-id-" + Date.now()) + }) +}) + +describe.skipIf(process.platform === "win32")("stdio end events", () => { + test("command with stdout output completes via stdio path", async () => { + await Instance.provide({ + directory: projectRoot, + fn: async () => { + const bash = await BashTool.init() + const result = await bash.execute( + { command: "seq 1 100", description: "Generate numbered output" }, + ctx, + ) + expect(result.metadata.exit).toBe(0) + expect(result.metadata.output).toContain("1") + expect(result.metadata.output).toContain("100") + }, + }) + }) + + test("command with both stdout and stderr completes", async () => { + await Instance.provide({ + directory: projectRoot, + fn: async () => { + const bash = await BashTool.init() + const result = await bash.execute( + { command: "echo out && echo err >&2", description: "Both streams" }, + ctx, + ) + expect(result.metadata.exit).toBe(0) + expect(result.metadata.output).toContain("out") + expect(result.metadata.output).toContain("err") + }, + }) + }) +}) diff --git a/packages/opencode/test/util/process.test.ts b/packages/opencode/test/util/process.test.ts index ce599d6d8f0..4414b1add47 100644 --- a/packages/opencode/test/util/process.test.ts +++ b/packages/opencode/test/util/process.test.ts @@ -57,3 +57,39 @@ describe("util.process", () => { expect(Date.now() - started).toBeLessThan(1000) }, 3000) }) + +describe("util.process defensive patterns", () => { + test("Process.run completes normally", async () => { + const result = await Process.run(node('process.stdout.write("hello")')) + expect(result.code).toBe(0) + expect(result.stdout.toString()).toContain("hello") + }) + + test("Process.run handles failing command", async () => { + expect(Process.run(node("process.exit(1)"))).rejects.toThrow() + }) + + test("Process.run with nothrow returns non-zero code", async () => { + const result = await Process.run(node("process.exit(1)"), { nothrow: true }) + expect(result.code).not.toBe(0) + }) + + test("Process.spawn returns valid exited promise", async () => { + const proc = Process.spawn(node('process.stdout.write("test")'), { stdout: "pipe" }) + const code = await proc.exited + expect(code).toBe(0) + }) + + test("Process.spawn abort kills process", async () => { + const controller = new AbortController() + const proc = Process.spawn(node("setInterval(() => {}, 60000)"), { abort: controller.signal }) + setTimeout(() => controller.abort(), 200) + const code = await proc.exited + expect(typeof code).toBe("number") + }) + + test("Process.run completes for fast commands", async () => { + const result = await Process.run(node("process.exit(0)")) + expect(result.code).toBe(0) + }) +})