From 4f23c669e1feac745800e142df525feedb1a61a0 Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Mon, 18 May 2026 18:49:04 -0400 Subject: [PATCH 1/2] Drop pnpm from CMD so Node is PID 1 and SIGTERM reaches its handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous attempt (prepending `exec` to CMD/ts-node) was insufficient: after that change, staging logs still show zero `Shutting down server for worker manager...` / `Stopping N worker(s)...` / `Draining reservations...` lines on rolling deploys, and zero reservations end up with completion_reason='interrupted'. The dying workers' last lines are pnpm's `ERR_PNPM_RECURSIVE_RUN_FIRST_FAIL` and `Command failed with signal "SIGTERM"` — pnpm is intercepting SIGTERM and not forwarding it to Node, so the registered SIGTERM handler in worker-manager.ts never fires and in-flight reservations stay locked until the 2h lease ages out. Change the four deployed Dockerfile CMDs from exec pnpm --filter "./packages/realm-server" $_script to exec /realm-server/packages/realm-server/$_script and switch the `*_script` build-args from npm script names to script paths relative to `packages/realm-server/`. The start scripts already `exec` into ts-node, so removing the pnpm wrapper makes Node the direct child of sh, and Node becomes PID 1 once the script execs. Because the scripts no longer inherit CWD/PATH from `pnpm --filter`, each script now `cd`s into the realm-server package directory and prepends `./node_modules/.bin` to PATH (matching what `pnpm --filter` set up implicitly). Independent insurance — keep regardless of whether the fix lands cleanly: Layer 1 (start scripts): each of the 7 deployed start scripts now writes `[] pid=… ppid=… about to exec ts-node` to stderr immediately before `exec ts-node`. Proves the shell got far enough to hand off. Layer 2 (Node): each of the 5 deployed Node entry files now writes a `[] STARTUP pid=… ppid=… argv=…` line to stderr synchronously near the top of the file, and each SIGTERM/SIGINT/disconnect/uncaughtException handler writes `[] received pid=… ppid=…` synchronously as its first action — before calling the existing shutdown function. The writes use `process.stderr.write` (not the logger) so the line lands even if Node terminates milliseconds later. If after a deploy we see STARTUP but no SIGTERM received, signal delivery is broken upstream of Node and we know to look elsewhere; if we see SIGTERM received but no draining log, the shutdown path itself is wedged. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/manual-deploy.yml | 8 ++--- packages/realm-server/main.ts | 23 ++++++++++++-- .../realm-server/prerender-manager.Dockerfile | 2 +- packages/realm-server/prerender.Dockerfile | 2 +- .../realm-server/prerender/manager-server.ts | 23 ++++++++++++-- .../prerender/prerender-server.ts | 23 ++++++++++++-- packages/realm-server/realm-server.Dockerfile | 2 +- .../scripts/start-prerender-manager.sh | 11 +++++++ .../scripts/start-prerender-production.sh | 11 +++++++ .../scripts/start-prerender-staging.sh | 11 +++++++ .../realm-server/scripts/start-production.sh | 11 +++++++ .../realm-server/scripts/start-staging.sh | 11 +++++++ .../scripts/start-worker-production.sh | 11 +++++++ .../scripts/start-worker-staging.sh | 11 +++++++ packages/realm-server/worker-manager.ts | 21 +++++++++++++ packages/realm-server/worker.Dockerfile | 2 +- packages/realm-server/worker.ts | 30 +++++++++++++++++-- 17 files changed, 196 insertions(+), 17 deletions(-) diff --git a/.github/workflows/manual-deploy.yml b/.github/workflows/manual-deploy.yml index 764ba6f213..e6344940b9 100644 --- a/.github/workflows/manual-deploy.yml +++ b/.github/workflows/manual-deploy.yml @@ -180,7 +180,7 @@ jobs: environment: ${{ inputs.environment }} dockerfile: "packages/realm-server/realm-server.Dockerfile" build-args: | - "realm_server_script=start:${{ inputs.environment }}" + "realm_server_script=scripts/start-${{ inputs.environment }}.sh" build-prerender-manager: name: Build prerender manager Docker image @@ -199,7 +199,7 @@ jobs: environment: ${{ inputs.environment }} dockerfile: "packages/realm-server/prerender-manager.Dockerfile" build-args: | - "prerender_manager_script=start:prerender-manager" + "prerender_manager_script=scripts/start-prerender-manager.sh" build-prerender: name: Build prerender Docker image @@ -218,7 +218,7 @@ jobs: environment: ${{ inputs.environment }} dockerfile: "packages/realm-server/prerender.Dockerfile" build-args: | - "prerender_script=start:prerender-${{ inputs.environment }}" + "prerender_script=scripts/start-prerender-${{ inputs.environment }}.sh" build-worker: name: Build worker Docker image @@ -237,7 +237,7 @@ jobs: environment: ${{ inputs.environment }} dockerfile: "packages/realm-server/worker.Dockerfile" build-args: | - "worker_script=start:worker-${{ inputs.environment }}" + "worker_script=scripts/start-worker-${{ inputs.environment }}.sh" build-pg-migration: name: Build pg-migration Docker image diff --git a/packages/realm-server/main.ts b/packages/realm-server/main.ts index cd5a2a0bcd..57cc8c7b57 100644 --- a/packages/realm-server/main.ts +++ b/packages/realm-server/main.ts @@ -43,6 +43,15 @@ import { ModuleCacheCoordinator } from './lib/module-cache-coordination'; import { resolveFullIndexOnStartup } from './lib/full-index-on-startup'; import { PUBLISHED_DIRECTORY_NAME } from '@cardstack/runtime-common'; +// Synchronous stderr stamp so we have proof the Node process actually +// started (and at what pid/ppid) independent of the logger pipeline. +// Used to triage cases where SIGTERM doesn't reach Node — if we see +// STARTUP but never SIGTERM received, signal delivery is broken +// upstream of Node. +process.stderr.write( + `[realm-server] STARTUP pid=${process.pid} ppid=${process.ppid} argv=${JSON.stringify(process.argv)}\n`, +); + let log = logger('main'); const runtimeMetadataFile = process.env.TEST_HARNESS_REALM_SERVER_METADATA_FILE; @@ -596,8 +605,18 @@ const getIndexHTML = async () => { // orchestrators trigger graceful cleanup (close httpServer, unsubscribe // realm watchers, drain queue + DB) instead of leaking the open // handles into a SIGKILL escalation. - process.on('SIGTERM', () => stopRealmServer(false)); - process.on('SIGINT', () => stopRealmServer(false)); + process.on('SIGTERM', () => { + process.stderr.write( + `[realm-server] SIGTERM received pid=${process.pid} ppid=${process.ppid}\n`, + ); + stopRealmServer(false); + }); + process.on('SIGINT', () => { + process.stderr.write( + `[realm-server] SIGINT received pid=${process.pid} ppid=${process.ppid}\n`, + ); + stopRealmServer(false); + }); process.on('message', (message) => { if (message === 'stop') { stopRealmServer(true); diff --git a/packages/realm-server/prerender-manager.Dockerfile b/packages/realm-server/prerender-manager.Dockerfile index 83759ba49c..011efb8e5b 100644 --- a/packages/realm-server/prerender-manager.Dockerfile +++ b/packages/realm-server/prerender-manager.Dockerfile @@ -25,4 +25,4 @@ EXPOSE 4222 HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \ CMD curl --fail --silent --show-error --max-time 5 --output /dev/null http://localhost:4222/ || exit 1 -CMD exec pnpm --filter "./packages/realm-server" $prerender_manager_script +CMD exec /realm-server/packages/realm-server/$prerender_manager_script diff --git a/packages/realm-server/prerender.Dockerfile b/packages/realm-server/prerender.Dockerfile index eca73ad154..7be58b3440 100644 --- a/packages/realm-server/prerender.Dockerfile +++ b/packages/realm-server/prerender.Dockerfile @@ -79,4 +79,4 @@ EXPOSE 4221 HEALTHCHECK --interval=30s --timeout=5s --start-period=60s --retries=3 \ CMD curl --fail --silent --show-error --max-time 5 --output /dev/null http://localhost:4221/ || exit 1 -CMD exec pnpm --filter "./packages/realm-server" $prerender_script +CMD exec /realm-server/packages/realm-server/$prerender_script diff --git a/packages/realm-server/prerender/manager-server.ts b/packages/realm-server/prerender/manager-server.ts index d7306430cb..6c84976386 100644 --- a/packages/realm-server/prerender/manager-server.ts +++ b/packages/realm-server/prerender/manager-server.ts @@ -11,6 +11,15 @@ import { registerService, } from '../lib/dev-service-registry'; +// Synchronous stderr stamp so we have proof the Node process actually +// started (and at what pid/ppid) independent of the logger pipeline. +// Used to triage cases where SIGTERM doesn't reach Node — if we see +// STARTUP but never SIGTERM received, signal delivery is broken +// upstream of Node. +process.stderr.write( + `[prerender-manager] STARTUP pid=${process.pid} ppid=${process.ppid} argv=${JSON.stringify(process.argv)}\n`, +); + let log = logger('prerender-manager'); let { port, exitOnSignal, forceExitTimeoutMs } = yargs(process.argv.slice(2)) @@ -86,5 +95,15 @@ function shutdown(signal: NodeJS.Signals) { // process manager to terminate after grace period. } -process.on('SIGINT', () => shutdown('SIGINT')); -process.on('SIGTERM', () => shutdown('SIGTERM')); +process.on('SIGINT', () => { + process.stderr.write( + `[prerender-manager] SIGINT received pid=${process.pid} ppid=${process.ppid}\n`, + ); + shutdown('SIGINT'); +}); +process.on('SIGTERM', () => { + process.stderr.write( + `[prerender-manager] SIGTERM received pid=${process.pid} ppid=${process.ppid}\n`, + ); + shutdown('SIGTERM'); +}); diff --git a/packages/realm-server/prerender/prerender-server.ts b/packages/realm-server/prerender/prerender-server.ts index 383dc0ad54..7d92380684 100644 --- a/packages/realm-server/prerender/prerender-server.ts +++ b/packages/realm-server/prerender/prerender-server.ts @@ -6,6 +6,15 @@ import yargs from 'yargs'; import type { Server } from 'http'; import { createPrerenderHttpServer } from './prerender-app'; +// Synchronous stderr stamp so we have proof the Node process actually +// started (and at what pid/ppid) independent of the logger pipeline. +// Used to triage cases where SIGTERM doesn't reach Node — if we see +// STARTUP but never SIGTERM received, signal delivery is broken +// upstream of Node. +process.stderr.write( + `[prerender-server] STARTUP pid=${process.pid} ppid=${process.ppid} argv=${JSON.stringify(process.argv)}\n`, +); + let log = logger('prerender-server'); let { port, count } = yargs(process.argv.slice(2)) @@ -60,5 +69,15 @@ function shutdown() { } } -process.on('SIGINT', shutdown); -process.on('SIGTERM', shutdown); +process.on('SIGINT', () => { + process.stderr.write( + `[prerender-server] SIGINT received pid=${process.pid} ppid=${process.ppid}\n`, + ); + shutdown(); +}); +process.on('SIGTERM', () => { + process.stderr.write( + `[prerender-server] SIGTERM received pid=${process.pid} ppid=${process.ppid}\n`, + ); + shutdown(); +}); diff --git a/packages/realm-server/realm-server.Dockerfile b/packages/realm-server/realm-server.Dockerfile index a78760eb77..de4730fa18 100644 --- a/packages/realm-server/realm-server.Dockerfile +++ b/packages/realm-server/realm-server.Dockerfile @@ -22,4 +22,4 @@ RUN CI=1 pnpm install -r --offline EXPOSE 3000 -CMD exec pnpm --filter "./packages/realm-server" $realm_server_script +CMD exec /realm-server/packages/realm-server/$realm_server_script diff --git a/packages/realm-server/scripts/start-prerender-manager.sh b/packages/realm-server/scripts/start-prerender-manager.sh index c32d46a3fb..962bc14e06 100755 --- a/packages/realm-server/scripts/start-prerender-manager.sh +++ b/packages/realm-server/scripts/start-prerender-manager.sh @@ -2,6 +2,17 @@ # Start the prerender manager in staging +# Run from the realm-server package directory so ts-node finds the package's +# tsconfig. Previously the CMD wrapped pnpm --filter, which set CWD for us; +# now PID 1 execs into this script directly so we set it ourselves. The PATH +# prepend gives us the local ts-node binary that `pnpm --filter` used to put +# on PATH automatically. +cd "$(cd "$(dirname "$0")" && pwd)/.." +PATH="./node_modules/.bin:$PATH" +export PATH + +echo "[start-prerender-manager] pid=$$ ppid=$PPID about to exec ts-node at $(date -Iseconds)" >&2 + NODE_ENV=production \ NODE_NO_WARNINGS=1 \ exec ts-node \ diff --git a/packages/realm-server/scripts/start-prerender-production.sh b/packages/realm-server/scripts/start-prerender-production.sh index 0173ea5277..48c0ffe200 100755 --- a/packages/realm-server/scripts/start-prerender-production.sh +++ b/packages/realm-server/scripts/start-prerender-production.sh @@ -3,6 +3,17 @@ # Start the prerender server in production # Expects REALM_SECRET_SEED to be set in the environment +# Run from the realm-server package directory so ts-node finds the package's +# tsconfig. Previously the CMD wrapped pnpm --filter, which set CWD for us; +# now PID 1 execs into this script directly so we set it ourselves. The PATH +# prepend gives us the local ts-node binary that `pnpm --filter` used to put +# on PATH automatically. +cd "$(cd "$(dirname "$0")" && pwd)/.." +PATH="./node_modules/.bin:$PATH" +export PATH + +echo "[start-prerender-production] pid=$$ ppid=$PPID about to exec ts-node at $(date -Iseconds)" >&2 + NODE_ENV=production \ NODE_NO_WARNINGS=1 \ BOXEL_HOST_URL=https://app.boxel.ai \ diff --git a/packages/realm-server/scripts/start-prerender-staging.sh b/packages/realm-server/scripts/start-prerender-staging.sh index eec3190368..c93d724a62 100755 --- a/packages/realm-server/scripts/start-prerender-staging.sh +++ b/packages/realm-server/scripts/start-prerender-staging.sh @@ -3,6 +3,17 @@ # Start the prerender server in staging # Expects REALM_SECRET_SEED to be set in the environment +# Run from the realm-server package directory so ts-node finds the package's +# tsconfig. Previously the CMD wrapped pnpm --filter, which set CWD for us; +# now PID 1 execs into this script directly so we set it ourselves. The PATH +# prepend gives us the local ts-node binary that `pnpm --filter` used to put +# on PATH automatically. +cd "$(cd "$(dirname "$0")" && pwd)/.." +PATH="./node_modules/.bin:$PATH" +export PATH + +echo "[start-prerender-staging] pid=$$ ppid=$PPID about to exec ts-node at $(date -Iseconds)" >&2 + NODE_ENV=production \ NODE_NO_WARNINGS=1 \ BOXEL_HOST_URL=https://realms-staging.stack.cards \ diff --git a/packages/realm-server/scripts/start-production.sh b/packages/realm-server/scripts/start-production.sh index bdc590c10c..4fb9296138 100755 --- a/packages/realm-server/scripts/start-production.sh +++ b/packages/realm-server/scripts/start-production.sh @@ -1,5 +1,14 @@ #! /bin/sh SCRIPTS_DIR="$(cd "$(dirname "$0")" && pwd)" +# Run from the realm-server package directory so `pnpm setup:*` resolves the +# package's npm scripts (the relative `../base/` paths inside those scripts +# depend on this CWD) and ts-node can find the package's tsconfig. Previously +# the CMD wrapped pnpm --filter, which set CWD for us; now PID 1 execs into +# this script directly so we set it ourselves. The PATH prepend gives us the +# local ts-node binary that `pnpm --filter` used to put on PATH automatically. +cd "$SCRIPTS_DIR/.." +PATH="./node_modules/.bin:$PATH" +export PATH pnpm setup:base-in-deployment pnpm setup:experiments-in-deployment pnpm setup:catalog-in-deployment @@ -19,6 +28,8 @@ SOFTWARE_FACTORY_REALM_URL="${RESOLVED_SOFTWARE_FACTORY_REALM_URL:-$DEFAULT_SOFT DEFAULT_BOXEL_HOMEPAGE_REALM_URL='https://app.boxel.ai/boxel-homepage/' BOXEL_HOMEPAGE_REALM_URL="${RESOLVED_BOXEL_HOMEPAGE_REALM_URL:-$DEFAULT_BOXEL_HOMEPAGE_REALM_URL}" +echo "[start-production] pid=$$ ppid=$PPID about to exec ts-node at $(date -Iseconds)" >&2 + NODE_NO_WARNINGS=1 \ LOW_CREDIT_THRESHOLD=2000 \ MATRIX_URL=https://matrix.boxel.ai \ diff --git a/packages/realm-server/scripts/start-staging.sh b/packages/realm-server/scripts/start-staging.sh index 381aa5f498..e12eb23666 100755 --- a/packages/realm-server/scripts/start-staging.sh +++ b/packages/realm-server/scripts/start-staging.sh @@ -1,5 +1,14 @@ #! /bin/sh SCRIPTS_DIR="$(cd "$(dirname "$0")" && pwd)" +# Run from the realm-server package directory so `pnpm setup:*` resolves the +# package's npm scripts (the relative `../base/` paths inside those scripts +# depend on this CWD) and ts-node can find the package's tsconfig. Previously +# the CMD wrapped pnpm --filter, which set CWD for us; now PID 1 execs into +# this script directly so we set it ourselves. The PATH prepend gives us the +# local ts-node binary that `pnpm --filter` used to put on PATH automatically. +cd "$SCRIPTS_DIR/.." +PATH="./node_modules/.bin:$PATH" +export PATH pnpm setup:base-in-deployment pnpm setup:experiments-in-deployment pnpm setup:catalog-in-deployment @@ -19,6 +28,8 @@ SOFTWARE_FACTORY_REALM_URL="${RESOLVED_SOFTWARE_FACTORY_REALM_URL:-$DEFAULT_SOFT DEFAULT_BOXEL_HOMEPAGE_REALM_URL='https://realms-staging.stack.cards/boxel-homepage/' BOXEL_HOMEPAGE_REALM_URL="${RESOLVED_BOXEL_HOMEPAGE_REALM_URL:-$DEFAULT_BOXEL_HOMEPAGE_REALM_URL}" +echo "[start-staging] pid=$$ ppid=$PPID about to exec ts-node at $(date -Iseconds)" >&2 + NODE_NO_WARNINGS=1 \ LOW_CREDIT_THRESHOLD=2000 \ MATRIX_URL=https://matrix-staging.stack.cards \ diff --git a/packages/realm-server/scripts/start-worker-production.sh b/packages/realm-server/scripts/start-worker-production.sh index dea4715119..0e1503f011 100755 --- a/packages/realm-server/scripts/start-worker-production.sh +++ b/packages/realm-server/scripts/start-worker-production.sh @@ -1,10 +1,21 @@ #! /bin/sh +# Run from the realm-server package directory so ts-node finds the package's +# tsconfig. Previously the CMD wrapped pnpm --filter, which set CWD for us; +# now PID 1 execs into this script directly so we set it ourselves. The PATH +# prepend gives us the local ts-node binary that `pnpm --filter` used to put +# on PATH automatically. +cd "$(cd "$(dirname "$0")" && pwd)/.." +PATH="./node_modules/.bin:$PATH" +export PATH + DEFAULT_CATALOG_REALM_URL='https://app.boxel.ai/catalog/' CATALOG_REALM_URL="${RESOLVED_CATALOG_REALM_URL:-$DEFAULT_CATALOG_REALM_URL}" DEFAULT_SOFTWARE_FACTORY_REALM_URL='https://app.boxel.ai/software-factory/' SOFTWARE_FACTORY_REALM_URL="${RESOLVED_SOFTWARE_FACTORY_REALM_URL:-$DEFAULT_SOFTWARE_FACTORY_REALM_URL}" +echo "[start-worker-production] pid=$$ ppid=$PPID about to exec ts-node at $(date -Iseconds)" >&2 + NODE_NO_WARNINGS=1 \ NODE_OPTIONS="${NODE_OPTIONS:---max-old-space-size=4096}" \ REALM_SERVER_MATRIX_USERNAME=realm_server \ diff --git a/packages/realm-server/scripts/start-worker-staging.sh b/packages/realm-server/scripts/start-worker-staging.sh index 212622dcf7..74754dc1f2 100755 --- a/packages/realm-server/scripts/start-worker-staging.sh +++ b/packages/realm-server/scripts/start-worker-staging.sh @@ -1,10 +1,21 @@ #! /bin/sh +# Run from the realm-server package directory so ts-node finds the package's +# tsconfig. Previously the CMD wrapped pnpm --filter, which set CWD for us; +# now PID 1 execs into this script directly so we set it ourselves. The PATH +# prepend gives us the local ts-node binary that `pnpm --filter` used to put +# on PATH automatically. +cd "$(cd "$(dirname "$0")" && pwd)/.." +PATH="./node_modules/.bin:$PATH" +export PATH + DEFAULT_CATALOG_REALM_URL='https://realms-staging.stack.cards/catalog/' CATALOG_REALM_URL="${RESOLVED_CATALOG_REALM_URL:-$DEFAULT_CATALOG_REALM_URL}" DEFAULT_SOFTWARE_FACTORY_REALM_URL='https://realms-staging.stack.cards/software-factory/' SOFTWARE_FACTORY_REALM_URL="${RESOLVED_SOFTWARE_FACTORY_REALM_URL:-$DEFAULT_SOFTWARE_FACTORY_REALM_URL}" +echo "[start-worker-staging] pid=$$ ppid=$PPID about to exec ts-node at $(date -Iseconds)" >&2 + NODE_NO_WARNINGS=1 \ NODE_OPTIONS="${NODE_OPTIONS:---max-old-space-size=4096}" \ REALM_SERVER_MATRIX_USERNAME=realm_server \ diff --git a/packages/realm-server/worker-manager.ts b/packages/realm-server/worker-manager.ts index 14d61aa79d..0b3bcb4845 100644 --- a/packages/realm-server/worker-manager.ts +++ b/packages/realm-server/worker-manager.ts @@ -22,6 +22,15 @@ const swallowEpipe = (err: NodeJS.ErrnoException) => { process.stdout.on('error', swallowEpipe); process.stderr.on('error', swallowEpipe); +// Synchronous stderr stamp so we have proof the Node process actually +// started (and at what pid/ppid) independent of the logger pipeline. +// Used to triage cases where SIGTERM doesn't reach Node — if we see +// STARTUP but never SIGTERM received, signal delivery is broken +// upstream of Node. +process.stderr.write( + `[worker-manager] STARTUP pid=${process.pid} ppid=${process.ppid} argv=${JSON.stringify(process.argv)}\n`, +); + import { logger, userInitiatedPriority, @@ -538,16 +547,28 @@ function runShutdownCallbacks() { } process.on('SIGINT', () => { + process.stderr.write( + `[worker-manager] SIGINT received pid=${process.pid} ppid=${process.ppid}\n`, + ); shutdown(); }); process.on('SIGTERM', () => { + process.stderr.write( + `[worker-manager] SIGTERM received pid=${process.pid} ppid=${process.ppid}\n`, + ); shutdown(); }); process.on('disconnect', () => { + process.stderr.write( + `[worker-manager] disconnect received pid=${process.pid} ppid=${process.ppid}\n`, + ); log.info(`Parent IPC disconnected, shutting down worker manager...`); shutdown(); }); process.on('uncaughtException', (err) => { + process.stderr.write( + `[worker-manager] uncaughtException pid=${process.pid} ppid=${process.ppid}\n`, + ); log.error(`Uncaught exception in worker manager:`, err); shutdown(); }); diff --git a/packages/realm-server/worker.Dockerfile b/packages/realm-server/worker.Dockerfile index 11a976d330..cd14fa4c0d 100644 --- a/packages/realm-server/worker.Dockerfile +++ b/packages/realm-server/worker.Dockerfile @@ -22,4 +22,4 @@ RUN CI=1 pnpm install -r --offline EXPOSE 3000 -CMD exec pnpm --filter "./packages/realm-server" $worker_script +CMD exec /realm-server/packages/realm-server/$worker_script diff --git a/packages/realm-server/worker.ts b/packages/realm-server/worker.ts index ec24789ffb..519e703f1b 100644 --- a/packages/realm-server/worker.ts +++ b/packages/realm-server/worker.ts @@ -14,6 +14,15 @@ const swallowEpipe = (err: NodeJS.ErrnoException) => { process.stdout.on('error', swallowEpipe); process.stderr.on('error', swallowEpipe); +// Synchronous stderr stamp so we have proof the Node process actually +// started (and at what pid/ppid) independent of the logger pipeline. +// Used to triage cases where SIGTERM doesn't reach Node — if we see +// STARTUP but never SIGTERM received, signal delivery is broken +// upstream of Node. +process.stderr.write( + `[worker] STARTUP pid=${process.pid} ppid=${process.ppid} argv=${JSON.stringify(process.argv)}\n`, +); + import { Worker, VirtualNetwork, @@ -193,9 +202,24 @@ let autoMigrate = migrateDB || undefined; process.exit(1); } }; - process.on('SIGINT', shutdown); - process.on('SIGTERM', shutdown); - process.on('disconnect', shutdown); + process.on('SIGINT', () => { + process.stderr.write( + `[worker] SIGINT received pid=${process.pid} ppid=${process.ppid}\n`, + ); + shutdown(); + }); + process.on('SIGTERM', () => { + process.stderr.write( + `[worker] SIGTERM received pid=${process.pid} ppid=${process.ppid}\n`, + ); + shutdown(); + }); + process.on('disconnect', () => { + process.stderr.write( + `[worker] disconnect received pid=${process.pid} ppid=${process.ppid}\n`, + ); + shutdown(); + }); process.on('message', (message) => { if (message === 'stop') { shutdown(); // warning this is async From 60861660771f7b729b4e8514c1896423f7a7d051 Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Mon, 18 May 2026 19:06:08 -0400 Subject: [PATCH 2/2] Use FD-level writeSync(2,...) for pre-death diagnostic stamps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `process.stderr.write` is libuv-async when stderr is a pipe (the Docker / ECS case for these processes), so writes can be lost if the process exits before libuv flushes — which is the exact scenario the SIGTERM / SIGINT / disconnect / uncaughtException stamps are designed to capture. `writeSync(2, ...)` calls write(2) directly, hands the bytes to the kernel pipe buffer synchronously, and returns only after that completes. Apply this to the STARTUP stamp as well, both for consistency and so an immediate post-startup crash doesn't lose the proof-of-life line. No behavioural change other than the write semantics. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/realm-server/main.ts | 24 ++++++++----- .../realm-server/prerender/manager-server.ts | 24 ++++++++----- .../prerender/prerender-server.ts | 24 ++++++++----- packages/realm-server/worker-manager.ts | 36 +++++++++++++------ packages/realm-server/worker.ts | 27 +++++++++----- 5 files changed, 92 insertions(+), 43 deletions(-) diff --git a/packages/realm-server/main.ts b/packages/realm-server/main.ts index 57cc8c7b57..ae68adf340 100644 --- a/packages/realm-server/main.ts +++ b/packages/realm-server/main.ts @@ -1,6 +1,7 @@ import './instrument'; import './setup-logger'; // This should be first import './lib/wtfnode-on-signal'; +import { writeSync } from 'node:fs'; import { Realm, VirtualNetwork, @@ -43,12 +44,15 @@ import { ModuleCacheCoordinator } from './lib/module-cache-coordination'; import { resolveFullIndexOnStartup } from './lib/full-index-on-startup'; import { PUBLISHED_DIRECTORY_NAME } from '@cardstack/runtime-common'; -// Synchronous stderr stamp so we have proof the Node process actually -// started (and at what pid/ppid) independent of the logger pipeline. -// Used to triage cases where SIGTERM doesn't reach Node — if we see -// STARTUP but never SIGTERM received, signal delivery is broken -// upstream of Node. -process.stderr.write( +// FD-level synchronous stderr write — `writeSync(2, ...)` calls the +// write(2) syscall directly, bypassing Node's stream layer. +// `process.stderr.write` is libuv-async when stderr is a pipe (the +// Docker / ECS case), so it can be lost if the process exits before +// libuv flushes. Stamps that fire just before death need to use the +// FD-level form. Proof the Node process actually started, at what +// pid/ppid, independent of the logger pipeline. +writeSync( + 2, `[realm-server] STARTUP pid=${process.pid} ppid=${process.ppid} argv=${JSON.stringify(process.argv)}\n`, ); @@ -605,14 +609,18 @@ const getIndexHTML = async () => { // orchestrators trigger graceful cleanup (close httpServer, unsubscribe // realm watchers, drain queue + DB) instead of leaking the open // handles into a SIGKILL escalation. + // `writeSync(2, ...)` (FD-level, syscall-synchronous) for the same + // reason as the STARTUP stamp at the top of this file. process.on('SIGTERM', () => { - process.stderr.write( + writeSync( + 2, `[realm-server] SIGTERM received pid=${process.pid} ppid=${process.ppid}\n`, ); stopRealmServer(false); }); process.on('SIGINT', () => { - process.stderr.write( + writeSync( + 2, `[realm-server] SIGINT received pid=${process.pid} ppid=${process.ppid}\n`, ); stopRealmServer(false); diff --git a/packages/realm-server/prerender/manager-server.ts b/packages/realm-server/prerender/manager-server.ts index 6c84976386..df52f0d04e 100644 --- a/packages/realm-server/prerender/manager-server.ts +++ b/packages/realm-server/prerender/manager-server.ts @@ -1,6 +1,7 @@ import '../instrument'; import '../setup-logger'; import '../lib/wtfnode-on-signal'; +import { writeSync } from 'node:fs'; import { logger } from '@cardstack/runtime-common'; import type { Server } from 'http'; import { createServer } from 'http'; @@ -11,12 +12,15 @@ import { registerService, } from '../lib/dev-service-registry'; -// Synchronous stderr stamp so we have proof the Node process actually -// started (and at what pid/ppid) independent of the logger pipeline. -// Used to triage cases where SIGTERM doesn't reach Node — if we see -// STARTUP but never SIGTERM received, signal delivery is broken -// upstream of Node. -process.stderr.write( +// FD-level synchronous stderr write — `writeSync(2, ...)` calls the +// write(2) syscall directly, bypassing Node's stream layer. +// `process.stderr.write` is libuv-async when stderr is a pipe (the +// Docker / ECS case), so it can be lost if the process exits before +// libuv flushes. Stamps that fire just before death need to use the +// FD-level form. Proof the Node process actually started, at what +// pid/ppid, independent of the logger pipeline. +writeSync( + 2, `[prerender-manager] STARTUP pid=${process.pid} ppid=${process.ppid} argv=${JSON.stringify(process.argv)}\n`, ); @@ -95,14 +99,18 @@ function shutdown(signal: NodeJS.Signals) { // process manager to terminate after grace period. } +// `writeSync(2, ...)` (FD-level, syscall-synchronous) for the same +// reason as the STARTUP stamp at the top of this file. process.on('SIGINT', () => { - process.stderr.write( + writeSync( + 2, `[prerender-manager] SIGINT received pid=${process.pid} ppid=${process.ppid}\n`, ); shutdown('SIGINT'); }); process.on('SIGTERM', () => { - process.stderr.write( + writeSync( + 2, `[prerender-manager] SIGTERM received pid=${process.pid} ppid=${process.ppid}\n`, ); shutdown('SIGTERM'); diff --git a/packages/realm-server/prerender/prerender-server.ts b/packages/realm-server/prerender/prerender-server.ts index 7d92380684..e31d937f21 100644 --- a/packages/realm-server/prerender/prerender-server.ts +++ b/packages/realm-server/prerender/prerender-server.ts @@ -1,17 +1,21 @@ import '../instrument'; import '../setup-logger'; import '../lib/wtfnode-on-signal'; +import { writeSync } from 'node:fs'; import { logger } from '@cardstack/runtime-common'; import yargs from 'yargs'; import type { Server } from 'http'; import { createPrerenderHttpServer } from './prerender-app'; -// Synchronous stderr stamp so we have proof the Node process actually -// started (and at what pid/ppid) independent of the logger pipeline. -// Used to triage cases where SIGTERM doesn't reach Node — if we see -// STARTUP but never SIGTERM received, signal delivery is broken -// upstream of Node. -process.stderr.write( +// FD-level synchronous stderr write — `writeSync(2, ...)` calls the +// write(2) syscall directly, bypassing Node's stream layer. +// `process.stderr.write` is libuv-async when stderr is a pipe (the +// Docker / ECS case), so it can be lost if the process exits before +// libuv flushes. Stamps that fire just before death need to use the +// FD-level form. Proof the Node process actually started, at what +// pid/ppid, independent of the logger pipeline. +writeSync( + 2, `[prerender-server] STARTUP pid=${process.pid} ppid=${process.ppid} argv=${JSON.stringify(process.argv)}\n`, ); @@ -69,14 +73,18 @@ function shutdown() { } } +// `writeSync(2, ...)` (FD-level, syscall-synchronous) for the same +// reason as the STARTUP stamp at the top of this file. process.on('SIGINT', () => { - process.stderr.write( + writeSync( + 2, `[prerender-server] SIGINT received pid=${process.pid} ppid=${process.ppid}\n`, ); shutdown(); }); process.on('SIGTERM', () => { - process.stderr.write( + writeSync( + 2, `[prerender-server] SIGTERM received pid=${process.pid} ppid=${process.ppid}\n`, ); shutdown(); diff --git a/packages/realm-server/worker-manager.ts b/packages/realm-server/worker-manager.ts index 0b3bcb4845..0b7d2e5674 100644 --- a/packages/realm-server/worker-manager.ts +++ b/packages/realm-server/worker-manager.ts @@ -1,6 +1,7 @@ import './instrument'; import './setup-logger'; // This should be first import './lib/wtfnode-on-signal'; +import { writeSync } from 'node:fs'; // During `mise dev-all` Ctrl-C, the bash trap walks the process tree // deepest-first. The `dev-log-tee` reader on this process's stdout/stderr @@ -22,12 +23,19 @@ const swallowEpipe = (err: NodeJS.ErrnoException) => { process.stdout.on('error', swallowEpipe); process.stderr.on('error', swallowEpipe); -// Synchronous stderr stamp so we have proof the Node process actually -// started (and at what pid/ppid) independent of the logger pipeline. -// Used to triage cases where SIGTERM doesn't reach Node — if we see -// STARTUP but never SIGTERM received, signal delivery is broken -// upstream of Node. -process.stderr.write( +// FD-level synchronous stderr write — `writeSync(2, ...)` calls the +// write(2) syscall directly, bypassing Node's stream layer. We do that +// (instead of `process.stderr.write`) because the stream-layer write is +// libuv-async when stderr is a pipe (the normal Docker / ECS case), so +// it can be lost if the process exits before libuv flushes. The whole +// point of these stamps is to land *just before* the process dies, so +// async loss would defeat them. +// +// Proof the Node process actually started, at what pid/ppid, independent +// of the logger pipeline. If we see STARTUP but never SIGTERM received, +// signal delivery is broken upstream of Node. +writeSync( + 2, `[worker-manager] STARTUP pid=${process.pid} ppid=${process.ppid} argv=${JSON.stringify(process.argv)}\n`, ); @@ -546,27 +554,35 @@ function runShutdownCallbacks() { } } +// `writeSync(2, ...)` (FD-level, syscall-synchronous) for the same +// reason as the STARTUP stamp above — `process.stderr.write` is +// libuv-async to a pipe, and these stamps fire on the exact paths where +// the process is about to die, so an async write can get lost. process.on('SIGINT', () => { - process.stderr.write( + writeSync( + 2, `[worker-manager] SIGINT received pid=${process.pid} ppid=${process.ppid}\n`, ); shutdown(); }); process.on('SIGTERM', () => { - process.stderr.write( + writeSync( + 2, `[worker-manager] SIGTERM received pid=${process.pid} ppid=${process.ppid}\n`, ); shutdown(); }); process.on('disconnect', () => { - process.stderr.write( + writeSync( + 2, `[worker-manager] disconnect received pid=${process.pid} ppid=${process.ppid}\n`, ); log.info(`Parent IPC disconnected, shutting down worker manager...`); shutdown(); }); process.on('uncaughtException', (err) => { - process.stderr.write( + writeSync( + 2, `[worker-manager] uncaughtException pid=${process.pid} ppid=${process.ppid}\n`, ); log.error(`Uncaught exception in worker manager:`, err); diff --git a/packages/realm-server/worker.ts b/packages/realm-server/worker.ts index 519e703f1b..80f14ee639 100644 --- a/packages/realm-server/worker.ts +++ b/packages/realm-server/worker.ts @@ -1,6 +1,7 @@ import './instrument'; import './setup-logger'; // This should be first import './lib/wtfnode-on-signal'; +import { writeSync } from 'node:fs'; // Swallow EPIPE from stdout/stderr so a torn-down parent (worker manager // SIGKILL'd or its dev-log-tee dead during dev-all Ctrl-C) doesn't make @@ -14,12 +15,15 @@ const swallowEpipe = (err: NodeJS.ErrnoException) => { process.stdout.on('error', swallowEpipe); process.stderr.on('error', swallowEpipe); -// Synchronous stderr stamp so we have proof the Node process actually -// started (and at what pid/ppid) independent of the logger pipeline. -// Used to triage cases where SIGTERM doesn't reach Node — if we see -// STARTUP but never SIGTERM received, signal delivery is broken -// upstream of Node. -process.stderr.write( +// FD-level synchronous stderr write — `writeSync(2, ...)` calls the +// write(2) syscall directly, bypassing Node's stream layer. +// `process.stderr.write` is libuv-async when stderr is a pipe (the +// Docker / ECS case), so it can be lost if the process exits before +// libuv flushes. Stamps that fire just before death need to use the +// FD-level form. Proof the Node process actually started, at what +// pid/ppid, independent of the logger pipeline. +writeSync( + 2, `[worker] STARTUP pid=${process.pid} ppid=${process.ppid} argv=${JSON.stringify(process.argv)}\n`, ); @@ -202,20 +206,25 @@ let autoMigrate = migrateDB || undefined; process.exit(1); } }; + // `writeSync(2, ...)` (FD-level, syscall-synchronous) for the same + // reason as the STARTUP stamp at the top of this file. process.on('SIGINT', () => { - process.stderr.write( + writeSync( + 2, `[worker] SIGINT received pid=${process.pid} ppid=${process.ppid}\n`, ); shutdown(); }); process.on('SIGTERM', () => { - process.stderr.write( + writeSync( + 2, `[worker] SIGTERM received pid=${process.pid} ppid=${process.ppid}\n`, ); shutdown(); }); process.on('disconnect', () => { - process.stderr.write( + writeSync( + 2, `[worker] disconnect received pid=${process.pid} ppid=${process.ppid}\n`, ); shutdown();