From 4af133e4ed68204cf69b3b96e91b595c00a4c17c Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Mon, 18 May 2026 18:22:13 -0400 Subject: [PATCH] wait-for-host-standby: retry puppeteer.launch with diagnostics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Boxel CLI Tests on CI failed because puppeteer.launch timed out after the default 30s waiting for Chrome's DevTools WS endpoint URL to appear in stdout. The existing retry loop only covered page.goto and waitForFunction — a slow Chrome cold start aborted the script before the loop could even run. Wrap the launch in its own retry helper: - explicit 90s launch timeout per attempt (default was 30s) - up to 3 attempts with a 2s backoff between them, capped by the same 10-minute total deadline as the navigation loop - structured log lines for each attempt (executable path, args, dumpio, timeout, success/failure timing) so a future flake is debuggable from the CI log alone - on the final attempt, enable puppeteer's dumpio so Chrome's own stdout/stderr is piped through node — if launch still fails after the retries, we capture *why* (sandbox denial, missing shared lib, GPU init crash) instead of a bare "Timed out … waiting for the WS endpoint URL" Co-Authored-By: Claude Opus 4.7 (1M context) --- .../scripts/wait-for-host-standby.ts | 83 +++++++++++++++++-- 1 file changed, 76 insertions(+), 7 deletions(-) diff --git a/packages/realm-server/scripts/wait-for-host-standby.ts b/packages/realm-server/scripts/wait-for-host-standby.ts index 1f41c6b6b3..3a2a7b6f15 100644 --- a/packages/realm-server/scripts/wait-for-host-standby.ts +++ b/packages/realm-server/scripts/wait-for-host-standby.ts @@ -32,11 +32,21 @@ // against a genuinely broken vite — in normal dev that ceiling is never // reached. -import puppeteer from 'puppeteer'; +import puppeteer, { type Browser } from 'puppeteer'; const PER_ATTEMPT_TIMEOUT_MS = 30_000; const MAX_BACKOFF_MS = 5_000; const TOTAL_TIMEOUT_MS = 600_000; +// Chrome startup on a loaded CI runner occasionally takes >30s to print +// its DevTools WS endpoint to stdout. Puppeteer's default launch timeout +// is 30s, so a single slow start aborts the whole script before the +// goto/waitForFunction retry loop ever runs. Give the launch its own +// generous budget and retry it independently — the page-pool's own +// BrowserManager also relies on launch succeeding on the first try, but +// here we're a one-shot startup probe and the cost of a retry is small. +const LAUNCH_TIMEOUT_MS = 90_000; +const LAUNCH_MAX_ATTEMPTS = 3; +const LAUNCH_RETRY_BACKOFF_MS = 2_000; import { isHttpsLoopback } from '../lib/is-https-loopback'; @@ -44,6 +54,68 @@ const log = (msg: string) => console.log(`[wait-for-host-standby] ${msg}`); const sleep = (ms: number) => new Promise((r) => setTimeout(r, ms)); const elapsedSec = (start: number) => Math.round((Date.now() - start) / 1000); +async function launchBrowserWithRetry({ + launchArgs, + totalDeadline, +}: { + launchArgs: string[]; + totalDeadline: number; +}): Promise { + let executablePath = process.env.PUPPETEER_EXECUTABLE_PATH; + let lastError: unknown; + for (let attempt = 1; attempt <= LAUNCH_MAX_ATTEMPTS; attempt++) { + let remaining = totalDeadline - Date.now(); + if (remaining <= 0) { + break; + } + let timeout = Math.min(LAUNCH_TIMEOUT_MS, remaining); + // On the final attempt, pipe Chrome's own stdout/stderr through node so + // that if launch is still failing we capture *why* (sandbox denial, + // missing shared library, GPU init crash, etc.) instead of a bare + // "Timed out … while waiting for the WS endpoint URL." The earlier + // attempts stay quiet on healthy runs. + let dumpio = + attempt === LAUNCH_MAX_ATTEMPTS || + process.env.WAIT_FOR_HOST_STANDBY_VERBOSE === '1'; + log( + `puppeteer.launch attempt ${attempt}/${LAUNCH_MAX_ATTEMPTS} ` + + `(timeout=${timeout}ms, executable=${executablePath ?? 'puppeteer-bundled'}, ` + + `args=${JSON.stringify(launchArgs)}, dumpio=${dumpio})`, + ); + let t0 = Date.now(); + try { + let browser = await puppeteer.launch({ + headless: true, + timeout, + dumpio, + ...(launchArgs.length > 0 ? { args: launchArgs } : {}), + ...(executablePath ? { executablePath } : {}), + }); + log( + `puppeteer.launch attempt ${attempt} succeeded after ${Date.now() - t0}ms`, + ); + return browser; + } catch (e) { + lastError = e; + let message = e instanceof Error ? e.message : String(e); + log( + `puppeteer.launch attempt ${attempt} failed after ${Date.now() - t0}ms: ${message}`, + ); + if (attempt === LAUNCH_MAX_ATTEMPTS) { + break; + } + let remainingAfterFailure = totalDeadline - Date.now(); + if (remainingAfterFailure <= LAUNCH_RETRY_BACKOFF_MS) { + break; + } + await sleep(LAUNCH_RETRY_BACKOFF_MS); + } + } + throw lastError instanceof Error + ? lastError + : new Error(`puppeteer.launch failed: ${String(lastError)}`); +} + async function main() { // Vite serves HTTPS on localhost:4200 in local dev (the realm-server // requires the mkcert leaf and vite reads the same cert). Default @@ -81,12 +153,9 @@ async function main() { log(`probing ${standbyUrl} (max ${TOTAL_TIMEOUT_MS / 1000}s)...`); let start = Date.now(); - let browser = await puppeteer.launch({ - headless: true, - ...(launchArgs.length > 0 ? { args: launchArgs } : {}), - ...(process.env.PUPPETEER_EXECUTABLE_PATH - ? { executablePath: process.env.PUPPETEER_EXECUTABLE_PATH } - : {}), + let browser = await launchBrowserWithRetry({ + launchArgs, + totalDeadline: start + TOTAL_TIMEOUT_MS, }); let attempt = 0;