From c6123ea9b390e49102d4534995e806ea1ba01408 Mon Sep 17 00:00:00 2001 From: Rafael Thayto Tani Date: Mon, 22 Jun 2026 18:51:21 -0300 Subject: [PATCH 01/10] fix(e2e): bound network and setup steps to kill flaky 300s timeouts The flaky E2E failures were Nuxt's beforeAll hitting the 300s budget. Two distinct stalls shared one opaque "hook timed out" signature: one CI run hung in `clerk link` (an untimed `fetch()` to the production Clerk API), another in `git init`. - Add a default 60s timeout to `loggedFetch`, composed with any caller signal via `AbortSignal.any` so tighter budgets (keyless's 15s) still win. A stalled connection now fails fast across every CLI command, not just in tests. - Wrap each fixture setup step (git / clerk link / clerk init / npm ci) in a per-step timeout that fails with a labeled error instead of silently eating the whole 300s budget. - Cap e2e `--parallel=4` to cut startup contention; add an explicit afterEach cleanup budget and `npm ci --no-audit --no-fund`. - Drop noisy success-path debug traces; keep failure diagnostics. Claude-Session: https://claude.ai/code/session_01V1YkHZ2Ad1okwkX9bxTYsd --- .changeset/fix-network-request-timeout.md | 5 +++ package.json | 2 +- packages/cli-core/src/lib/fetch.test.ts | 37 ++++++++++++++++ packages/cli-core/src/lib/fetch.ts | 38 ++++++++++++++--- test/e2e/lib/dev-server.ts | 6 --- test/e2e/lib/fixture-setup.ts | 52 ++++++++++++++--------- test/e2e/lib/fixture-test.ts | 16 +------ test/e2e/lib/logger.ts | 2 +- test/e2e/lib/test-user.ts | 7 --- 9 files changed, 109 insertions(+), 56 deletions(-) create mode 100644 .changeset/fix-network-request-timeout.md diff --git a/.changeset/fix-network-request-timeout.md b/.changeset/fix-network-request-timeout.md new file mode 100644 index 00000000..e5993ec7 --- /dev/null +++ b/.changeset/fix-network-request-timeout.md @@ -0,0 +1,5 @@ +--- +"clerk": patch +--- + +Add a default 60s timeout to all outbound CLI network requests. 
Previously a stalled connection to a Clerk API could hang a command indefinitely (with no error and no way to recover other than Ctrl-C); requests now abort with a clear, tagged error after 60s. A caller-supplied `AbortSignal` still composes with this default, so tighter per-call budgets continue to win. diff --git a/package.json b/package.json index e9f77b9f..2656a0a2 100644 --- a/package.json +++ b/package.json @@ -8,7 +8,7 @@ "build": "bun run --filter @clerk/cli-core build", "dev": "bun run --cwd packages/cli-core dev", "test": "bun test 'packages/cli-core/src/' 'packages/extras/src/' 'scripts/' --parallel --only-failures", - "test:e2e": "bun test 'test/e2e/' --retry 1 --parallel --only-failures", + "test:e2e": "bun test 'test/e2e/' --retry 1 --parallel=4 --only-failures", "test:e2e:op": "bun run scripts/run-e2e-op.ts", "e2e:refresh-fixtures": "bun run scripts/refresh-e2e-fixtures.ts", "typecheck": "bun run --filter './packages/*' typecheck && tsc --noEmit -p scripts/tsconfig.json && tsc --noEmit -p test/e2e/tsconfig.json", diff --git a/packages/cli-core/src/lib/fetch.test.ts b/packages/cli-core/src/lib/fetch.test.ts index 39f03188..e765048c 100644 --- a/packages/cli-core/src/lib/fetch.test.ts +++ b/packages/cli-core/src/lib/fetch.test.ts @@ -41,4 +41,41 @@ describe("loggedFetch", () => { expect(init.headers.get("Authorization")).toBe("Bearer abc"); expect(init.headers.get("User-Agent")).toMatch(/^Clerk-CLI\//); }); + + // A server that accepts the connection but never responds. The mock rejects + // only when the request's AbortSignal fires, so it exercises the real timeout + // path: without a default timeout this hangs until bun's test timeout. 
+ const hangingFetch = () => + ((_url: unknown, init: { signal?: AbortSignal }) => + new Promise((_resolve, reject) => { + init.signal?.addEventListener("abort", () => reject(init.signal!.reason)); + })) as unknown as typeof fetch; + + test("aborts with a clear, tagged error after the default timeout when the server never responds", async () => { + globalThis.fetch = hangingFetch(); + await expect( + loggedFetch("https://example.test/hang", { tag: "plapi", timeoutMs: 30 }), + ).rejects.toThrow(/plapi: request timed out after 30ms/); + }, 2000); + + test("a shorter caller signal wins over the default timeout and is not masked by the timeout message", async () => { + globalThis.fetch = hangingFetch(); + const caller = AbortSignal.timeout(20); + const err = await loggedFetch("https://example.test/hang", { + tag: "plapi", + timeoutMs: 10_000, + signal: caller, + }).catch((e: unknown) => e); + // The caller's signal fired first, so we must surface its abort, not + // pretend our 10s default timeout elapsed. + expect(String(err)).not.toMatch(/timed out after 10000ms/); + }, 2000); + + test("returns the response for a fast request without aborting", async () => { + globalThis.fetch = mock( + async () => new Response("ok", { status: 200 }), + ) as unknown as typeof fetch; + const res = await loggedFetch("https://example.test/ok", { tag: "plapi", timeoutMs: 5_000 }); + expect(res.status).toBe(200); + }); }); diff --git a/packages/cli-core/src/lib/fetch.ts b/packages/cli-core/src/lib/fetch.ts index ffebe505..0ff89ac8 100644 --- a/packages/cli-core/src/lib/fetch.ts +++ b/packages/cli-core/src/lib/fetch.ts @@ -14,7 +14,17 @@ import { buildUserAgent } from "./user-agent.ts"; const USER_AGENT = buildUserAgent(); -export type LoggedFetchInit = RequestInit & { tag: string }; +/** + * Default per-request timeout. 
Native `fetch()` has no timeout, so without this + * a stalled TCP connection to a Clerk API hangs the command indefinitely (this + * was the root cause of the flaky e2e setup, where `clerk link`/`clerk init` + * could hang for the full 300s test budget). 60s is generous for any single + * REST call while still bounding the worst case. Callers needing a tighter or + * looser bound pass `timeoutMs`; an explicit `signal` composes with this one. + */ +const DEFAULT_REQUEST_TIMEOUT_MS = 60_000; + +export type LoggedFetchInit = RequestInit & { tag: string; timeoutMs?: number }; /** * Normalized response shape returned by the higher-level API request wrappers @@ -29,16 +39,32 @@ export interface ApiResponse { } export async function loggedFetch(url: URL | string, options: LoggedFetchInit): Promise { - const { tag, ...init } = options; + const { tag, timeoutMs = DEFAULT_REQUEST_TIMEOUT_MS, signal: callerSignal, ...init } = options; const method = init.method ?? "GET"; const urlStr = url.toString(); const headers = new Headers(init.headers); if (!headers.has("user-agent")) headers.set("User-Agent", USER_AGENT); log.debug(`${tag}: ${method} ${urlStr}`); - const response = await withNetworkAccess( - { operation: "connect", target: urlStr, label: tag }, - async () => fetch(url, { ...init, headers }), - ); + + // Compose our default timeout with any caller-supplied signal so whichever + // fires first wins (e.g. keyless.ts's tighter 15s budget still applies). + const timeoutSignal = AbortSignal.timeout(timeoutMs); + const signal = callerSignal ? AbortSignal.any([callerSignal, timeoutSignal]) : timeoutSignal; + + let response: Response; + try { + response = await withNetworkAccess( + { operation: "connect", target: urlStr, label: tag }, + async () => fetch(url, { ...init, headers, signal }), + ); + } catch (err) { + // Distinguish our timeout from a caller abort or a plain network error, so + // the failure is self-diagnosing instead of a cryptic DOMException/hang. 
+ if (timeoutSignal.aborted && !callerSignal?.aborted) { + throw new Error(`${tag}: request timed out after ${timeoutMs}ms — ${method} ${urlStr}`); + } + throw err; + } if (!response.ok) { // Clone so the caller can still consume the body for error construction. const body = await response.clone().text(); diff --git a/test/e2e/lib/dev-server.ts b/test/e2e/lib/dev-server.ts index 8ea78661..c9729d0b 100644 --- a/test/e2e/lib/dev-server.ts +++ b/test/e2e/lib/dev-server.ts @@ -101,8 +101,6 @@ async function tryStart(opts: { const stderrLines: string[] = []; const stdoutLines: string[] = []; - log(`starting dev server: npx ${fullCmd.join(" ")} on port ${port}`); - const proc = Bun.spawn(["npx", ...fullCmd], { cwd: projectDir, stdout: "pipe", @@ -170,7 +168,6 @@ async function tryStart(opts: { } if (await canConnect(host, port, 1000)) { - log(`dev server ready (accepting TCP on ${host}:${port})`); return { kind: "ready", value: { proc, port, host, stdout: stdoutLines, stderr: stderrLines }, @@ -222,7 +219,6 @@ export async function startDevServer(opts: { /** Kill a dev server process, falling back to SIGKILL after 5 seconds. */ export async function killDevServer(proc: Subprocess): Promise { - log("killing dev server"); proc.kill("SIGTERM"); const timeout = setTimeout(() => { @@ -234,6 +230,4 @@ export async function killDevServer(proc: Subprocess): Promise { } finally { clearTimeout(timeout); } - - log("dev server stopped"); } diff --git a/test/e2e/lib/fixture-setup.ts b/test/e2e/lib/fixture-setup.ts index 08486393..a51a1ac6 100644 --- a/test/e2e/lib/fixture-setup.ts +++ b/test/e2e/lib/fixture-setup.ts @@ -56,6 +56,25 @@ async function safeRm(path: string): Promise { } } +/** + * Run a setup step under a hard timeout so a single stalled subprocess fails + * fast with a labeled error instead of silently burning the whole 300s + * `beforeAll` budget (the flaky-setup signature seen in CI, which stalled in + * `clerk link` on one run and `git init` on another). 
`beforeAll` is never + * retried, so leaving the inner promise to settle on timeout is safe. + */ +async function withStepTimeout(label: string, ms: number, run: () => Promise): Promise { + let timer: ReturnType | undefined; + const timeout = new Promise((_, reject) => { + timer = setTimeout(() => reject(new Error(`${label} timed out after ${ms}ms`)), ms); + }); + try { + return await Promise.race([run(), timeout]); + } finally { + clearTimeout(timer); + } +} + /** * Pre-link the project to the test Clerk application using an isolated * CLERK_CONFIG_DIR, so `clerk init` finds an existing link and skips the @@ -152,31 +171,24 @@ export type Fixture = { export async function setupFixture(name: FixtureName): Promise { const config = fixtures[name]; const fixtureDir = join(FIXTURES_DIR, name); - log("setup started"); // Resolve symlinks (macOS /var -> /private/var) so profile keys match across commands const tmp = await realpath(tmpdir()); const projectDir = await mkdtemp(join(tmp, `clerk-e2e-${name}-`)); const configDir = await mkdtemp(join(tmp, "clerk-e2e-config-")); await copyFixture(fixtureDir, projectDir); - log("fixture copied"); let publishableKey = ""; let secretKey = ""; try { // Git-init before linking so the profile key matches for later commands - await gitInit(projectDir); - log("git init done"); - - // The magic happens here, we actually test out `clerk link` and `clerk init` - await linkProject(projectDir, configDir); - log("clerk link done"); + await withStepTimeout("git init", 60_000, () => gitInit(projectDir)); + // `clerk link`/`clerk init` hit the production Clerk API; these step budgets + // back up the CLI's own per-request timeout. + await withStepTimeout("clerk link", 60_000, () => linkProject(projectDir, configDir)); + await withStepTimeout("clerk init", 90_000, () => runClerkInit(projectDir, configDir)); - await runClerkInit(projectDir, configDir); - log("clerk init done"); - - // Verify clerk init wrote env files and extract keys. 
const envVars = await parseEnvFiles(projectDir); const publishableKeyName = await detectPublishableKeyName(projectDir); @@ -191,25 +203,23 @@ export async function setupFixture(name: FixtureName): Promise { throw new Error(`${secretKeyName} not found in env files written by clerk init.`); } - const install = await Bun.$`npm ci --ignore-scripts --legacy-peer-deps` - .cwd(projectDir) - .quiet() - .nothrow(); + // --no-audit/--no-fund drop npm's advisory network round-trips during `ci`. + const install = await withStepTimeout("npm ci", 240_000, () => + Bun.$`npm ci --ignore-scripts --legacy-peer-deps --no-audit --no-fund` + .cwd(projectDir) + .quiet() + .nothrow(), + ); assertSuccess("npm ci failed", install); - log("npm ci done"); } catch (err) { await safeRm(projectDir); await safeRm(configDir); throw new Error("setup failed", { cause: err }); } - log("setup complete"); - const cleanup = async () => { - log("cleanup started"); await safeRm(projectDir); await safeRm(configDir); - log("cleanup done"); }; return { diff --git a/test/e2e/lib/fixture-test.ts b/test/e2e/lib/fixture-test.ts index 6812101e..e5792276 100644 --- a/test/e2e/lib/fixture-test.ts +++ b/test/e2e/lib/fixture-test.ts @@ -101,20 +101,17 @@ export function createFixtureHarness(name: FixtureName): FixtureHarness { let users: Users | null = null; beforeAll(async () => { - log("beforeAll started"); fixture = await setupFixture(name); users = createUsers(fixture); - log("beforeAll finished"); }, 300_000); afterEach(async () => { await users?.cleanup(); - }); + }, 30_000); // BAPI deletes can exceed bun's 5s default under load; an explicit + // budget avoids silently orphaning test users when cleanup runs long. 
afterAll(async () => { - log("afterAll started"); await fixture?.cleanup(); - log("afterAll finished"); }, 60_000); return () => { @@ -142,14 +139,12 @@ export function runFixtureTests(harness: FixtureHarness): void { const { projectDir, config } = fixture; // Build first so type generation artifacts are available for tsc. - log("build started"); const build = await Bun.$`npx ${config.buildCmd}`.cwd(projectDir).quiet().nothrow(); if (build.exitCode !== 0) { throw new Error( `${config.buildCmd.join(" ")} failed:\n${build.stdout.toString()}\n${build.stderr.toString()}`, ); } - log("build succeeded"); }, { timeout: 300_000 }, // 5 minutes - install + build can be slow) ); @@ -164,14 +159,12 @@ export function runFixtureTests(harness: FixtureHarness): void { // framework-specific type generation), otherwise plain tsc. const useTypecheck = await hasTypecheckScript(projectDir); const command = useTypecheck ? "npm run typecheck" : "bunx tsc --noEmit"; - log(`typecheck started (${command} in ${projectDir})`); const shell = useTypecheck ? await Bun.$`npm run typecheck 2>&1`.cwd(projectDir).quiet().nothrow() : await Bun.$`bunx tsc --noEmit 2>&1`.cwd(projectDir).quiet().nothrow(); if (shell.exitCode !== 0) { throw new Error(`${command} failed in ${projectDir}:\n${shell.text()}`); } - log("typecheck succeeded"); }, { timeout: 300_000 }, // 5 minutes - install + typecheck can be slow ); @@ -198,7 +191,6 @@ export function runFileExistsTest(harness: FixtureHarness, expectedFiles: string ); const existing = found.filter(Boolean); expect(existing.length).toBeGreaterThanOrEqual(1); - log(`found: ${existing.join(", ")}`); }); } @@ -265,11 +257,9 @@ export function runBrowserTests(harness: FixtureHarness): void { context, options: frontendApiUrl ? { frontendApiUrl } : undefined, }); - log(`navigating to http://${host}:${port}`); await page.goto(`http://${host}:${port}`, { waitUntil: "load" }); // 5. 
Sign in - log("signing in"); await clerk.signIn({ page, signInParams: { @@ -281,7 +271,6 @@ export function runBrowserTests(harness: FixtureHarness): void { // 6. Verify Clerk loaded await clerk.loaded({ page }); - log("clerk has been loaded"); // 7. Check to see that the user is now on the window object. await page.waitForFunction( @@ -289,7 +278,6 @@ export function runBrowserTests(harness: FixtureHarness): void { null, { timeout: 10_000 }, ); - log("auth flow passed"); // Log any console errors as warnings (non-fatal) if (consoleErrors.length > 0) { diff --git a/test/e2e/lib/logger.ts b/test/e2e/lib/logger.ts index 96aa173e..d27f9b86 100644 --- a/test/e2e/lib/logger.ts +++ b/test/e2e/lib/logger.ts @@ -2,7 +2,7 @@ const startTime = Date.now(); const isDebug = process.env.CLERK_E2E_DEBUG === "1" || process.env.CLERK_E2E_DEBUG === "true"; -/** Log a timestamped message with fixture name for tracing execution order. */ +/** Emit a timestamped diagnostic line when CLERK_E2E_DEBUG is set. */ export function log(message: string): void { if (!isDebug) return; const elapsed = ((Date.now() - startTime) / 1000).toFixed(1); diff --git a/test/e2e/lib/test-user.ts b/test/e2e/lib/test-user.ts index d679d65d..518f82ea 100644 --- a/test/e2e/lib/test-user.ts +++ b/test/e2e/lib/test-user.ts @@ -53,8 +53,6 @@ export async function createTestUser(configDir: string, target: TestUserTarget): skip_password_checks: true, }); - log(`creating test user: ${email}`); - const result = await Bun.$`bun ${CLI_PATH} users create -d ${body} --json --yes ${targetArgs(target)}` .env(clerkEnv(configDir, target)) @@ -69,7 +67,6 @@ export async function createTestUser(configDir: string, target: TestUserTarget): } const user: { id: string } = JSON.parse(result.stdout.toString()); - log(`test user created: ${user.id}`); return { id: user.id, email, password }; } @@ -80,8 +77,6 @@ export async function deleteTestUser( configDir: string, target: TestUserTarget, ): Promise { - log(`deleting test user: 
${userId}`); - const result = await Bun.$`bun ${CLI_PATH} api /users/${userId} -X DELETE --yes ${targetArgs(target)}` .env(clerkEnv(configDir, target)) @@ -93,7 +88,5 @@ export async function deleteTestUser( const stderr = result.stderr.toString().trim(); const detail = stderr || stdout || "(no output)"; log(`warning: failed to delete test user ${userId}: ${detail}`); - } else { - log(`test user deleted: ${userId}`); } } From 5ce158ad1ed4a65b936429aae4bd829a351c30ed Mon Sep 17 00:00:00 2001 From: Rafael Thayto Tani Date: Mon, 22 Jun 2026 19:39:23 -0300 Subject: [PATCH 02/10] fix(e2e): kill stalled setup subprocesses; harden timeout test Address PR review feedback: - `runStep` now spawns each setup step via `Bun.spawn` with an `AbortSignal` (Bun.$ can't be cancelled), so a timed-out git/clerk/npm step is killed instead of orphaned and left to race teardown. Adds runStep unit tests. - fetch timeout test now fails if `loggedFetch` resolves instead of rejecting (no more false pass via swallowed error). - Trim verbose comments. 
Claude-Session: https://claude.ai/code/session_01V1YkHZ2Ad1okwkX9bxTYsd --- packages/cli-core/src/lib/fetch.test.ts | 11 ++- packages/cli-core/src/lib/fetch.ts | 15 +-- test/e2e/lib/fixture-setup.test.ts | 32 ++++++ test/e2e/lib/fixture-setup.ts | 124 +++++++++++++----------- test/e2e/lib/fixture-test.ts | 3 +- 5 files changed, 109 insertions(+), 76 deletions(-) create mode 100644 test/e2e/lib/fixture-setup.test.ts diff --git a/packages/cli-core/src/lib/fetch.test.ts b/packages/cli-core/src/lib/fetch.test.ts index e765048c..675cdb7e 100644 --- a/packages/cli-core/src/lib/fetch.test.ts +++ b/packages/cli-core/src/lib/fetch.test.ts @@ -61,13 +61,18 @@ describe("loggedFetch", () => { test("a shorter caller signal wins over the default timeout and is not masked by the timeout message", async () => { globalThis.fetch = hangingFetch(); const caller = AbortSignal.timeout(20); + // Must reject (the onFulfilled branch throws if it unexpectedly resolves)... const err = await loggedFetch("https://example.test/hang", { tag: "plapi", timeoutMs: 10_000, signal: caller, - }).catch((e: unknown) => e); - // The caller's signal fired first, so we must surface its abort, not - // pretend our 10s default timeout elapsed. + }).then( + () => { + throw new Error("expected loggedFetch to reject, but it resolved"); + }, + (e: unknown) => e, + ); + // ...with the caller's 20ms abort, not relabeled as our 10s default timeout. expect(String(err)).not.toMatch(/timed out after 10000ms/); }, 2000); diff --git a/packages/cli-core/src/lib/fetch.ts b/packages/cli-core/src/lib/fetch.ts index 0ff89ac8..62d80585 100644 --- a/packages/cli-core/src/lib/fetch.ts +++ b/packages/cli-core/src/lib/fetch.ts @@ -14,14 +14,7 @@ import { buildUserAgent } from "./user-agent.ts"; const USER_AGENT = buildUserAgent(); -/** - * Default per-request timeout. 
Native `fetch()` has no timeout, so without this - * a stalled TCP connection to a Clerk API hangs the command indefinitely (this - * was the root cause of the flaky e2e setup, where `clerk link`/`clerk init` - * could hang for the full 300s test budget). 60s is generous for any single - * REST call while still bounding the worst case. Callers needing a tighter or - * looser bound pass `timeoutMs`; an explicit `signal` composes with this one. - */ +/** Native `fetch()` has no timeout, so a stalled connection would hang forever. */ const DEFAULT_REQUEST_TIMEOUT_MS = 60_000; export type LoggedFetchInit = RequestInit & { tag: string; timeoutMs?: number }; @@ -46,8 +39,7 @@ export async function loggedFetch(url: URL | string, options: LoggedFetchInit): if (!headers.has("user-agent")) headers.set("User-Agent", USER_AGENT); log.debug(`${tag}: ${method} ${urlStr}`); - // Compose our default timeout with any caller-supplied signal so whichever - // fires first wins (e.g. keyless.ts's tighter 15s budget still applies). + // A caller signal (e.g. keyless.ts's tighter 15s) composes with our default. const timeoutSignal = AbortSignal.timeout(timeoutMs); const signal = callerSignal ? AbortSignal.any([callerSignal, timeoutSignal]) : timeoutSignal; @@ -58,8 +50,7 @@ export async function loggedFetch(url: URL | string, options: LoggedFetchInit): async () => fetch(url, { ...init, headers, signal }), ); } catch (err) { - // Distinguish our timeout from a caller abort or a plain network error, so - // the failure is self-diagnosing instead of a cryptic DOMException/hang. + // Only relabel when our timeout fired, not a caller abort or network error. 
if (timeoutSignal.aborted && !callerSignal?.aborted) { throw new Error(`${tag}: request timed out after ${timeoutMs}ms — ${method} ${urlStr}`); } diff --git a/test/e2e/lib/fixture-setup.test.ts b/test/e2e/lib/fixture-setup.test.ts new file mode 100644 index 00000000..bd95429a --- /dev/null +++ b/test/e2e/lib/fixture-setup.test.ts @@ -0,0 +1,32 @@ +import { test, expect, describe } from "bun:test"; +import { tmpdir } from "node:os"; +import { runStep } from "./fixture-setup.ts"; + +describe("runStep", () => { + const base = { cwd: tmpdir(), env: process.env }; + + test("resolves when the command exits 0", async () => { + await expect( + runStep("ok", ["bash", "-c", "exit 0"], { ...base, timeoutMs: 5_000 }), + ).resolves.toBeUndefined(); + }); + + test("rejects with a labeled error including stderr on non-zero exit", async () => { + const err = await runStep("clerk link", ["bash", "-c", "echo boom >&2; exit 3"], { + ...base, + timeoutMs: 5_000, + }).catch((e: unknown) => e); + expect(String(err)).toMatch(/clerk link failed/); + expect(String(err)).toMatch(/boom/); + }); + + test("kills the subprocess and rejects promptly when the step exceeds its timeout", async () => { + const start = Date.now(); + const err = await runStep("slow", ["sleep", "10"], { ...base, timeoutMs: 150 }).catch( + (e: unknown) => e, + ); + expect(String(err)).toMatch(/slow timed out after 150ms/); + // Proves the child was killed rather than awaited: sleep 10 would take ~10s. + expect(Date.now() - start).toBeLessThan(2_000); + }); +}); diff --git a/test/e2e/lib/fixture-setup.ts b/test/e2e/lib/fixture-setup.ts index a51a1ac6..537d6d28 100644 --- a/test/e2e/lib/fixture-setup.ts +++ b/test/e2e/lib/fixture-setup.ts @@ -25,16 +25,6 @@ function requireEnv(name: string): string { return val; } -/** Throw with a descriptive message if a shell command failed. 
*/ -function assertSuccess( - label: string, - result: { exitCode: number; stderr: { toString(): string } }, -): void { - if (result.exitCode !== 0) { - throw new Error(`${label}:\n${result.stderr.toString()}`); - } -} - /** * Copy the fixture directory into an existing project dir. */ @@ -56,20 +46,33 @@ async function safeRm(path: string): Promise { } } +interface RunStepOptions { + cwd: string; + env: Record; + timeoutMs: number; +} + /** - * Run a setup step under a hard timeout so a single stalled subprocess fails - * fast with a labeled error instead of silently burning the whole 300s - * `beforeAll` budget (the flaky-setup signature seen in CI, which stalled in - * `clerk link` on one run and `git init` on another). `beforeAll` is never - * retried, so leaving the inner promise to settle on timeout is safe. + * Spawn a setup step, killing the child on timeout so a stall fails fast with a + * labeled error instead of silently eating the whole 300s `beforeAll` budget. */ -async function withStepTimeout(label: string, ms: number, run: () => Promise): Promise { - let timer: ReturnType | undefined; - const timeout = new Promise((_, reject) => { - timer = setTimeout(() => reject(new Error(`${label} timed out after ${ms}ms`)), ms); +export async function runStep(label: string, cmd: string[], opts: RunStepOptions): Promise { + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), opts.timeoutMs); + const proc = Bun.spawn(cmd, { + cwd: opts.cwd, + env: opts.env, + stdout: "ignore", + stderr: "pipe", + signal: controller.signal, }); try { - return await Promise.race([run(), timeout]); + const [stderr, exitCode] = await Promise.all([ + new Response(proc.stderr).text().catch(() => ""), + proc.exited.catch(() => -1), + ]); + if (controller.signal.aborted) throw new Error(`${label} timed out after ${opts.timeoutMs}ms`); + if (exitCode !== 0) throw new Error(`${label} failed:\n${stderr}`); } finally { clearTimeout(timer); } @@ 
-84,33 +87,38 @@ async function linkProject(projectDir: string, configDir: string): Promise const appId = requireEnv("CLERK_CLI_TEST_APP_ID"); const platformAPIKey = requireEnv("CLERK_PLATFORM_API_KEY"); - const result = await Bun.$`bun ${CLI_PATH} --mode human link --app ${appId}` - .cwd(projectDir) - .env({ + await runStep("clerk link", ["bun", CLI_PATH, "--mode", "human", "link", "--app", appId], { + cwd: projectDir, + // PATH lets Bun.spawn resolve `bun`; the rest of the env stays isolated. + env: { + PATH: process.env.PATH, CLERK_CONFIG_DIR: configDir, CLERK_PLATFORM_API_KEY: platformAPIKey, - }) - .quiet() - .nothrow(); - - assertSuccess("clerk link failed", result); + }, + timeoutMs: 60_000, + }); } async function gitInit(projectDir: string): Promise { - const result = - await Bun.$`git -c commit.gpgsign=false init && git add -A && git -c commit.gpgsign=false commit -m "init" --allow-empty` - .cwd(projectDir) - .env({ + await runStep( + "git init", + [ + "bash", + "-c", + 'git -c commit.gpgsign=false init && git add -A && git -c commit.gpgsign=false commit -m "init" --allow-empty', + ], + { + cwd: projectDir, + env: { ...process.env, GIT_AUTHOR_NAME: "test", GIT_AUTHOR_EMAIL: "test@test.com", GIT_COMMITTER_NAME: "test", GIT_COMMITTER_EMAIL: "test@test.com", - }) - .quiet() - .nothrow(); - - assertSuccess("git init failed", result); + }, + timeoutMs: 60_000, + }, + ); } /** @@ -121,16 +129,19 @@ async function gitInit(projectDir: string): Promise { async function runClerkInit(projectDir: string, configDir: string): Promise { const platformAPIKey = requireEnv("CLERK_PLATFORM_API_KEY"); - const result = await Bun.$`bun ${CLI_PATH} --mode human init --yes --no-skills` - .cwd(projectDir) - .env({ - CLERK_CONFIG_DIR: configDir, - CLERK_PLATFORM_API_KEY: platformAPIKey, - }) - .quiet() - .nothrow(); - - assertSuccess("clerk init failed", result); + await runStep( + "clerk init", + ["bun", CLI_PATH, "--mode", "human", "init", "--yes", "--no-skills"], + { + cwd: 
projectDir, + env: { + PATH: process.env.PATH, + CLERK_CONFIG_DIR: configDir, + CLERK_PLATFORM_API_KEY: platformAPIKey, + }, + timeoutMs: 90_000, + }, + ); } /** Parse env files written by clerk init into a merged Record. @@ -183,11 +194,9 @@ export async function setupFixture(name: FixtureName): Promise { try { // Git-init before linking so the profile key matches for later commands - await withStepTimeout("git init", 60_000, () => gitInit(projectDir)); - // `clerk link`/`clerk init` hit the production Clerk API; these step budgets - // back up the CLI's own per-request timeout. - await withStepTimeout("clerk link", 60_000, () => linkProject(projectDir, configDir)); - await withStepTimeout("clerk init", 90_000, () => runClerkInit(projectDir, configDir)); + await gitInit(projectDir); + await linkProject(projectDir, configDir); + await runClerkInit(projectDir, configDir); const envVars = await parseEnvFiles(projectDir); @@ -203,14 +212,11 @@ export async function setupFixture(name: FixtureName): Promise { throw new Error(`${secretKeyName} not found in env files written by clerk init.`); } - // --no-audit/--no-fund drop npm's advisory network round-trips during `ci`. 
- const install = await withStepTimeout("npm ci", 240_000, () => - Bun.$`npm ci --ignore-scripts --legacy-peer-deps --no-audit --no-fund` - .cwd(projectDir) - .quiet() - .nothrow(), + await runStep( + "npm ci", + ["npm", "ci", "--ignore-scripts", "--legacy-peer-deps", "--no-audit", "--no-fund"], + { cwd: projectDir, env: process.env, timeoutMs: 240_000 }, ); - assertSuccess("npm ci failed", install); } catch (err) { await safeRm(projectDir); await safeRm(configDir); diff --git a/test/e2e/lib/fixture-test.ts b/test/e2e/lib/fixture-test.ts index e5792276..87cab863 100644 --- a/test/e2e/lib/fixture-test.ts +++ b/test/e2e/lib/fixture-test.ts @@ -107,8 +107,7 @@ export function createFixtureHarness(name: FixtureName): FixtureHarness { afterEach(async () => { await users?.cleanup(); - }, 30_000); // BAPI deletes can exceed bun's 5s default under load; an explicit - // budget avoids silently orphaning test users when cleanup runs long. + }, 30_000); // BAPI deletes can exceed bun's 5s default under load afterAll(async () => { await fixture?.cleanup(); From 7d8c1a9806aae1aa472b8209ca5b1e4d745dedb0 Mon Sep 17 00:00:00 2001 From: Rafael Thayto Tani Date: Mon, 22 Jun 2026 20:02:57 -0300 Subject: [PATCH 03/10] fix(e2e): revert setup steps to Bun.$ + Promise.race timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Bun.spawn rewrite (5ce158a) regressed the E2E job: 3 fixtures hung the full 300s in beforeAll with no per-step timeout recovering, because reading a killed child's piped stderr to EOF can block when a grandchild keeps the pipe open. Restore the prior approach, which passed E2E in 52s: - setup steps use Bun.$ again, wrapped in the Promise.race `withStepTimeout` (a timed-out step's subprocess is left to settle — beforeAll is never retried, so it can't cascade). - drop the runStep Bun.spawn helper and its unit test. 
The real root-cause fix (the 60s loggedFetch timeout that bounds a stalled clerk link/init network call at the source) is unchanged. Claude-Session: https://claude.ai/code/session_01V1YkHZ2Ad1okwkX9bxTYsd --- test/e2e/lib/fixture-setup.test.ts | 32 -------- test/e2e/lib/fixture-setup.ts | 121 +++++++++++++---------------- 2 files changed, 56 insertions(+), 97 deletions(-) delete mode 100644 test/e2e/lib/fixture-setup.test.ts diff --git a/test/e2e/lib/fixture-setup.test.ts b/test/e2e/lib/fixture-setup.test.ts deleted file mode 100644 index bd95429a..00000000 --- a/test/e2e/lib/fixture-setup.test.ts +++ /dev/null @@ -1,32 +0,0 @@ -import { test, expect, describe } from "bun:test"; -import { tmpdir } from "node:os"; -import { runStep } from "./fixture-setup.ts"; - -describe("runStep", () => { - const base = { cwd: tmpdir(), env: process.env }; - - test("resolves when the command exits 0", async () => { - await expect( - runStep("ok", ["bash", "-c", "exit 0"], { ...base, timeoutMs: 5_000 }), - ).resolves.toBeUndefined(); - }); - - test("rejects with a labeled error including stderr on non-zero exit", async () => { - const err = await runStep("clerk link", ["bash", "-c", "echo boom >&2; exit 3"], { - ...base, - timeoutMs: 5_000, - }).catch((e: unknown) => e); - expect(String(err)).toMatch(/clerk link failed/); - expect(String(err)).toMatch(/boom/); - }); - - test("kills the subprocess and rejects promptly when the step exceeds its timeout", async () => { - const start = Date.now(); - const err = await runStep("slow", ["sleep", "10"], { ...base, timeoutMs: 150 }).catch( - (e: unknown) => e, - ); - expect(String(err)).toMatch(/slow timed out after 150ms/); - // Proves the child was killed rather than awaited: sleep 10 would take ~10s. 
- expect(Date.now() - start).toBeLessThan(2_000); - }); -}); diff --git a/test/e2e/lib/fixture-setup.ts b/test/e2e/lib/fixture-setup.ts index 537d6d28..7da1ef6c 100644 --- a/test/e2e/lib/fixture-setup.ts +++ b/test/e2e/lib/fixture-setup.ts @@ -25,6 +25,16 @@ function requireEnv(name: string): string { return val; } +/** Throw with a descriptive message if a shell command failed. */ +function assertSuccess( + label: string, + result: { exitCode: number; stderr: { toString(): string } }, +): void { + if (result.exitCode !== 0) { + throw new Error(`${label}:\n${result.stderr.toString()}`); + } +} + /** * Copy the fixture directory into an existing project dir. */ @@ -46,33 +56,18 @@ async function safeRm(path: string): Promise { } } -interface RunStepOptions { - cwd: string; - env: Record; - timeoutMs: number; -} - /** - * Spawn a setup step, killing the child on timeout so a stall fails fast with a - * labeled error instead of silently eating the whole 300s `beforeAll` budget. + * Run a setup step under a hard timeout so a stall fails fast with a labeled + * error instead of silently burning the whole 300s `beforeAll` budget. The + * subprocess is left to settle on timeout — `beforeAll` is never retried. 
*/ -export async function runStep(label: string, cmd: string[], opts: RunStepOptions): Promise { - const controller = new AbortController(); - const timer = setTimeout(() => controller.abort(), opts.timeoutMs); - const proc = Bun.spawn(cmd, { - cwd: opts.cwd, - env: opts.env, - stdout: "ignore", - stderr: "pipe", - signal: controller.signal, +async function withStepTimeout(label: string, ms: number, run: () => Promise): Promise { + let timer: ReturnType | undefined; + const timeout = new Promise((_, reject) => { + timer = setTimeout(() => reject(new Error(`${label} timed out after ${ms}ms`)), ms); }); try { - const [stderr, exitCode] = await Promise.all([ - new Response(proc.stderr).text().catch(() => ""), - proc.exited.catch(() => -1), - ]); - if (controller.signal.aborted) throw new Error(`${label} timed out after ${opts.timeoutMs}ms`); - if (exitCode !== 0) throw new Error(`${label} failed:\n${stderr}`); + return await Promise.race([run(), timeout]); } finally { clearTimeout(timer); } @@ -87,38 +82,33 @@ async function linkProject(projectDir: string, configDir: string): Promise const appId = requireEnv("CLERK_CLI_TEST_APP_ID"); const platformAPIKey = requireEnv("CLERK_PLATFORM_API_KEY"); - await runStep("clerk link", ["bun", CLI_PATH, "--mode", "human", "link", "--app", appId], { - cwd: projectDir, - // PATH lets Bun.spawn resolve `bun`; the rest of the env stays isolated. 
- env: { - PATH: process.env.PATH, + const result = await Bun.$`bun ${CLI_PATH} --mode human link --app ${appId}` + .cwd(projectDir) + .env({ CLERK_CONFIG_DIR: configDir, CLERK_PLATFORM_API_KEY: platformAPIKey, - }, - timeoutMs: 60_000, - }); + }) + .quiet() + .nothrow(); + + assertSuccess("clerk link failed", result); } async function gitInit(projectDir: string): Promise { - await runStep( - "git init", - [ - "bash", - "-c", - 'git -c commit.gpgsign=false init && git add -A && git -c commit.gpgsign=false commit -m "init" --allow-empty', - ], - { - cwd: projectDir, - env: { + const result = + await Bun.$`git -c commit.gpgsign=false init && git add -A && git -c commit.gpgsign=false commit -m "init" --allow-empty` + .cwd(projectDir) + .env({ ...process.env, GIT_AUTHOR_NAME: "test", GIT_AUTHOR_EMAIL: "test@test.com", GIT_COMMITTER_NAME: "test", GIT_COMMITTER_EMAIL: "test@test.com", - }, - timeoutMs: 60_000, - }, - ); + }) + .quiet() + .nothrow(); + + assertSuccess("git init failed", result); } /** @@ -129,19 +119,16 @@ async function gitInit(projectDir: string): Promise { async function runClerkInit(projectDir: string, configDir: string): Promise { const platformAPIKey = requireEnv("CLERK_PLATFORM_API_KEY"); - await runStep( - "clerk init", - ["bun", CLI_PATH, "--mode", "human", "init", "--yes", "--no-skills"], - { - cwd: projectDir, - env: { - PATH: process.env.PATH, - CLERK_CONFIG_DIR: configDir, - CLERK_PLATFORM_API_KEY: platformAPIKey, - }, - timeoutMs: 90_000, - }, - ); + const result = await Bun.$`bun ${CLI_PATH} --mode human init --yes --no-skills` + .cwd(projectDir) + .env({ + CLERK_CONFIG_DIR: configDir, + CLERK_PLATFORM_API_KEY: platformAPIKey, + }) + .quiet() + .nothrow(); + + assertSuccess("clerk init failed", result); } /** Parse env files written by clerk init into a merged Record. 
@@ -194,9 +181,10 @@ export async function setupFixture(name: FixtureName): Promise { try { // Git-init before linking so the profile key matches for later commands - await gitInit(projectDir); - await linkProject(projectDir, configDir); - await runClerkInit(projectDir, configDir); + await withStepTimeout("git init", 60_000, () => gitInit(projectDir)); + // clerk link/init budgets back up the CLI's own per-request fetch timeout. + await withStepTimeout("clerk link", 60_000, () => linkProject(projectDir, configDir)); + await withStepTimeout("clerk init", 90_000, () => runClerkInit(projectDir, configDir)); const envVars = await parseEnvFiles(projectDir); @@ -212,11 +200,14 @@ export async function setupFixture(name: FixtureName): Promise { throw new Error(`${secretKeyName} not found in env files written by clerk init.`); } - await runStep( - "npm ci", - ["npm", "ci", "--ignore-scripts", "--legacy-peer-deps", "--no-audit", "--no-fund"], - { cwd: projectDir, env: process.env, timeoutMs: 240_000 }, + // --no-audit/--no-fund drop npm's advisory network round-trips during `ci`. + const install = await withStepTimeout("npm ci", 240_000, () => + Bun.$`npm ci --ignore-scripts --legacy-peer-deps --no-audit --no-fund` + .cwd(projectDir) + .quiet() + .nothrow(), ); + assertSuccess("npm ci failed", install); } catch (err) { await safeRm(projectDir); await safeRm(configDir); From e3bcc0071bca3dcadefea1dae84ed236cc7ad119 Mon Sep 17 00:00:00 2001 From: Rafael Thayto Tani Date: Mon, 22 Jun 2026 20:08:22 -0300 Subject: [PATCH 04/10] fix(e2e): drop per-step setup timeouts (revert Bun.spawn deadlock) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Bun.spawn `runStep` rewrite (5ce158a) regressed CI. 
`clerk init` runs an internal `npm install` with inherited stderr (init/heuristics.ts installSdk), so when the per-step AbortSignal SIGKILLed the CLI, the npm grandchild survived holding the stderr pipe open — `new Response(proc.stderr).text()` never EOF'd, the timeout never threw, and the 300s beforeAll fired instead. 3 fixtures hung. Root realization: `clerk init` and `npm ci` do package installs whose duration scales with CI contention, so any fixed per-step budget false-fails under load (clerk init blew past its 90s budget in the failing run). You can't fix contention-driven flakiness by capping variable-duration install work tighter. Fix: remove per-step timeouts entirely. The real root-cause fix — the 60s loggedFetch timeout — still bounds the only thing that can truly hang (network calls); `--parallel=4` cuts contention; the 300s beforeAll is the backstop. Setup steps return to plain Bun.$ (as on main). Removes runStep and its test. Claude-Session: https://claude.ai/code/session_01V1YkHZ2Ad1okwkX9bxTYsd --- test/e2e/lib/fixture-setup.ts | 40 +++++++++++------------------------ 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/test/e2e/lib/fixture-setup.ts b/test/e2e/lib/fixture-setup.ts index 7da1ef6c..b98345bf 100644 --- a/test/e2e/lib/fixture-setup.ts +++ b/test/e2e/lib/fixture-setup.ts @@ -56,23 +56,6 @@ async function safeRm(path: string): Promise { } } -/** - * Run a setup step under a hard timeout so a stall fails fast with a labeled - * error instead of silently burning the whole 300s `beforeAll` budget. The - * subprocess is left to settle on timeout — `beforeAll` is never retried. 
- */ -async function withStepTimeout(label: string, ms: number, run: () => Promise): Promise { - let timer: ReturnType | undefined; - const timeout = new Promise((_, reject) => { - timer = setTimeout(() => reject(new Error(`${label} timed out after ${ms}ms`)), ms); - }); - try { - return await Promise.race([run(), timeout]); - } finally { - clearTimeout(timer); - } -} - /** * Pre-link the project to the test Clerk application using an isolated * CLERK_CONFIG_DIR, so `clerk init` finds an existing link and skips the @@ -180,11 +163,14 @@ export async function setupFixture(name: FixtureName): Promise { let secretKey = ""; try { - // Git-init before linking so the profile key matches for later commands - await withStepTimeout("git init", 60_000, () => gitInit(projectDir)); - // clerk link/init budgets back up the CLI's own per-request fetch timeout. - await withStepTimeout("clerk link", 60_000, () => linkProject(projectDir, configDir)); - await withStepTimeout("clerk init", 90_000, () => runClerkInit(projectDir, configDir)); + // Git-init before linking so the profile key matches for later commands. + // These steps install packages (clerk init / npm ci) whose duration scales + // with CI load, so they're not given fixed per-step timeouts; the loggedFetch + // request timeout bounds the only thing that can truly hang (the network + // calls), and the 300s beforeAll is the outer backstop. + await gitInit(projectDir); + await linkProject(projectDir, configDir); + await runClerkInit(projectDir, configDir); const envVars = await parseEnvFiles(projectDir); @@ -201,12 +187,10 @@ export async function setupFixture(name: FixtureName): Promise { } // --no-audit/--no-fund drop npm's advisory network round-trips during `ci`. 
- const install = await withStepTimeout("npm ci", 240_000, () => - Bun.$`npm ci --ignore-scripts --legacy-peer-deps --no-audit --no-fund` - .cwd(projectDir) - .quiet() - .nothrow(), - ); + const install = await Bun.$`npm ci --ignore-scripts --legacy-peer-deps --no-audit --no-fund` + .cwd(projectDir) + .quiet() + .nothrow(); assertSuccess("npm ci failed", install); } catch (err) { await safeRm(projectDir); From 5cd16363ac77a1c370a1e3b4a666b6f4f7a29187 Mon Sep 17 00:00:00 2001 From: Rafael Thayto Tani Date: Mon, 22 Jun 2026 20:24:25 -0300 Subject: [PATCH 05/10] fix(e2e): bound npm ci fetch timeout to stop 300s setup hangs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The remaining flake is npm, not the CLI. `npm ci`'s default `fetch-timeout` is 300000ms — identical to the test's 300s beforeAll budget — so a single stalled npm registry connection hangs setup until the hook times out. (clerk init's installSdk skips here because the isolated env has no PATH, so npm ci is the only unbounded npm install.) - npm ci: add --fetch-timeout=60000 --fetch-retries=5 so a stalled fetch aborts at 60s and retries, mirroring the CLI's loggedFetch timeout. - Restore the debug-gated git/link/init/npm step markers so any residual hang names the exact step instead of an opaque "hook timed out". 
Claude-Session: https://claude.ai/code/session_01V1YkHZ2Ad1okwkX9bxTYsd --- test/e2e/lib/fixture-setup.ts | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/test/e2e/lib/fixture-setup.ts b/test/e2e/lib/fixture-setup.ts index b98345bf..36e6190f 100644 --- a/test/e2e/lib/fixture-setup.ts +++ b/test/e2e/lib/fixture-setup.ts @@ -158,19 +158,21 @@ export async function setupFixture(name: FixtureName): Promise { const projectDir = await mkdtemp(join(tmp, `clerk-e2e-${name}-`)); const configDir = await mkdtemp(join(tmp, "clerk-e2e-config-")); await copyFixture(fixtureDir, projectDir); + log("fixture copied"); let publishableKey = ""; let secretKey = ""; try { // Git-init before linking so the profile key matches for later commands. - // These steps install packages (clerk init / npm ci) whose duration scales - // with CI load, so they're not given fixed per-step timeouts; the loggedFetch - // request timeout bounds the only thing that can truly hang (the network - // calls), and the 300s beforeAll is the outer backstop. + // Step markers are debug-gated (CLERK_E2E_DEBUG) and pinpoint which step + // stalls if setup ever hits the 300s beforeAll budget. await gitInit(projectDir); + log("git init done"); await linkProject(projectDir, configDir); + log("clerk link done"); await runClerkInit(projectDir, configDir); + log("clerk init done"); const envVars = await parseEnvFiles(projectDir); @@ -186,12 +188,17 @@ export async function setupFixture(name: FixtureName): Promise { throw new Error(`${secretKeyName} not found in env files written by clerk init.`); } - // --no-audit/--no-fund drop npm's advisory network round-trips during `ci`. - const install = await Bun.$`npm ci --ignore-scripts --legacy-peer-deps --no-audit --no-fund` - .cwd(projectDir) - .quiet() - .nothrow(); + // npm's default fetch-timeout is 300s — the same as the beforeAll budget — + // so a single stalled registry connection hangs setup until the test times + // out. 
Bound each fetch to 60s and retry, mirroring the CLI's own request + // timeout. --no-audit/--no-fund drop npm's advisory network round-trips. + const install = + await Bun.$`npm ci --ignore-scripts --legacy-peer-deps --no-audit --no-fund --fetch-timeout=60000 --fetch-retries=5` + .cwd(projectDir) + .quiet() + .nothrow(); assertSuccess("npm ci failed", install); + log("npm ci done"); } catch (err) { await safeRm(projectDir); await safeRm(configDir); From beac04379eb514f71779cbad31f02f3f5022e560 Mon Sep 17 00:00:00 2001 From: Rafael Thayto Tani Date: Mon, 22 Jun 2026 20:27:04 -0300 Subject: [PATCH 06/10] fix(e2e): cap npm fetch-timeout so a stalled install can't hang setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The persistent 300s beforeAll hang was npm, not the CLI. npm's default fetch-timeout is 300000ms, so one stalled registry connection during either npm operation in setup blocks until the test budget expires. The previous commit bounded `npm ci` but missed the other one: `clerk init` runs an internal `npm install @clerk/<sdk>` (installSdk), which was still unbounded — that's what hung the Vue fixture at 300007ms. Write a project `.npmrc` (fetch-timeout=30s, fetch-retries=3) before any npm runs. Both `clerk init`'s install and `npm ci` use projectDir as cwd, so it covers both: a stalled fetch now aborts in 30s and retries on a fresh connection instead of waiting 5 minutes. Worst case ~120s, safely under the 300s budget. Drops the redundant per-command npm flags.
Claude-Session: https://claude.ai/code/session_01V1YkHZ2Ad1okwkX9bxTYsd --- test/e2e/lib/fixture-setup.ts | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/test/e2e/lib/fixture-setup.ts b/test/e2e/lib/fixture-setup.ts index 36e6190f..8529a3a1 100644 --- a/test/e2e/lib/fixture-setup.ts +++ b/test/e2e/lib/fixture-setup.ts @@ -42,6 +42,20 @@ async function copyFixture(fixtureDir: string, projectDir: string): Promise { + await Bun.write( + join(projectDir, ".npmrc"), + "fetch-timeout=30000\nfetch-retries=3\nfetch-retry-mintimeout=1000\nfetch-retry-maxtimeout=10000\n", + ); +} + /** * Best-effort recursive remove. Cleanup runs after the test has already * passed, so a stray filesystem error here must not fail the test. Bun's @@ -158,6 +172,7 @@ export async function setupFixture(name: FixtureName): Promise { const projectDir = await mkdtemp(join(tmp, `clerk-e2e-${name}-`)); const configDir = await mkdtemp(join(tmp, "clerk-e2e-config-")); await copyFixture(fixtureDir, projectDir); + await writeNpmrc(projectDir); log("fixture copied"); let publishableKey = ""; @@ -188,15 +203,12 @@ export async function setupFixture(name: FixtureName): Promise { throw new Error(`${secretKeyName} not found in env files written by clerk init.`); } - // npm's default fetch-timeout is 300s — the same as the beforeAll budget — - // so a single stalled registry connection hangs setup until the test times - // out. Bound each fetch to 60s and retry, mirroring the CLI's own request - // timeout. --no-audit/--no-fund drop npm's advisory network round-trips. - const install = - await Bun.$`npm ci --ignore-scripts --legacy-peer-deps --no-audit --no-fund --fetch-timeout=60000 --fetch-retries=5` - .cwd(projectDir) - .quiet() - .nothrow(); + // fetch-timeout/retries come from the project .npmrc (writeNpmrc); --no-audit + // and --no-fund drop npm's advisory network round-trips during `ci`. 
+ const install = await Bun.$`npm ci --ignore-scripts --legacy-peer-deps --no-audit --no-fund` + .cwd(projectDir) + .quiet() + .nothrow(); assertSuccess("npm ci failed", install); log("npm ci done"); } catch (err) { From a2fca892d68cac996af0528ce84bccb7bfe99b00 Mon Sep 17 00:00:00 2001 From: Rafael Thayto Tani Date: Mon, 22 Jun 2026 20:40:24 -0300 Subject: [PATCH 07/10] fix(e2e): serialize fixtures to stop Bun.$ subprocess stalls under load MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Across four CI runs the 300s beforeAll hang moved randomly between fixtures AND steps — including `git init`, a local, near-instant, near-silent command. That rules out npm, the network, loggedFetch and the earlier Bun.spawn pipe deadlock: the only thing that explains a trivial `git` subprocess hanging 300s intermittently and only under `--parallel` is Bun.$ subprocess spawning/reaping stalling under high concurrent load (each of 4 workers spawns git + 2 `bun` CLIs + npm + a dev server + chromium at once). Run fixtures serially (`--parallel=1`, still isolated) so at most one fixture's subprocesses run at a time. Bump the E2E job timeout 30->45m for the slower serial run. Keeps the .npmrc fetch-timeout and loggedFetch fixes. Claude-Session: https://claude.ai/code/session_01V1YkHZ2Ad1okwkX9bxTYsd --- .github/workflows/ci.yml | 4 +++- package.json | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e06e6b97..52e57513 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -99,7 +99,9 @@ jobs: runs-on: blacksmith-8vcpu-ubuntu-2404 container: image: mcr.microsoft.com/playwright:v1.60.0-noble - timeout-minutes: 30 + # Fixtures run serially (test:e2e --parallel=1) to avoid Bun.$ subprocess + # stalls under concurrency, so allow more wall-clock than the parallel run. 
+ timeout-minutes: 45 steps: - name: Install unzip (required by setup-bun) run: apt-get update && apt-get install -y unzip diff --git a/package.json b/package.json index 2656a0a2..57af27f6 100644 --- a/package.json +++ b/package.json @@ -8,7 +8,7 @@ "build": "bun run --filter @clerk/cli-core build", "dev": "bun run --cwd packages/cli-core dev", "test": "bun test 'packages/cli-core/src/' 'packages/extras/src/' 'scripts/' --parallel --only-failures", - "test:e2e": "bun test 'test/e2e/' --retry 1 --parallel=4 --only-failures", + "test:e2e": "bun test 'test/e2e/' --retry 1 --parallel=1 --only-failures", "test:e2e:op": "bun run scripts/run-e2e-op.ts", "e2e:refresh-fixtures": "bun run scripts/refresh-e2e-fixtures.ts", "typecheck": "bun run --filter './packages/*' typecheck && tsc --noEmit -p scripts/tsconfig.json && tsc --noEmit -p test/e2e/tsconfig.json", From cef0d3c94e9498d193376f54bd20ee98115d3fcc Mon Sep 17 00:00:00 2001 From: Rafael Thayto Tani Date: Mon, 22 Jun 2026 20:53:36 -0300 Subject: [PATCH 08/10] fix(e2e): retry clerk link/init on non-fetch stalls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Serializing fixtures fixed the contention-driven setup hangs, but exposed a second, independent flake: `clerk link` (and `init`) intermittently hang ~300s in a non-fetch path the CLI's loggedFetch timeout can't bound — in human mode they shell out to git and can stall on a git subprocess or prompt. It lands on a different fixture each run, so it's transient, not deterministic. Wrap both CLI steps in withRetry: a stall trips a hard timeout (90s/120s, above loggedFetch's 60s so genuinely-slow API calls aren't pre-empted) and the retry runs a fresh subprocess. Promise.race abandons the hung process (no stream deadlock); beforeAll isn't retried so the orphan can't cascade. 
--- test/e2e/lib/fixture-setup.ts | 36 +++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/test/e2e/lib/fixture-setup.ts b/test/e2e/lib/fixture-setup.ts index 8529a3a1..a3a82309 100644 --- a/test/e2e/lib/fixture-setup.ts +++ b/test/e2e/lib/fixture-setup.ts @@ -70,6 +70,36 @@ async function safeRm(path: string): Promise { } } +/** + * Run a step with a hard timeout, retrying once on a fresh subprocess. In human + * mode `clerk link`/`clerk init` shell out to git and can intermittently stall + * in a non-fetch path (a git subprocess, a prompt) that the CLI's own request + * timeout doesn't bound — which would otherwise burn the whole 300s beforeAll + * budget. Promise.race abandons a hung subprocess (no stream deadlock), and the + * retry lands on a clean run; beforeAll is not retried, so a brief orphan can't + * cascade. + */ +async function withRetry(label: string, timeoutMs: number, fn: () => Promise): Promise { + for (let attempt = 1; attempt <= 2; attempt++) { + let timer: ReturnType | undefined; + const timeout = new Promise((_, reject) => { + timer = setTimeout( + () => reject(new Error(`${label} timed out after ${timeoutMs}ms`)), + timeoutMs, + ); + }); + try { + await Promise.race([fn(), timeout]); + return; + } catch (err) { + if (attempt === 2) throw err; + log(`${label} attempt ${attempt} failed (${err}); retrying`); + } finally { + clearTimeout(timer); + } + } +} + /** * Pre-link the project to the test Clerk application using an isolated * CLERK_CONFIG_DIR, so `clerk init` finds an existing link and skips the @@ -184,9 +214,11 @@ export async function setupFixture(name: FixtureName): Promise { // stalls if setup ever hits the 300s beforeAll budget. await gitInit(projectDir); log("git init done"); - await linkProject(projectDir, configDir); + // Budgets sit above loggedFetch's 60s request timeout so a genuinely slow + // API call is handled there; withRetry only trips on a non-fetch stall. 
+ await withRetry("clerk link", 90_000, () => linkProject(projectDir, configDir)); log("clerk link done"); - await runClerkInit(projectDir, configDir); + await withRetry("clerk init", 120_000, () => runClerkInit(projectDir, configDir)); log("clerk init done"); const envVars = await parseEnvFiles(projectDir); From aa14b3e3a96a563cf84e887cf2ab7f2d57e69abd Mon Sep 17 00:00:00 2001 From: Rafael Thayto Tani Date: Mon, 22 Jun 2026 21:38:24 -0300 Subject: [PATCH 09/10] fix(e2e): wrap all setup steps in retry; restore parallel run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Harden the setup against the intermittent Bun.$ subprocess stall (a spawned git/clerk/npm step occasionally never resolves — verified a Promise.race timeout still fires during the hang, so a retry recovers it). - withRetry now wraps every step: git init, clerk link, clerk init, npm ci. A hung attempt is abandoned at its budget and a fresh subprocess retried. - Tighten the project .npmrc (fetch-timeout 30s->20s, retries 3->2) so a real npm stall resolves well under the step budgets and can't false-trip them. - Restore --parallel=4 (retry absorbs the higher hang frequency) and revert the E2E job timeout to 30m. Keeps the loggedFetch 60s request timeout (bounds the CLI's own API calls). Claude-Session: https://claude.ai/code/session_01V1YkHZ2Ad1okwkX9bxTYsd --- .github/workflows/ci.yml | 4 +--- package.json | 2 +- test/e2e/lib/fixture-setup.ts | 22 ++++++++++++---------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 52e57513..e06e6b97 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -99,9 +99,7 @@ jobs: runs-on: blacksmith-8vcpu-ubuntu-2404 container: image: mcr.microsoft.com/playwright:v1.60.0-noble - # Fixtures run serially (test:e2e --parallel=1) to avoid Bun.$ subprocess - # stalls under concurrency, so allow more wall-clock than the parallel run. 
- timeout-minutes: 45 + timeout-minutes: 30 steps: - name: Install unzip (required by setup-bun) run: apt-get update && apt-get install -y unzip diff --git a/package.json b/package.json index 57af27f6..2656a0a2 100644 --- a/package.json +++ b/package.json @@ -8,7 +8,7 @@ "build": "bun run --filter @clerk/cli-core build", "dev": "bun run --cwd packages/cli-core dev", "test": "bun test 'packages/cli-core/src/' 'packages/extras/src/' 'scripts/' --parallel --only-failures", - "test:e2e": "bun test 'test/e2e/' --retry 1 --parallel=1 --only-failures", + "test:e2e": "bun test 'test/e2e/' --retry 1 --parallel=4 --only-failures", "test:e2e:op": "bun run scripts/run-e2e-op.ts", "e2e:refresh-fixtures": "bun run scripts/refresh-e2e-fixtures.ts", "typecheck": "bun run --filter './packages/*' typecheck && tsc --noEmit -p scripts/tsconfig.json && tsc --noEmit -p test/e2e/tsconfig.json", diff --git a/test/e2e/lib/fixture-setup.ts b/test/e2e/lib/fixture-setup.ts index a3a82309..0c849312 100644 --- a/test/e2e/lib/fixture-setup.ts +++ b/test/e2e/lib/fixture-setup.ts @@ -45,14 +45,14 @@ async function copyFixture(fixtureDir: string, projectDir: string): Promise { await Bun.write( join(projectDir, ".npmrc"), - "fetch-timeout=30000\nfetch-retries=3\nfetch-retry-mintimeout=1000\nfetch-retry-maxtimeout=10000\n", + "fetch-timeout=20000\nfetch-retries=2\nfetch-retry-mintimeout=1000\nfetch-retry-maxtimeout=8000\n", ); } @@ -212,7 +212,7 @@ export async function setupFixture(name: FixtureName): Promise { // Git-init before linking so the profile key matches for later commands. // Step markers are debug-gated (CLERK_E2E_DEBUG) and pinpoint which step // stalls if setup ever hits the 300s beforeAll budget. - await gitInit(projectDir); + await withRetry("git init", 30_000, () => gitInit(projectDir)); log("git init done"); // Budgets sit above loggedFetch's 60s request timeout so a genuinely slow // API call is handled there; withRetry only trips on a non-fetch stall. 
@@ -237,11 +237,13 @@ export async function setupFixture(name: FixtureName): Promise { // fetch-timeout/retries come from the project .npmrc (writeNpmrc); --no-audit // and --no-fund drop npm's advisory network round-trips during `ci`. - const install = await Bun.$`npm ci --ignore-scripts --legacy-peer-deps --no-audit --no-fund` - .cwd(projectDir) - .quiet() - .nothrow(); - assertSuccess("npm ci failed", install); + await withRetry("npm ci", 120_000, async () => { + const install = await Bun.$`npm ci --ignore-scripts --legacy-peer-deps --no-audit --no-fund` + .cwd(projectDir) + .quiet() + .nothrow(); + assertSuccess("npm ci failed", install); + }); log("npm ci done"); } catch (err) { await safeRm(projectDir); From d0bf0ebe0f6c291dbff191c290a14cff74ed9d92 Mon Sep 17 00:00:00 2001 From: Rafael Thayto Tani Date: Tue, 23 Jun 2026 13:09:58 -0300 Subject: [PATCH 10/10] fix(e2e): link in agent mode so the retry is idempotent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The retry on `clerk link` was making things worse: attempt 1 writes the profile then the process intermittently hangs (a lingering handle after setProfile, not a fetch — confirmed AbortSignal.timeout is unref'd), so withRetry kills it at 90s; attempt 2 then ran `clerk link --mode human` on the now-linked project, hit the interactive "re-link?" confirm prompt, and failed with "Already linked" (3/3 rerun sample failed this way). Run link in `--mode agent`: on an already-linked project it prints status and exits 0 instead of prompting, so the retry's second attempt succeeds. `clerk init` is already idempotent on re-run ("Clerk is already set up" -> exit 0). 
--- test/e2e/lib/fixture-setup.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/e2e/lib/fixture-setup.ts b/test/e2e/lib/fixture-setup.ts index 0c849312..2b06df9b 100644 --- a/test/e2e/lib/fixture-setup.ts +++ b/test/e2e/lib/fixture-setup.ts @@ -109,7 +109,10 @@ async function linkProject(projectDir: string, configDir: string): Promise const appId = requireEnv("CLERK_CLI_TEST_APP_ID"); const platformAPIKey = requireEnv("CLERK_PLATFORM_API_KEY"); - const result = await Bun.$`bun ${CLI_PATH} --mode human link --app ${appId}` + // Agent mode keeps link non-interactive: if a retry re-runs it on a project + // the first (hung-then-killed) attempt already linked, agent mode prints + // "already linked" and exits 0 instead of blocking on a human confirm prompt. + const result = await Bun.$`bun ${CLI_PATH} --mode agent link --app ${appId}` .cwd(projectDir) .env({ CLERK_CONFIG_DIR: configDir,