diff --git a/.changeset/lazy-app-init-sharding.md b/.changeset/lazy-app-init-sharding.md new file mode 100644 index 00000000000..a845151cc84 --- /dev/null +++ b/.changeset/lazy-app-init-sharding.md @@ -0,0 +1,2 @@ +--- +--- diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cab3138702e..3940405f6b3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -275,7 +275,7 @@ jobs: integration-tests: needs: [check-permissions, build-packages] if: ${{ github.event_name != 'pull_request' || github.event.pull_request.draft == false }} - name: Integration Tests (${{ matrix.test-name }}, ${{ matrix.test-project }}${{ matrix.next-version && format(', {0}', matrix.next-version) || '' }}) + name: Integration Tests (${{ matrix.test-name }}, ${{ matrix.test-project }}${{ matrix.next-version && format(', {0}', matrix.next-version) || '' }}${{ matrix.shard && format(', shard {0}', matrix.shard) || '' }}) permissions: contents: read actions: write # needed for actions/upload-artifact @@ -315,9 +315,33 @@ jobs: - test-name: "nextjs" test-project: "chrome" next-version: "15" + shard: "1/3" + shard-label: "1-of-3" + - test-name: "nextjs" + test-project: "chrome" + next-version: "15" + shard: "2/3" + shard-label: "2-of-3" + - test-name: "nextjs" + test-project: "chrome" + next-version: "15" + shard: "3/3" + shard-label: "3-of-3" + - test-name: "nextjs" + test-project: "chrome" + next-version: "16" + shard: "1/3" + shard-label: "1-of-3" + - test-name: "nextjs" + test-project: "chrome" + next-version: "16" + shard: "2/3" + shard-label: "2-of-3" - test-name: "nextjs" test-project: "chrome" next-version: "16" + shard: "3/3" + shard-label: "3-of-3" - test-name: "quickstart" test-project: "chrome" next-version: "15" @@ -365,6 +389,7 @@ jobs: E2E_NEXTJS_VERSION: ${{ matrix.next-version }} E2E_PROJECT: ${{ matrix.test-project }} INTEGRATION_INSTANCE_KEYS: ${{ secrets.INTEGRATION_INSTANCE_KEYS }} + PLAYWRIGHT_SHARD: ${{ matrix.shard || '' }} run: | # Use turbo's built-in --affected flag to detect changes # This automatically uses GITHUB_BASE_REF in GitHub Actions @@ -449,13 +474,14 @@ jobs: E2E_CLERK_ENCRYPTION_KEY: ${{ matrix.clerk-encryption-key }} INTEGRATION_INSTANCE_KEYS: ${{ secrets.INTEGRATION_INSTANCE_KEYS }} NODE_EXTRA_CA_CERTS: ${{ github.workspace }}/integration/certs/rootCA.pem + PLAYWRIGHT_SHARD: ${{ matrix.shard || '' }} VERCEL_AUTOMATION_BYPASS_SECRET: ${{ secrets.VERCEL_AUTOMATION_BYPASS_SECRET }} - name: Upload test-results if: ${{ cancelled() || failure() }} uses: actions/upload-artifact@v4 with: - name: playwright-traces-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.test-name }}${{ matrix.next-version && format('-next{0}', matrix.next-version) || '' }} + name: playwright-traces-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.test-name }}${{ matrix.next-version && format('-next{0}', matrix.next-version) || '' }}${{ matrix.shard-label && format('-shard-{0}', matrix.shard-label) || '' }} path: integration/test-results retention-days: 1 diff --git a/integration/models/longRunningApplication.ts b/integration/models/longRunningApplication.ts index 18be6c14204..b1a023269c5 100644 --- a/integration/models/longRunningApplication.ts +++ b/integration/models/longRunningApplication.ts @@ -1,7 +1,7 @@ import { parsePublishableKey } from '@clerk/shared/keys'; import { clerkSetup } from '@clerk/testing/playwright'; -import { awaitableTreekill, fs } from '../scripts'; +import { acquireProcessLock, awaitableTreekill, fs } from '../scripts'; import type { Application } from './application'; import type { ApplicationConfig } from './applicationConfig'; import type { EnvironmentConfig } from './environment'; @@ -16,6 +16,18 @@ const getPort = (_url: string) => { return Number.parseInt(url.port || (url.protocol === 'https:' ? '443' : '80')); }; +/** + * Check if a server is responding at the given URL. + */ +const isServerReady = async (url: string): Promise => { + try { + const res = await fetch(url); + return res.ok; + } catch { + return false; + } +}; + export type LongRunningApplication = ReturnType; export type LongRunningApplicationParams = { id: string; @@ -29,7 +41,8 @@ export type LongRunningApplicationParams = { * Its interface is the same as the Application and the ApplicationConfig interface, * making it interchangeable with the Application and ApplicationConfig. * - * After init() is called, all mutating methods on the config are ignored. + * init() is lazy and idempotent: it checks the state file first, and uses + * file-based locking to ensure only one process initializes each app. */ export const longRunningApplication = (params: LongRunningApplicationParams) => { const { id } = params; @@ -54,92 +67,195 @@ export const longRunningApplication = (params: LongRunningApplicationParams) => env ||= environmentConfig().fromJson(data.env); }; + /** + * Try to adopt an already-running app from the state file. + * Returns true if the app is running and state was loaded. + */ + const tryAdoptFromStateFile = async (): Promise => { + try { + const apps = stateFile.getLongRunningApps(); + const data = apps?.[id]; + if (!data?.serverUrl) { + return false; + } + const ready = await isServerReady(data.serverUrl); + if (ready) { + port = data.port; + serverUrl = data.serverUrl; + pid = data.pid; + appDir = data.appDir; + env = params.env; + // Propagate testing tokens to this worker process so that + // setupClerkTestingToken() can bypass bot protection. + if (data.clerkFapi) { + process.env.CLERK_FAPI = data.clerkFapi; + } + if (data.clerkTestingToken) { + process.env.CLERK_TESTING_TOKEN = data.clerkTestingToken; + } + return true; + } + return false; + } catch { + // State file may be partially written by another process — not an error + return false; + } + }; + + /** + * Perform the full app initialization: testing tokens, commit, install, build, serve. + */ + const doFullInit = async () => { + const log = (msg: string) => console.log(`[${name}] ${msg}`); + log('Starting full init...'); + + try { + const publishableKey = params.env.publicVariables.get('CLERK_PUBLISHABLE_KEY'); + const secretKey = params.env.privateVariables.get('CLERK_SECRET_KEY'); + const apiUrl = params.env.privateVariables.get('CLERK_API_URL'); + const { instanceType, frontendApi: frontendApiUrl } = parsePublishableKey(publishableKey); + + if (instanceType !== 'development') { + log('Skipping setup of testing tokens for non-development instance'); + } else { + log('Setting up testing tokens...'); + await clerkSetup({ + publishableKey, + frontendApiUrl, + secretKey, + // @ts-expect-error apiUrl is not a typed option for clerkSetup, but it is accepted at runtime. + apiUrl, + dotenv: false, + }); + log('Testing tokens setup complete'); + } + } catch (error) { + console.error('Error setting up testing tokens:', error); + throw error; + } + + try { + log('Committing config...'); + app = await config.commit(); + log(`Config committed, appDir: ${app.appDir}`); + } catch (error) { + console.error('Error committing config:', error); + throw error; + } + + try { + await app.withEnv(params.env); + } catch (error) { + console.error('Error setting up environment:', error); + throw error; + } + + try { + log('Running setup (pnpm install)...'); + await app.setup(); + log('Setup complete'); + } catch (error) { + console.error('Error during app setup:', error); + throw error; + } + + try { + log('Building app...'); + const buildTimeout = new Promise((_, reject) => + setTimeout(() => reject(new Error(`Build timed out after 120s for ${name}`)), 120_000), + ); + await Promise.race([app.build(), buildTimeout]); + log('Build complete'); + } catch (error) { + console.error('Error during app build:', error); + throw error; + } + + try { + log('Starting serve (detached)...'); + const serveResult = await app.serve({ detached: true }); + port = serveResult.port; + serverUrl = serveResult.serverUrl; + pid = serveResult.pid; + appDir = app.appDir; + log(`Serve complete: port=${port}, serverUrl=${serverUrl}, pid=${pid}`); + // Serialize state file writes across all apps to prevent concurrent + // read-modify-write from clobbering entries written by other workers. + const releaseStateFileLock = await acquireProcessLock('__state-file__'); + try { + stateFile.addLongRunningApp({ + port, + serverUrl, + pid, + id, + appDir, + env: params.env.toJson(), + clerkFapi: process.env.CLERK_FAPI, + clerkTestingToken: process.env.CLERK_TESTING_TOKEN, + }); + } finally { + releaseStateFileLock(); + } + } catch (error) { + console.error('Error during app serve:', error); + throw error; + } + }; + const self = new Proxy( { - // will be called by global.setup.ts and by the test runner - // the first time this is called, the app starts and the state is persisted in the state file + /** + * Lazy, idempotent init. Safe to call from multiple Playwright workers. + * - If the app is already running (found in state file + server responds), reuses it. + * - Otherwise, acquires a file lock and initializes. Other workers wait for the lock. + */ init: async () => { const log = (msg: string) => console.log(`[${name}] ${msg}`); - log('Starting init...'); - try { - const publishableKey = params.env.publicVariables.get('CLERK_PUBLISHABLE_KEY'); - const secretKey = params.env.privateVariables.get('CLERK_SECRET_KEY'); - const apiUrl = params.env.privateVariables.get('CLERK_API_URL'); - const { instanceType, frontendApi: frontendApiUrl } = parsePublishableKey(publishableKey); - - if (instanceType !== 'development') { - log('Skipping setup of testing tokens for non-development instance'); - } else { - log('Setting up testing tokens...'); - await clerkSetup({ - publishableKey, - frontendApiUrl, - secretKey, - // @ts-expect-error apiUrl is not a typed option for clerkSetup, but it is accepted at runtime. - apiUrl, - dotenv: false, - }); - log('Testing tokens setup complete'); - } - } catch (error) { - console.error('Error setting up testing tokens:', error); - throw error; - } - try { - log('Committing config...'); - app = await config.commit(); - log(`Config committed, appDir: ${app.appDir}`); - } catch (error) { - console.error('Error committing config:', error); - throw error; - } - try { - await app.withEnv(params.env); - } catch (error) { - console.error('Error setting up environment:', error); - throw error; - } - try { - log('Running setup (pnpm install)...'); - await app.setup(); - log('Setup complete'); - } catch (error) { - console.error('Error during app setup:', error); - throw error; + + // Fast path: already initialized in this process + if (serverUrl && (await isServerReady(serverUrl))) { + log('Already initialized in this process'); + return; } - try { - log('Building app...'); - const buildTimeout = new Promise((_, reject) => - setTimeout(() => reject(new Error(`Build timed out after 120s for ${name}`)), 120_000), - ); - await Promise.race([app.build(), buildTimeout]); - log('Build complete'); - } catch (error) { - console.error('Error during app build:', error); - throw error; + + // Check if another process already initialized this app + if (await tryAdoptFromStateFile()) { + log(`Adopted from state file: ${serverUrl}`); + return; } + + // Need to initialize — acquire lock to prevent duplicate work + log('Acquiring init lock...'); + const releaseLock = await acquireProcessLock(id); try { - log('Starting serve (detached)...'); - const serveResult = await app.serve({ detached: true }); - port = serveResult.port; - serverUrl = serveResult.serverUrl; - pid = serveResult.pid; - appDir = app.appDir; - log(`Serve complete: port=${port}, serverUrl=${serverUrl}, pid=${pid}`); - stateFile.addLongRunningApp({ port, serverUrl, pid, id, appDir, env: params.env.toJson() }); - } catch (error) { - console.error('Error during app serve:', error); - throw error; + // Double-check after acquiring lock (another process may have finished while we waited) + if (await tryAdoptFromStateFile()) { + log(`Adopted from state file after lock: ${serverUrl}`); + return; + } + + // We hold the lock and the app is not running — do full init + await doFullInit(); + } finally { + releaseLock(); } }, // will be called by global.teardown.ts destroy: async () => { readFromStateFile(); + if (!pid && !appDir) { + console.log(`Skipping destroy for ${name}: no pid or appDir`); + return; + } console.log(`Destroying ${serverUrl}`); - await awaitableTreekill(pid, 'SIGKILL'); - // TODO: Test whether this is necessary now that we have awaitableTreekill - await new Promise(res => setTimeout(res, 2000)); - await fs.rm(appDir, { recursive: true, force: true }); + if (pid) { + await awaitableTreekill(pid, 'SIGKILL'); + // TODO: Test whether this is necessary now that we have awaitableTreekill + await new Promise(res => setTimeout(res, 2000)); + } + if (appDir) { + await fs.rm(appDir, { recursive: true, force: true }); + } }, // read the persisted state and behave like an app commit: () => { diff --git a/integration/models/stateFile.ts b/integration/models/stateFile.ts index e5713b422f4..d662f90b87d 100644 --- a/integration/models/stateFile.ts +++ b/integration/models/stateFile.ts @@ -9,6 +9,8 @@ export type AppParams = { pid?: number; appDir: string; env: ReturnType; + clerkFapi?: string; + clerkTestingToken?: string; }; type StandaloneAppParams = { diff --git a/integration/playwright.config.ts b/integration/playwright.config.ts index 007d17768cc..ae82cbf8886 100644 --- a/integration/playwright.config.ts +++ b/integration/playwright.config.ts @@ -24,8 +24,21 @@ export const common: PlaywrightTestConfig = { }, } as const; +// Parse optional shard from env (e.g., PLAYWRIGHT_SHARD="1/3") +const parseShard = (shardEnv: string | undefined) => { + if (!shardEnv) { + return undefined; + } + const [current, total] = shardEnv.split('/').map(Number); + if (!current || !total || current > total) { + return undefined; + } + return { current, total }; +}; + export default defineConfig({ ...common, + shard: parseShard(process.env.PLAYWRIGHT_SHARD), projects: [ { diff --git a/integration/scripts/index.ts b/integration/scripts/index.ts index ff301be7798..53ed3455b14 100644 --- a/integration/scripts/index.ts +++ b/integration/scripts/index.ts @@ -17,3 +17,4 @@ export { awaitableTreekill } from './awaitableTreekill'; export { startClerkJsHttpServer, killClerkJsHttpServer } from './clerkJsServer'; export { startClerkUiHttpServer, killClerkUiHttpServer } from './clerkUiServer'; export { startHttpServer, killHttpServer, getTempDir } from './httpServer'; +export { acquireProcessLock } from './processLock'; diff --git a/integration/scripts/processLock.ts b/integration/scripts/processLock.ts new file mode 100644 index 00000000000..345a6515d38 --- /dev/null +++ b/integration/scripts/processLock.ts @@ -0,0 +1,62 @@ +import * as path from 'node:path'; + +import { constants } from '../constants'; + +import { fs } from './index'; + +const LOCK_DIR = path.join(constants.TMP_DIR, '.locks'); +const STALE_LOCK_MS = 120_000; // 2 minutes + +/** + * Acquire a file-based lock. Returns a release function. + * If the lock is held by another process, polls until it becomes available. + * Stale locks (older than STALE_LOCK_MS) are automatically reclaimed. + */ +export const acquireProcessLock = async ( + lockName: string, + opts: { timeoutMs?: number; pollIntervalMs?: number } = {}, +): Promise<() => void> => { + const { timeoutMs = 180_000, pollIntervalMs = 500 } = opts; + const lockFile = path.join(LOCK_DIR, `${lockName}.lock`); + await fs.ensureDir(LOCK_DIR); + + const start = Date.now(); + + while (Date.now() - start < timeoutMs) { + try { + // Atomic create — fails with EEXIST if lock is held + fs.writeFileSync(lockFile, JSON.stringify({ pid: process.pid, time: Date.now() }), { flag: 'wx' }); + // Lock acquired + return () => { + try { + fs.unlinkSync(lockFile); + } catch { + // Lock file already removed — not an error + } + }; + } catch (e: any) { + if (e.code !== 'EEXIST') { + throw e; + } + + // Lock exists — check for staleness + try { + const content = fs.readFileSync(lockFile, 'utf-8'); + const { time } = JSON.parse(content); + if (Date.now() - time > STALE_LOCK_MS) { + // Stale lock — reclaim it + fs.unlinkSync(lockFile); + continue; + } + } catch { + // Lock was released between our check and read — retry + continue; + } + + // Lock is held and not stale — wait + await new Promise(resolve => setTimeout(resolve, pollIntervalMs)); + } + } + + throw new Error(`Timed out waiting for lock "${lockName}" after ${timeoutMs}ms`); +}; diff --git a/integration/testUtils/testAgainstRunningApps.ts b/integration/testUtils/testAgainstRunningApps.ts index 2b8c67752d7..9afbc7bf77e 100644 --- a/integration/testUtils/testAgainstRunningApps.ts +++ b/integration/testUtils/testAgainstRunningApps.ts @@ -46,6 +46,15 @@ export function testAgainstRunningApps(runningAppsParams: RunningAppsParams) { test.describe(title, () => { runningApps(runningAppsParams).forEach(app => { test.describe(`${app.name}`, () => { + test.beforeAll(async () => { + // Workers may block waiting for another worker to finish initializing + // an app, so allow up to 5 minutes for the lock + init sequence. + test.setTimeout(300_000); + // Lazy init: starts the app if it's not already running. + // Uses file-based locking so only one worker initializes each app. + await app.init(); + }); + cb({ app }); }); }); diff --git a/integration/tests/global.setup.ts b/integration/tests/global.setup.ts index 9125fab770d..2059b7f572b 100644 --- a/integration/tests/global.setup.ts +++ b/integration/tests/global.setup.ts @@ -1,8 +1,7 @@ import { test as setup } from '@playwright/test'; import { constants } from '../constants'; -import { appConfigs } from '../presets'; -import { fs, parseEnvOptions, startClerkJsHttpServer, startClerkUiHttpServer } from '../scripts'; +import { fs, startClerkJsHttpServer, startClerkUiHttpServer } from '../scripts'; setup('start long running apps', async () => { setup.setTimeout(300_000); @@ -12,14 +11,8 @@ setup('start long running apps', async () => { await startClerkJsHttpServer(); await startClerkUiHttpServer(); - const { appIds } = parseEnvOptions(); - if (appIds.length) { - const apps = appConfigs.longRunningApps.getByPattern(appIds); - // state cannot be shared using playwright, - // so we save the state in a file using a strategy similar to `storageState` - await Promise.all(apps.map(app => app.init())); - } else { - // start a single app using the available env variables - } - console.log('Apps started'); + // Apps are now initialized lazily by testAgainstRunningApps via test.beforeAll. + // Each app is started on-demand when a test first needs it, using file-based + // locking to prevent duplicate initialization across Playwright workers. + console.log('HTTP servers started. Apps will initialize lazily.'); }); diff --git a/integration/tests/handshake.test.ts b/integration/tests/handshake.test.ts index dc6975fc524..89167fd1577 100644 --- a/integration/tests/handshake.test.ts +++ b/integration/tests/handshake.test.ts @@ -1076,7 +1076,7 @@ test.describe('Client handshake with organization activation @nextjs', () => { }); test.afterAll('setup local Clerk API mock', async () => { - await app.teardown(); + await app?.teardown(); return new Promise(resolve => jwksServer.close(() => resolve())); }); @@ -1460,7 +1460,7 @@ test.describe('Client handshake with an organization activation avoids infinite }); test.afterAll('setup local Clerk API mock', async () => { - await thisApp.teardown(); + await thisApp?.teardown(); return new Promise(resolve => jwksServer.close(() => resolve())); }); diff --git a/turbo.json b/turbo.json index 3059180e78f..3bd7cb46222 100644 --- a/turbo.json +++ b/turbo.json @@ -222,7 +222,7 @@ "outputLogs": "new-only" }, "//#test:integration:nextjs": { - "env": ["CLEANUP", "DEBUG", "E2E_*", "INTEGRATION_INSTANCE_KEYS"], + "env": ["CLEANUP", "DEBUG", "E2E_*", "INTEGRATION_INSTANCE_KEYS", "PLAYWRIGHT_SHARD"], "inputs": ["integration/**"], "outputLogs": "new-only" },