From af826c6653505448bfca4f09b343cc04ec5f31cd Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Mon, 18 May 2026 18:29:51 -0400 Subject: [PATCH 1/3] CI flake diagnostics: readiness-check + startup phase logging The "Error: Timed out waiting for: .../_readiness-check" failure in the host-test job recurs without enough log signal to root-cause. Today the realm-server log artifact jumps from "is starting indexing" to nothing for 10 minutes; the worker log similarly silences between "completed invalidations" and "completed index visit"; the wait-on output never shows what HTTP response (if any) it saw. Temporary diagnostic logging, all tagged "[ci-readiness-diag]": - Realm#readinessCheck: entry log, 15s heartbeat while #startedUp is pending, fulfillment log with total wait time. - Realm.#startup: per-phase info logs (copy / isNewIndex / fullIndexOnStartup / awaiting fullIndex / awaiting isWorldReadable / complete) so we can see exactly where a realm is parked. - RealmIndexUpdater.publishFullIndex: log job_id and queue.publish duration the moment the from-scratch job is durably enqueued. - IndexRunner.fromScratch: visit-loop progress every 25 files (filesCompleted / totalFiles / lastUrl / elapsedMs). - ci-host.yaml host-test step: DEBUG=start-server-and-test so wait-on prints each probe's HTTP status / connection error. Removal: `git grep ci-readiness-diag` locates every block. Each new log line carries the "[ci-readiness-diag]" prefix in its message so search on log artifacts also finds them. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/ci-host.yaml | 9 +++ packages/runtime-common/index-runner.ts | 17 +++++ .../runtime-common/realm-index-updater.ts | 12 +++- packages/runtime-common/realm.ts | 67 ++++++++++++++++++- 4 files changed, 101 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci-host.yaml b/.github/workflows/ci-host.yaml index ba2e54c3e0f..78b714febcb 100644 --- a/.github/workflows/ci-host.yaml +++ b/.github/workflows/ci-host.yaml @@ -251,6 +251,15 @@ jobs: HOST_TEST_PARTITION: ${{ matrix.shardIndex }} HOST_TEST_PARTITION_COUNT: ${{ matrix.shardTotal }} DBUS_SYSTEM_BUS_ADDRESS: unix:path=/run/dbus/system_bus_socket + # [ci-readiness-diag] — start-server-and-test reads DEBUG and + # propagates `log: true` + `verbose: true` to wait-on, so each + # readiness-probe attempt prints its HTTP status / connection + # error. Distinguishes "server not listening" from "server + # hanging mid-request" from "server returning 500" when the + # 10-minute wait-on timer fires. Remove together with the + # other [ci-readiness-diag] markers once the root cause is + # identified. + DEBUG: start-server-and-test working-directory: packages/host - name: Upload junit report to GitHub Actions Artifacts diff --git a/packages/runtime-common/index-runner.ts b/packages/runtime-common/index-runner.ts index 06741f00da6..5b0adaae70b 100644 --- a/packages/runtime-common/index-runner.ts +++ b/packages/runtime-common/index-runner.ts @@ -216,6 +216,15 @@ export class IndexRunner { }); try { let filesCompleted = 0; + // [ci-readiness-diag] — visit-loop heartbeat. Today the only + // markers between `completed invalidations in Xms` and + // `completed index visit in Yms` are per-file onProgress events + // (callback-routed, not in the worker log). A stuck visit looks + // like a 10-minute log gap. Emit a progress line every 25 files + // so the realm-server/worker artifact pinpoints WHERE the loop + // parked. Remove together with the other [ci-readiness-diag] + // blocks once the root cause is identified. + const VISIT_PROGRESS_INTERVAL = 25; for (let invalidation of invalidations) { // Resume guard. If a previous attempt of this same job already // wrote URL_X to the working table AND the EFS mtime hasn't @@ -252,6 +261,14 @@ export class IndexRunner { filesCompleted, totalFiles: invalidations.length, }); + if ( + filesCompleted % VISIT_PROGRESS_INTERVAL === 0 || + filesCompleted === invalidations.length + ) { + current.#perfLog.info( + `[ci-readiness-diag] ${jobIdentity(current.#jobInfo)} visit progress ${filesCompleted}/${invalidations.length} for realm ${current.realmURL.href} (elapsedMs=${Date.now() - visitStart}, last=${invalidation.href})`, + ); + } } if (resumedSkipped > 0) { current.#perfLog.debug( diff --git a/packages/runtime-common/realm-index-updater.ts b/packages/runtime-common/realm-index-updater.ts index d9b74b2cab9..52d8b1de0e1 100644 --- a/packages/runtime-common/realm-index-updater.ts +++ b/packages/runtime-common/realm-index-updater.ts @@ -142,8 +142,8 @@ export class RealmIndexUpdater { let startedAt = performance.now(); this.#log.info(`Realm ${this.realmURL.href} is starting indexing`); - let published = (async () => - await enqueueReindexRealmJob( + let published = (async () => { + let job = await enqueueReindexRealmJob( this.#realm.url, await this.#realm.getRealmOwnerUsername(), this.#queue, @@ -152,7 +152,13 @@ export class RealmIndexUpdater { { clearLastModified: opts?.clearLastModified, }, - ))(); + ); + // [ci-readiness-diag] — see also realm.ts readinessCheck + #startup + this.#log.info( + `[ci-readiness-diag] Realm ${this.realmURL.href} from-scratch job durably enqueued as job_id=${job.id} (queue.publish took ${((performance.now() - startedAt) / 1000).toFixed(2)}s)`, + ); + return job; + })(); let completed = published .then(async (job) => { diff --git a/packages/runtime-common/realm.ts b/packages/runtime-common/realm.ts index 5f5f4a307e6..8eda2d4fa1b 100644 --- a/packages/runtime-common/realm.ts +++ b/packages/runtime-common/realm.ts @@ -1065,7 +1065,42 @@ export class Realm { _request: Request, requestContext: RequestContext, ) { - await this.#startedUp.promise; + // [ci-readiness-diag] — block runs from this line to the matching + // marker below. Remove the entry/heartbeat/exit logs together once + // CI flake is rooted out; the bare `await this.#startedUp.promise` + // is the only behavioral statement. + let waitStart = Date.now(); + this.#log.info( + `[ci-readiness-diag] readiness check requested for realm ${this.url}`, + ); + let heartbeat: ReturnType | undefined = setInterval( + () => { + let elapsed = ((Date.now() - waitStart) / 1000).toFixed(1); + this.#log.info( + `[ci-readiness-diag] readiness check still waiting on #startedUp for realm ${this.url} (elapsed=${elapsed}s)`, + ); + }, + 15_000, + ); + if ( + heartbeat && + typeof (heartbeat as unknown as { unref?: () => void }).unref === + 'function' + ) { + (heartbeat as unknown as { unref: () => void }).unref(); + } + try { + await this.#startedUp.promise; + } finally { + if (heartbeat !== undefined) { + clearInterval(heartbeat); + } + } + let totalElapsed = ((Date.now() - waitStart) / 1000).toFixed(1); + this.#log.info( + `[ci-readiness-diag] readiness check fulfilled for realm ${this.url} (waited ${totalElapsed}s)`, + ); + // [ci-readiness-diag] — end of removable block. return createResponse({ body: null, @@ -2405,8 +2440,19 @@ export class Realm { async #startup(opts?: { fromScratchIndexPriority?: number }) { await Promise.resolve(); let startTime = Date.now(); + // [ci-readiness-diag] — every `this.#log.info('[ci-readiness-diag]...')` + // line in this method is a temporary phase marker for the CI + // readiness-check timeout flake. Remove all of them together once the + // root cause is identified; the existing control flow is unchanged. + this.#log.info(`[ci-readiness-diag] startup begin for realm ${this.url}`); if (this.#copiedFromRealm) { + this.#log.info( + `[ci-readiness-diag] startup phase=copy from ${this.#copiedFromRealm.href} for realm ${this.url}`, + ); await this.#realmIndexUpdater.copy(this.#copiedFromRealm); + this.#log.info( + `[ci-readiness-diag] startup phase=copy resolved for realm ${this.url} (elapsed=${Date.now() - startTime}ms)`, + ); this.broadcastRealmEvent({ eventName: 'index', indexType: 'copy', @@ -2415,13 +2461,26 @@ export class Realm { }); } else { let isNewIndex = await this.#realmIndexUpdater.isNewIndex(); + this.#log.info( + `[ci-readiness-diag] startup isNewIndex=${isNewIndex} fullIndexOnStartup=${this.#fullIndexOnStartup} for realm ${this.url}`, + ); if (isNewIndex || this.#fullIndexOnStartup) { let priority = opts?.fromScratchIndexPriority ?? this.#fromScratchIndexPriority; let promise = this.#realmIndexUpdater.fullIndex(priority); if (isNewIndex) { // we only await the full indexing at boot if this is a brand new index + this.#log.info( + `[ci-readiness-diag] startup awaiting fullIndex (new index) for realm ${this.url}`, + ); await promise; + this.#log.info( + `[ci-readiness-diag] startup fullIndex resolved for realm ${this.url} (elapsed=${Date.now() - startTime}ms)`, + ); + } else { + this.#log.info( + `[ci-readiness-diag] startup kicked off fire-and-forget fullIndex for realm ${this.url}`, + ); } // not sure how useful this event is--nothing is currently listening for // it, and it may happen during or after the full index... @@ -2432,7 +2491,13 @@ export class Realm { }); } } + this.#log.info( + `[ci-readiness-diag] startup awaiting isWorldReadable for realm ${this.url}`, + ); await this.isWorldReadable(); + this.#log.info( + `[ci-readiness-diag] startup complete for realm ${this.url} in ${Date.now() - startTime}ms`, + ); this.#perfLog.debug( `realm server ${this.url} startup in ${Date.now() - startTime} ms`, From 9ec47d48b9f2f923461bba8a1267cd58a0d2e558 Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Mon, 18 May 2026 18:41:23 -0400 Subject: [PATCH 2/3] Address Copilot review on diagnostic logging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two findings on PR #4873: 1. readinessCheck created a setInterval heartbeat per probe. wait-on polls every 0.5–2s for up to 10 minutes, so a stalled startup would accumulate ~300 concurrent intervals all heartbeating the same Deferred, ~300 log lines every 15s. Moved the heartbeat to #startup so it runs exactly once per realm-lifetime regardless of probe count. readinessCheck keeps only entry + fulfillment logs. 2. publishFullIndex labeled the elapsed time "queue.publish took", but startedAt is captured before getRealmOwnerUsername() and the clearLastModified write inside enqueueReindexRealmJob. Renamed the label to "enqueue path took" so it accurately covers the full path from "is starting indexing" to "durably enqueued". Co-Authored-By: Claude Opus 4.7 (1M context) --- .../runtime-common/realm-index-updater.ts | 8 +- packages/runtime-common/realm.ts | 154 +++++++++--------- 2 files changed, 85 insertions(+), 77 deletions(-) diff --git a/packages/runtime-common/realm-index-updater.ts b/packages/runtime-common/realm-index-updater.ts index 52d8b1de0e1..8f51b7beb09 100644 --- a/packages/runtime-common/realm-index-updater.ts +++ b/packages/runtime-common/realm-index-updater.ts @@ -153,9 +153,13 @@ export class RealmIndexUpdater { clearLastModified: opts?.clearLastModified, }, ); - // [ci-readiness-diag] — see also realm.ts readinessCheck + #startup + // [ci-readiness-diag] — startedAt covers the full enqueue path + // (getRealmOwnerUsername + the optional clearLastModified write + // inside enqueueReindexRealmJob + the queue insert), not just the + // SQL publish. Labeled accordingly. See also realm.ts + // readinessCheck + #startup. this.#log.info( - `[ci-readiness-diag] Realm ${this.realmURL.href} from-scratch job durably enqueued as job_id=${job.id} (queue.publish took ${((performance.now() - startedAt) / 1000).toFixed(2)}s)`, + `[ci-readiness-diag] Realm ${this.realmURL.href} from-scratch job durably enqueued as job_id=${job.id} (enqueue path took ${((performance.now() - startedAt) / 1000).toFixed(2)}s)`, ); return job; })(); diff --git a/packages/runtime-common/realm.ts b/packages/runtime-common/realm.ts index 8eda2d4fa1b..711a84e2015 100644 --- a/packages/runtime-common/realm.ts +++ b/packages/runtime-common/realm.ts @@ -1065,42 +1065,19 @@ export class Realm { _request: Request, requestContext: RequestContext, ) { - // [ci-readiness-diag] — block runs from this line to the matching - // marker below. Remove the entry/heartbeat/exit logs together once - // CI flake is rooted out; the bare `await this.#startedUp.promise` - // is the only behavioral statement. + // [ci-readiness-diag] — entry / fulfillment markers. The per-realm + // startup heartbeat lives in #startup so that wait-on's 0.5–2s + // probe cadence doesn't multiply heartbeat intervals (~300 of them + // accumulating across a 10-minute hang) on a single Deferred. let waitStart = Date.now(); this.#log.info( `[ci-readiness-diag] readiness check requested for realm ${this.url}`, ); - let heartbeat: ReturnType | undefined = setInterval( - () => { - let elapsed = ((Date.now() - waitStart) / 1000).toFixed(1); - this.#log.info( - `[ci-readiness-diag] readiness check still waiting on #startedUp for realm ${this.url} (elapsed=${elapsed}s)`, - ); - }, - 15_000, - ); - if ( - heartbeat && - typeof (heartbeat as unknown as { unref?: () => void }).unref === - 'function' - ) { - (heartbeat as unknown as { unref: () => void }).unref(); - } - try { - await this.#startedUp.promise; - } finally { - if (heartbeat !== undefined) { - clearInterval(heartbeat); - } - } + await this.#startedUp.promise; let totalElapsed = ((Date.now() - waitStart) / 1000).toFixed(1); this.#log.info( `[ci-readiness-diag] readiness check fulfilled for realm ${this.url} (waited ${totalElapsed}s)`, ); - // [ci-readiness-diag] — end of removable block. return createResponse({ body: null, @@ -2445,59 +2422,86 @@ export class Realm { // readiness-check timeout flake. Remove all of them together once the // root cause is identified; the existing control flow is unchanged. this.#log.info(`[ci-readiness-diag] startup begin for realm ${this.url}`); - if (this.#copiedFromRealm) { - this.#log.info( - `[ci-readiness-diag] startup phase=copy from ${this.#copiedFromRealm.href} for realm ${this.url}`, - ); - await this.#realmIndexUpdater.copy(this.#copiedFromRealm); - this.#log.info( - `[ci-readiness-diag] startup phase=copy resolved for realm ${this.url} (elapsed=${Date.now() - startTime}ms)`, - ); - this.broadcastRealmEvent({ - eventName: 'index', - indexType: 'copy', - sourceRealmURL: this.#copiedFromRealm.href, - realmURL: this.url, - }); - } else { - let isNewIndex = await this.#realmIndexUpdater.isNewIndex(); - this.#log.info( - `[ci-readiness-diag] startup isNewIndex=${isNewIndex} fullIndexOnStartup=${this.#fullIndexOnStartup} for realm ${this.url}`, - ); - if (isNewIndex || this.#fullIndexOnStartup) { - let priority = - opts?.fromScratchIndexPriority ?? this.#fromScratchIndexPriority; - let promise = this.#realmIndexUpdater.fullIndex(priority); - if (isNewIndex) { - // we only await the full indexing at boot if this is a brand new index - this.#log.info( - `[ci-readiness-diag] startup awaiting fullIndex (new index) for realm ${this.url}`, - ); - await promise; - this.#log.info( - `[ci-readiness-diag] startup fullIndex resolved for realm ${this.url} (elapsed=${Date.now() - startTime}ms)`, - ); - } else { - this.#log.info( - `[ci-readiness-diag] startup kicked off fire-and-forget fullIndex for realm ${this.url}`, - ); - } - // not sure how useful this event is--nothing is currently listening for - // it, and it may happen during or after the full index... + // [ci-readiness-diag] — one heartbeat per realm-lifetime, not per + // readiness-probe (wait-on polls every 0.5–2s; a per-request + // heartbeat would multiply to hundreds of concurrent intervals). + // Cleared in the finally below so a startup that fails or throws + // doesn't leak the timer past the lifetime of this method. + let heartbeat: ReturnType | undefined = setInterval( + () => { + let elapsed = ((Date.now() - startTime) / 1000).toFixed(1); + this.#log.info( + `[ci-readiness-diag] startup still pending for realm ${this.url} (elapsed=${elapsed}s)`, + ); + }, + 15_000, + ); + if ( + heartbeat && + typeof (heartbeat as unknown as { unref?: () => void }).unref === + 'function' + ) { + (heartbeat as unknown as { unref: () => void }).unref(); + } + try { + if (this.#copiedFromRealm) { + this.#log.info( + `[ci-readiness-diag] startup phase=copy from ${this.#copiedFromRealm.href} for realm ${this.url}`, + ); + await this.#realmIndexUpdater.copy(this.#copiedFromRealm); + this.#log.info( + `[ci-readiness-diag] startup phase=copy resolved for realm ${this.url} (elapsed=${Date.now() - startTime}ms)`, + ); this.broadcastRealmEvent({ eventName: 'index', - indexType: 'full', + indexType: 'copy', + sourceRealmURL: this.#copiedFromRealm.href, realmURL: this.url, }); + } else { + let isNewIndex = await this.#realmIndexUpdater.isNewIndex(); + this.#log.info( + `[ci-readiness-diag] startup isNewIndex=${isNewIndex} fullIndexOnStartup=${this.#fullIndexOnStartup} for realm ${this.url}`, + ); + if (isNewIndex || this.#fullIndexOnStartup) { + let priority = + opts?.fromScratchIndexPriority ?? this.#fromScratchIndexPriority; + let promise = this.#realmIndexUpdater.fullIndex(priority); + if (isNewIndex) { + // we only await the full indexing at boot if this is a brand new index + this.#log.info( + `[ci-readiness-diag] startup awaiting fullIndex (new index) for realm ${this.url}`, + ); + await promise; + this.#log.info( + `[ci-readiness-diag] startup fullIndex resolved for realm ${this.url} (elapsed=${Date.now() - startTime}ms)`, + ); + } else { + this.#log.info( + `[ci-readiness-diag] startup kicked off fire-and-forget fullIndex for realm ${this.url}`, + ); + } + // not sure how useful this event is--nothing is currently listening for + // it, and it may happen during or after the full index... + this.broadcastRealmEvent({ + eventName: 'index', + indexType: 'full', + realmURL: this.url, + }); + } + } + this.#log.info( + `[ci-readiness-diag] startup awaiting isWorldReadable for realm ${this.url}`, + ); + await this.isWorldReadable(); + this.#log.info( + `[ci-readiness-diag] startup complete for realm ${this.url} in ${Date.now() - startTime}ms`, + ); + } finally { + if (heartbeat !== undefined) { + clearInterval(heartbeat); } } - this.#log.info( - `[ci-readiness-diag] startup awaiting isWorldReadable for realm ${this.url}`, - ); - await this.isWorldReadable(); - this.#log.info( - `[ci-readiness-diag] startup complete for realm ${this.url} in ${Date.now() - startTime}ms`, - ); this.#perfLog.debug( `realm server ${this.url} startup in ${Date.now() - startTime} ms`, From 43f127ec2dd7f3a34534a14f79b230ad3ab077f9 Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Mon, 18 May 2026 18:51:40 -0400 Subject: [PATCH 3/3] Move diagnostics behind opt-in `readiness-diag` debug logger MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per review: the diagnostic logs shouldn't be `info`-level with `[ci-readiness-diag]` baked into each message — they should be `debug` on a common logger category that's opt-in via `LOG_LEVELS`. That makes them invisible in production by default, and the cleanup story is "drop `readiness-diag=debug` from `HOST_TEST_LOG_LEVELS`" with the code-removal as a follow-up. - New named logger `readiness-diag` on each of `Realm`, `RealmIndexUpdater`, and `IndexRunner` (as `#readinessDiag`). - Every prior `this.#log.info('[ci-readiness-diag] ...')` call site rewritten as `this.#readinessDiag.debug('...')`. The string prefix drops out — the logger name does that job. - `HOST_TEST_LOG_LEVELS` in `mise-tasks/test-services/host` now includes `readiness-diag=debug`. Production realm-server keeps `*=info` so the new logger stays silent there. - Renamed in-code marker comments from `[ci-readiness-diag]` to `[readiness-diag]` for consistency with the logger name. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/ci-host.yaml | 4 +- mise-tasks/test-services/host | 2 +- packages/runtime-common/index-runner.ts | 12 ++-- .../runtime-common/realm-index-updater.ts | 8 ++- packages/runtime-common/realm.ts | 64 ++++++++++--------- 5 files changed, 50 insertions(+), 40 deletions(-) diff --git a/.github/workflows/ci-host.yaml b/.github/workflows/ci-host.yaml index 78b714febcb..889fc8e1d98 100644 --- a/.github/workflows/ci-host.yaml +++ b/.github/workflows/ci-host.yaml @@ -251,13 +251,13 @@ jobs: HOST_TEST_PARTITION: ${{ matrix.shardIndex }} HOST_TEST_PARTITION_COUNT: ${{ matrix.shardTotal }} DBUS_SYSTEM_BUS_ADDRESS: unix:path=/run/dbus/system_bus_socket - # [ci-readiness-diag] — start-server-and-test reads DEBUG and + # [readiness-diag] — start-server-and-test reads DEBUG and # propagates `log: true` + `verbose: true` to wait-on, so each # readiness-probe attempt prints its HTTP status / connection # error. Distinguishes "server not listening" from "server # hanging mid-request" from "server returning 500" when the # 10-minute wait-on timer fires. Remove together with the - # other [ci-readiness-diag] markers once the root cause is + # other [readiness-diag] markers once the root cause is # identified. DEBUG: start-server-and-test working-directory: packages/host diff --git a/mise-tasks/test-services/host b/mise-tasks/test-services/host index 0a7cddd8ee1..ba4137c9d80 100755 --- a/mise-tasks/test-services/host +++ b/mise-tasks/test-services/host @@ -22,7 +22,7 @@ esac BASE_REALM_READY="${REALM_READY_SCHEME}://${REALM_BASE_URL#*://}/base/${READY_PATH}" NODE_TEST_REALM_READY="${REALM_TEST_READY_SCHEME}://${REALM_TEST_URL#*://}/node-test/${READY_PATH}" -HOST_TEST_LOG_LEVELS="${HOST_TEST_LOG_LEVELS:-*=info,realm:requests=warn,realm-index-updater=debug,index-runner=debug,index-perf=debug,index-writer=debug,worker=debug,worker-manager=debug}" +HOST_TEST_LOG_LEVELS="${HOST_TEST_LOG_LEVELS:-*=info,realm:requests=warn,realm-index-updater=debug,index-runner=debug,index-perf=debug,index-writer=debug,worker=debug,worker-manager=debug,readiness-diag=debug}" SKIP_CATALOG="${SKIP_CATALOG:-}" # START_SERVER_AND_TEST_INSECURE=1 scopes to the wait-on readiness probe. diff --git a/packages/runtime-common/index-runner.ts b/packages/runtime-common/index-runner.ts index 5b0adaae70b..4ca0edf81d2 100644 --- a/packages/runtime-common/index-runner.ts +++ b/packages/runtime-common/index-runner.ts @@ -53,6 +53,8 @@ export class IndexRunner { #log = logger('index-runner'); #fetch: typeof globalThis.fetch; #perfLog = logger('index-perf'); + // [readiness-diag] — opt-in CI flake diagnostics. Remove with call site. + #readinessDiag = logger('readiness-diag'); #realmPaths: RealmPaths; #ignoreData: Record; #prerenderer: Prerenderer; @@ -216,14 +218,14 @@ export class IndexRunner { }); try { let filesCompleted = 0; - // [ci-readiness-diag] — visit-loop heartbeat. Today the only + // [readiness-diag] — visit-loop heartbeat. Today the only // markers between `completed invalidations in Xms` and // `completed index visit in Yms` are per-file onProgress events // (callback-routed, not in the worker log). A stuck visit looks // like a 10-minute log gap. Emit a progress line every 25 files // so the realm-server/worker artifact pinpoints WHERE the loop - // parked. Remove together with the other [ci-readiness-diag] - // blocks once the root cause is identified. + // parked. Opt-in via the `readiness-diag` logger. Remove with + // the other [readiness-diag] blocks once root cause is found. const VISIT_PROGRESS_INTERVAL = 25; for (let invalidation of invalidations) { // Resume guard. If a previous attempt of this same job already @@ -265,8 +267,8 @@ export class IndexRunner { filesCompleted % VISIT_PROGRESS_INTERVAL === 0 || filesCompleted === invalidations.length ) { - current.#perfLog.info( - `[ci-readiness-diag] ${jobIdentity(current.#jobInfo)} visit progress ${filesCompleted}/${invalidations.length} for realm ${current.realmURL.href} (elapsedMs=${Date.now() - visitStart}, last=${invalidation.href})`, + current.#readinessDiag.debug( + `${jobIdentity(current.#jobInfo)} visit progress ${filesCompleted}/${invalidations.length} for realm ${current.realmURL.href} (elapsedMs=${Date.now() - visitStart}, last=${invalidation.href})`, ); } } diff --git a/packages/runtime-common/realm-index-updater.ts b/packages/runtime-common/realm-index-updater.ts index 8f51b7beb09..c0726838ab8 100644 --- a/packages/runtime-common/realm-index-updater.ts +++ b/packages/runtime-common/realm-index-updater.ts @@ -27,6 +27,8 @@ import ignore, { type Ignore } from 'ignore'; export class RealmIndexUpdater { #realm: Realm; #log = logger('realm-index-updater'); + // [readiness-diag] — opt-in CI flake diagnostics. Remove with call site. + #readinessDiag = logger('readiness-diag'); #ignoreData: Record = {}; // Bumped every time a from-scratch result writes #ignoreData. Concurrent // incrementals capture this version when they snapshot #ignoreData; if a @@ -153,13 +155,13 @@ export class RealmIndexUpdater { clearLastModified: opts?.clearLastModified, }, ); - // [ci-readiness-diag] — startedAt covers the full enqueue path + // [readiness-diag] — startedAt covers the full enqueue path // (getRealmOwnerUsername + the optional clearLastModified write // inside enqueueReindexRealmJob + the queue insert), not just the // SQL publish. Labeled accordingly. See also realm.ts // readinessCheck + #startup. - this.#log.info( - `[ci-readiness-diag] Realm ${this.realmURL.href} from-scratch job durably enqueued as job_id=${job.id} (enqueue path took ${((performance.now() - startedAt) / 1000).toFixed(2)}s)`, + this.#readinessDiag.debug( + `Realm ${this.realmURL.href} from-scratch job durably enqueued as job_id=${job.id} (enqueue path took ${((performance.now() - startedAt) / 1000).toFixed(2)}s)`, ); return job; })(); diff --git a/packages/runtime-common/realm.ts b/packages/runtime-common/realm.ts index 711a84e2015..3f4b008b5b3 100644 --- a/packages/runtime-common/realm.ts +++ b/packages/runtime-common/realm.ts @@ -685,6 +685,11 @@ export class Realm { #router: Router; #log = logger('realm'); #perfLog = logger('perf'); + // [readiness-diag] — opt-in logger, off in production, enabled in CI + // host tests via `readiness-diag=debug` in HOST_TEST_LOG_LEVELS. + // Remove this field together with the [readiness-diag] call sites + // once the CI readiness-check timeout flake is rooted out. + #readinessDiag = logger('readiness-diag'); #updateItems: UpdateItem[] = []; #flushUpdateEvents: Promise | undefined; #recentWrites: Map = new Map(); @@ -1065,18 +1070,18 @@ export class Realm { _request: Request, requestContext: RequestContext, ) { - // [ci-readiness-diag] — entry / fulfillment markers. The per-realm + // [readiness-diag] — entry / fulfillment markers. The per-realm // startup heartbeat lives in #startup so that wait-on's 0.5–2s // probe cadence doesn't multiply heartbeat intervals (~300 of them // accumulating across a 10-minute hang) on a single Deferred. let waitStart = Date.now(); - this.#log.info( - `[ci-readiness-diag] readiness check requested for realm ${this.url}`, + this.#readinessDiag.debug( + `readiness check requested for realm ${this.url}`, ); await this.#startedUp.promise; let totalElapsed = ((Date.now() - waitStart) / 1000).toFixed(1); - this.#log.info( - `[ci-readiness-diag] readiness check fulfilled for realm ${this.url} (waited ${totalElapsed}s)`, + this.#readinessDiag.debug( + `readiness check fulfilled for realm ${this.url} (waited ${totalElapsed}s)`, ); return createResponse({ @@ -2417,12 +2422,13 @@ export class Realm { async #startup(opts?: { fromScratchIndexPriority?: number }) { await Promise.resolve(); let startTime = Date.now(); - // [ci-readiness-diag] — every `this.#log.info('[ci-readiness-diag]...')` - // line in this method is a temporary phase marker for the CI - // readiness-check timeout flake. Remove all of them together once the - // root cause is identified; the existing control flow is unchanged. - this.#log.info(`[ci-readiness-diag] startup begin for realm ${this.url}`); - // [ci-readiness-diag] — one heartbeat per realm-lifetime, not per + // [readiness-diag] — every `this.#readinessDiag.debug(...)` line in + // this method is a temporary phase marker for the CI readiness-check + // timeout flake. The whole logger is opt-in (off in production, on + // in CI host tests via HOST_TEST_LOG_LEVELS). Remove the call sites + // together with the `#readinessDiag` field once root cause is found. + this.#readinessDiag.debug(`startup begin for realm ${this.url}`); + // [readiness-diag] — one heartbeat per realm-lifetime, not per // readiness-probe (wait-on polls every 0.5–2s; a per-request // heartbeat would multiply to hundreds of concurrent intervals). // Cleared in the finally below so a startup that fails or throws @@ -2430,8 +2436,8 @@ export class Realm { let heartbeat: ReturnType | undefined = setInterval( () => { let elapsed = ((Date.now() - startTime) / 1000).toFixed(1); - this.#log.info( - `[ci-readiness-diag] startup still pending for realm ${this.url} (elapsed=${elapsed}s)`, + this.#readinessDiag.debug( + `startup still pending for realm ${this.url} (elapsed=${elapsed}s)`, ); }, 15_000, @@ -2445,12 +2451,12 @@ export class Realm { } try { if (this.#copiedFromRealm) { - this.#log.info( - `[ci-readiness-diag] startup phase=copy from ${this.#copiedFromRealm.href} for realm ${this.url}`, + this.#readinessDiag.debug( + `startup phase=copy from ${this.#copiedFromRealm.href} for realm ${this.url}`, ); await this.#realmIndexUpdater.copy(this.#copiedFromRealm); - this.#log.info( - `[ci-readiness-diag] startup phase=copy resolved for realm ${this.url} (elapsed=${Date.now() - startTime}ms)`, + this.#readinessDiag.debug( + `startup phase=copy resolved for realm ${this.url} (elapsed=${Date.now() - startTime}ms)`, ); this.broadcastRealmEvent({ eventName: 'index', @@ -2460,8 +2466,8 @@ export class Realm { }); } else { let isNewIndex = await this.#realmIndexUpdater.isNewIndex(); - this.#log.info( - `[ci-readiness-diag] startup isNewIndex=${isNewIndex} fullIndexOnStartup=${this.#fullIndexOnStartup} for realm ${this.url}`, + this.#readinessDiag.debug( + `startup isNewIndex=${isNewIndex} fullIndexOnStartup=${this.#fullIndexOnStartup} for realm ${this.url}`, ); if (isNewIndex || this.#fullIndexOnStartup) { let priority = @@ -2469,16 +2475,16 @@ export class Realm { let promise = this.#realmIndexUpdater.fullIndex(priority); if (isNewIndex) { // we only await the full indexing at boot if this is a brand new index - this.#log.info( - `[ci-readiness-diag] startup awaiting fullIndex (new index) for realm ${this.url}`, + this.#readinessDiag.debug( + `startup awaiting fullIndex (new index) for realm ${this.url}`, ); await promise; - this.#log.info( - `[ci-readiness-diag] startup fullIndex resolved for realm ${this.url} (elapsed=${Date.now() - startTime}ms)`, + this.#readinessDiag.debug( + `startup fullIndex resolved for realm ${this.url} (elapsed=${Date.now() - startTime}ms)`, ); } else { - this.#log.info( - `[ci-readiness-diag] startup kicked off fire-and-forget fullIndex for realm ${this.url}`, + this.#readinessDiag.debug( + `startup kicked off fire-and-forget fullIndex for realm ${this.url}`, ); } // not sure how useful this event is--nothing is currently listening for @@ -2490,12 +2496,12 @@ export class Realm { }); } } - this.#log.info( - `[ci-readiness-diag] startup awaiting isWorldReadable for realm ${this.url}`, + this.#readinessDiag.debug( + `startup awaiting isWorldReadable for realm ${this.url}`, ); await this.isWorldReadable(); - this.#log.info( - `[ci-readiness-diag] startup complete for realm ${this.url} in ${Date.now() - startTime}ms`, + this.#readinessDiag.debug( + `startup complete for realm ${this.url} in ${Date.now() - startTime}ms`, ); } finally { if (heartbeat !== undefined) {