diff --git a/.github/workflows/ci-host.yaml b/.github/workflows/ci-host.yaml index ba2e54c3e0..889fc8e1d9 100644 --- a/.github/workflows/ci-host.yaml +++ b/.github/workflows/ci-host.yaml @@ -251,6 +251,15 @@ jobs: HOST_TEST_PARTITION: ${{ matrix.shardIndex }} HOST_TEST_PARTITION_COUNT: ${{ matrix.shardTotal }} DBUS_SYSTEM_BUS_ADDRESS: unix:path=/run/dbus/system_bus_socket + # [readiness-diag] — start-server-and-test reads DEBUG and + # propagates `log: true` + `verbose: true` to wait-on, so each + # readiness-probe attempt prints its HTTP status / connection + # error. Distinguishes "server not listening" from "server + # hanging mid-request" from "server returning 500" when the + # 10-minute wait-on timer fires. Remove together with the + # other [readiness-diag] markers once the root cause is + # identified. + DEBUG: start-server-and-test working-directory: packages/host - name: Upload junit report to GitHub Actions Artifacts diff --git a/mise-tasks/test-services/host b/mise-tasks/test-services/host index 0a7cddd8ee..ba4137c9d8 100755 --- a/mise-tasks/test-services/host +++ b/mise-tasks/test-services/host @@ -22,7 +22,7 @@ esac BASE_REALM_READY="${REALM_READY_SCHEME}://${REALM_BASE_URL#*://}/base/${READY_PATH}" NODE_TEST_REALM_READY="${REALM_TEST_READY_SCHEME}://${REALM_TEST_URL#*://}/node-test/${READY_PATH}" -HOST_TEST_LOG_LEVELS="${HOST_TEST_LOG_LEVELS:-*=info,realm:requests=warn,realm-index-updater=debug,index-runner=debug,index-perf=debug,index-writer=debug,worker=debug,worker-manager=debug}" +HOST_TEST_LOG_LEVELS="${HOST_TEST_LOG_LEVELS:-*=info,realm:requests=warn,realm-index-updater=debug,index-runner=debug,index-perf=debug,index-writer=debug,worker=debug,worker-manager=debug,readiness-diag=debug}" SKIP_CATALOG="${SKIP_CATALOG:-}" # START_SERVER_AND_TEST_INSECURE=1 scopes to the wait-on readiness probe. diff --git a/packages/runtime-common/index-runner.ts b/packages/runtime-common/index-runner.ts index 06741f00da..4ca0edf81d 100644 --- a/packages/runtime-common/index-runner.ts +++ b/packages/runtime-common/index-runner.ts @@ -53,6 +53,8 @@ export class IndexRunner { #log = logger('index-runner'); #fetch: typeof globalThis.fetch; #perfLog = logger('index-perf'); + // [readiness-diag] — opt-in CI flake diagnostics. Remove with call site. + #readinessDiag = logger('readiness-diag'); #realmPaths: RealmPaths; #ignoreData: Record; #prerenderer: Prerenderer; @@ -216,6 +218,15 @@ export class IndexRunner { }); try { let filesCompleted = 0; + // [readiness-diag] — visit-loop heartbeat. Today the only + // markers between `completed invalidations in Xms` and + // `completed index visit in Yms` are per-file onProgress events + // (callback-routed, not in the worker log). A stuck visit looks + // like a 10-minute log gap. Emit a progress line every 25 files + // so the realm-server/worker artifact pinpoints WHERE the loop + // parked. Opt-in via the `readiness-diag` logger. Remove with + // the other [readiness-diag] blocks once root cause is found. + const VISIT_PROGRESS_INTERVAL = 25; for (let invalidation of invalidations) { // Resume guard. If a previous attempt of this same job already // wrote URL_X to the working table AND the EFS mtime hasn't @@ -252,6 +263,14 @@ export class IndexRunner { filesCompleted, totalFiles: invalidations.length, }); + if ( + filesCompleted % VISIT_PROGRESS_INTERVAL === 0 || + filesCompleted === invalidations.length + ) { + current.#readinessDiag.debug( + `${jobIdentity(current.#jobInfo)} visit progress ${filesCompleted}/${invalidations.length} for realm ${current.realmURL.href} (elapsedMs=${Date.now() - visitStart}, last=${invalidation.href})`, + ); + } } if (resumedSkipped > 0) { current.#perfLog.debug( diff --git a/packages/runtime-common/realm-index-updater.ts b/packages/runtime-common/realm-index-updater.ts index d9b74b2cab..c0726838ab 100644 --- a/packages/runtime-common/realm-index-updater.ts +++ b/packages/runtime-common/realm-index-updater.ts @@ -27,6 +27,8 @@ import ignore, { type Ignore } from 'ignore'; export class RealmIndexUpdater { #realm: Realm; #log = logger('realm-index-updater'); + // [readiness-diag] — opt-in CI flake diagnostics. Remove with call site. + #readinessDiag = logger('readiness-diag'); #ignoreData: Record = {}; // Bumped every time a from-scratch result writes #ignoreData. Concurrent // incrementals capture this version when they snapshot #ignoreData; if a @@ -142,8 +144,8 @@ export class RealmIndexUpdater { let startedAt = performance.now(); this.#log.info(`Realm ${this.realmURL.href} is starting indexing`); - let published = (async () => - await enqueueReindexRealmJob( + let published = (async () => { + let job = await enqueueReindexRealmJob( this.#realm.url, await this.#realm.getRealmOwnerUsername(), this.#queue, @@ -152,7 +154,17 @@ export class RealmIndexUpdater { { clearLastModified: opts?.clearLastModified, }, - ))(); + ); + // [readiness-diag] — startedAt covers the full enqueue path + // (getRealmOwnerUsername + the optional clearLastModified write + // inside enqueueReindexRealmJob + the queue insert), not just the + // SQL publish. Labeled accordingly. See also realm.ts + // readinessCheck + #startup. + this.#readinessDiag.debug( + `Realm ${this.realmURL.href} from-scratch job durably enqueued as job_id=${job.id} (enqueue path took ${((performance.now() - startedAt) / 1000).toFixed(2)}s)`, + ); + return job; + })(); let completed = published .then(async (job) => { diff --git a/packages/runtime-common/realm.ts b/packages/runtime-common/realm.ts index 5f5f4a307e..3f4b008b5b 100644 --- a/packages/runtime-common/realm.ts +++ b/packages/runtime-common/realm.ts @@ -685,6 +685,11 @@ export class Realm { #router: Router; #log = logger('realm'); #perfLog = logger('perf'); + // [readiness-diag] — opt-in logger, off in production, enabled in CI + // host tests via `readiness-diag=debug` in HOST_TEST_LOG_LEVELS. + // Remove this field together with the [readiness-diag] call sites + // once the CI readiness-check timeout flake is rooted out. + #readinessDiag = logger('readiness-diag'); #updateItems: UpdateItem[] = []; #flushUpdateEvents: Promise | undefined; #recentWrites: Map = new Map(); @@ -1065,7 +1070,19 @@ export class Realm { _request: Request, requestContext: RequestContext, ) { + // [readiness-diag] — entry / fulfillment markers. The per-realm + // startup heartbeat lives in #startup so that wait-on's 0.5–2s + // probe cadence doesn't multiply heartbeat intervals (~300 of them + // accumulating across a 10-minute hang) on a single Deferred. + let waitStart = Date.now(); + this.#readinessDiag.debug( + `readiness check requested for realm ${this.url}`, + ); await this.#startedUp.promise; + let totalElapsed = ((Date.now() - waitStart) / 1000).toFixed(1); + this.#readinessDiag.debug( + `readiness check fulfilled for realm ${this.url} (waited ${totalElapsed}s)`, + ); return createResponse({ body: null, @@ -2405,34 +2422,92 @@ export class Realm { async #startup(opts?: { fromScratchIndexPriority?: number }) { await Promise.resolve(); let startTime = Date.now(); - if (this.#copiedFromRealm) { - await this.#realmIndexUpdater.copy(this.#copiedFromRealm); - this.broadcastRealmEvent({ - eventName: 'index', - indexType: 'copy', - sourceRealmURL: this.#copiedFromRealm.href, - realmURL: this.url, - }); - } else { - let isNewIndex = await this.#realmIndexUpdater.isNewIndex(); - if (isNewIndex || this.#fullIndexOnStartup) { - let priority = - opts?.fromScratchIndexPriority ?? this.#fromScratchIndexPriority; - let promise = this.#realmIndexUpdater.fullIndex(priority); - if (isNewIndex) { - // we only await the full indexing at boot if this is a brand new index - await promise; - } - // not sure how useful this event is--nothing is currently listening for - // it, and it may happen during or after the full index... + // [readiness-diag] — every `this.#readinessDiag.debug(...)` line in + // this method is a temporary phase marker for the CI readiness-check + // timeout flake. The whole logger is opt-in (off in production, on + // in CI host tests via HOST_TEST_LOG_LEVELS). Remove the call sites + // together with the `#readinessDiag` field once root cause is found. + this.#readinessDiag.debug(`startup begin for realm ${this.url}`); + // [readiness-diag] — one heartbeat per realm-lifetime, not per + // readiness-probe (wait-on polls every 0.5–2s; a per-request + // heartbeat would multiply to hundreds of concurrent intervals). + // Cleared in the finally below so a startup that fails or throws + // doesn't leak the timer past the lifetime of this method. + let heartbeat: ReturnType | undefined = setInterval( + () => { + let elapsed = ((Date.now() - startTime) / 1000).toFixed(1); + this.#readinessDiag.debug( + `startup still pending for realm ${this.url} (elapsed=${elapsed}s)`, + ); + }, + 15_000, + ); + if ( + heartbeat && + typeof (heartbeat as unknown as { unref?: () => void }).unref === + 'function' + ) { + (heartbeat as unknown as { unref: () => void }).unref(); + } + try { + if (this.#copiedFromRealm) { + this.#readinessDiag.debug( + `startup phase=copy from ${this.#copiedFromRealm.href} for realm ${this.url}`, + ); + await this.#realmIndexUpdater.copy(this.#copiedFromRealm); + this.#readinessDiag.debug( + `startup phase=copy resolved for realm ${this.url} (elapsed=${Date.now() - startTime}ms)`, + ); this.broadcastRealmEvent({ eventName: 'index', - indexType: 'full', + indexType: 'copy', + sourceRealmURL: this.#copiedFromRealm.href, realmURL: this.url, }); + } else { + let isNewIndex = await this.#realmIndexUpdater.isNewIndex(); + this.#readinessDiag.debug( + `startup isNewIndex=${isNewIndex} fullIndexOnStartup=${this.#fullIndexOnStartup} for realm ${this.url}`, + ); + if (isNewIndex || this.#fullIndexOnStartup) { + let priority = + opts?.fromScratchIndexPriority ?? this.#fromScratchIndexPriority; + let promise = this.#realmIndexUpdater.fullIndex(priority); + if (isNewIndex) { + // we only await the full indexing at boot if this is a brand new index + this.#readinessDiag.debug( + `startup awaiting fullIndex (new index) for realm ${this.url}`, + ); + await promise; + this.#readinessDiag.debug( + `startup fullIndex resolved for realm ${this.url} (elapsed=${Date.now() - startTime}ms)`, + ); + } else { + this.#readinessDiag.debug( + `startup kicked off fire-and-forget fullIndex for realm ${this.url}`, + ); + } + // not sure how useful this event is--nothing is currently listening for + // it, and it may happen during or after the full index... + this.broadcastRealmEvent({ + eventName: 'index', + indexType: 'full', + realmURL: this.url, + }); + } + } + this.#readinessDiag.debug( + `startup awaiting isWorldReadable for realm ${this.url}`, + ); + await this.isWorldReadable(); + this.#readinessDiag.debug( + `startup complete for realm ${this.url} in ${Date.now() - startTime}ms`, + ); + } finally { + if (heartbeat !== undefined) { + clearInterval(heartbeat); } } - await this.isWorldReadable(); this.#perfLog.debug( `realm server ${this.url} startup in ${Date.now() - startTime} ms`,