Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .github/workflows/ci-host.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,15 @@ jobs:
HOST_TEST_PARTITION: ${{ matrix.shardIndex }}
HOST_TEST_PARTITION_COUNT: ${{ matrix.shardTotal }}
DBUS_SYSTEM_BUS_ADDRESS: unix:path=/run/dbus/system_bus_socket
# [readiness-diag] — start-server-and-test reads DEBUG and
# propagates `log: true` + `verbose: true` to wait-on, so each
# readiness-probe attempt prints its HTTP status / connection
# error. Distinguishes "server not listening" from "server
# hanging mid-request" from "server returning 500" when the
# 10-minute wait-on timer fires. Remove together with the
# other [readiness-diag] markers once the root cause is
# identified.
DEBUG: start-server-and-test
working-directory: packages/host

- name: Upload junit report to GitHub Actions Artifacts
Expand Down
2 changes: 1 addition & 1 deletion mise-tasks/test-services/host
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ esac
BASE_REALM_READY="${REALM_READY_SCHEME}://${REALM_BASE_URL#*://}/base/${READY_PATH}"
NODE_TEST_REALM_READY="${REALM_TEST_READY_SCHEME}://${REALM_TEST_URL#*://}/node-test/${READY_PATH}"

HOST_TEST_LOG_LEVELS="${HOST_TEST_LOG_LEVELS:-*=info,realm:requests=warn,realm-index-updater=debug,index-runner=debug,index-perf=debug,index-writer=debug,worker=debug,worker-manager=debug}"
HOST_TEST_LOG_LEVELS="${HOST_TEST_LOG_LEVELS:-*=info,realm:requests=warn,realm-index-updater=debug,index-runner=debug,index-perf=debug,index-writer=debug,worker=debug,worker-manager=debug,readiness-diag=debug}"
SKIP_CATALOG="${SKIP_CATALOG:-}"

# START_SERVER_AND_TEST_INSECURE=1 scopes to the wait-on readiness probe.
Expand Down
19 changes: 19 additions & 0 deletions packages/runtime-common/index-runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ export class IndexRunner {
#log = logger('index-runner');
#fetch: typeof globalThis.fetch;
#perfLog = logger('index-perf');
// [readiness-diag] — opt-in CI flake diagnostics. Remove with call site.
#readinessDiag = logger('readiness-diag');
#realmPaths: RealmPaths;
#ignoreData: Record<string, string>;
#prerenderer: Prerenderer;
Expand Down Expand Up @@ -216,6 +218,15 @@ export class IndexRunner {
});
try {
let filesCompleted = 0;
// [readiness-diag] — visit-loop heartbeat. Today the only
// markers between `completed invalidations in Xms` and
// `completed index visit in Yms` are per-file onProgress events
// (callback-routed, not in the worker log). A stuck visit looks
// like a 10-minute log gap. Emit a progress line every 25 files
// so the realm-server/worker artifact pinpoints WHERE the loop
// parked. Opt-in via the `readiness-diag` logger. Remove with
// the other [readiness-diag] blocks once root cause is found.
const VISIT_PROGRESS_INTERVAL = 25;
for (let invalidation of invalidations) {
// Resume guard. If a previous attempt of this same job already
// wrote URL_X to the working table AND the EFS mtime hasn't
Expand Down Expand Up @@ -252,6 +263,14 @@ export class IndexRunner {
filesCompleted,
totalFiles: invalidations.length,
});
if (
filesCompleted % VISIT_PROGRESS_INTERVAL === 0 ||
filesCompleted === invalidations.length
) {
current.#readinessDiag.debug(
`${jobIdentity(current.#jobInfo)} visit progress ${filesCompleted}/${invalidations.length} for realm ${current.realmURL.href} (elapsedMs=${Date.now() - visitStart}, last=${invalidation.href})`,
);
}
}
if (resumedSkipped > 0) {
current.#perfLog.debug(
Expand Down
18 changes: 15 additions & 3 deletions packages/runtime-common/realm-index-updater.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ import ignore, { type Ignore } from 'ignore';
export class RealmIndexUpdater {
#realm: Realm;
#log = logger('realm-index-updater');
// [readiness-diag] — opt-in CI flake diagnostics. Remove with call site.
#readinessDiag = logger('readiness-diag');
#ignoreData: Record<string, string> = {};
// Bumped every time a from-scratch result writes #ignoreData. Concurrent
// incrementals capture this version when they snapshot #ignoreData; if a
Expand Down Expand Up @@ -142,8 +144,8 @@ export class RealmIndexUpdater {
let startedAt = performance.now();

this.#log.info(`Realm ${this.realmURL.href} is starting indexing`);
let published = (async () =>
await enqueueReindexRealmJob(
let published = (async () => {
let job = await enqueueReindexRealmJob(
this.#realm.url,
await this.#realm.getRealmOwnerUsername(),
this.#queue,
Expand All @@ -152,7 +154,17 @@ export class RealmIndexUpdater {
{
clearLastModified: opts?.clearLastModified,
},
))();
);
// [readiness-diag] — startedAt covers the full enqueue path
// (getRealmOwnerUsername + the optional clearLastModified write
// inside enqueueReindexRealmJob + the queue insert), not just the
// SQL publish. Labeled accordingly. See also realm.ts
// readinessCheck + #startup.
this.#readinessDiag.debug(
`Realm ${this.realmURL.href} from-scratch job durably enqueued as job_id=${job.id} (enqueue path took ${((performance.now() - startedAt) / 1000).toFixed(2)}s)`,
);
return job;
})();

let completed = published
.then(async (job) => {
Expand Down
119 changes: 97 additions & 22 deletions packages/runtime-common/realm.ts
Original file line number Diff line number Diff line change
Expand Up @@ -685,6 +685,11 @@ export class Realm {
#router: Router;
#log = logger('realm');
#perfLog = logger('perf');
// [readiness-diag] — opt-in logger, off in production, enabled in CI
// host tests via `readiness-diag=debug` in HOST_TEST_LOG_LEVELS.
// Remove this field together with the [readiness-diag] call sites
// once the CI readiness-check timeout flake is rooted out.
#readinessDiag = logger('readiness-diag');
#updateItems: UpdateItem[] = [];
#flushUpdateEvents: Promise<void> | undefined;
#recentWrites: Map<string, number> = new Map();
Expand Down Expand Up @@ -1065,7 +1070,19 @@ export class Realm {
_request: Request,
requestContext: RequestContext,
) {
// [readiness-diag] — entry / fulfillment markers. The per-realm
// startup heartbeat lives in #startup so that wait-on's 0.5–2s
// probe cadence doesn't multiply heartbeat intervals (~300 of them
// accumulating across a 10-minute hang) on a single Deferred.
let waitStart = Date.now();
this.#readinessDiag.debug(
`readiness check requested for realm ${this.url}`,
);
await this.#startedUp.promise;
let totalElapsed = ((Date.now() - waitStart) / 1000).toFixed(1);
this.#readinessDiag.debug(
`readiness check fulfilled for realm ${this.url} (waited ${totalElapsed}s)`,
);

return createResponse({
body: null,
Expand Down Expand Up @@ -2405,34 +2422,92 @@ export class Realm {
async #startup(opts?: { fromScratchIndexPriority?: number }) {
await Promise.resolve();
let startTime = Date.now();
if (this.#copiedFromRealm) {
await this.#realmIndexUpdater.copy(this.#copiedFromRealm);
this.broadcastRealmEvent({
eventName: 'index',
indexType: 'copy',
sourceRealmURL: this.#copiedFromRealm.href,
realmURL: this.url,
});
} else {
let isNewIndex = await this.#realmIndexUpdater.isNewIndex();
if (isNewIndex || this.#fullIndexOnStartup) {
let priority =
opts?.fromScratchIndexPriority ?? this.#fromScratchIndexPriority;
let promise = this.#realmIndexUpdater.fullIndex(priority);
if (isNewIndex) {
// we only await the full indexing at boot if this is a brand new index
await promise;
}
// not sure how useful this event is--nothing is currently listening for
// it, and it may happen during or after the full index...
// [readiness-diag] — every `this.#readinessDiag.debug(...)` line in
// this method is a temporary phase marker for the CI readiness-check
// timeout flake. The whole logger is opt-in (off in production, on
// in CI host tests via HOST_TEST_LOG_LEVELS). Remove the call sites
// together with the `#readinessDiag` field once root cause is found.
this.#readinessDiag.debug(`startup begin for realm ${this.url}`);
// [readiness-diag] — one heartbeat per realm-lifetime, not per
// readiness-probe (wait-on polls every 0.5–2s; a per-request
// heartbeat would multiply to hundreds of concurrent intervals).
// Cleared in the finally below so a startup that fails or throws
// doesn't leak the timer past the lifetime of this method.
let heartbeat: ReturnType<typeof setInterval> | undefined = setInterval(
() => {
let elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
this.#readinessDiag.debug(
`startup still pending for realm ${this.url} (elapsed=${elapsed}s)`,
);
},
15_000,
);
if (
heartbeat &&
typeof (heartbeat as unknown as { unref?: () => void }).unref ===
'function'
) {
(heartbeat as unknown as { unref: () => void }).unref();
}
try {
if (this.#copiedFromRealm) {
this.#readinessDiag.debug(
`startup phase=copy from ${this.#copiedFromRealm.href} for realm ${this.url}`,
);
await this.#realmIndexUpdater.copy(this.#copiedFromRealm);
this.#readinessDiag.debug(
`startup phase=copy resolved for realm ${this.url} (elapsed=${Date.now() - startTime}ms)`,
);
this.broadcastRealmEvent({
eventName: 'index',
indexType: 'full',
indexType: 'copy',
sourceRealmURL: this.#copiedFromRealm.href,
realmURL: this.url,
});
} else {
let isNewIndex = await this.#realmIndexUpdater.isNewIndex();
this.#readinessDiag.debug(
`startup isNewIndex=${isNewIndex} fullIndexOnStartup=${this.#fullIndexOnStartup} for realm ${this.url}`,
);
if (isNewIndex || this.#fullIndexOnStartup) {
let priority =
opts?.fromScratchIndexPriority ?? this.#fromScratchIndexPriority;
let promise = this.#realmIndexUpdater.fullIndex(priority);
if (isNewIndex) {
// we only await the full indexing at boot if this is a brand new index
this.#readinessDiag.debug(
`startup awaiting fullIndex (new index) for realm ${this.url}`,
);
await promise;
this.#readinessDiag.debug(
`startup fullIndex resolved for realm ${this.url} (elapsed=${Date.now() - startTime}ms)`,
);
} else {
this.#readinessDiag.debug(
`startup kicked off fire-and-forget fullIndex for realm ${this.url}`,
);
}
// not sure how useful this event is--nothing is currently listening for
// it, and it may happen during or after the full index...
this.broadcastRealmEvent({
eventName: 'index',
indexType: 'full',
realmURL: this.url,
});
}
}
this.#readinessDiag.debug(
`startup awaiting isWorldReadable for realm ${this.url}`,
);
await this.isWorldReadable();
this.#readinessDiag.debug(
`startup complete for realm ${this.url} in ${Date.now() - startTime}ms`,
);
} finally {
if (heartbeat !== undefined) {
clearInterval(heartbeat);
}
}
await this.isWorldReadable();

this.#perfLog.debug(
`realm server ${this.url} startup in ${Date.now() - startTime} ms`,
Expand Down
Loading