From 171c609575653688cd46d7a609e2731d412daf71 Mon Sep 17 00:00:00 2001 From: Luke Melia Date: Wed, 1 Jul 2026 14:41:22 -0400 Subject: [PATCH 1/3] Graceful degradation when a trusted realm server is unreachable at boot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Boot assembly fetched each trusted server's realm list via `_realm-auth` under `Promise.all`. One server failing rejected the whole assembly, which bubbled up to `start()`'s catch and logged the user out — blocking boot and hiding all of the user's realms. - realm-server: assemble via `Promise.allSettled` so reachable servers still contribute their realms; record unreachable servers in tracked `unreachableRealmServers` state (cleared on logout/reset). - matrix-service: don't log out when the boot-time `authenticateToAllAccessibleRealms` fails; add `retryUnreachableRealmServers` plus a bounded background retry that merges recovered realms and clears the notice. - workspace-chooser: show an unobtrusive notice naming the unreachable server(s). - tests: `setRealmAuthFailure` mock toggle; boot-assembly degradation/retry coverage and a notice-rendering test. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../operator-mode/workspace-chooser/index.gts | 51 +++++++++ packages/host/app/services/matrix-service.ts | 91 +++++++++++++++- packages/host/app/services/realm-server.ts | 102 ++++++++++++++---- packages/host/tests/helpers/index.gts | 1 + .../tests/helpers/realm-server-mock/index.ts | 9 ++ .../tests/helpers/realm-server-mock/routes.ts | 3 + .../tests/helpers/realm-server-mock/types.ts | 3 + ...kspace-chooser-unreachable-notice-test.gts | 91 ++++++++++++++++ .../matrix-service-boot-assembly-test.ts | 76 +++++++++++++ 9 files changed, 402 insertions(+), 25 deletions(-) create mode 100644 packages/host/tests/integration/components/workspace-chooser-unreachable-notice-test.gts diff --git a/packages/host/app/components/operator-mode/workspace-chooser/index.gts b/packages/host/app/components/operator-mode/workspace-chooser/index.gts index 20c4cc6608..b470a36f00 100644 --- a/packages/host/app/components/operator-mode/workspace-chooser/index.gts +++ b/packages/host/app/components/operator-mode/workspace-chooser/index.gts @@ -16,6 +16,7 @@ import { Lock, StarFilled, TriangleRight, + Warning as WarningIcon, } from '@cardstack/boxel-ui/icons'; import type { Icon } from '@cardstack/boxel-ui/icons'; @@ -58,6 +59,26 @@ export default class WorkspaceChooser extends Component { return this.realmServer.archivedRealms; } + // Trusted realm servers that couldn't be reached during boot. When present, + // an unobtrusive notice names them so the user understands some workspaces + // may be missing; the notice clears once the background retry recovers them. + private get unreachableRealmServers() { + return this.realmServer.unreachableRealmServers; + } + + private get unreachableRealmServersMessage() { + let hosts = this.unreachableRealmServers.map((serverURL) => { + try { + return new URL(serverURL).host; + } catch { + return serverURL; + } + }); + let servers = + hosts.length === 1 ? 
hosts[0] : `${hosts.length} realm servers`; + return `Couldn’t reach ${servers}. Some workspaces may be missing — retrying…`; + } + // Show the archived count once we've loaded the list (or already have entries // from an archive action this session). Before the first load we don't know // the count, so the row shows just "Archived". @@ -376,6 +397,20 @@ export default class WorkspaceChooser extends Component { {{/in-element}} {{/if}}
+ {{#if this.unreachableRealmServers.length}} +
+ + {{this.unreachableRealmServersMessage}} +
+ {{/if}}
@@ -601,6 +636,22 @@ export default class WorkspaceChooser extends Component { color: var(--boxel-400); font: 400 var(--boxel-font-sm); } + .unreachable-notice { + display: flex; + align-items: center; + gap: var(--boxel-sp-xs); + max-width: 40rem; + padding: var(--boxel-sp-xs) var(--boxel-sp-sm); + border-radius: var(--boxel-border-radius); + background-color: rgba(255 255 255 / 10%); + color: var(--boxel-light); + font: 400 var(--boxel-font-sm); + letter-spacing: var(--boxel-lsp-xs); + } + .unreachable-notice-icon { + --icon-color: var(--boxel-warning-100); + flex-shrink: 0; + } } diff --git a/packages/host/app/services/matrix-service.ts b/packages/host/app/services/matrix-service.ts index a813af5332..d3604ea67f 100644 --- a/packages/host/app/services/matrix-service.ts +++ b/packages/host/app/services/matrix-service.ts @@ -7,7 +7,13 @@ import Service, { service } from '@ember/service'; import { isTesting } from '@embroider/macros'; import { cached, tracked } from '@glimmer/tracking'; -import { dropTask, task, timeout } from 'ember-concurrency'; +import { + dropTask, + rawTimeout, + restartableTask, + task, + timeout, +} from 'ember-concurrency'; import window from 'ember-window-mock'; import { cloneDeep } from 'lodash-es'; @@ -156,6 +162,10 @@ import type * as MatrixSDK from 'matrix-js-sdk'; const { matrixURL } = ENV; const STATE_EVENTS_OF_INTEREST = ['m.room.create', 'm.room.name']; +// Backoff for retrying trusted servers that were unreachable at boot. Bounded +// so a persistently-down server doesn't spin forever. 
+const UNREACHABLE_RETRY_INTERVAL_MS = 10_000; +const MAX_UNREACHABLE_RETRY_ATTEMPTS = 6; const realmEventsLogger = logger('realm:events'); @@ -462,6 +472,7 @@ export default class MatrixService extends Service { await this.loginToRealms(); await this.loadMoreAuthRooms(realmURLs); } + this.scheduleUnreachableRealmServerRetry(); } catch (err) { console.error( 'Failed to assemble realms from trusted servers in app.boxel.realm-servers account data', @@ -1041,7 +1052,18 @@ export default class MatrixService extends Service { if (isTesting()) console.warn('[start-phase] authenticateToAllAccessibleRealms'); - await this.realmServer.authenticateToAllAccessibleRealms(); + try { + await this.realmServer.authenticateToAllAccessibleRealms(); + } catch (e) { + // A trusted server being unreachable must not fail boot. Boot + // assembly already recorded it in `unreachableRealmServers` and a + // retry is scheduled below; realms from reachable servers still + // authenticate individually via `loginToRealms`. + console.error( + 'Failed to authenticate to all accessible realms at boot', + e, + ); + } } // Login here triggers other setup code that needs to happen after // otherwise we don't have the realm info. @@ -1051,6 +1073,11 @@ export default class MatrixService extends Service { this.postLoginCompleted = true; if (isTesting()) console.warn('[start-phase] postLoginCompleted=true'); + + // If any trusted server was unreachable during boot assembly, keep + // the reachable realms and retry the unreachable ones in the + // background so they load (and the notice clears) once they recover. + this.scheduleUnreachableRealmServerRetry(); } catch (e) { console.log('Error starting Matrix client', e); await this.logout(); @@ -1177,6 +1204,66 @@ export default class MatrixService extends Service { ); } + // Re-attempt the trusted servers that were unreachable during boot + // assembly. 
On success their realms are merged into the available list and + // authenticated so they appear without a reload; the "couldn't reach + // {server}" notice clears as `unreachableRealmServers` empties. Returns true + // once every previously-unreachable server has recovered. Public so tests + // can drive recovery deterministically rather than waiting on the background + // timer. + async retryUnreachableRealmServers(): Promise { + let toRetry = [...this.realmServer.unreachableRealmServers]; + if (toRetry.length === 0) { + return true; + } + let recovered = + await this.realmServer.fetchUserRealmsFromTrustedServers(toRetry); + if (recovered.length > 0) { + let merged = [ + ...new Set([ + ...this.realmServer.userRealmIdentifiers, + ...recovered.map(ri), + ]), + ]; + await this.realmServer.setAvailableRealmIdentifiers(merged); + await this.loginToRealms(); + await this.loadMoreAuthRooms(recovered); + } + return this.realmServer.unreachableRealmServers.length === 0; + } + + private scheduleUnreachableRealmServerRetry() { + if (isTesting()) { + // Tests drive recovery via `retryUnreachableRealmServers()` directly so + // the assertions are deterministic; skip the background timer loop, which + // would otherwise keep firing while a stubbed server stays down. 
+ return; + } + if (this.realmServer.unreachableRealmServers.length === 0) { + return; + } + this.retryUnreachableRealmServersTask.perform(); + } + + private retryUnreachableRealmServersTask = restartableTask(async () => { + for ( + let attempt = 0; + attempt < MAX_UNREACHABLE_RETRY_ATTEMPTS && + this.realmServer.unreachableRealmServers.length > 0; + attempt++ + ) { + await rawTimeout(UNREACHABLE_RETRY_INTERVAL_MS); + if (this.isDestroying || this.isDestroyed) { + return; + } + try { + await this.retryUnreachableRealmServers(); + } catch (err) { + console.error('Failed to retry unreachable realm servers', err); + } + } + }); + async createRealmSession(realmURL: URL) { await this.#clientReadyDeferred.promise; return this.client.createRealmSession(realmURL); diff --git a/packages/host/app/services/realm-server.ts b/packages/host/app/services/realm-server.ts index 4dfb069623..fb17b2533b 100644 --- a/packages/host/app/services/realm-server.ts +++ b/packages/host/app/services/realm-server.ts @@ -110,6 +110,11 @@ export default class RealmServerService extends Service { ]); private archivedRealmsList = new TrackedArray([]); private archivedRealmsFetched = false; + // Trusted servers whose `_realm-auth` call failed at boot assembly (network + // error, timeout, or non-2xx). Tracked so the UI can surface an unobtrusive + // "couldn't reach {server}" notice; entries clear as a retry recovers each + // server. + private unreachableRealmServersList = new TrackedArray([]); private _ready = new Deferred(); private eventSubscribers: Map = new Map(); @@ -142,6 +147,10 @@ export default class RealmServerService extends Service { // reference) is what makes the getter recompute to the empty list. 
this.archivedRealmsList.splice(0, this.archivedRealmsList.length); this.archivedRealmsFetched = false; + this.unreachableRealmServersList.splice( + 0, + this.unreachableRealmServersList.length, + ); this.eventSubscribers = new Map(); this._ready = new Deferred(); this._ready.fulfill(); @@ -326,6 +335,10 @@ export default class RealmServerService extends Service { type: 'base', url: baseRealm.url, }); + this.unreachableRealmServersList.splice( + 0, + this.unreachableRealmServersList.length, + ); window.localStorage.removeItem(sessionLocalStorageKey); } @@ -375,31 +388,74 @@ export default class RealmServerService extends Service { // TODO: remove once multi-realm-server federation lands. this.assertOwnRealmServer(trustedServerURLs); await this.login(); - let perServerRealmURLs = await Promise.all( - trustedServerURLs.map(async (serverURL) => { - let normalizedServerURL = ensureTrailingSlash(serverURL); - let response = await this.network.fetch( - `${normalizedServerURL}_realm-auth`, - { - method: 'POST', - headers: { - Accept: SupportedMimeType.JSONAPI, - 'Content-Type': 'application/json', - Authorization: `Bearer ${this.token}`, - }, - }, + // A trusted server that's unreachable (network error, timeout, or a + // non-2xx `_realm-auth`) must never block boot or hide the realms served + // by the servers that *are* reachable. `allSettled` lets us assemble from + // the reachable servers, record the unreachable ones so a notice can name + // them, and (via matrix-service) schedule a retry. 
+ let results = await Promise.allSettled( + trustedServerURLs.map((serverURL) => + this.fetchUserRealmsFromServer(serverURL), + ), + ); + let realmURLs: string[] = []; + results.forEach((result, index) => { + let normalizedServerURL = ensureTrailingSlash(trustedServerURLs[index]); + if (result.status === 'fulfilled') { + realmURLs.push(...result.value); + this.markRealmServerReachable(normalizedServerURL); + } else { + this.markRealmServerUnreachable(normalizedServerURL); + console.error( + `Failed to fetch user realms from trusted server ${normalizedServerURL}`, + result.reason, ); - if (!response.ok) { - let responseText = await response.text(); - throw new Error( - `Failed to fetch user realms from trusted server ${normalizedServerURL}: ${response.status} - ${responseText}`, - ); - } - let tokens = (await response.json()) as Record; - return Object.keys(tokens); - }), + } + }); + return [...new Set(realmURLs)]; + } + + private async fetchUserRealmsFromServer( + serverURL: string, + ): Promise { + let normalizedServerURL = ensureTrailingSlash(serverURL); + let response = await this.network.fetch( + `${normalizedServerURL}_realm-auth`, + { + method: 'POST', + headers: { + Accept: SupportedMimeType.JSONAPI, + 'Content-Type': 'application/json', + Authorization: `Bearer ${this.token}`, + }, + }, ); - return [...new Set(perServerRealmURLs.flat())]; + if (!response.ok) { + let responseText = await response.text(); + throw new Error( + `Failed to fetch user realms from trusted server ${normalizedServerURL}: ${response.status} - ${responseText}`, + ); + } + let tokens = (await response.json()) as Record; + return Object.keys(tokens); + } + + @cached + get unreachableRealmServers(): string[] { + return [...this.unreachableRealmServersList]; + } + + private markRealmServerUnreachable(serverURL: string) { + if (!this.unreachableRealmServersList.includes(serverURL)) { + this.unreachableRealmServersList.push(serverURL); + } + } + + private markRealmServerReachable(serverURL: 
string) { + let index = this.unreachableRealmServersList.indexOf(serverURL); + if (index >= 0) { + this.unreachableRealmServersList.splice(index, 1); + } } @cached diff --git a/packages/host/tests/helpers/index.gts b/packages/host/tests/helpers/index.gts index ad1496f45f..fb3a20118f 100644 --- a/packages/host/tests/helpers/index.gts +++ b/packages/host/tests/helpers/index.gts @@ -95,6 +95,7 @@ export { createJWT, testRealmSecretSeed } from './test-auth'; export { registerRealmAuthSessionRoomEnsurer, resetCatalogRealmURL, + setRealmAuthFailure, setupAuthEndpoints, setCatalogRealmURL, } from './realm-server-mock'; diff --git a/packages/host/tests/helpers/realm-server-mock/index.ts b/packages/host/tests/helpers/realm-server-mock/index.ts index 5c63686117..e8286085ef 100644 --- a/packages/host/tests/helpers/realm-server-mock/index.ts +++ b/packages/host/tests/helpers/realm-server-mock/index.ts @@ -87,6 +87,15 @@ export function setupAuthEndpoints( } } +// Simulate a trusted realm server going unreachable (or recovering) at its +// `_realm-auth` endpoint, so boot-assembly graceful-degradation and retry can +// be exercised deterministically. 
+export function setRealmAuthFailure(shouldFail: boolean) { + let network = getService('network') as NetworkService; + let state = ensureRealmServerMockState(network); + state.failRealmAuth = shouldFail; +} + export function registerRealmAuthSessionRoomEnsurer( callback: EnsureSessionRoom, ) { diff --git a/packages/host/tests/helpers/realm-server-mock/routes.ts b/packages/host/tests/helpers/realm-server-mock/routes.ts index 0c7309854e..8f796fe81d 100644 --- a/packages/host/tests/helpers/realm-server-mock/routes.ts +++ b/packages/host/tests/helpers/realm-server-mock/routes.ts @@ -301,6 +301,9 @@ function registerAuthRoutes() { registerRealmServerRoute({ path: '/_realm-auth', handler: async (_req, _url, state: RealmServerMockState) => { + if (state.failRealmAuth) { + return new Response('realm server unreachable', { status: 503 }); + } let realmServerURL = ensureTrailingSlash(_url.origin); const authTokens: Record = {}; for (let [realmURL, permissions] of state.realmPermissions.entries()) { diff --git a/packages/host/tests/helpers/realm-server-mock/types.ts b/packages/host/tests/helpers/realm-server-mock/types.ts index f28e3d9149..9e1e074b39 100644 --- a/packages/host/tests/helpers/realm-server-mock/types.ts +++ b/packages/host/tests/helpers/realm-server-mock/types.ts @@ -13,6 +13,9 @@ export type RealmServerMockState = { archivedRealms: Map; mountedVirtualNetwork?: unknown; ensureSessionRoom?: EnsureSessionRoom; + // When true, `_realm-auth` responds 503 — used to simulate a trusted realm + // server that's unreachable during boot assembly. 
+ failRealmAuth?: boolean; }; export type RealmServerMockRouteHandler = ( diff --git a/packages/host/tests/integration/components/workspace-chooser-unreachable-notice-test.gts b/packages/host/tests/integration/components/workspace-chooser-unreachable-notice-test.gts new file mode 100644 index 0000000000..0784fcd6f0 --- /dev/null +++ b/packages/host/tests/integration/components/workspace-chooser-unreachable-notice-test.gts @@ -0,0 +1,91 @@ +import type { RenderingTestContext } from '@ember/test-helpers'; +import { settled } from '@ember/test-helpers'; + +import GlimmerComponent from '@glimmer/component'; + +import { getService } from '@universal-ember/test-support'; +import { module, test } from 'qunit'; + +import { baseRealm, ensureTrailingSlash } from '@cardstack/runtime-common'; + +import WorkspaceChooser from '@cardstack/host/components/operator-mode/workspace-chooser'; +import ENV from '@cardstack/host/config/environment'; +import type MatrixService from '@cardstack/host/services/matrix-service'; +import type RealmServerService from '@cardstack/host/services/realm-server'; + +import { + testRealmURL, + setupIntegrationTestRealm, + setupLocalIndexing, + setRealmAuthFailure, +} from '../../helpers'; +import { setupBaseRealm } from '../../helpers/base-realm'; +import { setupMockMatrix } from '../../helpers/mock-matrix'; +import { renderComponent } from '../../helpers/render-component'; +import { setupRenderingTest } from '../../helpers/setup'; + +const testRealmServerURL = ensureTrailingSlash(ENV.realmServerURL); + +// The workspace chooser surfaces an unobtrusive notice naming any trusted +// realm server that couldn't be reached during boot assembly, so the user +// understands some workspaces may be missing. The notice clears once a retry +// recovers the server. 
+module( + 'Integration | workspace-chooser | unreachable realm server notice', + function (hooks) { + setupRenderingTest(hooks); + setupBaseRealm(hooks); + setupLocalIndexing(hooks); + + let mockMatrixUtils = setupMockMatrix(hooks, { + loggedInAs: '@testuser:localhost', + activeRealms: [baseRealm.url, testRealmURL], + activeRealmServers: [testRealmServerURL], + }); + + hooks.beforeEach(async function (this: RenderingTestContext) { + await setupIntegrationTestRealm({ + mockMatrixUtils, + contents: {}, + startMatrix: false, + }); + let realmServer = getService('realm-server') as RealmServerService; + await realmServer.setAvailableRealmIdentifiers([]); + // The trusted server is unreachable while boot assembles the realm list. + setRealmAuthFailure(true); + let matrixService = getService('matrix-service') as MatrixService; + await matrixService.ready; + await matrixService.start(); + }); + + test('shows a notice naming the unreachable server, then clears it after a successful retry', async function (assert) { + await renderComponent( + class TestDriver extends GlimmerComponent { + + }, + ); + + assert + .dom('[data-test-unreachable-realm-servers-notice]') + .exists('the notice is shown while the trusted server is unreachable'); + assert + .dom('[data-test-unreachable-realm-servers-notice]') + .containsText( + new URL(testRealmServerURL).host, + 'the notice names the server', + ); + + // The server recovers; retrying assembly clears the notice. 
+ setRealmAuthFailure(false); + let matrixService = getService('matrix-service') as MatrixService; + await matrixService.retryUnreachableRealmServers(); + await settled(); + + assert + .dom('[data-test-unreachable-realm-servers-notice]') + .doesNotExist('the notice clears once the server is reachable again'); + }); + }, +); diff --git a/packages/host/tests/integration/matrix-service-boot-assembly-test.ts b/packages/host/tests/integration/matrix-service-boot-assembly-test.ts index 7ecdb6e192..5ad0779cd5 100644 --- a/packages/host/tests/integration/matrix-service-boot-assembly-test.ts +++ b/packages/host/tests/integration/matrix-service-boot-assembly-test.ts @@ -13,6 +13,7 @@ import { testRealmURL, setupIntegrationTestRealm, setupLocalIndexing, + setRealmAuthFailure, } from '../helpers'; import { setupBaseRealm } from '../helpers/base-realm'; @@ -296,6 +297,81 @@ module( }, ); +// Graceful degradation: a trusted server that's unreachable during boot +// assembly (its `_realm-auth` fails/times out) must never block boot or hide +// the realms served by the servers that are reachable. The unreachable server +// is recorded so a notice can name it, and a retry recovers it — the notice +// clears on success. +module( + 'Integration | matrix-service | graceful degradation when a trusted server is unreachable', + function (hooks) { + setupRenderingTest(hooks); + setupBaseRealm(hooks); + setupLocalIndexing(hooks); + + let mockMatrixUtils = setupMockMatrix(hooks, { + loggedInAs: '@testuser:localhost', + activeRealms: [baseRealm.url, testRealmURL], + activeRealmServers: [testRealmServerURL], + }); + + hooks.beforeEach(async function (this: RenderingTestContext) { + await setupIntegrationTestRealm({ + mockMatrixUtils, + contents: {}, + startMatrix: false, + }); + let realmServer = getService('realm-server') as RealmServerService; + await realmServer.setAvailableRealmIdentifiers([]); + // Simulate the trusted server being unreachable during boot assembly. 
+ setRealmAuthFailure(true); + let matrixService = getService('matrix-service') as MatrixService; + await matrixService.ready; + await matrixService.start(); + }); + + test('boot completes without dropping the reachable base realm', async function (assert) { + let realmServer = getService('realm-server') as RealmServerService; + assert.ok( + realmServer.availableRealmIdentifiers.includes(ri(baseRealm.url)), + 'the base realm still loads when a trusted server is unreachable', + ); + assert.notOk( + realmServer.availableRealmIdentifiers.includes(ri(testRealmURL)), + 'the unreachable server’s realm is not in the list yet', + ); + }); + + test('the unreachable trusted server is recorded so a notice can name it', async function (assert) { + let realmServer = getService('realm-server') as RealmServerService; + assert.deepEqual( + realmServer.unreachableRealmServers, + [testRealmServerURL], + 'the unreachable trusted server is recorded', + ); + }); + + test('retry recovers the realm and clears the notice once the server is reachable', async function (assert) { + let realmServer = getService('realm-server') as RealmServerService; + let matrixService = getService('matrix-service') as MatrixService; + + setRealmAuthFailure(false); + let allRecovered = await matrixService.retryUnreachableRealmServers(); + + assert.ok(allRecovered, 'retry reports all servers recovered'); + assert.ok( + realmServer.availableRealmIdentifiers.includes(ri(testRealmURL)), + 'the previously-unreachable realm now appears', + ); + assert.deepEqual( + realmServer.unreachableRealmServers, + [], + 'the notice clears once the server is reachable', + ); + }); + }, +); + module( 'Integration | matrix-service | already-migrated account is untouched', function (hooks) { From 3ba99f86549aabbd02f063d87b9ca22c3211f9ad Mon Sep 17 00:00:00 2001 From: Luke Melia Date: Wed, 1 Jul 2026 15:25:18 -0400 Subject: [PATCH 2/3] Address review: don't wipe realms on transient refresh; narrow boot auth catch Two issues from 
Codex review: - P1: the runtime `app.boxel.realm-servers` account-data handler replaced the available-realms list with the partial result from `fetchUserRealmsFromTrustedServers`, so a transiently-unreachable server erased already-loaded workspaces. Extracted the handler body into `applyTrustedRealmServersAccountData`, which merges (never removes) while any server is unreachable and only replaces on a fully reachable assembly. - P2: the boot-time catch around `authenticateToAllAccessibleRealms` swallowed every failure. Now it only swallows when a trusted server was actually recorded unreachable; otherwise it rethrows so boot fails loudly instead of reaching `postLoginCompleted` unauthenticated. Added a regression test that a refresh during an outage keeps the loaded realm. Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/host/app/services/matrix-service.ts | 59 ++++++++++++------ .../matrix-service-boot-assembly-test.ts | 60 +++++++++++++++++++ 2 files changed, 102 insertions(+), 17 deletions(-) diff --git a/packages/host/app/services/matrix-service.ts b/packages/host/app/services/matrix-service.ts index d3604ea67f..8df3bd2e59 100644 --- a/packages/host/app/services/matrix-service.ts +++ b/packages/host/app/services/matrix-service.ts @@ -461,18 +461,7 @@ export default class MatrixService extends Service { // start(); here we log and leave the available-realms list as // it was. 
try { - let realmURLs = - await this.realmServer.fetchUserRealmsFromTrustedServers( - realmServers, - ); - await this.realmServer.setAvailableRealmIdentifiers( - realmURLs.map(ri), - ); - if (this.postLoginCompleted) { - await this.loginToRealms(); - await this.loadMoreAuthRooms(realmURLs); - } - this.scheduleUnreachableRealmServerRetry(); + await this.applyTrustedRealmServersAccountData(realmServers); } catch (err) { console.error( 'Failed to assemble realms from trusted servers in app.boxel.realm-servers account data', @@ -1055,12 +1044,18 @@ export default class MatrixService extends Service { try { await this.realmServer.authenticateToAllAccessibleRealms(); } catch (e) { - // A trusted server being unreachable must not fail boot. Boot - // assembly already recorded it in `unreachableRealmServers` and a - // retry is scheduled below; realms from reachable servers still - // authenticate individually via `loginToRealms`. + // A trusted server being unreachable must not fail boot: assembly + // recorded it in `unreachableRealmServers`, a retry is scheduled + // below, and realms from reachable servers still authenticate + // individually via `loginToRealms`. But only swallow when there's + // actually an unreachable server to blame — otherwise this is an + // unrelated auth failure and boot must fail loudly (logout) rather + // than proceed to `postLoginCompleted` while unauthenticated. + if (this.realmServer.unreachableRealmServers.length === 0) { + throw e; + } console.error( - 'Failed to authenticate to all accessible realms at boot', + 'Failed to authenticate to all accessible realms because a trusted server is unreachable', e, ); } @@ -1204,6 +1199,36 @@ export default class MatrixService extends Service { ); } + // Re-assemble the available-realms list from a runtime + // `app.boxel.realm-servers` account-data event. 
Unlike the fail-loud boot + // assembly, an event-time refresh must be conservative: because + // `fetchUserRealmsFromTrustedServers` now returns a partial list when a + // trusted server is unreachable (rather than throwing), replacing the list + // with that partial result would erase the realms served by a server that's + // only transiently down. So when any server was unreachable this round we + // merge (add newly-discovered realms, never remove) and let the retry + // reconcile; only a fully reachable assembly is authoritative enough to + // remove realms. Called by the AccountData listener and directly by tests. + async applyTrustedRealmServersAccountData(realmServers: string[]) { + let realmURLs = + await this.realmServer.fetchUserRealmsFromTrustedServers(realmServers); + if (this.realmServer.unreachableRealmServers.length > 0) { + await this.realmServer.setAvailableRealmIdentifiers([ + ...new Set([ + ...this.realmServer.userRealmIdentifiers, + ...realmURLs.map(ri), + ]), + ]); + } else { + await this.realmServer.setAvailableRealmIdentifiers(realmURLs.map(ri)); + } + if (this.postLoginCompleted) { + await this.loginToRealms(); + await this.loadMoreAuthRooms(realmURLs); + } + this.scheduleUnreachableRealmServerRetry(); + } + // Re-attempt the trusted servers that were unreachable during boot // assembly. 
On success their realms are merged into the available list and // authenticated so they appear without a reload; the "couldn't reach diff --git a/packages/host/tests/integration/matrix-service-boot-assembly-test.ts b/packages/host/tests/integration/matrix-service-boot-assembly-test.ts index 5ad0779cd5..f08af1aaea 100644 --- a/packages/host/tests/integration/matrix-service-boot-assembly-test.ts +++ b/packages/host/tests/integration/matrix-service-boot-assembly-test.ts @@ -372,6 +372,66 @@ module( }, ); +// A runtime `app.boxel.realm-servers` account-data refresh (as opposed to the +// fail-loud boot assembly) must not erase already-loaded workspaces when a +// trusted server is transiently unreachable. The event-time path merges rather +// than replaces while any server is unreachable. +module( + 'Integration | matrix-service | account-data refresh survives a transient outage', + function (hooks) { + setupRenderingTest(hooks); + setupBaseRealm(hooks); + setupLocalIndexing(hooks); + + let mockMatrixUtils = setupMockMatrix(hooks, { + loggedInAs: '@testuser:localhost', + activeRealms: [baseRealm.url, testRealmURL], + activeRealmServers: [testRealmServerURL], + }); + + hooks.beforeEach(async function (this: RenderingTestContext) { + await setupIntegrationTestRealm({ + mockMatrixUtils, + contents: {}, + startMatrix: false, + }); + let realmServer = getService('realm-server') as RealmServerService; + await realmServer.setAvailableRealmIdentifiers([]); + // Boot healthy so the trusted-servers path is authoritative and the + // user's realm is loaded before the simulated outage. 
+ let matrixService = getService('matrix-service') as MatrixService; + await matrixService.ready; + await matrixService.start(); + }); + + test('a refresh while the server is unreachable keeps the already-loaded realm', async function (assert) { + let realmServer = getService('realm-server') as RealmServerService; + let matrixService = getService('matrix-service') as MatrixService; + + assert.ok( + realmServer.availableRealmIdentifiers.includes(ri(testRealmURL)), + 'the realm is loaded after a healthy boot', + ); + + // The server goes down; a runtime account-data refresh arrives. + setRealmAuthFailure(true); + await matrixService.applyTrustedRealmServersAccountData([ + testRealmServerURL, + ]); + + assert.ok( + realmServer.availableRealmIdentifiers.includes(ri(testRealmURL)), + 'the transiently-unreachable realm is not wiped from the list', + ); + assert.deepEqual( + realmServer.unreachableRealmServers, + [testRealmServerURL], + 'the server is recorded as unreachable so the notice shows', + ); + }); + }, +); + module( 'Integration | matrix-service | already-migrated account is untouched', function (hooks) { From a41b9de7f71bfa9a33127a5287cb04339661e5f4 Mon Sep 17 00:00:00 2001 From: Luke Melia Date: Sat, 4 Jul 2026 11:36:06 -0400 Subject: [PATCH 3/3] Name every unreachable realm server in the workspace-chooser notice The notice named the server only when exactly one was unreachable; with more than one it fell back to "N realm servers" and named none, contradicting the notice's contract. Join the host list so every unreachable server is named. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../app/components/operator-mode/workspace-chooser/index.gts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/packages/host/app/components/operator-mode/workspace-chooser/index.gts b/packages/host/app/components/operator-mode/workspace-chooser/index.gts index b470a36f00..4dfbd7f6bd 100644 --- a/packages/host/app/components/operator-mode/workspace-chooser/index.gts +++ b/packages/host/app/components/operator-mode/workspace-chooser/index.gts @@ -74,8 +74,9 @@ export default class WorkspaceChooser extends Component { return serverURL; } }); - let servers = - hosts.length === 1 ? hosts[0] : `${hosts.length} realm servers`; + // Name every unreachable server, per the notice's contract. Trusted + // servers are few in practice, so a comma-joined list stays readable. + let servers = hosts.join(', '); return `Couldn’t reach ${servers}. Some workspaces may be missing — retrying…`; }