diff --git a/.server-changes/realtime-runs-subscription-scalability.md b/.server-changes/realtime-runs-subscription-scalability.md new file mode 100644 index 00000000000..5de00aae675 --- /dev/null +++ b/.server-changes/realtime-runs-subscription-scalability.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Add a new backend for the realtime runs feed (single runs, tags, and batches) that scales under high concurrency, available behind a feature flag diff --git a/apps/supervisor/package.json b/apps/supervisor/package.json index 7a3537dbc04..2725fe2b729 100644 --- a/apps/supervisor/package.json +++ b/apps/supervisor/package.json @@ -18,7 +18,7 @@ "@kubernetes/client-node": "^1.0.0", "@trigger.dev/core": "workspace:*", "dockerode": "^4.0.6", - "ioredis": "^5.3.2", + "ioredis": "~5.6.0", "p-limit": "^6.2.0", "prom-client": "^15.1.0", "socket.io": "4.7.4", diff --git a/apps/webapp/app/entry.server.tsx b/apps/webapp/app/entry.server.tsx index 9996eb7b30a..1282127cb20 100644 --- a/apps/webapp/app/entry.server.tsx +++ b/apps/webapp/app/entry.server.tsx @@ -27,6 +27,7 @@ import { registerRunEngineEventBusHandlers, setupBatchQueueCallbacks, } from "./v3/runEngineHandlers.server"; +import { registerRunChangeNotifierHandlers } from "./services/realtime/runChangeNotifierHandlers.server"; // Touch the sessions replication singleton at entry so it boots deterministically // on webapp startup. The singleton's initializer wires start (gated on // `clickhouseFactory.isReady()`) and SIGTERM/SIGINT shutdown — mirrors @@ -269,6 +270,9 @@ process.on("uncaughtException", (error, origin) => { singleton("RunEngineEventBusHandlers", registerRunEngineEventBusHandlers); singleton("SetupBatchQueueCallbacks", setupBatchQueueCallbacks); +// Attach the realtime run-changed publish delegations to the engine event bus. +// No-ops (registers nothing) unless REALTIME_BACKEND_NATIVE_ENABLED=1. 
+singleton("RunChangeNotifierHandlers", registerRunChangeNotifierHandlers); // Wrapped in singleton() so Remix's dev-mode CJS reloads don't append // duplicate copies of the processor — Sentry's processor list lives in diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index c55bb424001..38cbf6e07db 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -300,6 +300,45 @@ const EnvironmentSchema = z .int() .default(24 * 60 * 60 * 1000), // 1 day in milliseconds + // Master switch for the native realtime backend; off = Electric serves everything, publishes no-op. + REALTIME_BACKEND_NATIVE_ENABLED: z.string().default("0"), + // Live long-poll backstop hold (ms); matches Electric's ~20s cadence. + REALTIME_BACKEND_NATIVE_LIVE_POLL_TIMEOUT_MS: z.coerce.number().int().default(20_000), + // Jitter ratio on the live-poll hold (0.15 = ±15%) to avoid synchronized refetch herds. + REALTIME_BACKEND_NATIVE_LIVE_POLL_JITTER_RATIO: z.coerce.number().default(0.15), + // Hard cap on the tag-list snapshot size. + REALTIME_BACKEND_NATIVE_MAX_LIST_RESULTS: z.coerce.number().int().default(1_000), + // TTL/size of the coalescing cache for the multi-run resolve+hydrate (same-filter feeds share one query). + REALTIME_BACKEND_NATIVE_RUNSET_CACHE_TTL_MS: z.coerce.number().int().default(1_000), + REALTIME_BACKEND_NATIVE_RUNSET_CACHE_MAX_ENTRIES: z.coerce.number().int().default(5_000), + // Size/TTL of the per-handle working-set cache used to diff multi-run live polls. + REALTIME_BACKEND_NATIVE_WORKING_SET_MAX_ENTRIES: z.coerce.number().int().default(10_000), + REALTIME_BACKEND_NATIVE_WORKING_SET_TTL_MS: z.coerce.number().int().default(300_000), + // Bucket (ms) the tag-list createdAt floor is quantized to so same-tag feeds share a cache entry; 0 disables. + REALTIME_BACKEND_NATIVE_RUNSET_CREATED_AT_BUCKET_MS: z.coerce.number().int().default(60_000), + // Leading-edge throttle (ms) on per-env wake delivery; 0 wakes on every change. 
+ REALTIME_BACKEND_NATIVE_ENV_WAKE_COALESCE_WINDOW_MS: z.coerce.number().int().default(250), + // "1" shares per-connection replay cursors fleet-wide via Redis, so a load-balancer hop reads the connection's true inter-poll gap instead of cold-resolving. + REALTIME_BACKEND_NATIVE_SHARED_REPLAY_CURSORS: z.string().default("1"), + // "1" holds a multi-run live poll open on a non-matching wake instead of replying up-to-date. + REALTIME_BACKEND_NATIVE_HOLD_ON_EMPTY: z.string().default("1"), + // Max concurrent fresh ClickHouse resolves per instance (reconnect-stampede gate); 0 disables. + REALTIME_BACKEND_NATIVE_RESOLVE_ADMISSION_LIMIT: z.coerce.number().int().default(16), + // Replay window (ms) for buffered change records delivered to newly-armed feeds; 0 disables. + REALTIME_BACKEND_NATIVE_REPLAY_WINDOW_MS: z.coerce.number().int().default(2_000), + // Cap on buffered recent records per env (latest record per run). + REALTIME_BACKEND_NATIVE_REPLAY_MAX_RUNS: z.coerce.number().int().default(512), + // Keep an env subscribed + buffering this long (ms) after its last feed closes; 0 disables. + REALTIME_BACKEND_NATIVE_UNSUBSCRIBE_LINGER_MS: z.coerce.number().int().default(5_000), + // Fallback per-env concurrent-connection limit when the org has none configured. + REALTIME_BACKEND_NATIVE_DEFAULT_CONCURRENCY_LIMIT: z.coerce.number().int().default(100_000), + // TTL/size of the single-run read-through cache that collapses duplicate refetch bursts. + REALTIME_BACKEND_NATIVE_RUN_CACHE_TTL_MS: z.coerce.number().int().default(250), + REALTIME_BACKEND_NATIVE_RUN_CACHE_MAX_ENTRIES: z.coerce.number().int().default(5_000), + // TTL/size of the per-org realtimeBackend flag cache used to pick the serving backend. 
+ REALTIME_BACKEND_FLAG_CACHE_TTL_MS: z.coerce.number().int().default(30_000), + REALTIME_BACKEND_FLAG_CACHE_MAX_ENTRIES: z.coerce.number().int().default(50_000), + PUBSUB_REDIS_HOST: z .string() .optional() @@ -332,6 +371,36 @@ const EnvironmentSchema = z PUBSUB_REDIS_TLS_DISABLED: z.string().default(process.env.REDIS_TLS_DISABLED ?? "false"), PUBSUB_REDIS_CLUSTER_MODE_ENABLED: z.string().default("0"), + // Dedicated pub/sub Redis for the native realtime backend; falls back to PUBSUB_REDIS_* then REDIS_*. + REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_HOST: z + .string() + .optional() + .transform((v) => v ?? process.env.PUBSUB_REDIS_HOST ?? process.env.REDIS_HOST), + REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_PORT: z.coerce + .number() + .optional() + .transform((v) => { + if (v !== undefined) return v; + const raw = process.env.PUBSUB_REDIS_PORT ?? process.env.REDIS_PORT; + return raw ? parseInt(raw) : undefined; + }), + REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_USERNAME: z + .string() + .optional() + .transform((v) => v ?? process.env.PUBSUB_REDIS_USERNAME ?? process.env.REDIS_USERNAME), + REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_PASSWORD: z + .string() + .optional() + .transform((v) => v ?? process.env.PUBSUB_REDIS_PASSWORD ?? process.env.REDIS_PASSWORD), + REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_TLS_DISABLED: z + .string() + .default(process.env.PUBSUB_REDIS_TLS_DISABLED ?? process.env.REDIS_TLS_DISABLED ?? "false"), + REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_CLUSTER_MODE_ENABLED: z + .string() + .default(process.env.PUBSUB_REDIS_CLUSTER_MODE_ENABLED ?? "0"), + // Use sharded pub/sub (SSUBSCRIBE/SPUBLISH) in cluster mode; "0" forces classic pub/sub. 
+ REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_SHARDED_ENABLED: z.string().default("1"), + DEFAULT_ENV_EXECUTION_CONCURRENCY_LIMIT: z.coerce.number().int().default(100), DEFAULT_ENV_EXECUTION_CONCURRENCY_BURST_FACTOR: z.coerce.number().default(1.0), DEFAULT_ORG_EXECUTION_CONCURRENCY_LIMIT: z.coerce.number().int().default(300), @@ -1608,6 +1677,18 @@ const EnvironmentSchema = z .enum(["log", "error", "warn", "info", "debug"]) .default("info"), RUN_ENGINE_CLICKHOUSE_COMPRESSION_REQUEST: z.string().default("1"), + // Dedicated ClickHouse pool for the native backend's tag/batch id resolution; falls back to CLICKHOUSE_URL. + REALTIME_BACKEND_NATIVE_CLICKHOUSE_URL: z + .string() + .optional() + .transform((v) => v ?? process.env.CLICKHOUSE_URL), + REALTIME_BACKEND_NATIVE_CLICKHOUSE_KEEP_ALIVE_ENABLED: z.string().default("1"), + REALTIME_BACKEND_NATIVE_CLICKHOUSE_KEEP_ALIVE_IDLE_SOCKET_TTL_MS: z.coerce.number().int().optional(), + REALTIME_BACKEND_NATIVE_CLICKHOUSE_MAX_OPEN_CONNECTIONS: z.coerce.number().int().default(10), + REALTIME_BACKEND_NATIVE_CLICKHOUSE_LOG_LEVEL: z + .enum(["log", "error", "warn", "info", "debug"]) + .default("info"), + REALTIME_BACKEND_NATIVE_CLICKHOUSE_COMPRESSION_REQUEST: z.string().default("1"), EVENTS_CLICKHOUSE_BATCH_SIZE: z.coerce.number().int().default(1000), EVENTS_CLICKHOUSE_FLUSH_INTERVAL_MS: z.coerce.number().int().default(1000), METRICS_CLICKHOUSE_BATCH_SIZE: z.coerce.number().int().default(10000), diff --git a/apps/webapp/app/models/runtimeEnvironment.server.ts b/apps/webapp/app/models/runtimeEnvironment.server.ts index 64b1da3be49..be05adaa8a7 100644 --- a/apps/webapp/app/models/runtimeEnvironment.server.ts +++ b/apps/webapp/app/models/runtimeEnvironment.server.ts @@ -237,10 +237,20 @@ export async function findEnvironmentBySlug( return environment ? toAuthenticated(environment) : null; } +// The authenticated environment plus the run scalars the realtime publish needs. +// Both come from one taskRun read — see findEnvironmentFromRun. 
+export type EnvironmentFromRun = { + environment: AuthenticatedEnvironment; + runTags: string[]; + batchId: string | null; +}; + export async function findEnvironmentFromRun( runId: string, tx?: PrismaClientOrTransaction -): Promise { +): Promise { + // The include (no select) already pulls every taskRun scalar, so runTags/batchId + // ride along for free — no extra query for the realtime publish to send a full record. const taskRun = await (tx ?? $replica).taskRun.findFirst({ where: { id: runId, @@ -249,7 +259,14 @@ export async function findEnvironmentFromRun( runtimeEnvironment: { include: authIncludeBase }, }, }); - return taskRun?.runtimeEnvironment ? toAuthenticated(taskRun.runtimeEnvironment) : null; + if (!taskRun?.runtimeEnvironment) { + return null; + } + return { + environment: toAuthenticated(taskRun.runtimeEnvironment), + runTags: taskRun.runTags, + batchId: taskRun.batchId, + }; } export async function createNewSession( diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts b/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts index ceae1efb4b4..d3d92d1fe5d 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts @@ -12,6 +12,7 @@ import type { AuthenticatedEnvironment } from "~/services/apiAuth.server"; import { authenticateApiRequest } from "~/services/apiAuth.server"; import { logger } from "~/services/logger.server"; import { updateMetadataService } from "~/services/metadata/updateMetadataInstance.server"; +import { publishChangeRecord } from "~/services/realtime/runChangeNotifierInstance.server"; import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server"; import { ServiceValidationError } from "~/v3/services/common.server"; import { applyMetadataMutationToBufferedRun } from "~/v3/mollifier/applyMetadataMutation.server"; @@ -184,7 +185,15 @@ const { action } = createActionApiRoute( return json({ error: "Internal Server Error" }, { status: 
500 }); } if (pgResult) { - return json(pgResult, { status: 200 }); + // Reflect metadata.set() on a live feed before the next lifecycle event. Publish the + // internal id (the router keys single-run feeds by it, not the friendly id from the URL). + publishChangeRecord({ + runId: pgResult.runId, + envId: env.id, + tags: pgResult.runTags, + batchId: pgResult.batchId, + }); + return json({ metadata: pgResult.metadata }, { status: 200 }); } // PG miss. Target run is either buffered or genuinely absent. diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts b/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts index ef7f3180bf3..e98d3f35823 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts @@ -7,6 +7,7 @@ import { MAX_TAGS_PER_RUN } from "~/models/taskRunTag.server"; import { authenticateApiRequest } from "~/services/apiAuth.server"; import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; import { logger } from "~/services/logger.server"; +import { publishChangeRecord } from "~/services/realtime/runChangeNotifierInstance.server"; import { mutateWithFallback } from "~/v3/mollifier/mutateWithFallback.server"; // Pull the existing tags out of a buffer entry's serialised payload so @@ -90,6 +91,14 @@ export async function action({ request, params }: ActionFunctionArgs) { }, data: { runTags: { push: newTags } }, }); + // Publish a run-changed record with the NEW tag set so tag feeds reindex + // (no-op unless enabled). + publishChangeRecord({ + runId: taskRun.id, + envId: env.id, + tags: existing.concat(newTags), + batchId: taskRun.batchId, + }); return json({ message: `Successfully set ${newTags.length} new tags.` }, { status: 200 }); }, // Buffer-applied patch path. 
The mutateSnapshot Lua deduplicates diff --git a/apps/webapp/app/routes/realtime.v1.batches.$batchId.ts b/apps/webapp/app/routes/realtime.v1.batches.$batchId.ts index 2b8fb106681..add50434d48 100644 --- a/apps/webapp/app/routes/realtime.v1.batches.$batchId.ts +++ b/apps/webapp/app/routes/realtime.v1.batches.$batchId.ts @@ -1,7 +1,7 @@ import { z } from "zod"; import { $replica } from "~/db.server"; import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; -import { realtimeClient } from "~/services/realtimeClientGlobal.server"; +import { resolveRealtimeStreamClient } from "~/services/realtime/resolveRealtimeStreamClient.server"; import { anyResource, createLoaderApiRoute } from "~/services/routeBuilders/apiBuilder.server"; const ParamsSchema = z.object({ @@ -33,7 +33,10 @@ export const loader = createLoaderApiRoute( }, }, async ({ authentication, request, resource: batchRun, apiVersion }) => { - return realtimeClient.streamBatch( + // Pick the Electric proxy or the native backend per org (defaults to Electric); both implement streamBatch. 
+ const client = await resolveRealtimeStreamClient(authentication.environment); + + return client.streamBatch( request.url, authentication.environment, batchRun.id, diff --git a/apps/webapp/app/routes/realtime.v1.runs.$runId.ts b/apps/webapp/app/routes/realtime.v1.runs.$runId.ts index e03787c6200..46118c1d894 100644 --- a/apps/webapp/app/routes/realtime.v1.runs.$runId.ts +++ b/apps/webapp/app/routes/realtime.v1.runs.$runId.ts @@ -2,7 +2,7 @@ import { json } from "@remix-run/server-runtime"; import { z } from "zod"; import { $replica } from "~/db.server"; import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; -import { realtimeClient } from "~/services/realtimeClientGlobal.server"; +import { resolveRealtimeStreamClient } from "~/services/realtime/resolveRealtimeStreamClient.server"; import { anyResource, createLoaderApiRoute, @@ -48,17 +48,17 @@ export const loader = createLoaderApiRoute( }, }, async ({ authentication, request, resource: run, apiVersion }) => { - return realtimeClient.streamRun( + // Pick the Electric proxy or the native backend per org (defaults to Electric); both implement streamRun. + const client = await resolveRealtimeStreamClient(authentication.environment); + + return client.streamRun( request.url, authentication.environment, run.id, apiVersion, authentication.realtime, request.headers.get("x-trigger-electric-version") ?? undefined, - // Propagate abort on client disconnect so the upstream Electric long-poll - // fetch is cancelled too. Without this, undici buffers from the unconsumed - // upstream response body accumulate until Electric's poll timeout, causing - // steady RSS growth on api (see docs/runbooks for the H1 isolation test). + // Propagate abort on client disconnect so the upstream Electric long-poll is cancelled too, else undici buffers grow RSS until the poll timeout. 
getRequestAbortSignal() ); } diff --git a/apps/webapp/app/routes/realtime.v1.runs.ts b/apps/webapp/app/routes/realtime.v1.runs.ts index b04c2d55bbc..2e3617800fe 100644 --- a/apps/webapp/app/routes/realtime.v1.runs.ts +++ b/apps/webapp/app/routes/realtime.v1.runs.ts @@ -1,6 +1,6 @@ import { z } from "zod"; import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; -import { realtimeClient } from "~/services/realtimeClientGlobal.server"; +import { resolveRealtimeStreamClient } from "~/services/realtime/resolveRealtimeStreamClient.server"; import { anyResource, createLoaderApiRoute, @@ -25,12 +25,7 @@ export const loader = createLoaderApiRoute( authorization: { action: "read", resource: (_, __, searchParams) => - // Pre-RBAC, the resource was the searchParams object itself and - // the legacy `checkAuthorization` iterated `Object.keys`, so a - // JWT with type-level `read:tags` (no id) granted access to the - // unfiltered runs stream. Including `{ type: "tags" }` here - // preserves that — per-id `read:tags:<id>` still grants only - // when the filter includes that tag. + // `{ type: "tags" }` preserves pre-RBAC type-level `read:tags` access to the unfiltered stream; per-id `read:tags:<id>` still grants only when the filter includes that tag. anyResource([ { type: "runs" }, { type: "tags" }, @@ -39,7 +34,10 @@ export const loader = createLoaderApiRoute( }, }, async ({ searchParams, authentication, request, apiVersion }) => { - return realtimeClient.streamRuns( + // Pick the Electric proxy or the native backend per org (defaults to Electric); both implement streamRuns. 
+ const client = await resolveRealtimeStreamClient(authentication.environment); + + return client.streamRuns( request.url, authentication.environment, searchParams, diff --git a/apps/webapp/app/services/clickhouse/clickhouseFactory.server.ts b/apps/webapp/app/services/clickhouse/clickhouseFactory.server.ts index fb7f384fd27..794938e9807 100644 --- a/apps/webapp/app/services/clickhouse/clickhouseFactory.server.ts +++ b/apps/webapp/app/services/clickhouse/clickhouseFactory.server.ts @@ -211,6 +211,36 @@ function initializeRunEngineClickhouseClient(): ClickHouse { }); } +/** Realtime runs feed tag/batch id resolution (`REALTIME_BACKEND_NATIVE_CLICKHOUSE_URL`); + * falls back to the default client if unset. */ +const defaultRealtimeClickhouseClient = singleton( + "realtimeClickhouseClient", + initializeRealtimeClickhouseClient +); + +function initializeRealtimeClickhouseClient(): ClickHouse { + if (!env.REALTIME_BACKEND_NATIVE_CLICKHOUSE_URL) { + return defaultClickhouseClient; + } + + const url = new URL(env.REALTIME_BACKEND_NATIVE_CLICKHOUSE_URL); + url.searchParams.delete("secure"); + + return new ClickHouse({ + url: url.toString(), + name: "realtime-runs-clickhouse", + keepAlive: { + enabled: env.REALTIME_BACKEND_NATIVE_CLICKHOUSE_KEEP_ALIVE_ENABLED === "1", + idleSocketTtl: env.REALTIME_BACKEND_NATIVE_CLICKHOUSE_KEEP_ALIVE_IDLE_SOCKET_TTL_MS, + }, + logLevel: env.REALTIME_BACKEND_NATIVE_CLICKHOUSE_LOG_LEVEL, + compression: { + request: env.REALTIME_BACKEND_NATIVE_CLICKHOUSE_COMPRESSION_REQUEST === "1", + }, + maxOpenConnections: env.REALTIME_BACKEND_NATIVE_CLICKHOUSE_MAX_OPEN_CONNECTIONS, + }); +} + /** Task events (`EVENTS_CLICKHOUSE_URL`); not exported — accessed via factory. 
*/ const defaultEventsClickhouseClient = singleton( "eventsClickhouseClient", @@ -257,7 +287,8 @@ export type ClientType = | "logs" | "query" | "admin" - | "engine"; + | "engine" + | "realtime"; function buildOrgClickhouseClient(url: string, clientType: ClientType): ClickHouse { const parsed = new URL(url); @@ -330,6 +361,20 @@ function buildOrgClickhouseClient(url: string, clientType: ClientType): ClickHou }, maxOpenConnections: env.RUN_ENGINE_CLICKHOUSE_MAX_OPEN_CONNECTIONS, }); + case "realtime": + return new ClickHouse({ + url: parsed.toString(), + name, + keepAlive: { + enabled: env.REALTIME_BACKEND_NATIVE_CLICKHOUSE_KEEP_ALIVE_ENABLED === "1", + idleSocketTtl: env.REALTIME_BACKEND_NATIVE_CLICKHOUSE_KEEP_ALIVE_IDLE_SOCKET_TTL_MS, + }, + logLevel: env.REALTIME_BACKEND_NATIVE_CLICKHOUSE_LOG_LEVEL, + compression: { + request: env.REALTIME_BACKEND_NATIVE_CLICKHOUSE_COMPRESSION_REQUEST === "1", + }, + maxOpenConnections: env.REALTIME_BACKEND_NATIVE_CLICKHOUSE_MAX_OPEN_CONNECTIONS, + }); case "standard": case "query": case "admin": @@ -398,6 +443,8 @@ export class ClickhouseFactory { return defaultAdminClickhouseClient; case "engine": return defaultRunEngineClickhouseClient; + case "realtime": + return defaultRealtimeClickhouseClient; } } diff --git a/apps/webapp/app/services/metadata/updateMetadata.server.ts b/apps/webapp/app/services/metadata/updateMetadata.server.ts index cfb946a1024..3948da046f9 100644 --- a/apps/webapp/app/services/metadata/updateMetadata.server.ts +++ b/apps/webapp/app/services/metadata/updateMetadata.server.ts @@ -308,6 +308,8 @@ export class UpdateMetadataService { }, select: { id: true, + batchId: true, + runTags: true, completedAt: true, status: true, metadata: true, @@ -355,6 +357,10 @@ export class UpdateMetadataService { return { metadata: newMetadata, + // Internal id + membership keys, so callers can publish full realtime records the router routes by index. 
+ runId: taskRun.id, + batchId: taskRun.batchId, + runTags: taskRun.runTags, }; } diff --git a/apps/webapp/app/services/realtime/boundedTtlCache.ts b/apps/webapp/app/services/realtime/boundedTtlCache.ts new file mode 100644 index 00000000000..ac422880ded --- /dev/null +++ b/apps/webapp/app/services/realtime/boundedTtlCache.ts @@ -0,0 +1,50 @@ +/** + * Tiny in-process bounded TTL cache shared by the realtime feeds: entries expire after `ttlMs` (evicted on read), + * and at-capacity writes sweep expired entries then drop the oldest. A stored `undefined` is indistinguishable from a miss (use `null` for absence). + */ +export class BoundedTtlCache { + readonly #entries = new Map(); + + constructor( + private readonly ttlMs: number, + private readonly maxEntries: number + ) {} + + get(key: string): V | undefined { + const entry = this.#entries.get(key); + if (!entry) { + return undefined; + } + if (entry.expiresAt > Date.now()) { + return entry.value; + } + // Evict on read so expired entries don't linger until the next at-capacity + // sweep — important for read-heavy / low-churn caches (per-handle working sets). + this.#entries.delete(key); + return undefined; + } + + set(key: string, value: V): void { + // Only run capacity eviction when inserting a NEW key — updating an existing key + // doesn't grow the map, so it must never drop an unrelated live entry. 
+ if (!this.#entries.has(key) && this.#entries.size >= this.maxEntries) { + const now = Date.now(); + for (const [key, entry] of this.#entries) { + if (entry.expiresAt <= now) { + this.#entries.delete(key); + } + } + if (this.#entries.size >= this.maxEntries) { + const oldest = this.#entries.keys().next().value; + if (oldest !== undefined) { + this.#entries.delete(oldest); + } + } + } + this.#entries.set(key, { value, expiresAt: Date.now() + this.ttlMs }); + } + + get size(): number { + return this.#entries.size; + } +} diff --git a/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts b/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts new file mode 100644 index 00000000000..317b1c15454 --- /dev/null +++ b/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts @@ -0,0 +1,39 @@ +import { type ClickHouse } from "@internal/clickhouse"; +import { type PrismaClientOrTransaction } from "~/db.server"; +import { RunsRepository } from "~/services/runsRepository/runsRepository.server"; +import { type RunListFilter, type RunListResolver } from "./runReader.server"; + +export type ClickHouseRunListResolverOptions = { + /** Resolves the per-organization ClickHouse client (multi-tenant routing). */ + getClickhouse: (organizationId: string) => Promise; + prisma: PrismaClientOrTransaction; +}; + +/** + * Resolves the realtime tag/list filter into matching run ids via ClickHouse `listRunIds` (filter-only; + * rows hydrated from Postgres by id afterward). Tag matching is contains-ALL, byte-matching Electric's + * `runTags @> ARRAY[...]` shape. 
+ */ +export class ClickHouseRunListResolver implements RunListResolver { + constructor(private readonly options: ClickHouseRunListResolverOptions) {} + + async resolveMatchingRunIds(filter: RunListFilter): Promise { + const clickhouse = await this.options.getClickhouse(filter.organizationId); + const repository = new RunsRepository({ clickhouse, prisma: this.options.prisma }); + + const { runIds } = await repository.listRunIds({ + organizationId: filter.organizationId, + projectId: filter.projectId, + environmentId: filter.environmentId, + tags: filter.tags && filter.tags.length > 0 ? filter.tags : undefined, + // Contains-ALL, matching the Electric shape's `runTags @> ARRAY[...]` semantics. + tagsMatch: "all", + batchId: filter.batchId, + from: filter.createdAtAfter?.getTime(), + page: { size: filter.limit }, + }); + + // listRunIds is keyset-paginated; runIds is already capped to page.size (= limit). + return runIds; + } +} diff --git a/apps/webapp/app/services/realtime/electricStreamProtocol.server.ts b/apps/webapp/app/services/realtime/electricStreamProtocol.server.ts new file mode 100644 index 00000000000..efe711a7273 --- /dev/null +++ b/apps/webapp/app/services/realtime/electricStreamProtocol.server.ts @@ -0,0 +1,279 @@ +/** + * Pure (no DB/Redis/env) Electric HTTP shape-stream wire serializer, byte-faithful to what the + * deployed `@electric-sql/client` (1.0.14 + 0.4.0) and the SDK's `SubscribeRunRawShape` expect. + * Each column value is wire-encoded as a string (or null) decoded via the `electric-schema` header; + * `up-to-date` is the only control message that makes the client emit, and re-sending a full row is idempotent. + */ + +export type ElectricColumnType = + | "text" + | "timestamp" + | "int4" + | "int8" + | "float8" + | "bool" + | "jsonb"; + +type ElectricColumn = { + name: string; + type: ElectricColumnType; + /** Array dimensionality. 1 => `type[]` (Postgres `{a,b}` literal). 
*/ + dims?: number; + /** Array columns only: true when the column has no SQL default, so an empty value emits `null` (not `{}`). Prisma erases this distinction, so we re-derive it here. */ + emptyArrayAsNull?: boolean; +}; + +/** Columns the realtime run feed exposes; keep in sync with `DEFAULT_ELECTRIC_COLUMNS`. `type`/`dims` drive the schema header and value encoding. */ +export const RUN_ELECTRIC_COLUMNS: ReadonlyArray = [ + { name: "id", type: "text" }, + { name: "taskIdentifier", type: "text" }, + { name: "createdAt", type: "timestamp" }, + { name: "updatedAt", type: "timestamp" }, + { name: "startedAt", type: "timestamp" }, + { name: "delayUntil", type: "timestamp" }, + { name: "queuedAt", type: "timestamp" }, + { name: "expiredAt", type: "timestamp" }, + { name: "completedAt", type: "timestamp" }, + { name: "friendlyId", type: "text" }, + { name: "number", type: "int4" }, + { name: "isTest", type: "bool" }, + { name: "status", type: "text" }, + { name: "usageDurationMs", type: "int4" }, + { name: "costInCents", type: "float8" }, + { name: "baseCostInCents", type: "float8" }, + { name: "ttl", type: "text" }, + { name: "payload", type: "text" }, + { name: "payloadType", type: "text" }, + { name: "metadata", type: "text" }, + { name: "metadataType", type: "text" }, + { name: "output", type: "text" }, + { name: "outputType", type: "text" }, + { name: "runTags", type: "text", dims: 1, emptyArrayAsNull: true }, + { name: "error", type: "jsonb" }, + { name: "realtimeStreams", type: "text", dims: 1 }, +]; + +/** Columns that can never be skipped via `skipColumns` (mirrors realtimeClient). */ +export const RESERVED_COLUMNS = ["id", "taskIdentifier", "friendlyId", "status", "createdAt"]; + +/** A single run hydrated for the realtime feed; structurally compatible with the `RunHydrator` Prisma `TaskRun` projection. 
*/ +export type RealtimeRunRow = { + id: string; + taskIdentifier: string; + createdAt: Date; + updatedAt: Date; + startedAt: Date | null; + delayUntil: Date | null; + queuedAt: Date | null; + expiredAt: Date | null; + completedAt: Date | null; + friendlyId: string; + number: number; + isTest: boolean; + status: string; + usageDurationMs: number; + costInCents: number; + baseCostInCents: number; + ttl: string | null; + payload: string; + payloadType: string; + metadata: string | null; + metadataType: string; + output: string | null; + outputType: string; + runTags: string[]; + error: unknown; + realtimeStreams: string[]; +}; + +type Operation = "insert" | "update" | "delete"; + +type ChangeMessage = { + key: string; + value: Record; + headers: { operation: Operation }; +}; + +type ControlMessage = { + headers: { control: "up-to-date" | "must-refetch" }; +}; + +type ShapeMessage = ChangeMessage | ControlMessage; + +const UP_TO_DATE: ControlMessage = { headers: { control: "up-to-date" } }; + +function effectiveSkipColumns(skipColumns: string[]): Set { + return new Set(skipColumns.filter((c) => c !== "" && !RESERVED_COLUMNS.includes(c))); +} + +function quoteArrayElement(value: string): string { + return `"${value.replace(/\\/g, "\\\\").replace(/"/g, '\\"')}"`; +} + +function pgArrayLiteral(values: unknown[]): string { + if (values.length === 0) { + return "{}"; + } + return `{${values.map((v) => quoteArrayElement(String(v))).join(",")}}`; +} + +function serializeValue(value: unknown, column: ElectricColumn): string | null { + if (value === null || value === undefined) { + return null; + } + + if (column.dims && column.dims > 0) { + if (!Array.isArray(value)) { + return null; + } + // A no-default array column stores NULL when empty, so Electric emits `null` + // (not `{}`); match that here since Prisma handed us `[]` for the NULL value. 
+ if (value.length === 0 && column.emptyArrayAsNull) { + return null; + } + return pgArrayLiteral(value); + } + + switch (column.type) { + case "bool": + // Postgres text representation; the client's parseBool accepts "t"/"f". + return value ? "t" : "f"; + case "timestamp": + // The SDK's RawShapeDate appends "Z" before parsing, so we emit the ISO + // string WITHOUT the trailing "Z". + return value instanceof Date ? value.toISOString().slice(0, -1) : String(value); + case "jsonb": + return JSON.stringify(value); + case "int4": + case "int8": + case "float8": + case "text": + default: + return String(value); + } +} + +/** The merge key the client uses to reassemble a row across insert/update cycles. */ +export function runShapeKey(runId: string): string { + return `"public"."TaskRun"/"${runId}"`; +} + +/** Encode a single run row into the wire `value` object (column -> string|null). */ +export function serializeRunRow( + row: RealtimeRunRow, + skipColumns: string[] = [] +): Record { + const skip = effectiveSkipColumns(skipColumns); + const value: Record = {}; + + for (const column of RUN_ELECTRIC_COLUMNS) { + if (skip.has(column.name)) { + continue; + } + value[column.name] = serializeValue((row as Record)[column.name], column); + } + + return value; +} + +/** The `electric-schema` response header value for the (optionally trimmed) column set. */ +export function buildElectricSchemaHeader(skipColumns: string[] = []): string { + const skip = effectiveSkipColumns(skipColumns); + const schema: Record = {}; + + for (const column of RUN_ELECTRIC_COLUMNS) { + if (skip.has(column.name)) { + continue; + } + schema[column.name] = column.dims ? { type: column.type, dims: column.dims } : { type: column.type }; + } + + return JSON.stringify(schema); +} + +/** Initial snapshot body: an `insert` for the row (if present) then `up-to-date`; an absent row emits a bare `up-to-date` (empty shape). 
*/ +export function buildSnapshotBody(row: RealtimeRunRow | null, skipColumns: string[] = []): string { + const messages: ShapeMessage[] = []; + if (row) { + messages.push({ + key: runShapeKey(row.id), + value: serializeRunRow(row, skipColumns), + headers: { operation: "insert" }, + }); + } + messages.push(UP_TO_DATE); + return JSON.stringify(messages); +} + +/** Live body when the row advanced: a full-row `update` followed by `up-to-date`. */ +export function buildUpdateBody(row: RealtimeRunRow, skipColumns: string[] = []): string { + const messages: ShapeMessage[] = [ + { + key: runShapeKey(row.id), + value: serializeRunRow(row, skipColumns), + headers: { operation: "update" }, + }, + UP_TO_DATE, + ]; + return JSON.stringify(messages); +} + +/** Live body when nothing advanced: a bare `up-to-date` (no row emission). */ +export function buildUpToDateBody(): string { + return JSON.stringify([UP_TO_DATE]); +} + +export type RowChange = { row: RealtimeRunRow; operation: "insert" | "update" }; + +/** Multi-row body for the tag-list feed: one change message per row then `up-to-date` (empty `changes` emits a bare `up-to-date`). */ +export function buildRowsBody(changes: RowChange[], skipColumns: string[] = []): string { + const messages: ShapeMessage[] = changes.map((change) => ({ + key: runShapeKey(change.row.id), + value: serializeRunRow(change.row, skipColumns), + headers: { operation: change.operation }, + })); + messages.push(UP_TO_DATE); + return JSON.stringify(messages); +} + +/** A row change whose wire `value` was already serialized (once, shared across feeds by + * the EnvChangeRouter); the per-feed `operation` is applied here. */ +export type SerializedRowChange = { + runId: string; + value: Record; + operation: "insert" | "update"; +}; + +/** Like `buildRowsBody`, but from values serialized once per (runId, columnSet) upstream, + * so a run matching many feeds is serialized once and reused across their bodies. 
/**
 * Recover the `updatedAt` epoch-ms a client last saw from the offset it echoed back.
 *
 * Only the segment before the first `_` is read. Anything that is missing, non-numeric,
 * non-finite, zero or negative (including the initial `-1` offset) yields `0`.
 *
 * @param offset The echoed offset token, or `null`/`undefined` when none was sent.
 * @returns The positive finite number encoded in the first segment, otherwise `0`.
 */
export function parseOffsetUpdatedAtMs(offset: string | null | undefined): number {
  // An absent offset collapses to "", which Number() turns into 0 and is rejected below.
  const leadingSegment = offset ? offset.split("_")[0] : "";
  const updatedAtMs = Number(leadingSegment);

  if (!Number.isFinite(updatedAtMs) || updatedAtMs <= 0) {
    return 0;
  }
  return updatedAtMs;
}
/**
 * A feed's membership predicate over the env stream.
 *
 * - `run`: matches change records whose `runId` equals `runId`.
 * - `tag`: a record is a candidate when it shares at least one tag with `tags`, when it
 *   carries no tag data at all, or when `tags` is empty (an unfiltered feed matches every
 *   record). Candidates are then confirmed against the hydrated row, which must carry
 *   every tag in `tags` and, when `createdAtFloorMs` is set, must not have been created
 *   before it.
 * - `batch`: matches change records that carry a `batchId` equal to `batchId`.
 */
export type FeedFilter =
  | { kind: "run"; runId: string }
  | { kind: "tag"; tags: string[]; createdAtFloorMs?: number }
  | { kind: "batch"; batchId: string };
/** Router-internal state for one registered feed (one per `register()` call). */
type Feed = {
  /** Membership predicate deciding which change records are routed to this feed. */
  filter: FeedFilter;
  /** Columns left out when this feed's rows are hydrated and serialized. */
  skipColumns: string[];
  /** `skipColumns` sorted and joined with "," ("" when empty); feeds with the same
   * signature share one hydrate + serialize pass per batch. */
  columnSig: string;
  /** The currently-waiting poll's resolver (null between polls). */
  resolve: ((result: WaitResult) => void) | null;
  /** Buffered records at or before this timestamp have been replayed (or predate this feed). */
  replayCursorMs: number;
};
  /**
   * Register a feed on `environmentId` and return the handle the caller holds for one
   * long-poll.
   *
   * Ensures the env's routing state exists (see `#ensureEnv`), adds the feed to the env's
   * feed set and indexes it by its filter. The feed stays registered until `close()` is called.
   *
   * @param environmentId Environment whose change stream the feed listens to.
   * @param filter Membership predicate for the feed.
   * @param skipColumns Columns to omit when hydrating/serializing this feed's rows.
   * @param opts Optional replay bound; see `replaySinceMs`.
   * @returns `waitForMatch`, `close` and the `gapCovered` flag computed at registration time.
   */
  register(
    environmentId: string,
    filter: FeedFilter,
    skipColumns: string[],
    opts?: {
      /** When the caller last received data for this connection. Bounds the replay to the
       * true inter-poll gap; older than the window can't be proven covered. */
      replaySinceMs?: number;
    }
  ): FeedRegistration {
    const env = this.#ensureEnv(environmentId);
    const replayWindowMs = this.options.replayWindowMs ?? DEFAULT_REPLAY_WINDOW_MS;
    const now = Date.now();
    // Oldest point in time the replay buffer can still cover.
    const windowFloorMs = now - replayWindowMs;
    // Without a caller-supplied timestamp, assume the gap is the whole window.
    const sinceMs = opts?.replaySinceMs ?? windowFloorMs;
    const feed: Feed = {
      filter,
      skipColumns,
      // Sort a copy so the signature is independent of the caller's column order.
      columnSig: skipColumns.length > 0 ? [...skipColumns].sort().join(",") : "",
      resolve: null,
      // First arm replays the caller's inter-poll gap; later arms only what arrived since.
      // The buffer only spans the window, so never rewind past it.
      replayCursorMs: Math.max(sinceMs, windowFloorMs),
    };

    env.feeds.add(feed);
    this.#indexFeed(env, feed);

    // Resolves exactly once with the first of: a routed/replayed match, the timeout, or an
    // abort. It never rejects.
    const waitForMatch = (signal: AbortSignal | undefined, timeoutMs: number) =>
      new Promise((resolve) => {
        // Already-aborted signals never arm the feed.
        if (signal?.aborted) {
          resolve({ reason: "abort", rows: [] });
          return;
        }
        let settled = false;
        let timer: ReturnType | undefined;
        let onAbort: (() => void) | undefined;
        // Single exit point: disarms the feed and releases the timer and abort listener.
        const settle = (result: WaitResult) => {
          if (settled) return;
          settled = true;
          feed.resolve = null;
          if (timer) clearTimeout(timer);
          if (signal && onAbort) signal.removeEventListener("abort", onAbort);
          resolve(result);
        };
        // Arming: from here on, routed batches and replays can resolve this wait.
        feed.resolve = settle;
        timer = setTimeout(() => settle({ reason: "timeout", rows: [] }), timeoutMs);
        // Where supported, don't let a pending long-poll timer keep the process alive.
        timer.unref?.();
        if (signal) {
          onAbort = () => settle({ reason: "abort", rows: [] });
          signal.addEventListener("abort", onAbort, { once: true });
        }
        // Deliver any buffered records this feed hasn't seen (catches changes that
        // landed while the caller was between polls).
        if (replayWindowMs > 0 && env.recent.size > 0) {
          // Fire-and-forget: a failed replay is logged and the wait keeps running.
          this.#replayRecent(environmentId, env, feed).catch((error) => {
            logger.error("[envChangeRouter] failed to replay buffered records", {
              environmentId,
              error,
            });
          });
        }
      });

    // Idempotent: a second call finds the feed already removed and returns early.
    const close = () => {
      if (!env.feeds.has(feed)) {
        return;
      }
      env.feeds.delete(feed);
      this.#deindexFeed(env, feed);
      // Resolve any in-flight wait so the poll doesn't hang.
      feed.resolve?.({ reason: "abort", rows: [] });
      feed.resolve = null;
      if (env.feeds.size === 0) {
        this.#scheduleEnvTeardown(environmentId, env);
      }
    };

    return {
      waitForMatch,
      close,
      // Covered when this instance was already subscribed (and buffering) at the gap's
      // start, and the gap fits inside the buffer's window.
      gapCovered:
        replayWindowMs <= 0 || (env.subscribedAtMs <= sinceMs && sinceMs >= windowFloorMs),
    };
  }
*/ + get activeEnvCount(): number { + return this.#envs.size; + } + + /** Currently-held feeds by kind (for metrics) — the system's capacity unit. */ + get heldFeedCounts(): { run: number; tag: number; batch: number } { + const counts = { run: 0, tag: 0, batch: 0 }; + for (const env of this.#envs.values()) { + for (const feed of env.feeds) { + counts[feed.filter.kind]++; + } + } + return counts; + } + + #ensureEnv(environmentId: string): EnvState { + const existing = this.#envs.get(environmentId); + if (existing) { + // A pending teardown is cancelled by new interest; the buffer survives the gap. + if (existing.lingerTimer) { + clearTimeout(existing.lingerTimer); + existing.lingerTimer = undefined; + } + return existing; + } + const env: EnvState = { + unsubscribe: () => {}, + feeds: new Set(), + byRunId: new Map(), + byTag: new Map(), + byBatchId: new Map(), + tagFeeds: new Set(), + unfilteredTagFeeds: new Set(), + subscribedAtMs: Date.now(), + recent: new Map(), + }; + this.#envs.set(environmentId, env); + env.unsubscribe = this.options.source.subscribeToEnv(environmentId, (records) => { + this.#bufferRecent(env, records); + // Fire-and-forget; catch hydrate failures here (unhandled rejection exits the process) — waiters time out into the backstop. + this.#onBatch(environmentId, env, records).catch((error) => { + logger.error("[envChangeRouter] failed to route a change batch", { + environmentId, + error, + }); + }); + }); + return env; + } + + /** Keep the env subscribed + buffering for a linger after its last feed closes, so a + * client's next poll (or another instance hop landing back here) can replay the gap. */ + #scheduleEnvTeardown(environmentId: string, env: EnvState) { + const lingerMs = this.options.unsubscribeLingerMs ?? 
DEFAULT_UNSUBSCRIBE_LINGER_MS; + if (lingerMs <= 0) { + this.#envs.delete(environmentId); + env.unsubscribe(); + return; + } + if (env.lingerTimer) { + clearTimeout(env.lingerTimer); + } + env.lingerTimer = setTimeout(() => { + if (env.feeds.size === 0) { + this.#envs.delete(environmentId); + env.unsubscribe(); + } + }, lingerMs); + env.lingerTimer.unref?.(); + } + + /** Upsert the latest record per run (insertion-ordered) and prune to the window + cap. */ + #bufferRecent(env: EnvState, records: ChangeRecord[]) { + const windowMs = this.options.replayWindowMs ?? DEFAULT_REPLAY_WINDOW_MS; + if (windowMs <= 0) { + return; + } + const maxRuns = this.options.replayMaxRunsPerEnv ?? DEFAULT_REPLAY_MAX_RUNS_PER_ENV; + const now = Date.now(); + for (const record of records) { + env.recent.delete(record.runId); + env.recent.set(record.runId, { record, receivedAtMs: now }); + } + const cutoff = now - windowMs; + for (const [runId, entry] of env.recent) { + if (entry.receivedAtMs >= cutoff && env.recent.size <= maxRuns) { + break; + } + this.options.onReplayEviction?.(entry.receivedAtMs < cutoff ? "window" : "cap"); + env.recent.delete(runId); + } + } + + /** Whether a buffered record matches a feed's predicate (mirrors #onBatch's routing). */ + #recordMatchesFeed(record: ChangeRecord, feed: Feed): boolean { + switch (feed.filter.kind) { + case "run": + return record.runId === feed.filter.runId; + case "batch": + return record.batchId != null && record.batchId === feed.filter.batchId; + case "tag": { + const tags = feed.filter.tags; + // Unfiltered feed matches everything; partial record (no tags) = hydrate-to-classify. + if (tags.length === 0 || record.tags === undefined) { + return true; + } + return record.tags.some((tag) => tags.includes(tag)); + } + } + } + + /** Deliver buffered records newer than the feed's cursor through the normal + * hydrate -> serialize -> settle pipeline. Already-seen rows diff to nothing downstream. 
*/ + async #replayRecent(environmentId: string, env: EnvState, feed: Feed) { + const cursor = feed.replayCursorMs; + feed.replayCursorMs = Date.now(); + + const runIds: string[] = []; + for (const [runId, entry] of env.recent) { + if (entry.receivedAtMs > cursor && this.#recordMatchesFeed(entry.record, feed)) { + runIds.push(runId); + } + } + if (runIds.length === 0 || !feed.resolve) { + return; + } + + const hydrated = await this.options.hydrator.hydrateByIds( + environmentId, + runIds, + feed.skipColumns + ); + this.options.onHydrate?.(hydrated.length); + + const rows: MatchedRow[] = []; + for (const row of hydrated) { + if (feed.filter.kind === "tag" && !this.#tagRowMatches(row, feed.filter)) { + continue; + } + rows.push({ row, value: serializeRunRow(row, feed.skipColumns) }); + } + + if (rows.length > 0 && feed.resolve) { + this.options.onReplay?.("delivered"); + feed.resolve({ reason: "notify", rows }); + } else { + this.options.onReplay?.("empty"); + } + } + + #indexFeed(env: EnvState, feed: Feed) { + switch (feed.filter.kind) { + case "run": + addToIndex(env.byRunId, feed.filter.runId, feed); + break; + case "batch": + addToIndex(env.byBatchId, feed.filter.batchId, feed); + break; + case "tag": + env.tagFeeds.add(feed); + if (feed.filter.tags.length === 0) { + env.unfilteredTagFeeds.add(feed); + } + for (const tag of feed.filter.tags) { + addToIndex(env.byTag, tag, feed); + } + break; + } + } + + #deindexFeed(env: EnvState, feed: Feed) { + switch (feed.filter.kind) { + case "run": + removeFromIndex(env.byRunId, feed.filter.runId, feed); + break; + case "batch": + removeFromIndex(env.byBatchId, feed.filter.batchId, feed); + break; + case "tag": + env.tagFeeds.delete(feed); + env.unfilteredTagFeeds.delete(feed); + for (const tag of feed.filter.tags) { + removeFromIndex(env.byTag, tag, feed); + } + break; + } + } + + async #onBatch(environmentId: string, env: EnvState, records: ChangeRecord[]) { + // 1. 
Route each record to the held feeds it matches; collect matched runIds per feed. + const matchedRunIdsByFeed = new Map>(); + const addMatch = (feed: Feed, runId: string) => { + if (!feed.resolve) { + // Feed isn't currently waiting (between polls). Drop — its backstop catches gaps. + return; + } + let set = matchedRunIdsByFeed.get(feed); + if (!set) { + set = new Set(); + matchedRunIdsByFeed.set(feed, set); + } + set.add(runId); + }; + + for (const record of records) { + // run feeds: exact runId match. + const runFeeds = env.byRunId.get(record.runId); + if (runFeeds) { + for (const feed of runFeeds) addMatch(feed, record.runId); + } + + // batch feeds: exact batchId match (only when the record carries one). + if (record.batchId) { + const batchFeeds = env.byBatchId.get(record.batchId); + if (batchFeeds) { + for (const feed of batchFeeds) addMatch(feed, record.runId); + } + } + + // tag feeds. + if (record.tags !== undefined) { + // Full record: prune via the tag index; only feeds whose filter intersects match. + const seen = new Set(); + for (const tag of record.tags) { + const tagFeeds = env.byTag.get(tag); + if (!tagFeeds) continue; + for (const feed of tagFeeds) { + if (seen.has(feed)) continue; + seen.add(feed); + addMatch(feed, record.runId); + } + } + // Unfiltered tag feeds match every record but live outside the index. + for (const feed of env.unfilteredTagFeeds) addMatch(feed, record.runId); + } else { + // Partial record (no membership data): route to every tag feed as a candidate to + // hydrate-and-classify (rare; the publish side emits full records in practice). + for (const feed of env.tagFeeds) addMatch(feed, record.runId); + } + } + + if (matchedRunIdsByFeed.size === 0) { + return; + } + + // 2. Batch-hydrate ONCE per column set, then 3. serialize ONCE per (runId, column set). 
+ const runIdsByColumnSig = new Map }>(); + for (const [feed, runIds] of matchedRunIdsByFeed) { + let group = runIdsByColumnSig.get(feed.columnSig); + if (!group) { + group = { skipColumns: feed.skipColumns, runIds: new Set() }; + runIdsByColumnSig.set(feed.columnSig, group); + } + for (const id of runIds) group.runIds.add(id); + } + + const hydratedByColumnSig = new Map>(); + await Promise.all( + [...runIdsByColumnSig.entries()].map(async ([columnSig, group]) => { + const ids = [...group.runIds]; + const rows = await this.options.hydrator.hydrateByIds( + environmentId, + ids, + group.skipColumns + ); + this.options.onHydrate?.(rows.length); + const map = new Map(); + for (const row of rows) { + map.set(row.id, { row, value: serializeRunRow(row, group.skipColumns) }); + } + hydratedByColumnSig.set(columnSig, map); + }) + ); + + // 4. Assemble each feed's matched rows (post-filtering tag feeds against the + // authoritative hydrated row) and resolve its pending wait. + for (const [feed, runIds] of matchedRunIdsByFeed) { + if (!feed.resolve) { + continue; // stopped waiting while we hydrated; its next poll/backstop covers it + } + const hydrated = hydratedByColumnSig.get(feed.columnSig); + if (!hydrated) continue; + + const rows: MatchedRow[] = []; + for (const runId of runIds) { + const matched = hydrated.get(runId); + if (!matched) continue; // run not found / left the table + if (feed.filter.kind === "tag" && !this.#tagRowMatches(matched.row, feed.filter)) { + continue; // re-confirm tags + createdAt floor against the authoritative row + } + rows.push(matched); + } + + if (rows.length > 0) { + feed.resolve({ reason: "notify", rows }); + } + // No surviving rows (e.g. a partial-record candidate that didn't actually match): + // leave the feed waiting; nothing relevant changed for it. 
+ } + } + + /** Authoritative re-check for tag feeds: the hydrated row carries ALL the filter's tags + * (Electric's `runTags @> ARRAY[...]` semantics) and its createdAt is within the window. */ + #tagRowMatches(row: RealtimeRunRow, filter: Extract): boolean { + if (filter.createdAtFloorMs !== undefined && row.createdAt.getTime() < filter.createdAtFloorMs) { + return false; + } + const rowTags = row.runTags ?? []; + return filter.tags.every((tag) => rowTags.includes(tag)); + } +} diff --git a/apps/webapp/app/services/realtime/nativeRealtimeClient.server.ts b/apps/webapp/app/services/realtime/nativeRealtimeClient.server.ts new file mode 100644 index 00000000000..00e50ed9fcc --- /dev/null +++ b/apps/webapp/app/services/realtime/nativeRealtimeClient.server.ts @@ -0,0 +1,1046 @@ +import { json } from "@remix-run/server-runtime"; +import { safeParseNaturalLanguageDurationAgo } from "@trigger.dev/core/v3/isomorphic"; +import { randomUUID } from "node:crypto"; +import { API_VERSIONS, CURRENT_API_VERSION } from "~/api/versions"; +import { + type CachedLimitProvider, + type RealtimeEnvironment, + type RealtimeRequestOptions, + type RealtimeRunsParams, +} from "../realtimeClient.server"; +import { logger } from "../logger.server"; +import { + buildElectricSchemaHeader, + buildRowsBody, + buildRowsBodyFromSerialized, + buildSnapshotBody, + buildUpdateBody, + buildUpToDateBody, + encodeOffset, + INITIAL_OFFSET, + parseOffsetUpdatedAtMs, + type RealtimeRunRow, + rewriteBodyForLegacyApiVersion, + RESERVED_COLUMNS, + type RowChange, + type SerializedRowChange, +} from "./electricStreamProtocol.server"; +import { BoundedTtlCache } from "./boundedTtlCache"; +import { + type EnvChangeRouter, + type FeedFilter, + type MatchedRow, +} from "./envChangeRouter.server"; +import { type RunHydrator, type RunListResolver } from "./runReader.server"; +import { type RealtimeConcurrencyLimiter } from "./realtimeConcurrencyLimiter.server"; +import { InMemoryReplayCursorStore, type 
ReplayCursorStore } from "./replayCursorStore.server"; + +/** Widened with projectId so the tag-list feed can resolve ids via ClickHouse (needs org + project + env). */ +export type RealtimeListEnvironment = RealtimeEnvironment & { projectId: string }; + +/** The realtime feeds the run routes depend on (single-run, tag-list, batch); both backends satisfy it. */ +export interface RealtimeStreamClient { + streamRun( + url: URL | string, + environment: RealtimeEnvironment, + runId: string, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise; + streamRuns( + url: URL | string, + environment: RealtimeListEnvironment, + params: RealtimeRunsParams, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise; + streamBatch( + url: URL | string, + environment: RealtimeListEnvironment, + batchId: string, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise; +} + +export type WakeupReason = "notify" | "timeout" | "abort"; + +/** How a live poll resolved: `fast-hydrate` (router woke us, hydrate-by-id), `full-resolve` + * (backstop), or `cold-resolve` (fresh env subscription probed once instead of holding blind). */ +export type LivePollPath = "fast-hydrate" | "full-resolve" | "cold-resolve"; + +export type NativeRealtimeClientOptions = { + runReader: RunHydrator; + /** Resolves the tag/list filter into the matching id-set (filter-only). */ + runListResolver: RunListResolver; + /** Per-instance routing layer over the single env change channel. */ + router: EnvChangeRouter; + limiter: RealtimeConcurrencyLimiter; + cachedLimitProvider: CachedLimitProvider; + /** Fallback per-env concurrent-connection limit when the org has none cached. */ + defaultConcurrencyLimit?: number; + /** Backstop wait before refetching on a live request (ms). 
Defaults to 20000. */ + livePollTimeoutMs?: number; + /** Jitter ratio applied to the live-poll timeout (0.15 = ±15%). */ + livePollJitterRatio?: number; + /** Ceiling for the tag-list createdAt lookback window (ms). */ + maximumCreatedAtFilterAgeMs: number; + /** Hard cap on tag-list snapshot size. Defaults to 1000. */ + maxListResults?: number; + /** TTL (ms) for the multi-run resolve+hydrate coalescing cache (initial + backstop). */ + runSetResolveCacheTtlMs?: number; + /** Max entries in the resolve+hydrate cache. Defaults to 5000. */ + runSetResolveCacheMaxEntries?: number; + /** Max entries in the per-handle working-set cache. Defaults to 10000. */ + listCacheMaxEntries?: number; + /** TTL (ms) for working-set cache entries. Defaults to 300000. */ + workingSetCacheTtlMs?: number; + /** Epoch-aligned bucket (ms) the tag-list createdAt floor is floored to, so same-tag feeds share a cache entry. Defaults to 60000; 0 disables. */ + runSetCreatedAtBucketMs?: number; + /** When true (default), a multi-run live poll holds until a real delta or the backstop rather than returning an empty up-to-date. */ + holdOnEmpty?: boolean; + /** Max concurrent fresh ClickHouse resolves (cache misses) per instance, bounding a distinct-filter stampede. Defaults to 16; 0 disables. */ + resolveAdmissionLimit?: number; + /** Per-connection replay-cursor store. Inject a fleet-shared (Redis) store so an instance + * hop reads the connection's true inter-poll gap instead of cold-probing; defaults to a + * per-instance in-memory cache. */ + replayCursorStore?: ReplayCursorStore; + /** Observability hook: why a live request woke (notify vs timeout vs abort). */ + onWakeup?: (reason: WakeupReason) => void; + /** Observability hook: how a live poll resolved (fast path vs full resolve). */ + onLivePollPath?: (path: LivePollPath) => void; + /** Observability hook: whether a multi-run resolve hit the cache, coalesced onto an in-flight resolve, or missed. 
/**
 * FIFO semaphore that bounds how many callers hold a permit at once.
 *
 * `acquire()` takes a free permit immediately or queues behind earlier waiters;
 * `release()` passes the permit directly to the oldest waiter, or returns it to the pool
 * when nobody is queued.
 */
class ResolveAdmissionGate {
  /** Permits not currently held or promised to a waiter. */
  #available: number;
  /** Permits currently held by callers. */
  #inUse = 0;
  /** Wake-up callbacks for queued callers, oldest first. */
  readonly #waiters: Array<() => void> = [];

  /** @param limit Number of permits the gate starts with. */
  constructor(limit: number) {
    this.#available = limit;
  }

  /** Permits currently held (for a metrics gauge); never exceeds the limit. */
  get inUse(): number {
    return this.#inUse;
  }

  /** Resolve once this caller holds a permit. */
  async acquire(): Promise<void> {
    if (this.#available > 0) {
      this.#available -= 1;
    } else {
      // No free permit: park until release() wakes us with the one it is giving up.
      await new Promise<void>((wake) => {
        this.#waiters.push(wake);
      });
    }
    this.#inUse += 1;
  }

  /** Give back a permit previously obtained from `acquire()`. */
  release(): void {
    this.#inUse -= 1;
    const wakeNext = this.#waiters.shift();
    if (wakeNext === undefined) {
      this.#available += 1;
      return;
    }
    // Hand the freed permit straight to the next waiter (FIFO, no count churn).
    wakeNext();
  }
}
+ * Fleet-shared when a store is injected (hops stop looking like unknown gaps). */ + readonly #replayCursors: ReplayCursorStore; + + constructor(private readonly options: NativeRealtimeClientOptions) { + this.#workingSetCache = new BoundedTtlCache( + options.workingSetCacheTtlMs ?? LIST_CACHE_TTL_MS, + options.listCacheMaxEntries ?? LIST_CACHE_MAX_ENTRIES + ); + this.#replayCursors = + options.replayCursorStore ?? + new InMemoryReplayCursorStore( + options.workingSetCacheTtlMs ?? LIST_CACHE_TTL_MS, + options.listCacheMaxEntries ?? LIST_CACHE_MAX_ENTRIES + ); + this.#runSetCache = new BoundedTtlCache( + options.runSetResolveCacheTtlMs ?? DEFAULT_RUNSET_CACHE_TTL_MS, + options.runSetResolveCacheMaxEntries ?? DEFAULT_RUNSET_CACHE_MAX_ENTRIES + ); + const admissionLimit = options.resolveAdmissionLimit ?? DEFAULT_RESOLVE_ADMISSION_LIMIT; + if (admissionLimit > 0) { + this.#admissionGate = new ResolveAdmissionGate(admissionLimit); + } + } + + /** Current size of the per-handle working-set cache (for a metrics gauge). */ + get workingSetCacheSize(): number { + return this.#workingSetCache.size; + } + + /** Fresh CH resolves currently holding an admission permit (for a metrics gauge). */ + get resolveAdmissionInUse(): number { + return this.#admissionGate?.inUse ?? 0; + } + + async streamRun( + url: URL | string, + environment: RealtimeEnvironment, + runId: string, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise { + const { offset, handle, isLive, skipColumns } = this.#parseStreamRequest(url, requestOptions); + + // Initial snapshot — no prior offset/handle. 
+ if (offset === INITIAL_OFFSET || !handle) { + const row = await this.options.runReader.getRunById(environment.id, runId); + return this.#snapshotResponse(runId, row, skipColumns, apiVersion, clientVersion); + } + + if (isLive) { + return this.#liveResponse({ + environment, + runId, + offset, + handle, + skipColumns, + apiVersion, + clientVersion, + signal, + }); + } + + // Non-live catch-up with a handle: re-emit the current snapshot (idempotent). + const row = await this.options.runReader.getRunById(environment.id, runId); + return this.#snapshotResponse(runId, row, skipColumns, apiVersion, clientVersion, handle); + } + + async streamRuns( + url: URL | string, + environment: RealtimeListEnvironment, + params: RealtimeRunsParams, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise { + const { offset, handle, isLive, skipColumns } = this.#parseStreamRequest(url, requestOptions); + const tags = params.tags ?? []; + + // Initial snapshot — pin the createdAt window in a fresh handle. + if (offset === INITIAL_OFFSET || !handle) { + const createdAtFilterMs = this.#computeCreatedAtFilter(params.createdAt).getTime(); + return this.#runSetSnapshotResponse( + environment, + { tags, createdAtAfter: new Date(createdAtFilterMs) }, + this.#mintListHandle(createdAtFilterMs), + skipColumns, + apiVersion, + clientVersion + ); + } + + // Recover the pinned window from the handle so the lower bound never drifts. + // Re-clamp the recovered value to the max-age floor so a stale or crafted handle + // can't widen the lookback past the configured ceiling. + const recoveredMs = this.#filterMsFromHandle(handle); + const filter: RunSetFilter = { + tags, + createdAtAfter: new Date( + recoveredMs !== undefined + ? 
this.#clampCreatedAtFloor(recoveredMs) + : this.#computeCreatedAtFilter(params.createdAt).getTime() + ), + }; + + if (isLive) { + return this.#runSetLiveResponse( + environment, + filter, + handle, + offset, + skipColumns, + apiVersion, + clientVersion, + signal + ); + } + + // Non-live catch-up under the same handle. + return this.#runSetSnapshotResponse( + environment, + filter, + handle, + skipColumns, + apiVersion, + clientVersion + ); + } + + async streamBatch( + url: URL | string, + environment: RealtimeListEnvironment, + batchId: string, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise { + const { offset, handle, isLive, skipColumns } = this.#parseStreamRequest(url, requestOptions); + + const filter: RunSetFilter = { batchId }; + + if (offset !== INITIAL_OFFSET && handle && isLive) { + return this.#runSetLiveResponse( + environment, + filter, + handle, + offset, + skipColumns, + apiVersion, + clientVersion, + signal + ); + } + + // Initial snapshot + non-live catch-up. The handle must be per-connection, never + // derived from the batchId: working sets are keyed by handle, and a shared handle + // lets one subscriber's emit permanently suppress the same row for another. + return this.#runSetSnapshotResponse( + environment, + filter, + handle ?? this.#mintBatchHandle(batchId), + skipColumns, + apiVersion, + clientVersion + ); + } + + #snapshotResponse( + runId: string, + row: Awaited>, + skipColumns: string[], + apiVersion: API_VERSIONS, + clientVersion?: string, + existingHandle?: string + ): Response { + const body = buildSnapshotBody(row, skipColumns); + const offset = row ? encodeOffset(row.updatedAt.getTime(), this.#nextSeq()) : encodeOffset(0, 0); + return this.#buildResponse(body, apiVersion, clientVersion, { + offset, + handle: existingHandle ?? 
this.#mintHandle(runId), + schema: buildElectricSchemaHeader(skipColumns), + }); + } + + /** Live poll for a single-run feed: emit a full-row `update` only when the row advanced past the client's offset, else a bare `up-to-date`. */ + async #liveResponse(params: { + environment: RealtimeEnvironment; + runId: string; + offset: string; + handle: string; + skipColumns: string[]; + apiVersion: API_VERSIONS; + clientVersion?: string; + signal?: AbortSignal; + }): Promise { + const { environment, runId, offset, handle, skipColumns, apiVersion, clientVersion, signal } = + params; + + return this.#withConcurrencySlot(environment, async () => { + const lastSeenMs = parseOffsetUpdatedAtMs(offset); + const registration = this.options.router.register( + environment.id, + { kind: "run", runId }, + skipColumns + ); + const deadline = Date.now() + this.#jitteredTimeout(); + + try { + // Cold start (fresh env subscription, e.g. an instance hop): a change in the + // caller's inter-poll gap may have been missed — check the row once, then hold. + if (!registration.gapCovered) { + this.options.onLivePollPath?.("cold-resolve"); + const probed = await this.options.runReader.getRunById(environment.id, runId); + if (probed && probed.updatedAt.getTime() > lastSeenMs) { + const seq = this.#nextSeq(); + this.options.onEmit?.("cold-resolve", Date.now() - probed.updatedAt.getTime(), 1); + return this.#buildResponse( + buildUpdateBody(probed, skipColumns), + apiVersion, + clientVersion, + { + offset: encodeOffset(probed.updatedAt.getTime(), seq), + handle, + cursor: String(seq), + } + ); + } + } + + while (true) { + const remaining = deadline - Date.now(); + const { reason, rows } = + remaining > 0 + ? 
await registration.waitForMatch(signal, remaining) + : { reason: "timeout" as const, rows: [] as MatchedRow[] }; + this.options.onWakeup?.(reason); + + if (reason === "abort") { + return this.#buildResponse(buildUpToDateBody(), apiVersion, clientVersion, { + offset, + handle, + cursor: String(this.#nextSeq()), + }); + } + + if (reason === "notify" && rows.length > 0) { + // The router hydrated + serialized this run; emit it (only on advance). + this.options.onLivePollPath?.("fast-hydrate"); + const matched = rows[0]; + const updatedAtMs = matched.row.updatedAt.getTime(); + if (updatedAtMs > lastSeenMs) { + const seq = this.#nextSeq(); + this.options.onEmit?.("fast-hydrate", Date.now() - updatedAtMs, 1); + return this.#buildResponse( + buildRowsBodyFromSerialized([ + { runId: matched.row.id, value: matched.value, operation: "update" }, + ]), + apiVersion, + clientVersion, + { offset: encodeOffset(updatedAtMs, seq), handle, cursor: String(seq) } + ); + } + // Already seen (e.g. a replayed record): keep holding rather than returning an + // empty up-to-date the client would immediately re-issue. + continue; + } + + // Backstop timeout: re-check the run directly (no ClickHouse for the single-run feed). 
+ this.options.onLivePollPath?.("full-resolve"); + const row = await this.options.runReader.getRunById(environment.id, runId); + const seq = this.#nextSeq(); + if (row && row.updatedAt.getTime() > lastSeenMs) { + this.options.onBackstopResult?.("delivered"); + this.options.onEmit?.("full-resolve", Date.now() - row.updatedAt.getTime(), 1); + return this.#buildResponse( + buildUpdateBody(row, skipColumns), + apiVersion, + clientVersion, + { + offset: encodeOffset(row.updatedAt.getTime(), seq), + handle, + cursor: String(seq), + } + ); + } + this.options.onBackstopResult?.("empty"); + return this.#buildResponse(buildUpToDateBody(), apiVersion, clientVersion, { + offset, + handle, + cursor: String(seq), + }); + } + } finally { + registration.close(); + } + }); + } + + /** Initial (and non-live catch-up) snapshot for a multi-run feed: resolve the + * id-set, hydrate, emit every row as an `insert`, and seed the working set. */ + async #runSetSnapshotResponse( + environment: RealtimeListEnvironment, + filter: RunSetFilter, + handle: string, + skipColumns: string[], + apiVersion: API_VERSIONS, + clientVersion?: string + ): Promise { + const rows = await this.#resolveAndHydrate(environment, filter, skipColumns); + + const changes: RowChange[] = rows.map((row) => ({ row, operation: "insert" as const })); + + // updatedAt comes from the authoritative Postgres hydrate, not ClickHouse. 
+ const seen: WorkingSet = new Map(); + let maxUpdatedAt = 0; + for (const row of rows) { + const updatedAtMs = row.updatedAt.getTime(); + seen.set(row.id, updatedAtMs); + maxUpdatedAt = Math.max(maxUpdatedAt, updatedAtMs); + } + this.#workingSetCache.set(this.#workingSetKey(environment.id, handle), seen); + this.#replayCursors.set(this.#workingSetKey(environment.id, handle), Date.now()); + + return this.#buildResponse(buildRowsBody(changes, skipColumns), apiVersion, clientVersion, { + offset: encodeOffset(maxUpdatedAt, this.#nextSeq()), + handle, + schema: buildElectricSchemaHeader(skipColumns), + }); + } + + /** Live poll for a multi-run feed: fast path diffs router-notified rows against the working set; the timeout backstop does a full ClickHouse resolve. */ + async #runSetLiveResponse( + environment: RealtimeListEnvironment, + filter: RunSetFilter, + handle: string, + offset: string, + skipColumns: string[], + apiVersion: API_VERSIONS, + clientVersion: string | undefined, + signal: AbortSignal | undefined + ): Promise { + return this.#withConcurrencySlot(environment, async () => { + const offsetFloorMs = parseOffsetUpdatedAtMs(offset); + // Total time to hold this long-poll, jittered to avoid synchronized refetch herds. + const deadline = Date.now() + this.#jitteredTimeout(); + const holdOnEmpty = this.options.holdOnEmpty ?? true; + + // Working set we diff against: seeded from the cache (or the offset floor on a + // miss) and advanced on each refetch within this held request. 
+ const workingSetKey = this.#workingSetKey(environment.id, handle); + let prevSeen = this.#workingSetCache.get(workingSetKey); + + const markPollEnd = () => this.#replayCursors.set(workingSetKey, Date.now()); + const emitFromSerialized = (changes: SerializedRowChange[], maxUpdatedAt: number): Response => { + const seq = this.#nextSeq(); + markPollEnd(); + return this.#buildResponse(buildRowsBodyFromSerialized(changes), apiVersion, clientVersion, { + offset: encodeOffset(maxUpdatedAt, seq), + handle, + cursor: String(seq), + }); + }; + const emitFromRows = (changes: RowChange[], maxUpdatedAt: number): Response => { + const seq = this.#nextSeq(); + markPollEnd(); + return this.#buildResponse(buildRowsBody(changes, skipColumns), apiVersion, clientVersion, { + offset: encodeOffset(maxUpdatedAt, seq), + handle, + cursor: String(seq), + }); + }; + const emitUpToDate = (maxUpdatedAt: number): Response => { + const seq = this.#nextSeq(); + markPollEnd(); + return this.#buildResponse(buildUpToDateBody(), apiVersion, clientVersion, { + offset: encodeOffset(maxUpdatedAt, seq), + handle, + cursor: String(seq), + }); + }; + + // When this connection last received data, so replay covers exactly its gap. A store + // error degrades to undefined (cold probe), never a failed poll. + const replaySinceMs = await this.#replayCursors.get(workingSetKey); + const registration = this.options.router.register( + environment.id, + this.#feedFilter(filter), + skipColumns, + { replaySinceMs } + ); + + // Cold start (fresh env subscription, e.g. an instance hop): resolve once up front + // instead of holding blind — a change in the caller's inter-poll gap may have been missed. 
+ let coldProbe = !registration.gapCovered; + + try { + while (true) { + if (coldProbe) { + coldProbe = false; + this.options.onLivePollPath?.("cold-resolve"); + const resolved = await this.#resolveAndHydrate(environment, filter, skipColumns); + const { changes, maxUpdatedAt, touched } = this.#diffRows( + resolved, + prevSeen, + offsetFloorMs + ); + this.#workingSetCache.set(workingSetKey, touched); + prevSeen = touched; + if (changes.length > 0) { + this.options.onEmit?.("cold-resolve", Date.now() - maxUpdatedAt, changes.length); + return emitFromRows(changes, maxUpdatedAt); + } + continue; // nothing was missed — hold as usual + } + + const remaining = deadline - Date.now(); + const { reason, rows } = + remaining > 0 + ? await registration.waitForMatch(signal, remaining) + : { reason: "timeout" as const, rows: [] as MatchedRow[] }; + this.options.onWakeup?.(reason); + + if (reason === "abort") { + return emitUpToDate(offsetFloorMs); + } + + // FAST PATH: the router already confirmed membership + the createdAt window and + // hydrated/serialized the matched runs. Just diff against the working set. + if (reason === "notify") { + this.options.onLivePollPath?.("fast-hydrate"); + const { changes, maxUpdatedAt, touched } = this.#diffMatched( + rows, + prevSeen, + offsetFloorMs + ); + // Merge (not replace): the router only surfaced the changed subset, so keep the + // rest of the working set intact. The backstop full-resolve rebuilds it. + const merged = this.#mergeWorkingSet(prevSeen, touched); + this.#workingSetCache.set(workingSetKey, merged); + prevSeen = merged; + + if (changes.length > 0) { + this.options.onEmit?.("fast-hydrate", Date.now() - maxUpdatedAt, changes.length); + return emitFromSerialized(changes, maxUpdatedAt); + } + // Matched but no row advanced (already seen). Keep holding. + if (holdOnEmpty) { + continue; + } + return emitUpToDate(maxUpdatedAt); + } + + // BACKSTOP: full ClickHouse resolve + hydrate. 
Replaces the working set so runs + // that left the filter stop being tracked (the client keeps showing them). + this.options.onLivePollPath?.("full-resolve"); + const resolved = await this.#resolveAndHydrate(environment, filter, skipColumns); + const { changes, maxUpdatedAt, touched } = this.#diffRows( + resolved, + prevSeen, + offsetFloorMs + ); + this.#workingSetCache.set(workingSetKey, touched); + prevSeen = touched; + + if (changes.length > 0) { + this.options.onBackstopResult?.("delivered"); + this.options.onEmit?.("full-resolve", Date.now() - maxUpdatedAt, changes.length); + return emitFromRows(changes, maxUpdatedAt); + } + // Empty backstop diff: timeout returns up-to-date; (holdOnEmpty never reaches + // here on a notify — those are handled in the fast path above). + this.options.onBackstopResult?.("empty"); + return emitUpToDate(maxUpdatedAt); + } + } finally { + registration.close(); + } + }); + } + + /** Translate a multi-run filter into the router's membership predicate. */ + #feedFilter(filter: RunSetFilter): FeedFilter { + if (filter.batchId !== undefined) { + return { kind: "batch", batchId: filter.batchId }; + } + return { + kind: "tag", + tags: filter.tags ?? [], + createdAtFloorMs: filter.createdAtAfter?.getTime(), + }; + } + + /** Diff router-matched rows (already serialized) against the prior working set, pairing + * each row's shared `value` with this feed's operation. 
*/
  #diffMatched(
    matched: MatchedRow[],
    prevSeen: WorkingSet | undefined,
    offsetFloorMs: number
  ): { changes: SerializedRowChange[]; maxUpdatedAt: number; touched: WorkingSet } {
    const touched: WorkingSet = new Map();
    const changes: SerializedRowChange[] = [];
    // Start at the client's offset so the returned high-water mark never regresses below it.
    let maxUpdatedAt = offsetFloorMs;

    for (const entry of matched) {
      const runId = entry.row.id;
      const stampMs = entry.row.updatedAt.getTime();
      touched.set(runId, stampMs);
      maxUpdatedAt = Math.max(maxUpdatedAt, stampMs);

      if (!prevSeen) {
        // No working set to diff against: only rows newer than the offset floor count,
        // and they are always reported as `update`.
        if (stampMs > offsetFloorMs) {
          changes.push({ runId, value: entry.value, operation: "update" });
        }
        continue;
      }

      const prior = prevSeen.get(runId);
      if (prior === undefined) {
        // Never tracked under this working set.
        changes.push({ runId, value: entry.value, operation: "insert" });
      } else if (stampMs > prior) {
        // Tracked, and the row has advanced since it was last recorded.
        changes.push({ runId, value: entry.value, operation: "update" });
      }
    }

    return { changes, maxUpdatedAt, touched };
  }

  /**
   * Row-based twin of the matched-row diff: compares hydrated rows with the prior working
   * set using each row's `updatedAt`.
   *
   * With a working set: an id that is not in it becomes an `insert`, an id whose `updatedAt`
   * is newer than the recorded one becomes an `update`, anything else is dropped. Without a
   * working set: rows newer than `offsetFloorMs` become `update`s.
   *
   * Returns the changes, the largest `updatedAt` seen (never below `offsetFloorMs`), and a
   * map of every row id in `rows` to its `updatedAt`.
   */
  #diffRows(
    rows: RealtimeRunRow[],
    prevSeen: WorkingSet | undefined,
    offsetFloorMs: number
  ): { changes: RowChange[]; maxUpdatedAt: number; touched: WorkingSet } {
    const touched: WorkingSet = new Map();
    const changes: RowChange[] = [];
    let maxUpdatedAt = offsetFloorMs;

    for (const row of rows) {
      const stampMs = row.updatedAt.getTime();
      touched.set(row.id, stampMs);
      maxUpdatedAt = Math.max(maxUpdatedAt, stampMs);

      if (!prevSeen) {
        if (stampMs > offsetFloorMs) {
          changes.push({ row, operation: "update" });
        }
        continue;
      }

      const prior = prevSeen.get(row.id);
      if (prior === undefined) {
        changes.push({ row, operation: "insert" });
      } else if (stampMs > prior) {
        changes.push({ row, operation: "update" });
      }
    }

    return { changes, maxUpdatedAt, touched };
  }

  /** Merge fast-path touched rows into the prior working set. 
The fast path only saw the
   * changed subset, so we keep the rest (the backstop full-resolve does the exact rebuild). */
  #mergeWorkingSet(prevSeen: WorkingSet | undefined, touched: WorkingSet): WorkingSet {
    // Copy rather than mutate `prevSeen`; an undefined prior set yields an empty map.
    const merged: WorkingSet = new Map(prevSeen);
    // Touched entries overwrite the prior timestamp for the same run id.
    touched.forEach((updatedAtMs, id) => merged.set(id, updatedAtMs));
    return merged;
  }

  /**
   * Resolve the filter's id-set (ClickHouse) and hydrate (Postgres), coalesced + short-TTL
   * cached so identical filters share one resolve+hydrate.
   *
   * Lookup order: the result cache ("hit"), then a promise already in flight for the same
   * key ("coalesced"), otherwise a fresh resolve ("miss"). Each outcome is reported through
   * `options.onRunSetResolve`.
   *
   * @param environment - Environment the feed belongs to.
   * @param filter - Tag/createdAt or batch filter selecting the run set.
   * @param skipColumns - Columns left out of the hydrate; part of the cache key.
   * @returns The hydrated rows for the filter.
   */
  async #resolveAndHydrate(
    environment: RealtimeListEnvironment,
    filter: RunSetFilter,
    skipColumns: string[]
  ): Promise<RealtimeRunRow[]> {
    const key = this.#runSetCacheKey(environment.id, filter, skipColumns);

    const cachedRows = this.#runSetCache.get(key);
    if (cachedRows) {
      this.options.onRunSetResolve?.("hit");
      return cachedRows;
    }

    const inflight = this.#runSetInflight.get(key);
    if (inflight) {
      this.options.onRunSetResolve?.("coalesced");
      return inflight;
    }

    this.options.onRunSetResolve?.("miss");
    // Registered in #runSetInflight synchronously below, so same-filter callers that arrive
    // while this is still waiting for an admission permit coalesce onto it (one permit, not N).
    const pending = this.#admitAndResolveUncached(environment, filter, skipColumns)
      .then((rows) => {
        // Only successful resolves are cached; a rejection falls through to `finally`.
        this.#runSetCache.set(key, rows);
        return rows;
      })
      .finally(() => {
        // Always drop the in-flight entry so a failed resolve can be retried.
        this.#runSetInflight.delete(key);
      });

    this.#runSetInflight.set(key, pending);
    return pending;
  }

  /** Acquire an admission permit (if the gate is enabled) before the fresh CH+PG resolve, so
   * a distinct-filter stampede is throttled to the configured concurrency. 
*/
  async #admitAndResolveUncached(
    environment: RealtimeListEnvironment,
    filter: RunSetFilter,
    skipColumns: string[]
  ): Promise<RealtimeRunRow[]> {
    // Capture the gate once so the instance released is the one that was acquired.
    const gate = this.#admissionGate;
    if (!gate) {
      return this.#resolveAndHydrateUncached(environment, filter, skipColumns);
    }

    const queuedAt = Date.now();
    await gate.acquire();
    const waitedMs = Date.now() - queuedAt;
    // Report only resolves that actually queued (non-zero measured wait).
    if (waitedMs > 0) {
      this.options.onResolveAdmissionWait?.(waitedMs);
    }

    try {
      return await this.#resolveAndHydrateUncached(environment, filter, skipColumns);
    } finally {
      gate.release();
    }
  }

  /**
   * Uncached resolve + hydrate: resolve the matching run ids, then hydrate them through
   * `runReader.hydrateByIds`. Each stage's duration is reported via `options.onRunSetQuery`
   * ("resolve", then "hydrate").
   */
  async #resolveAndHydrateUncached(
    environment: RealtimeListEnvironment,
    filter: RunSetFilter,
    skipColumns: string[]
  ): Promise<RealtimeRunRow[]> {
    const resolveStartedAt = Date.now();
    const ids = await this.#resolveIds(environment, filter);
    this.options.onRunSetQuery?.("resolve", Date.now() - resolveStartedAt);

    const hydrateStartedAt = Date.now();
    const rows = await this.options.runReader.hydrateByIds(environment.id, ids, skipColumns);
    this.options.onRunSetQuery?.("hydrate", Date.now() - hydrateStartedAt);

    return rows;
  }

  /** Stable cache key for the resolve+hydrate cache. Same key => same id-set and the
   * same projected columns, so cached rows always match the requesting feed. */
  #runSetCacheKey(environmentId: string, filter: RunSetFilter, skipColumns: string[]): string {
    // JSON-encode the arrays (not a join) so a tag containing the separator can't collide with a different filter.
    // Both arrays are sorted on a copy so ordering differences map to the same key.
    const tags =
      filter.tags && filter.tags.length > 0 ? JSON.stringify([...filter.tags].sort()) : "";
    const cols = skipColumns.length > 0 ? JSON.stringify([...skipColumns].sort()) : "";
    // The result cap is part of the key: a different cap can yield a different id-set.
    const maxListResults = this.options.maxListResults ?? DEFAULT_MAX_LIST_RESULTS;

    return [
      environmentId,
      tags,
      filter.batchId ?? "",
      filter.createdAtAfter?.getTime() ?? "",
      maxListResults,
      cols,
    ].join("|");
  }

  /**
   * Ask the run-list resolver for the ids matching `filter`, capped at the configured
   * maximum. Logs a warning when the result reaches the cap.
   */
  async #resolveIds(
    environment: RealtimeListEnvironment,
    filter: RunSetFilter
  ): Promise<string[]> {
    const maxListResults = this.options.maxListResults ?? DEFAULT_MAX_LIST_RESULTS;
    const ids = await this.options.runListResolver.resolveMatchingRunIds({
      organizationId: environment.organizationId,
      projectId: environment.projectId,
      environmentId: environment.id,
      tags: filter.tags,
      batchId: filter.batchId,
      createdAtAfter: filter.createdAtAfter,
      limit: maxListResults,
    });

    if (ids.length >= maxListResults) {
      logger.warn("[nativeRealtimeClient] run-set feed hit the result cap", {
        environmentId: environment.id,
        filter,
        cap: maxListResults,
      });
    }

    return ids;
  }

  /**
   * Turn the client's `createdAt` duration (default "24h") into the feed's lower bound.
   * An unparseable value, or one older than the maximum lookback, falls back to the
   * max-age floor; the result is then bucketed.
   */
  #computeCreatedAtFilter(createdAt: string | undefined): Date {
    // Clamp to the maximum lookback window, mirroring realtimeClient.
    const floor = new Date(Date.now() - this.options.maximumCreatedAtFilterAgeMs);
    const parsed = safeParseNaturalLanguageDurationAgo(createdAt ?? "24h");
    const resolved = !parsed || parsed < floor ? floor : parsed;
    // Bucket the lower bound so same-tag feeds share a cache key; floored, so the window only ever widens by < bucket.
    return new Date(this.#bucketCreatedAtMs(resolved.getTime()));
  }

  /** Floor `ms` to the configured bucket size; a bucket of 0 or less returns `ms` unchanged. */
  #bucketCreatedAtMs(ms: number): number {
    const bucket = this.options.runSetCreatedAtBucketMs ?? DEFAULT_RUNSET_CREATED_AT_BUCKET_MS;
    if (bucket <= 0) {
      return ms;
    }
    return Math.floor(ms / bucket) * bucket;
  }

  /** Clamp a handle-recovered createdAt lower bound up to the max-age floor (so a
   * stale or crafted handle can't widen the window past the ceiling), then re-bucket. 
*/
  #clampCreatedAtFloor(ms: number): number {
    const oldestAllowedMs = Date.now() - this.options.maximumCreatedAtFilterAgeMs;
    const clampedMs = Math.max(ms, oldestAllowedMs);
    return this.#bucketCreatedAtMs(clampedMs);
  }

  /** Mint a fresh tag-list handle of the form `runs_<createdAtMs>_<unique suffix>`. */
  #mintListHandle(createdAtFilterMs: number): string {
    // Pins the createdAt threshold in the opaque handle so live polls reuse the
    // same lower bound even on a working-set cache miss.
    const pinnedMs = Math.trunc(createdAtFilterMs);
    const suffix = this.#mintUniqueSuffix();
    return `runs_${pinnedMs}_${suffix}`;
  }

  /** Mint a fresh batch handle of the form `batch_<batchId>_<unique suffix>`. */
  #mintBatchHandle(batchId: string): string {
    const suffix = this.#mintUniqueSuffix();
    return `batch_${batchId}_${suffix}`;
  }

  /** Unique handle suffix: the next sequence number plus the first 8 characters of a UUID. */
  #mintUniqueSuffix(): string {
    // The seq alone isn't unique across instances/restarts; behind a non-sticky ALB a
    // collision would land two connections on one working-set cache entry.
    const seq = this.#nextSeq();
    const entropy = randomUUID().slice(0, 8);
    return `${seq}_${entropy}`;
  }

  /** Cache key for a handle's working set, scoped to its environment. */
  #workingSetKey(environmentId: string, handle: string): string {
    // The handle is client-echoed; env-prefix the key so a foreign handle can never
    // read or overwrite another tenant's working set.
    return `${environmentId}:${handle}`;
  }

  /**
   * Recover the pinned createdAt lower bound from a `runs_…` handle. Returns `undefined`
   * for any other handle prefix, or when the second segment is not a finite positive number.
   */
  #filterMsFromHandle(handle: string): number | undefined {
    const [prefix, rawMs] = handle.split("_");
    if (prefix !== "runs") {
      return undefined;
    }
    const ms = Number(rawMs);
    if (Number.isFinite(ms) && ms > 0) {
      return ms;
    }
    return undefined;
  }

  /**
   * Read the stream parameters off the request URL: `offset` (defaulting to
   * `INITIAL_OFFSET`), `handle` (falling back to `shape_id`), the `live` flag (true only
   * for the literal "true"), and the resolved skip-column list.
   */
  #parseStreamRequest(
    url: URL | string,
    requestOptions?: RealtimeRequestOptions
  ): { offset: string; handle: string | null; isLive: boolean; skipColumns: string[] } {
    const parsedUrl = new URL(url.toString());
    const query = parsedUrl.searchParams;

    const offset = query.get("offset") ?? INITIAL_OFFSET;
    const handle = query.get("handle") ?? query.get("shape_id");
    const isLive = query.get("live") === "true";
    const skipColumns = this.#resolveSkipColumns(parsedUrl, requestOptions);

    return { offset, handle, isLive, skipColumns };
  }

  /** Runs `work` inside a per-env concurrency slot (429 if over the org limit, 500 if the limit can't be read), always releasing it after. 
*/ + async #withConcurrencySlot( + environment: RealtimeEnvironment, + work: () => Promise + ): Promise { + const requestId = randomUUID(); + const concurrencyLimit = await this.options.cachedLimitProvider.getCachedLimit( + environment.organizationId, + this.options.defaultConcurrencyLimit ?? DEFAULT_CONCURRENCY_LIMIT + ); + + if (concurrencyLimit == null) { + logger.error("[nativeRealtimeClient] Failed to get concurrency limit", { + organizationId: environment.organizationId, + }); + return json({ error: "Failed to get concurrency limit" }, { status: 500 }); + } + + const canProceed = await this.options.limiter.incrementAndCheck( + environment.id, + requestId, + concurrencyLimit + ); + + if (!canProceed) { + this.options.onConcurrencyRejected?.(); + return json({ error: "Too many concurrent requests" }, { status: 429 }); + } + + try { + return await work(); + } finally { + await this.options.limiter.decrement(environment.id, requestId); + } + } + + #jitteredTimeout(): number { + const base = this.options.livePollTimeoutMs ?? DEFAULT_LIVE_POLL_TIMEOUT_MS; + // Jittered to avoid synchronized refetch herds. + const ratio = this.options.livePollJitterRatio ?? DEFAULT_LIVE_POLL_JITTER_RATIO; + return Math.round(base * (1 - ratio + Math.random() * 2 * ratio)); + } + + #buildResponse( + body: string, + apiVersion: API_VERSIONS, + clientVersion: string | undefined, + headers: ResponseHeaderInput + ): Response { + const finalBody = + apiVersion === CURRENT_API_VERSION ? body : rewriteBodyForLegacyApiVersion(body); + + const responseHeaders = new Headers(); + responseHeaders.set("content-type", "application/json"); + responseHeaders.set("cache-control", "no-store"); + + // Expose the electric-* headers cross-origin or the deployed react-hooks fail with MissingHeadersError (bearer requests are non-credentialed, so wildcard is safe). 
+ responseHeaders.set("access-control-allow-origin", "*"); + responseHeaders.set("access-control-expose-headers", "*"); + + // Modern clients send `x-trigger-electric-version` and read `electric-offset`/`electric-handle`; legacy clients omit it and read the shape-id/chunk-last-offset names. + if (clientVersion) { + responseHeaders.set("electric-offset", headers.offset); + responseHeaders.set("electric-handle", headers.handle); + } else { + responseHeaders.set("electric-chunk-last-offset", headers.offset); + responseHeaders.set("electric-shape-id", headers.handle); + } + + if (headers.cursor !== undefined) { + responseHeaders.set("electric-cursor", headers.cursor); + } + if (headers.schema !== undefined) { + responseHeaders.set("electric-schema", headers.schema); + } + + return new Response(finalBody, { status: 200, headers: responseHeaders }); + } + + #mintHandle(runId: string): string { + // Stable per-run handle: the single-run shape never changes columns, so the + // client never needs a must-refetch from a handle change. + return `run-${runId}`; + } + + #nextSeq(): number { + this.#seq = (this.#seq + 1) % Number.MAX_SAFE_INTEGER; + return this.#seq; + } + + #resolveSkipColumns(url: URL, requestOptions?: RealtimeRequestOptions): string[] { + const raw = requestOptions?.skipColumns ?? url.searchParams.get("skipColumns")?.split(",") ?? 
[]; + return raw.map((c) => c.trim()).filter((c) => c !== "" && !RESERVED_COLUMNS.includes(c)); + } +} diff --git a/apps/webapp/app/services/realtime/nativeRealtimeClientInstance.server.ts b/apps/webapp/app/services/realtime/nativeRealtimeClientInstance.server.ts new file mode 100644 index 00000000000..c41149f0cc6 --- /dev/null +++ b/apps/webapp/app/services/realtime/nativeRealtimeClientInstance.server.ts @@ -0,0 +1,216 @@ +import { getMeter } from "@internal/tracing"; +import { $replica } from "~/db.server"; +import { env } from "~/env.server"; +import { singleton } from "~/utils/singleton"; +import { getCachedLimit } from "../platform.v3.server"; +import { clickhouseFactory } from "~/services/clickhouse/clickhouseFactoryInstance.server"; +import { ClickHouseRunListResolver } from "./clickHouseRunListResolver.server"; +import { EnvChangeRouter } from "./envChangeRouter.server"; +import { NativeRealtimeClient } from "./nativeRealtimeClient.server"; +import { RealtimeConcurrencyLimiter } from "./realtimeConcurrencyLimiter.server"; +import { getRunChangeNotifier } from "./runChangeNotifierInstance.server"; +import { RedisReplayCursorStore } from "./replayCursorStore.server"; +import { RunHydrator } from "./runReader.server"; + +// Process-singleton wiring for the native realtime client; only constructed when a +// request actually routes to it, so a disabled webapp never instantiates it. +function initializeNativeRealtimeClient(): NativeRealtimeClient { + const meter = getMeter("realtime-native"); + + const wakeups = meter.createCounter("realtime_native.wakeups", { + description: + "Live realtime wakeups by reason. A rising 'timeout' share suggests a write site is missing its publishChangeRecord delegate.", + }); + + const runSetResolves = meter.createCounter("realtime_native.runset_resolves", { + description: + "Multi-run (tag-list/batch) resolve+hydrate outcomes. 
'hit'/'coalesced' vs 'miss' shows how effectively concurrent same-filter feeds share a single ClickHouse + Postgres query.", + }); + + const runSetQueryMs = meter.createHistogram("realtime_native.runset_query_ms", { + description: "Latency of the multi-run resolve (ClickHouse) and hydrate (Postgres) stages.", + unit: "ms", + }); + + const livePollPaths = meter.createCounter("realtime_native.live_polls", { + description: + "How live polls resolved. 'fast-hydrate' = router wake with rows hydrated by id (no ClickHouse); 'full-resolve' = backstop; 'cold-resolve' = fresh env subscription probed once.", + }); + + const routerHydrates = meter.createCounter("realtime_native.router_hydrated_runs", { + description: + "Runs hydrated by the EnvChangeRouter's batch-hydrate (one query per column set per wake, shared across all feeds matching the same run).", + }); + + const resolveAdmissionWaits = meter.createCounter("realtime_native.resolve_admission_waits", { + description: + "Fresh ClickHouse resolves that had to queue for an admission permit. A rising count means a distinct-filter reconnect stampede is being throttled (the gate is doing its job).", + }); + + const replays = meter.createCounter("realtime_native.replays", { + description: + "Buffered change records replayed to a newly-armed feed (inter-poll gap recovery). 'delivered' = rows reached the feed; 'empty' = candidates hydrated but none survived the filter/diff.", + }); + + const replayEvictions = meter.createCounter("realtime_native.replay_evictions", { + description: + "Replay-buffer evictions. 'window' expiry is normal; 'cap' means an env churns more runs inside the window than the buffer holds (replay guarantee degrading — retune the knobs).", + }); + + const deliveryLagMs = meter.createHistogram("realtime_native.delivery_lag_ms", { + description: + "Live emissions: now minus the newest emitted row's updatedAt (PG clock vs app clock, so approximate). 
The end-to-end delivery SLI — a p99 near the backstop hold means wakes are being missed.", + unit: "ms", + }); + + const emittedRows = meter.createHistogram("realtime_native.emitted_rows", { + description: + "Rows per live emission. Deltas should be small; a fat tail means working-set/offset-floor fallbacks are re-emitting full sets.", + unit: "rows", + }); + + const backstops = meter.createCounter("realtime_native.backstops", { + description: + "Backstop full resolves by outcome. 'empty' is normal idle behavior; sustained 'delivered' means the notify/replay path missed changes — alert on it.", + }); + + const concurrencyRejections = meter.createCounter("realtime_native.concurrency_rejections", { + description: "Polls rejected (429) by the per-env concurrency limiter.", + }); + + const replayCursorOps = meter.createCounter("realtime_native.replay_cursor_ops", { + description: + "Shared replay-cursor store operations by outcome. Errors degrade hops to cold resolves (watch live_polls{path='cold-resolve'} rise with them), never failed polls.", + }); + + const limiter = new RealtimeConcurrencyLimiter({ + keyPrefix: "tr:realtime:native:concurrency", + redis: { + port: env.RATE_LIMIT_REDIS_PORT, + host: env.RATE_LIMIT_REDIS_HOST, + username: env.RATE_LIMIT_REDIS_USERNAME, + password: env.RATE_LIMIT_REDIS_PASSWORD, + tlsDisabled: env.RATE_LIMIT_REDIS_TLS_DISABLED === "true", + clusterMode: env.RATE_LIMIT_REDIS_CLUSTER_MODE_ENABLED === "1", + }, + }); + + // Fleet-shared replay cursors (one timestamp per connection) on the same Redis as the + // change channel, so a load-balancer hop reads the connection's true inter-poll gap. + const replayCursorStore = + env.REALTIME_BACKEND_NATIVE_SHARED_REPLAY_CURSORS === "1" + ? 
new RedisReplayCursorStore({ + redis: { + host: env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_HOST, + port: env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_PORT, + username: env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_USERNAME, + password: env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_PASSWORD, + tlsDisabled: env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_TLS_DISABLED === "true", + clusterMode: env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_CLUSTER_MODE_ENABLED === "1", + }, + ttlMs: env.REALTIME_BACKEND_NATIVE_WORKING_SET_TTL_MS, + onResult: (op, ok) => replayCursorOps.add(1, { op, result: ok ? "ok" : "error" }), + }) + : undefined; + + // One RunHydrator shared by the router and the client, so its single-flight + short-TTL cache covers both. + const runReader = new RunHydrator({ + replica: $replica, + cacheTtlMs: env.REALTIME_BACKEND_NATIVE_RUN_CACHE_TTL_MS, + maxCacheEntries: env.REALTIME_BACKEND_NATIVE_RUN_CACHE_MAX_ENTRIES, + }); + + const router = new EnvChangeRouter({ + source: getRunChangeNotifier(), + hydrator: runReader, + onHydrate: (runCount) => routerHydrates.add(runCount), + replayWindowMs: env.REALTIME_BACKEND_NATIVE_REPLAY_WINDOW_MS, + replayMaxRunsPerEnv: env.REALTIME_BACKEND_NATIVE_REPLAY_MAX_RUNS, + unsubscribeLingerMs: env.REALTIME_BACKEND_NATIVE_UNSUBSCRIBE_LINGER_MS, + onReplay: (result) => replays.add(1, { result }), + onReplayEviction: (reason) => replayEvictions.add(1, { reason }), + }); + + const client = new NativeRealtimeClient({ + runReader, + runListResolver: new ClickHouseRunListResolver({ + getClickhouse: (organizationId) => + clickhouseFactory.getClickhouseForOrganization(organizationId, "realtime"), + prisma: $replica, + }), + router, + limiter, + cachedLimitProvider: { + async getCachedLimit(organizationId, defaultValue) { + const result = await getCachedLimit( + organizationId, + "realtimeConcurrentConnections", + defaultValue + ); + return result.val; + }, + }, + defaultConcurrencyLimit: env.REALTIME_BACKEND_NATIVE_DEFAULT_CONCURRENCY_LIMIT, + 
livePollTimeoutMs: env.REALTIME_BACKEND_NATIVE_LIVE_POLL_TIMEOUT_MS, + livePollJitterRatio: env.REALTIME_BACKEND_NATIVE_LIVE_POLL_JITTER_RATIO, + maximumCreatedAtFilterAgeMs: env.REALTIME_MAXIMUM_CREATED_AT_FILTER_AGE_IN_MS, + maxListResults: env.REALTIME_BACKEND_NATIVE_MAX_LIST_RESULTS, + runSetResolveCacheTtlMs: env.REALTIME_BACKEND_NATIVE_RUNSET_CACHE_TTL_MS, + runSetResolveCacheMaxEntries: env.REALTIME_BACKEND_NATIVE_RUNSET_CACHE_MAX_ENTRIES, + listCacheMaxEntries: env.REALTIME_BACKEND_NATIVE_WORKING_SET_MAX_ENTRIES, + workingSetCacheTtlMs: env.REALTIME_BACKEND_NATIVE_WORKING_SET_TTL_MS, + runSetCreatedAtBucketMs: env.REALTIME_BACKEND_NATIVE_RUNSET_CREATED_AT_BUCKET_MS, + holdOnEmpty: env.REALTIME_BACKEND_NATIVE_HOLD_ON_EMPTY === "1", + resolveAdmissionLimit: env.REALTIME_BACKEND_NATIVE_RESOLVE_ADMISSION_LIMIT, + replayCursorStore, + onWakeup: (reason) => wakeups.add(1, { reason }), + onLivePollPath: (path) => livePollPaths.add(1, { path }), + onRunSetResolve: (result) => runSetResolves.add(1, { result }), + onRunSetQuery: (stage, ms) => runSetQueryMs.record(ms, { stage }), + onResolveAdmissionWait: () => resolveAdmissionWaits.add(1), + onEmit: (path, lagMs, rowCount) => { + deliveryLagMs.record(Math.max(lagMs, 0), { path }); + emittedRows.record(rowCount); + }, + onBackstopResult: (result) => backstops.add(1, { result }), + onConcurrencyRejected: () => concurrencyRejections.add(1), + }); + + meter + .createObservableGauge("realtime_native.working_set_size", { + description: + "Entries in the per-handle working-set cache (one per active multi-run feed session).", + }) + .addCallback((result) => result.observe(client.workingSetCacheSize)); + + meter + .createObservableGauge("realtime_native.resolve_admission_in_use", { + description: + "Fresh ClickHouse resolves currently holding an admission permit (live concurrency against the gate's limit).", + }) + .addCallback((result) => result.observe(client.resolveAdmissionInUse)); + + meter + 
.createObservableGauge("realtime_native.held_feeds", { + description: "Long-polls currently held, by feed kind — the system's capacity unit.", + }) + .addCallback((result) => { + const counts = router.heldFeedCounts; + result.observe(counts.run, { kind: "run" }); + result.observe(counts.tag, { kind: "tag" }); + result.observe(counts.batch, { kind: "batch" }); + }); + + meter + .createObservableGauge("realtime_native.active_envs", { + description: + "Environments currently routed on this instance (held feeds + lingering subscriptions).", + }) + .addCallback((result) => result.observe(router.activeEnvCount)); + + return client; +} + +export function getNativeRealtimeClient(): NativeRealtimeClient { + return singleton("nativeRealtimeClient", initializeNativeRealtimeClient); +} diff --git a/apps/webapp/app/services/realtime/realtimeConcurrencyLimiter.server.ts b/apps/webapp/app/services/realtime/realtimeConcurrencyLimiter.server.ts new file mode 100644 index 00000000000..1b6fdb3b0b4 --- /dev/null +++ b/apps/webapp/app/services/realtime/realtimeConcurrencyLimiter.server.ts @@ -0,0 +1,107 @@ +import { Callback, Result } from "ioredis"; +import { createRedisClient, RedisClient, RedisWithClusterOptions } from "~/redis.server"; +import { logger } from "../logger.server"; + +export type RealtimeConcurrencyLimiterOptions = { + redis: RedisWithClusterOptions; + keyPrefix: string; + /** How long a tracked request lives before it's swept as stale (seconds). */ + expiryTimeInSeconds?: number; + connectionName?: string; +}; + +/** + * Per-environment concurrent-connection limiter for realtime long-polls; a standalone copy of the limiter in + * `realtimeClient.server.ts` (identical Lua + key shape, different key prefix) so the native backend tracks independently. 
+ */
+export class RealtimeConcurrencyLimiter {
+  private redis: RedisClient;
+  private expiryTimeInSeconds: number;
+
+  constructor(private options: RealtimeConcurrencyLimiterOptions) {
+    this.redis = createRedisClient(
+      options.connectionName ?? "trigger:realtime:native:concurrency",
+      options.redis
+    );
+    this.expiryTimeInSeconds = options.expiryTimeInSeconds ?? 60 * 5;
+    this.#registerCommands();
+  }
+
+  /**
+   * Track `requestId` against the environment's sorted set and report whether it fits under `limit`.
+   * Resolves `true` when the request was admitted, `false` when the limit was exceeded (in which
+   * case the Lua script has already removed the request again).
+   */
+  async incrementAndCheck(
+    environmentId: string,
+    requestId: string,
+    limit: number
+  ): Promise<boolean> {
+    const key = this.#getKey(environmentId);
+    const now = Date.now();
+
+    const result = await this.redis.incrementAndCheckRealtimeNativeConcurrency(
+      key,
+      now.toString(),
+      requestId,
+      this.expiryTimeInSeconds.toString(),
+      // Cutoff score: entries stamped at or before this are swept as stale by the script.
+      (now - this.expiryTimeInSeconds * 1000).toString(),
+      limit.toString()
+    );
+
+    return result === 1;
+  }
+
+  /** Stop tracking `requestId` for the environment (ZREM on the environment's sorted set). */
+  async decrement(environmentId: string, requestId: string): Promise<void> {
+    const key = this.#getKey(environmentId);
+    await this.redis.zrem(key, requestId);
+  }
+
+  #getKey(environmentId: string): string {
+    return `${this.options.keyPrefix}:${environmentId}`;
+  }
+
+  /** Define the Lua check-and-increment command on the client and attach the error logger. */
+  #registerCommands() {
+    this.redis.defineCommand("incrementAndCheckRealtimeNativeConcurrency", {
+      numberOfKeys: 1,
+      lua: /* lua */ `
+        local concurrencyKey = KEYS[1]
+
+        local timestamp = tonumber(ARGV[1])
+        local requestId = ARGV[2]
+        local expiryTime = tonumber(ARGV[3])
+        local cutoffTime = tonumber(ARGV[4])
+        local limit = tonumber(ARGV[5])
+
+        -- Remove expired entries
+        redis.call('ZREMRANGEBYSCORE', concurrencyKey, '-inf', cutoffTime)
+
+        -- Add the new request to the sorted set
+        redis.call('ZADD', concurrencyKey, timestamp, requestId)
+
+        -- Set the expiry time on the key
+        redis.call('EXPIRE', concurrencyKey, expiryTime)
+
+        -- Get the total number of concurrent requests
+        local totalRequests = redis.call('ZCARD', concurrencyKey)
+
+        -- Check if the limit has been exceeded
+        if totalRequests > limit then
+          redis.call('ZREM', concurrencyKey, requestId)
+          return 0
+        end
+
+        return 1
+      `,
+    });
+
+    this.redis.on("error", (error) => {
+      logger.error("[realtimeConcurrencyLimiter] redis error", { error });
+    });
+  }
+}
+
+declare module "ioredis" {
+  interface RedisCommander<Context> {
+    incrementAndCheckRealtimeNativeConcurrency(
+      key: string,
+      timestamp: string,
+      requestId: string,
+      expiryTime: string,
+      cutoffTime: string,
+      limit: string,
+      callback?: Callback<number>
+    ): Result<number, Context>;
+  }
+}
diff --git a/apps/webapp/app/services/realtime/replayCursorStore.server.ts b/apps/webapp/app/services/realtime/replayCursorStore.server.ts
new file mode 100644
index 00000000000..597957704af
--- /dev/null
+++ b/apps/webapp/app/services/realtime/replayCursorStore.server.ts
@@ -0,0 +1,145 @@
+import { createRedisClient, type RedisClient, type RedisWithClusterOptions } from "~/redis.server";
+import { logger } from "../logger.server";
+import { BoundedTtlCache } from "./boundedTtlCache";
+
+/**
+ * Per-connection replay cursors ("when did this connection last receive data"), keyed by the
+ * env-prefixed working-set key. Sharing them fleet-wide makes an instance hop look like a normal
+ * inter-poll gap instead of an unknown one, so hops stop triggering cold resolves and full-window
+ * replays. Values are single timestamps, so the shared store stays cheap.
+ */
+export interface ReplayCursorStore {
+  /** The connection's last-response timestamp; undefined on miss OR error (the caller
+   * degrades to a cold probe / full-window replay, never blocks the poll). */
+  get(key: string): Promise<number | undefined>;
+  /** Fire-and-forget stamp; must never throw. */
+  set(key: string, ms: number): void;
+}
+
+/** Per-instance fallback with the same shape (used when the shared store is disabled, and in tests).
*/
+export class InMemoryReplayCursorStore implements ReplayCursorStore {
+  // NOTE(review): the cache's type argument was lost in extraction; `<number>` is assumed from
+  // the values stored here — confirm against `boundedTtlCache`.
+  readonly #cache: BoundedTtlCache<number>;
+
+  constructor(ttlMs: number, maxEntries: number) {
+    this.#cache = new BoundedTtlCache<number>(ttlMs, maxEntries);
+  }
+
+  async get(key: string): Promise<number | undefined> {
+    return this.#cache.get(key);
+  }
+
+  set(key: string, ms: number): void {
+    this.#cache.set(key, ms);
+  }
+}
+
+export type RedisReplayCursorStoreOptions = {
+  redis: RedisWithClusterOptions;
+  /** Entry TTL (ms); matches the working-set TTL so both views of a connection age out together. */
+  ttlMs: number;
+  /** Read deadline (ms): a slow or down Redis degrades the poll to a cold probe instead of stalling it. */
+  getTimeoutMs?: number;
+  keyPrefix?: string;
+  connectionName?: string;
+  /** Observability hook: a store op settled (errors are the degradation signal, not failures). */
+  onResult?: (op: "get" | "set", ok: boolean) => void;
+};
+
+const DEFAULT_KEY_PREFIX = "realtime:replay-cursor:";
+const DEFAULT_GET_TIMEOUT_MS = 250;
+/** Upper bound (ms) on the graceful QUIT before the connection is force-closed. */
+const QUIT_GRACE_MS = 500;
+const TIMED_OUT = Symbol("replay-cursor-get-timeout");
+
+/** Redis-backed cursor store; the client is created lazily on first use. */
+export class RedisReplayCursorStore implements ReplayCursorStore {
+  #client: RedisClient | undefined;
+
+  constructor(private readonly options: RedisReplayCursorStoreOptions) {}
+
+  /**
+   * Read a cursor. Resolves `undefined` on a miss, a timeout, a Redis error, or a stored value
+   * that is not a positive finite number; never rejects.
+   */
+  async get(key: string): Promise<number | undefined> {
+    try {
+      const raw = await this.#getWithDeadline(this.#key(key));
+      if (raw === TIMED_OUT) {
+        this.options.onResult?.("get", false);
+        logger.warn("[replayCursorStore] replay-cursor read timed out", { key });
+        return undefined;
+      }
+      this.options.onResult?.("get", true);
+      if (raw === null) {
+        return undefined;
+      }
+      const ms = Number(raw);
+      return Number.isFinite(ms) && ms > 0 ? ms : undefined;
+    } catch (error) {
+      this.options.onResult?.("get", false);
+      logger.error("[replayCursorStore] failed to read a replay cursor", { error, key });
+      return undefined;
+    }
+  }
+
+  /** GET raced against the read deadline (ioredis queues commands while disconnected, which
+   * would otherwise stall every poll start through an outage). */
+  #getWithDeadline(key: string): Promise<string | null | typeof TIMED_OUT> {
+    return new Promise<string | null | typeof TIMED_OUT>((resolve, reject) => {
+      const timer = setTimeout(
+        () => resolve(TIMED_OUT),
+        this.options.getTimeoutMs ?? DEFAULT_GET_TIMEOUT_MS
+      );
+      timer.unref?.();
+      this.#ensureClient()
+        .get(key)
+        .then(
+          (value) => {
+            clearTimeout(timer);
+            resolve(value);
+          },
+          (error) => {
+            clearTimeout(timer);
+            reject(error);
+          }
+        );
+    });
+  }
+
+  /** Fire-and-forget write with the configured TTL; failures are logged and counted, never thrown. */
+  set(key: string, ms: number): void {
+    try {
+      this.#ensureClient()
+        .set(this.#key(key), String(ms), "PX", this.options.ttlMs)
+        .then(
+          () => this.options.onResult?.("set", true),
+          (error) => {
+            this.options.onResult?.("set", false);
+            logger.error("[replayCursorStore] failed to write a replay cursor", { error, key });
+          }
+        );
+    } catch (error) {
+      this.options.onResult?.("set", false);
+      logger.error("[replayCursorStore] failed to write a replay cursor", { error, key });
+    }
+  }
+
+  /** Close the connection if one was opened: bounded graceful QUIT, then a forced disconnect. */
+  async quit(): Promise<void> {
+    const client = this.#client;
+    this.#client = undefined;
+    if (!client) return;
+    let graceTimer: ReturnType<typeof setTimeout> | undefined;
+    try {
+      // Bounded graceful QUIT; cursor writes are best-effort, so force-close beyond it.
+      await Promise.race([
+        client.quit(),
+        new Promise<void>((resolve) => {
+          graceTimer = setTimeout(resolve, QUIT_GRACE_MS);
+        }),
+      ]);
+    } catch {
+      // force-close below
+    } finally {
+      // Don't leave the grace timer armed after a fast QUIT; it would keep the event loop
+      // alive for the rest of the grace period during shutdown.
+      if (graceTimer !== undefined) {
+        clearTimeout(graceTimer);
+      }
+    }
+    client.disconnect();
+  }
+
+  #key(key: string): string {
+    return `${this.options.keyPrefix ?? DEFAULT_KEY_PREFIX}${key}`;
+  }
+
+  #ensureClient(): RedisClient {
+    if (!this.#client) {
+      this.#client = createRedisClient(
+        this.options.connectionName ?? "trigger:realtime:replay-cursors",
+        this.options.redis
+      );
+    }
+    return this.#client;
+  }
+}
diff --git a/apps/webapp/app/services/realtime/resolveRealtimeStreamClient.server.ts b/apps/webapp/app/services/realtime/resolveRealtimeStreamClient.server.ts
new file mode 100644
index 00000000000..69ca81cf2cc
--- /dev/null
+++ b/apps/webapp/app/services/realtime/resolveRealtimeStreamClient.server.ts
@@ -0,0 +1,93 @@
+import { $replica } from "~/db.server";
+import { env } from "~/env.server";
+import { singleton } from "~/utils/singleton";
+import { FEATURE_FLAG } from "~/v3/featureFlags";
+import { makeFlag } from "~/v3/featureFlags.server";
+import { logger } from "../logger.server";
+import { type RealtimeEnvironment } from "../realtimeClient.server";
+import { realtimeClient } from "../realtimeClientGlobal.server";
+import { BoundedTtlCache } from "./boundedTtlCache";
+import { type RealtimeStreamClient } from "./nativeRealtimeClient.server";
+import { getNativeRealtimeClient } from "./nativeRealtimeClientInstance.server";
+import { getShadowRealtimeClient } from "./shadowRealtimeClientInstance.server";
+
+type RealtimeBackend = "electric" | "native" | "shadow";
+
+// Two gates, both defaulting to the Electric path: the env master switch, then the
+// per-org `realtimeBackend` feature flag (cached so long-polls don't hit the DB per request).
+const nativeBackendEnabled = env.REALTIME_BACKEND_NATIVE_ENABLED === "1";
+
+const flag = singleton("realtimeBackendFlag", () => makeFlag($replica));
+// NOTE(review): the cache's type argument was lost in extraction; `<RealtimeBackend>` is assumed
+// from the values stored below — confirm against `boundedTtlCache`.
+const backendCache = singleton(
+  "realtimeBackendCache",
+  () =>
+    new BoundedTtlCache<RealtimeBackend>(
+      env.REALTIME_BACKEND_FLAG_CACHE_TTL_MS,
+      env.REALTIME_BACKEND_FLAG_CACHE_MAX_ENTRIES
+    )
+);
+
+/**
+ * Pick the realtime client that serves this environment: the Electric client unless the env
+ * master switch is on AND the org's `realtimeBackend` flag resolves to "native" or "shadow".
+ */
+export async function resolveRealtimeStreamClient(
+  environment: RealtimeEnvironment & { organization?: { featureFlags?: unknown } }
+): Promise<RealtimeStreamClient> {
+  if (!nativeBackendEnabled) {
+    return realtimeClient;
+  }
+
+  // The authenticated environment already carries the org's feature flags; pass them
+  // through so a cache miss doesn't need an extra organization read.
+  const orgFeatureFlags = environment.organization
+    ? environment.organization.featureFlags ?? {}
+    : undefined;
+
+  switch (await getRealtimeBackend(environment.organizationId, orgFeatureFlags)) {
+    case "native":
+      return getNativeRealtimeClient();
+    case "shadow":
+      // The client is still served Electric; the native path is diffed in the background.
+      return getShadowRealtimeClient();
+    case "electric":
+    default:
+      return realtimeClient;
+  }
+}
+
+/**
+ * Resolve (and cache per organization) which backend the org is flagged onto. Any lookup
+ * failure falls back to "electric", and that fallback is cached like a normal result.
+ */
+async function getRealtimeBackend(
+  organizationId: string,
+  orgFeatureFlags: unknown | undefined
+): Promise<RealtimeBackend> {
+  const cached = backendCache.get(organizationId);
+  if (cached !== undefined) {
+    return cached;
+  }
+
+  let backend: RealtimeBackend = "electric";
+
+  try {
+    const overrides =
+      orgFeatureFlags !== undefined
+        ? orgFeatureFlags
+        : (
+            await $replica.organization.findFirst({
+              where: { id: organizationId },
+              select: { featureFlags: true },
+            })
+          )?.featureFlags;
+
+    backend = await flag({
+      key: FEATURE_FLAG.realtimeBackend,
+      defaultValue: "electric",
+      overrides: (overrides as Record<string, unknown>) ?? {},
+    });
+  } catch (error) {
+    // Never let a flag lookup failure break the realtime feed.
+    logger.error("[resolveRealtimeStreamClient] failed to resolve realtimeBackend flag", {
+      organizationId,
+      error,
+    });
+    backend = "electric";
+  }
+
+  backendCache.set(organizationId, backend);
+  return backend;
+}
diff --git a/apps/webapp/app/services/realtime/runChangeNotifier.server.ts b/apps/webapp/app/services/realtime/runChangeNotifier.server.ts
new file mode 100644
index 00000000000..f295c02d3f8
--- /dev/null
+++ b/apps/webapp/app/services/realtime/runChangeNotifier.server.ts
@@ -0,0 +1,325 @@
+import { createRedisClient, RedisClient, RedisWithClusterOptions } from "~/redis.server";
+import { logger } from "../logger.server";
+
+export const CHANGE_RECORD_VERSION = 1;
+
+/**
+ * A self-describing run-change fact published once to the run's environment channel; row state is
+ * never on the wire. `tags` present (even `[]`) marks a "full" record a feed can classify locally;
+ * `tags` absent marks a "partial" record (envId+runId only) a tag feed must hydrate to classify.
+ */
+export type ChangeRecord = {
+  v: number;
+  runId: string;
+  envId: string;
+  tags?: string[];
+  batchId?: string | null;
+  createdAtMs?: number;
+  updatedAtMs?: number;
+  status?: string;
+};
+
+/** What a publish site provides; the notifier stamps the version. */
+export type ChangeRecordInput = Omit<ChangeRecord, "v">;
+
+export function encodeChangeRecord(record: ChangeRecord): string {
+  return JSON.stringify(record);
+}
+
+/**
+ * Decode a wire message into a ChangeRecord; a bare/malformed frame degrades to a partial record
+ * rather than throwing.
+ *
+ * Every field is type-checked before it is copied: a field of the wrong type is dropped
+ * (`undefined`, or the `0` / `""` defaults for `v` / `envId`) instead of being passed through
+ * under a type it does not have. In particular a non-array `tags` yields a partial record, so a
+ * consumer never iterates something that is not a string array.
+ */
+export function decodeChangeRecord(message: string): ChangeRecord {
+  if (message.length === 0 || message[0] !== "{") {
+    return { v: 0, runId: message, envId: "" };
+  }
+  try {
+    const parsed = JSON.parse(message) as Record<string, unknown> | null;
+    if (parsed && typeof parsed.runId === "string") {
+      const { runId, v, envId, tags, batchId, createdAtMs, updatedAtMs, status } = parsed;
+      const isStringArray = Array.isArray(tags) && tags.every((tag) => typeof tag === "string");
+      return {
+        v: typeof v === "number" ? v : 0,
+        runId,
+        envId: typeof envId === "string" ? envId : "",
+        tags: isStringArray ? (tags as string[]) : undefined,
+        batchId: typeof batchId === "string" || batchId === null ? batchId : undefined,
+        createdAtMs: typeof createdAtMs === "number" ? createdAtMs : undefined,
+        updatedAtMs: typeof updatedAtMs === "number" ? updatedAtMs : undefined,
+        status: typeof status === "string" ? status : undefined,
+      };
+    }
+  } catch {
+    // fall through to the bare-runId fallback
+  }
+  return { v: 0, runId: message, envId: "" };
+}
+
+export type RunChangeNotifierOptions = {
+  redis: RedisWithClusterOptions;
+  /** Channel name prefix; the envId is appended inside a hash-tag for slot locality. */
+  channelPrefix?: string;
+  connectionName?: string;
+  /** Leading-edge throttle (ms) for the per-env channel, bounding the wake rate per env. Defaults to 100ms; 0 disables. */
+  envWakeCoalesceWindowMs?: number;
+  /** Use Redis sharded pub/sub (SSUBSCRIBE/SPUBLISH); cluster-only and requires `clusterOptions.shardedSubscribers`. Defaults to false (classic). */
+  shardedPubSub?: boolean;
+  /** Observability hook: a publish settled (ok) or failed (the leading degradation signal). */
+  onPublishResult?: (ok: boolean) => void;
+  /** Observability hook: a raw channel message arrived (pre-coalesce). */
+  onMessageReceived?: () => void;
+  /** Observability hook: a coalesced batch was delivered to listeners (records per batch). */
+  onBatchDelivered?: (recordCount: number) => void;
+};
+
+const DEFAULT_CHANNEL_PREFIX = "realtime:";
+const DEFAULT_ENV_WAKE_COALESCE_WINDOW_MS = 100;
+
+/**
+ * RunChangeNotifier — carries "run X changed" facts from write sites to the realtime feeds over ONE
+ * per-environment channel (`env:{envId}`, hash-tagged so an env stays on one cluster slot).
+ * Uses one shared multiplexed subscriber per process (refcounted), created lazily, and a fire-and-forget
+ * `publish` that never throws — a dropped publish only costs latency because the consumer has a backstop.
+ */
+export class RunChangeNotifier {
+  #publisher: RedisClient | undefined;
+  #subscriber: RedisClient | undefined;
+  /** Listener callbacks per channel; an entry exists from the first subscribe until UNSUBSCRIBE is confirmed. */
+  readonly #listeners = new Map<string, Set<(records: ChangeRecord[]) => void>>();
+  /** Per-channel accumulator of records since the last delivery, deduped by runId (latest per run wins), so a coalesced wake carries every run that moved. */
+  readonly #pending = new Map<string, Map<string, ChangeRecord>>();
+  readonly #channelPrefix: string;
+  readonly #connectionName: string;
+  readonly #coalesceWindowMs: number;
+  /** When true, use sharded pub/sub (SSUBSCRIBE/SPUBLISH/smessage) — see options. */
+  readonly #sharded: boolean;
+  /** Active coalescing windows per channel. */
+  readonly #coalesceTimers = new Map<string, ReturnType<typeof setTimeout>>();
+  /** Channels that received a message while their window was open (need a trailing wake). */
+  readonly #coalesceDirty = new Set<string>();
+
+  constructor(private readonly options: RunChangeNotifierOptions) {
+    this.#channelPrefix = options.channelPrefix ?? DEFAULT_CHANNEL_PREFIX;
+    this.#connectionName = options.connectionName ?? "trigger:realtime:run-change-notifier";
+    this.#coalesceWindowMs = options.envWakeCoalesceWindowMs ?? DEFAULT_ENV_WAKE_COALESCE_WINDOW_MS;
+    this.#sharded = options.shardedPubSub ?? false;
+  }
+
+  /** Fire-and-forget publish of a run-changed fact to the run's environment channel; never throws. */
+  publish(input: ChangeRecordInput): void {
+    const record: ChangeRecord = { v: CHANGE_RECORD_VERSION, ...input };
+    this.#publishToChannel(this.#channelForEnv(record.envId), encodeChangeRecord(record));
+  }
+
+  /** Fire-and-forget publish of many run-changed facts. Never throws. */
+  publishMany(inputs: ChangeRecordInput[]): void {
+    for (const input of inputs) {
+      this.publish(input);
+    }
+  }
+
+  /** Send one payload on one channel, reporting the outcome through `onPublishResult`; sync and async failures are logged, not thrown. */
+  #publishToChannel(channel: string, payload: string): void {
+    try {
+      const publisher = this.#ensurePublisher();
+      // Sharded pub/sub (SPUBLISH) routes to the channel's slot owner; classic PUBLISH
+      // broadcasts cluster-wide. The channel is hash-tagged by envId.
+      const result = this.#sharded
+        ? publisher.spublish(channel, payload)
+        : publisher.publish(channel, payload);
+      if (typeof (result as Promise<unknown>)?.then === "function") {
+        (result as Promise<unknown>).then(
+          () => this.options.onPublishResult?.(true),
+          (error) => {
+            this.options.onPublishResult?.(false);
+            logger.error("[runChangeNotifier] Failed to publish run-changed notification", {
+              error,
+              channel,
+            });
+          }
+        );
+      } else {
+        this.options.onPublishResult?.(true);
+      }
+    } catch (error) {
+      this.options.onPublishResult?.(false);
+      logger.error("[runChangeNotifier] Failed to publish run-changed notification", {
+        error,
+        channel,
+      });
+    }
+  }
+
+  /** Subscribe to an env's run-change stream; refcounted over the shared subscriber (first listener SUBSCRIBEs, last UNSUBSCRIBEs). Returns an idempotent unsubscribe function. */
+  subscribeToEnv(environmentId: string, onBatch: (records: ChangeRecord[]) => void): () => void {
+    const channel = this.#channelForEnv(environmentId);
+    const subscriber = this.#ensureSubscriber();
+
+    let listeners = this.#listeners.get(channel);
+    if (!listeners) {
+      listeners = new Set();
+      this.#listeners.set(channel, listeners);
+      this.#subscribeChannel(subscriber, channel).catch((error) => {
+        logger.error("[runChangeNotifier] Failed to subscribe to run-change channel", {
+          error,
+          channel,
+        });
+      });
+    }
+    listeners.add(onBatch);
+
+    let unsubscribed = false;
+    return () => {
+      if (unsubscribed) {
+        return;
+      }
+      unsubscribed = true;
+
+      const current = this.#listeners.get(channel);
+      if (!current) {
+        return;
+      }
+      current.delete(onBatch);
+      if (current.size === 0) {
+        // Drop the channel from the map only after Redis confirms UNSUBSCRIBE and no new listener re-subscribed in the meantime.
+        this.#unsubscribeChannel(subscriber, channel)
+          .then(() => {
+            const latest = this.#listeners.get(channel);
+            if (!latest) {
+              return;
+            }
+            if (latest.size === 0) {
+              this.#listeners.delete(channel);
+            } else {
+              // A listener arrived during the in-flight UNSUBSCRIBE; re-subscribe so it keeps receiving (the backstop covers the gap).
+              this.#subscribeChannel(subscriber, channel).catch((error) => {
+                logger.error("[runChangeNotifier] Failed to re-subscribe to run-change channel", {
+                  error,
+                  channel,
+                });
+              });
+            }
+          })
+          .catch((error) => {
+            // UNSUBSCRIBE failed (likely still subscribed in Redis): keep the empty map entry so a future subscriber reuses it.
+            logger.error("[runChangeNotifier] Failed to unsubscribe from run-change channel", {
+              error,
+              channel,
+            });
+          });
+      }
+    };
+  }
+
+  /** Number of distinct env channels currently subscribed (for metrics). */
+  get activeSubscriptionCount(): number {
+    return this.#listeners.size;
+  }
+
+  /** Cancel pending coalesce windows, drop buffered records, and close both Redis connections. */
+  async quit(): Promise<void> {
+    for (const timer of this.#coalesceTimers.values()) {
+      clearTimeout(timer);
+    }
+    this.#coalesceTimers.clear();
+    this.#coalesceDirty.clear();
+    this.#pending.clear();
+    await Promise.allSettled([this.#subscriber?.quit(), this.#publisher?.quit()]);
+    this.#subscriber = undefined;
+    this.#publisher = undefined;
+    this.#listeners.clear();
+  }
+
+  #ensurePublisher(): RedisClient {
+    if (!this.#publisher) {
+      this.#publisher = createRedisClient(`${this.#connectionName}:pub`, this.options.redis);
+    }
+    return this.#publisher;
+  }
+
+  #ensureSubscriber(): RedisClient {
+    if (!this.#subscriber) {
+      const subscriber = createRedisClient(`${this.#connectionName}:sub`, this.options.redis);
+      const onMessage = (channel: string, message: string) => this.#onMessage(channel, message);
+      // Classic pub/sub delivers "message"; sharded pub/sub delivers "smessage". Register
+      // both so the delivery path is identical regardless of mode.
+      subscriber.on("message", onMessage);
+      subscriber.on("smessage", onMessage);
+      this.#subscriber = subscriber;
+    }
+    return this.#subscriber;
+  }
+
+  /** SUBSCRIBE (classic) vs SSUBSCRIBE (sharded, cluster-only). */
+  #subscribeChannel(subscriber: RedisClient, channel: string): Promise<unknown> {
+    return this.#sharded ? subscriber.ssubscribe(channel) : subscriber.subscribe(channel);
+  }
+
+  /** UNSUBSCRIBE (classic) vs SUNSUBSCRIBE (sharded, cluster-only). */
+  #unsubscribeChannel(subscriber: RedisClient, channel: string): Promise<unknown> {
+    return this.#sharded ? subscriber.sunsubscribe(channel) : subscriber.unsubscribe(channel);
+  }
+
+  #onMessage(channel: string, message: string) {
+    this.options.onMessageReceived?.();
+    // Accumulate the decoded record (deduped by runId) before delivering, so a coalesced
+    // wake carries every run that moved during the window.
+    this.#addPending(channel, decodeChangeRecord(message));
+
+    if (this.#coalesceWindowMs > 0) {
+      this.#deliverCoalesced(channel);
+      return;
+    }
+    this.#deliver(channel);
+  }
+
+  /** Accumulate a record into the channel's pending batch, deduped by runId (a later
+   * record for the same run replaces the earlier one, keeping the freshest keys). */
+  #addPending(channel: string, record: ChangeRecord) {
+    let batch = this.#pending.get(channel);
+    if (!batch) {
+      batch = new Map<string, ChangeRecord>();
+      this.#pending.set(channel, batch);
+    }
+    batch.set(record.runId, record);
+  }
+
+  /** Drain the channel's pending batch and hand it to every listener; never throws. */
+  #deliver(channel: string) {
+    // Drain the accumulated batch (and clear it) so listeners woken now get every run that
+    // changed since the last delivery, and a later message starts a fresh batch.
+    const batchMap = this.#pending.get(channel);
+    const batch = batchMap ? [...batchMap.values()] : [];
+    this.#pending.delete(channel);
+
+    const listeners = this.#listeners.get(channel);
+    if (!listeners || batch.length === 0) {
+      return;
+    }
+    this.options.onBatchDelivered?.(batch.length);
+    // Iterate a snapshot so a listener that unsubscribes during delivery doesn't disturb the loop.
+    for (const onBatch of [...listeners]) {
+      try {
+        onBatch(batch);
+      } catch (error) {
+        // Isolate listeners from each other: one throwing listener must not starve the rest,
+        // skip opening the coalesce window, or throw out of the timer / Redis message callback.
+        logger.error("[runChangeNotifier] Run-change listener threw during delivery", {
+          error,
+          channel,
+        });
+      }
+    }
+  }
+
+  /** Leading-edge throttle capping the wake rate to ~1/window: deliver the first wake immediately, then one trailing wake per window while activity continues. Lossless. */
+  #deliverCoalesced(channel: string) {
+    if (this.#coalesceTimers.has(channel)) {
+      this.#coalesceDirty.add(channel);
+      return;
+    }
+    this.#deliver(channel);
+    this.#openCoalesceWindow(channel);
+  }
+
+  #openCoalesceWindow(channel: string) {
+    const timer = setTimeout(() => {
+      this.#coalesceTimers.delete(channel);
+      if (this.#coalesceDirty.delete(channel)) {
+        this.#deliver(channel);
+        this.#openCoalesceWindow(channel);
+      }
+    }, this.#coalesceWindowMs);
+    // Don't let a pending coalescing window hold the process open at shutdown.
+    timer.unref?.();
+    this.#coalesceTimers.set(channel, timer);
+  }
+
+  // Hash-tagged (`...{envId}`) so all of an env's traffic maps to one cluster slot (one
+  // shard) under sharded pub/sub.
+  #channelForEnv(environmentId: string): string {
+    return `${this.#channelPrefix}env:{${environmentId}}`;
+  }
+}
diff --git a/apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts b/apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts
new file mode 100644
index 00000000000..b7d90122db0
--- /dev/null
+++ b/apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts
@@ -0,0 +1,87 @@
+import { env } from "~/env.server";
+import { engine } from "~/v3/runEngine.server";
+import { logger } from "../logger.server";
+import { publishChangeRecord } from "./runChangeNotifierInstance.server";
+
+/**
+ * Builds and publishes a self-describing `ChangeRecord` for the lifecycle events whose engine-bus payload
+ * already carries env + tags + batchId.
Terminal transitions, runAttemptFailed, and runMetadataUpdated publish
+ * from `runEngineHandlers.server.ts` instead. Coverage isn't exhaustive — a dropped transition only adds latency
+ * because the consumer has a periodic backstop full-resolve. The env master switch is `REALTIME_BACKEND_NATIVE_ENABLED`.
+ */
+export function registerRunChangeNotifierHandlers() {
+  // Return truthy in every path so singleton() caches this factory and never re-runs it (re-running would attach duplicate engine-bus listeners on dev reload).
+  if (env.REALTIME_BACKEND_NATIVE_ENABLED !== "1") {
+    return true;
+  }
+
+  // Every event below publishes the same four fields, so they all share this one handler.
+  const publishRunChanged = (payload: {
+    run: { id: string; runTags?: string[]; batchId?: string | null };
+    environment: { id: string };
+  }) => {
+    const { run, environment } = payload;
+    publishChangeRecord({
+      runId: run.id,
+      envId: environment.id,
+      tags: run.runTags,
+      batchId: run.batchId,
+    });
+  };
+
+  // Run created: the first signal for a brand-new run (born QUEUED with no status transition), so it surfaces before ClickHouse ingests it.
+  engine.eventBus.on("runCreated", publishRunChanged);
+
+  // Status transitions (checkpoint suspend/resume, pending version, dequeue).
+  engine.eventBus.on("runStatusChanged", publishRunChanged);
+
+  // Dequeue/lock (sets startedAt) and attempt start (DEQUEUED -> EXECUTING) — the
+  // most-watched "my run started" transitions.
+  engine.eventBus.on("runLocked", publishRunChanged);
+  engine.eventBus.on("runAttemptStarted", publishRunChanged);
+
+  engine.eventBus.on("runRetryScheduled", publishRunChanged);
+
+  // Delay lifecycle (delayUntil / queued-after-delay changes).
+  engine.eventBus.on("runDelayRescheduled", publishRunChanged);
+  engine.eventBus.on("runEnqueuedAfterDelay", publishRunChanged);
+
+  logger.info("[runChangeNotifier] realtime change-record builder registered");
+
+  return true;
+}
diff --git a/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts b/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts
new file mode 100644
index 00000000000..b656052c339
--- /dev/null
+++ b/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts
@@ -0,0 +1,85 @@
+import { getMeter } from "@internal/tracing";
+import { env } from "~/env.server";
+import { singleton } from "~/utils/singleton";
+import { RunChangeNotifier, type ChangeRecordInput } from "./runChangeNotifier.server";
+
+/**
+ * Process-singleton wiring for the RunChangeNotifier plus the gated convenience functions write sites
+ * delegate to. The notifier is constructed lazily, so `REALTIME_BACKEND_NATIVE_ENABLED=0` (default) opens no Redis connections.
+ */
+const nativeBackendEnabled = env.REALTIME_BACKEND_NATIVE_ENABLED === "1";
+
+/** Build the notifier from the pub/sub Redis env settings and register its counters and gauge. */
+function initializeRunChangeNotifier(): RunChangeNotifier {
+  const clusterMode = env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_CLUSTER_MODE_ENABLED === "1";
+  // Sharded pub/sub only works against a cluster; classic pub/sub there would
+  // broadcast every message to every node, so this is what actually shards load.
+  const shardedPubSub = clusterMode && env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_SHARDED_ENABLED === "1";
+
+  const meter = getMeter("realtime-notifier");
+
+  const publishes = meter.createCounter("realtime_notifier.publishes", {
+    description:
+      "Change-record publishes by outcome. Failures are the leading indicator that feeds are degrading to their backstops (pub/sub Redis trouble).",
+  });
+
+  const received = meter.createCounter("realtime_notifier.messages_received", {
+    description: "Raw channel messages received by this instance's subscriber, pre-coalesce.",
+  });
+
+  const delivered = meter.createCounter("realtime_notifier.batches_delivered", {
+    description:
+      "Coalesced batches delivered to listeners. received/batches = the coalesce ratio (how hard a busy env is being collapsed).",
+  });
+
+  const notifier = new RunChangeNotifier({
+    redis: {
+      host: env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_HOST,
+      port: env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_PORT,
+      username: env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_USERNAME,
+      password: env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_PASSWORD,
+      tlsDisabled: env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_TLS_DISABLED === "true",
+      clusterMode,
+      // One subscriber connection per shard so SSUBSCRIBE routes to the slot owner.
+      ...(shardedPubSub ? { clusterOptions: { shardedSubscribers: true } } : {}),
+    },
+    envWakeCoalesceWindowMs: env.REALTIME_BACKEND_NATIVE_ENV_WAKE_COALESCE_WINDOW_MS,
+    shardedPubSub,
+    onPublishResult: (ok) => publishes.add(1, { result: ok ? "ok" : "error" }),
+    onMessageReceived: () => received.add(1),
+    onBatchDelivered: () => delivered.add(1),
+  });
+
+  meter
+    .createObservableGauge("realtime_notifier.active_subscriptions", {
+      description: "Distinct env channels currently subscribed for realtime change notifications.",
+    })
+    .addCallback((result) => result.observe(notifier.activeSubscriptionCount));
+
+  return notifier;
+}
+
+/** Lazily construct (and memoize) the notifier singleton. */
+export function getRunChangeNotifier(): RunChangeNotifier {
+  return singleton("runChangeNotifier", initializeRunChangeNotifier);
+}
+
+/** Whether the notifier subsystem is enabled for this process.
*/
+export function isRunChangeNotifierEnabled(): boolean {
+  return nativeBackendEnabled;
+}
+
+/** Fire-and-forget publish of a run-changed record. When the backend is disabled this does
+ * nothing and never constructs the notifier, so publish sites can call it unconditionally. */
+export function publishChangeRecord(input: ChangeRecordInput): void {
+  if (nativeBackendEnabled) {
+    getRunChangeNotifier().publish(input);
+  }
+}
+
+/** Batch form of `publishChangeRecord`, gated on the same switch. */
+export function publishManyChangeRecords(inputs: ChangeRecordInput[]): void {
+  if (nativeBackendEnabled) {
+    getRunChangeNotifier().publishMany(inputs);
+  }
+}
diff --git a/apps/webapp/app/services/realtime/runReader.server.ts b/apps/webapp/app/services/realtime/runReader.server.ts
new file mode 100644
index 00000000000..e8509d73de4
--- /dev/null
+++ b/apps/webapp/app/services/realtime/runReader.server.ts
@@ -0,0 +1,163 @@
+import { type Prisma, type PrismaClient } from "@trigger.dev/database";
+import { BoundedTtlCache } from "./boundedTtlCache";
+import { RESERVED_COLUMNS, type RealtimeRunRow } from "./electricStreamProtocol.server";
+
+/**
+ * RunReader — the pluggable read half of the native-backend realtime feed: ClickHouse is filter-only
+ * (resolves ids), Postgres always hydrates row columns. Owns the `RunHydrator` (by-id) and the
+ * `RunListResolver` interface (the tag/list filter -> id-set seam, implemented over ClickHouse).
+ */
+
+/** The TaskRun columns the realtime feed projects (mirrors DEFAULT_ELECTRIC_COLUMNS).
/**
 * Narrow `RUN_HYDRATOR_SELECT` to the columns the client did not ask to skip.
 *
 * Columns listed in `ALWAYS_HYDRATED_COLUMNS` are kept even when the client skips them.
 * With an empty skip list the shared `RUN_HYDRATOR_SELECT` object itself is returned;
 * otherwise a fresh select object is built, preserving the key order of `RUN_HYDRATOR_SELECT`.
 */
export function buildHydratorSelect(skipColumns: string[] = []): Prisma.TaskRunSelect {
  if (skipColumns.length === 0) {
    return RUN_HYDRATOR_SELECT;
  }

  const skipped = new Set(skipColumns);
  const keptColumns = Object.keys(RUN_HYDRATOR_SELECT).filter(
    (name) => ALWAYS_HYDRATED_COLUMNS.has(name) || !skipped.has(name)
  );

  return Object.fromEntries(keptColumns.map((name) => [name, true])) as Prisma.TaskRunSelect;
}
ClickHouse impl in `clickHouseRunListResolver.server.ts`. */ +export interface RunListResolver { + resolveMatchingRunIds(filter: RunListFilter): Promise; +} + +export type RunHydratorOptions = { + /** A read-replica Prisma client (`$replica`). Always Postgres. */ + replica: Pick; + /** Read-through cache TTL (ms) collapsing duplicate refetches for the same run. Set 0 to disable. Defaults to 250ms. */ + cacheTtlMs?: number; + /** Hard cap on cache entries before expired entries are swept. */ + maxCacheEntries?: number; +}; + +const DEFAULT_CACHE_TTL_MS = 250; +const DEFAULT_MAX_CACHE_ENTRIES = 5_000; + +/** Hydrates runs by id from the read replica, projected to the realtime columns; concurrent same-run refetches are single-flighted + short-TTL cached. */ +export class RunHydrator { + readonly #inflight = new Map>(); + readonly #cache: BoundedTtlCache; + readonly #cacheTtlMs: number; + + constructor(private readonly options: RunHydratorOptions) { + this.#cacheTtlMs = options.cacheTtlMs ?? DEFAULT_CACHE_TTL_MS; + this.#cache = new BoundedTtlCache( + this.#cacheTtlMs, + options.maxCacheEntries ?? DEFAULT_MAX_CACHE_ENTRIES + ); + } + + async getRunById(environmentId: string, runId: string): Promise { + const key = `${environmentId}:${runId}`; + + if (this.#cacheTtlMs > 0) { + // A cached null is a valid "run not found" hit; only undefined is a miss. + const cached = this.#cache.get(key); + if (cached !== undefined) { + return cached; + } + } + + const existing = this.#inflight.get(key); + if (existing) { + return existing; + } + + const promise = this.#fetch(environmentId, runId).finally(() => this.#inflight.delete(key)); + this.#inflight.set(key, promise); + + const row = await promise; + + if (this.#cacheTtlMs > 0) { + this.#cache.set(key, row); + } + + return row; + } + + /** Hydrate many runs by id in one query (order not guaranteed); `skipColumns` projects the SELECT so dropped columns aren't shipped. 
*/ + async hydrateByIds( + environmentId: string, + ids: string[], + skipColumns: string[] = [] + ): Promise { + if (ids.length === 0) { + return []; + } + const rows = await this.options.replica.taskRun.findMany({ + where: { + runtimeEnvironmentId: environmentId, + id: { in: ids }, + }, + select: buildHydratorSelect(skipColumns), + }); + return rows as unknown as RealtimeRunRow[]; + } + + async #fetch(environmentId: string, runId: string): Promise { + const run = await this.options.replica.taskRun.findFirst({ + where: { + id: runId, + runtimeEnvironmentId: environmentId, + }, + select: RUN_HYDRATOR_SELECT, + }); + + return (run ?? null) as RealtimeRunRow | null; + } +} diff --git a/apps/webapp/app/services/realtime/shadowCompare.server.ts b/apps/webapp/app/services/realtime/shadowCompare.server.ts new file mode 100644 index 00000000000..27831dd68a2 --- /dev/null +++ b/apps/webapp/app/services/realtime/shadowCompare.server.ts @@ -0,0 +1,283 @@ +import { + type ElectricColumnType, + RUN_ELECTRIC_COLUMNS, + serializeRunRow, +} from "./electricStreamProtocol.server"; +import { type RunHydrator, type RunListFilter, type RunListResolver } from "./runReader.server"; + +/** + * Dual-run shadow-compare: the client is always served the Electric response while this re-derives what + * the native backend would emit and diffs the two, to prove parity on real traffic before cutover. Checks + * serialization (semantic per-column compare, gated on same updatedAt so a changed row is "skew", not a + * divergence) and membership (emitted id-set, only on tag/batch initial snapshots). Pure but for the injected deps. 
+ */ + +export type ShadowFeed = "run" | "runs" | "batch"; + +type WireValue = Record; + +type ShapeMessage = { + key?: string; + value?: WireValue; + headers: { operation?: string; control?: string }; +}; + +const COLUMN_BY_NAME = new Map(RUN_ELECTRIC_COLUMNS.map((column) => [column.name, column])); + +export type ColumnDiff = { + runId: string; + column: string; + electric: string | null; + native: string | null; +}; + +export type ShadowCompareOutcome = { + feed: ShadowFeed; + /** Runs whose every emitted column matched (same-version). */ + serializationMatched: number; + /** Runs with at least one semantic column divergence (same-version). */ + serializationDiverged: number; + /** Runs that changed between Electric's emit and our refetch (not a divergence). */ + serializationSkew: number; + /** Per-column divergences (capped) for logging. */ + diffs: ColumnDiff[]; + /** Set membership (tag/batch initial snapshot only). undefined when not checked. */ + membershipMatch?: boolean; + missingInNative?: string[]; + extraInNative?: string[]; +}; + +export type ShadowCompareInput = { + feed: ShadowFeed; + /** The served Electric response body (a JSON array of messages, or "" / "[]"). */ + electricBody: string; + environment: { id: string }; + skipColumns: string[]; + /** True when this was an initial snapshot request (offset=-1); enables membership compare. */ + isInitialSnapshot: boolean; + /** When set (tag/batch initial snapshot), compare the resolved id-set. 
*/ + membershipFilter?: RunListFilter; +}; + +const MAX_DIFFS = 20; + +export class RealtimeShadowComparator { + constructor( + private readonly options: { runReader: RunHydrator; runListResolver: RunListResolver } + ) {} + + async compare(input: ShadowCompareInput): Promise { + const messages = parseBody(input.electricBody); + const changes = messages.filter( + (m): m is ShapeMessage & { value: WireValue } => + typeof m.headers?.operation === "string" && !!m.value && m.headers.operation !== "delete" + ); + + const outcome: ShadowCompareOutcome = { + feed: input.feed, + serializationMatched: 0, + serializationDiverged: 0, + serializationSkew: 0, + diffs: [], + }; + + // Bulk-hydrate every emitted run in one query rather than a per-message round + // trip, so shadow mode doesn't inflate the very replica load it's measuring. + const emittedIds = changes + .map((m) => m.value.id) + .filter((id): id is string => typeof id === "string"); + const hydrated = await this.options.runReader.hydrateByIds(input.environment.id, emittedIds); + const rowsById = new Map(hydrated.map((row) => [row.id, row])); + + for (const message of changes) { + const runId = message.value.id ?? undefined; + if (!runId) { + continue; + } + + const row = rowsById.get(runId); + if (!row) { + // Run no longer readable (deleted / replica miss). Not a serialization divergence. + outcome.serializationSkew++; + continue; + } + + const nativeValue = serializeRunRow(row, input.skipColumns); + + // Only compare rows at the same version; otherwise the row advanced between + // Electric's emit and our refetch (timing skew, not a divergence). + if (!sameInstant(message.value.updatedAt, nativeValue.updatedAt)) { + outcome.serializationSkew++; + continue; + } + + let rowDiverged = false; + for (const [column, electricRaw] of Object.entries(message.value)) { + const meta = COLUMN_BY_NAME.get(column); + if (!meta) { + continue; + } + const nativeRaw = nativeValue[column] ?? 
/**
 * Parse the served Electric response body into shape messages.
 * Blank input, invalid JSON, and any JSON value that is not an array all yield `[]`.
 */
function parseBody(body: string): ShapeMessage[] {
  const text = body.trim();
  if (!text) {
    return [];
  }
  try {
    const parsed = JSON.parse(text);
    return Array.isArray(parsed) ? (parsed as ShapeMessage[]) : [];
  } catch {
    return [];
  }
}

/** Status carries a known legacy rewrite (DEQUEUED -> EXECUTING) applied equally to
 * both paths for non-current API versions; treat them as equivalent. */
function normalizeStatus(value: string): string {
  return value === "DEQUEUED" ? "EXECUTING" : value;
}

/**
 * True when both wire timestamps denote the same instant.
 * Two nullish values count as equal; one nullish and one set do not.
 */
function sameInstant(a: string | null | undefined, b: string | null | undefined): boolean {
  if (a == null || b == null) {
    return a == null && b == null;
  }
  // Mirror the SDK's RawShapeDate (`new Date(val + "Z")`).
  return new Date(`${a}Z`).getTime() === new Date(`${b}Z`).getTime();
}

/** Matches a plain (optionally negative) base-10 integer literal. */
const INTEGER_LITERAL = /^-?\d+$/;

/**
 * Semantic comparison of one column's wire value from Electric against the native one.
 *
 * - null on both sides is equal; null on one side only is a divergence.
 * - array columns (`dims > 0`) are compared element-wise after parsing the Postgres literal.
 * - otherwise the comparison is chosen by column type; `status` text additionally goes
 *   through `normalizeStatus`.
 */
function valuesEqual(
  electricRaw: string | null,
  nativeRaw: string | null,
  type: ElectricColumnType,
  dims: number | undefined,
  column: string
): boolean {
  if (electricRaw == null || nativeRaw == null) {
    return electricRaw == null && nativeRaw == null;
  }

  if (dims && dims > 0) {
    return arraysEqual(parsePgTextArray(electricRaw), parsePgTextArray(nativeRaw));
  }

  switch (type) {
    case "timestamp":
      return new Date(`${electricRaw}Z`).getTime() === new Date(`${nativeRaw}Z`).getTime();
    case "bool":
      return parseBool(electricRaw) === parseBool(nativeRaw);
    case "int8":
      // int8 can exceed Number.MAX_SAFE_INTEGER, where Number() collapses distinct
      // values into the same double. Compare exactly when both sides are plain integers.
      if (INTEGER_LITERAL.test(electricRaw) && INTEGER_LITERAL.test(nativeRaw)) {
        return BigInt(electricRaw) === BigInt(nativeRaw);
      }
      return Number(electricRaw) === Number(nativeRaw);
    case "int4":
    case "float8":
      return Number(electricRaw) === Number(nativeRaw);
    case "jsonb":
      return jsonEqual(electricRaw, nativeRaw);
    case "text":
    default:
      if (column === "status") {
        return normalizeStatus(electricRaw) === normalizeStatus(nativeRaw);
      }
      return electricRaw === nativeRaw;
  }
}

/** Interpret a wire boolean: `"t"` and `"true"` are true, anything else is false. */
function parseBool(value: string): boolean {
  return value === "t" || value === "true";
}

/**
 * Compare two JSON texts structurally (key order and whitespace are ignored).
 * If either side fails to parse, falls back to exact string equality.
 */
function jsonEqual(a: string, b: string): boolean {
  try {
    return deepEqual(JSON.parse(a), JSON.parse(b));
  } catch {
    return a === b;
  }
}

/**
 * Structural equality for JSON-shaped values (primitives, arrays, plain objects).
 *
 * An array never equals a non-array object: without that check `[]` and `{}` would
 * both reach the key comparison with zero own keys and be reported as equal.
 */
function deepEqual(a: unknown, b: unknown): boolean {
  if (a === b) return true;
  if (typeof a !== "object" || typeof b !== "object" || a === null || b === null) {
    return false;
  }

  const aIsArray = Array.isArray(a);
  if (aIsArray !== Array.isArray(b)) {
    return false;
  }

  if (aIsArray) {
    const left = a as unknown[];
    const right = b as unknown[];
    return left.length === right.length && left.every((v, i) => deepEqual(v, right[i]));
  }

  const left = a as Record<string, unknown>;
  const right = b as Record<string, unknown>;
  const leftKeys = Object.keys(left);
  if (leftKeys.length !== Object.keys(right).length) {
    return false;
  }
  // Same key count + every left key present on the right => identical key sets.
  return leftKeys.every(
    (k) => Object.prototype.hasOwnProperty.call(right, k) && deepEqual(left[k], right[k])
  );
}

/** Element-wise, order-sensitive equality of two string arrays. */
function arraysEqual(a: string[], b: string[]): boolean {
  return a.length === b.length && a.every((v, i) => v === b[i]);
}
/**
 * Parse a Postgres text-array literal (`{"a","b"}` / `{}`) into its elements.
 * Mirrors the client's pgArrayParser.
 *
 * - `{}` and the empty string yield `[]`.
 * - Surrounding braces are stripped only when both are present.
 * - Quoted elements may contain commas; a backslash escapes the next character.
 * - Unquoted elements run up to the next comma and are returned verbatim.
 *
 * A dangling backslash at the very end of the input is dropped, rather than reading
 * past the end and appending the text "undefined" to the element.
 */
function parsePgTextArray(literal: string): string[] {
  if (literal === "{}" || literal === "") {
    return [];
  }
  const inner = literal.startsWith("{") && literal.endsWith("}") ? literal.slice(1, -1) : literal;
  const elements: string[] = [];
  let i = 0;
  while (i < inner.length) {
    let element = "";
    if (inner[i] === '"') {
      i++; // opening quote
      while (i < inner.length && inner[i] !== '"') {
        if (inner[i] === "\\") {
          i++; // skip the escape character, take the next one literally
          if (i >= inner.length) {
            break; // dangling escape: nothing left to take
          }
        }
        element += inner[i];
        i++;
      }
      i++; // closing quote
    } else {
      while (i < inner.length && inner[i] !== ",") {
        element += inner[i];
        i++;
      }
    }
    elements.push(element);
    if (inner[i] === ",") i++;
  }
  return elements;
}
*/ + maxListResults: number; + /** Metrics sink for compare outcomes. */ + onOutcome?: (outcome: ShadowCompareOutcome) => void; +}; + +/** Transparent wrapper that serves the Electric response unchanged and, in the background (fire-and-forget), diffs what the native backend would emit. */ +export class ShadowRealtimeClient implements RealtimeStreamClient { + constructor(private readonly options: ShadowRealtimeClientOptions) {} + + async streamRun( + url: URL | string, + environment: RealtimeEnvironment, + runId: string, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise { + const response = await this.options.electric.streamRun( + url, + environment, + runId, + apiVersion, + requestOptions, + clientVersion, + signal + ); + this.#shadow("run", response, url, environment, requestOptions); + return response; + } + + async streamRuns( + url: URL | string, + environment: RealtimeListEnvironment, + params: RealtimeRunsParams, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise { + const response = await this.options.electric.streamRuns( + url, + environment, + params, + apiVersion, + requestOptions, + clientVersion, + signal + ); + this.#shadow("runs", response, url, environment, requestOptions, { tags: params.tags ?? 
[] }); + return response; + } + + async streamBatch( + url: URL | string, + environment: RealtimeListEnvironment, + batchId: string, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise { + const response = await this.options.electric.streamBatch( + url, + environment, + batchId, + apiVersion, + requestOptions, + clientVersion, + signal + ); + this.#shadow("batch", response, url, environment, requestOptions, { batchId }); + return response; + } + + /** Fire-and-forget; never blocks the served response, never throws into the request. */ + #shadow( + feed: ShadowFeed, + electricResponse: Response, + url: URL | string, + environment: RealtimeEnvironment & { projectId?: string }, + requestOptions?: RealtimeRequestOptions, + membership?: { tags?: string[]; batchId?: string } + ): void { + // Clone synchronously before the client consumes the body. + let bodyClone: Response; + try { + if (electricResponse.status !== 200) { + return; + } + bodyClone = electricResponse.clone(); + } catch { + return; + } + + void this.#runShadow(feed, bodyClone, url, environment, requestOptions, membership).catch( + (error) => logger.debug("[shadowRealtime] compare failed", { feed, error }) + ); + } + + async #runShadow( + feed: ShadowFeed, + bodyClone: Response, + url: URL | string, + environment: RealtimeEnvironment & { projectId?: string }, + requestOptions: RealtimeRequestOptions | undefined, + membership: { tags?: string[]; batchId?: string } | undefined + ): Promise { + const $url = new URL(url.toString()); + const offset = $url.searchParams.get("offset") ?? "-1"; + const handle = $url.searchParams.get("handle") ?? 
/**
 * Work out which columns the client asked to skip.
 *
 * `requestOptions.skipColumns` takes precedence; otherwise the comma-separated
 * `skipColumns` query parameter is used; otherwise nothing is skipped. Entries are
 * trimmed, and blank entries and `RESERVED_COLUMNS` are dropped from the result.
 */
function resolveSkipColumns(url: URL, requestOptions?: RealtimeRequestOptions): string[] {
  let candidates = requestOptions?.skipColumns;
  if (candidates == null) {
    candidates = url.searchParams.get("skipColumns")?.split(",") ?? [];
  }

  const resolved: string[] = [];
  for (const candidate of candidates) {
    const column = candidate.trim();
    if (column === "" || RESERVED_COLUMNS.includes(column)) {
      continue;
    }
    resolved.push(column);
  }
  return resolved;
}
kind=serialization|membership, result=match|diverge|skew.", + }); + + const comparator = new RealtimeShadowComparator({ + runReader: new RunHydrator({ replica: $replica }), + runListResolver: new ClickHouseRunListResolver({ + getClickhouse: (organizationId) => + clickhouseFactory.getClickhouseForOrganization(organizationId, "realtime"), + prisma: $replica, + }), + }); + + return new ShadowRealtimeClient({ + electric: realtimeClient, + comparator, + maximumCreatedAtFilterAgeMs: env.REALTIME_MAXIMUM_CREATED_AT_FILTER_AGE_IN_MS, + maxListResults: env.REALTIME_BACKEND_NATIVE_MAX_LIST_RESULTS, + onOutcome: (outcome) => { + const { feed } = outcome; + if (outcome.serializationMatched) { + compares.add(outcome.serializationMatched, { feed, kind: "serialization", result: "match" }); + } + if (outcome.serializationDiverged) { + compares.add(outcome.serializationDiverged, { + feed, + kind: "serialization", + result: "diverge", + }); + } + if (outcome.serializationSkew) { + compares.add(outcome.serializationSkew, { feed, kind: "serialization", result: "skew" }); + } + if (outcome.membershipMatch !== undefined) { + compares.add(1, { + feed, + kind: "membership", + result: outcome.membershipMatch ? 
"match" : "diverge", + }); + } + }, + }); +} + +export function getShadowRealtimeClient(): ShadowRealtimeClient { + return singleton("shadowRealtimeClient", initializeShadowRealtimeClient); +} diff --git a/apps/webapp/app/services/runsRepository/clickhouseRunsRepository.server.ts b/apps/webapp/app/services/runsRepository/clickhouseRunsRepository.server.ts index fcf1c811d70..304777b39e0 100644 --- a/apps/webapp/app/services/runsRepository/clickhouseRunsRepository.server.ts +++ b/apps/webapp/app/services/runsRepository/clickhouseRunsRepository.server.ts @@ -321,7 +321,9 @@ function applyRunFiltersToQueryBuilder( } if (options.tags && options.tags.length > 0) { - queryBuilder.where("hasAny(tags, {tags: Array(String)})", { tags: options.tags }); + // Both hasAny and hasAll are served by the tags bloom_filter skip index. + const tagsFn = options.tagsMatch === "all" ? "hasAll" : "hasAny"; + queryBuilder.where(`${tagsFn}(tags, {tags: Array(String)})`, { tags: options.tags }); } if (options.scheduleId) { diff --git a/apps/webapp/app/services/runsRepository/runsRepository.server.ts b/apps/webapp/app/services/runsRepository/runsRepository.server.ts index f4eeb5466d0..74963bc3ff2 100644 --- a/apps/webapp/app/services/runsRepository/runsRepository.server.ts +++ b/apps/webapp/app/services/runsRepository/runsRepository.server.ts @@ -30,6 +30,8 @@ const RunListInputOptionsSchema = z.object({ versions: z.array(z.string()).optional(), statuses: z.array(RunStatus).optional(), tags: z.array(z.string()).optional(), + // "any" (default) = run has at least one of `tags`; "all" = run has every tag. 
+ tagsMatch: z.enum(["any", "all"]).optional(), scheduleId: z.string().optional(), period: z.string().optional(), from: z.number().optional(), diff --git a/apps/webapp/app/v3/featureFlags.ts b/apps/webapp/app/v3/featureFlags.ts index 9a5d75cfe25..3066f2dda01 100644 --- a/apps/webapp/app/v3/featureFlags.ts +++ b/apps/webapp/app/v3/featureFlags.ts @@ -10,6 +10,7 @@ export const FEATURE_FLAG = { hasPrivateConnections: "hasPrivateConnections", mollifierEnabled: "mollifierEnabled", workerQueueScheduledSplitEnabled: "workerQueueScheduledSplitEnabled", + realtimeBackend: "realtimeBackend", } as const; export const FeatureFlagCatalog = { @@ -22,6 +23,10 @@ export const FeatureFlagCatalog = { [FEATURE_FLAG.hasPrivateConnections]: z.coerce.boolean(), [FEATURE_FLAG.mollifierEnabled]: z.coerce.boolean(), [FEATURE_FLAG.workerQueueScheduledSplitEnabled]: z.coerce.boolean(), + // Which backend serves the realtime run feed. Controllable + // globally and per-org (org wins). Defaults to "electric" when unset. + // "shadow" serves Electric but diffs the native path in the background. 
+ [FEATURE_FLAG.realtimeBackend]: z.enum(["electric", "native", "shadow"]), }; export type FeatureFlagKey = keyof typeof FeatureFlagCatalog; diff --git a/apps/webapp/app/v3/runEngineHandlers.server.ts b/apps/webapp/app/v3/runEngineHandlers.server.ts index 3277d74ba6e..c8d9240154e 100644 --- a/apps/webapp/app/v3/runEngineHandlers.server.ts +++ b/apps/webapp/app/v3/runEngineHandlers.server.ts @@ -20,11 +20,12 @@ import { createExceptionPropertiesFromError } from "./eventRepository/common.ser import { getEventRepositoryForStore, recordRunDebugLog } from "./eventRepository/index.server"; import { roomFromFriendlyRunId, socketIo } from "./handleSocketIo.server"; import { engine } from "./runEngine.server"; +import { publishChangeRecord } from "~/services/realtime/runChangeNotifierInstance.server"; import { PerformTaskRunAlertsService } from "./services/alerts/performTaskRunAlerts.server"; import { TaskRunErrorCodes } from "@trigger.dev/core/v3"; export function registerRunEngineEventBusHandlers() { - engine.eventBus.on("runSucceeded", async ({ time, run, organization }) => { + engine.eventBus.on("runSucceeded", async ({ time, run, organization, environment }) => { const [taskRunError, taskRun] = await tryCatch( $replica.taskRun.findFirstOrThrow({ where: { @@ -45,6 +46,11 @@ export function registerRunEngineEventBusHandlers() { isTest: true, organizationId: true, taskEventStore: true, + // Piggyback the realtime run-changed publish on this existing read so the + // per-env channel carries the membership keys (no separate query). No-op when + // the native backend is disabled. + runTags: true, + batchId: true, }, }) ); @@ -57,6 +63,13 @@ export function registerRunEngineEventBusHandlers() { return; } + publishChangeRecord({ + runId: taskRun.id, + envId: environment.id, + tags: taskRun.runTags, + batchId: taskRun.batchId, + }); + const eventRepository = await getEventRepositoryForStore( run.taskEventStore, taskRun.organizationId ?? 
organization.id @@ -91,7 +104,7 @@ export function registerRunEngineEventBusHandlers() { }); // Handle events - engine.eventBus.on("runFailed", async ({ time, run, organization }) => { + engine.eventBus.on("runFailed", async ({ time, run, organization, environment }) => { const sanitizedError = sanitizeError(run.error); const exception = createExceptionPropertiesFromError(sanitizedError); @@ -115,6 +128,10 @@ export function registerRunEngineEventBusHandlers() { isTest: true, organizationId: true, taskEventStore: true, + // Piggyback the realtime run-changed publish on this existing read (no-op when + // the native backend is disabled). + runTags: true, + batchId: true, }, }) ); @@ -127,6 +144,13 @@ export function registerRunEngineEventBusHandlers() { return; } + publishChangeRecord({ + runId: taskRun.id, + envId: environment.id, + tags: taskRun.runTags, + batchId: taskRun.batchId, + }); + const eventRepository = await getEventRepositoryForStore( run.taskEventStore, taskRun.organizationId ?? organization.id @@ -172,6 +196,10 @@ export function registerRunEngineEventBusHandlers() { isTest: true, organizationId: true, taskEventStore: true, + // Piggyback the realtime run-changed publish on this existing read (no-op when + // the native backend is disabled). 
+ runTags: true, + batchId: true, }, }) ); @@ -184,6 +212,13 @@ export function registerRunEngineEventBusHandlers() { return; } + publishChangeRecord({ + runId: taskRun.id, + envId: taskRun.runtimeEnvironmentId, + tags: taskRun.runTags, + batchId: taskRun.batchId, + }); + if (!taskRun.organizationId) { logger.error("[runAttemptFailed] Task run has no organization id", { runId: run.id, @@ -328,7 +363,7 @@ export function registerRunEngineEventBusHandlers() { } ); - engine.eventBus.on("runExpired", async ({ time, run, organization }) => { + engine.eventBus.on("runExpired", async ({ time, run, organization, environment }) => { if (!run.ttl) { return; } @@ -353,6 +388,10 @@ export function registerRunEngineEventBusHandlers() { isTest: true, organizationId: true, taskEventStore: true, + // Piggyback the realtime run-changed publish on this existing read (no-op when + // the native backend is disabled). + runTags: true, + batchId: true, }, }) ); @@ -365,6 +404,13 @@ export function registerRunEngineEventBusHandlers() { return; } + publishChangeRecord({ + runId: taskRun.id, + envId: environment.id, + tags: taskRun.runTags, + batchId: taskRun.batchId, + }); + const eventRepository = await getEventRepositoryForStore( taskRun.taskEventStore, taskRun.organizationId ?? organization.id @@ -386,7 +432,7 @@ export function registerRunEngineEventBusHandlers() { } }); - engine.eventBus.on("runCancelled", async ({ time, run, organization }) => { + engine.eventBus.on("runCancelled", async ({ time, run, organization, environment }) => { const [taskRunError, taskRun] = await tryCatch( $replica.taskRun.findFirstOrThrow({ where: { @@ -407,6 +453,10 @@ export function registerRunEngineEventBusHandlers() { isTest: true, organizationId: true, taskEventStore: true, + // Piggyback the realtime run-changed publish on this existing read (no-op when + // the native backend is disabled). 
+ runTags: true, + batchId: true, }, }) ); @@ -419,6 +469,13 @@ export function registerRunEngineEventBusHandlers() { return; } + publishChangeRecord({ + runId: taskRun.id, + envId: environment.id, + tags: taskRun.runTags, + batchId: taskRun.batchId, + }); + const eventRepository = await getEventRepositoryForStore( taskRun.taskEventStore, taskRun.organizationId ?? organization.id @@ -505,15 +562,20 @@ export function registerRunEngineEventBusHandlers() { }); engine.eventBus.on("runMetadataUpdated", async ({ time, run }) => { - const env = await findEnvironmentFromRun(run.id); + const result = await findEnvironmentFromRun(run.id); - if (!env) { + if (!result) { logger.error("[runMetadataUpdated] Failed to find environment", { runId: run.id }); return; } + const { environment, runTags, batchId } = result; + try { - await updateMetadataService.call(run.id, run.metadata, env); + await updateMetadataService.call(run.id, run.metadata, environment); + // Realtime run-changed publish, after the write so the router's hydrate sees the new + // row. A full record (env + tags + batchId), so feeds route by index. 
+ publishChangeRecord({ runId: run.id, envId: environment.id, tags: runTags, batchId }); } catch (e) { if (e instanceof MetadataTooLargeError) { logger.warn("[runMetadataUpdated] Failed to update metadata, too large", { diff --git a/apps/webapp/package.json b/apps/webapp/package.json index 162a9ede9a0..efebaf48207 100644 --- a/apps/webapp/package.json +++ b/apps/webapp/package.json @@ -163,7 +163,7 @@ "humanize-duration": "^3.27.3", "input-otp": "^1.4.2", "intl-parse-accept-language": "^1.0.0", - "ioredis": "^5.3.2", + "ioredis": "~5.6.0", "isbot": "^3.6.5", "jose": "^5.4.0", "json-stable-stringify": "^1.3.0", diff --git a/apps/webapp/test/realtime/boundedTtlCache.test.ts b/apps/webapp/test/realtime/boundedTtlCache.test.ts new file mode 100644 index 00000000000..a3fb0b1e425 --- /dev/null +++ b/apps/webapp/test/realtime/boundedTtlCache.test.ts @@ -0,0 +1,52 @@ +import { afterEach, describe, expect, it, vi } from "vitest"; +import { BoundedTtlCache } from "~/services/realtime/boundedTtlCache"; + +describe("BoundedTtlCache", () => { + afterEach(() => { + vi.useRealTimers(); + }); + + it("returns a live entry within its TTL", () => { + vi.useFakeTimers(); + const cache = new BoundedTtlCache(1_000, 100); + cache.set("k", "v"); + vi.advanceTimersByTime(500); + expect(cache.get("k")).toBe("v"); + expect(cache.size).toBe(1); + }); + + it("evicts an expired entry on read instead of letting it linger", () => { + vi.useFakeTimers(); + const cache = new BoundedTtlCache(1_000, 100); + cache.set("a", 1); + expect(cache.size).toBe(1); + + vi.advanceTimersByTime(1_001); + expect(cache.get("a")).toBeUndefined(); + // The previous bug left expired entries in the map until an at-capacity sweep; + // they must now be removed on read. 
+ expect(cache.size).toBe(0); + }); + + it("does not evict another entry when updating an existing key at capacity", () => { + const cache = new BoundedTtlCache(60_000, 2); + cache.set("a", 1); + cache.set("b", 2); + // Updating an existing key doesn't grow the map, so it must not drop "b". + cache.set("a", 11); + expect(cache.get("a")).toBe(11); + expect(cache.get("b")).toBe(2); + expect(cache.size).toBe(2); + }); + + it("drops the oldest entry when full of still-live entries", () => { + const cache = new BoundedTtlCache(60_000, 2); + cache.set("a", 1); + cache.set("b", 2); + cache.set("c", 3); // over capacity, none expired -> evict oldest insertion (a) + expect(cache.get("a")).toBeUndefined(); + expect(cache.get("b")).toBe(2); + expect(cache.get("c")).toBe(3); + expect(cache.size).toBe(2); + }); +}); diff --git a/apps/webapp/test/realtime/electricStreamProtocol.test.ts b/apps/webapp/test/realtime/electricStreamProtocol.test.ts new file mode 100644 index 00000000000..a48f4f9f8e8 --- /dev/null +++ b/apps/webapp/test/realtime/electricStreamProtocol.test.ts @@ -0,0 +1,304 @@ +import { SubscribeRunRawShape } from "@trigger.dev/core/v3/schemas"; +import { describe, expect, it } from "vitest"; +import { + buildElectricSchemaHeader, + buildRowsBody, + buildSnapshotBody, + buildUpdateBody, + buildUpToDateBody, + encodeOffset, + parseOffsetUpdatedAtMs, + type RealtimeRunRow, + rewriteBodyForLegacyApiVersion, + serializeRunRow, +} from "~/services/realtime/electricStreamProtocol.server"; + +function sampleRow(overrides: Partial = {}): RealtimeRunRow { + return { + id: "run_abc123", + taskIdentifier: "my-task", + createdAt: new Date("2026-06-06T10:00:00.000Z"), + updatedAt: new Date("2026-06-06T10:05:30.123Z"), + startedAt: new Date("2026-06-06T10:01:00.000Z"), + delayUntil: null, + queuedAt: new Date("2026-06-06T10:00:30.000Z"), + expiredAt: null, + completedAt: null, + friendlyId: "run_friendly_abc", + number: 42, + isTest: true, + status: "EXECUTING", + usageDurationMs: 
1234, + costInCents: 0.55, + baseCostInCents: 0.25, + ttl: "1h", + payload: '{"hello":"world"}', + payloadType: "application/json", + metadata: '{"step":1}', + metadataType: "application/json", + output: null, + outputType: "application/json", + runTags: ["user:123", "env:prod"], + error: null, + realtimeStreams: [], + ...overrides, + }; +} + +/** + * Faithful re-implementation of the @electric-sql/client value parser rules + * (defaultParser + pgArrayParser), so we can decode our wire `value` object the + * same way the deployed client would, then validate against the real SDK schema. + * Source: @electric-sql/client@1.0.14 src/parser.ts. + */ +function electricParse( + value: Record, + schema: Record +): Record { + const out: Record = {}; + for (const [key, raw] of Object.entries(value)) { + if (raw === null) { + out[key] = null; + continue; + } + const info = schema[key]; + if (!info) { + out[key] = raw; + continue; + } + if (info.dims && info.dims > 0) { + out[key] = parsePgTextArray(raw); + continue; + } + switch (info.type) { + case "bool": + out[key] = raw === "t" || raw === "true"; + break; + case "int8": + out[key] = BigInt(raw); + break; + case "int2": + case "int4": + case "float4": + case "float8": + out[key] = Number(raw); + break; + case "json": + case "jsonb": + out[key] = JSON.parse(raw); + break; + default: + out[key] = raw; // text/timestamp pass through as strings + } + } + return out; +} + +function parsePgTextArray(literal: string): string[] { + if (literal === "{}") { + return []; + } + const inner = literal.slice(1, -1); + const result: string[] = []; + let i = 0; + while (i < inner.length) { + if (inner[i] === '"') { + i++; + let s = ""; + while (i < inner.length && inner[i] !== '"') { + if (inner[i] === "\\") { + i++; + } + s += inner[i]; + i++; + } + result.push(s); + i++; // closing quote + if (inner[i] === ",") i++; + } else { + let s = ""; + while (i < inner.length && inner[i] !== ",") { + s += inner[i]; + i++; + } + result.push(s); + 
if (inner[i] === ",") i++; + } + } + return result; +} + +describe("electricStreamProtocol serializer", () => { + it("encodes each Postgres type the way the Electric client expects", () => { + const value = serializeRunRow(sampleRow()); + + // text: passed through as-is + expect(value.id).toBe("run_abc123"); + expect(value.status).toBe("EXECUTING"); + expect(value.payload).toBe('{"hello":"world"}'); + + // int/float: stringified + expect(value.number).toBe("42"); + expect(value.usageDurationMs).toBe("1234"); + expect(value.costInCents).toBe("0.55"); + + // bool: postgres "t"/"f" + expect(value.isTest).toBe("t"); + + // timestamp: ISO without trailing Z (the SDK appends Z before parsing) + expect(value.updatedAt).toBe("2026-06-06T10:05:30.123"); + expect(value.createdAt).toBe("2026-06-06T10:00:00.000"); + + // nullable timestamp: null stays null + expect(value.delayUntil).toBeNull(); + expect(value.completedAt).toBeNull(); + + // text[]: quoted pg array literal; empty realtimeStreams (@default([])) => {} + expect(value.runTags).toBe('{"user:123","env:prod"}'); + expect(value.realtimeStreams).toBe("{}"); + + // jsonb: null stays null + expect(value.error).toBeNull(); + }); + + it("encodes an empty no-default array column (runTags) as null, matching Electric", () => { + // runTags has no Postgres default, so an empty value is stored as SQL NULL and + // Electric emits `null` (not `{}`). realtimeStreams has @default([]), so its + // empty value is `{}`. Prisma hands us `[]` for both; we re-derive the wire form. 
+ const value = serializeRunRow(sampleRow({ runTags: [], realtimeStreams: [] })); + expect(value.runTags).toBeNull(); + expect(value.realtimeStreams).toBe("{}"); + }); + + it("encodes jsonb error as a JSON string", () => { + const value = serializeRunRow(sampleRow({ error: { type: "STRING_ERROR", raw: "boom" } })); + expect(value.error).toBe('{"type":"STRING_ERROR","raw":"boom"}'); + }); + + it("round-trips through the client parser into a valid SubscribeRunRawShape", () => { + const row = sampleRow({ error: { type: "STRING_ERROR", raw: "boom" } }); + const value = serializeRunRow(row); + const schema = JSON.parse(buildElectricSchemaHeader()); + + const decoded = electricParse(value, schema); + const parsed = SubscribeRunRawShape.parse(decoded); + + expect(parsed.id).toBe("run_abc123"); + expect(parsed.friendlyId).toBe("run_friendly_abc"); + expect(parsed.status).toBe("EXECUTING"); + expect(parsed.number).toBe(42); + expect(parsed.isTest).toBe(true); + expect(parsed.usageDurationMs).toBe(1234); + expect(parsed.costInCents).toBeCloseTo(0.55); + expect(parsed.runTags).toEqual(["user:123", "env:prod"]); + expect(parsed.realtimeStreams).toEqual([]); + // RawShapeDate appends "Z" and coerces to a Date equal to the source instant. + expect(parsed.createdAt.toISOString()).toBe("2026-06-06T10:00:00.000Z"); + expect(parsed.updatedAt.toISOString()).toBe("2026-06-06T10:05:30.123Z"); + expect(parsed.startedAt?.toISOString()).toBe("2026-06-06T10:01:00.000Z"); + expect(parsed.delayUntil ?? 
null).toBeNull(); + expect(parsed.error).toEqual({ type: "STRING_ERROR", raw: "boom" }); + }); + + it("honors skipColumns (but never the reserved columns)", () => { + const value = serializeRunRow(sampleRow(), ["payload", "output", "id", "status"]); + expect(value.payload).toBeUndefined(); + expect(value.output).toBeUndefined(); + // reserved columns can't be skipped + expect(value.id).toBe("run_abc123"); + expect(value.status).toBe("EXECUTING"); + + const schema = JSON.parse(buildElectricSchemaHeader(["payload"])); + expect(schema.payload).toBeUndefined(); + expect(schema.status).toBeDefined(); + }); +}); + +describe("electricStreamProtocol message bodies", () => { + it("emits insert + up-to-date for an initial snapshot", () => { + const messages = JSON.parse(buildSnapshotBody(sampleRow())); + expect(messages).toHaveLength(2); + expect(messages[0].headers.operation).toBe("insert"); + expect(messages[0].key).toBe('"public"."TaskRun"/"run_abc123"'); + expect(messages[0].value.status).toBe("EXECUTING"); + expect(messages[1].headers.control).toBe("up-to-date"); + }); + + it("emits a bare up-to-date for an empty (missing) run snapshot", () => { + const messages = JSON.parse(buildSnapshotBody(null)); + expect(messages).toHaveLength(1); + expect(messages[0].headers.control).toBe("up-to-date"); + }); + + it("emits update + up-to-date for a live change", () => { + const messages = JSON.parse(buildUpdateBody(sampleRow())); + expect(messages[0].headers.operation).toBe("update"); + expect(messages[1].headers.control).toBe("up-to-date"); + }); + + it("emits a bare up-to-date when nothing advanced", () => { + const messages = JSON.parse(buildUpToDateBody()); + expect(messages).toEqual([{ headers: { control: "up-to-date" } }]); + }); + + it("uses the same merge key across insert and update so the client merges by row", () => { + const insert = JSON.parse(buildSnapshotBody(sampleRow()))[0]; + const update = JSON.parse(buildUpdateBody(sampleRow()))[0]; + 
expect(insert.key).toBe(update.key); + }); +}); + +describe("electricStreamProtocol multi-row (tag-list) bodies", () => { + it("emits one change message per row with per-row operation, then up-to-date", () => { + const a = sampleRow({ id: "run_a" }); + const b = sampleRow({ id: "run_b", status: "QUEUED" }); + const messages = JSON.parse( + buildRowsBody([ + { row: a, operation: "insert" }, + { row: b, operation: "update" }, + ]) + ); + expect(messages).toHaveLength(3); + expect(messages[0].headers.operation).toBe("insert"); + expect(messages[0].key).toBe('"public"."TaskRun"/"run_a"'); + expect(messages[1].headers.operation).toBe("update"); + expect(messages[1].key).toBe('"public"."TaskRun"/"run_b"'); + expect(messages[1].value.status).toBe("QUEUED"); + expect(messages[2].headers.control).toBe("up-to-date"); + }); + + it("emits a bare up-to-date for an empty change set", () => { + const messages = JSON.parse(buildRowsBody([])); + expect(messages).toEqual([{ headers: { control: "up-to-date" } }]); + }); + + it("honors skipColumns across all rows", () => { + const messages = JSON.parse( + buildRowsBody([{ row: sampleRow(), operation: "insert" }], ["payload"]) + ); + expect(messages[0].value.payload).toBeUndefined(); + expect(messages[0].value.status).toBe("EXECUTING"); + }); +}); + +describe("electricStreamProtocol tokens + legacy rewrite", () => { + it("encodes and parses the offset updatedAt segment", () => { + const offset = encodeOffset(1717667130123, 7); + expect(offset).toBe("1717667130123_7"); + expect(parseOffsetUpdatedAtMs(offset)).toBe(1717667130123); + }); + + it("treats the initial offset (-1) and garbage as zero", () => { + expect(parseOffsetUpdatedAtMs("-1")).toBe(0); + expect(parseOffsetUpdatedAtMs(null)).toBe(0); + expect(parseOffsetUpdatedAtMs("nonsense")).toBe(0); + }); + + it("rewrites DEQUEUED to EXECUTING for legacy API versions", () => { + const body = buildUpdateBody(sampleRow({ status: "DEQUEUED" })); + 
expect(body).toContain('"status":"DEQUEUED"'); + const rewritten = rewriteBodyForLegacyApiVersion(body); + expect(rewritten).not.toContain('"status":"DEQUEUED"'); + expect(rewritten).toContain('"status":"EXECUTING"'); + }); +}); diff --git a/apps/webapp/test/realtime/envChangeRouter.test.ts b/apps/webapp/test/realtime/envChangeRouter.test.ts new file mode 100644 index 00000000000..6f2eb6df980 --- /dev/null +++ b/apps/webapp/test/realtime/envChangeRouter.test.ts @@ -0,0 +1,363 @@ +import { describe, expect, it, vi } from "vitest"; +import { + EnvChangeRouter, + type EnvChangeSource, + type RowHydrator, +} from "~/services/realtime/envChangeRouter.server"; +import { type ChangeRecord } from "~/services/realtime/runChangeNotifier.server"; +import { type RealtimeRunRow } from "~/services/realtime/electricStreamProtocol.server"; + +const FLOOR_MS = Date.UTC(2026, 5, 7, 12, 0, 0); + +function row( + id: string, + opts: { tags?: string[]; createdAtMs?: number; updatedAtMs?: number } = {} +): RealtimeRunRow { + return { + id, + runTags: opts.tags ?? [], + createdAt: new Date(opts.createdAtMs ?? FLOOR_MS + 1_000), + updatedAt: new Date(opts.updatedAtMs ?? FLOOR_MS + 5_000), + } as unknown as RealtimeRunRow; +} + +function record(runId: string, extra: Partial = {}): ChangeRecord { + return { v: 1, runId, envId: "env_1", ...extra }; +} + +/** A controllable EnvChangeSource: tests push batches to the env's listener. */ +function fakeSource() { + const listeners = new Map void>>(); + const source: EnvChangeSource = { + subscribeToEnv(envId, onBatch) { + let set = listeners.get(envId); + if (!set) { + set = new Set(); + listeners.set(envId, set); + } + set.add(onBatch); + return () => { + listeners.get(envId)?.delete(onBatch); + }; + }, + }; + return { + source, + push(envId: string, records: ChangeRecord[]) { + for (const l of listeners.get(envId) ?? []) l(records); + }, + isSubscribed(envId: string) { + return (listeners.get(envId)?.size ?? 
0) > 0; + }, + }; +} + +function makeRouter( + rowsById: Map = new Map(), + options: Record = {} +) { + const src = fakeSource(); + const hydrateSpy = vi.fn(async (_env, ids) => + ids.map((id) => rowsById.get(id)).filter((r): r is RealtimeRunRow => Boolean(r)) + ); + const router = new EnvChangeRouter({ + source: src.source, + hydrator: { hydrateByIds: hydrateSpy }, + ...options, + }); + return { router, src, hydrateSpy }; +} + +describe("EnvChangeRouter", () => { + it("routes a tag match to the feed (hydrated + serialized) and ignores non-matches", async () => { + const rows = new Map([["r1", row("r1", { tags: ["a"] })]]); + const { router, src, hydrateSpy } = makeRouter(rows); + const reg = router.register("env_1", { kind: "tag", tags: ["a"] }, []); + const wait = reg.waitForMatch(undefined, 1_000); + + // A non-matching tag is dropped (no wake); a matching tag wakes with the hydrated row. + src.push("env_1", [record("rX", { tags: ["b"] }), record("r1", { tags: ["a"] })]); + + const result = await wait; + expect(result.reason).toBe("notify"); + expect(result.rows.map((m) => m.row.id)).toEqual(["r1"]); + expect(result.rows[0].value.id).toBe("r1"); // serialized wire value + expect(hydrateSpy).toHaveBeenCalledWith("env_1", ["r1"], []); + reg.close(); + }); + + it("wakes an unfiltered tag feed (no tags) for every full record, live and via replay", async () => { + const rows = new Map([["r1", row("r1", { tags: ["a"] })]]); + const { router, src } = makeRouter(rows); + + // Live path: a full record (tags defined) must reach the zero-filter feed even + // though it can never appear in the byTag index. 
+ const reg = router.register("env_1", { kind: "tag", tags: [] }, []); + const wait = reg.waitForMatch(undefined, 1_000); + src.push("env_1", [record("r1", { tags: ["a"] })]); + const live = await wait; + expect(live.reason).toBe("notify"); + expect(live.rows.map((m) => m.row.id)).toEqual(["r1"]); + reg.close(); + + // Replay path: the buffered record matches an unfiltered feed registered after the push. + const late = router.register("env_1", { kind: "tag", tags: [] }, [], { + replaySinceMs: Date.now() - 1_000, + }); + const replayed = await late.waitForMatch(undefined, 1_000); + expect(replayed.reason).toBe("notify"); + expect(replayed.rows.map((m) => m.row.id)).toEqual(["r1"]); + late.close(); + }); + + it("batch-hydrates ONCE and shares the serialized value across feeds matching the same run", async () => { + const rows = new Map([["r1", row("r1", { tags: ["a"] })]]); + const { router, src, hydrateSpy } = makeRouter(rows); + const regs = [ + router.register("env_1", { kind: "tag", tags: ["a"] }, []), + router.register("env_1", { kind: "tag", tags: ["a"] }, []), + ]; + const waits = regs.map((r) => r.waitForMatch(undefined, 1_000)); + + src.push("env_1", [record("r1", { tags: ["a"] })]); + const results = await Promise.all(waits); + + // One hydrate for the whole tick (same column set), shared by both feeds... + expect(hydrateSpy).toHaveBeenCalledTimes(1); + // ...and the same serialized value object is reused (serialize-once). 
+ expect(results[0].rows[0].value).toBe(results[1].rows[0].value); + regs.forEach((r) => r.close()); + }); + + it("a hydrate failure doesn't reject out of the source callback; the feed times out", async () => { + const src = fakeSource(); + const hydrateSpy = vi.fn(async () => { + throw new Error("replica down"); + }); + const router = new EnvChangeRouter({ + source: src.source, + hydrator: { hydrateByIds: hydrateSpy }, + }); + const reg = router.register("env_1", { kind: "run", runId: "r1" }, []); + const wait = reg.waitForMatch(undefined, 50); + + // Would be an unhandled rejection (process exit) if #onBatch's promise were unguarded. + src.push("env_1", [record("r1")]); + + const result = await wait; + expect(result.reason).toBe("timeout"); + expect(hydrateSpy).toHaveBeenCalledTimes(1); + reg.close(); + }); + + it("routes a run feed by exact runId", async () => { + const rows = new Map([["r1", row("r1")]]); + const { router, src } = makeRouter(rows); + const reg = router.register("env_1", { kind: "run", runId: "r1" }, []); + const wait = reg.waitForMatch(undefined, 1_000); + src.push("env_1", [record("r2"), record("r1")]); + const result = await wait; + expect(result.rows.map((m) => m.row.id)).toEqual(["r1"]); + reg.close(); + }); + + it("routes a batch feed by batchId", async () => { + const rows = new Map([["r1", row("r1")]]); + const { router, src } = makeRouter(rows); + const reg = router.register("env_1", { kind: "batch", batchId: "batch_1" }, []); + const wait = reg.waitForMatch(undefined, 1_000); + src.push("env_1", [ + record("rX", { batchId: "other" }), + record("r1", { batchId: "batch_1" }), + ]); + const result = await wait; + expect(result.rows.map((m) => m.row.id)).toEqual(["r1"]); + reg.close(); + }); + + it("multi-tag feeds require ALL tags on the row (Electric contains-all semantics)", async () => { + const rows = new Map([ + ["r_both", row("r_both", { tags: ["a", "b", "c"] })], + ["r_one", row("r_one", { tags: ["a"] })], + ]); + const { router, 
src } = makeRouter(rows); + const reg = router.register("env_1", { kind: "tag", tags: ["a", "b"] }, []); + const wait = reg.waitForMatch(undefined, 1_000); + + // r_one shares a tag (routes as a candidate via the index) but lacks "b" — must be + // culled by the authoritative row check. r_both carries both and wakes the feed. + src.push("env_1", [record("r_one", { tags: ["a"] }), record("r_both", { tags: ["a", "b", "c"] })]); + + const result = await wait; + expect(result.reason).toBe("notify"); + expect(result.rows.map((m) => m.row.id)).toEqual(["r_both"]); + reg.close(); + }); + + it("drops a tag match created before the feed's createdAt floor", async () => { + const rows = new Map([["r1", row("r1", { tags: ["a"], createdAtMs: FLOOR_MS - 10_000 })]]); + const { router, src } = makeRouter(rows); + const reg = router.register("env_1", { kind: "tag", tags: ["a"], createdAtFloorMs: FLOOR_MS }, []); + let settled = false; + const wait = reg.waitForMatch(undefined, 60).then((r) => { + settled = true; + return r; + }); + src.push("env_1", [record("r1", { tags: ["a"], createdAtMs: FLOOR_MS - 10_000 })]); + // Hydrated but out-of-window -> not woken; falls through to the timeout. + const result = await wait; + expect(settled).toBe(true); + expect(result.reason).toBe("timeout"); + reg.close(); + }); + + it("classifies a partial record (no tags) by hydrating and re-checking the row's tags", async () => { + // Partial record routes to all tag feeds as candidates; the authoritative row decides. 
+ const rows = new Map([["r1", row("r1", { tags: ["a"] })]]); + const { router, src } = makeRouter(rows); + const match = router.register("env_1", { kind: "tag", tags: ["a"] }, []); + const noMatch = router.register("env_1", { kind: "tag", tags: ["z"] }, []); + const matchWait = match.waitForMatch(undefined, 1_000); + let noMatchSettled = false; + const noMatchWait = noMatch.waitForMatch(undefined, 80).then((r) => { + noMatchSettled = true; + return r; + }); + + src.push("env_1", [record("r1", { tags: undefined })]); // partial: tags absent + + expect((await matchWait).rows.map((m) => m.row.id)).toEqual(["r1"]); + expect((await noMatchWait).reason).toBe("timeout"); // row tags ["a"] don't intersect ["z"] + expect(noMatchSettled).toBe(true); + match.close(); + noMatch.close(); + }); + + it("times out and aborts cleanly", async () => { + const { router, src } = makeRouter(new Map(), { unsubscribeLingerMs: 0 }); + const reg = router.register("env_1", { kind: "tag", tags: ["a"] }, []); + expect((await reg.waitForMatch(undefined, 30)).reason).toBe("timeout"); + + const controller = new AbortController(); + const wait = reg.waitForMatch(controller.signal, 5_000); + controller.abort(); + expect((await wait).reason).toBe("abort"); + reg.close(); + expect(src.isSubscribed("env_1")).toBe(false); // linger disabled: last feed left -> unsubscribed + }); + + it("buffers a record that arrives between polls and replays it on the next arm", async () => { + const rows = new Map([["r1", row("r1", { tags: ["a"] })]]); + const { router, src, hydrateSpy } = makeRouter(rows); + const reg = router.register("env_1", { kind: "tag", tags: ["a"] }, []); + // Not waiting yet: the push can't wake anything, but it lands in the env buffer. 
+ src.push("env_1", [record("r1", { tags: ["a"] })]); + expect(hydrateSpy).not.toHaveBeenCalled(); + + const result = await reg.waitForMatch(undefined, 1_000); + expect(result.reason).toBe("notify"); + expect(result.rows.map((m) => m.row.id)).toEqual(["r1"]); + expect(hydrateSpy).toHaveBeenCalledTimes(1); + reg.close(); + }); + + it("does not redeliver a replayed record on a later arm", async () => { + const rows = new Map([["r1", row("r1", { tags: ["a"] })]]); + const { router, src, hydrateSpy } = makeRouter(rows); + const reg = router.register("env_1", { kind: "tag", tags: ["a"] }, []); + src.push("env_1", [record("r1", { tags: ["a"] })]); + expect((await reg.waitForMatch(undefined, 1_000)).reason).toBe("notify"); + + // Same buffered record must not fire again; the wait falls through to its timeout. + expect((await reg.waitForMatch(undefined, 50)).reason).toBe("timeout"); + expect(hydrateSpy).toHaveBeenCalledTimes(1); + reg.close(); + }); + + it("lingers the env subscription after the last feed closes and replays the gap", async () => { + const rows = new Map([["r1", row("r1", { tags: ["a"] })]]); + const { router, src, hydrateSpy } = makeRouter(rows, { unsubscribeLingerMs: 60 }); + const reg1 = router.register("env_1", { kind: "tag", tags: ["a"] }, []); + reg1.close(); + expect(src.isSubscribed("env_1")).toBe(true); // lingering + + // The inter-poll gap: a change arrives while no feed is registered. 
+ src.push("env_1", [record("r1", { tags: ["a"] })]); + + const reg2 = router.register("env_1", { kind: "tag", tags: ["a"] }, []); + const result = await reg2.waitForMatch(undefined, 1_000); + expect(result.reason).toBe("notify"); + expect(result.rows.map((m) => m.row.id)).toEqual(["r1"]); + expect(hydrateSpy).toHaveBeenCalledTimes(1); + + reg2.close(); + await new Promise((r) => setTimeout(r, 100)); + expect(src.isSubscribed("env_1")).toBe(false); // linger expired -> unsubscribed + }); + + it("reports gapCovered=false on a fresh env subscription and true once it ages past the window", async () => { + const { router } = makeRouter(new Map(), { replayWindowMs: 50 }); + const reg1 = router.register("env_1", { kind: "run", runId: "r1" }, []); + expect(reg1.gapCovered).toBe(false); + + await new Promise((r) => setTimeout(r, 70)); + const reg2 = router.register("env_1", { kind: "run", runId: "r2" }, []); + expect(reg2.gapCovered).toBe(true); + reg1.close(); + reg2.close(); + }); + + it("honors the caller's replaySinceMs so a new poll doesn't rewind into delivered records", async () => { + const rows = new Map([["r1", row("r1", { tags: ["a"] })]]); + const { router, src, hydrateSpy } = makeRouter(rows); + const anchor = router.register("env_1", { kind: "tag", tags: ["a"] }, []); // keeps the env subscribed + src.push("env_1", [record("r1", { tags: ["a"] })]); + const afterPush = Date.now(); + + // A connection whose last response left after the push: nothing to replay. + const caughtUp = router.register("env_1", { kind: "tag", tags: ["a"] }, [], { + replaySinceMs: afterPush, + }); + expect(caughtUp.gapCovered).toBe(true); // env subscribed since before its gap began + expect((await caughtUp.waitForMatch(undefined, 50)).reason).toBe("timeout"); + expect(hydrateSpy).not.toHaveBeenCalled(); + + // A connection whose gap started before the push: the record replays. 
+ const behind = router.register("env_1", { kind: "tag", tags: ["a"] }, [], { + replaySinceMs: afterPush - 1_000, + }); + const result = await behind.waitForMatch(undefined, 1_000); + expect(result.reason).toBe("notify"); + expect(result.rows.map((m) => m.row.id)).toEqual(["r1"]); + + anchor.close(); + caughtUp.close(); + behind.close(); + }); + + it("caps the replay buffer to the newest records per env", async () => { + const rows = new Map([ + ["r1", row("r1")], + ["r2", row("r2")], + ["r3", row("r3")], + ]); + const evictions: string[] = []; + const { router, src, hydrateSpy } = makeRouter(rows, { + replayMaxRunsPerEnv: 2, + onReplayEviction: (reason: string) => evictions.push(reason), + }); + const reg = router.register("env_1", { kind: "batch", batchId: "batch_1" }, []); + src.push("env_1", [ + record("r1", { batchId: "batch_1" }), + record("r2", { batchId: "batch_1" }), + record("r3", { batchId: "batch_1" }), + ]); + + const result = await reg.waitForMatch(undefined, 1_000); + expect(result.reason).toBe("notify"); + // r1 was evicted by the cap; only the newest two replay. 
+ expect(hydrateSpy).toHaveBeenCalledWith("env_1", ["r2", "r3"], []); + expect(evictions).toEqual(["cap"]); + reg.close(); + }); +}); diff --git a/apps/webapp/test/realtime/nativeHoldOnEmpty.test.ts b/apps/webapp/test/realtime/nativeHoldOnEmpty.test.ts new file mode 100644 index 00000000000..615abc90394 --- /dev/null +++ b/apps/webapp/test/realtime/nativeHoldOnEmpty.test.ts @@ -0,0 +1,266 @@ +import { setTimeout as sleep } from "node:timers/promises"; +import { CURRENT_API_VERSION } from "~/api/versions"; +import { + NativeRealtimeClient, + type RealtimeListEnvironment, +} from "~/services/realtime/nativeRealtimeClient.server"; +import { type RealtimeRunRow } from "~/services/realtime/electricStreamProtocol.server"; +import { + EnvChangeRouter, + type EnvChangeSource, +} from "~/services/realtime/envChangeRouter.server"; +import { type ChangeRecord } from "~/services/realtime/runChangeNotifier.server"; +import { describe, expect, it, vi } from "vitest"; + +const ENV: RealtimeListEnvironment = { id: "env_1", organizationId: "org_1", projectId: "proj_1" }; + +// Fixed offset floor: a row's updatedAt above/below it produces a delta / empty diff. The +// createdAt window resolves to this same floor (large maximumCreatedAtFilterAgeMs below). +const FLOOR_MS = Date.UTC(2026, 5, 7, 12, 0, 0); + +function row( + id: string, + updatedAtMs: number, + opts: { createdAtMs?: number; tags?: string[] } = {} +): RealtimeRunRow { + return { + id, + runTags: opts.tags ?? ["t"], + createdAt: new Date(opts.createdAtMs ?? FLOOR_MS + 1_000), + updatedAt: new Date(updatedAtMs), + } as unknown as RealtimeRunRow; +} + +function rec(runId: string, extra: Partial = {}): ChangeRecord { + return { v: 1, runId, envId: "env_1", ...extra }; +} + +/** A controllable EnvChangeSource the test pushes batches into. 
*/ +function fakeSource() { + const listeners = new Map void>>(); + const source: EnvChangeSource = { + subscribeToEnv(envId, onBatch) { + let set = listeners.get(envId); + if (!set) { + set = new Set(); + listeners.set(envId, set); + } + set.add(onBatch); + return () => listeners.get(envId)?.delete(onBatch); + }, + }; + return { + source, + push: (envId: string, records: ChangeRecord[]) => { + for (const l of listeners.get(envId) ?? []) l(records); + }, + isSubscribed: (envId: string) => (listeners.get(envId)?.size ?? 0) > 0, + }; +} + +function makeClient(overrides: Record = {}) { + const { routerOptions, ...clientOverrides } = overrides; // split once up front so the caller's object is never mutated + let rowsToReturn: RealtimeRunRow[] = []; + const hydrateSpy = vi.fn(async (_env: string, ids: string[]) => + rowsToReturn.filter((r) => ids.includes(r.id)) + ); + const resolveSpy = vi.fn(async () => rowsToReturn.map((r) => r.id)); + const src = fakeSource(); + const router = new EnvChangeRouter({ + source: src.source, + hydrator: { hydrateByIds: hydrateSpy }, + replayWindowMs: 0, + unsubscribeLingerMs: 0, + ...(routerOptions as Record ?? {}), + }); + + const client = new NativeRealtimeClient({ + runReader: { getRunById: async () => null, hydrateByIds: hydrateSpy } as any, + runListResolver: { resolveMatchingRunIds: resolveSpy } as any, + router, + limiter: { incrementAndCheck: async () => true, decrement: async () => {} } as any, + cachedLimitProvider: { getCachedLimit: async () => 100 }, + // Large so the recovered createdAt floor isn't clamped past FLOOR_MS. 
+ maximumCreatedAtFilterAgeMs: 100 * 365 * 24 * 60 * 60 * 1000, + runSetResolveCacheTtlMs: 0, + livePollTimeoutMs: 10_000, + ...clientOverrides, + }); + + return { client, src, hydrateSpy, resolveSpy, setRows: (rows: RealtimeRunRow[]) => (rowsToReturn = rows) }; +} + +function liveRuns(client: NativeRealtimeClient) { + return client.streamRuns( + `http://localhost:3030/realtime/v1/runs?offset=${FLOOR_MS}_1&live=true&handle=runs_${FLOOR_MS}_7`, + ENV, + { tags: ["t"] }, + CURRENT_API_VERSION, + undefined, + "1.0.0" + ); +} + +async function whenWaiting(src: ReturnType) { + // Subscribed (feed registered) + a tick so waitForMatch has armed feed.resolve. + await vi.waitFor(() => expect(src.isSubscribed("env_1")).toBe(true)); + await sleep(15); +} + +async function bodyOf(res: Response) { + return JSON.parse(await res.text()) as Array<{ + headers?: { control?: string; operation?: string }; + value?: unknown; + }>; +} +const hasRowOp = (body: Awaited>) => + body.some((m) => m?.headers?.operation || (m && typeof m === "object" && "value" in m)); +const isUpToDate = (body: Awaited>) => + body.some((m) => m?.headers?.control === "up-to-date"); + +describe("NativeRealtimeClient multi-run live path over the router", () => { + it("a matching change hydrates by id (no ClickHouse) and returns a delta", async () => { + const emits: Array<[string, number, number]> = []; + const { client, src, hydrateSpy, resolveSpy, setRows } = makeClient({ + onEmit: (path: string, lagMs: number, rows: number) => emits.push([path, lagMs, rows]), + }); + setRows([row("run_1", FLOOR_MS + 5_000, { tags: ["t"] })]); + + const responsePromise = liveRuns(client); + await whenWaiting(src); + src.push("env_1", [rec("run_1", { tags: ["t", "x"] })]); + + const res = await responsePromise; + expect(res.status).toBe(200); + expect(hasRowOp(await bodyOf(res))).toBe(true); + expect(resolveSpy).not.toHaveBeenCalled(); // ClickHouse skipped + expect(hydrateSpy).toHaveBeenCalledWith("env_1", ["run_1"], 
expect.anything()); + expect(emits).toHaveLength(1); + expect(emits[0][0]).toBe("fast-hydrate"); + expect(emits[0][2]).toBe(1); // one delta row + }); + + it("a change that doesn't match the filter never wakes the feed (no CH, no PG); a later match does", async () => { + const { client, src, hydrateSpy, resolveSpy, setRows } = makeClient(); + setRows([row("run_1", FLOOR_MS + 5_000, { tags: ["t"] })]); + + const responsePromise = liveRuns(client); + let settled = false; + void responsePromise.then(() => (settled = true)); + await whenWaiting(src); + + src.push("env_1", [rec("run_x", { tags: ["other"] })]); // doesn't intersect ["t"] + await sleep(50); + expect(settled).toBe(false); + expect(hydrateSpy).not.toHaveBeenCalled(); // router never routed it + expect(resolveSpy).not.toHaveBeenCalled(); + + src.push("env_1", [rec("run_1", { tags: ["t"] })]); + const res = await responsePromise; + expect(settled).toBe(true); + expect(hasRowOp(await bodyOf(res))).toBe(true); + }); + + it("a matching run created before the window floor is hydrated but dropped (keeps holding)", async () => { + // Generous backstop so the "still holding" assertion can't race a timeout in slow CI. 
+ const { client, src, hydrateSpy, resolveSpy, setRows } = makeClient({ livePollTimeoutMs: 1500 }); + setRows([row("run_1", FLOOR_MS + 5_000, { createdAtMs: FLOOR_MS - 10_000, tags: ["t"] })]); + + const responsePromise = liveRuns(client); + let settled = false; + void responsePromise.then(() => (settled = true)); + await whenWaiting(src); + src.push("env_1", [rec("run_1", { tags: ["t"] })]); + + await sleep(40); + expect(settled).toBe(false); // dropped by the createdAt floor -> held + expect(hydrateSpy).toHaveBeenCalledWith("env_1", ["run_1"], expect.anything()); + expect(resolveSpy).not.toHaveBeenCalled(); + + await responsePromise; // drain via the backstop + }); + + it("the backstop timeout does a full ClickHouse resolve and returns up-to-date", async () => { + const backstopResults: string[] = []; + const { client, resolveSpy } = makeClient({ + livePollTimeoutMs: 50, + onBackstopResult: (r: string) => backstopResults.push(r), + }); + const res = await liveRuns(client); // never pushed -> backstop fires + expect(res.status).toBe(200); + expect(isUpToDate(await bodyOf(res))).toBe(true); + expect(resolveSpy).toHaveBeenCalled(); + expect(backstopResults).toEqual(["empty"]); + }); + + it("a cold env registration resolves immediately instead of holding blind", async () => { + // Fresh env subscription (gapCovered=false): a change in the inter-poll gap may have + // been missed, so the live poll probes once. The row advanced past the offset floor. 
+ const { client, resolveSpy, setRows } = makeClient({ + routerOptions: { replayWindowMs: 2_000 }, + }); + setRows([row("run_1", FLOOR_MS + 5_000, { tags: ["t"] })]); + + const res = await liveRuns(client); // no push needed — the cold probe finds the delta + expect(res.status).toBe(200); + expect(resolveSpy).toHaveBeenCalledTimes(1); + expect(hasRowOp(await bodyOf(res))).toBe(true); + }); + + it("a cold probe with nothing missed keeps holding", async () => { + const { client, src, resolveSpy, setRows } = makeClient({ + routerOptions: { replayWindowMs: 2_000 }, + livePollTimeoutMs: 1_500, + }); + setRows([row("run_1", FLOOR_MS - 1_000, { tags: ["t"] })]); // at/below the offset floor + + const responsePromise = liveRuns(client); + let settled = false; + void responsePromise.then(() => (settled = true)); + await whenWaiting(src); + await sleep(50); + expect(settled).toBe(false); // probed, found nothing missed, held + expect(resolveSpy).toHaveBeenCalledTimes(1); + await responsePromise; // drain via the backstop + }); + + it("a single-run poll holds on a replayed already-seen record instead of busy re-polling", async () => { + const { client, src, setRows } = makeClient({ + routerOptions: { replayWindowMs: 2_000 }, + livePollTimeoutMs: 300, + }); + setRows([row("run_1", FLOOR_MS + 1_000)]); + const url = `http://localhost:3030/realtime/v1/runs/run_1?offset=${FLOOR_MS + 1_000}_1&handle=run-run_1&live=true`; + + // First poll subscribes the env, then drains via its backstop. + const first = await client.streamRun(url, ENV, "run_1", CURRENT_API_VERSION, undefined, "1.0.0"); + expect(first.status).toBe(200); + + // The record lands between polls; the lingering env subscription buffers it. + src.push("env_1", [rec("run_1")]); + + // The next poll replays it, but the row hasn't advanced past the client's offset: + // the poll must HOLD (the old behavior returned up-to-date instantly = a busy loop). 
+ let settled = false; + const second = client.streamRun(url, ENV, "run_1", CURRENT_API_VERSION, undefined, "1.0.0"); + void second.then(() => (settled = true)); + await sleep(120); + expect(settled).toBe(false); + expect((await second).status).toBe(200); // drains via the backstop + }); + + it("with holdOnEmpty=false, a matched-but-not-advanced change returns up-to-date without ClickHouse", async () => { + const { client, src, resolveSpy, setRows } = makeClient({ holdOnEmpty: false }); + // Matches the tag and is in-window, but updatedAt is at/below the offset floor -> no delta. + setRows([row("run_1", FLOOR_MS - 1_000, { tags: ["t"] })]); + + const responsePromise = liveRuns(client); + await whenWaiting(src); + src.push("env_1", [rec("run_1", { tags: ["t"] })]); + + const res = await responsePromise; + expect(res.status).toBe(200); + expect(isUpToDate(await bodyOf(res))).toBe(true); + expect(resolveSpy).not.toHaveBeenCalled(); + }); +}); diff --git a/apps/webapp/test/realtime/nativeRealtimeClient.test.ts b/apps/webapp/test/realtime/nativeRealtimeClient.test.ts new file mode 100644 index 00000000000..b94c72a4e65 --- /dev/null +++ b/apps/webapp/test/realtime/nativeRealtimeClient.test.ts @@ -0,0 +1,110 @@ +import { CURRENT_API_VERSION } from "~/api/versions"; +import { + NativeRealtimeClient, + type RealtimeListEnvironment, +} from "~/services/realtime/nativeRealtimeClient.server"; +import { type RealtimeRunRow } from "~/services/realtime/electricStreamProtocol.server"; +import { EnvChangeRouter } from "~/services/realtime/envChangeRouter.server"; +import { describe, expect, it } from "vitest"; + +function sampleRow(): RealtimeRunRow { + return { + id: "run_1", + taskIdentifier: "t", + createdAt: new Date("2026-06-07T10:00:00.000Z"), + updatedAt: new Date("2026-06-07T10:00:01.000Z"), + startedAt: null, + delayUntil: null, + queuedAt: null, + expiredAt: null, + completedAt: null, + friendlyId: "run_friendly_1", + number: 1, + isTest: false, + status: "EXECUTING", + 
usageDurationMs: 0, + costInCents: 0, + baseCostInCents: 0, + ttl: null, + payload: "{}", + payloadType: "application/json", + metadata: null, + metadataType: "application/json", + output: null, + outputType: "application/json", + runTags: [], + error: null, + realtimeStreams: [], + }; +} + +// Only the initial-snapshot path is exercised here, which touches the shared +// #buildResponse — enough to lock the response-header contract. +function makeClient(row: RealtimeRunRow | null) { + return new NativeRealtimeClient({ + runReader: { + getRunById: async () => row, + hydrateByIds: async () => (row ? [row] : []), + } as any, + runListResolver: { resolveMatchingRunIds: async () => [] } as any, + // Snapshot path only; the router (over a no-op source) is never invoked here. + router: new EnvChangeRouter({ + source: { subscribeToEnv: () => () => {} }, + hydrator: { hydrateByIds: async () => (row ? [row] : []) }, + replayWindowMs: 0, + unsubscribeLingerMs: 0, + }), + limiter: { incrementAndCheck: async () => true, decrement: async () => {} } as any, + cachedLimitProvider: { getCachedLimit: async () => 100 }, + maximumCreatedAtFilterAgeMs: 24 * 60 * 60 * 1000, + }); +} + +const ENV: RealtimeListEnvironment = { + id: "env_1", + organizationId: "org_1", + projectId: "proj_1", +}; + +describe("NativeRealtimeClient response headers", () => { + it("exposes electric headers cross-origin so browser hooks can read them", async () => { + const client = makeClient(sampleRow()); + const res = await client.streamRun( + "http://localhost:3030/realtime/v1/runs/run_1?offset=-1", + ENV, + "run_1", + CURRENT_API_VERSION, + undefined, + "1.0.0-beta.1" // modern client => lowercase electric-* headers + ); + + // Without these the deployed @electric-sql/client throws MissingHeadersError + // (it can't read the electric-* headers across origins). This regressed once. 
+ expect(res.headers.get("access-control-allow-origin")).toBe("*"); + expect(res.headers.get("access-control-expose-headers")).toBe("*"); + + // Initial (non-live) snapshot requires offset + handle + schema. + expect(res.headers.get("electric-offset")).toBeTruthy(); + expect(res.headers.get("electric-handle")).toBeTruthy(); + expect(res.headers.get("electric-schema")).toBeTruthy(); + expect(res.headers.get("content-type")).toBe("application/json"); + }); + + it("renames headers for legacy (0.4.0) clients", async () => { + const client = makeClient(sampleRow()); + const res = await client.streamRun( + "http://localhost:3030/realtime/v1/runs/run_1?offset=-1", + ENV, + "run_1", + CURRENT_API_VERSION, + undefined, + undefined // no client version => legacy header names + ); + + expect(res.headers.get("electric-chunk-last-offset")).toBeTruthy(); + expect(res.headers.get("electric-shape-id")).toBeTruthy(); + expect(res.headers.get("electric-offset")).toBeNull(); + expect(res.headers.get("electric-handle")).toBeNull(); + expect(res.headers.get("access-control-expose-headers")).toBe("*"); + }); +}); diff --git a/apps/webapp/test/realtime/nativeRunSetCache.test.ts b/apps/webapp/test/realtime/nativeRunSetCache.test.ts new file mode 100644 index 00000000000..2389fd78080 --- /dev/null +++ b/apps/webapp/test/realtime/nativeRunSetCache.test.ts @@ -0,0 +1,344 @@ +import { CURRENT_API_VERSION } from "~/api/versions"; +import { + NativeRealtimeClient, + type RealtimeListEnvironment, +} from "~/services/realtime/nativeRealtimeClient.server"; +import { type RealtimeRunRow } from "~/services/realtime/electricStreamProtocol.server"; +import { EnvChangeRouter } from "~/services/realtime/envChangeRouter.server"; +import { setTimeout as sleep } from "node:timers/promises"; +import { describe, expect, it, vi } from "vitest"; + +const ENV: RealtimeListEnvironment = { id: "env_1", organizationId: "org_1", projectId: "proj_1" }; + +function row(id: string): RealtimeRunRow { + // Only 
id/createdAt/updatedAt are read directly; the rest serialize to null. + return { + id, + createdAt: new Date("2026-06-07T09:00:00.000Z"), + updatedAt: new Date("2026-06-07T10:00:00.000Z"), + } as unknown as RealtimeRunRow; +} + +function makeClient(overrides: Record = {}) { + const resolveSpy = vi.fn(async () => ["run_1", "run_2"]); + const hydrateSpy = vi.fn(async (_env: string, ids: string[]) => ids.map(row)); + + const client = new NativeRealtimeClient({ + runReader: { getRunById: async () => null, hydrateByIds: hydrateSpy } as any, + runListResolver: { resolveMatchingRunIds: resolveSpy } as any, + // No-op source: live polls never get a router wake, so they fall through to the + // backstop full-resolve — which is what the live tests below assert on. + router: new EnvChangeRouter({ + source: { subscribeToEnv: () => () => {} }, + hydrator: { hydrateByIds: hydrateSpy }, + replayWindowMs: 0, + unsubscribeLingerMs: 0, + }), + limiter: { incrementAndCheck: async () => true, decrement: async () => {} } as any, + cachedLimitProvider: { getCachedLimit: async () => 100 }, + maximumCreatedAtFilterAgeMs: 24 * 60 * 60 * 1000, + runSetResolveCacheTtlMs: 5_000, + ...overrides, + }); + + return { client, resolveSpy, hydrateSpy }; +} + +// streamBatch with offset=-1 takes the snapshot path, which calls the coalescing +// resolve+hydrate directly (no concurrency slot / subscription needed). +function snapshot(client: NativeRealtimeClient, batchId: string, skipColumns?: string) { + const skip = skipColumns ? `&skipColumns=${skipColumns}` : ""; + return client.streamBatch( + `http://localhost:3030/realtime/v1/batches/${batchId}?offset=-1${skip}`, + ENV, + batchId, + CURRENT_API_VERSION, + undefined, + "1.0.0" + ); +} + +// Tag-list snapshot (offset=-1) — exercises the createdAt bucketing + cache key. 
+function snapshotTag(client: NativeRealtimeClient, tags: string[]) { + return client.streamRuns( + "http://localhost:3030/realtime/v1/runs?offset=-1", + ENV, + { tags }, + CURRENT_API_VERSION, + undefined, + "1.0.0" + ); +} + +describe("NativeRealtimeClient run-set resolve coalescing + cache", () => { + it("coalesces concurrent same-filter resolves into one ClickHouse + Postgres query", async () => { + const { client, resolveSpy, hydrateSpy } = makeClient(); + let release!: (ids: string[]) => void; + const gate = new Promise((resolve) => { + release = resolve; + }); + resolveSpy.mockReturnValueOnce(gate); + + const p1 = snapshot(client, "batch_1"); + const p2 = snapshot(client, "batch_1"); + release(["run_1"]); + await Promise.all([p1, p2]); + + expect(resolveSpy).toHaveBeenCalledTimes(1); + expect(hydrateSpy).toHaveBeenCalledTimes(1); + }); + + it("serves a second same-filter request from the cache within the TTL", async () => { + const { client, resolveSpy, hydrateSpy } = makeClient(); + await snapshot(client, "batch_1"); + await snapshot(client, "batch_1"); + expect(resolveSpy).toHaveBeenCalledTimes(1); + expect(hydrateSpy).toHaveBeenCalledTimes(1); + }); + + it("does not share the cache across different filters", async () => { + const { client, resolveSpy } = makeClient(); + await snapshot(client, "batch_1"); + await snapshot(client, "batch_2"); + expect(resolveSpy).toHaveBeenCalledTimes(2); + }); + + it("re-queries after the cache TTL expires", async () => { + vi.useFakeTimers({ toFake: ["Date"] }); + try { + const { client, resolveSpy } = makeClient({ runSetResolveCacheTtlMs: 1_000 }); + await snapshot(client, "batch_1"); + vi.advanceTimersByTime(1_001); + await snapshot(client, "batch_1"); + expect(resolveSpy).toHaveBeenCalledTimes(2); + } finally { + vi.useRealTimers(); + } + }); + + it("passes the client's skipColumns through to the hydrator (column projection)", async () => { + const { client, hydrateSpy } = makeClient(); + await snapshot(client, 
"batch_1", "payload,output"); + expect(hydrateSpy).toHaveBeenCalledWith("env_1", expect.any(Array), ["payload", "output"]); + }); + + it("reports resolve outcomes (miss then hit) to the metrics hook", async () => { + const results: string[] = []; + const { client } = makeClient({ onRunSetResolve: (r: string) => results.push(r) }); + await snapshot(client, "batch_1"); + await snapshot(client, "batch_1"); + expect(results).toEqual(["miss", "hit"]); + }); + + it("mints a distinct batch handle per connection and echoes a client-provided one", async () => { + const { client } = makeClient(); + // Two subscribers to the SAME batch must never share a handle (the working-set + // cache is keyed by it; sharing lets one suppress the other's deltas forever). + const res1 = await snapshot(client, "batch_1"); + const res2 = await snapshot(client, "batch_1"); + const h1 = res1.headers.get("electric-handle"); + const h2 = res2.headers.get("electric-handle"); + expect(h1).toBeTruthy(); + expect(h1).not.toBe(h2); + + // Catch-up under an existing handle keeps it. + const res3 = await client.streamBatch( + `http://localhost:3030/realtime/v1/batches/batch_1?offset=123_1&handle=${h1}`, + ENV, + "batch_1", + CURRENT_API_VERSION, + undefined, + "1.0.0" + ); + expect(res3.headers.get("electric-handle")).toBe(h1); + }); +}); + +describe("NativeRealtimeClient resolve admission gate (mass-reconnect stampede)", () => { + // A resolver that blocks each invocation until released, so we can watch how many run + // concurrently. Tracks peak concurrency and exposes a release-one-at-a-time drain. 
+ function gatedResolver() { + let active = 0; + let peak = 0; + const releases: Array<() => void> = []; + const resolve = vi.fn(async () => { + active++; + peak = Math.max(peak, active); + await new Promise((r) => releases.push(r)); + active--; + return ["run_1"]; + }); + return { + resolve, + peak: () => peak, + releaseOne: () => releases.shift()?.(), + waiting: () => releases.length, + }; + } + + function makeGatedClient(resolveAdmissionLimit: number, resolver: ReturnType) { + const hydrateSpy = vi.fn(async (_env: string, ids: string[]) => ids.map(row)); + return new NativeRealtimeClient({ + runReader: { getRunById: async () => null, hydrateByIds: hydrateSpy } as any, + runListResolver: { resolveMatchingRunIds: resolver.resolve } as any, + router: new EnvChangeRouter({ + source: { subscribeToEnv: () => () => {} }, + hydrator: { hydrateByIds: hydrateSpy }, + replayWindowMs: 0, + unsubscribeLingerMs: 0, + }), + limiter: { incrementAndCheck: async () => true, decrement: async () => {} } as any, + cachedLimitProvider: { getCachedLimit: async () => 100 }, + maximumCreatedAtFilterAgeMs: 24 * 60 * 60 * 1000, + runSetResolveCacheTtlMs: 0, // no cache -> every distinct filter is a fresh resolve + resolveAdmissionLimit, + }); + } + + it("throttles a distinct-filter stampede to the admission limit of concurrent CH resolves", async () => { + const resolver = gatedResolver(); + const client = makeGatedClient(2, resolver); + + // 5 distinct batchIds => 5 distinct filters => 5 fresh resolves, fired at once. + const polls = [0, 1, 2, 3, 4].map((i) => snapshot(client, `batch_${i}`)); + + // Only the limit (2) may run concurrently; the rest queue for a permit. + await vi.waitFor(() => expect(resolver.resolve).toHaveBeenCalledTimes(2)); + await sleep(20); + expect(resolver.resolve).toHaveBeenCalledTimes(2); // 3 still queued behind the gate + expect(resolver.peak()).toBe(2); + + // Drain: each release frees a permit, admitting exactly one queued resolve. 
+ while (resolver.waiting() > 0) { + resolver.releaseOne(); + await sleep(5); + } + await Promise.all(polls); + + expect(resolver.resolve).toHaveBeenCalledTimes(5); // all ran... + expect(resolver.peak()).toBe(2); // ...but never more than the limit at once + }); + + it("lets a same-filter burst through on a single permit (coalesces before the gate)", async () => { + const resolver = gatedResolver(); + const client = makeGatedClient(1, resolver); // limit 1 would deadlock if each took a permit + + // 5 identical filters fired at once -> single-flight collapses to one in-flight resolve. + const polls = [0, 1, 2, 3, 4].map(() => snapshot(client, "batch_same")); + await vi.waitFor(() => expect(resolver.resolve).toHaveBeenCalledTimes(1)); + await sleep(20); + + resolver.releaseOne(); + await Promise.all(polls); + expect(resolver.resolve).toHaveBeenCalledTimes(1); // one resolve, one permit, no queue + }); +}); + +describe("NativeRealtimeClient tag-list createdAt bucketing", () => { + it("floors the resolved createdAt lower bound to the bucket boundary", async () => { + // Fix the clock to a non-bucket-aligned instant so the assertion is deterministic. + vi.useFakeTimers({ toFake: ["Date"] }); + vi.setSystemTime(new Date("2026-06-07T10:00:30.500Z")); + try { + const { client, resolveSpy } = makeClient({ runSetCreatedAtBucketMs: 60_000 }); + await snapshotTag(client, ["critical"]); + const passed = resolveSpy.mock.calls[0][0].createdAtAfter as Date; + expect(passed.getTime() % 60_000).toBe(0); + } finally { + vi.useRealTimers(); + } + }); + + it("lets two same-tag feeds in the same bucket share one resolve", async () => { + // A large bucket guarantees both windows floor to the same boundary regardless of + // the sub-millisecond gap between the two calls. 
+ const { client, resolveSpy, hydrateSpy } = makeClient({ + runSetCreatedAtBucketMs: 60 * 60_000, + }); + await snapshotTag(client, ["critical"]); + await snapshotTag(client, ["critical"]); + expect(resolveSpy).toHaveBeenCalledTimes(1); + expect(hydrateSpy).toHaveBeenCalledTimes(1); + }); + + it("does not share across different tags", async () => { + const { client, resolveSpy } = makeClient({ runSetCreatedAtBucketMs: 60 * 60_000 }); + await snapshotTag(client, ["critical"]); + await snapshotTag(client, ["debug"]); + expect(resolveSpy).toHaveBeenCalledTimes(2); + }); + + it("does not collide a comma-containing tag with two separate tags", async () => { + const { client, resolveSpy } = makeClient({ runSetCreatedAtBucketMs: 60 * 60_000 }); + await snapshotTag(client, ["a,b"]); // one tag "a,b" + await snapshotTag(client, ["a", "b"]); // two tags a OR b — a different filter + expect(resolveSpy).toHaveBeenCalledTimes(2); + }); + + it("keeps each feed's exact lower bound when bucketing is disabled (0)", async () => { + vi.useFakeTimers({ toFake: ["Date"] }); + vi.setSystemTime(new Date("2026-06-07T10:00:30.500Z")); + try { + const { client, resolveSpy } = makeClient({ runSetCreatedAtBucketMs: 0 }); + await snapshotTag(client, ["critical"]); + const passed = resolveSpy.mock.calls[0][0].createdAtAfter as Date; + // Exact (now - 24h) lower bound, not floored to a 60s boundary. + expect(passed.getTime() % 60_000).not.toBe(0); + } finally { + vi.useRealTimers(); + } + }); +}); + +describe("NativeRealtimeClient review fixes", () => { + // makeClient's router has a no-op source, so the live poll never gets a wake and falls + // through to its backstop timeout — the full ClickHouse resolve these tests assert on + // (createdAt clamp / concurrency limit). 
+ + it("clamps a stale/crafted handle's createdAt up to the max-age floor", async () => { + const maxAge = 24 * 60 * 60 * 1000; + const { client, resolveSpy } = makeClient({ + maximumCreatedAtFilterAgeMs: maxAge, + runSetCreatedAtBucketMs: 0, + livePollTimeoutMs: 50, + }); + const before = Date.now(); + // Handle encodes createdAt = 1ms epoch, far older than the 24h ceiling. + await client.streamRuns( + "http://localhost:3030/realtime/v1/runs?offset=123_1&live=true&handle=runs_1_7", + ENV, + { tags: ["t"] }, + CURRENT_API_VERSION, + undefined, + "1.0.0" + ); + const passed = resolveSpy.mock.calls[0][0].createdAtAfter as Date; + // Clamped to ~now - maxAge, not the epoch value encoded in the handle. + expect(passed.getTime()).toBeGreaterThan(before - maxAge - 1_000); + }); + + it("enforces a concurrency limit of 0 instead of failing with a 500", async () => { + let limitCheckedWith: number | undefined; + const { client } = makeClient({ + cachedLimitProvider: { getCachedLimit: async () => 0 }, + limiter: { + incrementAndCheck: async (_env: string, _id: string, limit: number) => { + limitCheckedWith = limit; + return true; + }, + decrement: async () => {}, + }, + livePollTimeoutMs: 50, + }); + const res = await client.streamBatch( + "http://localhost:3030/realtime/v1/batches/batch_1?offset=123_1&live=true&handle=batch_batch_1_7_abc", + ENV, + "batch_1", + CURRENT_API_VERSION, + undefined, + "1.0.0" + ); + expect(res.status).toBe(200); + expect(limitCheckedWith).toBe(0); + }); +}); diff --git a/apps/webapp/test/realtime/replayCursorStore.test.ts b/apps/webapp/test/realtime/replayCursorStore.test.ts new file mode 100644 index 00000000000..b66bc72df9c --- /dev/null +++ b/apps/webapp/test/realtime/replayCursorStore.test.ts @@ -0,0 +1,141 @@ +import { redisTest } from "@internal/testcontainers"; +import { setTimeout as sleep } from "node:timers/promises"; +import { CURRENT_API_VERSION } from "~/api/versions"; +import { EnvChangeRouter } from 
"~/services/realtime/envChangeRouter.server"; +import { + NativeRealtimeClient, + type RealtimeListEnvironment, +} from "~/services/realtime/nativeRealtimeClient.server"; +import { + InMemoryReplayCursorStore, + RedisReplayCursorStore, + type ReplayCursorStore, +} from "~/services/realtime/replayCursorStore.server"; +import { describe, expect, it, vi } from "vitest"; + +describe("InMemoryReplayCursorStore", () => { + it("round-trips and expires", async () => { + const store = new InMemoryReplayCursorStore(50, 10); + store.set("env_1:h1", 123_456); + expect(await store.get("env_1:h1")).toBe(123_456); + expect(await store.get("env_1:other")).toBeUndefined(); + await sleep(60); + expect(await store.get("env_1:h1")).toBeUndefined(); + }); +}); + +describe("RedisReplayCursorStore", () => { + redisTest("round-trips, misses, and expires via PX", async ({ redisOptions }) => { + const store = new RedisReplayCursorStore({ + redis: { ...redisOptions, tlsDisabled: true }, + ttlMs: 150, + }); + try { + const now = Date.now(); + store.set("env_1:h1", now); + await vi.waitFor(async () => expect(await store.get("env_1:h1")).toBe(now)); + expect(await store.get("env_1:missing")).toBeUndefined(); + await sleep(200); + expect(await store.get("env_1:h1")).toBeUndefined(); + } finally { + await store.quit(); + } + }); + + redisTest("a second store instance reads the first's cursor (fleet sharing)", async ({ + redisOptions, + }) => { + const a = new RedisReplayCursorStore({ + redis: { ...redisOptions, tlsDisabled: true }, + ttlMs: 60_000, + }); + const b = new RedisReplayCursorStore({ + redis: { ...redisOptions, tlsDisabled: true }, + ttlMs: 60_000, + }); + try { + a.set("env_1:h2", 42_000); + await vi.waitFor(async () => expect(await b.get("env_1:h2")).toBe(42_000)); + } finally { + await Promise.all([a.quit(), b.quit()]); + } + }); + + it("degrades to undefined within the read deadline when Redis is unreachable", async () => { + const results: Array<[string, boolean]> = []; + const 
store = new RedisReplayCursorStore({ + redis: { host: "127.0.0.1", port: 1, tlsDisabled: true } as any, + ttlMs: 1_000, + getTimeoutMs: 100, + onResult: (op, ok) => results.push([op, ok]), + }); + try { + expect(await store.get("env_1:h3")).toBeUndefined(); + expect(results).toContainEqual(["get", false]); + } finally { + await store.quit().catch(() => {}); + } + }); +}); + +describe("NativeRealtimeClient replay-cursor threading", () => { + const ENV: RealtimeListEnvironment = { id: "env_1", organizationId: "org_1", projectId: "proj_1" }; + const FLOOR_MS = Date.UTC(2026, 5, 7, 12, 0, 0); + + it("passes the stored cursor to register and stamps the store after responding", async () => { + const cursorMs = Date.now() - 500; + const gets: string[] = []; + const sets: Array<[string, number]> = []; + const store: ReplayCursorStore = { + get: async (key) => { + gets.push(key); + return cursorMs; + }, + set: (key, ms) => { + sets.push([key, ms]); + }, + }; + + const router = new EnvChangeRouter({ + source: { subscribeToEnv: () => () => {} }, + hydrator: { hydrateByIds: async () => [] }, + replayWindowMs: 0, + unsubscribeLingerMs: 0, + }); + const registerSpy = vi.spyOn(router, "register"); + + const client = new NativeRealtimeClient({ + runReader: { getRunById: async () => null, hydrateByIds: async () => [] } as any, + runListResolver: { resolveMatchingRunIds: async () => [] } as any, + router, + limiter: { incrementAndCheck: async () => true, decrement: async () => {} } as any, + cachedLimitProvider: { getCachedLimit: async () => 100 }, + maximumCreatedAtFilterAgeMs: 100 * 365 * 24 * 60 * 60 * 1000, + runSetResolveCacheTtlMs: 0, + livePollTimeoutMs: 30, + replayCursorStore: store, + }); + + const res = await client.streamRuns( + `http://localhost:3030/realtime/v1/runs?offset=${FLOOR_MS}_1&live=true&handle=runs_${FLOOR_MS}_7`, + ENV, + { tags: ["t"] }, + CURRENT_API_VERSION, + undefined, + "1.0.0" + ); + + expect(res.status).toBe(200); + 
expect(gets).toEqual([`env_1:runs_${FLOOR_MS}_7`]); + expect(registerSpy).toHaveBeenCalledWith( + "env_1", + expect.objectContaining({ kind: "tag" }), + expect.anything(), + { replaySinceMs: cursorMs } + ); + // The backstop's up-to-date response stamps the cursor for the next poll. + expect(sets.length).toBe(1); + expect(sets[0][0]).toBe(`env_1:runs_${FLOOR_MS}_7`); + expect(sets[0][1]).toBeGreaterThanOrEqual(cursorMs); + }); +}); diff --git a/apps/webapp/test/realtime/runChangeNotifier.test.ts b/apps/webapp/test/realtime/runChangeNotifier.test.ts new file mode 100644 index 00000000000..96d7fd56a45 --- /dev/null +++ b/apps/webapp/test/realtime/runChangeNotifier.test.ts @@ -0,0 +1,172 @@ +import { redisTest } from "@internal/testcontainers"; +import { setTimeout as sleep } from "node:timers/promises"; +import { describe, expect, it, vi } from "vitest"; +import { + type ChangeRecord, + decodeChangeRecord, + encodeChangeRecord, + RunChangeNotifier, +} from "~/services/realtime/runChangeNotifier.server"; + +function toRedisOptions(redisOptions: { host?: string; port?: number; password?: string }) { + return { + host: redisOptions.host, + port: redisOptions.port, + password: redisOptions.password, + tlsDisabled: true, + clusterMode: false, + }; +} + +// Time for a SUBSCRIBE to register server-side before we publish. 
+const SUBSCRIBE_SETTLE_MS = 250; + +describe("RunChangeNotifier", () => { + redisTest( + "delivers a published change to an env subscriber", + { timeout: 30_000 }, + async ({ redisOptions }) => { + const notifier = new RunChangeNotifier({ redis: toRedisOptions(redisOptions) }); + try { + const received: ChangeRecord[] = []; + const unsubscribe = notifier.subscribeToEnv("env_1", (records) => received.push(...records)); + expect(notifier.activeSubscriptionCount).toBe(1); + + await sleep(SUBSCRIBE_SETTLE_MS); + notifier.publish({ runId: "run_1", envId: "env_1", tags: ["a"], batchId: "batch_1" }); + + await vi.waitFor(() => expect(received.some((r) => r.runId === "run_1")).toBe(true), { + timeout: 5_000, + interval: 50, + }); + const got = received.find((r) => r.runId === "run_1")!; + expect(got.tags).toEqual(["a"]); + expect(got.batchId).toBe("batch_1"); + + unsubscribe(); + // Cleanup is deferred until Redis confirms UNSUBSCRIBE, so the count converges to 0. + await vi.waitFor(() => expect(notifier.activeSubscriptionCount).toBe(0), { + timeout: 5_000, + interval: 50, + }); + } finally { + await notifier.quit(); + } + } + ); + + redisTest( + "does not deliver a change for a different env", + { timeout: 30_000 }, + async ({ redisOptions }) => { + const notifier = new RunChangeNotifier({ redis: toRedisOptions(redisOptions) }); + try { + const received: ChangeRecord[] = []; + notifier.subscribeToEnv("env_a", (records) => received.push(...records)); + + await sleep(SUBSCRIBE_SETTLE_MS); + notifier.publish({ runId: "run_1", envId: "env_b", tags: [] }); // different env + await sleep(500); + + expect(received).toHaveLength(0); + } finally { + await notifier.quit(); + } + } + ); + + redisTest( + "coalesces a burst of env publishes into far fewer batches than publishes (lossless)", + { timeout: 30_000 }, + async ({ redisOptions }) => { + const notifier = new RunChangeNotifier({ + redis: toRedisOptions(redisOptions), + envWakeCoalesceWindowMs: 100, + }); + try { + let batches 
= 0; + const runIds = new Set(); + notifier.subscribeToEnv("env_burst", (records) => { + batches++; + for (const r of records) runIds.add(r.runId); + }); + + await sleep(SUBSCRIBE_SETTLE_MS); + let pubs = 0; + const end = Date.now() + 1_000; + while (Date.now() < end) { + notifier.publish({ runId: `r${pubs++}`, envId: "env_burst", tags: [] }); + await sleep(5); + } + await sleep(300); + + expect(pubs).toBeGreaterThan(100); + expect(batches).toBeGreaterThanOrEqual(1); + // Leading-edge throttle: far fewer deliveries than publishes... + expect(batches).toBeLessThan(pubs / 4); + // ...but lossless — the batch accumulates every run that changed in the window. + expect(runIds.size).toBeGreaterThan(pubs / 2); + } finally { + await notifier.quit(); + } + } + ); + + // Sharded pub/sub (SSUBSCRIBE/SPUBLISH/smessage) wiring — validated end to end on a + // single node (Redis 7.2 accepts these and delivers same-node). Multi-shard ROUTING + // needs a real cluster (the cluster fixture covers that); this proves the command path. 
+ redisTest( + "delivers via sharded pub/sub on the env channel", + { timeout: 30_000 }, + async ({ redisOptions }) => { + const notifier = new RunChangeNotifier({ + redis: toRedisOptions(redisOptions), + shardedPubSub: true, + }); + try { + const received: ChangeRecord[] = []; + notifier.subscribeToEnv("env_sharded", (records) => received.push(...records)); + + await sleep(SUBSCRIBE_SETTLE_MS); + notifier.publish({ runId: "run_1", envId: "env_sharded", tags: ["a"] }); + + await vi.waitFor(() => expect(received.some((r) => r.runId === "run_1")).toBe(true), { + timeout: 5_000, + interval: 50, + }); + } finally { + await notifier.quit(); + } + } + ); + + describe("ChangeRecord codec", () => { + it("round-trips a full record (tags with a separator survive)", () => { + const encoded = encodeChangeRecord({ + v: 1, + runId: "run_1", + envId: "env_1", + tags: ["a", "b,c"], + batchId: "batch_1", + }); + expect(decodeChangeRecord(encoded)).toMatchObject({ + v: 1, + runId: "run_1", + envId: "env_1", + tags: ["a", "b,c"], + batchId: "batch_1", + }); + }); + + it("decodes a bare runId to a partial record (tags undefined)", () => { + // A bare/legacy frame: the consumer falls back to hydrate-to-classify. 
+ const decoded = decodeChangeRecord("run_3"); + expect(decoded.runId).toBe("run_3"); + expect(decoded.tags).toBeUndefined(); + }); + + it("falls back to a bare runId on an unparseable message", () => { + expect(decodeChangeRecord("{not json").runId).toBe("{not json"); + }); + }); +}); diff --git a/apps/webapp/test/realtime/runReaderProjection.test.ts b/apps/webapp/test/realtime/runReaderProjection.test.ts new file mode 100644 index 00000000000..07aebf92589 --- /dev/null +++ b/apps/webapp/test/realtime/runReaderProjection.test.ts @@ -0,0 +1,75 @@ +import { describe, expect, it, vi } from "vitest"; +import { buildHydratorSelect, RunHydrator } from "~/services/realtime/runReader.server"; + +describe("buildHydratorSelect", () => { + it("returns the full select when nothing is skipped", () => { + const select = buildHydratorSelect([]); + expect(select.id).toBe(true); + expect(select.payload).toBe(true); + expect(select.output).toBe(true); + expect(select.metadata).toBe(true); + expect(select.error).toBe(true); + }); + + it("keeps protocol-reserved columns even when asked to skip them", () => { + // Reserved columns are always emitted by the serializer, so hydration must keep + // them regardless of skipColumns or the output is null/incorrect. + const select = buildHydratorSelect([ + "status", + "taskIdentifier", + "createdAt", + "friendlyId", + "payload", + ]); + expect(select.status).toBe(true); + expect(select.taskIdentifier).toBe(true); + expect(select.createdAt).toBe(true); + expect(select.friendlyId).toBe(true); + // A non-reserved skipped column is still dropped. 
+ expect(select.payload).toBeUndefined(); + }); + + it("drops skipped columns but always keeps id + updatedAt", () => { + const select = buildHydratorSelect(["payload", "output", "metadata", "error"]); + expect(select.payload).toBeUndefined(); + expect(select.output).toBeUndefined(); + expect(select.metadata).toBeUndefined(); + expect(select.error).toBeUndefined(); + // Needed internally regardless of skipColumns (keys the row, drives the diff/offset). + expect(select.id).toBe(true); + expect(select.updatedAt).toBe(true); + // A non-skipped column survives. + expect(select.status).toBe(true); + }); +}); + +describe("RunHydrator.hydrateByIds column projection", () => { + function makeHydrator() { + let capturedSelect: Record | undefined; + const replica = { + taskRun: { + findMany: vi.fn(async ({ select }: { select: Record }) => { + capturedSelect = select; + return []; + }), + }, + } as any; + return { hydrator: new RunHydrator({ replica }), getSelect: () => capturedSelect }; + } + + it("projects the SELECT by skipColumns", async () => { + const { hydrator, getSelect } = makeHydrator(); + await hydrator.hydrateByIds("env_1", ["run_1"], ["payload", "output"]); + const select = getSelect()!; + expect(select.payload).toBeUndefined(); + expect(select.output).toBeUndefined(); + expect(select.id).toBe(true); + expect(select.updatedAt).toBe(true); + }); + + it("selects the full column set when no skipColumns are given", async () => { + const { hydrator, getSelect } = makeHydrator(); + await hydrator.hydrateByIds("env_1", ["run_1"]); + expect(getSelect()!.payload).toBe(true); + }); +}); diff --git a/apps/webapp/test/realtime/shadowCompare.test.ts b/apps/webapp/test/realtime/shadowCompare.test.ts new file mode 100644 index 00000000000..0d5f431f0bf --- /dev/null +++ b/apps/webapp/test/realtime/shadowCompare.test.ts @@ -0,0 +1,216 @@ +import { + type RealtimeRunRow, + serializeRunRow, +} from "~/services/realtime/electricStreamProtocol.server"; +import { type RunListFilter } 
from "~/services/realtime/runReader.server"; +import { RealtimeShadowComparator } from "~/services/realtime/shadowCompare.server"; +import { describe, expect, it } from "vitest"; + +function sampleRow(overrides: Partial = {}): RealtimeRunRow { + return { + id: "run_a", + taskIdentifier: "my-task", + createdAt: new Date("2026-06-07T09:00:00.000Z"), + updatedAt: new Date("2026-06-07T10:05:30.123Z"), + startedAt: null, + delayUntil: null, + queuedAt: null, + expiredAt: null, + completedAt: null, + friendlyId: "run_friendly_a", + number: 7, + isTest: true, + status: "EXECUTING", + usageDurationMs: 1234, + costInCents: 0.55, + baseCostInCents: 0.25, + ttl: "1h", + payload: '{"hello":"world"}', + payloadType: "application/json", + metadata: null, + metadataType: "application/json", + output: null, + outputType: "application/json", + runTags: ["a", "b"], + error: null, + realtimeStreams: [], + ...overrides, + }; +} + +const UP_TO_DATE = { headers: { control: "up-to-date" } }; + +function insert(value: Record) { + return { key: `"public"."TaskRun"/"${value.id}"`, value, headers: { operation: "insert" } }; +} + +function makeComparator( + rowsById: Record, + resolvedIds: string[] = [] +) { + return new RealtimeShadowComparator({ + runReader: { + getRunById: async (_env: string, id: string) => rowsById[id] ?? 
null, + hydrateByIds: async (_env: string, ids: string[]) => + ids.map((id) => rowsById[id]).filter((row): row is RealtimeRunRow => Boolean(row)), + } as any, + runListResolver: { resolveMatchingRunIds: async (_f: RunListFilter) => resolvedIds } as any, + }); +} + +describe("RealtimeShadowComparator serialization", () => { + it("counts a faithful re-serialization as a match", async () => { + const row = sampleRow(); + const body = JSON.stringify([insert(serializeRunRow(row)), UP_TO_DATE]); + const cmp = makeComparator({ run_a: row }); + + const out = await cmp.compare({ + feed: "run", + electricBody: body, + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + }); + + expect(out.serializationMatched).toBe(1); + expect(out.serializationDiverged).toBe(0); + expect(out.serializationSkew).toBe(0); + expect(out.diffs).toEqual([]); + }); + + it("does not flag semantically-equivalent but differently-encoded values", async () => { + const row = sampleRow(); + // Electric encodes bool as "true" (native uses "t"), a number with a trailing + // zero, and a timestamp without millis — all equal after decoding. 
+ const value = { + ...serializeRunRow(row), + isTest: "true", + costInCents: "0.5500", + createdAt: "2026-06-07T09:00:00", + }; + const body = JSON.stringify([insert(value), UP_TO_DATE]); + const cmp = makeComparator({ run_a: row }); + + const out = await cmp.compare({ + feed: "run", + electricBody: body, + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + }); + + expect(out.serializationMatched).toBe(1); + expect(out.serializationDiverged).toBe(0); + }); + + it("flags a genuine column divergence (same version)", async () => { + const row = sampleRow(); + const value = { ...serializeRunRow(row), payload: '{"hello":"TAMPERED"}' }; + const body = JSON.stringify([insert(value), UP_TO_DATE]); + const cmp = makeComparator({ run_a: row }); + + const out = await cmp.compare({ + feed: "run", + electricBody: body, + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + }); + + expect(out.serializationDiverged).toBe(1); + expect(out.serializationMatched).toBe(0); + expect(out.diffs).toEqual([ + { runId: "run_a", column: "payload", electric: '{"hello":"TAMPERED"}', native: '{"hello":"world"}' }, + ]); + }); + + it("treats DEQUEUED/EXECUTING as equivalent (legacy status rewrite)", async () => { + const row = sampleRow({ status: "EXECUTING" }); + const value = { ...serializeRunRow(row), status: "DEQUEUED" }; + const body = JSON.stringify([insert(value), UP_TO_DATE]); + const cmp = makeComparator({ run_a: row }); + + const out = await cmp.compare({ + feed: "run", + electricBody: body, + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + }); + + expect(out.serializationDiverged).toBe(0); + expect(out.serializationMatched).toBe(1); + }); + + it("records skew when the row advanced between emit and refetch", async () => { + const row = sampleRow(); + // Electric emitted an older version; the refetched row is newer. 
+ const value = { ...serializeRunRow(sampleRow({ updatedAt: new Date("2026-06-07T10:00:00.000Z") })) }; + const body = JSON.stringify([insert(value), UP_TO_DATE]); + const cmp = makeComparator({ run_a: row }); + + const out = await cmp.compare({ + feed: "run", + electricBody: body, + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + }); + + expect(out.serializationSkew).toBe(1); + expect(out.serializationMatched).toBe(0); + expect(out.serializationDiverged).toBe(0); + }); +}); + +describe("RealtimeShadowComparator membership", () => { + const filter: RunListFilter = { + organizationId: "org_1", + projectId: "proj_1", + environmentId: "env_1", + tags: ["t"], + createdAtAfter: new Date("2026-06-06T00:00:00.000Z"), + limit: 1000, + }; + + function bodyFor(ids: string[]) { + const msgs = ids.map((id) => insert(serializeRunRow(sampleRow({ id })))); + return JSON.stringify([...msgs, UP_TO_DATE]); + } + + it("matches when Electric's set equals the native resolver's set", async () => { + const cmp = makeComparator( + { a: sampleRow({ id: "a" }), b: sampleRow({ id: "b" }) }, + ["a", "b"] + ); + const out = await cmp.compare({ + feed: "runs", + electricBody: bodyFor(["a", "b"]), + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + membershipFilter: filter, + }); + expect(out.membershipMatch).toBe(true); + expect(out.missingInNative).toEqual([]); + expect(out.extraInNative).toEqual([]); + }); + + it("reports rows missing from / extra in the native resolution", async () => { + const cmp = makeComparator( + { a: sampleRow({ id: "a" }), b: sampleRow({ id: "b" }) }, + ["a", "c"] // native missing b, has extra c + ); + const out = await cmp.compare({ + feed: "runs", + electricBody: bodyFor(["a", "b"]), + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + membershipFilter: filter, + }); + expect(out.membershipMatch).toBe(false); + expect(out.missingInNative).toEqual(["b"]); + 
expect(out.extraInNative).toEqual(["c"]); + }); +}); diff --git a/docker/config/grafana/provisioning/dashboards/realtime-native.json b/docker/config/grafana/provisioning/dashboards/realtime-native.json new file mode 100644 index 00000000000..832f2c8e320 --- /dev/null +++ b/docker/config/grafana/provisioning/dashboards/realtime-native.json @@ -0,0 +1,503 @@ +{ + "title": "Realtime Native Backend", + "uid": "realtime-native", + "tags": [ + "trigger.dev", + "realtime" + ], + "timezone": "browser", + "schemaVersion": 39, + "refresh": "10s", + "time": { + "from": "now-30m", + "to": "now" + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "annotations": { + "list": [] + }, + "templating": { + "list": [] + }, + "panels": [ + { + "id": 1, + "type": "timeseries", + "title": "Delivery lag (write \u2192 emission)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 0 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.5, sum(rate(triggerdotdev_realtime_native_delivery_lag_ms_milliseconds_bucket[$__rate_interval])) by (le))", + "legendFormat": "p50" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum(rate(triggerdotdev_realtime_native_delivery_lag_ms_milliseconds_bucket[$__rate_interval])) by (le))", + "legendFormat": "p99" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 8, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "description": "The end-to-end SLI: now minus the newest emitted row's updatedAt. A p99 approaching the ~20s backstop hold means live wakes are being missed." 
+ }, + { + "id": 2, + "type": "timeseries", + "title": "Live polls by path", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 0 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(triggerdotdev_realtime_native_live_polls_total[$__rate_interval])) by (path)", + "legendFormat": "{{path}}" + } + ], + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 8, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "description": "fast-hydrate = router wake, no ClickHouse. full-resolve = backstop. cold-resolve = fresh env subscription probed (instance hop / first poll)." + }, + { + "id": 3, + "type": "stat", + "title": "Backstop DELIVERED (should be ~0)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 0 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(triggerdotdev_realtime_native_backstops_total{result=\"delivered\"}[5m])) or vector(0)", + "legendFormat": "delivered/s" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.01 + } + ] + } + }, + "overrides": [] + }, + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "description": "A backstop that finds missed changes means the notify/replay path is leaking. Alert on sustained non-zero." 
+ }, + { + "id": 4, + "type": "timeseries", + "title": "Wakeups by reason", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 8 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(triggerdotdev_realtime_native_wakeups_total[$__rate_interval])) by (reason)", + "legendFormat": "{{reason}}" + } + ], + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 8, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "description": "notify = the architecture working. A rising timeout share with active traffic = publishes not routing." + }, + { + "id": 5, + "type": "timeseries", + "title": "Gap recovery: replays + evictions", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 8 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(triggerdotdev_realtime_native_replays_total[$__rate_interval])) by (result)", + "legendFormat": "replay {{result}}" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(triggerdotdev_realtime_native_replay_evictions_total[$__rate_interval])) by (reason)", + "legendFormat": "evict {{reason}}" + } + ], + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 8, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "description": "Replays recover records that landed between a connection's polls. 
'evict cap' = an env churns more runs than the buffer window holds \u2014 retune REPLAY_MAX_RUNS / WINDOW_MS." + }, + { + "id": 6, + "type": "timeseries", + "title": "Rows per emission (p99)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 8 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum(rate(triggerdotdev_realtime_native_emitted_rows_bucket[$__rate_interval])) by (le))", + "legendFormat": "p99 rows" + } + ], + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 8, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "description": "Deltas should be small. A fat tail means working-set / offset-floor fallbacks are re-emitting full sets." + }, + { + "id": 7, + "type": "timeseries", + "title": "Held feeds by kind", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 16 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(triggerdotdev_realtime_native_held_feeds) by (kind)", + "legendFormat": "{{kind}}" + } + ], + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 8, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "description": "Long-polls currently held \u2014 the capacity unit." 
+ }, + { + "id": 8, + "type": "timeseries", + "title": "Envs + channel subscriptions", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 16 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(triggerdotdev_realtime_native_active_envs)", + "legendFormat": "routed envs" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(triggerdotdev_realtime_notifier_active_subscriptions)", + "legendFormat": "redis channels" + } + ], + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 8, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "description": "Routed envs includes lingering subscriptions (kept alive briefly after the last feed closes)." 
+ }, + { + "id": 9, + "type": "timeseries", + "title": "Resolve health", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 16 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(triggerdotdev_realtime_native_runset_resolves_total[$__rate_interval])) by (result)", + "legendFormat": "resolve {{result}}" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(triggerdotdev_realtime_native_resolve_admission_in_use)", + "legendFormat": "gate in use" + }, + { + "refId": "C", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(triggerdotdev_realtime_native_concurrency_rejections_total[$__rate_interval]))", + "legendFormat": "429/s" + } + ], + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 8, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "description": "hit/coalesced vs miss = the single-flight cache collapsing same-filter herds. Gate in use near the limit = reconnect stampedes queueing." 
+ } + ] +} \ No newline at end of file diff --git a/docker/docker-compose.extras.yml b/docker/docker-compose.extras.yml index 4c74c2acf70..cf16272dcc5 100644 --- a/docker/docker-compose.extras.yml +++ b/docker/docker-compose.extras.yml @@ -113,7 +113,7 @@ services: - grafana-data:/var/lib/grafana - ./config/grafana/provisioning:/etc/grafana/provisioning:ro ports: - - "${GRAFANA_HOST_PORT:-3001}:3000" + - "${GRAFANA_HOST_PORT:-4001}:3000" environment: GF_SECURITY_ADMIN_USER: admin GF_SECURITY_ADMIN_PASSWORD: admin diff --git a/internal-packages/redis/package.json b/internal-packages/redis/package.json index 9c13bbf21b0..6c7d8aa2608 100644 --- a/internal-packages/redis/package.json +++ b/internal-packages/redis/package.json @@ -6,7 +6,7 @@ "types": "./src/index.ts", "type": "module", "dependencies": { - "ioredis": "^5.3.2", + "ioredis": "~5.6.0", "@trigger.dev/core": "workspace:*" }, "scripts": { diff --git a/internal-packages/run-engine/src/engine/eventBus.ts b/internal-packages/run-engine/src/engine/eventBus.ts index 2e4adeed4b1..bd29869d280 100644 --- a/internal-packages/run-engine/src/engine/eventBus.ts +++ b/internal-packages/run-engine/src/engine/eventBus.ts @@ -11,7 +11,14 @@ export type EventBusEvents = { runCreated: [ { time: Date; - runId: string; + run: { + id: string; + runTags: string[]; + batchId: string | null; + }; + environment: { + id: string; + }; }, ]; runEnqueuedAfterDelay: [ @@ -23,6 +30,8 @@ export type EventBusEvents = { queuedAt: Date; updatedAt: Date; createdAt: Date; + runTags: string[]; + batchId: string | null; }; organization: { id: string; @@ -44,6 +53,8 @@ export type EventBusEvents = { delayUntil: Date; updatedAt: Date; createdAt: Date; + runTags: string[]; + batchId: string | null; }; organization: { id: string; @@ -76,6 +87,8 @@ export type EventBusEvents = { maxDurationInSeconds?: number; maxAttempts?: number; createdAt: Date; + runTags: string[]; + batchId: string | null; }; organization: { id: string; @@ -96,6 +109,8 @@ export 
type EventBusEvents = { status: TaskRunStatus; updatedAt: Date; createdAt: Date; + runTags: string[]; + batchId: string | null; }; organization: { id?: string; @@ -119,6 +134,8 @@ export type EventBusEvents = { attemptNumber: number; baseCostInCents: number; executedAt: Date | undefined; + runTags: string[]; + batchId: string | null; }; organization: { id: string; @@ -245,6 +262,8 @@ export type EventBusEvents = { createdAt: Date; error: TaskRunError; taskEventStore?: string; + runTags: string[]; + batchId: string | null; }; organization: { id: string; diff --git a/internal-packages/run-engine/src/engine/index.ts b/internal-packages/run-engine/src/engine/index.ts index 835ff90cc48..c3e0a5c75d0 100644 --- a/internal-packages/run-engine/src/engine/index.ts +++ b/internal-packages/run-engine/src/engine/index.ts @@ -1042,7 +1042,14 @@ export class RunEngine { this.eventBus.emit("runCreated", { time: new Date(), - runId: taskRun.id, + run: { + id: taskRun.id, + runTags: taskRun.runTags, + batchId: taskRun.batchId, + }, + environment: { + id: environment.id, + }, }); return taskRun; diff --git a/internal-packages/run-engine/src/engine/systems/checkpointSystem.ts b/internal-packages/run-engine/src/engine/systems/checkpointSystem.ts index 384384fd8c7..6c66591e288 100644 --- a/internal-packages/run-engine/src/engine/systems/checkpointSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/checkpointSystem.ts @@ -147,6 +147,8 @@ export class CheckpointSystem { status: run.status, updatedAt: run.updatedAt, createdAt: run.createdAt, + runTags: run.runTags, + batchId: run.batchId, }, organization: { id: run.runtimeEnvironment.organizationId, @@ -308,6 +310,8 @@ export class CheckpointSystem { projectId: true, updatedAt: true, createdAt: true, + runTags: true, + batchId: true, }, }); @@ -326,6 +330,8 @@ export class CheckpointSystem { status: run.status, updatedAt: run.updatedAt, createdAt: run.createdAt, + runTags: run.runTags, + batchId: run.batchId, }, organization: { 
id: run.organizationId ?? undefined, diff --git a/internal-packages/run-engine/src/engine/systems/delayedRunSystem.ts b/internal-packages/run-engine/src/engine/systems/delayedRunSystem.ts index 32ab98bad6c..10c965741cf 100644 --- a/internal-packages/run-engine/src/engine/systems/delayedRunSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/delayedRunSystem.ts @@ -79,6 +79,8 @@ export class DelayedRunSystem { delayUntil: delayUntil, updatedAt: updatedRun.updatedAt, createdAt: updatedRun.createdAt, + runTags: updatedRun.runTags, + batchId: updatedRun.batchId, }, organization: { id: snapshot.organizationId, @@ -192,6 +194,8 @@ export class DelayedRunSystem { queuedAt, updatedAt: updatedRun.updatedAt, createdAt: updatedRun.createdAt, + runTags: updatedRun.runTags, + batchId: updatedRun.batchId, }, organization: { id: run.runtimeEnvironment.organizationId, diff --git a/internal-packages/run-engine/src/engine/systems/dequeueSystem.ts b/internal-packages/run-engine/src/engine/systems/dequeueSystem.ts index 3fe1ef072cf..7c811ebfdfc 100644 --- a/internal-packages/run-engine/src/engine/systems/dequeueSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/dequeueSystem.ts @@ -490,6 +490,8 @@ export class DequeueSystem { maxAttempts: lockedTaskRun.maxAttempts ?? 
undefined, updatedAt: lockedTaskRun.updatedAt, createdAt: lockedTaskRun.createdAt, + runTags: lockedTaskRun.runTags, + batchId: lockedTaskRun.batchId, }, organization: { id: orgId, @@ -751,6 +753,8 @@ export class DequeueSystem { attemptNumber: true, updatedAt: true, createdAt: true, + runTags: true, + batchId: true, runtimeEnvironment: { select: { id: true, @@ -792,6 +796,8 @@ export class DequeueSystem { status: run.status, updatedAt: run.updatedAt, createdAt: run.createdAt, + runTags: run.runTags, + batchId: run.batchId, }, organization: { id: run.runtimeEnvironment.project.organizationId, diff --git a/internal-packages/run-engine/src/engine/systems/pendingVersionSystem.ts b/internal-packages/run-engine/src/engine/systems/pendingVersionSystem.ts index 6d503012fbc..b46b857f02a 100644 --- a/internal-packages/run-engine/src/engine/systems/pendingVersionSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/pendingVersionSystem.ts @@ -163,6 +163,8 @@ export class PendingVersionSystem { status: "PENDING", updatedAt: run.updatedAt, createdAt: run.createdAt, + runTags: run.runTags, + batchId: run.batchId, }, organization: { id: backgroundWorker.runtimeEnvironment.organizationId, diff --git a/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts b/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts index 06c80f67f2c..02fd83a7a25 100644 --- a/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts @@ -520,6 +520,8 @@ export class RunAttemptSystem { attemptNumber: nextAttemptNumber, baseCostInCents: updatedRun.baseCostInCents, executedAt: updatedRun.executedAt ?? 
undefined, + runTags: updatedRun.runTags, + batchId: updatedRun.batchId, }, organization: { id: updatedRun.runtimeEnvironment.organizationId, @@ -1052,6 +1054,8 @@ export class RunAttemptSystem { error: completion.error, createdAt: run.createdAt, taskEventStore: run.taskEventStore, + runTags: run.runTags, + batchId: run.batchId, }, organization: { id: run.runtimeEnvironment.organizationId, diff --git a/internal-packages/testcontainers/package.json b/internal-packages/testcontainers/package.json index 4ea83344c34..b3ab7ce5dc4 100644 --- a/internal-packages/testcontainers/package.json +++ b/internal-packages/testcontainers/package.json @@ -16,7 +16,7 @@ "@clickhouse/client": "^1.11.1", "@opentelemetry/api": "^1.9.1", "@trigger.dev/database": "workspace:*", - "ioredis": "^5.3.2" + "ioredis": "~5.6.0" }, "devDependencies": { "@testcontainers/postgresql": "^11.14.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 782b62cf7ff..39273b2976c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -228,8 +228,8 @@ importers: specifier: ^4.0.6 version: 4.0.6 ioredis: - specifier: ^5.3.2 - version: 5.3.2 + specifier: ~5.6.0 + version: 5.6.1 p-limit: specifier: ^6.2.0 version: 6.2.0 @@ -664,8 +664,8 @@ importers: specifier: ^1.0.0 version: 1.0.0 ioredis: - specifier: ^5.3.2 - version: 5.3.2 + specifier: ~5.6.0 + version: 5.6.1 isbot: specifier: ^3.6.5 version: 3.6.5 @@ -1256,8 +1256,8 @@ importers: specifier: workspace:* version: link:../../packages/core ioredis: - specifier: ^5.3.2 - version: 5.3.2 + specifier: ~5.6.0 + version: 5.6.1 internal-packages/replication: dependencies: @@ -1404,8 +1404,8 @@ importers: specifier: workspace:* version: link:../database ioredis: - specifier: ^5.3.2 - version: 5.3.2 + specifier: ~5.6.0 + version: 5.6.1 devDependencies: '@testcontainers/postgresql': specifier: ^11.14.0 @@ -11970,8 +11970,8 @@ packages: resolution: {integrity: sha512-YFMSV91JNBOSjw1cOfw2tup6hDP7mkz+2AUV7W1L1AM6ntgI75qC1ZeFpjPGMrWp+upmBRTX2fJWQ8c7jsUWpA==} engines: {node: 
'>=14'} - ioredis@5.3.2: - resolution: {integrity: sha512-1DKMMzlIHM02eBBVOFQ1+AolGjs6+xEcM4PDL7NqOS6szq7H9jSaEkIUH6/a5Hl241LzW6JLSiAbNvTQjUupUA==} + ioredis@5.6.1: + resolution: {integrity: sha512-UxC0Yv1Y4WRJiGQxQkP0hfdL0/5/6YvdfOOClRgJ0qppSarkhneSa6UvkMkms0AkdGimSH3Ikqm+6mkMmX7vGA==} engines: {node: '>=12.22.0'} ip-address@10.0.1: @@ -30048,11 +30048,11 @@ snapshots: intl-parse-accept-language@1.0.0: {} - ioredis@5.3.2: + ioredis@5.6.1: dependencies: '@ioredis/commands': 1.2.0 cluster-key-slot: 1.1.2 - debug: 4.3.7(supports-color@10.0.0) + debug: 4.4.3(supports-color@10.0.0) denque: 2.1.0 lodash.defaults: 4.2.0 lodash.isarguments: 3.1.0 @@ -33909,7 +33909,7 @@ snapshots: send@1.1.0(supports-color@10.0.0): dependencies: - debug: 4.3.6(supports-color@10.0.0) + debug: 4.4.3(supports-color@10.0.0) destroy: 1.2.0 encodeurl: 2.0.0 escape-html: 1.0.3