diff --git a/packages/opencode/src/cli/cmd/run.ts b/packages/opencode/src/cli/cmd/run.ts index dc59c4c86..e9cef524c 100644 --- a/packages/opencode/src/cli/cmd/run.ts +++ b/packages/opencode/src/cli/cmd/run.ts @@ -10,6 +10,15 @@ import { Filesystem } from "../../util/filesystem" import { createOpencodeClient, type Message, type OpencodeClient, type ToolPart } from "@opencode-ai/sdk/v2" import { Server } from "../../server/server" import { Provider } from "../../provider/provider" +// altimate_change start — verifier-gated router (run cheap, verify, escalate) +import { Router } from "../../router/router" +import { Verifier } from "../../router/verifier" +import { Verdict } from "../../router/verdict" +import { Policy } from "../../router/policy" +import { EquivalenceVerifier } from "../../router/equivalence-verifier" +import { ReferenceResolver } from "../../router/reference" +import * as Dispatcher from "../../altimate/native/dispatcher" +// altimate_change end import { Agent } from "../../agent/agent" import { PermissionNext } from "../../permission/next" import { Tool } from "../../tool/tool" @@ -816,46 +825,53 @@ You are speaking to a non-technical business executive. Follow these rules stric process.exit(1) }) - if (args.command) { - await sdk.session.command({ - sessionID, - agent, - model: args.model, - command: args.command, - arguments: message, - variant: args.variant, - }) - } else { - const model = args.model ? Provider.parseModel(args.model) : undefined - await sdk.session.prompt({ - sessionID, - agent, - model, - variant: args.variant, - parts: [...files, { type: "text", text: message }], - ...(audienceSystem ? { system: audienceSystem } : {}), - }) - } + // altimate_change start — per-run finally cleanup. 
The verifier-gated router catches a + // thrown tier (router.ts) and escalates to the next tier within the SAME process; without + // this finally, a tier whose prompt throws would leak its SIGINT/SIGTERM/beforeExit handlers + // and leave its tracer active, accumulating across tiers. Cleanup now always runs. + try { + if (args.command) { + await sdk.session.command({ + sessionID, + agent, + model: args.model, + command: args.command, + arguments: message, + variant: args.variant, + }) + } else { + const model = args.model ? Provider.parseModel(args.model) : undefined + await sdk.session.prompt({ + sessionID, + agent, + model, + variant: args.variant, + parts: [...files, { type: "text", text: message }], + ...(audienceSystem ? { system: audienceSystem } : {}), + }) + } - // Wait for the event loop to drain (breaks when session reaches idle) - await loopPromise - - // Remove crash handlers — trace will be finalized cleanly - process.removeListener("SIGINT", onSigint) - process.removeListener("SIGTERM", onSigterm) - process.removeListener("beforeExit", onBeforeExit) - - // Finalize trace and save to disk - if (tracer) { - Tracer.setActive(null) - const tracePath = await tracer.endTrace(error) - if (tracePath) { - emit("trace_saved", { path: tracePath }) - if (args.format !== "json" && process.stdout.isTTY) { - UI.println(UI.Style.TEXT_DIM + `Trace saved: ${tracePath}` + UI.Style.TEXT_NORMAL) + // Wait for the event loop to drain (breaks when session reaches idle) + await loopPromise + } finally { + // Remove crash handlers — trace will be finalized cleanly + process.removeListener("SIGINT", onSigint) + process.removeListener("SIGTERM", onSigterm) + process.removeListener("beforeExit", onBeforeExit) + + // Finalize trace and save to disk (with `error` if the run failed) + if (tracer) { + Tracer.setActive(null) + const tracePath = await tracer.endTrace(error) + if (tracePath) { + emit("trace_saved", { path: tracePath }) + if (args.format !== "json" && 
process.stdout.isTTY) { + UI.println(UI.Style.TEXT_DIM + `Trace saved: ${tracePath}` + UI.Style.TEXT_NORMAL) + } } } } + // altimate_change end // Write accumulated text output to file if --output was specified if (args.output) { @@ -864,8 +880,190 @@ You are speaking to a non-technical business executive. Follow these rules stric await Bun.write(outputPath, content) process.stderr.write(`\n✓ Output saved to: ${outputPath}\n`) } + + // altimate_change start — expose the session id so the router can reuse one session + // across tiers (escalation continues the same session instead of starting fresh). + return sessionID + // altimate_change end + } + + // altimate_change start — verifier-gated router orchestration + // Deterministic-verify the dbt workspace in cwd (`dbt build`, judged by Verifier). + // Only gates real dbt projects; with nothing to prove it returns ok (no escalation). + async function verifyWorkspace(): Promise { + const root = process.cwd() + if (!(await Filesystem.exists(path.join(root, "dbt_project.yml")))) + return { + ok: true, + unverifiable: true, + strength: Verifier.Strength.UNVERIFIABLE, + decision: Verifier.Decision.OK, + reason: "no dbt project to verify", + checks: [], + } + + // Reference-free gate: `dbt build` in `dir`, judged by Verifier. Used directly (default) + // and as the fallback for the equivalence verifier (greenfield / undecidable). + const buildVerify = async (dir: string): Promise => { + try { + const proc = Bun.spawn(["dbt", "build"], { cwd: dir, stdout: "pipe", stderr: "pipe" }) + // Hard timeout so a hung dbt (lock, prompt, runaway query) can't stall the run. 
+ let timedOut = false + const timer = setTimeout(() => { + timedOut = true + proc.kill() + }, 300_000) + const out = (await new Response(proc.stdout).text()) + (await new Response(proc.stderr).text()) + const code = await proc.exited + clearTimeout(timer) + if (timedOut) + return { + ok: false, + strength: Verifier.Strength.BUILD, + decision: Verifier.Decision.FAILED, + reason: "dbt build timed out after 300s", + checks: [{ name: "dbt build", ok: false, detail: "timed out after 300s" }], + } + return Verifier.fromDbt(out, code) + } catch (e) { + // dbt binary missing / spawn failure → can't verify; mark unverifiable (fail-open, but honest). + return { + ok: true, + unverifiable: true, + strength: Verifier.Strength.UNVERIFIABLE, + decision: Verifier.Decision.OK, + reason: `verify skipped: ${String(e)}`, + checks: [], + } + } + } + + // EXPERIMENTAL (flag-gated, default off): equivalence-backed verification in the + // reference-available regime — proven-equivalent vs the model's base version. Always + // falls back to `buildVerify` on greenfield / undecidable / any error, so it can never + // be less safe than the build gate. Value is gated on altimate-core dialect + schema + // coverage (altimate-core-internal #128 / #130); ships dormant until those land. 
+ if (process.env["ALTIMATE_ROUTER_EQUIVALENCE"] === "1") { + try { + const exec: ReferenceResolver.Exec = async (cmd, args, cwd) => { + const p = Bun.spawn([cmd, ...args], { cwd, stdout: "pipe", stderr: "pipe" }) + const stdout = await new Response(p.stdout).text() + return { stdout, code: await p.exited } + } + const readCompiled = async (dir: string): Promise> => { + const { readdir } = await import("node:fs/promises") + const map = new Map() + const baseDir = path.join(dir, "target", "compiled") + if (!(await Filesystem.exists(baseDir))) return map + const walk = async (d: string) => { + for (const e of await readdir(d, { withFileTypes: true })) { + const fp = path.join(d, e.name) + if (e.isDirectory()) await walk(fp) + else if (e.name.endsWith(".sql")) map.set(e.name.replace(/\.sql$/, ""), await Bun.file(fp).text()) + } + } + await walk(baseDir) + return map + } + const checkoutBase = async (workdir: string, ref: string) => { + const dir = path.join("/tmp", `altimate-base-${Date.now()}`) + await exec("git", ["worktree", "add", "--detach", dir, ref], workdir) + return { + dir, + cleanup: async () => { + await exec("git", ["worktree", "remove", "--force", dir], workdir) + }, + } + } + const deps = ReferenceResolver.gitDbtDeps(exec, { + readCompiled, + // Best-effort: empty schema ⇒ the engine abstains on table refs ⇒ build fallback. + // A warehouse schema resolver lands with the dialect coverage work. + buildSchema: async () => undefined, + checkoutBase, + }) + const check: EquivalenceVerifier.CheckEquivalence = async (head, base, schema) => { + const r = await Dispatcher.call("altimate_core.equivalence", { + sql1: head, + sql2: base, + schema_context: schema as Record | undefined, + }) + const d = ((r as { data?: Record }).data ?? 
{}) as { + equivalent?: boolean + validation_errors?: string[] + differences?: { severity?: string; description?: string }[] + confidence?: number + } + return { + equivalent: !!d.equivalent, + validation_errors: d.validation_errors ?? [], + differences: d.differences ?? [], + confidence: d.confidence, + } + } + return await EquivalenceVerifier.create(check, ReferenceResolver.create(deps), { + verify: buildVerify, + }).verify(root) + } catch { + return buildVerify(root) // the experimental path must never break the run + } + } + + return buildVerify(root) } + // Run the tier ladder: cheap → verify → escalate with failing-check context, stop at first pass. + // Each tier re-invokes the existing single-run path with that model (and the escalation note + // prepended) in the SAME workspace, so a later tier fixes the prior attempt rather than restarting. + async function runRouted(sdk: OpencodeClient) { + // Only route when the workspace is verifiable. Without a deterministic gate, routing + // would accept the cheapest tier with no way to verify or escalate — silently + // downgrading quality. In a non-dbt project, run once with the user's model instead. + if (!(await Filesystem.exists(path.join(process.cwd(), "dbt_project.yml")))) { + await execute(sdk) + return + } + const baseMessage = message + const originalModel = args.model + const originalSession = args.session + // Reuse ONE session across tiers: tier-1 creates it; escalation tiers continue the + // same session so the stronger model sees the prior attempt + the failing-check note, + // rather than starting cold. Captured from execute()'s returned session id. + let sharedSessionID: string | undefined + const policy = Policy.resolve() + const tiers = await policy.tiers({ prompt: baseMessage }) + let result + try { + result = await Router.route({ + tiers, + runAgent: async (model, note) => { + args.model = model + message = note ? 
`${note}\n\n${baseMessage}` : baseMessage + if (sharedSessionID) args.session = sharedSessionID // continue tier-1's session + const sid = await execute(sdk) + if (sid && !sharedSessionID) sharedSessionID = sid // capture tier-1's session + }, + verify: verifyWorkspace, + }) + } finally { + // Always restore the mutated request state, even if a tier throws — otherwise + // `message`/`args.model`/`args.session` leak the last tier's state to any + // downstream logging/telemetry/retry. + message = baseMessage + args.model = originalModel + args.session = originalSession + } + const envelope = Verdict.build(result, { now: new Date().toISOString() }) + if (args.format === "json") { + process.stdout.write(JSON.stringify({ type: "verdict", timestamp: Date.now(), ...envelope }) + EOL) + } else { + const tag = envelope.solved ? `✓ verified by ${envelope.solvedBy}` : "✗ unverified after all tiers" + UI.println(UI.Style.TEXT_INFO_BOLD + `~ router: ${tag} (policy: ${policy.source})`) + } + await Policy.reportOutcome(envelope) + } + // altimate_change end + if (args.attach) { const headers = (() => { const password = args.password ?? process.env.OPENCODE_SERVER_PASSWORD @@ -875,7 +1073,11 @@ You are speaking to a non-technical business executive. Follow these rules stric return { Authorization: auth } })() const sdk = createOpencodeClient({ baseUrl: args.attach, directory, headers }) - return await execute(sdk) + // altimate_change start — route when enabled, else single run + if (Router.enabled()) await runRouted(sdk) + else await execute(sdk) // discard execute()'s returned session id (handler returns void) + return + // altimate_change end } await bootstrap(process.cwd(), async () => { @@ -884,7 +1086,10 @@ You are speaking to a non-technical business executive. 
Follow these rules stric return Server.Default().fetch(request) }) as typeof globalThis.fetch const sdk = createOpencodeClient({ baseUrl: "http://altimate-code.internal", fetch: fetchFn }) - await execute(sdk) + // altimate_change start — route when enabled, else single run + if (Router.enabled()) await runRouted(sdk) + else await execute(sdk) + // altimate_change end }) }, }) diff --git a/packages/opencode/src/router/README.md b/packages/opencode/src/router/README.md new file mode 100644 index 000000000..bb6b96788 --- /dev/null +++ b/packages/opencode/src/router/README.md @@ -0,0 +1,90 @@ +# Verifier-gated router + +Run a cheap model first, verify the result deterministically, and escalate to a +stronger model only when verification fails. Most runs finish at the cheap tier; +the rest get a stronger attempt that receives the exact failing checks as context. +Flag-gated (`ALTIMATE_ROUTER`), default off — the normal single-model path is unchanged. + +## Modules (pure, unit-tested) +- **`verifier.ts`** — `Verifier`: a deterministic `Verdict` from `dbt build`/`dbt test` + output (`fromDbt`, `parseDbtSummary`, `failingNodes`). Every verdict carries a + **`Strength`** (`UNVERIFIABLE < BUILD < DBT_TEST < EQUIVALENCE`) and a **`Decision`** + (`OK | PROVEN_DIFFERENT | UNDECIDABLE | FAILED`) so consumers know *how strongly* a + result was proven, not just pass/fail. `Impl` is the pluggable verifier interface; the + default `dbtVerifier(run)` shells dbt (runner injected, fail-open). `fromEquivalence` + folds per-model equivalence results soundly. `ALLOW_ALL` passes everything (ungated). +- **`equivalence-verifier.ts`** — `EquivalenceVerifier`: an optional, stronger `Impl` for + the *reference-available* regime (editing an existing model) — compares base↔head + compiled SQL via the altimate-core equivalence engine. **Not wired into the default run + path in v1** (see "What v1 verifies"); it ships dormant behind the dbt build verifier. 
+- **`reference.ts`** — `ReferenceResolver`: produces the base↔head compiled-SQL pairs the + equivalence verifier needs (all git/dbt-compile/schema IO injected → unit-tested). Returns + `null` for greenfield (no base → build-fallback). Dormant by default alongside `equivalence-verifier`: + the production git+dbt-backed `Deps` (`gitDbtDeps`) and the `verifyWorkspace` switch are wired in behind + `ALTIMATE_ROUTER_EQUIVALENCE=1` (default off), pending broader warehouse-dialect coverage in altimate-core + (equivalence currently abstains on dialect functions like duckdb `STRFTIME`). +- **`router.ts`** — `Router`: the escalation mechanism. `route({tiers, runAgent, verify})` + runs each tier, verifies, escalates on a failed verdict with the failing checks + (`escalationContext`), stops at the first pass. `shouldEscalate` is **decision-aware**: + it escalates on `FAILED`/`PROVEN_DIFFERENT` but **never on `UNDECIDABLE`** (a stronger + model can't make an undecidable query decidable). `DEFAULT_LADDER` is ordered + cheapest → strongest; override via `ALTIMATE_ROUTER_LADDER`. +- **`policy.ts`** — `Policy`: where the ladder comes from. `STATIC` is the built-in + default; `altimate(key)` fetches a per-context ladder from the altimate API when + `ALTIMATE_API_KEY` is set (degrades to static on any failure); `resolve()` picks + between them; `reportOutcome()` posts each run's verdict envelope back (key-gated, best-effort). + `sanitizeTiers` validates + caps any ladder from the API. +- **`verdict.ts`** — `Verdict.Envelope` (schemaVersion 2): a machine-checkable record of the + result (accepted tier, `strength` + `decision`, per-attempt history, checks, evidence + hash, timestamp, optional signature). + +## What v1 verifies (read before enabling) +v1 ships the **dbt build** verifier: a verdict is `OK` at **`BUILD`** strength when +`dbt build` exits 0 with no errors. 
That proves the output **compiles and the project's +own tests pass — it does NOT prove value-correctness.** The envelope is honest about this: +the `strength` field says `BUILD`, not `EQUIVALENCE`. Treat the receipt as +"build-verified", not "proven equivalent". The `EQUIVALENCE`-strength path +(`equivalence-verifier.ts`) is experimental: it is wired in behind `ALTIMATE_ROUTER_EQUIVALENCE=1` (default off) and falls back to the build gate whenever it cannot decide; +its value is gated on broader warehouse-dialect coverage in altimate-core (decidability). + +## When to enable +Enable when the **tier-1 model is a strong cheap model** (the default `deepseek-v4-flash` +benchmarks at parity with frontier on dbt tasks). With a strong tier-1, escalation fires +rarely (only on a genuine build failure), so the router is economically favorable. With a +*weak* tier-1, escalation fires constantly and can cost as much as, or more than, just using the strong +model — don't do that. The router is a **model-selection + verify** tool first, an +escalation ladder second. + +## Default ladder rationale +`deepseek-v4-flash → glm-5.1 → claude-opus-4.8`. Tier-1 is a validated strong-cheap model. +Benchmarking (N=10 dbt tasks) found tier-2 (`glm-5.1`) quality-redundant with tier-1, but +it is retained as a **failover / data-governance substitute** slot pending a larger, adequately powered +tiering study; the final tier is a frontier model for genuine build failures. Override the +whole ladder with `ALTIMATE_ROUTER_LADDER`. + +## Configuration +- `ALTIMATE_ROUTER=1` — enable routing (default off). With routing on, `ALTIMATE_ROUTER_EQUIVALENCE=1` also enables the experimental equivalence verifier (default off; falls back to `dbt build`). +- `ALTIMATE_ROUTER_LADDER` — comma-separated `provider/model` ids to override the default ladder. +- `ALTIMATE_API_KEY` / `ALTIMATE_API_URL` — use the altimate API for the routing policy + and outcome reporting instead of the static ladder. + +## Integration +`src/cli/cmd/run.ts` (`RunCommand`): when `Router.enabled()`, the run resolves a policy, +runs each tier by re-invoking the existing run path with that model (escalation note +prepended) in the same workspace, verifies with `dbt build` between tiers, and emits a +verdict envelope. Workspaces without a `dbt_project.yml` are not routed: the run executes once with the user's model. 
The default (non-router) path keeps its behavior: the only changes there are that signal-handler/trace cleanup now runs in a `finally`, and `execute()` returns the session id. + +## Tests +- **Unit** — `test/router/{verifier,router,verdict,policy,verdict-strength,equivalence-verifier}.test.ts`. + Pure logic, incl. adversarial cases (dbt summary-line injection, ANSI/huge/multi-summary + output, endpoint response validation/capping), the tri-state strength/decision contract, + and the equivalence verifier's sound fallback (undecidable → build/test, never silent pass). +- **E2E** (`test/router/*.e2e.test.ts`, env-gated — require docker + a dbt image + + network, excluded from default CI): + - `verifier.e2e` — real `dbt build` (pass / compile-error / failing-test) and that a + model emitting a fake summary does not change the verdict. `E2E_IMG=… bun test verifier.e2e`. + - `router.e2e` — real model calls + real dbt: cheap tier solves; an unsatisfiable + workspace escalates through tiers, caps, and threads failing-check context. + `OPENROUTER_API_KEY=… E2E_IMG=… bun test router.e2e`. + - `policy.e2e` — real network: live local server (incl. error/malformed/oversized + responses) and an unreachable endpoint, all degrade gracefully. `bun test policy.e2e`. diff --git a/packages/opencode/src/router/equivalence-verifier.ts b/packages/opencode/src/router/equivalence-verifier.ts new file mode 100644 index 000000000..bd854a580 --- /dev/null +++ b/packages/opencode/src/router/equivalence-verifier.ts @@ -0,0 +1,102 @@ +/** + * Equivalence-backed verifier (reference-available regime). + * + * For a change to an EXISTING model, the strongest deterministic signal is not + * "does it build" but "is the new SQL semantically equivalent to the prior version". + * This Impl resolves a reference (base) compiled SQL per touched model, compares it + * to the head compiled SQL via the altimate-core equivalence engine, and folds the + * per-model results into one Verdict (see `Verifier.fromEquivalence`). 
+ * + * Soundness is preserved end-to-end: the engine never reports false-equivalence, and + * an undecidable result (validation errors / unsupported dialect) maps to UNDECIDABLE, + * which the router does NOT escalate on — the caller falls back to build/test. A + * stronger model cannot make an undecidable query decidable. + * + * Both the equivalence call and the reference resolution are injected, so this is + * fully unit-testable without the native engine, dbt, or git. + */ +import { Verifier } from "./verifier" + +export namespace EquivalenceVerifier { + /** One model's base→head SQL pair plus the schema needed to resolve refs. */ + export interface Pair { + model: string + baseSql: string + headSql: string + /** Opaque schema handle passed through to the engine (e.g. altimate-core Schema). */ + schema?: unknown + } + + /** + * Resolves the comparison inputs for the touched models in a workspace. + * Returns null when there is NO reference (greenfield) — the caller then uses the + * build/test verifier instead. Returns [] when a reference regime applies but no + * models were touched (treated as nothing-to-verify). + */ + export interface ReferenceResolver { + resolve(workdir: string): Promise + } + + /** The raw equivalence call (native `altimate_core.equivalence`), injected for testability. */ + export type CheckEquivalence = ( + headSql: string, + baseSql: string, + schema: unknown, + ) => Promise + + /** + * Build an Impl. `check` performs one equivalence comparison; `resolver` provides the + * base/head pairs. `fallback` (typically the dbt build verifier) is used when there is + * no reference (greenfield) or when the engine is undecidable — never a silent pass. 
+ */ + export function create( + check: CheckEquivalence, + resolver: ReferenceResolver, + fallback: Verifier.Impl, + ): Verifier.Impl { + return { + async verify(workdir: string): Promise { + let pairs: Awaited> + try { + pairs = await resolver.resolve(workdir) + } catch (e) { + // Can't resolve a reference → degrade to the build/test verifier (honest). + return fallback.verify(workdir) + } + // Greenfield (no reference): equivalence is not applicable. + if (pairs === null) return fallback.verify(workdir) + + const results: { model: string; result: Verifier.EquivalenceResult }[] = [] + for (const p of pairs) { + try { + results.push({ model: p.model, result: await check(p.headSql, p.baseSql, p.schema) }) + } catch (e) { + // A failed comparison is undecidable for that model, not "different". + results.push({ + model: p.model, + result: { equivalent: false, validation_errors: [`equivalence error: ${String(e)}`] }, + }) + } + } + const verdict = Verifier.fromEquivalence(results) + // Undecidable equivalence → fall back to the reference-free gate (build/test), + // so we never accept on an abstain alone. The DECISION must come from the + // fallback, not be blanket-stamped UNDECIDABLE: if the fallback build FAILS we + // must surface FAILED so the router escalates — stamping UNDECIDABLE here would + // swallow a real build failure (UNDECIDABLE never escalates). If it passes, the + // result is accepted at BUILD strength (the "equivalence couldn't decide" fact is + // carried by strength + } + + type Fetch = typeof globalThis.fetch + + /** Defensive cap: a bad/compromised policy endpoint must not inject a cost-bomb ladder. */ + export const MAX_TIERS = 8 + + /** + * Validate + cap a ladder returned by the policy endpoint. Keeps only entries with a + * non-empty string `model`, derives a label when missing, caps to MAX_TIERS. Returns + * null if nothing usable (caller falls back to the static ladder). 
+ */ + /** A model id must look like `provider/model[/...]` — plain chars only, no whitespace/control. */ + const MODEL_RE = /^[A-Za-z0-9._-]+(?:\/[A-Za-z0-9._-]+)+$/ + + export function sanitizeTiers(raw: unknown): Router.Tier[] | null { + if (!Array.isArray(raw)) return null + const out: Router.Tier[] = [] + for (const t of raw) { + const model = (t as any)?.model + if (typeof model !== "string") continue + const m = model.trim() + if (!m || m.length > 200 || !MODEL_RE.test(m)) continue + const rawLabel = typeof (t as any)?.label === "string" && (t as any).label ? (t as any).label : m.split("/").pop() || m + // Strip non-printable/ANSI — the label is printed to the terminal. + const label = String(rawLabel).replace(/[^\x20-\x7E]/g, "").slice(0, 100) || m + out.push({ model: m, label }) + if (out.length >= MAX_TIERS) break + } + return out.length ? out : null + } + + export function apiKey(): string | undefined { + return process.env["ALTIMATE_API_KEY"] || undefined + } + + export function baseUrl(): string { + return process.env["ALTIMATE_API_URL"] || "https://api.altimate.ai" + } + + /** Built-in default ladder (env-overridable via ALTIMATE_ROUTER_LADDER). */ + export const STATIC: RoutingPolicy = { + source: "static", + async tiers() { + return Router.ladder() + }, + } + + /** + * Customer routing policy served by the altimate API. Resolves the per-context + * ladder for this account; degrades to the static ladder if the service is + * unreachable or returns nothing usable. 
+ */ + export function altimate(key: string, base: string = baseUrl(), fetchImpl: Fetch = fetch): RoutingPolicy { + return { + source: "altimate", + async tiers(ctx: RoutingContext): Promise { + try { + const res = await fetchImpl(`${base}/v1/router/policy`, { + method: "POST", + headers: { "Content-Type": "application/json", Authorization: `Bearer ${key}` }, + body: JSON.stringify(ctx), + signal: AbortSignal.timeout(3000), + }) + if (!res.ok) return Router.ladder() + const data = (await res.json()) as { tiers?: unknown } + return sanitizeTiers(data?.tiers) ?? Router.ladder() + } catch { + return Router.ladder() + } + }, + } + } + + /** The active policy: customer policy when an altimate key is set, else the static ladder. */ + export function resolve(fetchImpl: Fetch = fetch): RoutingPolicy { + const key = apiKey() + return key ? altimate(key, baseUrl(), fetchImpl) : STATIC + } + + /** + * Report a verified outcome back to the altimate service so the customer's policy + * improves. Best-effort and key-gated — a no-op without a key, and never throws. + */ + export async function reportOutcome( + envelope: Verdict.Envelope, + base: string = baseUrl(), + fetchImpl: Fetch = fetch, + ): Promise { + const key = apiKey() + if (!key) return + try { + await fetchImpl(`${base}/v1/router/outcomes`, { + method: "POST", + headers: { "Content-Type": "application/json", Authorization: `Bearer ${key}` }, + body: JSON.stringify(envelope), + signal: AbortSignal.timeout(3000), + }) + } catch { + /* best-effort: outcome reporting must never break the run */ + } + } +} diff --git a/packages/opencode/src/router/reference.ts b/packages/opencode/src/router/reference.ts new file mode 100644 index 000000000..03058ac3b --- /dev/null +++ b/packages/opencode/src/router/reference.ts @@ -0,0 +1,118 @@ +/** + * Reference resolver for the equivalence verifier (reference-available regime). 
+ * + * To compare a changed dbt model against its prior version, we need the COMPILED SQL + * of both sides (equivalence runs on compiled SQL, never raw Jinja) plus the schema to + * resolve table/column refs. This module produces `EquivalenceVerifier.Pair[]` for the + * models a change touched, or `null` when there is no reference (greenfield — the caller + * then uses the build/test verifier). + * + * All IO (git, dbt compile, schema) is injected via `Deps`, so the orchestration is + * fully unit-testable without git/dbt. A git+dbt-backed `Deps` is the production impl. + */ +import { EquivalenceVerifier } from "./equivalence-verifier" + +export namespace ReferenceResolver { + /** "WORKING" = the current working tree; otherwise a git ref (the base/PR-target). */ + export type Ref = string + + export interface Deps { + /** The base ref to diff against (PR merge-base or HEAD~), or null when none exists (greenfield). */ + baseRef(workdir: string): Promise + /** Model names whose .sql changed vs the base. */ + changedModels(workdir: string, base: Ref): Promise + /** model -> compiled SQL at a given ref ("WORKING" or a git ref). */ + compiledSql(workdir: string, ref: Ref): Promise> + /** Opaque schema handle passed to the equivalence engine (e.g. altimate-core Schema). */ + schema(workdir: string): Promise + } + + /** Run a shell command; returns stdout + exit code. Injected so `gitDbtDeps` is testable. */ + export type Exec = (cmd: string, args: string[], cwd: string) => Promise<{ stdout: string; code: number }> + + export interface GitDbtOptions { + /** dbt binary (e.g. "dbt" or "altimate-dbt"). */ + dbt?: string + /** Read compiled model SQL after a `dbt compile` in `dir` → Map. */ + readCompiled: (dir: string) => Promise> + /** Build the engine schema for the project (best-effort; empty Schema ⇒ engine abstains → build-fallback). */ + buildSchema: (workdir: string) => Promise + /** Make an isolated checkout of `ref` for base-side compilation (e.g. 
git worktree); returns its path + a cleanup. */ + checkoutBase: (workdir: string, ref: string) => Promise<{ dir: string; cleanup: () => Promise }> + } + + /** + * Production `Deps`: git for base/changed detection, dbt to compile each side, an + * injected schema builder. All process IO goes through `exec`/`opts` so the orchestration + * is unit-tested without git/dbt. NOTE: the live path (git-worktree base compile + + * warehouse schema) is pending E2E validation — it ships behind a flag and degrades to a + * build verdict (the engine abstains without a resolvable schema / unsupported dialect). + */ + export function gitDbtDeps(exec: Exec, opts: GitDbtOptions): Deps { + const dbt = opts.dbt ?? "dbt" + return { + async baseRef(workdir) { + const r = await exec("git", ["rev-parse", "--verify", "HEAD"], workdir) + return r.code === 0 && r.stdout.trim() ? r.stdout.trim() : null // no commits ⇒ greenfield + }, + async changedModels(workdir, base) { + const r = await exec("git", ["diff", "--name-only", base, "--", "models"], workdir) + if (r.code !== 0) return [] + return r.stdout + .split("\n") + .map((l) => l.trim()) + .filter((l) => l.endsWith(".sql")) + .map((l) => l.split("/").pop()!.replace(/\.sql$/, "")) + }, + async compiledSql(workdir, ref) { + if (ref === "WORKING") { + await exec(dbt, ["compile"], workdir) + return opts.readCompiled(workdir) + } + const base = await opts.checkoutBase(workdir, ref) + try { + await exec(dbt, ["deps"], base.dir) + await exec(dbt, ["compile"], base.dir) + return await opts.readCompiled(base.dir) + } finally { + await base.cleanup() + } + }, + schema: (workdir) => opts.buildSchema(workdir), + } + } + + /** + * Build a `EquivalenceVerifier.ReferenceResolver` from injected deps. 
+ * Returns null (→ greenfield/build-fallback) when there is no base ref; returns [] when a + * base exists but no models changed; otherwise one Pair per changed model present on BOTH + * sides (a model that's new on head has no base → not equivalence-checkable, skipped here). + */ + export function create(deps: Deps): EquivalenceVerifier.ReferenceResolver { + return { + async resolve(workdir: string): Promise { + const base = await deps.baseRef(workdir) + if (base === null) return null // greenfield — no reference + + const changed = await deps.changedModels(workdir, base) + if (changed.length === 0) return [] + + const [headSql, baseSql, schema] = await Promise.all([ + deps.compiledSql(workdir, "WORKING"), + deps.compiledSql(workdir, base), + deps.schema(workdir), + ]) + + const pairs: EquivalenceVerifier.Pair[] = [] + for (const model of changed) { + const head = headSql.get(model) + const baseM = baseSql.get(model) + // Both sides must compile to a SQL string; a model new on head (no base) is not + // equivalence-checkable and is left to the build/test gate. + if (head && baseM) pairs.push({ model, baseSql: baseM, headSql: head, schema }) + } + return pairs + }, + } + } +} diff --git a/packages/opencode/src/router/router.ts b/packages/opencode/src/router/router.ts new file mode 100644 index 000000000..faecb6c37 --- /dev/null +++ b/packages/opencode/src/router/router.ts @@ -0,0 +1,130 @@ +/** + * Verifier-gated model router — the escalation ladder. + * + * Run the CHEAP tier first; verify the workspace deterministically (Verifier); + * if the verdict is not ok, escalate to the next stronger tier, handing it the + * exact failing checks so it fixes rather than restarts blind. Stop at the first + * passing verdict (or the top of the ladder). + * + * Because the cheap tier handles most tasks, escalation is rare. The default ladder + * is ordered cheapest → strongest and can be overridden per deployment. 
/**
 * Verifier-gated model router — the escalation ladder.
 *
 * Runs the cheapest tier first, verifies the workspace, and escalates to the next tier
 * (handing it the failing checks) only when the verdict is escalation-worthy. `runAgent`
 * and `verify` are injected, so the orchestration needs no live model or dbt.
 */
export namespace Router {
  /** One rung of the ladder: a provider/model id plus a short display label. */
  export interface Tier {
    model: string
    label: string
  }

  /**
   * Default ladder, ordered cheapest → strongest. A tier is only reached when the
   * previous tier's output fails verification. Override via `ALTIMATE_ROUTER_LADDER`.
   */
  export const DEFAULT_LADDER: Tier[] = [
    { model: "openrouter/deepseek/deepseek-v4-flash", label: "deepseek-v4-flash" },
    { model: "openrouter/z-ai/glm-5.1", label: "glm-5.1" },
    { model: "openrouter/anthropic/claude-opus-4.8", label: "claude-opus-4.8" },
  ]

  /** True when the router is switched on (`ALTIMATE_ROUTER=1`). */
  export function enabled(): boolean {
    return process.env["ALTIMATE_ROUTER"] === "1"
  }

  /**
   * Ladder from `ALTIMATE_ROUTER_LADDER` (comma-separated provider/model ids), or the
   * default when the variable is unset, empty, or contains no usable entry.
   *
   * @returns a fresh array of fresh tier objects; mutating it never alters `DEFAULT_LADDER`.
   */
  export function ladder(): Tier[] {
    const raw = process.env["ALTIMATE_ROUTER_LADDER"] ?? ""
    const fromEnv = raw
      .split(",")
      .map((s) => s.trim())
      .filter(Boolean)
      // label = last path segment of the model id, or the whole id when it has no slash
      .map((model) => ({ model, label: model.split("/").pop() || model }))
    // Copy the default so a caller that pushes/sorts the result cannot corrupt it.
    return fromEnv.length ? fromEnv : DEFAULT_LADDER.map((t) => ({ ...t }))
  }

  /**
   * Escalate iff the verdict is escalation-worthy AND a stronger tier remains.
   *
   * Escalates on a build/test FAILED or a PROVEN_DIFFERENT verdict, never on
   * UNDECIDABLE or OK. A verdict that carries no `decision` falls back to the legacy
   * `!ok` rule.
   *
   * @param verdict   the verdict produced for the tier at `tierIndex`.
   * @param tierIndex index of the tier that just ran.
   * @param tiers     the ladder in use.
   */
  export function shouldEscalate(verdict: Verifier.Verdict, tierIndex: number, tiers: Tier[]): boolean {
    if (tierIndex >= tiers.length - 1) return false // already at the top of the ladder
    if (verdict.decision === undefined) return !verdict.ok
    return (
      verdict.decision === Verifier.Decision.FAILED ||
      verdict.decision === Verifier.Decision.PROVEN_DIFFERENT
    )
  }

  /** The note handed to the next tier — names the exact failing checks so it fixes them. */
  export function escalationContext(prev: Tier, verdict: Verifier.Verdict): string {
    const failing = verdict.checks.filter((c) => !c.ok).map((c) => c.name)
    const lines = [
      `A previous attempt (by ${prev.label}) did not pass verification.`,
      verdict.reason ? `Verifier reason: ${verdict.reason}` : "",
      failing.length ? `Failing checks to fix: ${failing.join(", ")}.` : "",
      `The prior changes are in the workspace — fix these specific failures; do not start over.`,
    ]
    // Empty entries are optional lines that did not apply.
    return lines.filter(Boolean).join("\n")
  }

  /** One tier's run paired with the verdict it earned. */
  export interface Attempt {
    tier: Tier
    verdict: Verifier.Verdict
  }

  /** Outcome of walking the ladder. `solvedBy` is set only when `solved` is true. */
  export interface RouteResult {
    solved: boolean
    solvedBy?: Tier
    attempts: Attempt[]
  }

  /**
   * Drive the ladder: run each tier, verify, escalate with context, stop at the first ok
   * verdict.
   *
   * @param params.tiers    ladder override; defaults to `ladder()`.
   * @param params.runAgent performs the agent run for a model, optionally with the
   *                        escalation note from the previous tier.
   * @param params.verify   judges the workspace after the run.
   * @returns every attempt made, plus the solving tier when one passed. Errors thrown by
   *          `runAgent`/`verify` are recorded as failed attempts, not rethrown.
   */
  export async function route(params: {
    tiers?: Tier[]
    runAgent: (model: string, escalationNote?: string) => Promise<unknown>
    verify: () => Promise<Verifier.Verdict>
  }): Promise<RouteResult> {
    const tiers = params.tiers ?? ladder()
    const attempts: Attempt[] = []
    let note: string | undefined

    for (const [i, tier] of tiers.entries()) {
      let verdict: Verifier.Verdict
      try {
        await params.runAgent(tier.model, note)
        verdict = await params.verify()
      } catch (e) {
        // A thrown tier is a FAILED attempt so the ladder can escalate, at UNVERIFIABLE
        // strength because no gate actually judged the output.
        verdict = {
          ok: false,
          strength: Verifier.Strength.UNVERIFIABLE,
          decision: Verifier.Decision.FAILED,
          reason: `tier error: ${String(e)}`,
          checks: [],
        }
      }
      attempts.push({ tier, verdict })
      if (verdict.ok) return { solved: true, solvedBy: tier, attempts }
      if (!shouldEscalate(verdict, i, tiers)) break
      note = escalationContext(tier, verdict)
    }
    return { solved: false, attempts }
  }
}
/**
 * Fingerprint of an evidence string: a djb2-style (multiply-by-33, xor) fold over the
 * UTF-16 code units, kept in unsigned 32-bit range and rendered as 8 hex digits behind a
 * `djb2:` prefix. Deterministic and dependency-free. This is NOT a signature.
 *
 * @param s the evidence text (may be empty).
 * @returns `djb2:` followed by exactly eight lowercase hex digits.
 */
export function evidenceHash(s: string): string {
  let acc = 5381
  for (let idx = 0; idx < s.length; idx += 1) {
    // 33 * acc in 32-bit arithmetic is the same as (acc << 5) + acc modulo 2^32.
    acc = (Math.imul(acc, 33) ^ s.charCodeAt(idx)) >>> 0
  }
  const hex = acc.toString(16).padStart(8, "0")
  return `djb2:${hex}`
}
/**
 * Deterministic verifier for the verifier-gated router.
 *
 * After an agent run, a verifier inspects the workspace and returns a Verdict. The
 * default `dbtVerifier` runs `dbt build`; another judgment can be injected through
 * `Impl`. Everything here is pure parsing plus an injected command runner.
 */
export namespace Verifier {
  /** One graded check (a dbt test, a model build, or an equivalence assertion). */
  export interface Check {
    name: string
    ok: boolean
    detail?: string
  }

  /** How strong is the evidence behind a verdict? Ordered weakest → strongest. */
  export enum Strength {
    /** No gate could run (fail-open). The result is NOT proven. */
    UNVERIFIABLE = "unverifiable",
    /** `dbt build` exited 0 with no errors: it compiles, value-correctness unknown. */
    BUILD = "build",
    /** dbt schema/unit tests passed: asserted invariants hold. */
    DBT_TEST = "dbt_test",
    /** Proven semantically equivalent to a reference by the equivalence engine. */
    EQUIVALENCE = "equivalence",
  }

  /** What did the gate conclude? Distinct from {@link Strength} (how it was judged). */
  export enum Decision {
    OK = "ok",
    /** The equivalence engine found a MATERIAL difference vs the reference. */
    PROVEN_DIFFERENT = "proven_different",
    /** The engine could not decide (validation errors / unsupported syntax / no reference). */
    UNDECIDABLE = "undecidable",
    /** A build/test gate failed. */
    FAILED = "failed",
  }

  export interface Verdict {
    ok: boolean
    /** True when verification could not actually run; `ok` is then a fail-open pass. */
    unverifiable?: boolean
    /** Evidence strength (optional for back-compat). */
    strength?: Strength
    /** Gate conclusion (optional for back-compat); drives escalation in the router. */
    decision?: Decision
    /** Engine confidence in [0,1] when available (equivalence). */
    confidence?: number
    /** Human/agent-readable reason when not ok (fed to the next tier on escalation). */
    reason?: string
    checks: Check[]
    /** Raw evidence excerpt (for the verdict envelope / audit). */
    evidence?: string
  }

  /** One model's equivalence result. */
  export interface EquivalenceResult {
    equivalent: boolean
    /** Non-empty ⇒ the engine could not decide (undecidable), NOT "different". */
    validation_errors?: string[]
    /** Material differences when decidably non-equivalent. */
    differences?: { severity?: string; description?: string }[]
    confidence?: number
  }

  /** dbt's "Done. PASS=.. WARN=.. ERROR=.. SKIP=.. TOTAL=.." summary. */
  export interface DbtSummary {
    pass: number
    warn: number
    error: number
    skip: number
    total: number
  }

  /** Result of running a verification command (injected). */
  export interface RunResult {
    output: string
    exitCode: number
  }

  /** Pluggable judgment. Default = dbtVerifier; a custom verifier can be injected. */
  export interface Impl {
    verify(workdir: string): Verdict | Promise<Verdict>
  }

  /**
   * Parse the dbt run summary line.
   *
   * Takes the LAST match rather than the first, so summary-looking text echoed earlier in
   * the log cannot shadow the real one printed at the end.
   *
   * @returns the parsed counters, or null when no summary is present.
   */
  export function parseDbtSummary(output: string): DbtSummary | null {
    const re = /PASS=(\d+)\s+WARN=(\d+)\s+ERROR=(\d+)\s+SKIP=(\d+)(?:\s+NO-OP=\d+)?\s+TOTAL=(\d+)/gi
    let last: RegExpExecArray | null = null
    for (let m = re.exec(output); m !== null; m = re.exec(output)) last = m
    if (!last) return null
    const [, pass, warn, error, skip, total] = last
    return {
      pass: Number(pass),
      warn: Number(warn),
      error: Number(error),
      skip: Number(skip),
      total: Number(total),
    }
  }

  /**
   * Extract the dbt nodes reported as failed, e.g.
   *   "Failure in test not_null_orders_id (models/schema.yml)"
   *   "Compilation Error in model stg_x (...)"
   *
   * @returns one not-ok check per distinct (resource type, node name), in log order.
   */
  export function failingNodes(output: string): Check[] {
    const re = /(?:Compilation Error|Failure|Error|Runtime Error) in (test|model|seed|snapshot|unit_test) ([\w.]+)/gi
    const seen = new Set<string>()
    const out: Check[] = []
    for (let m = re.exec(output); m !== null; m = re.exec(output)) {
      const key = `${m[1]}:${m[2]}`
      if (seen.has(key)) continue
      seen.add(key)
      out.push({ name: m[2], ok: false, detail: m[0] })
    }
    return out
  }

  /**
   * Build a Verdict from a `dbt build`/`dbt test` run.
   *
   * ok ⇔ the command exited 0 AND a summary was produced AND it had zero ERRORs. A
   * missing summary is NOT ok. Per-node failing checks are attached only to a not-ok
   * verdict, so a passing verdict never carries failing checks.
   */
  export function fromDbt(output: string, exitCode: number): Verdict {
    const s = parseDbtSummary(output)
    const failing = failingNodes(output)
    const ok = exitCode === 0 && !!s && s.error === 0

    let reason: string | undefined
    if (!ok) {
      if (!s) reason = "dbt build did not complete (no run summary found)"
      else if (s.error > 0)
        reason =
          `${s.error} dbt error(s); ${s.pass}/${s.total} passed` +
          (failing.length ? ` — failing: ${failing.map((f) => f.name).join(", ")}` : "")
      else if (exitCode !== 0) reason = `dbt exited ${exitCode}`
    }

    let checks: Check[]
    if (!ok && failing.length) {
      checks = failing
    } else if (s) {
      // Passing run (or failing run with no node-level detail): one summary check.
      // "Error in model …" text alone does not override exit code 0 + ERROR=0.
      checks = [{ name: "dbt build", ok, detail: `PASS=${s.pass} ERROR=${s.error} TOTAL=${s.total}` }]
    } else {
      checks = [{ name: "dbt build", ok: false, detail: "no summary" }]
    }

    return {
      ok,
      strength: Strength.BUILD,
      decision: ok ? Decision.OK : Decision.FAILED,
      reason,
      checks,
      evidence: output.slice(-800),
    }
  }

  /**
   * Fold per-model equivalence results into one Verdict.
   *
   *  - no results ⇒ UNDECIDABLE at UNVERIFIABLE strength;
   *  - any model decidably non-equivalent ⇒ PROVEN_DIFFERENT (outranks undecidable);
   *  - else any model with `validation_errors` ⇒ UNDECIDABLE;
   *  - else ⇒ OK at EQUIVALENCE strength.
   *
   * `ok` is true only in the all-equivalent case. `confidence` is the minimum reported
   * by any model and stays undefined when none reported one.
   */
  export function fromEquivalence(results: { model: string; result: EquivalenceResult }[]): Verdict {
    if (results.length === 0) {
      return {
        ok: false,
        strength: Strength.UNVERIFIABLE,
        decision: Decision.UNDECIDABLE,
        reason: "no models to compare (no reference resolved)",
        checks: [],
      }
    }

    const checks: Check[] = []
    const different: string[] = []
    let anyUndecidable = false
    let minConfidence: number | undefined

    for (const { model, result } of results) {
      if (typeof result.confidence === "number")
        minConfidence = minConfidence === undefined ? result.confidence : Math.min(minConfidence, result.confidence)

      const errors = result.validation_errors ?? []
      if (errors.length > 0) {
        anyUndecidable = true
        checks.push({ name: model, ok: false, detail: `undecidable: ${errors.join("; ")}` })
      } else if (!result.equivalent) {
        different.push(model)
        const diff = (result.differences ?? []).map((d) => d.description ?? d.severity ?? "diff").join("; ")
        checks.push({ name: model, ok: false, detail: `not equivalent: ${diff || "material difference"}` })
      } else {
        checks.push({ name: model, ok: true, detail: "equivalent" })
      }
    }

    if (different.length > 0) {
      return {
        ok: false,
        strength: Strength.EQUIVALENCE,
        decision: Decision.PROVEN_DIFFERENT,
        confidence: minConfidence,
        // Name only models PROVEN different; undecidable ones remain visible in `checks`.
        reason: `not equivalent to reference: ${different.join(", ")}`,
        checks,
      }
    }
    if (anyUndecidable) {
      return {
        ok: false,
        strength: Strength.BUILD, // equivalence couldn't decide; caller falls back to build/test
        decision: Decision.UNDECIDABLE,
        reason: "equivalence undecidable for some models — falling back to build/test",
        checks,
      }
    }
    return {
      ok: true,
      strength: Strength.EQUIVALENCE,
      decision: Decision.OK,
      confidence: minConfidence,
      checks,
    }
  }

  /** Default that passes everything (ungated) — used when no real verifier is configured. */
  export const ALLOW_ALL: Impl = {
    verify: () => ({ ok: true, strength: Strength.UNVERIFIABLE, decision: Decision.OK, unverifiable: true, checks: [] }),
  }

  /**
   * Default deterministic verifier: runs `dbt build` in the workspace and judges the
   * result via `fromDbt`.
   *
   * @param run injected command runner.
   * @returns an `Impl` whose `verify` never rejects: a runner error yields a fail-open
   *          verdict (`ok: true`) flagged `unverifiable`.
   */
  export function dbtVerifier(run: (cmd: string, workdir: string) => Promise<RunResult>): Impl {
    return {
      async verify(workdir: string): Promise<Verdict> {
        try {
          const r = await run("dbt build", workdir)
          return fromDbt(r.output, r.exitCode)
        } catch (e) {
          // Fail-open: can't verify → don't block, but mark unverifiable so it's not
          // mistaken for a real pass.
          return {
            ok: true,
            unverifiable: true,
            strength: Strength.UNVERIFIABLE,
            decision: Decision.UNDECIDABLE,
            reason: `verifier error: ${String(e)}`,
            checks: [],
            evidence: "verifier-error",
          }
        }
      },
    }
  }
}
}] }), + resolver([{ model: "m1", baseSql: "a", headSql: "b" }]), + fb.impl, + ) + const v = await impl.verify("/ws") + expect(v.decision).toBe(Decision.PROVEN_DIFFERENT) + expect(v.ok).toBe(false) + expect(fb.called()).toBe(false) + }) + + test("undecidable equivalence + fallback PASSES → OK at BUILD strength (ok⟺OK invariant holds)", async () => { + const fb = recordingFallback({ ok: true, strength: Strength.BUILD, decision: Decision.OK, checks: [{ name: "dbt build", ok: true }] }) + const impl = EquivalenceVerifier.create( + async () => ({ equivalent: false, validation_errors: ["unsupported: STRFTIME"] }), + resolver([{ model: "m1", baseSql: "a", headSql: "b" }]), + fb.impl, + ) + const v = await impl.verify("/ws") + expect(fb.called()).toBe(true) + expect(v.ok).toBe(true) + expect(v.decision).toBe(Decision.OK) // accepted; NOT silently UNDECIDABLE (ok⟺OK) + expect(v.strength).toBe(Strength.BUILD) // the "equivalence couldn't decide" fact lives here + expect(v.reason).toContain("undecidable") + }) + + test("undecidable equivalence + fallback FAILS → FAILED (must escalate, not be swallowed)", async () => { + const fb = recordingFallback({ ok: false, strength: Strength.BUILD, decision: Decision.FAILED, checks: [{ name: "dbt build", ok: false }] }) + const impl = EquivalenceVerifier.create( + async () => ({ equivalent: false, validation_errors: ["unsupported: STRFTIME"] }), + resolver([{ model: "m1", baseSql: "a", headSql: "b" }]), + fb.impl, + ) + const v = await impl.verify("/ws") + expect(fb.called()).toBe(true) + expect(v.ok).toBe(false) + expect(v.decision).toBe(Decision.FAILED) // a real build failure must surface as FAILED so the router escalates + }) + + test("greenfield (no reference) → uses fallback verifier directly", async () => { + const fb = recordingFallback({ ok: true, strength: Strength.BUILD, decision: Decision.OK, checks: [] }) + const impl = EquivalenceVerifier.create(async () => ({ equivalent: true }), resolver(null), fb.impl) + const v = await 
impl.verify("/ws") + expect(fb.called()).toBe(true) + expect(v.strength).toBe(Strength.BUILD) + }) + + test("equivalence engine throw on one model → undecidable (NOT 'different'), routes to fallback", async () => { + const fb = recordingFallback({ ok: true, strength: Strength.BUILD, decision: Decision.OK, checks: [] }) + const impl = EquivalenceVerifier.create( + async () => { throw new Error("napi panic") }, + resolver([{ model: "m1", baseSql: "a", headSql: "b" }]), + fb.impl, + ) + const v = await impl.verify("/ws") + // engine error ⇒ undecidable (NOT PROVEN_DIFFERENT) ⇒ fallback consulted, decision from fallback + expect(fb.called()).toBe(true) + expect(v.decision).not.toBe(Decision.PROVEN_DIFFERENT) + expect(v.decision).toBe(Decision.OK) // fallback passed → accepted at BUILD strength + expect(v.strength).toBe(Strength.BUILD) + }) + + test("resolver throw → degrade to fallback (fail-open, honest)", async () => { + const fb = recordingFallback({ ok: true, unverifiable: true, strength: Strength.UNVERIFIABLE, decision: Decision.UNDECIDABLE, checks: [] }) + const impl = EquivalenceVerifier.create( + async () => ({ equivalent: true }), + { resolve: async () => { throw new Error("git failed") } }, + fb.impl, + ) + const v = await impl.verify("/ws") + expect(fb.called()).toBe(true) + expect(v.strength).toBe(Strength.UNVERIFIABLE) + }) +}) diff --git a/packages/opencode/test/router/policy.e2e.test.ts b/packages/opencode/test/router/policy.e2e.test.ts new file mode 100644 index 000000000..a72f5cefe --- /dev/null +++ b/packages/opencode/test/router/policy.e2e.test.ts @@ -0,0 +1,109 @@ +import { afterAll, afterEach, beforeAll, describe, expect, test } from "bun:test" +import { Policy } from "../../src/router/policy" +import { Router } from "../../src/router/router" + +// REAL network: a live local HTTP server (Bun.serve) + the real (unreachable) api.altimate.ai. 
// Shared fixture: one local HTTP server whose reply is selected by `mode`, plus a log of
// every body POSTed to an ".../outcomes" path.
let server: ReturnType<typeof Bun.serve>
let base = ""
let mode = "good"
const outcomes: any[] = []

beforeAll(() => {
  server = Bun.serve({
    port: 0, // let the OS pick a free port
    async fetch(req) {
      const url = new URL(req.url)
      if (url.pathname.endsWith("/outcomes")) {
        outcomes.push(await req.json().catch(() => null))
        return new Response("{}", { status: 200 })
      }
      switch (mode) {
        case "good":
          return Response.json({ tiers: [{ model: "openrouter/acme/fast", label: "acme-fast" }, { model: "openrouter/acme/strong" }] })
        case "500":
          return new Response("upstream error", { status: 500 })
        case "malformed":
          return new Response("not json {{{", { status: 200 })
        case "empty":
          return Response.json({ tiers: [] })
        case "garbage":
          return Response.json({ tiers: [{ nope: 1 }, "str", null, { model: "" }, { model: 123 }] })
        case "bomb":
          return Response.json({ tiers: Array.from({ length: 1000 }, (_, i) => ({ model: `openrouter/x/m${i}` })) })
        case "injection":
          // NOTE(review): the label is an empty string here, so this case may not exercise
          // an injection payload at all — confirm the intended label against the
          // committed file.
          return Response.json({ tiers: [{ model: "openrouter/evil/m", label: "" }] })
        default:
          return new Response("", { status: 404 })
      }
    },
  })
  base = `http://localhost:${server.port}`
})
afterAll(() => server?.stop(true))
afterEach(() => {
  // Tests below set these; clear them so cases do not leak into each other.
  delete process.env["ALTIMATE_API_KEY"]
  delete process.env["ALTIMATE_ROUTER_LADDER"]
})

const STATIC0 = Router.DEFAULT_LADDER[0].label

describe("Policy × REAL network (no mocks)", () => {
  test("resolve() is static with no key (no network)", () => {
    expect(Policy.resolve().source).toBe("static")
  })

  test("good endpoint: fetches the customer ladder over real HTTP", async () => {
    mode = "good"
    const tiers = await Policy.altimate("k", base).tiers({ taskId: "t" })
    expect(tiers[0].label).toBe("acme-fast")
    expect(tiers[1].label).toBe("strong") // label derived from model
  })

  // NOTE(review): this case talks to a real external host, so its outcome depends on the
  // network environment the suite runs in.
  test("real UNREACHABLE endpoint (api.altimate.ai) → graceful fallback to static", async () => {
    const tiers = await Policy.altimate("k", "https://api.altimate.ai").tiers({})
    expect(tiers[0].label).toBe(STATIC0)
  }, 30_000)

  test("reportOutcome posts to a real server when keyed; best-effort (no throw) when unreachable", async () => {
    process.env["ALTIMATE_API_KEY"] = "k"
    await Policy.reportOutcome(
      { schemaVersion: "1", solved: true, solvedBy: "glm-5.1", tier: 1, unverifiable: false, attempts: [], checks: [], evidenceHash: "djb2:0", createdAt: "t" },
      base,
    )
    expect(outcomes.at(-1)?.solvedBy).toBe("glm-5.1")
    // unreachable host must not throw
    await Policy.reportOutcome(
      { schemaVersion: "1", solved: false, solvedBy: null, tier: null, unverifiable: false, attempts: [], checks: [], evidenceHash: "djb2:0", createdAt: "t" },
      "https://api.altimate.ai",
    )
  }, 30_000)
})

describe("Policy × REAL network — ADVERSARIAL endpoint responses", () => {
  // Each mode must end with the static ladder's first tier.
  const cases: [string, (t: Router.Tier[]) => void][] = [
    ["500", (t) => expect(t[0].label).toBe(STATIC0)],
    ["malformed", (t) => expect(t[0].label).toBe(STATIC0)],
    ["empty", (t) => expect(t[0].label).toBe(STATIC0)],
    ["garbage", (t) => expect(t[0].label).toBe(STATIC0)], // no valid model → fallback
  ]
  for (const [m, assert] of cases) {
    test(`'${m}' response → graceful fallback to static`, async () => {
      mode = m
      assert(await Policy.altimate("k", base).tiers({}))
    })
  }

  test("'bomb' (1000-tier cost bomb) → capped to MAX_TIERS", async () => {
    mode = "bomb"
    const tiers = await Policy.altimate("k", base).tiers({})
    expect(tiers.length).toBe(Policy.MAX_TIERS)
  })

  test("'injection' label → kept as inert string, does not crash; single tier", async () => {
    mode = "injection"
    const tiers = await Policy.altimate("k", base).tiers({})
    expect(tiers).toHaveLength(1)
    expect(tiers[0].model).toBe("openrouter/evil/m")
    expect(typeof tiers[0].label).toBe("string")
  })
})
import { afterEach, describe, expect, test } from "bun:test"
import { Policy } from "../../src/router/policy"
import { Router } from "../../src/router/router"
import type { Verdict } from "../../src/router/verdict"

afterEach(() => {
  // Env vars are process-wide; reset the ones these tests set.
  delete process.env["ALTIMATE_API_KEY"]
  delete process.env["ALTIMATE_API_URL"]
  delete process.env["ALTIMATE_ROUTER_LADDER"]
})

/**
 * Minimal `fetch` stand-in: records every call and answers with whatever `handler`
 * returns (only `ok` and `json()` are modelled).
 */
function fakeFetch(handler: (url: string, init: any) => { ok: boolean; json: () => any }) {
  const calls: { url: string; init: any }[] = []
  const fn = (async (url: string, init: any) => {
    calls.push({ url, init })
    const r = handler(url, init)
    return { ok: r.ok, json: async () => r.json() } as any
  }) as unknown as typeof fetch
  return Object.assign(fn, { calls })
}

describe("Policy.STATIC", () => {
  test("returns the calibrated default ladder", async () => {
    const tiers = await Policy.STATIC.tiers({})
    expect(tiers[0].label).toBe("deepseek-v4-flash")
    expect(Policy.STATIC.source).toBe("static")
  })
  test("honors the env ladder override", async () => {
    process.env["ALTIMATE_ROUTER_LADDER"] = "openrouter/x/y"
    expect((await Policy.STATIC.tiers({}))[0].label).toBe("y")
  })
})

describe("Policy.sanitizeTiers (defense against bad/compromised endpoint)", () => {
  test("keeps valid tiers + derives missing labels", () => {
    const t = Policy.sanitizeTiers([{ model: "p/a", label: "A" }, { model: "p/b" }])
    expect(t).toEqual([{ model: "p/a", label: "A" }, { model: "p/b", label: "b" }])
  })
  test("filters entries without a usable string model", () => {
    expect(Policy.sanitizeTiers([{ nope: 1 }, "str", null, { model: "" }, { model: 123 }, { model: "p/ok" }])).toEqual([
      { model: "p/ok", label: "ok" },
    ])
  })
  test("caps a cost-bomb ladder to MAX_TIERS", () => {
    const big = Array.from({ length: 1000 }, (_, i) => ({ model: `p/m${i}` }))
    expect(Policy.sanitizeTiers(big)!).toHaveLength(Policy.MAX_TIERS)
  })
  test("returns null for non-array / all-invalid (caller falls back to static)", () => {
    expect(Policy.sanitizeTiers(null)).toBeNull()
    expect(Policy.sanitizeTiers("nope")).toBeNull()
    expect(Policy.sanitizeTiers([{ nope: 1 }])).toBeNull()
  })
  test("rejects malformed model ids (no slash, whitespace, control chars, over-long)", () => {
    expect(Policy.sanitizeTiers([{ model: "noslash" }])).toBeNull()
    expect(Policy.sanitizeTiers([{ model: "p/ a" }])).toBeNull()
    // Control character written as an escape so it survives editors and copy/paste.
    // NOTE(review): BEL is a representative control character — confirm against the
    // committed fixture.
    expect(Policy.sanitizeTiers([{ model: "p/\x07x" }])).toBeNull()
    expect(Policy.sanitizeTiers([{ model: "p/" + "x".repeat(300) }])).toBeNull()
  })
  test("strips non-printable/ANSI from label (printed to terminal)", () => {
    // ESC (\x1b) and BEL (\x07) as explicit escapes; the printable "[31m" must remain.
    const t = Policy.sanitizeTiers([{ model: "p/evil", label: "ok\x1b[31mbad\x07" }])
    expect(t![0].label).toBe("ok[31mbad") // ESC + BEL stripped, printable kept
  })
})

describe("Policy.resolve", () => {
  test("static when no altimate key", () => {
    expect(Policy.resolve().source).toBe("static")
  })
  test("altimate (customer) policy when key present", () => {
    process.env["ALTIMATE_API_KEY"] = "sk-altimate-test"
    expect(Policy.resolve().source).toBe("altimate")
  })
})

describe("Policy.altimate (customer policy)", () => {
  test("fetches the per-context ladder with auth", async () => {
    const ff = fakeFetch(() => ({
      ok: true,
      json: () => ({ tiers: [{ model: "openrouter/acme/fast", label: "acme-fast" }] }),
    }))
    const p = Policy.altimate("sk-acme", "https://api.altimate.ai", ff)
    const tiers = await p.tiers({ taskId: "t1", projectType: "dbt" })
    expect(tiers[0].label).toBe("acme-fast")
    expect(ff.calls[0].url).toContain("/v1/router/policy")
    expect(ff.calls[0].init.headers.Authorization).toBe("Bearer sk-acme")
  })
  test("falls back to static ladder on non-ok response", async () => {
    const ff = fakeFetch(() => ({ ok: false, json: () => ({}) }))
    const tiers = await Policy.altimate("k", "https://api.altimate.ai", ff).tiers({})
    expect(tiers[0].label).toBe(Router.DEFAULT_LADDER[0].label)
  })
  test("falls back to static ladder when transport throws", async () => {
    const boom = (async () => {
      throw new Error("network down")
    }) as unknown as typeof fetch
    const tiers = await Policy.altimate("k", "https://api.altimate.ai", boom).tiers({})
    expect(tiers[0].label).toBe(Router.DEFAULT_LADDER[0].label)
  })
})

describe("Policy.reportOutcome", () => {
  const env: Verdict.Envelope = {
    schemaVersion: "1", solved: true, solvedBy: "glm-5.1", tier: 1, unverifiable: false, attempts: [], checks: [], evidenceHash: "djb2:0", createdAt: "2026-05-31T00:00:00Z",
  }
  test("no-op without a key", async () => {
    const ff = fakeFetch(() => ({ ok: true, json: () => ({}) }))
    await Policy.reportOutcome(env, "https://api.altimate.ai", ff)
    expect(ff.calls).toHaveLength(0)
  })
  test("posts the verdict envelope when a key is set", async () => {
    process.env["ALTIMATE_API_KEY"] = "sk-acme"
    const ff = fakeFetch(() => ({ ok: true, json: () => ({}) }))
    await Policy.reportOutcome(env, "https://api.altimate.ai", ff)
    expect(ff.calls[0].url).toContain("/v1/router/outcomes")
    expect(JSON.parse(ff.calls[0].init.body).solvedBy).toBe("glm-5.1")
  })
})
"select 1 as a" : "select 1 as b"]]), + schema: async () => ({ schema: true }), + ...over, +}) + +describe("ReferenceResolver", () => { + test("no base ref → null (greenfield, caller uses build verifier)", async () => { + const r = ReferenceResolver.create(deps({ baseRef: async () => null })) + expect(await r.resolve("/ws")).toBeNull() + }) + + test("base exists but nothing changed → [] (nothing to verify)", async () => { + const r = ReferenceResolver.create(deps({ changedModels: async () => [] })) + expect(await r.resolve("/ws")).toEqual([]) + }) + + test("changed model present on both sides → one pair with base/head compiled SQL + schema", async () => { + const r = ReferenceResolver.create(deps({})) + const pairs = await r.resolve("/ws") + expect(pairs).toHaveLength(1) + expect(pairs![0]).toMatchObject({ model: "m1", baseSql: "select 1 as b", headSql: "select 1 as a" }) + expect(pairs![0].schema).toEqual({ schema: true }) + }) + + test("model new on head (no base compiled) is skipped — not equivalence-checkable", async () => { + const r = ReferenceResolver.create( + deps({ + changedModels: async () => ["m1", "m_new"], + compiledSql: async (_w, ref) => + ref === "WORKING" + ? new Map([["m1", "select 1"], ["m_new", "select 2"]]) + : new Map([["m1", "select 1 old"]]), // m_new absent at base + }), + ) + const pairs = await r.resolve("/ws") + expect(pairs!.map((p) => p.model)).toEqual(["m1"]) // m_new dropped + }) +}) + +describe("ReferenceResolver.gitDbtDeps (orchestration, mocked exec)", () => { + const mkExec = (calls: string[][], outputs: Record) => + (async (cmd: string, args: string[]) => { + calls.push([cmd, ...args]) + return outputs[`${cmd} ${args[0]}`] ?? 
{ stdout: "", code: 0 } + }) as ReferenceResolver.Exec + + const baseOpts = (over: Partial = {}): ReferenceResolver.GitDbtOptions => ({ + readCompiled: async () => new Map([["m1", "select 1"]]), + buildSchema: async () => ({ schema: true }), + checkoutBase: async () => ({ dir: "/tmp/base", cleanup: async () => {} }), + ...over, + }) + + test("baseRef: HEAD present → sha; absent → null (greenfield)", async () => { + const d1 = ReferenceResolver.gitDbtDeps(mkExec([], { "git rev-parse": { stdout: "abc123\n", code: 0 } }), baseOpts()) + expect(await d1.baseRef("/ws")).toBe("abc123") + const d2 = ReferenceResolver.gitDbtDeps(mkExec([], { "git rev-parse": { stdout: "", code: 128 } }), baseOpts()) + expect(await d2.baseRef("/ws")).toBeNull() + }) + + test("changedModels: parses git diff to bare model names, filters non-.sql", async () => { + const d = ReferenceResolver.gitDbtDeps( + mkExec([], { "git diff": { stdout: "models/agg/m1.sql\nmodels/schema.yml\nmodels/dim/m2.sql\n", code: 0 } }), + baseOpts(), + ) + expect(await d.changedModels("/ws", "HEAD")).toEqual(["m1", "m2"]) + }) + + test("compiledSql WORKING → dbt compile in workdir then readCompiled", async () => { + const calls: string[][] = [] + const d = ReferenceResolver.gitDbtDeps(mkExec(calls, {}), baseOpts()) + const sql = await d.compiledSql("/ws", "WORKING") + expect(sql.get("m1")).toBe("select 1") + expect(calls.some((c) => c[0] === "dbt" && c[1] === "compile")).toBe(true) + }) + + test("compiledSql base → checkout, deps+compile in the checkout, cleanup always runs", async () => { + let cleaned = false + const d = ReferenceResolver.gitDbtDeps( + mkExec([], {}), + baseOpts({ checkoutBase: async () => ({ dir: "/tmp/base", cleanup: async () => { cleaned = true } }) }), + ) + await d.compiledSql("/ws", "abc123") + expect(cleaned).toBe(true) + }) +}) diff --git a/packages/opencode/test/router/router.e2e.test.ts b/packages/opencode/test/router/router.e2e.test.ts new file mode 100644 index 000000000..c66b7c038 --- 
/dev/null
+++ b/packages/opencode/test/router/router.e2e.test.ts
@@ -0,0 +1,118 @@
+import { afterAll, beforeAll, describe, expect, test } from "bun:test"
+import { mkdtempSync, writeFileSync, mkdirSync, rmSync } from "node:fs"
+import { tmpdir } from "node:os"
+import { join } from "node:path"
+import { Router } from "../../src/router/router"
+import { Verifier } from "../../src/router/verifier"
+import { Verdict } from "../../src/router/verdict"
+
+// REAL OpenRouter calls + REAL dbt. No mocks.
+const KEY = process.env["OPENROUTER_API_KEY"] || ""
+const IMG = process.env["E2E_IMG"] || "" // provide a docker image with dbt-duckdb; no default
+const OR = "https://openrouter.ai/api/v1"
+
+const dirs: string[] = []
+/** Creates a throwaway dbt+duckdb project in a temp dir (recorded in `dirs` for cleanup); `models` maps file name → SQL. */
+function project(models: Record<string, string>): string {
+  const dir = mkdtempSync(join(tmpdir(), "e2e-router-"))
+  dirs.push(dir)
+  writeFileSync(join(dir, "dbt_project.yml"), `name: e2e\nprofile: e2e\nversion: "1.0"\nflags:\n  send_anonymous_usage_stats: false\nmodels:\n  e2e:\n    +materialized: table\n`)
+  writeFileSync(join(dir, "profiles.yml"), `e2e:\n  target: dev\n  outputs:\n    dev:\n      type: duckdb\n      path: /proj/e2e.duckdb\n`)
+  mkdirSync(join(dir, "models"))
+  for (const [n, sql] of Object.entries(models)) writeFileSync(join(dir, "models", n), sql)
+  return dir
+}
+
+/** Judges `dir` with the dbt verifier, running its command inside the `IMG` container with `dir` mounted at /proj. */
+async function realVerify(dir: string): Promise<Verifier.Verdict> {
+  return Verifier.dbtVerifier((cmd, workdir) => {
+    const p = Bun.spawnSync(
+      ["docker", "run", "--rm", "-v", `${workdir}:/proj`, "-w", "/proj", IMG, "bash", "-lc", `${cmd} --profiles-dir /proj 2>&1`],
+      { stdout: "pipe", stderr: "pipe" },
+    )
+    return Promise.resolve({ output: (p.stdout?.toString() ?? "") + (p.stderr?.toString() ?? ""), exitCode: p.exitCode ?? 1 })
+  }).verify(dir)
+}
+
+/** Pulls the SQL out of a model reply: body of the first ``` fence (a language tag on the opening line is dropped), else the whole reply. */
+function extractSql(s: string): string {
+  const fenced = s.match(/```(?:sql\b|[\w+-]*\r?\n)?\s*([\s\S]*?)```/i)
+  return (fenced ? fenced[1] : s).trim()
+}
+
+// A real model call that writes the requested dbt model into the workspace.
+async function realRunAgent(model: string, note: string | undefined, dir: string, task: string, log: { model: string; note?: string }[]) {
+  log.push({ model, note })
+  const apiModel = model.replace(/^openrouter\//, "")
+  const res = await fetch(`${OR}/chat/completions`, {
+    method: "POST",
+    headers: { "Content-Type": "application/json", Authorization: `Bearer ${KEY}` },
+    body: JSON.stringify({
+      model: apiModel,
+      messages: [
+        { role: "system", content: "You are a dbt engineer. Output ONLY the SQL for the requested model in a ```sql code block. No prose, no schema.yml." },
+        { role: "user", content: task + (note ? `\n\nA PREVIOUS ATTEMPT FAILED VERIFICATION:\n${note}` : "") },
+      ],
+      max_tokens: 600,
+      temperature: 0,
+    }),
+  })
+  const content: unknown = res.ok ? ((await res.json()) as { choices?: { message?: { content?: unknown } }[] } | null)?.choices?.[0]?.message?.content : undefined
+  if (typeof content !== "string" || !content.trim()) throw new Error(`OpenRouter ${res.status}: no completion returned for ${apiModel}`) // never substitute canned SQL — that would fake a real-model pass
+  writeFileSync(join(dir, "models", "answer.sql"), extractSql(content))
+}
+
+beforeAll(() => {
+  if (!KEY) throw new Error("OPENROUTER_API_KEY required for router E2E")
+  if (!IMG) throw new Error("E2E_IMG not set — provide a docker image with dbt-duckdb")
+  if (Bun.spawnSync(["docker", "image", "inspect", IMG], { stdout: "ignore", stderr: "ignore" }).exitCode !== 0)
+    throw new Error(`image ${IMG} missing`)
+})
+afterAll(() => {
+  for (const d of dirs) try { Bun.spawnSync(["sudo", "-n", "rm", "-rf", d]); rmSync(d, { recursive: true, force: true }) } catch {} // best-effort; -n so sudo never blocks on a password prompt
+})
+
+describe("Router × REAL OpenRouter + REAL dbt (no mocks)", () => {
+  test("solves at the cheap tier → no escalation (1 real call)", async () => {
+    const dir = project({})
+    const log: { model: string; note?: string }[] = []
+    const task = "Create a dbt model named `answer` that selects the integer 42 aliased as `value`. Materialized as a table."
+    const result = await Router.route({
+      tiers: [{ model: "openrouter/deepseek/deepseek-v4-flash", label: "deepseek-v4-flash" }],
+      runAgent: (m, note) => realRunAgent(m, note, dir, task, log),
+      verify: () => realVerify(dir),
+    })
+    expect(result.solved).toBe(true)
+    expect(result.solvedBy?.label).toBe("deepseek-v4-flash")
+    expect(log).toHaveLength(1) // only the cheap tier ran
+    // verdict envelope from a real run
+    const env = Verdict.build(result, { now: "2026-05-31T00:00:00Z" })
+    expect(env.solved).toBe(true)
+    expect(env.tier).toBe(0)
+  }, 180_000)
+
+  test("ADVERSARIAL: unsatisfiable workspace → escalates through every real tier, caps, threads failure context", async () => {
+    // An unrelated, locked broken model makes verification fail no matter what the agent writes,
+    // forcing real escalation through both tiers. Tests real multi-model escalation + capping +
+    // that the exact failing node is handed to the next real model.
+    const dir = project({ "locked_broken.sql": "select notacolumn as x" })
+    const log: { model: string; note?: string }[] = []
+    const task = "Create a dbt model named `answer` selecting 1 as id."
+    const result = await Router.route({
+      tiers: [
+        { model: "openrouter/deepseek/deepseek-v4-flash", label: "deepseek-v4-flash" },
+        { model: "openrouter/z-ai/glm-5.1", label: "glm-5.1" },
+      ],
+      runAgent: (m, note) => realRunAgent(m, note, dir, task, log),
+      verify: () => realVerify(dir),
+    })
+    expect(result.solved).toBe(false) // genuinely unsolvable here
+    expect(result.attempts).toHaveLength(2) // escalated through BOTH real tiers
+    expect(log.map((l) => l.model)).toEqual([
+      "openrouter/deepseek/deepseek-v4-flash",
+      "openrouter/z-ai/glm-5.1",
+    ])
+    expect(log[0].note).toBeUndefined()
+    expect(log[1].note ?? "").toMatch(/locked_broken|did not pass/i) // real failing-check context threaded
+  }, 240_000)
+})
diff --git a/packages/opencode/test/router/router.test.ts b/packages/opencode/test/router/router.test.ts
new file mode 100644
index 000000000..91779c3d5
--- /dev/null
+++ b/packages/opencode/test/router/router.test.ts
@@ -0,0 +1,113 @@
+import { afterEach, beforeEach, describe, expect, test } from "bun:test"
+import { Router } from "../../src/router/router"
+import { Verifier } from "../../src/router/verifier"
+
+const OK: Verifier.Verdict = { ok: true, checks: [{ name: "dbt build", ok: true }] }
+const FAIL: Verifier.Verdict = {
+  ok: false,
+  reason: "1 dbt error(s) — failing: not_null_x",
+  checks: [{ name: "not_null_x", ok: false, detail: "Failure in test not_null_x" }],
+}
+
+// Scrub router env before AND after every test so a flag/ladder exported in the developer's shell cannot leak in.
+const clearRouterEnv = (): void => {
+  for (const k of ["ALTIMATE_ROUTER", "ALTIMATE_ROUTER_LADDER"]) delete process.env[k]
+}
+beforeEach(clearRouterEnv)
+afterEach(clearRouterEnv)
+
+describe("Router config", () => {
+  test("enabled reads the flag", () => {
+    expect(Router.enabled()).toBe(false)
+    process.env["ALTIMATE_ROUTER"] = "1"
+    expect(Router.enabled()).toBe(true)
+  })
+  test("default ladder is cheap→strong", () => {
+    expect(Router.DEFAULT_LADDER[0].label).toBe("deepseek-v4-flash")
+    expect(Router.DEFAULT_LADDER.at(-1)!.label).toBe("claude-opus-4.8")
+  })
+  test("ladder() honors env override", () => {
+    process.env["ALTIMATE_ROUTER_LADDER"] = "openrouter/a/m1, openrouter/b/m2"
+    const l = Router.ladder()
+    expect(l.map((t) => t.label)).toEqual(["m1", "m2"])
+  })
+})
+
+describe("Router.shouldEscalate", () => {
+  const tiers = Router.DEFAULT_LADDER
+  test("escalates on failure with tiers remaining", () => {
+    expect(Router.shouldEscalate(FAIL, 0, tiers)).toBe(true)
+  })
+  test("does not escalate on success", () => {
+    expect(Router.shouldEscalate(OK, 0, tiers)).toBe(false)
+  })
+  test("does not escalate past the top tier", () => {
+    expect(Router.shouldEscalate(FAIL, tiers.length - 1, tiers)).toBe(false)
+  })
+})
+
+describe("Router.escalationContext", () => {
+  test("names the failing checks + reason for the next tier", () => {
+    const ctx = Router.escalationContext({ model: "m", label: "deepseek-v4-flash" }, FAIL)
+    expect(ctx).toContain("deepseek-v4-flash")
+    expect(ctx).toContain("not_null_x")
+    expect(ctx).toContain("do not start over")
+  })
+})
+
+describe("Router.route", () => {
+  test("stops at tier 0 when it passes (no escalation)", async () => {
+    const models: string[] = []
+    const r = await Router.route({
+      tiers: Router.DEFAULT_LADDER,
+      runAgent: async (m) => void models.push(m),
+      verify: async () => OK,
+    })
+    expect(r.solved).toBe(true)
+    expect(r.solvedBy!.label).toBe("deepseek-v4-flash")
+    expect(models).toHaveLength(1) // only the cheap tier ran
+  })
+
+  test("escalates through tiers until one passes, threading failure context", async () => {
+    const calls: { model: string; note?: string }[] = []
+    let n = 0
+    const r = await Router.route({
+      tiers: Router.DEFAULT_LADDER,
+      runAgent: async (model, note) => void calls.push({ model, note }),
+      verify: async () => (++n >= 2 ? OK : FAIL), // tier0 fails, tier1 passes
+    })
+    expect(r.solved).toBe(true)
+    expect(r.solvedBy!.label).toBe("glm-5.1")
+    expect(calls).toHaveLength(2)
+    expect(calls[0].note).toBeUndefined()
+    expect(calls[1].note).toContain("not_null_x") // tier1 got the failure context
+  })
+
+  test("a thrown runAgent error becomes a failed attempt and escalates (does not abort)", async () => {
+    const calls: string[] = []
+    let n = 0
+    const r = await Router.route({
+      tiers: Router.DEFAULT_LADDER,
+      runAgent: async (m) => {
+        calls.push(m)
+        if (++n === 1) throw new Error("model API down") // tier 0 throws
+      },
+      verify: async () => OK, // tier 1 verifies ok
+    })
+    expect(r.solved).toBe(true)
+    expect(r.solvedBy!.label).toBe("glm-5.1") // escalated past the throwing tier
+    expect(calls).toHaveLength(2)
+    expect(r.attempts[0].verdict.ok).toBe(false)
+    expect(r.attempts[0].verdict.reason).toContain("tier error")
+  })
+
+  test("unsolved when every tier fails (records all attempts)", async () => {
+    const r = await Router.route({
+      tiers: Router.DEFAULT_LADDER,
+      runAgent: async () => {},
+      verify: async () => FAIL,
+    })
+    expect(r.solved).toBe(false)
+    expect(r.attempts).toHaveLength(Router.DEFAULT_LADDER.length)
+  })
+})
diff --git a/packages/opencode/test/router/verdict-strength.test.ts b/packages/opencode/test/router/verdict-strength.test.ts
new file mode 100644
index 000000000..c54977855
--- /dev/null
+++ b/packages/opencode/test/router/verdict-strength.test.ts
@@ -0,0 +1,115 @@
+import { describe, expect, test } from "bun:test"
+import { Verifier } from "../../src/router/verifier"
+import { Router } from "../../src/router/router"
+import { Verdict } from "../../src/router/verdict"
+
+const { Strength, Decision } = Verifier
+const PASS = "Done. PASS=5 WARN=0 ERROR=0 SKIP=0 TOTAL=5"
+const FAIL = "Failure in test not_null_x (models/schema.yml)\nDone. PASS=4 WARN=0 ERROR=1 SKIP=0 TOTAL=5"
+
+describe("fromDbt sets strength + decision", () => {
+  test("clean build → BUILD / OK", () => {
+    const v = Verifier.fromDbt(PASS, 0)
+    expect(v.ok).toBe(true)
+    expect(v.strength).toBe(Strength.BUILD)
+    expect(v.decision).toBe(Decision.OK)
+  })
+  test("failed build → BUILD / FAILED", () => {
+    const v = Verifier.fromDbt(FAIL, 1)
+    expect(v.ok).toBe(false)
+    expect(v.strength).toBe(Strength.BUILD)
+    expect(v.decision).toBe(Decision.FAILED)
+  })
+  // Regression: non-zero exit with a CLEAN summary (e.g. dbt killed/OOM mid-run) must
+  // still be FAILED with a reason set — the `else if (exitCode !== 0)` branch IS reachable.
+  test("non-zero exit + clean summary → FAILED with reason (not a silent pass)", () => {
+    const v = Verifier.fromDbt("Done. PASS=5 WARN=0 ERROR=0 SKIP=0 TOTAL=5", 5)
+    expect(v.ok).toBe(false)
+    expect(v.decision).toBe(Decision.FAILED)
+    expect(v.reason).toContain("exited 5")
+  })
+})
+
+describe("fromEquivalence folds per-model results soundly", () => {
+  test("all equivalent → OK at EQUIVALENCE strength", () => {
+    const v = Verifier.fromEquivalence([
+      { model: "a", result: { equivalent: true, confidence: 0.95 } },
+      { model: "b", result: { equivalent: true, confidence: 0.9 } },
+    ])
+    expect(v.ok).toBe(true)
+    expect(v.decision).toBe(Decision.OK)
+    expect(v.strength).toBe(Strength.EQUIVALENCE)
+    expect(v.confidence).toBe(0.9) // min across models
+  })
+
+  test("a material difference → PROVEN_DIFFERENT, not ok", () => {
+    const v = Verifier.fromEquivalence([
+      { model: "a", result: { equivalent: true } },
+      { model: "b", result: { equivalent: false, differences: [{ description: "extra row" }] } },
+    ])
+    expect(v.ok).toBe(false)
+    expect(v.decision).toBe(Decision.PROVEN_DIFFERENT)
+    expect(v.strength).toBe(Strength.EQUIVALENCE)
+    expect(v.reason).toContain("b")
+  })
+
+  test("validation errors → UNDECIDABLE (NOT different), drops to BUILD strength", () => {
+    const v = Verifier.fromEquivalence([
+      { model: "a", result: { equivalent: false, validation_errors: ["unsupported: QUALIFY"] } },
+    ])
+    expect(v.ok).toBe(false)
+    expect(v.decision).toBe(Decision.UNDECIDABLE)
+    expect(v.strength).toBe(Strength.BUILD)
+  })
+
+  test("proven-different outranks undecidable", () => {
+    const v = Verifier.fromEquivalence([
+      { model: "a", result: { equivalent: false, validation_errors: ["undecidable"] } },
+      { model: "b", result: { equivalent: false, differences: [{ severity: "Semantic" }] } },
+    ])
+    expect(v.decision).toBe(Decision.PROVEN_DIFFERENT)
+  })
+
+  test("no reference resolved → UNDECIDABLE / UNVERIFIABLE (never silent pass)", () => {
+    const v = Verifier.fromEquivalence([])
+    expect(v.ok).toBe(false)
+    expect(v.decision).toBe(Decision.UNDECIDABLE)
+    expect(v.strength).toBe(Strength.UNVERIFIABLE)
+  })
+})
+
+describe("Router.shouldEscalate is decision-aware", () => {
+  const tiers: Router.Tier[] = [{ model: "m1", label: "m1" }, { model: "m2", label: "m2" }]
+  const mk = (decision: Verifier.Decision): Verifier.Verdict => ({ ok: decision === Decision.OK, decision, checks: [] })
+
+  test("FAILED escalates", () => expect(Router.shouldEscalate(mk(Decision.FAILED), 0, tiers)).toBe(true))
+  test("PROVEN_DIFFERENT escalates", () => expect(Router.shouldEscalate(mk(Decision.PROVEN_DIFFERENT), 0, tiers)).toBe(true))
+  test("UNDECIDABLE does NOT escalate (fallback, not stronger model)", () =>
+    expect(Router.shouldEscalate(mk(Decision.UNDECIDABLE), 0, tiers)).toBe(false))
+  test("OK does NOT escalate", () => expect(Router.shouldEscalate(mk(Decision.OK), 0, tiers)).toBe(false))
+  test("never escalates past the last tier", () =>
+    expect(Router.shouldEscalate(mk(Decision.FAILED), 1, tiers)).toBe(false))
+  test("legacy verdict without decision falls back to !ok", () => {
+    expect(Router.shouldEscalate({ ok: false, checks: [] }, 0, tiers)).toBe(true)
+    expect(Router.shouldEscalate({ ok: true, checks: [] }, 0, tiers)).toBe(false)
+  })
+})
+
+describe("Verdict.Envelope carries strength + decision (v2)", () => {
+  test("schema version bumped to 2", () => expect(Verdict.SCHEMA_VERSION).toBe("2"))
+  test("envelope records the accepted result's strength + decision", () => {
+    const result: Router.RouteResult = {
+      solved: true,
+      solvedBy: { model: "m1", label: "m1" },
+      attempts: [
+        { tier: { model: "m1", label: "m1" }, verdict: Verifier.fromEquivalence([{ model: "x", result: { equivalent: true, confidence: 0.95 } }]) },
+      ],
+    }
+    const env = Verdict.build(result, { now: "2026-05-31T00:00:00Z" })
+    expect(env.schemaVersion).toBe("2")
+    expect(env.strength).toBe(Strength.EQUIVALENCE)
+    expect(env.decision).toBe(Decision.OK)
+    expect(env.attempts[0].strength).toBe(Strength.EQUIVALENCE)
+    expect(env.attempts[0].decision).toBe(Decision.OK)
+  })
+})
diff --git a/packages/opencode/test/router/verdict.test.ts b/packages/opencode/test/router/verdict.test.ts
new file mode 100644
index 000000000..cbab5d882
--- /dev/null
+++ b/packages/opencode/test/router/verdict.test.ts
@@ -0,0 +1,70 @@
+import { describe, expect, test } from "bun:test"
+import { Verdict } from "../../src/router/verdict"
+import type { Router } from "../../src/router/router"
+
+const NOW = "2026-05-31T05:00:00.000Z"
+
+const solvedResult: Router.RouteResult = {
+  solved: true,
+  solvedBy: { model: "openrouter/z-ai/glm-5.1", label: "glm-5.1" },
+  attempts: [
+    {
+      tier: { model: "openrouter/deepseek/deepseek-v4-flash", label: "deepseek-v4-flash" },
+      verdict: { ok: false, reason: "1 error — failing: not_null_x", checks: [{ name: "not_null_x", ok: false }] },
+    },
+    {
+      tier: { model: "openrouter/z-ai/glm-5.1", label: "glm-5.1" },
+      verdict: { ok: true, checks: [{ name: "dbt build", ok: true }], evidence: "PASS=12 ERROR=0" },
+    },
+  ],
+}
+
+const unsolvedResult: Router.RouteResult = {
+  solved: false,
+  attempts: [
+    {
+      tier: { model: "m", label: "deepseek-v4-flash" },
+      verdict: { ok: false, reason: "fail", checks: [{ name: "t1", ok: false }], evidence: "ERROR=2" },
+    },
+  ],
+}
+
+describe("Verdict.evidenceHash", () => {
+  test("deterministic + prefixed", () => {
+    expect(Verdict.evidenceHash("abc")).toBe(Verdict.evidenceHash("abc"))
+    expect(Verdict.evidenceHash("abc")).toMatch(/^djb2:[0-9a-f]{8}$/)
+    expect(Verdict.evidenceHash("abc")).not.toBe(Verdict.evidenceHash("abd"))
+  })
+})
+
+describe("Verdict.build", () => {
+  test("solved: records solving tier, index, and per-attempt history", () => {
+    const e = Verdict.build(solvedResult, { now: NOW })
+    expect(e.solved).toBe(true)
+    expect(e.solvedBy).toBe("glm-5.1")
+    expect(e.tier).toBe(1)
+    expect(e.attempts).toHaveLength(2)
+    expect(e.attempts[0]).toMatchObject({ model: "deepseek-v4-flash", ok: false, failing: ["not_null_x"] })
+    expect(e.checks[0].ok).toBe(true)
+    expect(e.createdAt).toBe(NOW)
+    expect(e.signature).toBeUndefined()
+  })
+
+  test("unsolved: solvedBy null, tier null", () => {
+    const e = Verdict.build(unsolvedResult, { now: NOW })
+    expect(e.solved).toBe(false)
+    expect(e.solvedBy).toBeNull()
+    expect(e.tier).toBeNull()
+    expect(e.attempts[0].failing).toEqual(["t1"])
+  })
+
+  test("applies an injected signer", () => {
+    const e = Verdict.build(solvedResult, { now: NOW, sign: (u) => "sig-" + u.evidenceHash })
+    expect(e.signature).toContain("sig-djb2:")
+  })
+
+  test("serialize round-trips", () => {
+    const e = Verdict.build(solvedResult, { now: NOW })
+    expect(JSON.parse(Verdict.serialize(e)).solvedBy).toBe("glm-5.1")
+  })
+})
diff --git a/packages/opencode/test/router/verifier.e2e.test.ts b/packages/opencode/test/router/verifier.e2e.test.ts
new file mode 100644
index 000000000..3a042b176
--- /dev/null
+++ b/packages/opencode/test/router/verifier.e2e.test.ts
@@ -0,0 +1,97 @@
+import { afterAll, beforeAll, describe, expect, test } from "bun:test"
+import { mkdtempSync, writeFileSync, mkdirSync, rmSync } from "node:fs"
+import { tmpdir } from "node:os"
+import { join } from "node:path"
+import { Verifier } from "../../src/router/verifier"
+
+// REAL dbt — no mocks. Runs `dbt build` inside a docker image that has dbt-duckdb.
+// Provide the image via E2E_IMG (no default — opt-in, infra-dependent test).
+const IMG = process.env["E2E_IMG"] || ""
+
+/** Real command runner: shells `dbt build` inside the image against a mounted project. */
+function dockerDbtRun(cmd: string, workdir: string): Promise<{ output: string; exitCode: number }> {
+  const p = Bun.spawnSync(
+    ["docker", "run", "--rm", "-v", `${workdir}:/proj`, "-w", "/proj", IMG, "bash", "-lc", `${cmd} --profiles-dir /proj 2>&1`],
+    { stdout: "pipe", stderr: "pipe" },
+  )
+  const output = (p.stdout?.toString() ?? "") + (p.stderr?.toString() ?? "")
+  return Promise.resolve({ output, exitCode: p.exitCode ?? 1 })
+}
+
+const dirs: string[] = []
+function project(models: Record<string, string>, schema?: string): string {
+  const dir = mkdtempSync(join(tmpdir(), "e2e-dbt-"))
+  dirs.push(dir)
+  writeFileSync(join(dir, "dbt_project.yml"), `name: e2e\nprofile: e2e\nversion: "1.0"\nflags:\n  send_anonymous_usage_stats: false\nmodels:\n  e2e:\n    +materialized: table\n`)
+  writeFileSync(join(dir, "profiles.yml"), `e2e:\n  target: dev\n  outputs:\n    dev:\n      type: duckdb\n      path: /proj/e2e.duckdb\n`)
+  mkdirSync(join(dir, "models"))
+  for (const [name, sql] of Object.entries(models)) writeFileSync(join(dir, "models", name), sql)
+  if (schema) writeFileSync(join(dir, "models", "schema.yml"), schema)
+  return dir
+}
+
+beforeAll(() => {
+  if (!IMG) throw new Error("E2E_IMG not set — provide a docker image with dbt-duckdb")
+  const ok = Bun.spawnSync(["docker", "image", "inspect", IMG], { stdout: "ignore", stderr: "ignore" })
+  if (ok.exitCode !== 0) throw new Error(`E2E image ${IMG} not present`)
+})
+afterAll(() => {
+  for (const d of dirs) try { Bun.spawnSync(["sudo", "-n", "rm", "-rf", d]); rmSync(d, { recursive: true, force: true }) } catch {} // best-effort; -n so sudo never blocks on a password prompt
+})
+
+describe("Verifier × REAL dbt (no mocks)", () => {
+  test("clean project builds → verdict ok", async () => {
+    const dir = project(
+      { "ok_model.sql": "select 1 as id" },
+      "version: 2\nmodels:\n  - name: ok_model\n    columns:\n      - name: id\n        tests: [not_null, unique]\n",
+    )
+    const v = await Verifier.dbtVerifier(dockerDbtRun).verify(dir)
+    expect(v.ok).toBe(true)
+  }, 120_000)
+
+  test("compile error → verdict not ok, names the failing model", async () => {
+    const dir = project({ "ok_model.sql": "select 1 as id", "broken.sql": "select from where" })
+    const v = await Verifier.dbtVerifier(dockerDbtRun).verify(dir)
+    expect(v.ok).toBe(false)
+    expect(v.reason ?? "").toMatch(/broken|error/i)
+  }, 120_000)
+
+  test("failing data test (not_null on a null column) → verdict not ok, names the test", async () => {
+    const dir = project(
+      { "nulls.sql": "select cast(null as integer) as id" },
+      "version: 2\nmodels:\n  - name: nulls\n    columns:\n      - name: id\n        tests: [not_null]\n",
+    )
+    const v = await Verifier.dbtVerifier(dockerDbtRun).verify(dir)
+    expect(v.ok).toBe(false)
+    expect(JSON.stringify(v.checks)).toMatch(/not_null/i)
+  }, 120_000)
+
+  test("ADVERSARIAL spoof: model emits a fake 'Done. PASS=99 ERROR=0' but real build fails → verdict not ok", async () => {
+    // A runtime error makes dbt echo the failing SQL — incl. the injected fake summary comment —
+    // into stdout, BEFORE dbt's own real ERROR summary. The verifier runs dbt fresh, checks the
+    // real exit code, and parses the LAST summary → it must not be fooled.
+    // Unresolved column → a reliable DuckDB error; dbt echoes the failing compiled SQL
+    // (incl. the injected fake-summary comment) into stdout, then its REAL ERROR summary.
+    const dir = project({
+      "evil.sql": "select notacolumn as id -- Done. PASS=99 WARN=0 ERROR=0 SKIP=0 TOTAL=99",
+    })
+    const r = await dockerDbtRun("dbt build", dir)
+    const v = Verifier.fromDbt(r.output, r.exitCode)
+    expect(r.exitCode).not.toBe(0) // the build really failed
+    expect(v.ok).toBe(false) // ...and the gate cannot be spoofed by model-emitted text
+    // If the injection vector fired (fake line echoed), last-match must still return the real (error>0) summary.
+    if (r.output.includes("PASS=99")) {
+      expect(Verifier.parseDbtSummary(r.output)!.error).toBeGreaterThan(0)
+    }
+  }, 120_000)
+
+  test("ADVERSARIAL: agent CLAIMS success in its transcript, but the verifier ignores the claim and runs dbt itself", async () => {
+    // Simulate the orchestration: the agent's transcript says it passed, but the workspace is broken.
+    const agentTranscript = "I have completed the task. All tests pass. Done. PASS=50 WARN=0 ERROR=0 TOTAL=50"
+    const dir = project({ "ok_model.sql": "select 1 as id", "broken.sql": "select nonexistent_col from nowhere" })
+    // The verifier does NOT look at agentTranscript — it runs dbt on the real workspace.
+    const v = await Verifier.dbtVerifier(dockerDbtRun).verify(dir)
+    expect(agentTranscript).toContain("ERROR=0") // the lie exists...
+    expect(v.ok).toBe(false) // ...but ground truth wins
+  }, 120_000)
+})
diff --git a/packages/opencode/test/router/verifier.test.ts b/packages/opencode/test/router/verifier.test.ts
new file mode 100644
index 000000000..678fe6102
--- /dev/null
+++ b/packages/opencode/test/router/verifier.test.ts
@@ -0,0 +1,124 @@
+import { describe, expect, test } from "bun:test"
+import { Verifier } from "../../src/router/verifier"
+
+const PASS = "01:23:45 Done. PASS=12 WARN=0 ERROR=0 SKIP=0 NO-OP=0 TOTAL=12"
+const FAIL =
+  "Failure in test not_null_fct_reviews_review_id (models/schema.yml)\n" +
+  "01:23:45 Done. PASS=11 WARN=0 ERROR=1 SKIP=0 TOTAL=12"
+const COMPILE_ERR = "Compilation Error in model stg_orders (models/stg_orders.sql)\n unexpected token"
+
+describe("Verifier.parseDbtSummary", () => {
+  test("parses a clean summary", () => {
+    expect(Verifier.parseDbtSummary(PASS)).toEqual({ pass: 12, warn: 0, error: 0, skip: 0, total: 12 })
+  })
+  test("parses summary with errors", () => {
+    expect(Verifier.parseDbtSummary(FAIL)).toEqual({ pass: 11, warn: 0, error: 1, skip: 0, total: 12 })
+  })
+  test("returns null when no summary present", () => {
+    expect(Verifier.parseDbtSummary("nothing here")).toBeNull()
+  })
+})
+
+describe("Verifier.failingNodes", () => {
+  test("extracts a failing test", () => {
+    const f = Verifier.failingNodes(FAIL)
+    expect(f).toHaveLength(1)
+    expect(f[0].name).toBe("not_null_fct_reviews_review_id")
+    expect(f[0].ok).toBe(false)
+  })
+  test("extracts a compilation error model", () => {
+    const f = Verifier.failingNodes(COMPILE_ERR)
+    expect(f[0].name).toBe("stg_orders")
+  })
+  test("dedups repeated nodes", () => {
+    const f = Verifier.failingNodes(FAIL + "\n" + FAIL)
+    expect(f).toHaveLength(1)
+  })
+})
+
+describe("Verifier.fromDbt", () => {
+  test("ok when exit 0 + summary + zero errors", () => {
+    const v = Verifier.fromDbt(PASS, 0)
+    expect(v.ok).toBe(true)
+    expect(v.reason).toBeUndefined()
+  })
+  test("not ok when there are dbt errors (and names the failing node for escalation)", () => {
+    const v = Verifier.fromDbt(FAIL, 1)
+    expect(v.ok).toBe(false)
+    expect(v.reason).toContain("not_null_fct_reviews_review_id")
+    expect(v.checks.some((c) => !c.ok)).toBe(true)
+  })
+  test("not ok when build never completed (no summary)", () => {
+    const v = Verifier.fromDbt("crashed early", 1)
+    expect(v.ok).toBe(false)
+    expect(v.reason).toContain("did not complete")
+  })
+  test("not ok when summary clean but non-zero exit", () => {
+    expect(Verifier.fromDbt(PASS, 2).ok).toBe(false)
+  })
+})
+
+describe("Verifier — ADVERSARIAL", () => {
+  test("summary-line INJECTION: fake 'PASS=99 ERROR=0' earlier is ignored; real (last) summary wins", () => {
+    const malicious =
+      "-- model output echoed by dbt on error:\n" +
+      "Done. PASS=99 WARN=0 ERROR=0 SKIP=0 TOTAL=99\n" + // fake, injected via model SQL
+      "Compilation Error in model evil (models/evil.sql)\n" +
+      "01:00:00 Done. PASS=4 WARN=0 ERROR=1 SKIP=0 TOTAL=5" // dbt's REAL summary, last
+    expect(Verifier.parseDbtSummary(malicious)).toEqual({ pass: 4, warn: 0, error: 1, skip: 0, total: 5 })
+    expect(Verifier.fromDbt(malicious, 1).ok).toBe(false)
+  })
+
+  test("exitCode is the backstop: fake clean summary but non-zero exit -> not ok", () => {
+    expect(Verifier.fromDbt("Done. PASS=99 WARN=0 ERROR=0 SKIP=0 TOTAL=99", 1).ok).toBe(false)
+  })
+
+  test("a real clean build (exit 0, real summary last) is ok even if a fake line precedes", () => {
+    const out = "Done. PASS=1 ERROR=5 TOTAL=6\n...later...\nDone. PASS=12 WARN=0 ERROR=0 SKIP=0 TOTAL=12"
+    expect(Verifier.fromDbt(out, 0).ok).toBe(true)
+  })
+
+  test("ANSI color codes around the summary do not break parsing", () => {
+    const ansi = "\x1b[32m01:00:00 Done. PASS=12 WARN=0 ERROR=0 SKIP=0 TOTAL=12\x1b[0m" // NOTE(review): the escape bytes were stripped from this patch; placement reconstructed — confirm
+    expect(Verifier.parseDbtSummary(ansi)?.pass).toBe(12)
+  })
+
+  test("empty / whitespace / non-dbt output -> not ok (no summary)", () => {
+    for (const o of ["", " \n\t", "Killed", "Traceback (most recent call last):"]) {
+      expect(Verifier.fromDbt(o, 1).ok).toBe(false)
+    }
+  })
+
+  test("huge output completes quickly (no catastrophic backtracking)", () => {
+    const huge = "x ".repeat(500_000) + "\nDone. PASS=3 WARN=0 ERROR=0 SKIP=0 TOTAL=3"
+    const t0 = Date.now()
+    expect(Verifier.fromDbt(huge, 0).ok).toBe(true)
+    expect(Date.now() - t0).toBeLessThan(2000)
+  })
+
+  test("multiple real summaries (incremental run + test run) -> last one is authoritative", () => {
+    const multi = "Done. PASS=8 ERROR=0 TOTAL=8\n...tests...\nDone. PASS=10 WARN=0 ERROR=2 SKIP=0 TOTAL=12"
+    expect(Verifier.fromDbt(multi, 1).ok).toBe(false)
+    expect(Verifier.parseDbtSummary(multi)?.error).toBe(2)
+  })
+})
+
+describe("Verifier impls", () => {
+  test("ALLOW_ALL passes", async () => {
+    expect((await Verifier.ALLOW_ALL.verify("/app")).ok).toBe(true)
+  })
+  test("dbtVerifier judges via injected runner", async () => {
+    const good = Verifier.dbtVerifier(async () => ({ output: PASS, exitCode: 0 }))
+    expect((await good.verify("/app")).ok).toBe(true)
+    const bad = Verifier.dbtVerifier(async () => ({ output: FAIL, exitCode: 1 }))
+    expect((await bad.verify("/app")).ok).toBe(false)
+  })
+  test("dbtVerifier fails open if the runner throws", async () => {
+    const boom = Verifier.dbtVerifier(async () => {
+      throw new Error("dbt missing")
+    })
+    const v = await boom.verify("/app")
+    expect(v.ok).toBe(true)
+    expect(v.reason).toContain("verifier error")
+  })
+})