From ef0c6fcafe6c70e01bcdb5b97205bc042d41f5d7 Mon Sep 17 00:00:00 2001 From: Laith Al-Saadoon <9553966+theagenticguy@users.noreply.github.com> Date: Fri, 29 May 2026 16:41:10 -0500 Subject: [PATCH] feat(cli): status surfaces retrieval mode (summaries / vectors / embedder) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit symbol_summaries empty and/or embeddings absent silently leaves `query` running BM25-only even though doctor reports embedder weights present — the user only discovers it by inspecting --json (field-report Issue 5). status now prints, after node/edge counts: summaries: <count, or - when degraded> vectors: populated | bm25-only | unknown embedder: <meta.embedderModelId, or none> Honesty corrections from review: `vectors: populated` is NOT `hybrid` — `query` also requires the active embedder to load AND its modelId to match meta.embedderModelId, so we report the embedder id from meta rather than implying hybrid will fire. summaries and embeddings are distinct tables; summaries-emptiness does not cause BM25-only (embeddings-emptiness does), so both are surfaced separately. - Adds ITemporalStore.countSymbolSummaries() (+ DuckDbStore impl: COUNT DISTINCT node_id, swallow→0 on missing table). Updated the inline ITemporalStore fake in interface.test.ts. - runStatus gains a probeRetrieval test seam (defaults to opening the read-only store); degrades to summaries:- / vectors:unknown when the store can't open, so a meta-only repo still renders the rest of status. Verified on ngs-research-agent: summaries 0 / vectors bm25-only / embedder none. Tests: bm25-only, populated, and degraded paths. storage 161/161, cli 266/266, tsc + biome clean. 
--- packages/cli/src/commands/status.test.ts | 51 ++++++++++++++++++++++ packages/cli/src/commands/status.ts | 54 ++++++++++++++++++++++++ packages/storage/src/duckdb-adapter.ts | 19 +++++++++ packages/storage/src/interface.test.ts | 1 + packages/storage/src/interface.ts | 8 ++++ 5 files changed, 133 insertions(+) diff --git a/packages/cli/src/commands/status.test.ts b/packages/cli/src/commands/status.test.ts index d39daeb1..5c899c71 100644 --- a/packages/cli/src/commands/status.test.ts +++ b/packages/cli/src/commands/status.test.ts @@ -116,3 +116,54 @@ test("status surfaces every group the repo belongs to, alphabetical", async () = assert.match(groupsLine, /groups:\s+alpha, zeta$/); assert.doesNotMatch(groupsLine, /unrelated/); }); + +test("status reports bm25-only + summaries count from the retrieval probe", async () => { + const home = await scratch(); + const repoPath = await seedRepo(home, "bm25repo"); + const cap = captureStdout(); + try { + await runStatus(repoPath, { + home, + probeRetrieval: async () => ({ summaries: 0, vectors: "bm25-only" }), + }); + } finally { + cap.restore(); + } + assert.ok( + cap.lines.some((l) => /^summaries:\s+0$/.test(l)), + `expected 'summaries: 0'; got:\n${cap.lines.join("\n")}`, + ); + assert.ok(cap.lines.some((l) => /^vectors:\s+bm25-only$/.test(l))); +}); + +test("status reports populated vectors when the probe says so", async () => { + const home = await scratch(); + const repoPath = await seedRepo(home, "hybridrepo"); + const cap = captureStdout(); + try { + await runStatus(repoPath, { + home, + probeRetrieval: async () => ({ summaries: 42, vectors: "populated" }), + }); + } finally { + cap.restore(); + } + assert.ok(cap.lines.some((l) => /^summaries:\s+42$/.test(l))); + assert.ok(cap.lines.some((l) => /^vectors:\s+populated$/.test(l))); +}); + +test("status degrades to summaries:- / vectors:unknown when the store can't open", async () => { + const home = await scratch(); + const repoPath = await seedRepo(home, 
"degraded"); + const cap = captureStdout(); + try { + // Default probe: no graph.lbug exists in the seeded repo → undefined. + await runStatus(repoPath, { home, probeRetrieval: async () => undefined }); + } finally { + cap.restore(); + } + assert.ok(cap.lines.some((l) => /^summaries:\s+-$/.test(l))); + assert.ok(cap.lines.some((l) => /^vectors:\s+unknown$/.test(l))); + // The rest of status still renders (groups line present). + assert.ok(cap.lines.some((l) => l.startsWith("groups:"))); +}); diff --git a/packages/cli/src/commands/status.ts b/packages/cli/src/commands/status.ts index 1390d5a1..b15c903b 100644 --- a/packages/cli/src/commands/status.ts +++ b/packages/cli/src/commands/status.ts @@ -8,12 +8,48 @@ */ import { resolve } from "node:path"; +import { embeddingsPopulated } from "@opencodehub/search"; import { readStoreMeta } from "@opencodehub/storage"; import { listGroups } from "../groups.js"; import { readRegistry } from "../registry.js"; +import { openStoreForCommand } from "./open-store.js"; + +/** + * Retrieval-mode probe result for the status output. `summaries` is the count + * of distinct nodes with an LLM summary (dense-leg input); `vectors` reports + * whether the embeddings table is populated. Both are best-effort: a degraded + * or absent store yields `summaries: null`. + */ +export interface RetrievalState { + readonly summaries: number | null; + readonly vectors: "populated" | "bm25-only"; +} export interface StatusOptions { readonly home?: string; + /** + * Test seam: open a read-only store and return its retrieval state. Defaults + * to opening the real composed store. Tests inject a stub so they don't need + * a live graph.lbug on disk. 
+ */ + readonly probeRetrieval?: (repoPath: string) => Promise; +} + +async function defaultProbeRetrieval(repoPath: string): Promise { + let store: Awaited>["store"] | undefined; + try { + const opened = await openStoreForCommand({ repo: repoPath, readOnly: true }); + store = opened.store; + const summaries = await store.temporal.countSymbolSummaries(); + const populated = await embeddingsPopulated(store.graph); + return { summaries, vectors: populated ? "populated" : "bm25-only" }; + } catch { + // No index / degraded store / missing binding — caller degrades the + // output rather than failing the whole status command. + return undefined; + } finally { + await store?.close(); + } } export async function runStatus(path: string, opts: StatusOptions = {}): Promise { @@ -34,6 +70,24 @@ export async function runStatus(path: string, opts: StatusOptions = {}): Promise console.log(`lastCommit: ${meta.lastCommit ?? "-"}`); console.log(`nodes: ${meta.nodeCount}`); console.log(`edges: ${meta.edgeCount}`); + + // Retrieval mode. `query` runs BM25-only unless the embeddings table is + // populated AND the active embedder's modelId matches `meta.embedderModelId` + // — so report the embedder id from meta (no second probe) alongside the + // vector state, instead of implying hybrid will fire. Summaries are a + // distinct table (dense-leg context), not what gates BM25-vs-hybrid; we + // surface the count so an empty-summaries index is visible. + const probe = opts.probeRetrieval ?? defaultProbeRetrieval; + const retrieval = await probe(repoPath); + if (retrieval === undefined) { + console.log("summaries: -"); + console.log("vectors: unknown"); + } else { + console.log(`summaries: ${retrieval.summaries ?? "-"}`); + console.log(`vectors: ${retrieval.vectors}`); + } + console.log(`embedder: ${meta.embedderModelId ?? 
"none"}`); + if (registryHit === undefined) { console.log("registry: missing — run `codehub analyze` to re-register"); } else { diff --git a/packages/storage/src/duckdb-adapter.ts b/packages/storage/src/duckdb-adapter.ts index 522c0361..d319c7df 100644 --- a/packages/storage/src/duckdb-adapter.ts +++ b/packages/storage/src/duckdb-adapter.ts @@ -329,6 +329,25 @@ export class DuckDbStore implements ITemporalStore { } } + async countSymbolSummaries(): Promise { + try { + const c = this.requireConn(); + const stmt = await c.prepare("SELECT COUNT(DISTINCT node_id) AS n FROM symbol_summaries"); + try { + const reader = await stmt.runAndReadAll(); + const first = reader.getRowObjects()[0] as Record | undefined; + const n = first?.["n"]; + return typeof n === "bigint" ? Number(n) : typeof n === "number" ? n : 0; + } finally { + stmt.destroySync(); + } + } catch { + // Missing table / degraded store → report 0 rather than throwing, so + // `codehub status` degrades gracefully. + return 0; + } + } + // -------------------------------------------------------------------------- // exec — read-only SQL escape hatch (codehub query --sql, MCP sql tool) // -------------------------------------------------------------------------- diff --git a/packages/storage/src/interface.test.ts b/packages/storage/src/interface.test.ts index 0e19b09f..e3e42bda 100644 --- a/packages/storage/src/interface.test.ts +++ b/packages/storage/src/interface.test.ts @@ -122,6 +122,7 @@ test("ITemporalStore-shaped value lacks graph methods at runtime", () => { bulkLoadSymbolSummaries: async () => {}, lookupSymbolSummary: async () => undefined, lookupSymbolSummariesByNode: async () => [], + countSymbolSummaries: async () => 0, }; const bag = temporalOnly as unknown as Record; diff --git a/packages/storage/src/interface.ts b/packages/storage/src/interface.ts index b1371d07..3e377139 100644 --- a/packages/storage/src/interface.ts +++ b/packages/storage/src/interface.ts @@ -467,6 +467,14 @@ export interface 
ITemporalStore { * deterministically when more than one row per node is present. */ lookupSymbolSummariesByNode(nodeIds: readonly string[]): Promise; + /** + * Count distinct nodes that have at least one summary row. Used by + * `codehub status` to report whether LLM symbol summaries were generated + * for this index (they feed the dense-retrieval leg). Returns 0 — never + * throws — when the table is missing or the store is degraded, so status + * degrades gracefully. + */ + countSymbolSummaries(): Promise; } // ─────────────────────────────────────────────────────────────────────────────