diff --git a/crates/codegraph-core/src/domain/graph/builder/pipeline.rs b/crates/codegraph-core/src/domain/graph/builder/pipeline.rs index f90c25b64..434770f6d 100644 --- a/crates/codegraph-core/src/domain/graph/builder/pipeline.rs +++ b/crates/codegraph-core/src/domain/graph/builder/pipeline.rs @@ -242,6 +242,9 @@ fn resolve_pipeline_imports( let abs_file = Path::new(root_dir).join(rel_path); let abs_str = abs_file.to_str().unwrap_or("").replace('\\', "/"); for imp in &symbols.imports { + // Skip CJS require bindings — they feed imported_names for receiver-edge + // resolution but must not produce DB import edges (#1678). + if imp.cjs_require.unwrap_or(false) { continue; } batch_inputs.push(ImportResolutionInput { from_file: abs_str.clone(), import_source: imp.source.clone(), diff --git a/docs/architecture/decisions/002-dynamic-call-resolution.md b/docs/architecture/decisions/002-dynamic-call-resolution.md new file mode 100644 index 000000000..0e70c3422 --- /dev/null +++ b/docs/architecture/decisions/002-dynamic-call-resolution.md @@ -0,0 +1,129 @@ +# ADR-002: Dynamic Call Resolution — Taxonomy, Flagged-Edge Sink, and Static Resolution + +**Date:** 2026-06-21 +**Status:** Accepted +**Context:** Dynamic call sites (computed property access, eval, reflection) were silently dropped from the graph — invisible to every query, impact analysis, and dead-code detector. This ADR documents the architectural decisions for surfacing them. + +--- + +## Decision + +Dynamic call sites are **never silently dropped**. Every dynamic call in every language is either: + +- **Resolved (Track A):** statically-knowable targets (`obj["foo"]()`, `const m='foo'; obj[m]()`, closed dispatch tables, literal-name reflection) are resolved into real `calls` edges at full or penalized confidence. +- **Flagged (Track B):** undecidable calls (`eval`, `new Function()`, `obj[runtimeVar]()`, dynamic reflection) are emitted as **sink edges** — visible in the graph, queryable via `codegraph roles --dynamic`, but never polluting normal precision metrics. + +The boolean `Call.dynamic` is refined by a **`DynamicKind` taxonomy** that distinguishes resolvable from flag-only cases. Sink edges reuse `kind='calls'` with a new `dynamic_kind` DB column — not a new edge kind. + +--- + +## Context + +### The problem + +Codegraph documented this in `README.md` as a known limitation: + +> **Dynamic calls are best-effort** — complex computed property access and `eval` patterns are not resolved. + +The actual behavior was worse than "best-effort" — these call sites were silently dropped by `resolveCallTargets` when it found no target. The graph was missing entire call paths through dispatch tables, computed property access, and reflection APIs in all 34 supported languages. + +The limitation conflated two fundamentally different problems: + +- **Track A (resolvable):** `obj["foo"]()`, `const m='foo'; obj[m]()`, `{a:fnA}[k]()`, `getattr(obj,'foo')()`, `Method.invoke(getMethod("foo"))`. A static analyzer can determine the target set. +- **Track B (undecidable):** `eval(s)`, `new Function(s)`, `obj[runtimeVar]()`, `$obj->$m()` where `$m` is not a constant. No static analyzer can resolve these in general. + +The valuable improvements are distinct: **resolve Track A** targets; **detect and flag Track B** instead of dropping. + +### Scope + +The approved scope covers the full pipeline across all 34 supported languages: resolve knowable dynamic dispatch + flag the residue. Delivered as a foundation PR followed by per-language-family PRs. One PR per concern. + +--- + +## Trade-offs + +### Costs + +1. **Data model complexity.** `DynamicKind` replaces a boolean — every `call.dynamic` site in JS, Rust, and the FFI boundary must be updated or threaded through. + +2. **FFI serialization risk.** `Call` is serialized through `SerializedExtractorOutput` in `wasm-worker-{protocol,entry,pool}.ts`. Any new field not explicitly threaded is silently dropped at the Worker boundary — the primary parity divergence risk. + +3. **Parity surface.** Sink-edge emission is two separate code paths (JS `buildFileCallEdges` + Rust `build_call_edges` + a back-fill pass). Both must agree byte-for-byte on synthetic name, `confidence=0.0`, and `dynamic_kind`. + +4. **Scope creep risk (long tail).** 34 languages × multiple dynamic idioms per family. Mitigation: per-family PRs with a filed GitHub issue to split anything that exceeds a single concern. + +5. **Resolution over-approximation (RES-2).** Dispatch-table expansion can invent false-positive edges. Kept at penalized confidence and validated against the `pts-javascript` fixture (separate from `javascript`, which has a precision-1.0 floor). + +### Benefits + +1. **No invisible call sites.** Every dynamic call is represented in the graph. Dead-code detection, blast-radius analysis, and impact queries no longer have blind spots at dynamic dispatch boundaries. + +2. **Queryable diagnostics.** `codegraph roles --dynamic` lists all flagged calls grouped by kind — the "never silently dropped" guarantee made visible to users. + +3. **Correct dead-code classification.** Functions only reachable through flagged dynamic calls are not misclassified as dead. + +4. **Incremental resolution.** The Track A/B split means resolution logic lands in focused PRs without blocking the flag-only improvement, and recall floors can be raised incrementally per language family. + +--- + +## Key Design Decisions + +### DynamicKind taxonomy (not a boolean) + +```ts +export type DynamicKind = + | 'computed-literal' // obj["foo"]() — resolvable + | 'computed-key' // obj[k]() — resolvable iff k is a const literal, else flag + | 'reflection' // .call/.apply/.bind, getattr, Method.invoke, $obj->$m() + | 'eval' // eval(), new Function() — undecidable + | 'unresolved-dynamic'; // detected dynamic call we cannot resolve +``` + +`dynamic?: boolean` is kept to avoid churning every `call.dynamic ? 1 : 0` site. + +### Sink edges reuse `kind='calls'`, not a new edge kind + +A new `EdgeKind` would ripple through every edge-kind switch, role classifier, exporter, MCP tool, and the viewer — high blast radius. Instead: DB migration adds `dynamic_kind TEXT` column to `edges`; sink edges use `kind='calls'`, `dynamic=1`, `dynamic_kind=`, `confidence=0.0`. Confidence below `DEFAULT_MIN_CONFIDENCE=0.5` means they never pollute normal queries or exports but remain queryable when explicitly requested. + +Flagged calls use a synthetic non-matching name (``, ``, etc.). `resolveCallTargets` short-circuits names starting with `$m()`, `call_user_func` | +| **Phase 5** | Go + C/C++: method values, func-typed struct fields, function pointers, `dlsym` | +| **Phase 6** | Long tail (C#, Rust, Swift, ObjC, Elixir, Lua, Dart, …): per-language idioms; languages with no idiomatic dynamic dispatch noted explicitly | +| **RES-1** | Constant-string-key propagation: `const m='foo'; obj[m]()` → `obj.foo` | +| **RES-2** | Dispatch-table expansion: `{a:fnA,b:fnB}[k]()` → `{fnA,fnB}` at penalized confidence | +| **RES-3** | Literal-name reflection per family | +| **Docs** | Rewrite `README.md` limitation; update ROADMAP/BACKLOG; raise recall floors | + +--- + +## Alternatives Considered + +| Alternative | Why rejected | +|-------------|-------------| +| **Keep silently dropping** | Invisible call sites cause wrong dead-code classification and missing blast-radius paths — the tool's core value proposition is undermined | +| **New `dynamic_call` edge kind** | High blast radius through every edge-kind switch, role classifier, exporter, MCP tool, and viewer — not worth it when `kind='calls'` + `dynamic_kind` column achieves the same filtering at zero ripple cost | +| **Resolve everything with types** | Would require a type inference system (TypeScript compiler API, etc.) — dependency on an external heavy system, not in scope; the points-to solver is the right abstraction for the known cases | +| **Flag only, no resolution** | Track A cases (computed literals, dispatch tables) are statically knowable; leaving them flagged rather than resolved misses precision that real users benefit from | + +--- + +## Decision Outcome + +Dynamic call sites are never silently dropped. The `DynamicKind` taxonomy, `dynamic_kind` DB column, and sink-edge pattern are the canonical representation for this class of call. Resolution phases (RES-1/2/3) land on top of the detection foundation without changing the data model. Both WASM and native engines must produce identical sink edges, gated by `/parity` on every phase PR. + +The `javascript` fixture's precision-1.0 floor is the false-positive canary; `pts-javascript` is where dispatch-table expansion is tested under relaxed precision expectations. diff --git a/docs/architecture/decisions/003-interprocedural-dataflow.md b/docs/architecture/decisions/003-interprocedural-dataflow.md new file mode 100644 index 000000000..9a631e228 --- /dev/null +++ b/docs/architecture/decisions/003-interprocedural-dataflow.md @@ -0,0 +1,166 @@ +# ADR-003: Interprocedural Dataflow — Variable-Level Vertex Model + +**Date:** 2026-06-21 +**Status:** Accepted +**Context:** Codegraph's dataflow analysis was intraprocedural (single-function scope) and function-keyed. It could not answer "where does this user input end up?" across call boundaries. This ADR documents the architectural decisions for making it interprocedural with variable-level precision. + +--- + +## Decision + +Dataflow analysis is upgraded from **function-keyed** to **variable-level** by introducing **dataflow vertices** — addressable data locations (`param`, `local`, `return`, `receiver`) that belong to enclosing function nodes. Interprocedural stitching rides on the already-resolved `calls` edges rather than ambiguous name-based matching. + +Key structural decisions: + +1. **Dedicated `dataflow_vertices` table** — variable vertices are never added to `nodes`, to avoid polluting role classification, dead-code detection, fan-in/out, communities, and every graph analytic keyed off `nodes`. +2. **Function summaries** — a `dataflow_summary` table caches `param[i] →* return` intra-reachability per function, enabling interprocedural stitching without full callee inlining. +3. **Backward-compatible `dataflow_fn` view** — the existing function-level edge contract is preserved during migration; all current queries continue working unchanged. +4. **Extend to all 34 supported languages** — the 26 languages with no `DATAFLOW_RULES` today get extraction support. +5. **Decision Point DP-1 deferred to P6** — whether variable-level output becomes the default (replacing function-level) is decided at Phase 6, informed by actual P4 benchmark numbers. All P1–P5 work is additive and independently mergeable. + +--- + +## Context + +### The problem + +`codegraph dataflow` answered function-scoped questions: "what does *this function* pass/return/mutate?" The graph was keyed by functions; data passing *through* a helper, middleware chain, or factory was invisible. The `README.md` documented this as a known limitation: + +> **Intraprocedural (single-function scope), not interprocedural** — data flow across call boundaries is not tracked. + +The existing visitor already computed the raw material for variable-level summaries (`parameters`, `returns.referencedNames`, binding indices) — but three of the five fact types were thrown away by `insertDataflowEdges`/`collectNativeEdges`. This plan consumes facts that already exist; it is not starting from scratch. + +### Existing assets + +- **Parameter nodes are already first-class** (`kind: 'parameter'`, linked by `parameter_of` edge and `parent_id`). Variable-level `param` vertices link back to these existing nodes — no new node kind needed. +- **Call edges are already resolved** into the `edges` table (`kind: 'calls'`), via the 6-level import-aware resolver. Interprocedural stitching rides on these proven, high-precision edges — a precision upgrade over the current name-based `flows_to` resolution (top-10 by file/line, ambiguous). + +### Languages + +8 languages have dataflow rules today (JavaScript, TypeScript, TSX, Python, Go, Rust, Java, C#, PHP, Ruby). The other 26 have no `DATAFLOW_RULES` — `extractDataflow` returns empty. This plan extends coverage to all 34. + +--- + +## Trade-offs + +### Costs + +1. **Schema migration complexity.** Repointing `dataflow` FKs from `nodes` to `dataflow_vertices` is a potentially breaking change. Mitigated by the backward-compatible `dataflow_fn` view and by treating the DB as a derived cache (a rebuild is always acceptable — codegraph already prompts rebuild when dataflow is missing). + +2. **Parity surface grows significantly.** Variable-level facts are far more numerous than function-level. Ordering and deduplication must be deterministic across engines. The parity comparator (`scripts/parity-compare.mjs`) must be extended to diff `dataflow_vertices` + new edge columns (`scope`, `call_edge_id`). + +3. **Performance / DB size.** A variable graph can be 10–50× more edges than the function graph. Hard per-function vertex caps, `MAX_WALK_DEPTH`, and indexed `(func_id, kind)` lookups are mandatory before enabling by default. `bench-check` baseline required before/after each phase. + +4. **26 new language extractors.** Each requires `DATAFLOW_RULES` in TS + a `DataflowRules` static + `ParamStrategy` in Rust, with fixtures and parity gate. Functional languages (Haskell, OCaml, F#, Gleam, Elixir, Erlang, Clojure) need a `TailExpression` return strategy — no `return_node` exists in these grammars. Declarative languages (HCL/Terraform, Verilog) may yield low-value output; implementation vs explicit exclusion is decided per language during that batch. + +5. **Worker-protocol serialization seam.** Any new `ExtractorOutput` field (e.g. `dataflowVertices`) not added to `SerializedExtractorOutput` in `wasm-worker-{protocol,entry,pool}.ts` is silently dropped at the Worker thread boundary — the canonical parity divergence risk in this codebase. + +### Benefits + +1. **Cross-boundary taint tracking.** `dataflow path ` and `dataflow --impact` traverse a precise variable graph across function and file boundaries — the core ask for security audits and refactor impact. + +2. **Precision upgrade.** Stitching on resolved `calls` edges eliminates the ambiguous name-based `flows_to` matching (top-10 by proximity). Call-resolution precision directly bounds dataflow precision. + +3. **No limitation in README.** The "intraprocedural only" caveat is removed at P6, reflecting a genuinely resolved limitation. + +4. **Full language coverage.** Taint analysis works for all 34 languages, not just the 8 with rules today. + +--- + +## Target Architecture + +### Variable-level vertices + +| Vertex kind | Identity | Source | +|-------------|----------|--------| +| `param` | (func_id, param_index) | Reuse existing `parameter` nodes; set `node_id` link | +| `local` | (func_id, name, decl_line) | New — from `assignments`/`var_declarator` | +| `return` | (func_id) | New — one per function with a return value | +| `receiver` | (func_id) | New — `this`/`self` for mutation tracking (optional, Phase 3) | + +Vertices live in `dataflow_vertices(id, func_id, kind, name, param_index, line, node_id)` — not in `nodes`. + +### Edge taxonomy + +| Edge kind | Scope | Meaning | +|-----------|-------|---------| +| `def_use` | `intra` | `param→local`, `local→local`, `*→return` within one function | +| `arg_in` | `inter` | Caller arg-vertex → callee `param[j]` vertex at a resolved call site | +| `return_out` | `inter` | Callee `return` vertex → caller capture-vertex | +| `mutates` | `inter` | Callee mutates arg — propagated back to caller vertex | + +`call_edge_id` links each inter-edge to the `edges` row it was stitched from (precision provenance). + +### Stitching algorithm + +Post-pass after all per-file intra edges and summaries exist: + +``` +for each resolved call edge A --calls--> B: + for each argFlow at that call site (argIndex=j, sourceVertex=x): + emit dataflow(x → B.param[j], kind='arg_in', scope='inter') + if B.summary.param[j].flows_to_return: + emit dataflow(B.return → v, kind='return_out', scope='inter') + if B.summary.param[j].is_mutated: + emit dataflow(x → x, kind='mutates', scope='inter') +``` + +Where `v` is the caller's capture vertex for `B(...)`. + +### Backward-compatible view + +```sql +CREATE VIEW dataflow_fn AS + SELECT sv.func_id AS source_id, tv.func_id AS target_id, + d.kind, d.param_index, d.expression, d.line, d.confidence + FROM dataflow d + JOIN dataflow_vertices sv ON d.source_vertex = sv.id + JOIN dataflow_vertices tv ON d.target_vertex = tv.id + WHERE sv.func_id != tv.func_id; +``` + +Existing `dataflowData`/`dataflowPathData`/`dataflowImpactData` queries and MCP tool continue working during migration. + +--- + +## Delivery Sequence + +Each phase is independently shippable behind the existing `--dataflow` default. + +| Phase | Deliverable | Gate | +|-------|-------------|------| +| **P0** | Schema finalization, migration, parity-comparator extension, worker-protocol fields. Prototype on a single JS fixture end-to-end. | Spike branch (throwaway) | +| **P1** | `dataflow_vertices` + intra `def_use` edges + summaries for JS/TS/TSX. Both engines. `dataflow_fn` view. | Parity (JS) + existing dataflow tests pass | +| **P2** | `arg_in`/`return_out`/`mutates` inter stitching; cross-file; `--taint`; new variable-path queries. | Taint integration tests + parity | +| **P3** | Python, Go, Rust, Java, C#, PHP, Ruby variable model + stitch. | Per-lang parity + dataflow tests | +| **P4** | Cross-file re-stitch on incremental builds; perf caps + benchmarks. | `bench-check` baseline, watcher tests | +| **P5a–P5e** | New languages — B1 (C-family), B2 (JVM/mobile), B3 (scripting), B4 (functional), B5 (systems/DSL). | Per-batch parity + resolution-benchmark | +| **P6** | CLI/MCP polish, docs, README limitation removed. **Resolve DP-1** using P4 benchmark numbers. | Docs review + DP-1 recorded | + +--- + +## Decision Point DP-1 — Variable-Level Default vs Opt-In + +Whether variable-level output replaces function-level as the default is **deferred to Phase 6**, decided by the actual P4 benchmark numbers (build time, DB size, query latency on real repos). + +**Approval gate (binding):** if DP-1 resolves to replacing the default output shape (a breaking change), the implementing agent **must stop, surface the decision and rationale, and wait for explicit approval** before writing any breaking code. When approved, the breaking change goes in a dedicated, self-contained PR held until the user schedules it. All P1–P5 feature PRs must be independently mergeable and shippable without depending on the breaking PR. + +--- + +## Alternatives Considered + +| Alternative | Why rejected | +|-------------|-------------| +| **Add variable vertices to `nodes`** | Pollutes role classification, dead-code detection, fan-in/out, communities, complexity, and every graph analytic keyed off `nodes`. A dedicated table keeps the analytical layer clean | +| **Keep function-keyed, add cross-function BFS** | Ambiguous name matching (`flows_to` uses top-10 by proximity) gives low precision at call boundaries. Riding on resolved `calls` edges is architecturally superior and reuses the work already done by the 6-level import resolver | +| **Single-level interprocedural only** | Variable-level is required for precise taint paths — knowing that `param[2]` of function A reaches `param[0]` of helper B is what makes a taint report actionable. Function-level only tells you A calls B | +| **Full type inference (TypeScript compiler, etc.)** | External heavy dependency; not in scope. The visitor's existing `parameters`/`returns`/`argFlows` facts are sufficient for the IFDS-style summary approach | +| **WASM-only first, then native** | CLAUDE.md mandates identical output from both engines. Implementing TS first and deferring the Rust mirror creates a window where the tool is in a parity-broken state; mirror module-by-module per the mirrored-engine-layout convention instead | + +--- + +## Decision Outcome + +The variable-level vertex model with a dedicated `dataflow_vertices` table, function summaries, and stitching on resolved `calls` edges is the canonical architecture for interprocedural dataflow in codegraph. The backward-compatible `dataflow_fn` view ensures a non-breaking delivery path through P5. DP-1 (default-level choice) is the single breaking-change lever, deliberately deferred to P6 where benchmark data makes it an informed decision. + +All phases ship both WASM and native engine implementations, gated by `/parity`. The worker-protocol serialization seam is a required checklist item on every phase PR. diff --git a/src/domain/graph/builder/stages/native-orchestrator.ts b/src/domain/graph/builder/stages/native-orchestrator.ts index 0b5c3fbcb..10ec9f658 100644 --- a/src/domain/graph/builder/stages/native-orchestrator.ts +++ b/src/domain/graph/builder/stages/native-orchestrator.ts @@ -2039,9 +2039,7 @@ async function runPostNativePasses( debug( scopedFiles ? `Post-pass role re-classification complete (${scopedFiles.length} file(s))` - : needsFullReclassify - ? 'Post-pass role re-classification complete (full graph — full build)' - : 'Post-pass role re-classification complete (full graph — null-file endpoints)', + : 'Post-pass role re-classification complete (full graph)', ); } catch (err) { debug(`Post-pass role re-classification failed: ${toErrorMessage(err)}`); diff --git a/src/extractors/javascript.ts b/src/extractors/javascript.ts index 99d9a7a45..16cb50191 100644 --- a/src/extractors/javascript.ts +++ b/src/extractors/javascript.ts @@ -294,7 +294,6 @@ function dispatchQueryMatch( imports: Import[], classes: ClassRelation[], exps: Export[], - arrayElemBindings?: ArrayElemBinding[], ): void { if (c.fn_node) { handleFnCapture(c, definitions); @@ -324,9 +323,7 @@ function dispatchQueryMatch( if (cbDef) definitions.push(cbDef); calls.push(...extractCallbackReferenceCalls(c.callmem_node)); } else if (c.callsub_node) { - const callInfo = arrayElemBindings - ? extractCallInfoWithBindings(c.callsub_fn!, c.callsub_node, arrayElemBindings) - : extractCallInfo(c.callsub_fn!, c.callsub_node); + const callInfo = extractCallInfo(c.callsub_fn!, c.callsub_node); if (callInfo) calls.push(callInfo); calls.push(...extractCallbackReferenceCalls(c.callsub_node)); } else if (c.newfn_node) { @@ -378,7 +375,7 @@ function extractSymbolsQuery(tree: TreeSitterTree, query: TreeSitterQuery): Extr // Build capture lookup for this match (1-3 captures each, very fast) const c: Record = Object.create(null); for (const cap of match.captures) c[cap.name] = cap.node; - dispatchQueryMatch(c, definitions, calls, imports, classes, exps, arrayElemBindings); + dispatchQueryMatch(c, definitions, calls, imports, classes, exps); } // Extract top-level constants via targeted walk (query patterns don't cover these) @@ -1317,9 +1314,7 @@ function handleCallExpr(node: TreeSitterNode, ctx: ExtractorOutput): void { ctx.calls.push({ name: 'this', line: nodeStartLine(node) }); return; // no further processing needed for this()-style calls } - const callInfo = ctx.arrayElemBindings - ? extractCallInfoWithBindings(fn, node, ctx.arrayElemBindings) - : extractCallInfo(fn, node); + const callInfo = extractCallInfo(fn, node); if (callInfo) ctx.calls.push(callInfo); if (fn.type === 'member_expression') { const cbDef = extractCallbackDefinition(node, fn); @@ -3013,18 +3008,6 @@ function extractCallInfo(fn: TreeSitterNode, callNode: TreeSitterNode): Call | n return null; } -/** Like extractCallInfo but passes arrayElemBindings for dispatch-table detection. */ -function extractCallInfoWithBindings( - fn: TreeSitterNode, - callNode: TreeSitterNode, - arrayElemBindings: ArrayElemBinding[], -): Call | null { - if (fn.type === 'subscript_expression') { - return extractSubscriptCallInfo(fn, callNode, arrayElemBindings); - } - return extractCallInfo(fn, callNode); -} - /** Return the first non-punctuation argument node from a call_expression. */ function getFirstCallArg(callNode: TreeSitterNode): TreeSitterNode | null { const args = callNode.childForFieldName('arguments') || findChild(callNode, 'arguments'); @@ -3169,11 +3152,7 @@ function extractMemberExprCallInfo(fn: TreeSitterNode, callNode: TreeSitterNode) } /** Extract call info from a subscript_expression function node (obj[key]()). */ -function extractSubscriptCallInfo( - fn: TreeSitterNode, - callNode: TreeSitterNode, - arrayElemBindings?: ArrayElemBinding[], -): Call | null { +function extractSubscriptCallInfo(fn: TreeSitterNode, callNode: TreeSitterNode): Call | null { const obj = fn.childForFieldName('object'); const index = fn.childForFieldName('index'); if (!index) return null; @@ -3193,46 +3172,6 @@ function extractSubscriptCallInfo( } } - // RES-2: {a:fnA,b:fnB}[k]() — inline object literal dispatch table. - // Collect all identifier values as array elements under a synthetic name - // so the pts solver can resolve the wildcard call to each target. - // Also handles ({a:fn})[k]() where the object is wrapped in parentheses. - const objNode = - obj?.type === 'parenthesized_expression' - ? (obj.childForFieldName('expression') ?? obj.child(1) ?? obj) - : obj; - if (indexType === 'identifier' && objNode?.type === 'object' && arrayElemBindings) { - const line = nodeStartLine(callNode); - const col = callNode.startPosition.column; - const tableName = ``; - let idx = 0; - for (let i = 0; i < objNode.childCount; i++) { - const child = objNode.child(i); - if (!child) continue; - if (child.type === 'shorthand_property_identifier') { - if (!BUILTIN_GLOBALS.has(child.text)) { - arrayElemBindings.push({ arrayName: tableName, index: idx, elemName: child.text }); - idx++; - } - } else if (child.type === 'pair') { - const valN = child.childForFieldName('value'); - if (valN?.type === 'identifier' && !BUILTIN_GLOBALS.has(valN.text)) { - arrayElemBindings.push({ arrayName: tableName, index: idx, elemName: valN.text }); - idx++; - } - } - } - if (idx > 0) { - return { - name: `${tableName}[*]`, - line, - dynamic: true, - dynamicKind: 'dispatch-table', - keyExpr: index.text, - }; - } - } - // obj[variable]() — key is a variable; may be resolvable via pts (RES-1), else flagged if (indexType === 'identifier') { const receiver = extractReceiverName(obj); diff --git a/src/features/dataflow.ts b/src/features/dataflow.ts index 204d15201..e04894792 100644 --- a/src/features/dataflow.ts +++ b/src/features/dataflow.ts @@ -468,22 +468,12 @@ function buildDataflowVerticesAndEdges( // ── P2: interprocedural stitching ───────────────────────────────────────────── -/** - * Post-pass: connect arg-flow candidates to vertex-level inter-procedural edges. - * Runs after all per-file vertices + summaries have been committed. - * - * For each resolved argFlow (A calls B with arg x → B.param[j]): - * - Emits 'arg_in' inter edge: A's source vertex → B.param[j] vertex - * - If B's summary shows B.param[j] reaches B's return: emits 'return_out' - * inter edge: B.return → A's capture local (if any) - */ /** * Core stitch logic — must be called inside an already-open transaction. * - * Separated from buildInterproceduralStitch so callers that manage their own - * outer transaction (buildDataflowVerticesFromMap, buildDataflowEdges) can - * call this directly without starting a nested transaction, which better-sqlite3 - * does not support. + * All callers (buildDataflowVerticesFromMap, buildDataflowEdges, + * buildDataflowP4ForNative) manage their own outer transaction and call this + * directly to avoid nested transactions, which better-sqlite3 does not support. */ function runInterproceduralStitch( db: BetterSqlite3Database, @@ -592,28 +582,6 @@ function runInterproceduralStitch( return count; } -/** - * Wrap runInterproceduralStitch in its own transaction. - * - * Used only by buildDataflowP4ForNative, which calls the stitch as a - * standalone operation outside any other transaction boundary. All other - * call sites (buildDataflowVerticesFromMap, buildDataflowEdges) manage their - * own outer transaction and call runInterproceduralStitch directly to avoid - * nesting, which better-sqlite3 does not support. - */ -function buildInterproceduralStitch( - db: BetterSqlite3Database, - candidates: StitchCandidate[], - captures: ReturnCapture[], -): number { - if (candidates.length === 0) return 0; - let count = 0; - db.transaction(() => { - count = runInterproceduralStitch(db, candidates, captures); - })(); - return count; -} - // ── buildDataflowEdges ────────────────────────────────────────────────────── // ── P4 helpers ─────────────────────────────────────────────────────────────── @@ -639,7 +607,7 @@ export function collectFuncIdsForFiles( * and recreated, but the callers' files are never re-parsed — so their * arg_in edges (pointing to the old param vertices) are deleted and never * replaced. This function reads those caller files from disk and rebuilds - * the StitchCandidate list so buildInterproceduralStitch can reconnect them. + * the StitchCandidate list so runInterproceduralStitch can reconnect them. */ export async function collectCallerStitchCandidates( db: BetterSqlite3Database, @@ -929,19 +897,25 @@ export async function buildDataflowP4ForNative( ); if (candidates.length > 0) { - // Purge any existing arg_in edges targeting the changed callees from unchanged - // caller files. These edges were inserted by a previous P4 pass. Without this - // guard, repeated calls to buildDataflowP4ForNative insert duplicates because - // the dataflow table has no UNIQUE constraint on (source_vertex, target_vertex). - const placeholders = changedFuncIds.map(() => '?').join(','); - db.prepare( - `DELETE FROM dataflow - WHERE kind = 'arg_in' - AND target_id IN (${placeholders}) - AND source_id NOT IN (${placeholders})`, - ).run(...changedFuncIds, ...changedFuncIds); - - const count = buildInterproceduralStitch(db, candidates, captures); + let count = 0; + // Wrap the DELETE + stitch in a single transaction so that a crash between + // the purge and the re-insert cannot leave arg_in edges permanently removed + // but never replaced. + db.transaction(() => { + // Purge any existing arg_in edges targeting the changed callees from unchanged + // caller files. These edges were inserted by a previous P4 pass. Without this + // guard, repeated calls to buildDataflowP4ForNative insert duplicates because + // the dataflow table has no UNIQUE constraint on (source_vertex, target_vertex). + const placeholders = changedFuncIds.map(() => '?').join(','); + db.prepare( + `DELETE FROM dataflow + WHERE kind = 'arg_in' + AND target_id IN (${placeholders}) + AND source_id NOT IN (${placeholders})`, + ).run(...changedFuncIds, ...changedFuncIds); + + count = runInterproceduralStitch(db, candidates, captures); + })(); if (count > 0) { info(`Dataflow (native P4): ${count} inter-procedural edges re-stitched`); } diff --git a/src/types.ts b/src/types.ts index ad878ff04..c85f36f22 100644 --- a/src/types.ts +++ b/src/types.ts @@ -484,7 +484,6 @@ export interface LOCMetrics { export type DynamicKind = | 'computed-literal' // obj["foo"]() — resolvable; already emitted as normal edge | 'computed-key' // obj[k]() — potentially resolvable via pts; else flagged - | 'dispatch-table' // {a:fnA,b:fnB}[k]() — inline object literal subscript; resolved via pts wildcard | 'reflection' // .call/.apply/.bind / Reflect.* / callable-ref — resolved when target is in codebase; sink edge emitted if unresolved | 'eval' // eval() / new Function() — undecidable; always flagged | 'unresolved-dynamic'; // any other detected dynamic pattern; flagged diff --git a/tests/benchmarks/resolution/fixtures/pts-javascript/dispatch-table.js b/tests/benchmarks/resolution/fixtures/pts-javascript/dispatch-table.js deleted file mode 100644 index a4f4208b5..000000000 --- a/tests/benchmarks/resolution/fixtures/pts-javascript/dispatch-table.js +++ /dev/null @@ -1,7 +0,0 @@ -// pts-dispatch-table: closed object literal subscript dispatch {a:fnA,b:fnB}[k]() -function dtFn1() {} -function dtFn2() {} - -function runDispatch(key) { - ({ a: dtFn1, b: dtFn2 })[key](); -} diff --git a/tests/benchmarks/resolution/fixtures/pts-javascript/expected-edges.json b/tests/benchmarks/resolution/fixtures/pts-javascript/expected-edges.json index 36fcc3045..a198fe0f3 100644 --- a/tests/benchmarks/resolution/fixtures/pts-javascript/expected-edges.json +++ b/tests/benchmarks/resolution/fixtures/pts-javascript/expected-edges.json @@ -93,20 +93,6 @@ "kind": "calls", "mode": "pts-spread", "notes": "y() inside consumer2 — spread consumer2(...[sprFn3, sprFn4]) binds sprFn4 to y" - }, - { - "source": { "name": "runDispatch", "file": "dispatch-table.js" }, - "target": { "name": "dtFn1", "file": "dispatch-table.js" }, - "kind": "calls", - "mode": "pts-dispatch-table", - "notes": "({a:dtFn1,b:dtFn2})[key]() — inline dispatch table; pts resolves wildcard to each value" - }, - { - "source": { "name": "runDispatch", "file": "dispatch-table.js" }, - "target": { "name": "dtFn2", "file": "dispatch-table.js" }, - "kind": "calls", - "mode": "pts-dispatch-table", - "notes": "({a:dtFn1,b:dtFn2})[key]() — inline dispatch table; pts resolves wildcard to each value" } ] } diff --git a/tests/unit/points-to.test.ts b/tests/unit/points-to.test.ts index 3c435d61e..b9dac077d 100644 --- a/tests/unit/points-to.test.ts +++ b/tests/unit/points-to.test.ts @@ -417,40 +417,3 @@ describe('buildPointsToMap — object-rest parameter dispatch (Phase 8.3f)', () } }); }); - -describe('buildPointsToMap — dispatch-table pts constraints (RES-2)', () => { - it('seeds wildcard for synthetic dispatch-table array elem bindings', () => { - const defNames = new Set(['dtFn1', 'dtFn2']); - const arrayElemBindings = [ - { arrayName: '', index: 0, elemName: 'dtFn1' }, - { arrayName: '', index: 1, elemName: 'dtFn2' }, - ]; - const pts = buildPointsToMap([], defNames, NO_IMPORTS, undefined, undefined, arrayElemBindings); - expect(resolveViaPointsTo('[*]', pts)).toContain('dtFn1'); - expect(resolveViaPointsTo('[*]', pts)).toContain('dtFn2'); - }); - - it('resolves only identifier values, not string keys', () => { - const defNames = new Set(['fn1']); - const arrayElemBindings = [{ arrayName: '', index: 0, elemName: 'fn1' }]; - const pts = buildPointsToMap([], defNames, NO_IMPORTS, undefined, undefined, arrayElemBindings); - expect(resolveViaPointsTo('[*]', pts)).toEqual(['fn1']); - }); - - it('two dispatch tables in the same file use distinct synthetic names', () => { - const defNames = new Set(['fnA', 'fnB', 'fnC', 'fnD']); - const arrayElemBindings = [ - { arrayName: '', index: 0, elemName: 'fnA' }, - { arrayName: '', index: 1, elemName: 'fnB' }, - { arrayName: '', index: 0, elemName: 'fnC' }, - { arrayName: '', index: 1, elemName: 'fnD' }, - ]; - const pts = buildPointsToMap([], defNames, NO_IMPORTS, undefined, undefined, arrayElemBindings); - expect(resolveViaPointsTo('[*]', pts)).toContain('fnA'); - expect(resolveViaPointsTo('[*]', pts)).toContain('fnB'); - expect(resolveViaPointsTo('[*]', pts)).not.toContain('fnC'); - expect(resolveViaPointsTo('[*]', pts)).toContain('fnC'); - expect(resolveViaPointsTo('[*]', pts)).toContain('fnD'); - expect(resolveViaPointsTo('[*]', pts)).not.toContain('fnA'); - }); -});