diff --git a/crates/codegraph-core/src/domain/graph/builder/pipeline.rs b/crates/codegraph-core/src/domain/graph/builder/pipeline.rs index 196588d81..f90c25b64 100644 --- a/crates/codegraph-core/src/domain/graph/builder/pipeline.rs +++ b/crates/codegraph-core/src/domain/graph/builder/pipeline.rs @@ -1285,6 +1285,16 @@ fn collect_imported_names_for_file( let resolved_path = import_ctx.get_resolved(abs_str, &imp.source); for name in &imp.names { let clean_name = name.strip_prefix("* as ").unwrap_or(name).to_string(); + // CJS require bindings are included in imported_names so the receiver-edge + // resolver treats them as import artifacts (not locally-defined symbols). + // We use an empty target_file so the import-aware call-target lookup + // (`nodes_by_name_and_file.get(&(name, ""))`) always misses and falls + // through to the same-file shadow node — matching WASM call-resolution + // behaviour where CJS bindings are not in importedNamesMap (#1678). + if imp.cjs_require.unwrap_or(false) { + imported_names.push(ImportedName { name: clean_name, file: String::new() }); + continue; + } let mut target_file = resolved_path.clone(); if import_ctx.is_barrel_file(&resolved_path) { let mut visited = HashSet::new(); diff --git a/crates/codegraph-core/src/domain/graph/builder/stages/import_edges.rs b/crates/codegraph-core/src/domain/graph/builder/stages/import_edges.rs index 7471e8a16..fd09d4a2f 100644 --- a/crates/codegraph-core/src/domain/graph/builder/stages/import_edges.rs +++ b/crates/codegraph-core/src/domain/graph/builder/stages/import_edges.rs @@ -432,6 +432,9 @@ pub fn build_import_edges(conn: &Connection, ctx: &ImportEdgeContext) -> Vec, #[napi(js_name = "bashSource")] pub bash_source: Option, + /// Marks a CJS destructured require binding (`const { X } = require('./m')`). + /// When true, this entry feeds imported_names for receiver-edge resolution + /// but must NOT produce a DB import edge (mirrors WASM cjsRequireBindings, #1678). + #[napi(js_name = "cjsRequire")] + pub cjs_require: Option, } impl Import { @@ -176,6 +181,7 @@ impl Import { swift_import: None, scala_import: None, bash_source: None, + cjs_require: None, } } } diff --git a/docs/architecture/decisions/002-dynamic-call-resolution.md b/docs/architecture/decisions/002-dynamic-call-resolution.md deleted file mode 100644 index 0e70c3422..000000000 --- a/docs/architecture/decisions/002-dynamic-call-resolution.md +++ /dev/null @@ -1,129 +0,0 @@ -# ADR-002: Dynamic Call Resolution — Taxonomy, Flagged-Edge Sink, and Static Resolution - -**Date:** 2026-06-21 -**Status:** Accepted -**Context:** Dynamic call sites (computed property access, eval, reflection) were silently dropped from the graph — invisible to every query, impact analysis, and dead-code detector. This ADR documents the architectural decisions for surfacing them. - ---- - -## Decision - -Dynamic call sites are **never silently dropped**. Every dynamic call in every language is either: - -- **Resolved (Track A):** statically-knowable targets (`obj["foo"]()`, `const m='foo'; obj[m]()`, closed dispatch tables, literal-name reflection) are resolved into real `calls` edges at full or penalized confidence. -- **Flagged (Track B):** undecidable calls (`eval`, `new Function()`, `obj[runtimeVar]()`, dynamic reflection) are emitted as **sink edges** — visible in the graph, queryable via `codegraph roles --dynamic`, but never polluting normal precision metrics. - -The boolean `Call.dynamic` is refined by a **`DynamicKind` taxonomy** that distinguishes resolvable from flag-only cases. Sink edges reuse `kind='calls'` with a new `dynamic_kind` DB column — not a new edge kind. - ---- - -## Context - -### The problem - -Codegraph documented this in `README.md` as a known limitation: - -> **Dynamic calls are best-effort** — complex computed property access and `eval` patterns are not resolved. - -The actual behavior was worse than "best-effort" — these call sites were silently dropped by `resolveCallTargets` when it found no target. The graph was missing entire call paths through dispatch tables, computed property access, and reflection APIs in all 34 supported languages. - -The limitation conflated two fundamentally different problems: - -- **Track A (resolvable):** `obj["foo"]()`, `const m='foo'; obj[m]()`, `{a:fnA}[k]()`, `getattr(obj,'foo')()`, `Method.invoke(getMethod("foo"))`. A static analyzer can determine the target set. -- **Track B (undecidable):** `eval(s)`, `new Function(s)`, `obj[runtimeVar]()`, `$obj->$m()` where `$m` is not a constant. No static analyzer can resolve these in general. - -The valuable improvements are distinct: **resolve Track A** targets; **detect and flag Track B** instead of dropping. - -### Scope - -The approved scope covers the full pipeline across all 34 supported languages: resolve knowable dynamic dispatch + flag the residue. Delivered as a foundation PR followed by per-language-family PRs. One PR per concern. - ---- - -## Trade-offs - -### Costs - -1. **Data model complexity.** `DynamicKind` replaces a boolean — every `call.dynamic` site in JS, Rust, and the FFI boundary must be updated or threaded through. - -2. **FFI serialization risk.** `Call` is serialized through `SerializedExtractorOutput` in `wasm-worker-{protocol,entry,pool}.ts`. Any new field not explicitly threaded is silently dropped at the Worker boundary — the primary parity divergence risk. - -3. **Parity surface.** Sink-edge emission is two separate code paths (JS `buildFileCallEdges` + Rust `build_call_edges` + a back-fill pass). Both must agree byte-for-byte on synthetic name, `confidence=0.0`, and `dynamic_kind`. - -4. **Scope creep risk (long tail).** 34 languages × multiple dynamic idioms per family. Mitigation: per-family PRs with a filed GitHub issue to split anything that exceeds a single concern. - -5. **Resolution over-approximation (RES-2).** Dispatch-table expansion can invent false-positive edges. Kept at penalized confidence and validated against the `pts-javascript` fixture (separate from `javascript`, which has a precision-1.0 floor). - -### Benefits - -1. **No invisible call sites.** Every dynamic call is represented in the graph. Dead-code detection, blast-radius analysis, and impact queries no longer have blind spots at dynamic dispatch boundaries. - -2. **Queryable diagnostics.** `codegraph roles --dynamic` lists all flagged calls grouped by kind — the "never silently dropped" guarantee made visible to users. - -3. **Correct dead-code classification.** Functions only reachable through flagged dynamic calls are not misclassified as dead. - -4. **Incremental resolution.** The Track A/B split means resolution logic lands in focused PRs without blocking the flag-only improvement, and recall floors can be raised incrementally per language family. - ---- - -## Key Design Decisions - -### DynamicKind taxonomy (not a boolean) - -```ts -export type DynamicKind = - | 'computed-literal' // obj["foo"]() — resolvable - | 'computed-key' // obj[k]() — resolvable iff k is a const literal, else flag - | 'reflection' // .call/.apply/.bind, getattr, Method.invoke, $obj->$m() - | 'eval' // eval(), new Function() — undecidable - | 'unresolved-dynamic'; // detected dynamic call we cannot resolve -``` - -`dynamic?: boolean` is kept to avoid churning every `call.dynamic ? 1 : 0` site. - -### Sink edges reuse `kind='calls'`, not a new edge kind - -A new `EdgeKind` would ripple through every edge-kind switch, role classifier, exporter, MCP tool, and the viewer — high blast radius. Instead: DB migration adds `dynamic_kind TEXT` column to `edges`; sink edges use `kind='calls'`, `dynamic=1`, `dynamic_kind=`, `confidence=0.0`. Confidence below `DEFAULT_MIN_CONFIDENCE=0.5` means they never pollute normal queries or exports but remain queryable when explicitly requested. - -Flagged calls use a synthetic non-matching name (``, ``, etc.). `resolveCallTargets` short-circuits names starting with `$m()`, `call_user_func` | -| **Phase 5** | Go + C/C++: method values, func-typed struct fields, function pointers, `dlsym` | -| **Phase 6** | Long tail (C#, Rust, Swift, ObjC, Elixir, Lua, Dart, …): per-language idioms; languages with no idiomatic dynamic dispatch noted explicitly | -| **RES-1** | Constant-string-key propagation: `const m='foo'; obj[m]()` → `obj.foo` | -| **RES-2** | Dispatch-table expansion: `{a:fnA,b:fnB}[k]()` → `{fnA,fnB}` at penalized confidence | -| **RES-3** | Literal-name reflection per family | -| **Docs** | Rewrite `README.md` limitation; update ROADMAP/BACKLOG; raise recall floors | - ---- - -## Alternatives Considered - -| Alternative | Why rejected | -|-------------|-------------| -| **Keep silently dropping** | Invisible call sites cause wrong dead-code classification and missing blast-radius paths — the tool's core value proposition is undermined | -| **New `dynamic_call` edge kind** | High blast radius through every edge-kind switch, role classifier, exporter, MCP tool, and viewer — not worth it when `kind='calls'` + `dynamic_kind` column achieves the same filtering at zero ripple cost | -| **Resolve everything with types** | Would require a type inference system (TypeScript compiler API, etc.) — dependency on an external heavy system, not in scope; the points-to solver is the right abstraction for the known cases | -| **Flag only, no resolution** | Track A cases (computed literals, dispatch tables) are statically knowable; leaving them flagged rather than resolved misses precision that real users benefit from | - ---- - -## Decision Outcome - -Dynamic call sites are never silently dropped. The `DynamicKind` taxonomy, `dynamic_kind` DB column, and sink-edge pattern are the canonical representation for this class of call. Resolution phases (RES-1/2/3) land on top of the detection foundation without changing the data model. Both WASM and native engines must produce identical sink edges, gated by `/parity` on every phase PR. - -The `javascript` fixture's precision-1.0 floor is the false-positive canary; `pts-javascript` is where dispatch-table expansion is tested under relaxed precision expectations. diff --git a/docs/architecture/decisions/003-interprocedural-dataflow.md b/docs/architecture/decisions/003-interprocedural-dataflow.md deleted file mode 100644 index 9a631e228..000000000 --- a/docs/architecture/decisions/003-interprocedural-dataflow.md +++ /dev/null @@ -1,166 +0,0 @@ -# ADR-003: Interprocedural Dataflow — Variable-Level Vertex Model - -**Date:** 2026-06-21 -**Status:** Accepted -**Context:** Codegraph's dataflow analysis was intraprocedural (single-function scope) and function-keyed. It could not answer "where does this user input end up?" across call boundaries. This ADR documents the architectural decisions for making it interprocedural with variable-level precision. - ---- - -## Decision - -Dataflow analysis is upgraded from **function-keyed** to **variable-level** by introducing **dataflow vertices** — addressable data locations (`param`, `local`, `return`, `receiver`) that belong to enclosing function nodes. Interprocedural stitching rides on the already-resolved `calls` edges rather than ambiguous name-based matching. - -Key structural decisions: - -1. **Dedicated `dataflow_vertices` table** — variable vertices are never added to `nodes`, to avoid polluting role classification, dead-code detection, fan-in/out, communities, and every graph analytic keyed off `nodes`. -2. **Function summaries** — a `dataflow_summary` table caches `param[i] →* return` intra-reachability per function, enabling interprocedural stitching without full callee inlining. -3. **Backward-compatible `dataflow_fn` view** — the existing function-level edge contract is preserved during migration; all current queries continue working unchanged. -4. **Extend to all 34 supported languages** — the 26 languages with no `DATAFLOW_RULES` today get extraction support. -5. **Decision Point DP-1 deferred to P6** — whether variable-level output becomes the default (replacing function-level) is decided at Phase 6, informed by actual P4 benchmark numbers. All P1–P5 work is additive and independently mergeable. - ---- - -## Context - -### The problem - -`codegraph dataflow` answered function-scoped questions: "what does *this function* pass/return/mutate?" The graph was keyed by functions; data passing *through* a helper, middleware chain, or factory was invisible. The `README.md` documented this as a known limitation: - -> **Intraprocedural (single-function scope), not interprocedural** — data flow across call boundaries is not tracked. - -The existing visitor already computed the raw material for variable-level summaries (`parameters`, `returns.referencedNames`, binding indices) — but three of the five fact types were thrown away by `insertDataflowEdges`/`collectNativeEdges`. This plan consumes facts that already exist; it is not starting from scratch. - -### Existing assets - -- **Parameter nodes are already first-class** (`kind: 'parameter'`, linked by `parameter_of` edge and `parent_id`). Variable-level `param` vertices link back to these existing nodes — no new node kind needed. -- **Call edges are already resolved** into the `edges` table (`kind: 'calls'`), via the 6-level import-aware resolver. Interprocedural stitching rides on these proven, high-precision edges — a precision upgrade over the current name-based `flows_to` resolution (top-10 by file/line, ambiguous). - -### Languages - -8 languages have dataflow rules today (JavaScript, TypeScript, TSX, Python, Go, Rust, Java, C#, PHP, Ruby). The other 26 have no `DATAFLOW_RULES` — `extractDataflow` returns empty. This plan extends coverage to all 34. - ---- - -## Trade-offs - -### Costs - -1. **Schema migration complexity.** Repointing `dataflow` FKs from `nodes` to `dataflow_vertices` is a potentially breaking change. Mitigated by the backward-compatible `dataflow_fn` view and by treating the DB as a derived cache (a rebuild is always acceptable — codegraph already prompts rebuild when dataflow is missing). - -2. **Parity surface grows significantly.** Variable-level facts are far more numerous than function-level. Ordering and deduplication must be deterministic across engines. The parity comparator (`scripts/parity-compare.mjs`) must be extended to diff `dataflow_vertices` + new edge columns (`scope`, `call_edge_id`). - -3. **Performance / DB size.** A variable graph can be 10–50× more edges than the function graph. Hard per-function vertex caps, `MAX_WALK_DEPTH`, and indexed `(func_id, kind)` lookups are mandatory before enabling by default. `bench-check` baseline required before/after each phase. - -4. **26 new language extractors.** Each requires `DATAFLOW_RULES` in TS + a `DataflowRules` static + `ParamStrategy` in Rust, with fixtures and parity gate. Functional languages (Haskell, OCaml, F#, Gleam, Elixir, Erlang, Clojure) need a `TailExpression` return strategy — no `return_node` exists in these grammars. Declarative languages (HCL/Terraform, Verilog) may yield low-value output; implementation vs explicit exclusion is decided per language during that batch. - -5. **Worker-protocol serialization seam.** Any new `ExtractorOutput` field (e.g. `dataflowVertices`) not added to `SerializedExtractorOutput` in `wasm-worker-{protocol,entry,pool}.ts` is silently dropped at the Worker thread boundary — the canonical parity divergence risk in this codebase. - -### Benefits - -1. **Cross-boundary taint tracking.** `dataflow path ` and `dataflow --impact` traverse a precise variable graph across function and file boundaries — the core ask for security audits and refactor impact. - -2. **Precision upgrade.** Stitching on resolved `calls` edges eliminates the ambiguous name-based `flows_to` matching (top-10 by proximity). Call-resolution precision directly bounds dataflow precision. - -3. **No limitation in README.** The "intraprocedural only" caveat is removed at P6, reflecting a genuinely resolved limitation. - -4. **Full language coverage.** Taint analysis works for all 34 languages, not just the 8 with rules today. - ---- - -## Target Architecture - -### Variable-level vertices - -| Vertex kind | Identity | Source | -|-------------|----------|--------| -| `param` | (func_id, param_index) | Reuse existing `parameter` nodes; set `node_id` link | -| `local` | (func_id, name, decl_line) | New — from `assignments`/`var_declarator` | -| `return` | (func_id) | New — one per function with a return value | -| `receiver` | (func_id) | New — `this`/`self` for mutation tracking (optional, Phase 3) | - -Vertices live in `dataflow_vertices(id, func_id, kind, name, param_index, line, node_id)` — not in `nodes`. - -### Edge taxonomy - -| Edge kind | Scope | Meaning | -|-----------|-------|---------| -| `def_use` | `intra` | `param→local`, `local→local`, `*→return` within one function | -| `arg_in` | `inter` | Caller arg-vertex → callee `param[j]` vertex at a resolved call site | -| `return_out` | `inter` | Callee `return` vertex → caller capture-vertex | -| `mutates` | `inter` | Callee mutates arg — propagated back to caller vertex | - -`call_edge_id` links each inter-edge to the `edges` row it was stitched from (precision provenance). - -### Stitching algorithm - -Post-pass after all per-file intra edges and summaries exist: - -``` -for each resolved call edge A --calls--> B: - for each argFlow at that call site (argIndex=j, sourceVertex=x): - emit dataflow(x → B.param[j], kind='arg_in', scope='inter') - if B.summary.param[j].flows_to_return: - emit dataflow(B.return → v, kind='return_out', scope='inter') - if B.summary.param[j].is_mutated: - emit dataflow(x → x, kind='mutates', scope='inter') -``` - -Where `v` is the caller's capture vertex for `B(...)`. - -### Backward-compatible view - -```sql -CREATE VIEW dataflow_fn AS - SELECT sv.func_id AS source_id, tv.func_id AS target_id, - d.kind, d.param_index, d.expression, d.line, d.confidence - FROM dataflow d - JOIN dataflow_vertices sv ON d.source_vertex = sv.id - JOIN dataflow_vertices tv ON d.target_vertex = tv.id - WHERE sv.func_id != tv.func_id; -``` - -Existing `dataflowData`/`dataflowPathData`/`dataflowImpactData` queries and MCP tool continue working during migration. - ---- - -## Delivery Sequence - -Each phase is independently shippable behind the existing `--dataflow` default. - -| Phase | Deliverable | Gate | -|-------|-------------|------| -| **P0** | Schema finalization, migration, parity-comparator extension, worker-protocol fields. Prototype on a single JS fixture end-to-end. | Spike branch (throwaway) | -| **P1** | `dataflow_vertices` + intra `def_use` edges + summaries for JS/TS/TSX. Both engines. `dataflow_fn` view. | Parity (JS) + existing dataflow tests pass | -| **P2** | `arg_in`/`return_out`/`mutates` inter stitching; cross-file; `--taint`; new variable-path queries. | Taint integration tests + parity | -| **P3** | Python, Go, Rust, Java, C#, PHP, Ruby variable model + stitch. | Per-lang parity + dataflow tests | -| **P4** | Cross-file re-stitch on incremental builds; perf caps + benchmarks. | `bench-check` baseline, watcher tests | -| **P5a–P5e** | New languages — B1 (C-family), B2 (JVM/mobile), B3 (scripting), B4 (functional), B5 (systems/DSL). | Per-batch parity + resolution-benchmark | -| **P6** | CLI/MCP polish, docs, README limitation removed. **Resolve DP-1** using P4 benchmark numbers. | Docs review + DP-1 recorded | - ---- - -## Decision Point DP-1 — Variable-Level Default vs Opt-In - -Whether variable-level output replaces function-level as the default is **deferred to Phase 6**, decided by the actual P4 benchmark numbers (build time, DB size, query latency on real repos). - -**Approval gate (binding):** if DP-1 resolves to replacing the default output shape (a breaking change), the implementing agent **must stop, surface the decision and rationale, and wait for explicit approval** before writing any breaking code. When approved, the breaking change goes in a dedicated, self-contained PR held until the user schedules it. All P1–P5 feature PRs must be independently mergeable and shippable without depending on the breaking PR. - ---- - -## Alternatives Considered - -| Alternative | Why rejected | -|-------------|-------------| -| **Add variable vertices to `nodes`** | Pollutes role classification, dead-code detection, fan-in/out, communities, complexity, and every graph analytic keyed off `nodes`. A dedicated table keeps the analytical layer clean | -| **Keep function-keyed, add cross-function BFS** | Ambiguous name matching (`flows_to` uses top-10 by proximity) gives low precision at call boundaries. Riding on resolved `calls` edges is architecturally superior and reuses the work already done by the 6-level import resolver | -| **Single-level interprocedural only** | Variable-level is required for precise taint paths — knowing that `param[2]` of function A reaches `param[0]` of helper B is what makes a taint report actionable. Function-level only tells you A calls B | -| **Full type inference (TypeScript compiler, etc.)** | External heavy dependency; not in scope. The visitor's existing `parameters`/`returns`/`argFlows` facts are sufficient for the IFDS-style summary approach | -| **WASM-only first, then native** | CLAUDE.md mandates identical output from both engines. Implementing TS first and deferring the Rust mirror creates a window where the tool is in a parity-broken state; mirror module-by-module per the mirrored-engine-layout convention instead | - ---- - -## Decision Outcome - -The variable-level vertex model with a dedicated `dataflow_vertices` table, function summaries, and stitching on resolved `calls` edges is the canonical architecture for interprocedural dataflow in codegraph. The backward-compatible `dataflow_fn` view ensures a non-breaking delivery path through P5. DP-1 (default-level choice) is the single breaking-change lever, deliberately deferred to P6 where benchmark data makes it an informed decision. - -All phases ship both WASM and native engine implementations, gated by `/parity`. The worker-protocol serialization seam is a required checklist item on every phase PR. diff --git a/src/domain/graph/builder/stages/native-orchestrator.ts b/src/domain/graph/builder/stages/native-orchestrator.ts index 10ec9f658..0b5c3fbcb 100644 --- a/src/domain/graph/builder/stages/native-orchestrator.ts +++ b/src/domain/graph/builder/stages/native-orchestrator.ts @@ -2039,7 +2039,9 @@ async function runPostNativePasses( debug( scopedFiles ? `Post-pass role re-classification complete (${scopedFiles.length} file(s))` - : 'Post-pass role re-classification complete (full graph)', + : needsFullReclassify + ? 'Post-pass role re-classification complete (full graph — full build)' + : 'Post-pass role re-classification complete (full graph — null-file endpoints)', ); } catch (err) { debug(`Post-pass role re-classification failed: ${toErrorMessage(err)}`); diff --git a/src/domain/wasm-worker-entry.ts b/src/domain/wasm-worker-entry.ts index a4a27dbd2..d55f5f7e6 100644 --- a/src/domain/wasm-worker-entry.ts +++ b/src/domain/wasm-worker-entry.ts @@ -834,7 +834,9 @@ function serializeExtractorOutput( ? { returnTypeMap: Array.from(symbols.returnTypeMap.entries()) } : {}), ...(symbols.callAssignments?.length ? { callAssignments: symbols.callAssignments } : {}), - ...(symbols.cjsRequireBindings?.length ? { cjsRequireBindings: symbols.cjsRequireBindings } : {}), + ...(symbols.cjsRequireBindings?.length + ? { cjsRequireBindings: symbols.cjsRequireBindings } + : {}), }; } diff --git a/src/extractors/dart.ts b/src/extractors/dart.ts index a7a6e06f9..cd0642f46 100644 --- a/src/extractors/dart.ts +++ b/src/extractors/dart.ts @@ -264,7 +264,6 @@ function handleDartSelector(node: TreeSitterNode, ctx: ExtractorOutput): void { // the next holds argument_part (the call args) — method name is in the previous sibling const unconditional = findChild(node, 'unconditional_assignable_selector'); let methodName: string | null = null; - let receiverText: string | null = null; if (unconditional) { const id = findChild(unconditional, 'identifier'); @@ -282,8 +281,6 @@ function handleDartSelector(node: TreeSitterNode, ctx: ExtractorOutput): void { const id2 = findChild(unc2, 'identifier'); if (id2) methodName = id2.text; } - } else { - receiverText = sibling?.text ?? null; } } } diff --git a/src/extractors/javascript.ts b/src/extractors/javascript.ts index 16cb50191..99d9a7a45 100644 --- a/src/extractors/javascript.ts +++ b/src/extractors/javascript.ts @@ -294,6 +294,7 @@ function dispatchQueryMatch( imports: Import[], classes: ClassRelation[], exps: Export[], + arrayElemBindings?: ArrayElemBinding[], ): void { if (c.fn_node) { handleFnCapture(c, definitions); @@ -323,7 +324,9 @@ function dispatchQueryMatch( if (cbDef) definitions.push(cbDef); calls.push(...extractCallbackReferenceCalls(c.callmem_node)); } else if (c.callsub_node) { - const callInfo = extractCallInfo(c.callsub_fn!, c.callsub_node); + const callInfo = arrayElemBindings + ? extractCallInfoWithBindings(c.callsub_fn!, c.callsub_node, arrayElemBindings) + : extractCallInfo(c.callsub_fn!, c.callsub_node); if (callInfo) calls.push(callInfo); calls.push(...extractCallbackReferenceCalls(c.callsub_node)); } else if (c.newfn_node) { @@ -375,7 +378,7 @@ function extractSymbolsQuery(tree: TreeSitterTree, query: TreeSitterQuery): Extr // Build capture lookup for this match (1-3 captures each, very fast) const c: Record = Object.create(null); for (const cap of match.captures) c[cap.name] = cap.node; - dispatchQueryMatch(c, definitions, calls, imports, classes, exps); + dispatchQueryMatch(c, definitions, calls, imports, classes, exps, arrayElemBindings); } // Extract top-level constants via targeted walk (query patterns don't cover these) @@ -1314,7 +1317,9 @@ function handleCallExpr(node: TreeSitterNode, ctx: ExtractorOutput): void { ctx.calls.push({ name: 'this', line: nodeStartLine(node) }); return; // no further processing needed for this()-style calls } - const callInfo = extractCallInfo(fn, node); + const callInfo = ctx.arrayElemBindings + ? extractCallInfoWithBindings(fn, node, ctx.arrayElemBindings) + : extractCallInfo(fn, node); if (callInfo) ctx.calls.push(callInfo); if (fn.type === 'member_expression') { const cbDef = extractCallbackDefinition(node, fn); @@ -3008,6 +3013,18 @@ function extractCallInfo(fn: TreeSitterNode, callNode: TreeSitterNode): Call | n return null; } +/** Like extractCallInfo but passes arrayElemBindings for dispatch-table detection. */ +function extractCallInfoWithBindings( + fn: TreeSitterNode, + callNode: TreeSitterNode, + arrayElemBindings: ArrayElemBinding[], +): Call | null { + if (fn.type === 'subscript_expression') { + return extractSubscriptCallInfo(fn, callNode, arrayElemBindings); + } + return extractCallInfo(fn, callNode); +} + /** Return the first non-punctuation argument node from a call_expression. */ function getFirstCallArg(callNode: TreeSitterNode): TreeSitterNode | null { const args = callNode.childForFieldName('arguments') || findChild(callNode, 'arguments'); @@ -3152,7 +3169,11 @@ function extractMemberExprCallInfo(fn: TreeSitterNode, callNode: TreeSitterNode) } /** Extract call info from a subscript_expression function node (obj[key]()). */ -function extractSubscriptCallInfo(fn: TreeSitterNode, callNode: TreeSitterNode): Call | null { +function extractSubscriptCallInfo( + fn: TreeSitterNode, + callNode: TreeSitterNode, + arrayElemBindings?: ArrayElemBinding[], +): Call | null { const obj = fn.childForFieldName('object'); const index = fn.childForFieldName('index'); if (!index) return null; @@ -3172,6 +3193,46 @@ function extractSubscriptCallInfo(fn: TreeSitterNode, callNode: TreeSitterNode): } } + // RES-2: {a:fnA,b:fnB}[k]() — inline object literal dispatch table. + // Collect all identifier values as array elements under a synthetic name + // so the pts solver can resolve the wildcard call to each target. + // Also handles ({a:fn})[k]() where the object is wrapped in parentheses. + const objNode = + obj?.type === 'parenthesized_expression' + ? (obj.childForFieldName('expression') ?? obj.child(1) ?? obj) + : obj; + if (indexType === 'identifier' && objNode?.type === 'object' && arrayElemBindings) { + const line = nodeStartLine(callNode); + const col = callNode.startPosition.column; + const tableName = ``; + let idx = 0; + for (let i = 0; i < objNode.childCount; i++) { + const child = objNode.child(i); + if (!child) continue; + if (child.type === 'shorthand_property_identifier') { + if (!BUILTIN_GLOBALS.has(child.text)) { + arrayElemBindings.push({ arrayName: tableName, index: idx, elemName: child.text }); + idx++; + } + } else if (child.type === 'pair') { + const valN = child.childForFieldName('value'); + if (valN?.type === 'identifier' && !BUILTIN_GLOBALS.has(valN.text)) { + arrayElemBindings.push({ arrayName: tableName, index: idx, elemName: valN.text }); + idx++; + } + } + } + if (idx > 0) { + return { + name: `${tableName}[*]`, + line, + dynamic: true, + dynamicKind: 'dispatch-table', + keyExpr: index.text, + }; + } + } + // obj[variable]() — key is a variable; may be resolvable via pts (RES-1), else flagged if (indexType === 'identifier') { const receiver = extractReceiverName(obj); diff --git a/src/features/dataflow.ts b/src/features/dataflow.ts index e04894792..204d15201 100644 --- a/src/features/dataflow.ts +++ b/src/features/dataflow.ts @@ -468,12 +468,22 @@ function buildDataflowVerticesAndEdges( // ── P2: interprocedural stitching ───────────────────────────────────────────── +/** + * Post-pass: connect arg-flow candidates to vertex-level inter-procedural edges. + * Runs after all per-file vertices + summaries have been committed. + * + * For each resolved argFlow (A calls B with arg x → B.param[j]): + * - Emits 'arg_in' inter edge: A's source vertex → B.param[j] vertex + * - If B's summary shows B.param[j] reaches B's return: emits 'return_out' + * inter edge: B.return → A's capture local (if any) + */ /** * Core stitch logic — must be called inside an already-open transaction. * - * All callers (buildDataflowVerticesFromMap, buildDataflowEdges, - * buildDataflowP4ForNative) manage their own outer transaction and call this - * directly to avoid nested transactions, which better-sqlite3 does not support. + * Separated from buildInterproceduralStitch so callers that manage their own + * outer transaction (buildDataflowVerticesFromMap, buildDataflowEdges) can + * call this directly without starting a nested transaction, which better-sqlite3 + * does not support. */ function runInterproceduralStitch( db: BetterSqlite3Database, @@ -582,6 +592,28 @@ function runInterproceduralStitch( return count; } +/** + * Wrap runInterproceduralStitch in its own transaction. + * + * Used only by buildDataflowP4ForNative, which calls the stitch as a + * standalone operation outside any other transaction boundary. All other + * call sites (buildDataflowVerticesFromMap, buildDataflowEdges) manage their + * own outer transaction and call runInterproceduralStitch directly to avoid + * nesting, which better-sqlite3 does not support. + */ +function buildInterproceduralStitch( + db: BetterSqlite3Database, + candidates: StitchCandidate[], + captures: ReturnCapture[], +): number { + if (candidates.length === 0) return 0; + let count = 0; + db.transaction(() => { + count = runInterproceduralStitch(db, candidates, captures); + })(); + return count; +} + // ── buildDataflowEdges ────────────────────────────────────────────────────── // ── P4 helpers ─────────────────────────────────────────────────────────────── @@ -607,7 +639,7 @@ export function collectFuncIdsForFiles( * and recreated, but the callers' files are never re-parsed — so their * arg_in edges (pointing to the old param vertices) are deleted and never * replaced. This function reads those caller files from disk and rebuilds - * the StitchCandidate list so runInterproceduralStitch can reconnect them. + * the StitchCandidate list so buildInterproceduralStitch can reconnect them. */ export async function collectCallerStitchCandidates( db: BetterSqlite3Database, @@ -897,25 +929,19 @@ export async function buildDataflowP4ForNative( ); if (candidates.length > 0) { - let count = 0; - // Wrap the DELETE + stitch in a single transaction so that a crash between - // the purge and the re-insert cannot leave arg_in edges permanently removed - // but never replaced. - db.transaction(() => { - // Purge any existing arg_in edges targeting the changed callees from unchanged - // caller files. These edges were inserted by a previous P4 pass. Without this - // guard, repeated calls to buildDataflowP4ForNative insert duplicates because - // the dataflow table has no UNIQUE constraint on (source_vertex, target_vertex). - const placeholders = changedFuncIds.map(() => '?').join(','); - db.prepare( - `DELETE FROM dataflow - WHERE kind = 'arg_in' - AND target_id IN (${placeholders}) - AND source_id NOT IN (${placeholders})`, - ).run(...changedFuncIds, ...changedFuncIds); - - count = runInterproceduralStitch(db, candidates, captures); - })(); + // Purge any existing arg_in edges targeting the changed callees from unchanged + // caller files. These edges were inserted by a previous P4 pass. Without this + // guard, repeated calls to buildDataflowP4ForNative insert duplicates because + // the dataflow table has no UNIQUE constraint on (source_vertex, target_vertex). + const placeholders = changedFuncIds.map(() => '?').join(','); + db.prepare( + `DELETE FROM dataflow + WHERE kind = 'arg_in' + AND target_id IN (${placeholders}) + AND source_id NOT IN (${placeholders})`, + ).run(...changedFuncIds, ...changedFuncIds); + + const count = buildInterproceduralStitch(db, candidates, captures); if (count > 0) { info(`Dataflow (native P4): ${count} inter-procedural edges re-stitched`); } diff --git a/src/types.ts b/src/types.ts index c85f36f22..ad878ff04 100644 --- a/src/types.ts +++ b/src/types.ts @@ -484,6 +484,7 @@ export interface LOCMetrics { export type DynamicKind = | 'computed-literal' // obj["foo"]() — resolvable; already emitted as normal edge | 'computed-key' // obj[k]() — potentially resolvable via pts; else flagged + | 'dispatch-table' // {a:fnA,b:fnB}[k]() — inline object literal subscript; resolved via pts wildcard | 'reflection' // .call/.apply/.bind / Reflect.* / callable-ref — resolved when target is in codebase; sink edge emitted if unresolved | 'eval' // eval() / new Function() — undecidable; always flagged | 'unresolved-dynamic'; // any other detected dynamic pattern; flagged diff --git a/tests/benchmarks/resolution/fixtures/pts-javascript/dispatch-table.js b/tests/benchmarks/resolution/fixtures/pts-javascript/dispatch-table.js new file mode 100644 index 000000000..a4f4208b5 --- /dev/null +++ b/tests/benchmarks/resolution/fixtures/pts-javascript/dispatch-table.js @@ -0,0 +1,7 @@ +// pts-dispatch-table: closed object literal subscript dispatch {a:fnA,b:fnB}[k]() +function dtFn1() {} +function dtFn2() {} + +function runDispatch(key) { + ({ a: dtFn1, b: dtFn2 })[key](); +} diff --git a/tests/benchmarks/resolution/fixtures/pts-javascript/expected-edges.json b/tests/benchmarks/resolution/fixtures/pts-javascript/expected-edges.json index a198fe0f3..36fcc3045 100644 --- a/tests/benchmarks/resolution/fixtures/pts-javascript/expected-edges.json +++ b/tests/benchmarks/resolution/fixtures/pts-javascript/expected-edges.json @@ -93,6 +93,20 @@ "kind": "calls", "mode": "pts-spread", "notes": "y() inside consumer2 — spread consumer2(...[sprFn3, sprFn4]) binds sprFn4 to y" + }, + { + "source": { "name": "runDispatch", "file": "dispatch-table.js" }, + "target": { "name": "dtFn1", "file": "dispatch-table.js" }, + "kind": "calls", + "mode": "pts-dispatch-table", + "notes": "({a:dtFn1,b:dtFn2})[key]() — inline dispatch table; pts resolves wildcard to each value" + }, + { + "source": { "name": "runDispatch", "file": "dispatch-table.js" }, + "target": { "name": "dtFn2", "file": "dispatch-table.js" }, + "kind": "calls", + "mode": "pts-dispatch-table", + "notes": "({a:dtFn1,b:dtFn2})[key]() — inline dispatch table; pts resolves wildcard to each value" } ] } diff --git a/tests/parsers/dart.test.ts b/tests/parsers/dart.test.ts index 2d7557d0d..f0106a241 100644 --- a/tests/parsers/dart.test.ts +++ b/tests/parsers/dart.test.ts @@ -56,7 +56,11 @@ import 'package:flutter/material.dart';`); var r = Function.apply(callback, []); }`); expect(symbols.calls).toContainEqual( - expect.objectContaining({ name: '', dynamic: true, dynamicKind: 'unresolved-dynamic' }), + expect.objectContaining({ + name: '', + dynamic: true, + dynamicKind: 'unresolved-dynamic', + }), ); }); }); diff --git a/tests/parsers/objc.test.ts b/tests/parsers/objc.test.ts index dcd2b3cb4..221413268 100644 --- a/tests/parsers/objc.test.ts +++ b/tests/parsers/objc.test.ts @@ -96,7 +96,11 @@ describe('Objective-C parser', () => { [obj performSelector:@selector(greet)]; }`); expect(symbols.calls).toContainEqual( - expect.objectContaining({ name: '', dynamic: true, dynamicKind: 'unresolved-dynamic' }), + expect.objectContaining({ + name: '', + dynamic: true, + dynamicKind: 'unresolved-dynamic', + }), ); }); @@ -105,7 +109,11 @@ describe('Objective-C parser', () => { [obj performSelector:@selector(greet) withObject:arg]; }`); expect(symbols.calls).toContainEqual( - expect.objectContaining({ name: '', dynamic: true, dynamicKind: 'unresolved-dynamic' }), + expect.objectContaining({ + name: '', + dynamic: true, + dynamicKind: 'unresolved-dynamic', + }), ); }); @@ -114,7 +122,11 @@ describe('Objective-C parser', () => { objc_msgSend(obj, sel, arg); }`); expect(symbols.calls).toContainEqual( - expect.objectContaining({ name: '', dynamic: true, dynamicKind: 'unresolved-dynamic' }), + expect.objectContaining({ + name: '', + dynamic: true, + dynamicKind: 'unresolved-dynamic', + }), ); }); diff --git a/tests/unit/points-to.test.ts b/tests/unit/points-to.test.ts index b9dac077d..3c435d61e 100644 --- a/tests/unit/points-to.test.ts +++ b/tests/unit/points-to.test.ts @@ -417,3 +417,40 @@ describe('buildPointsToMap — object-rest parameter dispatch (Phase 8.3f)', () } }); }); + +describe('buildPointsToMap — dispatch-table pts constraints (RES-2)', () => { + it('seeds wildcard for synthetic dispatch-table array elem bindings', () => { + const defNames = new Set(['dtFn1', 'dtFn2']); + const arrayElemBindings = [ + { arrayName: '', index: 0, elemName: 'dtFn1' }, + { arrayName: '', index: 1, elemName: 'dtFn2' }, + ]; + const pts = buildPointsToMap([], defNames, NO_IMPORTS, undefined, undefined, arrayElemBindings); + expect(resolveViaPointsTo('[*]', pts)).toContain('dtFn1'); + expect(resolveViaPointsTo('[*]', pts)).toContain('dtFn2'); + }); + + it('resolves only identifier values, not string keys', () => { + const defNames = new Set(['fn1']); + const arrayElemBindings = [{ arrayName: '', index: 0, elemName: 'fn1' }]; + const pts = buildPointsToMap([], defNames, NO_IMPORTS, undefined, undefined, arrayElemBindings); + expect(resolveViaPointsTo('[*]', pts)).toEqual(['fn1']); + }); + + it('two dispatch tables in the same file use distinct synthetic names', () => { + const defNames = new Set(['fnA', 'fnB', 'fnC', 'fnD']); + const arrayElemBindings = [ + { arrayName: '', index: 0, elemName: 'fnA' }, + { arrayName: '', index: 1, elemName: 'fnB' }, + { arrayName: '', index: 0, elemName: 'fnC' }, + { arrayName: '', index: 1, elemName: 'fnD' }, + ]; + const pts = buildPointsToMap([], defNames, NO_IMPORTS, undefined, undefined, arrayElemBindings); + expect(resolveViaPointsTo('[*]', pts)).toContain('fnA'); + expect(resolveViaPointsTo('[*]', pts)).toContain('fnB'); + expect(resolveViaPointsTo('[*]', pts)).not.toContain('fnC'); + expect(resolveViaPointsTo('[*]', pts)).toContain('fnC'); + expect(resolveViaPointsTo('[*]', pts)).toContain('fnD'); + expect(resolveViaPointsTo('[*]', pts)).not.toContain('fnA'); + }); +});