From 3061e6a23f5035069f9a2c28534751a31cabd682 Mon Sep 17 00:00:00 2001 From: Laith Al-Saadoon <9553966+theagenticguy@users.noreply.github.com> Date: Fri, 29 May 2026 16:24:48 -0500 Subject: [PATCH] fix(ingestion): resolve Python multi-line + src-layout imports (cross-module edges) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs in Python import handling left cross-module CALLS/IMPORTS edges unresolved, so a multi-file Python package read as disconnected islands (field-report Issue 1, Bugs A + B). Bug A — extractPyImports (providers/python.ts) was line-based: a multi-line parenthesized import `from m import (\n a,\n b,\n)` matched the from-regex on the first line only with rest='(' → 0 names → the whole import was silently dropped. black/ruff wrap every long import list this way, so most real modules lost their imports. Fix: joinLogicalLines() collapses physical lines into logical ones across an open paren or a trailing backslash before matching. Bug B — resolveImportTarget (pipeline/phases/parse.ts) only handled ./ ../ / specifiers, so a dotted ABSOLUTE src-layout import (`pkg.client` → src/pkg/client.py) never resolved and was emitted as an stub. Fix: resolveDottedAbsoluteImport() converts dots→slashes and probes the module at the repo root and under detected src roots (discoverSourceRoots), gated to namespace-import languages (Python). A dotted specifier that resolves to no in-repo file is still treated as third-party (external stub) as before. Verified end-to-end on ngs-research-agent: zero stubs for ngs_research_agent.client; real file→file IMPORTS edges (mcp_server→client, test_mcp_server→client) now land. Regression tests: multi-line/backslash import extraction (python.test.ts) and a src-layout dotted-import → file IMPORTS edge with no stub (parse.test.ts). Ingestion 604/604, tsc + biome clean. --- .../src/pipeline/phases/parse.test.ts | 59 ++++++++++++++++++ .../ingestion/src/pipeline/phases/parse.ts | 62 ++++++++++++++++++- .../ingestion/src/providers/python.test.ts | 31 ++++++++++ packages/ingestion/src/providers/python.ts | 41 +++++++++++- 4 files changed, 190 insertions(+), 3 deletions(-) diff --git a/packages/ingestion/src/pipeline/phases/parse.test.ts b/packages/ingestion/src/pipeline/phases/parse.test.ts index f246fa68..0a121c45 100644 --- a/packages/ingestion/src/pipeline/phases/parse.test.ts +++ b/packages/ingestion/src/pipeline/phases/parse.test.ts @@ -123,6 +123,65 @@ describe("parsePhase (integration)", () => { }); }); +describe("parsePhase (Python src-layout dotted import resolution)", () => { + // A src-layout package (src/pkg/...) imported via a dotted ABSOLUTE + // specifier (`pkg.client`) must resolve to the in-repo file, not stub as + // . See field-report Issue 1 Bug B. Also exercises the multi-line + // parenthesized import (Bug A) end-to-end through the phase. + let repo: string; + + before(async () => { + repo = await mkdtemp(path.join(tmpdir(), "och-parse-pysrc-")); + await fs.mkdir(path.join(repo, "src", "pkg"), { recursive: true }); + await fs.writeFile(path.join(repo, "src", "pkg", "__init__.py"), ""); + await fs.writeFile( + path.join(repo, "src", "pkg", "client.py"), + "def get_client():\n return 1\n", + ); + await fs.writeFile( + path.join(repo, "src", "pkg", "app.py"), + [ + "from pkg.client import (", + " get_client,", + ")", + "", + "def run():", + " return get_client()", + "", + ].join("\n"), + ); + }); + + after(async () => { + await rm(repo, { recursive: true, force: true }); + }); + + it("resolves a dotted absolute src-layout import to a file IMPORTS edge (no stub)", async () => { + const { graph } = await runThreePhases(repo); + const nodes = [...graph.nodes()]; + + // No external stub minted for the in-repo dotted import. + const externalStub = nodes.find( + (n) => n.id.includes("") && n.id.includes("pkg.client"), + ); + assert.equal(externalStub, undefined, "in-repo dotted import must not stub as "); + + // A real file→file IMPORTS edge app.py → client.py exists. + const imports = [...graph.edges()].filter( + (e) => + e.type === "IMPORTS" && + String(e.from).includes("app.py") && + String(e.to).includes("client.py"), + ); + assert.ok( + imports.length >= 1, + `expected file IMPORTS app.py -> client.py; edges: ${JSON.stringify( + [...graph.edges()].filter((e) => e.type === "IMPORTS").map((e) => `${e.from}->${e.to}`), + )}`, + ); + }); +}); + describe("parsePhase (content-cache replay)", () => { let repo: string; diff --git a/packages/ingestion/src/pipeline/phases/parse.ts b/packages/ingestion/src/pipeline/phases/parse.ts index 96ea4f97..e0b3ae6e 100644 --- a/packages/ingestion/src/pipeline/phases/parse.ts +++ b/packages/ingestion/src/pipeline/phases/parse.ts @@ -771,13 +771,16 @@ function resolveImportTarget( // Only relative / absolute-in-repo specifiers can be resolved without a // package-manager layer. External packages (e.g. `react`, `numpy`) fall - // through to `undefined` and are skipped by the caller. + // through to `undefined` and are skipped by the caller — EXCEPT for + // namespace-import languages (Python), where a dotted ABSOLUTE specifier + // like `pkg.sub.mod` may still resolve to an in-repo file. We try that + // below before giving up. if ( !preprocessed.startsWith("./") && !preprocessed.startsWith("../") && !preprocessed.startsWith("/") ) { - return undefined; + return resolveDottedAbsoluteImport(preprocessed, provider, structure); } const importerDir = parentDir(importerRel); @@ -793,6 +796,61 @@ function resolveImportTarget( return undefined; } +/** + * Resolve a dotted ABSOLUTE import (`pkg.sub.mod`) to an in-repo file for + * namespace-import languages (Python). The same module can sit at the repo + * root (`pkg/sub/mod.py`) or under a src-layout root (`src/pkg/sub/mod.py`), + * so we convert dots→slashes and probe both shapes against `pathSet`. A + * dotted specifier that resolves to no in-repo file is a third-party package + * (`boto3.session`) and returns undefined — the caller emits the external + * stub as before. + * + * Only `namespace` importSemantics opt in; ES-module languages never carry a + * bare dotted absolute specifier that maps to a repo file this way. + */ +function resolveDottedAbsoluteImport( + specifier: string, + provider: LanguageProvider, + structure: StructureOutput, +): string | undefined { + if (provider.importSemantics !== "namespace") return undefined; + if (!specifier.includes(".")) return undefined; + const asPath = specifier.replace(/\./g, "/"); + // Probe the module path at the repo root and under each detected src-layout + // root. `discoverSourceRoots` is cheap (set scan) and cached per call site + // is unnecessary — pathSet membership is O(1) and the root list is tiny. + const bases = [asPath, ...discoverSourceRoots(structure).map((r) => `${r}/${asPath}`)]; + for (const base of bases) { + for (const c of candidatePathsFor(base, provider.extensions, provider.id)) { + if (structure.pathSet.has(c)) return c; + } + } + return undefined; +} + +/** + * Source-layout roots present in the repo. A package imported as `pkg.mod` + * may live at `src/pkg/mod.py` (PEP 517/518 src-layout) rather than + * `pkg/mod.py`. We detect a root by looking for a path segment that commonly + * holds package sources and actually exists in `pathSet`. Kept conservative + * (`src`) to avoid false roots; extend the list if a layout needs it. + */ +const SOURCE_ROOT_SEGMENTS: readonly string[] = ["src"]; + +function discoverSourceRoots(structure: StructureOutput): readonly string[] { + const roots = new Set(); + for (const seg of SOURCE_ROOT_SEGMENTS) { + const prefix = `${seg}/`; + for (const p of structure.pathSet) { + if (p.startsWith(prefix)) { + roots.add(seg); + break; + } + } + } + return [...roots]; +} + function candidatePathsFor( base: string, extensions: readonly string[], diff --git a/packages/ingestion/src/providers/python.test.ts b/packages/ingestion/src/providers/python.test.ts index b07ec5c1..8eac11b2 100644 --- a/packages/ingestion/src/providers/python.test.ts +++ b/packages/ingestion/src/providers/python.test.ts @@ -113,6 +113,37 @@ describe("pythonProvider (behavior)", () => { assert.equal(utils?.isWildcard, true); }); + it("parses multi-line parenthesized and backslash-continued imports", () => { + // black/ruff wrap long import lists in parens; a line-based parser drops + // these silently (rest='(' → 0 names). See field-report Issue 1 Bug A. + const source = [ + "from pkg.client import (", + " CredentialError,", + " get_client,", + " preflight,", + ")", + "from other import a, \\", + " b", + "from solo import (only_one)", + ].join("\n"); + const imports = pythonProvider.extractImports({ filePath: "m.py", sourceText: source }); + const byName = (s: string) => imports.find((i) => i.source === s); + + const client = byName("pkg.client"); + assert.ok(client, "multi-line parenthesized import must be extracted"); + assert.deepEqual([...(client?.importedNames ?? [])].sort(), [ + "CredentialError", + "get_client", + "preflight", + ]); + + const other = byName("other"); + assert.deepEqual([...(other?.importedNames ?? [])].sort(), ["a", "b"]); + + const solo = byName("solo"); + assert.deepEqual([...(solo?.importedNames ?? [])], ["only_one"]); + }); + it("extracts self-receiver calls in methods", () => { const defs = pythonProvider.extractDefinitions({ filePath: fx.filePath, diff --git a/packages/ingestion/src/providers/python.ts b/packages/ingestion/src/providers/python.ts index e7b242e2..939f6771 100644 --- a/packages/ingestion/src/providers/python.ts +++ b/packages/ingestion/src/providers/python.ts @@ -186,10 +186,49 @@ function qualifiedForCapture( * `from X import *` → package-wildcard * `from . import x` → relative import (source begins `.`) */ +/** + * Collapse Python's physical lines into logical lines so a multi-line import + * is matched as one unit. Two continuation forms are joined: + * - parenthesized lists: `from m import (\n a,\n b,\n)` — join while the + * running open-paren count is > 0; + * - explicit backslash continuation: a line ending in `\`. + * Both are ubiquitous in real Python (black / ruff wrap long import lists in + * parens). Without joining, the per-line regex sees `from m import (` → + * rest `(` → zero names → the whole import is silently dropped. + * + * Comments are already stripped upstream, so a `(` here is structural, not a + * literal inside a string/comment. Paren counting is a coarse approximation + * (it doesn't track string literals) but import statements never contain + * string-embedded parens, so it is exact for the import grammar. + */ +function joinLogicalLines(lines: readonly string[]): string[] { + const out: string[] = []; + let buf = ""; + let depth = 0; + for (const raw of lines) { + let line = raw; + let continued = false; + if (depth === 0 && /\\\s*$/.test(line)) { + line = line.replace(/\\\s*$/, " "); + continued = true; + } + buf = buf === "" ? line : `${buf} ${line.trim()}`; + for (const ch of line) { + if (ch === "(") depth += 1; + else if (ch === ")") depth = Math.max(0, depth - 1); + } + if (depth > 0 || continued) continue; + out.push(buf); + buf = ""; + } + if (buf !== "") out.push(buf); + return out; +} + function extractPyImports(input: ExtractImportsInput): readonly ExtractedImport[] { const { filePath, sourceText } = input; const stripped = stripComments(sourceText); - const lines = stripped.split("\n"); + const lines = joinLogicalLines(stripped.split("\n")); const out: ExtractedImport[] = []; for (const raw of lines) {