Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions packages/ingestion/src/pipeline/phases/parse.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,65 @@ describe("parsePhase (integration)", () => {
});
});

describe("parsePhase (Python src-layout dotted import resolution)", () => {
// A src-layout package (src/pkg/...) imported via a dotted ABSOLUTE
// specifier (`pkg.client`) must resolve to the in-repo file, not stub as
// <external>. See field-report Issue 1 Bug B. Also exercises the multi-line
// parenthesized import (Bug A) end-to-end through the phase.
let repo: string;

before(async () => {
repo = await mkdtemp(path.join(tmpdir(), "och-parse-pysrc-"));
await fs.mkdir(path.join(repo, "src", "pkg"), { recursive: true });
await fs.writeFile(path.join(repo, "src", "pkg", "__init__.py"), "");
await fs.writeFile(
path.join(repo, "src", "pkg", "client.py"),
"def get_client():\n return 1\n",
);
await fs.writeFile(
path.join(repo, "src", "pkg", "app.py"),
[
"from pkg.client import (",
" get_client,",
")",
"",
"def run():",
" return get_client()",
"",
].join("\n"),
);
});

after(async () => {
await rm(repo, { recursive: true, force: true });
});

it("resolves a dotted absolute src-layout import to a file IMPORTS edge (no <external> stub)", async () => {
const { graph } = await runThreePhases(repo);
const nodes = [...graph.nodes()];

// No external stub minted for the in-repo dotted import.
const externalStub = nodes.find(
(n) => n.id.includes("<external>") && n.id.includes("pkg.client"),
);
assert.equal(externalStub, undefined, "in-repo dotted import must not stub as <external>");

// A real file→file IMPORTS edge app.py → client.py exists.
const imports = [...graph.edges()].filter(
(e) =>
e.type === "IMPORTS" &&
String(e.from).includes("app.py") &&
String(e.to).includes("client.py"),
);
assert.ok(
imports.length >= 1,
`expected file IMPORTS app.py -> client.py; edges: ${JSON.stringify(
[...graph.edges()].filter((e) => e.type === "IMPORTS").map((e) => `${e.from}->${e.to}`),
)}`,
);
});
});

describe("parsePhase (content-cache replay)", () => {
let repo: string;

Expand Down
62 changes: 60 additions & 2 deletions packages/ingestion/src/pipeline/phases/parse.ts
Original file line number Diff line number Diff line change
Expand Up @@ -771,13 +771,16 @@ function resolveImportTarget(

// Only relative / absolute-in-repo specifiers can be resolved without a
// package-manager layer. External packages (e.g. `react`, `numpy`) fall
// through to `undefined` and are skipped by the caller.
// through to `undefined` and are skipped by the caller — EXCEPT for
// namespace-import languages (Python), where a dotted ABSOLUTE specifier
// like `pkg.sub.mod` may still resolve to an in-repo file. We try that
// below before giving up.
if (
!preprocessed.startsWith("./") &&
!preprocessed.startsWith("../") &&
!preprocessed.startsWith("/")
) {
return undefined;
return resolveDottedAbsoluteImport(preprocessed, provider, structure);
}

const importerDir = parentDir(importerRel);
Expand All @@ -793,6 +796,61 @@ function resolveImportTarget(
return undefined;
}

/**
* Resolve a dotted ABSOLUTE import (`pkg.sub.mod`) to an in-repo file for
* namespace-import languages (Python). The same module can sit at the repo
* root (`pkg/sub/mod.py`) or under a src-layout root (`src/pkg/sub/mod.py`),
* so we convert dots→slashes and probe both shapes against `pathSet`. A
* dotted specifier that resolves to no in-repo file is a third-party package
* (`boto3.session`) and returns undefined — the caller emits the external
* stub as before.
*
* Only `namespace` importSemantics opt in; ES-module languages never carry a
* bare dotted absolute specifier that maps to a repo file this way.
*/
function resolveDottedAbsoluteImport(
specifier: string,
provider: LanguageProvider,
structure: StructureOutput,
): string | undefined {
if (provider.importSemantics !== "namespace") return undefined;
if (!specifier.includes(".")) return undefined;
const asPath = specifier.replace(/\./g, "/");
// Probe the module path at the repo root and under each detected src-layout
// root. `discoverSourceRoots` is cheap (set scan) and cached per call site
// is unnecessary — pathSet membership is O(1) and the root list is tiny.
const bases = [asPath, ...discoverSourceRoots(structure).map((r) => `${r}/${asPath}`)];
for (const base of bases) {
for (const c of candidatePathsFor(base, provider.extensions, provider.id)) {
if (structure.pathSet.has(c)) return c;
}
}
return undefined;
}

/**
* Source-layout roots present in the repo. A package imported as `pkg.mod`
* may live at `src/pkg/mod.py` (PEP 517/518 src-layout) rather than
* `pkg/mod.py`. We detect a root by looking for a path segment that commonly
* holds package sources and actually exists in `pathSet`. Kept conservative
* (`src`) to avoid false roots; extend the list if a layout needs it.
*/
const SOURCE_ROOT_SEGMENTS: readonly string[] = ["src"];

function discoverSourceRoots(structure: StructureOutput): readonly string[] {
const roots = new Set<string>();
for (const seg of SOURCE_ROOT_SEGMENTS) {
const prefix = `${seg}/`;
for (const p of structure.pathSet) {
if (p.startsWith(prefix)) {
roots.add(seg);
break;
}
}
}
return [...roots];
}

function candidatePathsFor(
base: string,
extensions: readonly string[],
Expand Down
31 changes: 31 additions & 0 deletions packages/ingestion/src/providers/python.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,37 @@ describe("pythonProvider (behavior)", () => {
assert.equal(utils?.isWildcard, true);
});

it("parses multi-line parenthesized and backslash-continued imports", () => {
// black/ruff wrap long import lists in parens; a line-based parser drops
// these silently (rest='(' → 0 names). See field-report Issue 1 Bug A.
const source = [
"from pkg.client import (",
" CredentialError,",
" get_client,",
" preflight,",
")",
"from other import a, \\",
" b",
"from solo import (only_one)",
].join("\n");
const imports = pythonProvider.extractImports({ filePath: "m.py", sourceText: source });
const byName = (s: string) => imports.find((i) => i.source === s);

const client = byName("pkg.client");
assert.ok(client, "multi-line parenthesized import must be extracted");
assert.deepEqual([...(client?.importedNames ?? [])].sort(), [
"CredentialError",
"get_client",
"preflight",
]);

const other = byName("other");
assert.deepEqual([...(other?.importedNames ?? [])].sort(), ["a", "b"]);

const solo = byName("solo");
assert.deepEqual([...(solo?.importedNames ?? [])], ["only_one"]);
});

it("extracts self-receiver calls in methods", () => {
const defs = pythonProvider.extractDefinitions({
filePath: fx.filePath,
Expand Down
41 changes: 40 additions & 1 deletion packages/ingestion/src/providers/python.ts
Original file line number Diff line number Diff line change
Expand Up @@ -186,10 +186,49 @@ function qualifiedForCapture(
* `from X import *` → package-wildcard
* `from . import x` → relative import (source begins `.`)
*/
/**
* Collapse Python's physical lines into logical lines so a multi-line import
* is matched as one unit. Two continuation forms are joined:
* - parenthesized lists: `from m import (\n a,\n b,\n)` — join while the
* running open-paren count is > 0;
* - explicit backslash continuation: a line ending in `\`.
* Both are ubiquitous in real Python (black / ruff wrap long import lists in
* parens). Without joining, the per-line regex sees `from m import (` →
* rest `(` → zero names → the whole import is silently dropped.
*
* Comments are already stripped upstream, so a `(` here is structural, not a
* literal inside a string/comment. Paren counting is a coarse approximation
* (it doesn't track string literals) but import statements never contain
* string-embedded parens, so it is exact for the import grammar.
*/
function joinLogicalLines(lines: readonly string[]): string[] {
const out: string[] = [];
let buf = "";
let depth = 0;
for (const raw of lines) {
let line = raw;
let continued = false;
if (depth === 0 && /\\\s*$/.test(line)) {
line = line.replace(/\\\s*$/, " ");
continued = true;
}
buf = buf === "" ? line : `${buf} ${line.trim()}`;
for (const ch of line) {
if (ch === "(") depth += 1;
else if (ch === ")") depth = Math.max(0, depth - 1);
}
if (depth > 0 || continued) continue;
out.push(buf);
buf = "";
}
if (buf !== "") out.push(buf);
return out;
}

function extractPyImports(input: ExtractImportsInput): readonly ExtractedImport[] {
const { filePath, sourceText } = input;
const stripped = stripComments(sourceText);
const lines = stripped.split("\n");
const lines = joinLogicalLines(stripped.split("\n"));
const out: ExtractedImport[] = [];

for (const raw of lines) {
Expand Down
Loading