Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions graphify/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -15821,6 +15821,30 @@ def _body_of(block):
}


# Extensionless executables (CLI entry points like `devctl` or `manage`) carry
# their language in the shebang, not the suffix. detect.classify_file already
# routes them to the CODE path via _shebang_interpreter; _get_extractor must
# honor the same signal or these files are classified as code and then silently
# dropped by extraction. Only interpreters with a real extractor are mapped —
# detect's wider set (perl, fish, tcsh, Rscript) stays unmapped and skipped.
# Maps a shebang interpreter name to the extractor used for extensionless
# files. Interpreter aliases that share one extractor are grouped together;
# any interpreter not listed here resolves to no extractor at lookup time.
_SHEBANG_DISPATCH: dict[str, Any] = {
    # Python, regardless of the major-version suffix in the shebang.
    **dict.fromkeys(("python", "python2", "python3"), extract_python),
    # Shell family: all of these go through the bash extractor.
    **dict.fromkeys(("bash", "sh", "dash", "zsh", "ksh"), extract_bash),
    # Node.js under both of its common executable names.
    **dict.fromkeys(("node", "nodejs"), extract_js),
    # Interpreters with a single name each.
    "ruby": extract_ruby,
    "lua": extract_lua,
    "php": extract_php,
    "julia": extract_julia,
}


# ObjC-only directives. They are illegal in C and C++, so finding one in a `.h`
# file is a near-zero-false-positive signal that the header is Objective-C (and so
# belongs to extract_objc, not extract_c). `@property` is deliberately excluded: it
Expand Down Expand Up @@ -15908,6 +15932,14 @@ def _get_extractor(path: Path) -> Any | None:
# grammar has no class_specifier). Reroute to extract_cpp (#1547).
if _is_cpp_header(path):
return extract_cpp
# Extensionless files: resolve by shebang, mirroring detect.classify_file.
# Without this, detect labels e.g. `#!/usr/bin/env bash` CLIs as code but
# extraction returns no extractor and the file silently contributes nothing.
if not suffix:
from graphify.detect import _shebang_interpreter
interp = _shebang_interpreter(path)
if interp is not None:
return _SHEBANG_DISPATCH.get(interp)
return _DISPATCH.get(suffix)


Expand Down
50 changes: 50 additions & 0 deletions tests/test_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -1514,6 +1514,56 @@ def test_extract_json_via_dispatch():
assert _get_extractor(Path("foo.json")) is extract_json


def test_extensionless_shebang_via_dispatch(tmp_path):
    """Files with no suffix get their extractor from the shebang line.

    Each case writes an extensionless script into ``tmp_path`` and checks
    that ``_get_extractor`` returns the extractor matching its interpreter.
    """
    from graphify.extract import _get_extractor

    # (file name, file contents, expected extractor)
    cases = (
        ("devctl", "#!/usr/bin/env bash\necho hi\n", extract_bash),
        ("manage", "#!/usr/bin/env python3\nprint('hi')\n", extract_python),
        # `env -S` split-args form: the interpreter follows the -S flag.
        ("runner", "#!/usr/bin/env -S bash -eu\necho hi\n", extract_bash),
    )
    for file_name, contents, expected in cases:
        script = tmp_path / file_name
        script.write_text(contents)
        assert _get_extractor(script) is expected


def test_extensionless_without_usable_shebang_stays_unsupported(tmp_path):
    """Extensionless files without a mapped interpreter yield no extractor."""
    from graphify.extract import _get_extractor

    # (file name, file contents)
    unsupported = (
        # No shebang at all: nothing to dispatch on.
        ("LICENSE-COPY", "plain text, no shebang\n"),
        # A perl shebang: this test expects no extractor for it, so the file
        # stays skipped rather than being parsed with a wrong grammar.
        ("legacy", "#!/usr/bin/env perl\nprint 1;\n"),
    )
    for file_name, contents in unsupported:
        candidate = tmp_path / file_name
        candidate.write_text(contents)
        assert _get_extractor(candidate) is None


def test_extract_extensionless_bash_cli_end_to_end(tmp_path):
    """Run full extraction on a shebang-only bash CLI and check its node IDs.

    The expected IDs are the file name joined to each function name
    (``devctl_helper``, ``devctl_main``).
    """
    script_body = (
        "#!/usr/bin/env bash\n"
        "helper() { echo hi; }\n"
        "main() { helper; }\n"
        'main "$@"\n'
    )
    cli_path = tmp_path / "devctl"
    cli_path.write_text(script_body)

    extraction = extract([cli_path], cache_root=tmp_path)
    node_ids = {node["id"] for node in extraction["nodes"]}

    for expected_id in ("devctl_helper", "devctl_main"):
        assert expected_id in node_ids


def test_extract_bash_node_metadata_is_sanitized():
"""Bash extractor must route node metadata through sanitize_metadata so
HTML-sensitive characters cannot reach downstream graph viewers raw."""
Expand Down