From d113b9526529353be89ab9dbe8e684e1ead7c040 Mon Sep 17 00:00:00 2001 From: Swarit Pandey Date: Mon, 29 Jun 2026 00:00:54 +0530 Subject: [PATCH 1/3] feat(python): add disk-based package scanning (dist-info metadata) Read installed Python packages from *.dist-info/METADATA and *.egg-info/ PKG-INFO instead of running pip. Default to disk scan; legacy pip path kept behind --legacy-python-scan / use_legacy_python_scan. --- cmd/stepsecurity-dev-machine-guard/main.go | 5 + internal/cli/cli.go | 7 + internal/config/config.go | 13 + internal/detector/pythondist.go | 291 +++++++++++++++++++++ internal/detector/pythondist_test.go | 171 ++++++++++++ internal/detector/pythonproject.go | 22 +- internal/detector/pythonscan.go | 42 +++ internal/scan/scanner.go | 11 +- internal/telemetry/telemetry.go | 10 +- 9 files changed, 568 insertions(+), 4 deletions(-) create mode 100644 internal/detector/pythondist.go create mode 100644 internal/detector/pythondist_test.go diff --git a/cmd/stepsecurity-dev-machine-guard/main.go b/cmd/stepsecurity-dev-machine-guard/main.go index 3816133..73d6b68 100644 --- a/cmd/stepsecurity-dev-machine-guard/main.go +++ b/cmd/stepsecurity-dev-machine-guard/main.go @@ -98,6 +98,11 @@ func main() { if cfg.EnablePythonScan == nil && config.EnablePythonScan != nil { cfg.EnablePythonScan = config.EnablePythonScan } + // --legacy-python-scan / --disk-python-scan override the config-file value + // (which config.Load already applied to config.UseLegacyPythonScan). + if cfg.UseLegacyPythonScan != nil { + config.UseLegacyPythonScan = *cfg.UseLegacyPythonScan + } if cfg.IncludeTCCProtected == nil && config.IncludeTCCProtected != nil { cfg.IncludeTCCProtected = config.IncludeTCCProtected } diff --git a/internal/cli/cli.go b/internal/cli/cli.go index ac06880..d96669e 100644 --- a/internal/cli/cli.go +++ b/internal/cli/cli.go @@ -29,6 +29,7 @@ type Config struct { EnableNPMScan *bool // nil=auto, true/false=explicit EnableBrewScan *bool // nil=auto, true/false=explicit EnablePythonScan *bool // nil=auto, true/false=explicit + UseLegacyPythonScan *bool // nil=auto (disk scan); true=pip path, false=disk path IncludeBundledPlugins bool // --include-bundled-plugins: include bundled/platform plugins in output // IncludeTCCProtected is tristate: nil or false = skip the macOS // TCC-protected dirs (Documents, Downloads, ~/Library/Mail, ...) @@ -182,6 +183,12 @@ func Parse(args []string) (*Config, error) { case arg == "--disable-python-scan": v := false cfg.EnablePythonScan = &v + case arg == "--legacy-python-scan": + v := true + cfg.UseLegacyPythonScan = &v + case arg == "--disk-python-scan": + v := false + cfg.UseLegacyPythonScan = &v case arg == "--include-bundled-plugins": cfg.IncludeBundledPlugins = true case arg == "--include-tcc-protected": diff --git a/internal/config/config.go b/internal/config/config.go index 6f3aacf..eaf32cb 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -34,6 +34,14 @@ var ( // STEPSEC_ENABLE_SCAN_STATE=1) to opt back in. STEPSEC_DISABLE_SCAN_STATE=1 // always forces legacy. UseLegacyPackageScan = true + + // UseLegacyPythonScan, when true, reverts Python package discovery to the + // command-based path (`pip list` per venv and `pip3`/`conda`/`uv list` + // for globals). Defaults to false: Python packages are read from on-disk + // install metadata (*.dist-info/METADATA, *.egg-info/PKG-INFO) with no + // package-manager subprocess. Set use_legacy_python_scan=true in + // config.json (or --legacy-python-scan) to opt back into the pip path. + UseLegacyPythonScan = false ) // MaxExecutionDuration is the whole-process execution-watchdog limit @@ -64,6 +72,7 @@ type ConfigFile struct { InstallDir string `json:"install_dir,omitempty"` MaxExecutionDuration string `json:"max_execution_duration,omitempty"` UseLegacyPackageScan *bool `json:"use_legacy_package_scan,omitempty"` + UseLegacyPythonScan *bool `json:"use_legacy_python_scan,omitempty"` } // userConfigDir returns ~/.stepsecurity — the per-user config location. @@ -199,6 +208,9 @@ func Load() { if cfg.UseLegacyPackageScan != nil { UseLegacyPackageScan = *cfg.UseLegacyPackageScan } + if cfg.UseLegacyPythonScan != nil { + UseLegacyPythonScan = *cfg.UseLegacyPythonScan + } } // IsEnterpriseMode returns true if valid enterprise credentials are configured. @@ -515,6 +527,7 @@ func ShowConfigure() { fmt.Printf(" %-24s %s\n", "Enable NPM Scan:", displayBoolScan(cfg.EnableNPMScan)) fmt.Printf(" %-24s %s\n", "Enable Brew Scan:", displayBoolScan(cfg.EnableBrewScan)) fmt.Printf(" %-24s %s\n", "Enable Python Scan:", displayBoolScan(cfg.EnablePythonScan)) + fmt.Printf(" %-24s %s\n", "Legacy Python Scan:", displayBoolScan(cfg.UseLegacyPythonScan)) fmt.Printf(" %-24s %s\n", "Scan TCC-Protected Dirs:", displayTCC(cfg.IncludeTCCProtected)) fmt.Printf(" %-24s %s\n", "Color Mode:", displayColorMode(cfg.ColorMode)) fmt.Printf(" %-24s %s\n", "Output Format:", displayOutputFormat(cfg.OutputFormat)) diff --git a/internal/detector/pythondist.go b/internal/detector/pythondist.go new file mode 100644 index 0000000..995be62 --- /dev/null +++ b/internal/detector/pythondist.go @@ -0,0 +1,291 @@ +// Disk-based Python package discovery. +// +// PythonDistDetector inventories installed Python packages by reading their +// on-disk install metadata — *.dist-info/METADATA (PEP 566) and the legacy +// *.egg-info/PKG-INFO — instead of running `pip list`. This is the same set +// pip itself reports, since pip derives its listing from these dirs. +// +// Read-only: no pip/uv/conda subprocess. Per-file size is capped so a +// package shipping a giant METADATA description payload cannot blow up memory. +package detector + +import ( + "bufio" + "bytes" + "fmt" + "io" + "os" + "path/filepath" + "runtime" + "sort" + "strings" + + "github.com/step-security/dev-machine-guard/internal/executor" + "github.com/step-security/dev-machine-guard/internal/model" + "github.com/step-security/dev-machine-guard/internal/progress" + "github.com/step-security/dev-machine-guard/internal/tcc" +) + +// maxMetadataFileSize bounds a single METADATA / PKG-INFO read. The header +// block we care about is tiny; the cap only guards against pathological +// description payloads. +const maxMetadataFileSize = 1 << 20 // 1 MiB + +// PythonDistDetector discovers installed Python packages from install +// metadata on disk, with no package-manager subprocess. +type PythonDistDetector struct { + exec executor.Executor + log *progress.Logger + skipper *tcc.Skipper + maxFileSize int64 +} + +func NewPythonDistDetector(exec executor.Executor) *PythonDistDetector { + return &PythonDistDetector{exec: exec, log: progress.NewNoop(), maxFileSize: maxMetadataFileSize} +} + +// WithSkipper attaches a TCC skipper so the walk skips macOS-protected +// directories. A nil skipper is a no-op. Returns the detector for chaining. +func (d *PythonDistDetector) WithSkipper(s *tcc.Skipper) *PythonDistDetector { + d.skipper = s + return d +} + +// WithLogger attaches a progress logger. A nil logger falls back to the +// no-op default. Returns the detector for chaining. +func (d *PythonDistDetector) WithLogger(log *progress.Logger) *PythonDistDetector { + if log != nil { + d.log = log + } + return d +} + +// ScanVenv returns the packages installed in a single virtual environment by +// reading the dist-info/egg-info metadata under it (typically +// lib/python*/site-packages or Lib/site-packages). Replaces the per-venv +// `pip list` call. +func (d *PythonDistDetector) ScanVenv(venvPath string) []model.PackageDetail { + return d.ScanRoots([]string{venvPath}) +} + +// ScanRoots walks each root and returns every distinct package discovered via +// install metadata. Packages are de-duplicated by (lowercased name, version) +// so the same install surfaced once is reported once, and the result is +// sorted by name then version for stable output. +func (d *PythonDistDetector) ScanRoots(roots []string) []model.PackageDetail { + seen := make(map[string]struct{}) + var pkgs []model.PackageDetail + + for _, root := range roots { + _ = filepath.WalkDir(root, func(path string, entry os.DirEntry, err error) error { + if err != nil { + return nil + } + if entry.IsDir() { + if d.skipper.ShouldSkip(path, root) { + return filepath.SkipDir + } + if shouldSkipMetadataDir(entry.Name()) { + return filepath.SkipDir + } + return nil + } + + name, version, ok := d.parseMetadataFile(path, entry.Name()) + if !ok { + return nil + } + key := strings.ToLower(name) + "\x00" + version + if _, dup := seen[key]; dup { + return nil + } + seen[key] = struct{}{} + pkgs = append(pkgs, model.PackageDetail{Name: name, Version: version}) + return nil + }) + } + + sort.Slice(pkgs, func(i, j int) bool { + if pkgs[i].Name == pkgs[j].Name { + return pkgs[i].Version < pkgs[j].Version + } + return pkgs[i].Name < pkgs[j].Name + }) + return pkgs +} + +// ScanGlobalPackages walks the host's global / user site-packages roots and +// returns the installed packages, replacing the `pip3 list` global scan. +func (d *PythonDistDetector) ScanGlobalPackages() []model.PythonPackage { + details := d.ScanRoots(PythonGlobalRoots(d.exec)) + out := make([]model.PythonPackage, len(details)) + for i, p := range details { + out[i] = model.PythonPackage{Name: p.Name, Version: p.Version} + } + return out +} + +// parseMetadataFile returns the package name and version if path is a +// recognised metadata file (*.dist-info/METADATA or *.egg-info/PKG-INFO). +func (d *PythonDistDetector) parseMetadataFile(path, base string) (name, version string, ok bool) { + switch base { + case "METADATA": + if !isDistInfoMetadata(path) { + return "", "", false + } + case "PKG-INFO": + if !isEggInfoPKGInfo(path) { + return "", "", false + } + default: + return "", "", false + } + + data, err := d.readBounded(path) + if err != nil { + return "", "", false + } + name, version = parseRFC822NameVersion(data) + if name == "" || version == "" { + d.log.Debug("python dist scan: %s missing Name/Version header — skipping", path) + return "", "", false + } + return name, version, true +} + +// readBounded reads path through the executor and rejects files over the size +// cap. The metadata header we parse is tiny; the cap only guards memory. +func (d *PythonDistDetector) readBounded(path string) ([]byte, error) { + data, err := d.exec.ReadFile(path) + if err != nil { + return nil, err + } + if d.maxFileSize > 0 && int64(len(data)) > d.maxFileSize { + d.log.Debug("python dist scan: %s exceeds %d bytes — skipping", path, d.maxFileSize) + return nil, fmt.Errorf("file %s exceeds max size %d", path, d.maxFileSize) + } + return data, nil +} + +// isDistInfoMetadata reports whether path is METADATA inside a *.dist-info dir. +func isDistInfoMetadata(path string) bool { + return filepath.Base(path) == "METADATA" && strings.HasSuffix(filepath.Dir(path), ".dist-info") +} + +// isEggInfoPKGInfo reports whether path is PKG-INFO inside a *.egg-info dir. +func isEggInfoPKGInfo(path string) bool { + return filepath.Base(path) == "PKG-INFO" && strings.HasSuffix(filepath.Dir(path), ".egg-info") +} + +// shouldSkipMetadataDir lists directories that never hold install metadata and +// are costly to descend. Unlike the venv-discovery skip list this does NOT +// skip site-packages or dotted dirs — installed packages live under both. +func shouldSkipMetadataDir(name string) bool { + switch name { + case "node_modules", ".git", ".hg", ".svn", ".cache", + "__pycache__", ".tox", ".nox", ".mypy_cache", ".pytest_cache", ".ruff_cache": + return true + } + return false +} + +// parseRFC822NameVersion reads only the RFC-822 header block of a METADATA / +// PKG-INFO file, stopping at the first blank line so the (potentially large) +// description payload is never scanned. +func parseRFC822NameVersion(data []byte) (name, version string) { + br := bufio.NewReader(bytes.NewReader(data)) + for { + line, err := br.ReadString('\n') + trim := strings.TrimRight(line, "\r\n") + if trim == "" { + break + } + // Continuation lines start with whitespace; we only care about + // Name/Version, which are single-line in practice. + if trim[0] == ' ' || trim[0] == '\t' { + if err == io.EOF { + break + } + continue + } + if idx := strings.IndexByte(trim, ':'); idx > 0 { + key := strings.TrimSpace(trim[:idx]) + val := strings.TrimSpace(trim[idx+1:]) + switch strings.ToLower(key) { + case "name": + if name == "" { + name = val + } + case "version": + if version == "" { + version = val + } + } + } + if name != "" && version != "" { + break + } + if err != nil { + break + } + } + return name, version +} + +// PythonGlobalRoots returns the global / user site-packages locations worth +// scanning for system-wide Python packages, keeping only those present on +// this host. These are the install roots the old `pip3 list` global scan +// reported from, plus user-site and version-manager locations the +// command-based scan tended to miss. +func PythonGlobalRoots(exec executor.Executor) []string { + var candidates []string + add := func(paths ...string) { candidates = append(candidates, paths...) } + addGlob := func(pattern string) { + if matches, err := filepath.Glob(pattern); err == nil { + add(matches...) + } + } + + if home, err := os.UserHomeDir(); err == nil && home != "" { + addGlob(filepath.Join(home, ".local", "lib", "python*", "site-packages")) + add(filepath.Join(home, ".local", "share", "pipx", "venvs")) + addGlob(filepath.Join(home, ".pyenv", "versions", "*", "lib", "python*", "site-packages")) + } + + switch runtime.GOOS { + case "darwin": + addGlob("/opt/homebrew/lib/python*/site-packages") + addGlob("/usr/local/lib/python*/site-packages") + addGlob("/Library/Frameworks/Python.framework/Versions/*/lib/python*/site-packages") + if home, err := os.UserHomeDir(); err == nil && home != "" { + addGlob(filepath.Join(home, "Library", "Python", "*", "lib", "python", "site-packages")) + } + case "linux": + addGlob("/usr/lib/python*/dist-packages") + addGlob("/usr/lib/python*/site-packages") + addGlob("/usr/lib/python3/dist-packages") + addGlob("/usr/local/lib/python*/dist-packages") + addGlob("/usr/local/lib/python*/site-packages") + } + + // Keep only existing directories; absent candidates are normal. + seen := make(map[string]struct{}, len(candidates)) + roots := make([]string, 0, len(candidates)) + for _, c := range candidates { + if _, dup := seen[c]; dup { + continue + } + seen[c] = struct{}{} + if exec.FileExists(c) || isDir(c) { + roots = append(roots, c) + } + } + return roots +} + +// isDir reports whether path is an existing directory. exec.FileExists rejects +// directories, so global roots (which are dirs) are confirmed here. +func isDir(path string) bool { + info, err := os.Stat(path) + return err == nil && info.IsDir() +} diff --git a/internal/detector/pythondist_test.go b/internal/detector/pythondist_test.go new file mode 100644 index 0000000..4fc8e1b --- /dev/null +++ b/internal/detector/pythondist_test.go @@ -0,0 +1,171 @@ +package detector + +import ( + "os" + "path/filepath" + "testing" + + "github.com/step-security/dev-machine-guard/internal/executor" +) + +// mustWriteMeta writes a metadata file to the real filesystem (so the walker +// finds it) and registers its content in the mock (so ReadFile returns it). +func mustWriteMeta(t *testing.T, mock *executor.Mock, path, content string) { + t.Helper() + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(path, []byte(content), 0o644); err != nil { + t.Fatal(err) + } + mock.SetFile(path, []byte(content)) +} + +const sampleMetadata = `Metadata-Version: 2.1 +Name: requests +Version: 2.31.0 +Summary: Python HTTP for Humans. + +This is the long description that should never be parsed. +Version: 9.9.9 (this trailing header must be ignored) +` + +func TestPythonDistDetector_ScanVenv_DistInfo(t *testing.T) { + dir := t.TempDir() + mock := executor.NewMock() + sp := filepath.Join(dir, ".venv", "lib", "python3.11", "site-packages") + + mustWriteMeta(t, mock, filepath.Join(sp, "requests-2.31.0.dist-info", "METADATA"), sampleMetadata) + mustWriteMeta(t, mock, filepath.Join(sp, "click-8.1.7.dist-info", "METADATA"), + "Name: click\nVersion: 8.1.7\n\nbody") + + det := NewPythonDistDetector(mock) + pkgs := det.ScanVenv(filepath.Join(dir, ".venv")) + + if len(pkgs) != 2 { + t.Fatalf("expected 2 packages, got %d: %+v", len(pkgs), pkgs) + } + // Sorted by name: click then requests. + if pkgs[0].Name != "click" || pkgs[0].Version != "8.1.7" { + t.Errorf("pkgs[0] = %+v, want click 8.1.7", pkgs[0]) + } + if pkgs[1].Name != "requests" || pkgs[1].Version != "2.31.0" { + t.Errorf("pkgs[1] = %+v, want requests 2.31.0", pkgs[1]) + } +} + +func TestPythonDistDetector_EggInfoFallback(t *testing.T) { + dir := t.TempDir() + mock := executor.NewMock() + sp := filepath.Join(dir, "lib", "python3.9", "site-packages") + + mustWriteMeta(t, mock, filepath.Join(sp, "legacy.egg-info", "PKG-INFO"), + "Metadata-Version: 1.0\nName: legacy\nVersion: 0.1.0\n\nbody") + + pkgs := NewPythonDistDetector(mock).ScanRoots([]string{dir}) + if len(pkgs) != 1 || pkgs[0].Name != "legacy" || pkgs[0].Version != "0.1.0" { + t.Fatalf("expected legacy 0.1.0, got %+v", pkgs) + } +} + +func TestPythonDistDetector_SkipsCachesAndNonDistInfo(t *testing.T) { + dir := t.TempDir() + mock := executor.NewMock() + sp := filepath.Join(dir, "site-packages") + + // dist-info under __pycache__ must be skipped (dir is pruned). + mustWriteMeta(t, mock, filepath.Join(sp, "__pycache__", "evil-1.0.dist-info", "METADATA"), + "Name: evil\nVersion: 1.0\n\nx") + // A METADATA file NOT inside a *.dist-info dir must be ignored. + mustWriteMeta(t, mock, filepath.Join(sp, "notadist", "METADATA"), + "Name: nope\nVersion: 1.0\n\nx") + // A real package, to prove the walk still works. + mustWriteMeta(t, mock, filepath.Join(sp, "good-2.0.dist-info", "METADATA"), + "Name: good\nVersion: 2.0\n\nx") + + pkgs := NewPythonDistDetector(mock).ScanRoots([]string{dir}) + if len(pkgs) != 1 || pkgs[0].Name != "good" { + t.Fatalf("expected only good, got %+v", pkgs) + } +} + +func TestPythonDistDetector_MalformedAndDedup(t *testing.T) { + dir := t.TempDir() + mock := executor.NewMock() + sp := filepath.Join(dir, "site-packages") + + // Missing Version → skipped. + mustWriteMeta(t, mock, filepath.Join(sp, "broken-0.dist-info", "METADATA"), + "Name: broken\nSummary: no version\n\nx") + // Same (name, version) twice → de-duplicated. + mustWriteMeta(t, mock, filepath.Join(sp, "dup-1.0.dist-info", "METADATA"), + "Name: dup\nVersion: 1.0\n\nx") + mustWriteMeta(t, mock, filepath.Join(sp, "dup-1.0-py3.dist-info", "METADATA"), + "Name: dup\nVersion: 1.0\n\nx") + + pkgs := NewPythonDistDetector(mock).ScanRoots([]string{dir}) + if len(pkgs) != 1 || pkgs[0].Name != "dup" { + t.Fatalf("expected single dup, got %+v", pkgs) + } +} + +func TestPythonDistDetector_SizeCap(t *testing.T) { + dir := t.TempDir() + mock := executor.NewMock() + sp := filepath.Join(dir, "site-packages") + mustWriteMeta(t, mock, filepath.Join(sp, "big-1.0.dist-info", "METADATA"), + "Name: big\nVersion: 1.0\n\nx") + + det := NewPythonDistDetector(mock) + det.maxFileSize = 5 // smaller than the header → rejected + if pkgs := det.ScanRoots([]string{dir}); len(pkgs) != 0 { + t.Fatalf("expected size-cap to drop the package, got %+v", pkgs) + } +} + +func TestParseRFC822NameVersion(t *testing.T) { + name, version := parseRFC822NameVersion([]byte(sampleMetadata)) + if name != "requests" || version != "2.31.0" { + t.Fatalf("got name=%q version=%q, want requests/2.31.0 (must stop at blank line)", name, version) + } +} + +func TestIsDistInfoAndEggInfo(t *testing.T) { + if !isDistInfoMetadata("/a/foo-1.0.dist-info/METADATA") { + t.Error("expected dist-info METADATA to match") + } + if isDistInfoMetadata("/a/foo/METADATA") { + t.Error("plain METADATA must not match") + } + if !isEggInfoPKGInfo("/a/foo.egg-info/PKG-INFO") { + t.Error("expected egg-info PKG-INFO to match") + } + if isEggInfoPKGInfo("/a/foo/PKG-INFO") { + t.Error("plain PKG-INFO must not match") + } +} + +// Disk-mode ListProjects: a venv's packages come from site-packages metadata, +// with no pip invocation — and it works for a --without-pip venv. +func TestPythonProjectDetector_DiskScan(t *testing.T) { + dir := t.TempDir() + mock := executor.NewMock() + + venv := filepath.Join(dir, "proj", ".venv") + // pyvenv.cfg marks the venv (no bin/pip → exercises the --without-pip path). + mustWriteMeta(t, mock, filepath.Join(venv, "pyvenv.cfg"), "home = /usr\n") + sp := filepath.Join(venv, "lib", "python3.12", "site-packages") + mustWriteMeta(t, mock, filepath.Join(sp, "flask-3.0.0.dist-info", "METADATA"), + "Name: Flask\nVersion: 3.0.0\n\nx") + + dist := NewPythonDistDetector(mock) + det := NewPythonProjectDetector(mock).WithDiskScan(dist) + projects, _ := det.ListProjects([]string{dir}, nil) + + if len(projects) != 1 { + t.Fatalf("expected 1 project, got %d: %+v", len(projects), projects) + } + if got := projects[0].Packages; len(got) != 1 || got[0].Name != "Flask" || got[0].Version != "3.0.0" { + t.Fatalf("expected Flask 3.0.0 from disk, got %+v", got) + } +} diff --git a/internal/detector/pythonproject.go b/internal/detector/pythonproject.go index cabb647..037c111 100644 --- a/internal/detector/pythonproject.go +++ b/internal/detector/pythonproject.go @@ -22,6 +22,9 @@ type PythonProjectDetector struct { exec executor.Executor log *progress.Logger skipper *tcc.Skipper + // dist, when non-nil, makes per-venv package listing read install + // metadata from disk instead of running `pip list`. + dist *PythonDistDetector } func NewPythonProjectDetector(exec executor.Executor) *PythonProjectDetector { @@ -45,6 +48,15 @@ func (d *PythonProjectDetector) WithLogger(log *progress.Logger) *PythonProjectD return d } +// WithDiskScan switches per-venv package listing to read on-disk install +// metadata (via the supplied PythonDistDetector) instead of running +// `pip list`. A nil detector leaves the legacy pip path in place. Returns +// the detector for chaining. +func (d *PythonProjectDetector) WithDiskScan(dist *PythonDistDetector) *PythonProjectDetector { + d.dist = dist + return d +} + // CountProjects counts Python projects with virtual environments. func (d *PythonProjectDetector) CountProjects(_ context.Context, searchDirs []string) int { projects, _ := d.ListProjects(searchDirs, nil) @@ -99,10 +111,16 @@ func (d *PythonProjectDetector) ListProjects(searchDirs []string, knownLastVerif projects = make([]model.ProjectInfo, 0, len(candidates)) for _, c := range candidates { var pkgs []model.PackageDetail - if c.pipPath != "" { + switch { + case d.dist != nil: + // Disk mode: read install metadata from the venv's + // site-packages — works even for --without-pip venvs. + d.log.Progress(" Scanning: %s (%s)", c.path, c.pm) + pkgs = d.dist.ScanVenv(c.path) + case c.pipPath != "": d.log.Progress(" Scanning: %s (%s)", c.path, c.pm) pkgs = d.listVenvPackages(ctx, c.path, c.pipPath) - } else { + default: // A valid venv (pyvenv.cfg present) created with --without-pip: // there's nothing to list, but record that we saw it so the // absence of packages is explained rather than silent. diff --git a/internal/detector/pythonscan.go b/internal/detector/pythonscan.go index 202f4e7..549279a 100644 --- a/internal/detector/pythonscan.go +++ b/internal/detector/pythonscan.go @@ -3,12 +3,14 @@ package detector import ( "context" "encoding/base64" + "encoding/json" "strings" "time" "github.com/step-security/dev-machine-guard/internal/executor" "github.com/step-security/dev-machine-guard/internal/model" "github.com/step-security/dev-machine-guard/internal/progress" + "github.com/step-security/dev-machine-guard/internal/tcc" ) // PythonScanner performs enterprise-mode Python scanning (raw output, base64 encoded). @@ -91,6 +93,46 @@ func (s *PythonScanner) ScanGlobalPackages(ctx context.Context) []model.PythonSc return results } +// ScanGlobalPackagesFromDisk inventories global / user site-packages by +// reading install metadata on disk (no pip/conda/uv subprocess) and returns +// the result in the same PythonScanResult shape as ScanGlobalPackages: the +// package list is JSON-encoded ([{"name","version"},...]) into RawStdoutBase64 +// so the existing backend decoder needs no change. Returns nil when no global +// site-packages roots exist on the host. +func (s *PythonScanner) ScanGlobalPackagesFromDisk(skipper *tcc.Skipper) []model.PythonScanResult { + roots := PythonGlobalRoots(s.exec) + if len(roots) == 0 { + s.log.Debug("python global disk scan: no site-packages roots found") + return nil + } + + s.emitProgress("scanning global site-packages") + s.log.Progress(" Scanning global Python site-packages on disk...") + + start := time.Now() + dist := NewPythonDistDetector(s.exec).WithLogger(s.log).WithSkipper(skipper) + pkgs := dist.ScanRoots(roots) + duration := time.Since(start).Milliseconds() + + type pipEntry struct { + Name string `json:"name"` + Version string `json:"version"` + } + entries := make([]pipEntry, len(pkgs)) + for i, p := range pkgs { + entries[i] = pipEntry{Name: p.Name, Version: p.Version} + } + raw, _ := json.Marshal(entries) + s.log.Debug("python global disk scan: roots=%d packages=%d duration=%dms", len(roots), len(pkgs), duration) + + return []model.PythonScanResult{{ + PackageManager: "pip", + RawStdoutBase64: base64.StdEncoding.EncodeToString(raw), + ExitCode: 0, + ScanDurationMs: duration, + }} +} + func (s *PythonScanner) getVersion(ctx context.Context, binary, versionCmd string) string { stdout, _, _, err := s.exec.RunWithTimeout(ctx, 10*time.Second, binary, versionCmd) if err != nil { diff --git a/internal/scan/scanner.go b/internal/scan/scanner.go index 48553e0..d035105 100644 --- a/internal/scan/scanner.go +++ b/internal/scan/scanner.go @@ -7,6 +7,7 @@ import ( "github.com/step-security/dev-machine-guard/internal/buildinfo" "github.com/step-security/dev-machine-guard/internal/cli" + "github.com/step-security/dev-machine-guard/internal/config" "github.com/step-security/dev-machine-guard/internal/detector" "github.com/step-security/dev-machine-guard/internal/detector/configaudit" "github.com/step-security/dev-machine-guard/internal/device" @@ -209,12 +210,20 @@ func Run(exec executor.Executor, log *progress.Logger, cfg *cli.Config) error { log.StepStart("Listing Python packages") start = time.Now() - pythonPackages = pyDetector.ListPackages(ctx) + if config.UseLegacyPythonScan { + pythonPackages = pyDetector.ListPackages(ctx) + } else { + pythonPackages = detector.NewPythonDistDetector(exec).WithSkipper(tccSkipper).WithLogger(log).ScanGlobalPackages() + } log.StepDone(time.Since(start)) log.StepStart("Scanning Python projects") start = time.Now() pyProjectDetector := detector.NewPythonProjectDetector(exec).WithSkipper(tccSkipper).WithLogger(log) + if !config.UseLegacyPythonScan { + pyProjectDetector = pyProjectDetector.WithDiskScan( + detector.NewPythonDistDetector(exec).WithSkipper(tccSkipper).WithLogger(log)) + } pythonProjects, _ = pyProjectDetector.ListProjects(searchDirs, nil) log.StepDone(time.Since(start)) } else { diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go index 4291612..9250746 100644 --- a/internal/telemetry/telemetry.go +++ b/internal/telemetry/telemetry.go @@ -751,11 +751,19 @@ func Run(exec executor.Executor, log *progress.Logger, cfg *cli.Config) (err err // "scanning uv") into the phase tracker so heartbeats surface where // inside the python phase a slow pip3 list is stuck. pyScanner.ProgressHook = func(detail string) { tracker.UpdateDetail(detail) } - pythonGlobalPkgs = pyScanner.ScanGlobalPackages(phaseCtx) + if config.UseLegacyPythonScan { + pythonGlobalPkgs = pyScanner.ScanGlobalPackages(phaseCtx) + } else { + pythonGlobalPkgs = pyScanner.ScanGlobalPackagesFromDisk(tccSkipper) + } log.Progress(" Found %d Python global package source(s)", len(pythonGlobalPkgs)) log.Progress("Searching for Python projects...") pyProjectDetector := detector.NewPythonProjectDetector(exec).WithSkipper(tccSkipper).WithLogger(log) + if !config.UseLegacyPythonScan { + pyProjectDetector = pyProjectDetector.WithDiskScan( + detector.NewPythonDistDetector(exec).WithSkipper(tccSkipper).WithLogger(log)) + } var knownPython map[string]time.Time if scanState != nil && !scanStateFullSync { knownPython = make(map[string]time.Time, len(scanState.PythonProjects)) From 70e644a406e2de973679d472902de9b6ca868a65 Mon Sep 17 00:00:00 2001 From: Swarit Pandey Date: Mon, 29 Jun 2026 00:23:10 +0530 Subject: [PATCH 2/3] fix(python): use type conversion for PackageDetail->PythonPackage (staticcheck S1016) --- internal/detector/pythondist.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/detector/pythondist.go b/internal/detector/pythondist.go index 995be62..3df6457 100644 --- a/internal/detector/pythondist.go +++ b/internal/detector/pythondist.go @@ -120,7 +120,7 @@ func (d *PythonDistDetector) ScanGlobalPackages() []model.PythonPackage { details := d.ScanRoots(PythonGlobalRoots(d.exec)) out := make([]model.PythonPackage, len(details)) for i, p := range details { - out[i] = model.PythonPackage{Name: p.Name, Version: p.Version} + out[i] = model.PythonPackage(p) } return out } From d85e9c526f80e2e076a7fe28737f92d8f5881d81 Mon Sep 17 00:00:00 2001 From: Swarit Pandey Date: Mon, 29 Jun 2026 20:49:28 +0530 Subject: [PATCH 3/3] fix(python): address PR review on disk-based scanning - PythonGlobalRoots anchors per-user paths on the console user via executor.ResolveHome (falling back to os.UserHomeDir), so the root/launchd agent scans the logged-in user's ~/.local, ~/.pyenv, pipx. - readBounded stats file size before reading to avoid large allocations, keeping the post-read length check as a race-safety fallback. - ScanVenv limits its walk to the venv's site-packages dirs instead of the whole tree. --- internal/detector/pythondist.go | 48 +++++++++++++++++++++++++--- internal/detector/pythondist_test.go | 19 +++++++++++ 2 files changed, 63 insertions(+), 4 deletions(-) diff --git a/internal/detector/pythondist.go b/internal/detector/pythondist.go index 3df6457..e846c31 100644 --- a/internal/detector/pythondist.go +++ b/internal/detector/pythondist.go @@ -65,7 +65,28 @@ func (d *PythonDistDetector) WithLogger(log *progress.Logger) *PythonDistDetecto // lib/python*/site-packages or Lib/site-packages). Replaces the per-venv // `pip list` call. func (d *PythonDistDetector) ScanVenv(venvPath string) []model.PackageDetail { - return d.ScanRoots([]string{venvPath}) + return d.ScanRoots(venvSitePackages(venvPath)) +} + +// venvSitePackages returns the site-packages directories inside a venv — +// lib/python*/site-packages (POSIX) and Lib/site-packages (Windows). Scanning +// only these avoids walking bin/include/share, which never hold install +// metadata. Falls back to the venv root if no site-packages dir is found, so +// a non-standard layout is still scanned. +func venvSitePackages(venvPath string) []string { + var roots []string + for _, pattern := range []string{ + filepath.Join(venvPath, "lib", "python*", "site-packages"), + filepath.Join(venvPath, "Lib", "site-packages"), + } { + if matches, err := filepath.Glob(pattern); err == nil { + roots = append(roots, matches...) + } + } + if len(roots) == 0 { + return []string{venvPath} + } + return roots } // ScanRoots walks each root and returns every distinct package discovered via @@ -154,8 +175,17 @@ func (d *PythonDistDetector) parseMetadataFile(path, base string) (name, version } // readBounded reads path through the executor and rejects files over the size -// cap. The metadata header we parse is tiny; the cap only guards memory. +// cap. The metadata header we parse is tiny; the cap only guards memory. The +// size is checked via Stat *before* reading so a pathological file is never +// pulled into memory, with the post-read length check kept as a race-safety +// fallback (the file can grow between Stat and ReadFile). func (d *PythonDistDetector) readBounded(path string) ([]byte, error) { + if d.maxFileSize > 0 { + if info, err := d.exec.Stat(path); err == nil && info.Size() > d.maxFileSize { + d.log.Debug("python dist scan: %s exceeds %d bytes — skipping", path, d.maxFileSize) + return nil, fmt.Errorf("file %s exceeds max size %d", path, d.maxFileSize) + } + } data, err := d.exec.ReadFile(path) if err != nil { return nil, err @@ -246,7 +276,17 @@ func PythonGlobalRoots(exec executor.Executor) []string { } } - if home, err := os.UserHomeDir(); err == nil && home != "" { + // Anchor per-user paths on the console (GUI) user, not the process user: + // the enterprise agent runs as root via launchd, where os.UserHomeDir + // resolves to /var/root and would miss the logged-in user's ~/.local, + // ~/.pyenv, pipx venvs, etc. (issue #63). Fall back to os.UserHomeDir. + home := executor.ResolveHome(exec) + if home == "" { + if h, err := os.UserHomeDir(); err == nil { + home = h + } + } + if home != "" { addGlob(filepath.Join(home, ".local", "lib", "python*", "site-packages")) add(filepath.Join(home, ".local", "share", "pipx", "venvs")) addGlob(filepath.Join(home, ".pyenv", "versions", "*", "lib", "python*", "site-packages")) @@ -257,7 +297,7 @@ func PythonGlobalRoots(exec executor.Executor) []string { addGlob("/opt/homebrew/lib/python*/site-packages") addGlob("/usr/local/lib/python*/site-packages") addGlob("/Library/Frameworks/Python.framework/Versions/*/lib/python*/site-packages") - if home, err := os.UserHomeDir(); err == nil && home != "" { + if home != "" { addGlob(filepath.Join(home, "Library", "Python", "*", "lib", "python", "site-packages")) } case "linux": diff --git a/internal/detector/pythondist_test.go b/internal/detector/pythondist_test.go index 4fc8e1b..543553e 100644 --- a/internal/detector/pythondist_test.go +++ b/internal/detector/pythondist_test.go @@ -54,6 +54,25 @@ func TestPythonDistDetector_ScanVenv_DistInfo(t *testing.T) { } } +// ScanVenv limits its walk to the venv's site-packages dirs, so metadata +// stashed elsewhere in the tree (e.g. under bin/) is not reported. +func TestPythonDistDetector_ScanVenv_ScopedToSitePackages(t *testing.T) { + dir := t.TempDir() + mock := executor.NewMock() + venv := filepath.Join(dir, ".venv") + sp := filepath.Join(venv, "lib", "python3.11", "site-packages") + + mustWriteMeta(t, mock, filepath.Join(sp, "requests-2.31.0.dist-info", "METADATA"), sampleMetadata) + // A stray metadata file outside site-packages must be ignored. + mustWriteMeta(t, mock, filepath.Join(venv, "bin", "stray-1.0.0.dist-info", "METADATA"), + "Name: stray\nVersion: 1.0.0\n\nbody") + + pkgs := NewPythonDistDetector(mock).ScanVenv(venv) + if len(pkgs) != 1 || pkgs[0].Name != "requests" { + t.Fatalf("expected only requests from site-packages, got %+v", pkgs) + } +} + func TestPythonDistDetector_EggInfoFallback(t *testing.T) { dir := t.TempDir() mock := executor.NewMock()