diff --git a/README.md b/README.md index 6482b44..6c3cb47 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ A memory-efficient Python library and CLI for comparing two GTFS feeds and produ ## Overview -GTFS Diff Engine compares two GTFS feeds (zip archives or directories) file-by-file and row-by-row, emitting a machine-readable JSON document that describes exactly what changed: which files were added or deleted, which columns appeared or disappeared, and which rows were inserted, removed, or modified (with before/after field values). +GTFS Diff Engine compares two GTFS feeds (zip archives, directories, or public HTTP(S) folder URLs) file-by-file and row-by-row, emitting a machine-readable JSON document that describes exactly what changed: which files were added or deleted, which columns appeared or disappeared, and which rows were inserted, removed, or modified (with before/after field values). The output conforms to the **GTFS Diff v2 schema** maintained by MobilityData: @@ -12,9 +12,12 @@ The output conforms to the **GTFS Diff v2 schema** maintained by MobilityData: ## Features - **Memory-efficient streaming diff** — two-pass CSV indexing; no full in-memory table loads -- **Supports `.zip` archives and plain directories** — including zips with a single sub-directory layout +- **Built-in DuckDB backend for very large files** — automatically diffs eligible 50 MB+ files such as million-row `stop_times.txt` on disk without exhausting memory +- **Supports `.zip` archives, plain directories, and public HTTP(S) folder URLs** — including non-listable folders whose individual GTFS files are public - **Row-level changes with primary key identification** — each change record includes the primary key fields for the affected row - **Column-level change tracking** — columns added or deleted between feeds are reported with their original positions +- **Per-file and per-column change statistics** — modified files report true row-change percentages and optional per-column 
modification counts +- **Unreliable-diff detection (`not_compared`)** — files whose primary keys are regenerated between versions ("id churn"), missing mandatory key columns, or duplicate primary-key values are flagged `not_compared` instead of producing a misleading row diff; id-churn thresholds are tunable globally or per file - **Configurable row-changes cap** — limit output size per file; omitted changes are counted in a `Truncated` record - **CLI and Python API** — use as a command-line tool or import directly in your code @@ -24,6 +27,8 @@ The output conforms to the **GTFS Diff v2 schema** maintained by MobilityData: pip install gtfs-diff-engine ``` +This installs the DuckDB backend used for very large files automatically, so no extra steps are required. + For a development (editable) install with test dependencies: ```bash @@ -48,21 +53,55 @@ with open("diff.json", "w") as f: ## CLI Usage ``` -Usage: gtfs-diff [OPTIONS] BASE_FEED NEW_FEED +Usage: python -m gtfs_diff [OPTIONS] BASE_FEED NEW_FEED + + Compare two GTFS feeds and output a JSON diff. - Compare two GTFS feeds (zip or directory) and output a JSON diff. + BASE_FEED: local path or http(s):// folder URL to the base GTFS feed - BASE_FEED: path to the base GTFS feed (zip or directory) - NEW_FEED: path to the new GTFS feed (zip or directory) + NEW_FEED: local path or http(s):// folder URL to the new GTFS feed + + Use optional --files with a comma-separated GTFS file list. For URLs, + omitting --files auto-discovers known GTFS files. Options: --version Show the version and exit. - -o, --output FILE Write JSON output to FILE instead of stdout. + --files NAMES Comma-separated list of GTFS files to + compare, e.g. 'stops.txt,trips.txt'. + Optional: for folder URLs, omitting it + probes all known GTFS files; for local feeds + it restricts the comparison. + -o, --output PATH Write JSON output to FILE instead of stdout. -c, --cap INTEGER Max row changes per file (0 = omit row-level detail). 
--pretty / --no-pretty Pretty-print JSON (default: --pretty). - --base-downloaded-at TEXT ISO 8601 datetime for when base was downloaded. - --new-downloaded-at TEXT ISO 8601 datetime for when new was downloaded. + --base-downloaded-at TEXT ISO 8601 datetime for when base was + downloaded. + --new-downloaded-at TEXT ISO 8601 datetime for when new was + downloaded. + --id-churn-threshold FLOAT RANGE + Primary-key churn ratio (0.0-1.0) above + which a file is reported as not_compared + instead of diffed (detects regenerated ids). + [default: 0.7; 0.0<=x<=1.0] + --id-churn-threshold-for FILENAME RATIO + Per-file id-churn threshold override; + repeatable. Takes precedence over --id- + churn-threshold. Example: --id-churn- + threshold-for shapes.txt 0.95 + --large-file-threshold-mb FLOAT RANGE + Files whose larger side is at least this + many megabytes are diffed with the built-in + DuckDB backend (lower memory for very large + files). Use --no-duckdb to always use the + in-memory engine. [default: 50.0; x>=0.0] + --no-duckdb Disable the DuckDB backend; always use the + in-memory engine. + --column-stats / --no-column-stats + Include per-column modification counts and + percentages in each modified file's stats + (default: on). The file-level + rows_changed_percentage is always computed. --help Show this message and exit. 
``` @@ -78,9 +117,40 @@ gtfs-diff --cap 500 base.zip new.zip # Save output to a file gtfs-diff -o diff.json base.zip new.zip +# Compare public HTTP(S) folder feeds; auto-discovers known GTFS files +gtfs-diff https://storage.googleapis.com/example/base \ + https://storage.googleapis.com/example/new + +# Non-listable folders are OK if individual files are public; +# missing files that return 403/404 are skipped +gtfs-diff https://files.mobilitydatabase.org/mdb-2126/base/extracted \ + https://files.mobilitydatabase.org/mdb-2126/new/extracted + +# Compare only selected files from public HTTP(S) folder feeds +gtfs-diff https://storage.googleapis.com/example/base \ + https://storage.googleapis.com/example/new \ + --files "stops.txt,trips.txt" + +# Lower the id-churn threshold globally (mark a file not_compared sooner) +gtfs-diff --id-churn-threshold 0.5 base.zip new.zip + +# Override the id-churn threshold for specific files (repeatable) +gtfs-diff --id-churn-threshold-for shapes.txt 0.95 \ + --id-churn-threshold-for trips.txt 0.9 \ + base.zip new.zip + # Omit row-level detail (column diffs and counts are still computed) gtfs-diff --cap 0 base.zip new.zip +# Lower the DuckDB auto-switch threshold to 10 MB +gtfs-diff --large-file-threshold-mb 10 base.zip new.zip + +# Disable DuckDB and always use the in-memory engine +gtfs-diff --no-duckdb base.zip new.zip + +# Omit per-column modification statistics +gtfs-diff --no-column-stats base.zip new.zip + # With feed download timestamps gtfs-diff --base-downloaded-at 2024-01-01T00:00:00Z \ --new-downloaded-at 2024-06-01T00:00:00Z \ @@ -98,16 +168,51 @@ def diff_feeds( row_changes_cap_per_file: int | None = None, base_downloaded_at: datetime | None = None, new_downloaded_at: datetime | None = None, + id_churn_threshold: float = 0.7, + id_churn_thresholds: Mapping[str, float] | None = None, + files: Iterable[str] | None = None, + large_file_threshold_bytes: int | None = 52428800, + column_stats: bool = True, ) -> GtfsDiff ``` |
Parameter | Type | Description | |---|---|---| -| `base_path` | `str \| Path` | Path to the base (old) GTFS feed — zip or directory | -| `new_path` | `str \| Path` | Path to the new GTFS feed — zip or directory | +| `base_path` | `str \| Path` | Path or URL to the base (old) GTFS feed — zip, directory, or HTTP(S) folder URL | +| `new_path` | `str \| Path` | Path or URL to the new GTFS feed — zip, directory, or HTTP(S) folder URL | | `row_changes_cap_per_file` | `int \| None` | `None` = include all; `0` = omit row detail; `N` = cap at N per file | | `base_downloaded_at` | `datetime \| None` | When the base feed was downloaded (defaults to now) | | `new_downloaded_at` | `datetime \| None` | When the new feed was downloaded (defaults to now) | +| `id_churn_threshold` | `float` | Global primary-key churn ratio (`0.0`–`1.0`, default `0.7`) above which a file is marked `not_compared` instead of diffed | +| `id_churn_thresholds` | `Mapping[str, float] \| None` | Optional `{file_name: threshold}` per-file overrides; take precedence over `id_churn_threshold` | +| `files` | `Iterable[str] \| None` | Optional file list. Local feeds compare all discoverable files when omitted or only these files when supplied; HTTP(S) folder URLs probe all known supported GTFS files when omitted or these exact files when supplied. Non-listable folders are supported when individual files are public; missing files that return 403/404 are skipped | +| `large_file_threshold_bytes` | `int \| None` | Files whose larger side is at least this many uncompressed bytes are routed to the built-in DuckDB backend (default `52428800`, 50 MB). Pass `None` to disable DuckDB entirely and always use the in-memory engine; pass a smaller number to route more eligible files | +| `column_stats` | `bool` | When `True` (default), include per-column modification counts and percentages in each modified file's `stats.column_stats`. 
Pass `False` to omit that per-column breakdown; `stats.rows_changed_percentage` is still computed | + +```python +# URL feeds can auto-discover known supported GTFS files +result = diff_feeds( + "https://storage.googleapis.com/example/base", + "https://storage.googleapis.com/example/new", +) + +# Or restrict the comparison to selected files +result = diff_feeds( + "https://storage.googleapis.com/example/base", + "https://storage.googleapis.com/example/new", + files=["stops.txt", "trips.txt"], +) + +# Route smaller eligible files through DuckDB +result = diff_feeds( + "base.zip", + "new.zip", + large_file_threshold_bytes=10 * 1024 * 1024, +) + +# Or disable DuckDB entirely +result = diff_feeds("base.zip", "new.zip", large_file_threshold_bytes=None) +``` **Returns:** a `GtfsDiff` Pydantic model with three top-level fields: @@ -143,15 +248,17 @@ def diff_feeds( | `networks.txt` | `network_id` | | `route_networks.txt` | `route_id` | | `fare_media.txt` | `fare_media_id` | -| `fare_products.txt` | `fare_product_id` | +| `fare_products.txt` | `fare_product_id`, `rider_category_id`, `fare_media_id` | | `fare_leg_rules.txt` | `leg_group_id` | -| `fare_transfer_rules.txt` | `from_leg_group_id`, `to_leg_group_id`, `transfer_count`, `duration_limit` | +| `fare_transfer_rules.txt` | `from_leg_group_id`, `to_leg_group_id`, `fare_product_id`, `transfer_count`, `duration_limit` | | `timeframes.txt` | `timeframe_group_id`, `start_time`, `end_time`, `service_id` | | `rider_categories.txt` | `rider_category_id` | | `booking_rules.txt` | `booking_rule_id` | | `location_groups.txt` | `location_group_id` | | `location_group_stops.txt` | `location_group_id`, `stop_id` | +For `translations.txt`, `record_id`, `record_sub_id`, and `field_value` are conditional primary-key columns. When some are absent from a feed, the engine keeps the full primary key and treats the missing columns as null (empty) values during comparison only, so feeds that include different subsets still align. 
Missing mandatory key columns and duplicate primary-key values mark only that file as `not_compared` with reason `missing_primary_key` or `duplicate_primary_key`; the feed diff continues, and column-level differences are still reported. For these primary-key problems, the reason message identifies whether the base feed, new feed, or both feeds caused the issue. Foreign-key columns in other files that reference that not-compared file are excluded from field-level diffs and listed under `ignored_columns` with reason `references_not_compared_file`, matching id-churn handling. The optional-column padding described for `translations.txt` also applies to other files with conditionally-required key columns (e.g. `agency.txt`, `fare_rules.txt`, `fare_products.txt`, `fare_leg_rules.txt`, `fare_transfer_rules.txt`, `timeframes.txt`, `transfers.txt`, `attributions.txt`). + Files not in this table (e.g. GeoJSON flex locations) are recorded in `metadata.unsupported_files` and skipped. ## Output Schema @@ -214,12 +321,39 @@ The output follows the GTFS Diff v2 schema. Below is a minimal example: } ] }, + "stats": { + "total_rows_base": 10, + "total_rows_new": 11, + "columns_added_count": 0, + "columns_deleted_count": 0, + "rows_added_count": 1, + "rows_deleted_count": 0, + "rows_modified_count": 2, + "rows_changed_percentage": 27.27, + "column_stats": [ + { + "column": "stop_name", + "modifications_count": 1, + "modifications_percentage": 50.0 + } + ] + }, "truncated": null } ] } ``` +For modified files, `stats.rows_changed_percentage` is the percentage of rows that were added, deleted, or modified relative to the larger of the two versions: `min((rows_added + rows_deleted + rows_modified) / max(total_rows_base, total_rows_new) * 100, 100)`, rounded to 2 decimals. It is `null` when both versions are empty. It is computed from true counts, so it is not affected by `--cap` or `row_changes_cap_per_file` truncation. + +`stats.column_stats` holds per-column modification statistics and covers only modified rows.
Each entry has `column`, `modifications_count`, and `modifications_percentage`; the count is the number of modified rows whose value changed in that column, and the percentage is relative to total modified rows. Counts are true counts, unaffected by caps. Columns with no modified-row changes are omitted, and entries follow `row_changes.columns` order. `column_stats` is `null` when there are no modifications or when `column_stats=False` / `--no-column-stats` is used. These stats appear only for files with `file_action: "modified"`. + +## Configuration + +### Environment variables + +- `GTFS_DIFF_DUCKDB_TMPDIR`: optional base directory for DuckDB's on-disk spill files when the DuckDB backend handles large eligible files (50 MB+ by default). Set this to a volume with enough free space if the system temp directory (for example, `/tmp`) is too small for multi-gigabyte feed comparisons. A leading `~` is expanded and the directory is created if needed. If unset or blank, the engine uses the system temp directory. + ## Memory Efficiency The engine uses a **streaming two-pass algorithm**: @@ -231,7 +365,7 @@ The engine uses a **streaming two-pass algorithm**: Only the raw CSV strings are stored in the index (not parsed dicts), keeping memory proportional to the number of rows rather than rows × columns. -> **Note:** For very large feeds (`stop_times.txt` with 10 M+ rows) the in-memory index may become a bottleneck. A disk-backed index (e.g. SQLite) would be more appropriate for production deployments at that scale; that optimisation is left as future work. +> **Note:** Very large eligible files are diffed with the built-in, disk-backed DuckDB backend. It auto-switches at 50 MB by default, reads remote HTTP(S) files in place through DuckDB `httpfs` range requests after Python-side `HEAD` checks, uses the in-memory engine for ineligible files, and can be disabled with `large_file_threshold_bytes=None` or `--no-duckdb`. 
## Running Tests diff --git a/docs/architecture.md b/docs/architecture.md index aab3127..7769339 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -12,9 +12,49 @@ |---|---| | `engine.py` | Core diff logic: feed opener, CSV indexing, per-file diff, public `diff_feeds()` function | | `models.py` | Pydantic v2 data models for the GTFS Diff v2 output format (`GtfsDiff`, `FileDiff`, `RowChanges`, etc.) | -| `gtfs_definitions.py` | Static registry of supported GTFS files and their primary key columns; `get_primary_key()` helper | +| `gtfs_definitions.py` | Static registry of supported GTFS files, their primary keys, foreign-key relationships, and id-churn thresholds; `get_primary_key()` / `get_foreign_keys()` / `get_id_churn_threshold()` helpers | | `cli.py` | Click-based CLI entry point (`gtfs-diff`); thin wrapper around `diff_feeds()` | +## Feed Sources + +The engine accepts three feed source types: + +- **Local directory** — a directory containing GTFS `.txt` files. +- **Local `.zip` archive** — including archives that wrap files in a single subdirectory. +- **HTTP(S) folder URL** — a public, unauthenticated folder-style URL whose GTFS files are addressed as `<folder_url>/<file_name>`. + +`diff_feeds(..., files=...)` controls which files are considered and is optional for every feed source. For local directories and zip archives, `files` acts as a filter: when omitted, the opener lists the feed contents; when supplied, only the named GTFS `.txt` files are compared. For HTTP(S) folder URLs, supplying `files` checks exactly those names. When `files` is omitted for folder URLs, the engine probes every known GTFS file in `gtfs_definitions.SUPPORTED_FILES` (the files with primary-key definitions) and compares whichever ones exist. + +Remote presence detection still supports added/deleted file reporting. For each candidate file, the engine joins the name to each folder URL and probes the resulting URL with `HEAD`; if the server rejects `HEAD`, it falls back to a ranged `GET`.
A file that exists in only one version is therefore reported as added or deleted rather than being silently skipped. + +For the small-file/in-memory path, remote contents are fetched lazily with `GET` only when a file is actually diffed, and are decoded as `utf-8-sig` like local files. For large eligible files routed to DuckDB, Python performs only `HEAD` probes for existence and size; DuckDB reads the file URL directly through its `httpfs` extension using HTTP range requests. The Python remote opener uses standard-library `urllib`; only public HTTP(S) URLs are supported. Authenticated feeds and `gs://` SDK access are intentionally out of scope. + +### Private folders with public files + +HTTP(S) folder URLs do not need to allow directory listing. The engine never requests the folder URL itself; it only requests individual file URLs formed as `<folder_url>/<file_name>`. A non-listable folder is therefore usable as long as the GTFS `.txt` files inside it are individually public. + +Presence probing also tolerates object-store semantics. For example, Google Cloud Storage may return `403 Forbidden` rather than `404 Not Found` for a missing object in a private, non-listable bucket because the server will not confirm or deny existence without list permission. The engine treats `401`, `403`, `404`, and `410` probe responses as "absent or not fetchable" and skips that file, reporting it as added or deleted when it exists in only one feed. Genuine server errors such as `5xx` still propagate.
+ +Python API example: + +```python +from gtfs_diff.engine import diff_feeds + +result = diff_feeds( + "https://storage.googleapis.com/example/base", + "https://storage.googleapis.com/example/new", + files=["stops.txt", "trips.txt"], +) +``` + +CLI example: + +```bash +gtfs-diff https://storage.googleapis.com/example/base \ + https://storage.googleapis.com/example/new \ + --files "stops.txt,trips.txt" +``` + ## Streaming Algorithm The per-file diff (`_diff_file`) operates in two streaming passes: @@ -56,6 +96,172 @@ field_changes = [ Comparing only shared columns ensures that adding or removing a column from a file does not cause every existing row to appear as modified. +### Optional primary-key columns + +Most GTFS files have mandatory primary-key columns: if any are absent from a feed header, the low-level indexing helper raises `MissingPrimaryKeyError` rather than producing an unreliable row diff. The engine catches that error at the per-file boundary and reports just that file as `not_compared` with reason code `missing_primary_key`; duplicate primary-key values are handled the same way with reason code `duplicate_primary_key`. The overall feed diff continues, and column-level differences for the affected file are still populated. For these primary-key problems, the reason message identifies whether the base feed, new feed, or both feeds caused the issue. As with `id_churn`, foreign-key columns in other files that reference the `not_compared` file are excluded from field-level diffs and listed under `ignored_columns` with reason code `references_not_compared_file`. Some files, however, define conditionally-present key columns. For example, `translations.txt` identifies a translation by either `record_id` (optionally with `record_sub_id`) or `field_value`; real feeds usually include only the subset required for the form they use. 
+ +For these optional primary-key columns, `_read_csv_index` keeps the file's **full** primary key and treats any optional column that is absent from a feed's headers as a null (empty) value for every row — effectively adding the missing PK header and filling it with nulls *for the compare step only*. This guarantees both feeds build their composite key over an identical set of columns, so a feed that omits an optional key column still aligns with one that includes it (instead of every row looking added/deleted). The padding affects only row identity during comparison: the injected columns are never added to the reported headers, column diff, or row values. + +Which key columns are optional is derived from the GTFS Schedule reference: a primary-key column is treated as optional whenever its documented presence is anything other than "Required" (Optional, Conditionally Required, Recommended, or Conditionally Forbidden). See `gtfs_definitions.OPTIONAL_PRIMARY_KEY_COLUMNS`. + +## Large files: the DuckDB backend + +The in-memory two-pass engine remains the default path for every file. Very large GTFS tables, especially `stop_times.txt` with 1 M+ rows, can still make the Python key indexes expensive. For those cases the engine automatically routes an eligible modified file to the DuckDB backend, which performs the heavy set arithmetic and row scan on disk rather than keeping every row in Python dictionaries. DuckDB ships as a runtime dependency, so the backend is available out of the box. + +The switch is deliberately conservative. A file is sent to DuckDB only when **all** of the following are true: + +- `large_file_threshold_bytes` is not `None`; +- the larger side's uncompressed size is at least `large_file_threshold_bytes` (default `DEFAULT_LARGE_FILE_THRESHOLD_BYTES`, 50 MB); +- both feed sizes are cheaply known; +- the file has a simple, explicit primary key with no optional or conditional PK columns. 
+ +Empty-PK files and files with optional primary-key columns, such as `translations.txt`, always stay on the in-memory engine. Added and deleted files also stay on the normal code path; the DuckDB backend is only used for files present in both feeds. If the size is unknown, the PK is ineligible, or the file is below the threshold, the safe default is the in-memory engine. Any DuckDB backend error is traced and falls back to the in-memory engine (a defensive safeguard that also covers an unexpectedly missing `duckdb` install); duplicate-primary-key files are reported directly as `not_compared` with reason code `duplicate_primary_key`, matching the in-memory path. + +Size metadata comes from the cheapest source available for the feed type: + +- local directories use `Path.stat().st_size`; +- zip feeds use each member's `ZipInfo.file_size` (uncompressed size); +- HTTP(S) folder URLs use the probe response's `Content-Length`. + +DuckDB can read local paths and HTTP(S) URLs directly. Directory feeds are read in place, without copying. Remote URLs are also read in place by DuckDB through its `httpfs` extension using HTTP range requests; the extension is installed and loaded on first URL use. On the Python side, the engine only performs `HEAD` probes for existence and `Content-Length` routing, so remote files are not fully downloaded or staged to temporary files for this path. Zip members are still streamed to a temporary file, keeping memory bounded, and that file is deleted immediately after the per-file diff completes. This preserves the cleanup guarantee for staged archive contents. + +Output parity is the main design constraint. The DuckDB path uses SQL only as a **superset pre-filter** for candidate changes: rows whose raw shared-column strings are distinct are streamed back to Python in batches. 
The final decision still uses the same `_values_differ` helper as the in-memory engine, so comparisons remain case-insensitive, whitespace-trimmed, and numeric-aware (for example, `-73.55625` and `-73.556250` are equal). The backend also reuses the same helpers for `raw_value` construction, id-churn detection, ignored foreign-key columns, line numbers (header = line 1, first data row = line 2), and cap/truncation accounting. Set-ordered outputs such as added and deleted rows may be emitted in a different order, but the records are otherwise identical. + +Each file is diffed on its own short-lived in-memory DuckDB connection. The two per-file tables are dropped and the connection is closed in a `finally` block once the diff completes, so DuckDB releases their buffers before the next file is processed; any lingering process memory afterwards is the allocator returning freed pages to the OS lazily, not retained tables. DuckDB's on-disk spill is redirected to a per-file temporary directory (instead of its default `.tmp` in the current working directory) that is removed after each diff, so large-file spill never accumulates in or litters the caller's working directory. By default these per-file directories are created under the system temp directory; setting `GTFS_DIFF_DUCKDB_TMPDIR` redirects them under that base directory instead. The override expands a leading `~`, creates the directory if needed, and falls back to the system temp directory when unset or blank, which is useful when `/tmp` is too small for multi-gigabyte feed comparisons. + +## Change Statistics + +Modified files include per-file row-change statistics and, by default, a per-column breakdown for modified rows. Both the in-memory engine (`engine.py`) and DuckDB backend (`engine_duckdb.py`) accumulate per-column modification counts while scanning the full modified set, not just the rows retained after `row_changes_cap_per_file`, so these counts are true and cap-independent. 
+ +The shared helpers `_compute_rows_changed_percentage` and `_build_column_stats` live in `diff_helpers.py`. They centralise the formulas for `rows_changed_percentage` and `column_stats`, guaranteeing parity between the two backends. The `column_stats` toggle gates only the per-column list; `rows_changed_percentage` is always computed for modified files. + +## "Not Compared" Files + +Some files cannot be meaningfully diffed by primary key. Rather than emitting a +misleading row-by-row diff, the engine reports such a file with +`file_action: "not_compared"`, a machine-readable `not_compared_reason`, and a +`row_changes` of `null`. Column-level differences (`columns_added` / +`columns_deleted`) are still populated. + +The mechanism is generic: `id_churn`, `missing_primary_key`, +`duplicate_primary_key`, and future reasons (file too large, etc.) reuse the same +`not_compared` code path by returning a `NotComparedReason` from a detector or +per-file error handler and short-circuiting `_diff_file_modified`. + +### Detecting regenerated ids (`id_churn`) + +Several GTFS producers regenerate primary-key values on every export — most +notably `shape_id` (shapes.txt), `trip_id` (trips.txt) and `service_id` +(calendar*.txt). A primary-key comparison then reports nearly every row as both +added and deleted, drowning out real changes. + +After the two key sets are built, the engine measures the **churn ratio** as the +complement of the [overlap coefficient](https://en.wikipedia.org/wiki/Overlap_coefficient): + +``` +churn_ratio = 1 − |common_keys| / min(|base_keys|, |new_keys|) +``` + +i.e. the fraction of the **smaller** feed's primary keys that have no match in +the other feed. When the ratio meets or exceeds the file's threshold the file is +marked `not_compared` with reason code `id_churn`, and the expensive +modification scan is skipped. + +#### Why the overlap coefficient? + +The denominator is what matters. 
Three candidates all read 0 for identical key +sets and 1 for fully disjoint sets, but they disagree on **asymmetric** sets: + +| Metric | Formula | Bulk add: 100 keys ⊂ 1000 keys | +|---|---|-----------------------------| +| `÷max` | `1 − common/max(base,new)` | **0.90** false positive | +| Jaccard | `1 − common/(base ∪ new)` | **0.90** false positive | +| **Overlap** | `1 − common/min(base,new)` | **0.00** | + +Both `÷max` and Jaccard penalise a feed that simply *grows* or *shrinks*: a file +that gained 900 rows looks 90% "churned" even though every original key is +preserved and perfectly matchable. That is a bulk add/delete, not id +regeneration, and flagging it as `not_compared` would hide a real, comparable +diff. + +Detection is deliberately conservative and only runs when it can yield a +reliable signal: + +- the file has an **explicit** primary key (empty-PK files use all columns as a + composite key, where any field edit would look like churn); +- **both** feeds have at least `MIN_ROWS_FOR_ID_CHURN_DETECTION` rows (near-total + turnover in a tiny file is just as likely an ordinary edit). + +#### Configuring thresholds + +Because GTFS files differ in how volatile their ids are, the threshold can be set +at several levels. For each file the engine resolves the threshold in this order +(highest precedence first): + +1. **Caller per-file override** — a `{file_name: threshold}` mapping passed as + `diff_feeds(id_churn_thresholds=...)`, or on the CLI via the repeatable + `--id-churn-threshold-for FILENAME RATIO` option. This lets you tune one file + (e.g. `{"shapes.txt": 0.95}`) without affecting others or mutating any state. +2. **Built-in per-file defaults** — `ID_CHURN_THRESHOLDS` in + `gtfs_definitions.py`, the project's baseline domain knowledge for files whose + keys are known to be volatile. +3. **Global threshold** — `diff_feeds(id_churn_threshold=...)` (CLI: + `--id-churn-threshold`), applied to every file without a more specific value. +4. 
**`DEFAULT_ID_CHURN_THRESHOLD`** (currently `0.7`) — the ultimate fallback. + +This resolution lives in `get_id_churn_threshold()`; callers never need to write +to the module-level map to customise behaviour. + +### Propagating unreliable references through foreign keys + +An unreliable parent key does not just affect its own file. GTFS files form a +hierarchy via foreign keys — e.g. `trips.shape_id → shapes.txt`, +`stop_times.trip_id → trips.txt`, `routes.agency_id → agency.txt`. When a parent +file's primary key churns, those same regenerated values reappear in every +child's foreign-key column; when a parent's mandatory primary-key column is +missing, child foreign keys cannot be validated against a comparable parent diff. +Comparing either column would report unreliable field changes, even though the +child rows may otherwise be identical. + +To handle this, `diff_feeds()`: + +1. **Orders files by dependency.** `_processing_order()` performs a deterministic + topological sort over the present files using `GTFS_FOREIGN_KEYS`, so every + referenced (parent) file is diffed *before* the files that reference it. Its + `not_compared` status is therefore known in advance. (Self-references such as + `stops.parent_station → stops` are excluded to keep the graph acyclic; cycles + and missing parents are handled defensively.) + +2. **Ignores unreliable foreign-key columns.** When a child is diffed, any + foreign-key column pointing at a file that was marked `not_compared` due to + `id_churn`, `missing_primary_key`, or `duplicate_primary_key` is excluded from + the field-level comparison (`_scan_modifications`) and listed in the child's + `ignored_columns`, each with + a `references_not_compared_file` reason. Primary-key columns are never ignored — + if an unreliable referenced column is part of the child's own primary key, the + normal per-file detection handles the child rather than treating that key as an + ignorable field. 
+ +The net effect: a stable `trips.txt` whose only "change" is a regenerated or +uncomparable `shape_id` is correctly reported as unchanged (or shows only its +*real* edits), rather than every row appearing modified. + +```jsonc +// trips.txt diff when shapes.txt was not_compared +"ignored_columns": [ + { + "column": "shape_id", + "reason": { + "code": "references_not_compared_file", + "message": "Column 'shape_id' references shapes.txt, which was not compared …" + } + } +] +``` + +Processing happens in dependency order, but the final `file_diffs` and +`summary.files` are re-sorted by file name so the output order is stable. + ## Handling Edge Cases ### Files with no explicit primary key @@ -119,7 +325,7 @@ Base columns appear first (preserving their original order), followed by any new ## Limitations and Future Work -- **Disk-backed index for huge feeds** — `stop_times.txt` can exceed 10 M rows; the in-memory index should be replaced with a SQLite-backed approach for production deployments at that scale. +- **DuckDB eligibility** — the disk-backed path is intentionally limited to modified files with simple explicit primary keys and known sizes; other files continue to use the in-memory engine. - **Parallel file processing** — files within a feed are currently processed sequentially; parallel workers (e.g. `concurrent.futures.ThreadPoolExecutor`) could reduce wall-clock time for feeds with many files. - **GeoJSON / Flex location support** — `locations.geojson` and other non-CSV GTFS Flex files are not CSV and are currently reported as unsupported. Dedicated diff logic for these formats is left as future work. 
diff --git a/pyproject.toml b/pyproject.toml index 86fb16d..fe1c98f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,7 @@ dependencies = [ "pydantic>=2.0", "click>=8.0", "psutil>=5.9", + "duckdb>=1.0", ] [project.optional-dependencies] @@ -19,6 +20,7 @@ dev = [ "pytest-cov", "datamodel-code-generator[ruff]>=0.59", "ruff>=0.11", + "duckdb>=1.0", ] [project.scripts] diff --git a/requirements.txt b/requirements.txt index 2021a0e..65bbf56 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ annotated-types==0.7.0 click==8.3.2 coverage==7.13.5 +duckdb==1.5.3 iniconfig==2.3.0 packaging==26.1 pluggy==1.6.0 diff --git a/src/gtfs_diff/cli.py b/src/gtfs_diff/cli.py index 5371563..9debdff 100644 --- a/src/gtfs_diff/cli.py +++ b/src/gtfs_diff/cli.py @@ -6,13 +6,24 @@ import click -from gtfs_diff.engine import MissingPrimaryKeyError, diff_feeds +from gtfs_diff.engine import _is_url, diff_feeds +from gtfs_diff.gtfs_definitions import DEFAULT_ID_CHURN_THRESHOLD @click.command() @click.version_option(version="0.1.0", prog_name="gtfs-diff-engine") -@click.argument("base_feed", type=click.Path(exists=True, path_type=Path)) -@click.argument("new_feed", type=click.Path(exists=True, path_type=Path)) +@click.argument("base_feed", type=str) +@click.argument("new_feed", type=str) +@click.option( + "--files", + default=None, + metavar="NAMES", + help=( + "Comma-separated list of GTFS files to compare, e.g. " + "'stops.txt,trips.txt'. Optional: for folder URLs, omitting it " + "probes all known GTFS files; for local feeds it restricts the comparison." + ), +) @click.option( "--output", "-o", @@ -40,20 +51,88 @@ default=None, help="ISO 8601 datetime for when new was downloaded.", ) +@click.option( + "--id-churn-threshold", + type=click.FloatRange(0.0, 1.0), + default=DEFAULT_ID_CHURN_THRESHOLD, + show_default=True, + help=( + "Primary-key churn ratio (0.0-1.0) above which a file is reported as " + "not_compared instead of diffed (detects regenerated ids)." 
+ ), +) +@click.option( + "--id-churn-threshold-for", + type=(str, click.FloatRange(0.0, 1.0)), + multiple=True, + metavar="FILENAME RATIO", + help=( + "Per-file id-churn threshold override; repeatable. Takes precedence " + "over --id-churn-threshold. Example: " + "--id-churn-threshold-for shapes.txt 0.95" + ), +) +@click.option( + "--large-file-threshold-mb", + type=click.FloatRange(0.0), + default=50.0, + show_default=True, + help=( + "Files whose larger side is at least this many megabytes are diffed " + "with the built-in DuckDB backend (lower memory for very large files). " + "Use --no-duckdb to always use the in-memory engine." + ), +) +@click.option( + "--no-duckdb", + is_flag=True, + default=False, + help="Disable the DuckDB backend; always use the in-memory engine.", +) +@click.option( + "--column-stats/--no-column-stats", + default=True, + help=( + "Include per-column modification counts and percentages in each " + "modified file's stats (default: on). The file-level " + "rows_changed_percentage is always computed." + ), +) def main( - base_feed: Path, - new_feed: Path, + base_feed: str, + new_feed: str, + files: str | None, output: Path | None, cap: int | None, pretty: bool, base_downloaded_at: str | None, new_downloaded_at: str | None, + id_churn_threshold: float, + id_churn_threshold_for: tuple[tuple[str, float], ...], + large_file_threshold_mb: float, + no_duckdb: bool, + column_stats: bool, ) -> None: - """Compare two GTFS feeds (zip or directory) and output a JSON diff. + """Compare two GTFS feeds and output a JSON diff. - BASE_FEED: path to the base GTFS feed (zip or directory)\n - NEW_FEED: path to the new GTFS feed (zip or directory) + BASE_FEED: local path or http(s):// folder URL to the base GTFS feed\n + NEW_FEED: local path or http(s):// folder URL to the new GTFS feed\n + Use optional --files with a comma-separated GTFS file list. For URLs, + omitting --files auto-discovers known GTFS files. 
""" + base_is_url = _is_url(base_feed) + new_is_url = _is_url(new_feed) + + base_path: str | Path = base_feed if base_is_url else Path(base_feed) + new_path: str | Path = new_feed if new_is_url else Path(new_feed) + + if isinstance(base_path, Path) and not base_path.exists(): + click.echo(f"Error: {base_path} does not exist.", err=True) + sys.exit(1) + if isinstance(new_path, Path) and not new_path.exists(): + click.echo(f"Error: {new_path} does not exist.", err=True) + sys.exit(1) + try: base_dt = ( datetime.fromisoformat(base_downloaded_at) if base_downloaded_at else None @@ -66,22 +145,24 @@ def main( sys.exit(1) try: + parsed_files = ( + [f.strip() for f in files.split(",") if f.strip()] if files else None + ) + large_file_threshold_bytes = ( + None if no_duckdb else int(large_file_threshold_mb * 1024 * 1024) + ) result = diff_feeds( - base_path=base_feed, - new_path=new_feed, + base_path=base_path, + new_path=new_path, row_changes_cap_per_file=cap, base_downloaded_at=base_dt, new_downloaded_at=new_dt, + id_churn_threshold=id_churn_threshold, + id_churn_thresholds=dict(id_churn_threshold_for), + files=parsed_files, + large_file_threshold_bytes=large_file_threshold_bytes, + column_stats=column_stats, ) - except MissingPrimaryKeyError as exc: - click.echo( - f"ERROR: Cannot process '{exc.file_name}' — " - f"required primary key column(s) {exc.missing_columns} " - f"are missing from the file headers.\n" - f"Headers found: {exc.headers}", - err=True, - ) - sys.exit(1) except Exception as exc: click.echo(f"Error: {exc}", err=True) sys.exit(1) diff --git a/src/gtfs_diff/csv_utils.py b/src/gtfs_diff/csv_utils.py new file mode 100644 index 0000000..a437298 --- /dev/null +++ b/src/gtfs_diff/csv_utils.py @@ -0,0 +1,254 @@ +"""Low-level CSV helpers: header parsing, indexing, and value comparison. + +These are pure functions shared by the in-memory engine and the DuckDB backend. 
+""" + +from __future__ import annotations + +import csv +import io +from pathlib import Path +from typing import TextIO + +from .gtfs_definitions import get_optional_primary_key_columns + + +def _is_url(path: str | Path) -> bool: + """Return True if *path* is an ``http://`` or ``https://`` URL.""" + return isinstance(path, str) and ( + path.startswith("http://") or path.startswith("https://") + ) + + +class MissingPrimaryKeyError(ValueError): + """Raised when a required primary key column is absent from a file's headers.""" + + def __init__( + self, file_name: str, missing_columns: list[str], headers: list[str] + ) -> None: + self.file_name = file_name + self.missing_columns = missing_columns + self.headers = headers + super().__init__( + f"'{file_name}': required primary key column(s) " + f"{missing_columns} not found in headers {headers}." + ) + + +class DuplicatePrimaryKeyError(ValueError): + """Raised when duplicate primary key values are found in a file's rows. + + A duplicate key means rows cannot be uniquely matched between feeds, so the + file is reported as ``not_compared`` (like a missing primary key) rather than + aborting the whole diff. Subclasses :class:`ValueError` for backward + compatibility with callers that caught the previously-raised ``ValueError``. 
+ """ + + def __init__( + self, + file_name: str, + primary_key: list[str], + duplicate_key: dict[str, str] | None = None, + line_number: int | None = None, + first_line: int | None = None, + side: str | None = None, + ) -> None: + self.file_name = file_name + self.primary_key = primary_key + self.duplicate_key = duplicate_key + self.line_number = line_number + self.first_line = first_line + self.side = side + location = "" + if line_number is not None and first_line is not None: + location = f" at line {line_number} (first seen at line {first_line})" + feed = f" in the {side} feed" if side in ("base", "new") else "" + super().__init__( + f"{file_name}: duplicate primary key " + f"{duplicate_key if duplicate_key is not None else primary_key}" + f"{location}{feed}." + ) + + @property + def detail(self) -> str | None: + """A short human-readable locator for the first duplicate, if known.""" + if self.duplicate_key is None: + return None + if self.line_number is not None and self.first_line is not None: + return ( + f"e.g. {self.duplicate_key} appears at lines " + f"{self.first_line} and {self.line_number}" + ) + return f"e.g. {self.duplicate_key}" + + +def _row_to_csv(values: list[str]) -> str: + """Serialize a list of string values to a single CSV line (no trailing newline).""" + buf = io.StringIO() + writer = csv.writer(buf, lineterminator="") + writer.writerow(values) + return buf.getvalue() + + +def _read_headers(text_io: TextIO) -> list[str]: + """Read only the header row from a CSV stream, stripping whitespace.""" + reader = csv.reader(text_io) + try: + row = next(reader) + return [h.strip() for h in row] + except StopIteration: + return [] + + +def _read_headers_and_count(text_io: TextIO) -> tuple[list[str], int]: + """Read the header row and count the data rows of a CSV stream. + + Used for files that cannot be indexed (e.g. a missing required primary key) + but still need accurate row counts in their ``not_compared`` stats. 
The CSV + reader is used so quoted fields containing newlines are counted as one row. + """ + reader = csv.reader(text_io) + try: + headers = [h.strip() for h in next(reader)] + except StopIteration: + return [], 0 + count = sum(1 for _ in reader) + return headers, count + + +def _missing_required_pk_columns( + headers: list[str], pk_columns: list[str], file_name: str +) -> list[str]: + """Return the required primary-key columns absent from *headers*. + + Conditionally-present (optional) primary-key columns are not required: they + participate in the compare identity as NULL/empty values when absent (see + :func:`_read_csv_index`). Only mandatory key columns count as missing. An + empty *pk_columns* (composite key over all columns) never has a requirement, + so an empty list is returned. + """ + if not pk_columns: + return [] + header_set = set(headers) + optional_pk = get_optional_primary_key_columns(file_name) + return [ + col for col in pk_columns if col not in header_set and col not in optional_pk + ] + + +def _read_csv_index( + text_io: TextIO, + pk_columns: list[str] | None = None, + file_name: str = "", + side: str | None = None, +) -> tuple[list[str], dict[tuple, tuple[int, str]]]: + """Stream a CSV file and build a primary-key → (line_number, raw_csv_string) index. + + Args: + text_io: Open text stream for the CSV file (utf-8-sig recommended). + pk_columns: Columns that form the primary key. `None` / empty list + means use *all* columns as the composite key. + file_name: Used in error messages only. + side: Which feed this stream belongs to (``"base"`` / ``"new"``), + recorded on a raised :class:`DuplicatePrimaryKeyError` so the + not_compared reason can name the offending feed. + + Returns: + headers: Stripped column names from the header row. + index: Maps `pk_tuple` → `(line_number, raw_csv_string)`. + Line numbers are 1-based; the header row is line 1, so the + first data row is line 2. 
+ + Raises: + MissingPrimaryKeyError: If expected primary key columns are absent from + the header (diff would silently treat all rows as identical). + DuplicatePrimaryKeyError: If duplicate primary key values are found (diff + would silently discard earlier rows). + """ + reader = csv.reader(text_io) + try: + raw_headers = next(reader) + except StopIteration: + return [], {} + + headers = [h.strip() for h in raw_headers] + n = len(headers) + effective_pk = pk_columns if pk_columns else headers + + if pk_columns: + # Conditionally-present PK columns (e.g. translations.txt's record_id / + # record_sub_id / field_value) may be absent, but they still participate + # in the compare identity as NULL/empty values. Only a *mandatory* + # missing column is an error. + missing_required = _missing_required_pk_columns(headers, pk_columns, file_name) + if missing_required: + raise MissingPrimaryKeyError(file_name, missing_required, headers) + effective_pk = pk_columns + + index: dict[tuple, tuple[int, str]] = {} + for line_num, row in enumerate(reader, start=2): + # Pad short rows; trim rows wider than the header (malformed CSV safety). 
+ if len(row) < n: + row = row + [""] * (n - len(row)) + row_vals = row[:n] + row_dict = dict(zip(headers, row_vals, strict=True)) + pk_tuple = tuple(row_dict.get(col, "") for col in effective_pk) + + if pk_tuple in index: + raise DuplicatePrimaryKeyError( + file_name, + list(effective_pk), + duplicate_key=dict(zip(effective_pk, pk_tuple, strict=True)), + line_number=line_num, + first_line=index[pk_tuple][0], + side=side, + ) + + index[pk_tuple] = (line_num, _row_to_csv(row_vals)) + + return headers, index + + +def _parse_raw_line(raw_line: str, headers: list[str]) -> dict[str, str]: + """Deserialise a raw CSV string (as stored in the index) back to a row dict.""" + reader = csv.reader(io.StringIO(raw_line)) + try: + row = next(reader) + except StopIteration: + return {col: "" for col in headers} + if len(row) < len(headers): + row = row + [""] * (len(headers) - len(row)) + return dict(zip(headers, row, strict=True)) + + +def _values_differ(a: str, b: str) -> bool: + """Return True if two field values represent meaningfully different data. + + Values are stripped, then compared case-insensitively (fast path). + If they still differ, attempt numeric comparison — this silently + ignores cosmetic differences like trailing zeros in coordinate fields + (e.g. '-73.55625' vs '-73.556250'). + Non-numeric values that differ beyond case/whitespace count as different. + """ + a = a.strip() + b = b.strip() + if a.lower() == b.lower(): + return False + try: + return float(a) != float(b) + except (ValueError, OverflowError): + return True + + +def _compute_raw_value( + row_dict: dict[str, str], + columns: list[str], + present_headers: set[str], +) -> str: + """Build an ordered CSV string aligned to the *union* columns list. + + Columns absent from ``present_headers`` (i.e. the file this row came from + did not have that column) are rendered as empty strings.
+ """ + values = [row_dict[col] if col in present_headers else "" for col in columns] + return _row_to_csv(values) diff --git a/src/gtfs_diff/diff_helpers.py b/src/gtfs_diff/diff_helpers.py new file mode 100644 index 0000000..029a2da --- /dev/null +++ b/src/gtfs_diff/diff_helpers.py @@ -0,0 +1,595 @@ +"""Pure diff helpers: file ordering, column diffing, id-churn, and row scanning. + +These functions contain no I/O and are shared by the in-memory engine and the +DuckDB backend. They operate on already-parsed headers/indexes and return model +objects or plain data structures. +""" + +from __future__ import annotations + +import time + +from .csv_utils import _compute_raw_value, _parse_raw_line, _values_differ +from .gtfs_definitions import ( + MIN_ROWS_FOR_ID_CHURN_DETECTION, + get_foreign_keys, +) +from .models import ( + ColumnEntry, + ColumnStat, + FieldChange, + FileDiff, + FileStats, + FileSummary, + IgnoredColumn, + NotComparedReason, + RowAdded, + RowChanges, + RowDeleted, + RowModified, + Truncated, +) +from .tracing import _trace + + +def _processing_order(files: list[str]) -> list[str]: + """Return *files* ordered so referenced (parent) files precede the files + that reference them via a foreign key. + + Uses a deterministic topological sort (alphabetical tie-break). Foreign keys + pointing at files that are absent from *files* are ignored, and any cycle is + broken by emitting the alphabetically-first remaining file — neither case can + occur for well-formed GTFS but both are handled defensively. 
+ """ + present = set(files) + deps: dict[str, set[str]] = {f: set() for f in files} + for child in files: + for refs in get_foreign_keys(child).values(): + for parent in refs: + if parent in present and parent != child: + deps[child].add(parent) + + order: list[str] = [] + done: set[str] = set() + remaining = set(files) + while remaining: + ready = sorted(f for f in remaining if deps[f] <= done) + if not ready: # breaking the cycle + ready = [min(remaining)] + for f in ready: + order.append(f) + done.add(f) + remaining.discard(f) + return order + + +def _compute_ignored_columns( + file_name: str, + base_headers: list[str], + new_headers: list[str], + pk_cols: list[str], + not_compared_files: dict[str, str], +) -> tuple[list[IgnoredColumn], set[str]]: + """Determine which foreign-key columns to exclude from the field-level diff. + + A foreign-key column is ignored when the file it references was itself marked + ``not_compared``: its ids were regenerated (``id_churn``), a required key column + was missing (``missing_primary_key``), or key values were duplicated + (``duplicate_primary_key``). In each case the referenced file's key values are + unreliable, so any change in the referencing column is noise, not a real edit. + Primary-key columns are never ignored (they are the row identity). + + *not_compared_files* maps each not-compared file name to its reason code. + + Returns the ``IgnoredColumn`` records (in base-header order) and the set of + ignored column names.
+ """ + foreign_keys = get_foreign_keys(file_name) + if not foreign_keys: + return [], set() + + shared = set(base_headers) & set(new_headers) + pk_set = set(pk_cols) + ignored: list[IgnoredColumn] = [] + names: set[str] = set() + for col in base_headers: + if col not in shared or col in pk_set or col in names: + continue + refs = [r for r in foreign_keys.get(col, ()) if r in not_compared_files] + if not refs: + continue + referenced = refs[0] + ignored.append( + IgnoredColumn( + column=col, + reason=NotComparedReason( + code="references_not_compared_file", + message=_ignored_column_message( + col, referenced, not_compared_files[referenced] + ), + ), + ) + ) + names.add(col) + return ignored, names + + +def _ignored_column_message(column: str, referenced: str, reason_code: str) -> str: + """Explain why a foreign-key *column* was excluded from the diff. + + The wording reflects *why* the *referenced* file was not compared (id churn, + missing or duplicate primary key), with a generic fallback for other codes. + """ + if reason_code == "missing_primary_key": + cause = ( + "was not compared because it is missing required primary key " + "column(s), so its rows could not be matched" + ) + elif reason_code == "id_churn": + cause = ( + "was not compared because its primary key appears to be " + "regenerated across versions (id_churn)" + ) + elif reason_code == "duplicate_primary_key": + cause = ( + "was not compared because it has duplicate primary key values, so " + "its rows could not be uniquely matched" + ) + else: + cause = "was not compared" + return ( + f"Column '{column}' references {referenced}, which {cause}. Its values " + f"are unreliable, so the column was excluded from the diff." + ) + + +def _missing_primary_key_reason( + missing_base: list[str], missing_new: list[str] +) -> NotComparedReason: + """Build the ``not_compared`` reason for a file missing required PK columns.
+ + Reports which feed side(s) are missing which mandatory primary-key columns, + so the file is skipped (rather than aborting the whole diff) and reported + with column-level differences preserved. + """ + parts: list[str] = [] + if missing_base: + parts.append(f"the base feed is missing {sorted(set(missing_base))}") + if missing_new: + parts.append(f"the new feed is missing {sorted(set(missing_new))}") + detail = " and ".join(parts) if parts else "a required primary key column" + return NotComparedReason( + code="missing_primary_key", + message=( + f"Required primary key column(s) are absent: {detail}. Rows cannot " + f"be matched without the primary key, so row-level comparison was " + f"skipped to avoid a misleading diff." + ), + ) + + +def _feed_side_phrase(side: str | None) -> str: + """Return a human phrase naming the offending feed side(s). + + ``side`` is ``"base"``, ``"new"``, ``"both"`` (or ``None`` when unknown). + """ + if side == "base": + return "the base feed" + if side == "new": + return "the new feed" + if side == "both": + return "both the base and new feed" + return "the base or new feed" + + +def _duplicate_primary_key_reason( + primary_key: list[str] | None, + detail: str | None = None, + side: str | None = None, +) -> NotComparedReason: + """Build the ``not_compared`` reason for a file with duplicate primary keys. + + Duplicate key values mean rows cannot be uniquely matched between feeds, so + the file is skipped (rather than aborting the whole diff) and reported with + column-level differences preserved. *side* names which feed contains the + duplicate (``"base"``, ``"new"``, ``"both"``, or ``None`` when unknown). + *detail*, when given, locates an example duplicate (e.g. the offending key + and line numbers). 
+ """ + pk = sorted(set(primary_key)) if primary_key else [] + pk_part = f" {pk}" if pk else "" + extra = f" ({detail})" if detail else "" + where = _feed_side_phrase(side) + return NotComparedReason( + code="duplicate_primary_key", + message=( + f"Duplicate primary key{pk_part} value(s) were found in {where}" + f"{extra}, so rows cannot be uniquely matched between the base and " + f"new feed. Row-level comparison was skipped to avoid a misleading " + f"diff." + ), + ) + + +def _diff_columns( + base_headers: list[str], + new_headers: list[str], +) -> tuple[list[ColumnEntry], list[ColumnEntry], list[str]]: + """Compute column-level differences between two header lists. + + Returns: + columns_added: Columns present in new but not in base. + columns_deleted: Columns present in base but not in new. + union_columns: All columns — base order first, new-only appended. + + Note: column reorders (same columns, different positions) are silently + ignored — values are always compared by name, not position. + """ + base_header_set = set(base_headers) + new_header_set = set(new_headers) + columns_added = [ + ColumnEntry(name=col, position=i + 1) + for i, col in enumerate(new_headers) + if col not in base_header_set + ] + columns_deleted = [ + ColumnEntry(name=col, position=i + 1) + for i, col in enumerate(base_headers) + if col not in new_header_set + ] + new_only_cols = [col for col in new_headers if col not in base_header_set] + union_columns: list[str] = base_headers + new_only_cols + return columns_added, columns_deleted, union_columns + + +def _detect_id_churn( + pk_cols: list[str], + pk_is_explicit: bool, + base_row_count: int, + new_row_count: int, + common_count: int, + id_churn_threshold: float, +) -> NotComparedReason | None: + """Return a ``NotComparedReason`` when primary-key churn is too high. + + Files whose identifiers are regenerated on every export (e.g. 
``shape_id``, + ``trip_id``) produce two nearly disjoint key sets, so almost every row looks + added or deleted. We measure how badly the keys fail to match between the + two feeds and, when it meets or exceeds *id_churn_threshold*, flag the file + as not comparable instead of emitting a misleading diff. + + The churn ratio is the complement of the **overlap coefficient** — + ``|common| / min(|base|, |new|)`` — i.e. the fraction of the *smaller* + feed's keys that have no match in the other feed:: + + churn_ratio = 1 − |common| / min(|base|, |new|) + + Dividing by ``min`` (rather than ``max`` or the union, as in Jaccard) makes + the metric robust to bulk additions or deletions: a feed that merely grows + or shrinks keeps a high overlap and is *not* mistaken for id regeneration. + Only when the keys themselves are replaced — the actual signature of churn — + does the ratio approach 1. See ``docs/architecture.md`` for the rationale. + + Detection is skipped when it cannot yield a reliable signal: + + * files without an explicit primary key (those use all columns as a + composite key, where any field edit would look like churn); + * a feed side with no rows (a bulk add or delete, not regenerated ids); + * files too small for near-total turnover to be statistically meaningful + (see :data:`gtfs_definitions.MIN_ROWS_FOR_ID_CHURN_DETECTION`). + """ + if not pk_is_explicit: + return None + + smaller = min(base_row_count, new_row_count) + if smaller < MIN_ROWS_FOR_ID_CHURN_DETECTION: + return None + + churn_ratio = 1.0 - (common_count / smaller) + if churn_ratio < id_churn_threshold: + return None + + pct = round(churn_ratio * 100, 1) + return NotComparedReason( + code="id_churn", + message=( + f"{pct}% of primary key values {pk_cols} differ between the base " + f"and new feed, indicating the identifiers are regenerated across " + f"versions and cannot be reliably matched. Row-level comparison was " + f"skipped to avoid a misleading diff." 
+ ), + ) + + +def _build_not_compared_diff( + file_name: str, + reason: NotComparedReason, + columns_added: list[ColumnEntry], + columns_deleted: list[ColumnEntry], + base_row_count: int, + new_row_count: int, +) -> tuple[FileDiff, FileSummary]: + """Build the ``not_compared`` result for a file that could not be diffed. + + ``row_changes`` is omitted (``None``) while column-level differences are + preserved, per the GTFS Diff v2 schema. + """ + file_diff = FileDiff( + file_name=file_name, + file_action="not_compared", + not_compared_reason=reason, + columns_added=columns_added, + columns_deleted=columns_deleted, + row_changes=None, + stats=FileStats( + total_rows_base=base_row_count, + total_rows_new=new_row_count, + columns_added_count=len(columns_added), + columns_deleted_count=len(columns_deleted), + ), + ) + summary = FileSummary(file_name=file_name, status="not_compared") + return file_diff, summary + + +def _shared_columns( + base_headers: list[str], + new_header_set: set[str], + ignored: set[str], +) -> list[str]: + """Columns present in both feeds (in base order) excluding *ignored* ones. + + The shared set is the basis for the "modified" comparison: a column added or + removed between versions is a column-level change, not a row-level one, so it + must not trigger false row modifications. *ignored* columns (unreliable + foreign keys to not_compared files) are likewise skipped. + """ + return [c for c in base_headers if c in new_header_set and c not in ignored] + + +def _scan_modifications( + file_name: str, + common_keys: set[tuple], + base_index: dict[tuple, tuple[int, str]], + new_index: dict[tuple, tuple[int, str]], + base_headers: list[str], + new_headers: list[str], + ignored_columns: set[str] | None = None, +) -> list[tuple[tuple, list[FieldChange], int, int]]: + """Scan rows present in both feeds and return those whose field values differ. 
+ + Compares only columns shared between both headers to avoid false positives + when a column is added or removed. Columns in *ignored_columns* (unreliable + foreign keys to not_compared files) are also skipped. + + Returns a list of (pk_tuple, field_changes, base_line, new_line) for + every common row that has at least one changed field. + + Note: row reorders (same rows, different line positions) are silently + ignored — keys are compared as sets, so row order has no effect. + """ + ignored = ignored_columns or set() + shared_cols = _shared_columns(base_headers, set(new_headers), ignored) + candidates: list[tuple[tuple, list[FieldChange], int, int]] = [] + + n = len(common_keys) + _trace(f" [{file_name}] scanning {n:,} common rows...") + t0 = time.monotonic() + for pk_tuple in common_keys: + b_line, b_raw = base_index[pk_tuple] + n_line, n_raw = new_index[pk_tuple] + b_dict = _parse_raw_line(b_raw, base_headers) + n_dict = _parse_raw_line(n_raw, new_headers) + field_changes = [ + FieldChange(field=col, base_value=b_dict[col], new_value=n_dict[col]) + for col in shared_cols + if _values_differ(b_dict.get(col, ""), n_dict.get(col, "")) + ] + if field_changes: + candidates.append((pk_tuple, field_changes, b_line, n_line)) + + _trace( + f" [{file_name}] scan done in {time.monotonic() - t0:.1f}s — " + f"{len(candidates):,} modified" + ) + return candidates + + +def _split_row_changes_cap( + cap: int | None, + true_added: int, + true_deleted: int, + true_modified: int, +) -> tuple[int | None, int | None, int | None]: + """Split a row-changes *cap* fairly across the change types that have rows. + + Rather than filling the cap with added rows first, then deleted, then + modified (which can hide whole change types when one is large), the budget + is shared evenly between the types that actually have changes so the user + sees a little of everything: with one active type it gets the whole cap, + with two they split it ~50/50, with three ~33/33/33. 
+ + Allocation uses water-filling: the budget is divided evenly among the types + that still have rows to show; any leftover (because a type has fewer rows + than its share) is redistributed to the remaining hungry types until the cap + is exhausted or every change is included. A remainder that cannot divide + evenly is handed out one row at a time in added → deleted → modified order. + + Returns ``(added_limit, deleted_limit, modified_limit)``. When *cap* is + ``None`` (unlimited) all three limits are ``None``. + """ + if cap is None: + return None, None, None + + counts = [true_added, true_deleted, true_modified] + limits = [0, 0, 0] + active = [i for i, c in enumerate(counts) if c > 0] + remaining = cap + + while remaining > 0: + hungry = [i for i in active if limits[i] < counts[i]] + if not hungry: + break + share = remaining // len(hungry) + if share == 0: + # Indivisible remainder: hand out one row at a time, in type order. + for i in hungry: + if remaining == 0: + break + limits[i] += 1 + remaining -= 1 + break + for i in hungry: + grant = min(share, counts[i] - limits[i]) + limits[i] += grant + remaining -= grant + + return limits[0], limits[1], limits[2] + + +def _compute_rows_changed_percentage( + rows_added: int, + rows_deleted: int, + rows_modified: int, + total_base: int, + total_new: int, +) -> float | None: + """Percentage of rows changed relative to the larger of the two versions. + + Returns ``None`` when both versions are empty (no meaningful denominator). + The count of changed rows is the *true* total (added + deleted + modified), + so the percentage is unaffected by any row-changes cap/truncation. With heavy + churn the raw count can exceed the larger version's row count (added rows + exist only in the new version, deleted rows only in the base), so the result + is clamped to ``100.0`` to satisfy the schema's ``[0, 100]`` bound. 
+ """ + denominator = max(total_base, total_new) + if denominator == 0: + return None + changed = rows_added + rows_deleted + rows_modified + return round(min(changed / denominator * 100.0, 100.0), 2) + + +def _build_column_stats( + column_mod_counts: dict[str, int], + total_modified: int, + column_order: list[str], +) -> list[ColumnStat] | None: + """Build per-column modification statistics for a modified file. + + *column_mod_counts* maps a column name to the number of modified rows that + changed in that column (a *true* count, independent of any cap). Only + columns with at least one modification are included, ordered by their + appearance in *column_order* for deterministic output. Returns ``None`` when + there are no per-column modifications to report. + """ + if total_modified == 0 or not column_mod_counts: + return None + stats = [ + ColumnStat( + column=col, + modifications_count=column_mod_counts[col], + modifications_percentage=round( + column_mod_counts[col] / total_modified * 100.0, 2 + ), + ) + for col in column_order + if column_mod_counts.get(col) + ] + return stats or None + + +def _build_identifier_and_raw( + row_dict: dict[str, str], + pk_cols: list[str], + union_columns: list[str], + header_set: set[str], +) -> tuple[dict[str, str], str]: + """Build the ``(identifier, raw_value)`` pair for a single changed row. + + The identifier maps each primary-key column to its value (empty string when + the column is absent from the row); ``raw_value`` is the row projected onto + *union_columns*. Shared by both engines so the cap'd row payloads are + byte-identical regardless of how *row_dict* was produced. 
def _assemble_modified_file_diff(
    *,
    file_name: str,
    pk_cols: list[str],
    union_columns: list[str],
    columns_added: list[ColumnEntry],
    columns_deleted: list[ColumnEntry],
    ignored_columns: list[IgnoredColumn],
    added_rows: list[RowAdded],
    deleted_rows: list[RowDeleted],
    modified_rows: list[RowModified],
    true_added: int,
    true_deleted: int,
    true_modified: int,
    total_base: int,
    total_new: int,
    cap: int | None,
    include_row_changes: bool,
    column_stats: bool,
    column_mod_counts: dict[str, int],
) -> tuple[FileDiff, FileSummary]:
    """Build the ``(FileDiff, FileSummary)`` pair describing a modified file.

    The caller passes the row lists it has already collected (and already
    limited to the cap) together with the *true* change counts, plus
    ``total_base``/``total_new`` as the full row counts of each version.

    * When *include_row_changes* is false, neither a ``RowChanges`` payload nor
      a ``Truncated`` marker is produced.
    * A ``Truncated`` marker is produced only when *cap* is set and the true
      number of changes exceeds it; its ``omitted_count`` is the true total
      minus the number of rows actually present in the three lists.
    * ``FileStats`` always carries the true counts; per-column statistics are
      computed only when *column_stats* is true.
    """
    row_changes: RowChanges | None = None
    truncated: Truncated | None = None

    if include_row_changes:
        shown = sum(len(rows) for rows in (added_rows, deleted_rows, modified_rows))
        actual = true_added + true_deleted + true_modified
        over_cap = cap is not None and actual > cap
        if over_cap:
            truncated = Truncated(is_truncated=True, omitted_count=actual - shown)
        # pk_cols is reported as the primary key; for empty-pk files that is
        # every base column, which together act as the composite key.
        row_changes = RowChanges(
            primary_key=pk_cols,
            columns=union_columns,
            added=added_rows,
            deleted=deleted_rows,
            modified=modified_rows,
        )

    changed_pct = _compute_rows_changed_percentage(
        true_added, true_deleted, true_modified, total_base, total_new
    )
    per_column = None
    if column_stats:
        per_column = _build_column_stats(
            column_mod_counts, true_modified, union_columns
        )

    stats = FileStats(
        total_rows_base=total_base,
        total_rows_new=total_new,
        columns_added_count=len(columns_added),
        columns_deleted_count=len(columns_deleted),
        rows_added_count=true_added,
        rows_deleted_count=true_deleted,
        rows_modified_count=true_modified,
        rows_changed_percentage=changed_pct,
        column_stats=per_column,
    )
    diff = FileDiff(
        file_name=file_name,
        file_action="modified",
        # An empty ignored-columns list is reported as None.
        ignored_columns=ignored_columns or None,
        columns_added=columns_added,
        columns_deleted=columns_deleted,
        row_changes=row_changes,
        truncated=truncated,
        stats=stats,
    )
    return diff, FileSummary(file_name=file_name, status="modified")
For very large feeds (stop_times.txt can exceed +10 M rows) this becomes expensive, so files whose larger side exceeds +``large_file_threshold_bytes`` are routed to the DuckDB backend +(:mod:`gtfs_diff.engine_duckdb`), which diffs them on disk without holding every +row in memory. DuckDB is a runtime dependency; in the unlikely event it is +unavailable the engine falls back to the in-memory path. """ from __future__ import annotations import configparser -import csv +import contextlib import io -import sys +import os +import shutil +import tempfile import time +import urllib.error +import urllib.request import zipfile -from collections.abc import Callable, Generator +from collections.abc import Callable, Generator, Iterable, Mapping from contextlib import contextmanager +from dataclasses import dataclass, field from datetime import datetime, timezone from importlib import resources from pathlib import Path from typing import TextIO -from .gtfs_definitions import get_primary_key +# Re-exported for backward compatibility (``gtfs_diff.engine`` has historically +# been the import site for these helpers); they now live in focused modules. 
+from .csv_utils import ( + DuplicatePrimaryKeyError, # noqa: F401 (re-export) + MissingPrimaryKeyError, # noqa: F401 (re-export) + _is_url, + _missing_required_pk_columns, + _parse_raw_line, + _read_csv_index, + _read_headers, + _read_headers_and_count, +) +from .diff_helpers import ( + _assemble_modified_file_diff, + _build_identifier_and_raw, + _build_not_compared_diff, + _compute_ignored_columns, + _detect_id_churn, + _diff_columns, + _duplicate_primary_key_reason, + _missing_primary_key_reason, + _processing_order, + _scan_modifications, + _split_row_changes_cap, +) +from .gtfs_definitions import ( + DEFAULT_ID_CHURN_THRESHOLD, + SUPPORTED_FILES, + get_id_churn_threshold, + get_optional_primary_key_columns, + get_primary_key, +) from .models import ( ColumnEntry, FeedSource, - FieldChange, FileDiff, FileStats, FileSummary, GtfsDiff, Metadata, RowAdded, - RowChanges, RowDeleted, RowModified, Summary, - Truncated, UnsupportedFile, ) +from .tracing import _trace def _read_schema_version() -> str: @@ -51,166 +88,85 @@ def _read_schema_version() -> str: return parser.get("default", "SCHEMA_VERSION") -class MissingPrimaryKeyError(ValueError): - """Raised when a required primary key column is absent from a file's headers.""" - - def __init__( - self, file_name: str, missing_columns: list[str], headers: list[str] - ) -> None: - self.file_name = file_name - self.missing_columns = missing_columns - self.headers = headers - super().__init__( - f"'{file_name}': required primary key column(s) " - f"{missing_columns} not found in headers {headers}." - ) - - -def _trace(msg: str) -> None: - """Print a timestamped progress message with current RSS to stderr.""" - import psutil - - rss_mb = psutil.Process().memory_info().rss / 1024 / 1024 - print( - f"[gtfs-diff {datetime.now().strftime('%H:%M:%S')} {rss_mb:.0f}MB] {msg}", - file=sys.stderr, - flush=True, - ) - - # A "lazy opener" maps a filename (e.g. 
"stops.txt") to a zero-arg callable # that opens the file and returns a text stream. LazyOpeners = dict[str, Callable[[], TextIO]] -# --------------------------------------------------------------------------- -# Low-level CSV helpers -# --------------------------------------------------------------------------- - - -def _row_to_csv(values: list[str]) -> str: - """Serialize a list of string values to a single CSV line (no trailing newline).""" - buf = io.StringIO() - writer = csv.writer(buf, lineterminator="") - writer.writerow(values) - return buf.getvalue() - - -def _read_headers(text_io: TextIO) -> list[str]: - """Read only the header row from a CSV stream, stripping whitespace.""" - reader = csv.reader(text_io) - try: - row = next(reader) - return [h.strip() for h in row] - except StopIteration: - return [] +@dataclass +class FeedFileMeta: + """Side-channel metadata about a feed file, used to route large files to the + DuckDB backend without changing the text-opener fast path. + + Attributes: + size: Uncompressed size in bytes, if cheaply known (directory + ``stat``, zip ``file_size``, or HTTP ``Content-Length``); + ``None`` when unknown. + local_path: Real filesystem path DuckDB can read directly (set only for + plain-directory feeds); ``None`` for zip members and URLs. + materialize: Writes the file's raw bytes to a given destination path, + streaming (bounded memory). Used to stage zip members to a + temp file for DuckDB. + url: Direct ``http(s)://`` URL of the file. When set, DuckDB reads + it in place via its ``httpfs`` extension (HTTP range + requests) instead of us downloading the whole file first — + our side only performs HEAD requests (size / existence). 
+ """ + size: int | None = None + local_path: str | None = None + materialize: Callable[[str], None] | None = None + url: str | None = None -def _read_csv_index( - text_io: TextIO, - pk_columns: list[str] | None = None, - file_name: str = "", -) -> tuple[list[str], dict[tuple, tuple[int, str]]]: - """Stream a CSV file and build a primary-key → (line_number, raw_csv_string) index. - Args: - text_io: Open text stream for the CSV file (utf-8-sig recommended). - pk_columns: Columns that form the primary key. `None` / empty list - means use *all* columns as the composite key. - file_name: Used in error messages only. +@dataclass +class FeedHandle: + """An opened feed: lazy text openers plus per-file routing metadata.""" - Returns: - headers: Stripped column names from the header row. - index: Maps `pk_tuple` → `(line_number, raw_csv_string)`. - Line numbers are 1-based; the header row is line 1, so the - first data row is line 2. - - Raises: - MissingPrimaryKeyError: If expected primary key columns are absent from - the header (diff would silently treat all rows as identical). - ValueError: If duplicate primary key values are found (diff would - silently discard earlier rows). - """ - reader = csv.reader(text_io) - try: - raw_headers = next(reader) - except StopIteration: - return [], {} - - headers = [h.strip() for h in raw_headers] - n = len(headers) - effective_pk = pk_columns if pk_columns else headers - - if pk_columns: - missing = [col for col in pk_columns if col not in set(headers)] - if missing: - raise MissingPrimaryKeyError(file_name, missing, headers) - - index: dict[tuple, tuple[int, str]] = {} - for line_num, row in enumerate(reader, start=2): - # Pad short rows; trim rows wider than the header (malformed CSV safety). 
@contextmanager
def _materialized_path(
    meta: FeedFileMeta | None,
) -> Generator[str | None, None, None]:
    """Yield a location for a feed file, or ``None`` if there is none.

    Resolution order, based on the attributes of *meta*:

    1. ``meta.url`` — yielded as-is when set; nothing is fetched here.
    2. ``meta.local_path`` — yielded as-is when set; no copy is made.
    3. ``meta.materialize`` — called with the name of a fresh temporary
       ``.txt`` file, which is then yielded and removed again on exit
       (removal errors are suppressed).

    ``None`` is yielded when *meta* is ``None`` or none of the three
    attributes is set.
    """
    direct: str | None = None
    if meta is not None:
        direct = meta.url if meta.url is not None else meta.local_path

    # Anything that needs no staging (or cannot be staged) is yielded directly.
    if direct is not None or meta is None or meta.materialize is None:
        yield direct
        return

    handle, staged = tempfile.mkstemp(suffix=".txt")
    # Only the name is needed; the materialize callback opens the file itself.
    os.close(handle)
    try:
        meta.materialize(staged)
        yield staged
    finally:
        with contextlib.suppress(OSError):
            Path(staged).unlink()
- Callers are responsible for closing each stream. + Yields a :class:`FeedHandle` whose ``openers`` map each ``.txt`` name to a + zero-arg callable returning a utf-8-sig text stream (callers close it), and + whose ``meta`` carries size / materialization info used to route large files + to the DuckDB backend. Supports: * ``.zip`` archives (files at root *or* inside a single sub-directory). * Plain directories containing ``.txt`` files. + * ``http(s)://`` folder URLs (e.g. a public GCP bucket folder). Remote + folders cannot be listed, so *files* must be supplied: it is the + authoritative list of ``.txt`` names to compare. Each name is probed with + an HTTP request to determine presence (so added/deleted files are still + detected) and fetched lazily. + + Args: + path: Feed location — a path to a zip/directory, or an ``http(s)://`` + folder URL. + files: Optional explicit list of file names to consider. For local + feeds this *filters* the discovered files to just these names; + for remote URL feeds it is *required* and authoritative. 
""" + requested = list(files) if files is not None else None + + if _is_url(path): + yield _open_remote_feed(str(path), requested) + return + path = Path(path) if path.is_dir(): openers: LazyOpeners = {} + meta: dict[str, FeedFileMeta] = {} for txt_file in sorted(path.glob("*.txt")): def _make_opener(p: Path) -> Callable[[], TextIO]: return lambda: p.open(encoding="utf-8-sig") + def _make_materialize(p: Path) -> Callable[[str], None]: + def _materialize(dest: str) -> None: + with p.open("rb") as src, open(dest, "wb") as out: + shutil.copyfileobj(src, out) + + return _materialize + openers[txt_file.name] = _make_opener(txt_file) - yield openers + try: + size: int | None = txt_file.stat().st_size + except OSError: + size = None + meta[txt_file.name] = FeedFileMeta( + size=size, + local_path=str(txt_file), + materialize=_make_materialize(txt_file), + ) + yield _filter_handle(FeedHandle(openers, meta), requested) elif zipfile.is_zipfile(path): zf = zipfile.ZipFile(path, "r") @@ -254,20 +248,175 @@ def _make_opener(p: Path) -> Callable[[], TextIO]: name_map[basename] = member openers = {} + meta = {} for basename, internal_path in name_map.items(): def _make_opener(ip: str) -> Callable[[], TextIO]: # type: ignore[misc] return lambda: io.TextIOWrapper(zf.open(ip), encoding="utf-8-sig") + def _make_materialize(ip: str) -> Callable[[str], None]: + def _materialize(dest: str) -> None: + with zf.open(ip, "r") as src, open(dest, "wb") as out: + shutil.copyfileobj(src, out) + + return _materialize + openers[basename] = _make_opener(internal_path) - yield openers + try: + zsize: int | None = zf.getinfo(internal_path).file_size + except KeyError: + zsize = None + meta[basename] = FeedFileMeta( + size=zsize, + local_path=None, + materialize=_make_materialize(internal_path), + ) + yield _filter_handle(FeedHandle(openers, meta), requested) finally: zf.close() else: raise ValueError( - f"Unsupported feed path: {path!r}. Must be a .zip file or a directory." 
def _join_url(base_url: str, name: str) -> str:
    """Return *base_url* and *name* joined by exactly one ``/``.

    Trailing slashes on *base_url* and leading slashes on *name* are dropped
    before joining, so ``"https://host/feed/"`` + ``"/stops.txt"`` gives
    ``"https://host/feed/stops.txt"``. No other normalisation or escaping is
    applied.
    """
    folder = base_url.rstrip("/")
    leaf = name.lstrip("/")
    return f"{folder}/{leaf}"
def _http_exists_via_get(url: str) -> bool:
    """Fallback existence check using a one-byte ranged ``GET``.

    Sends ``GET`` with ``Range: bytes=0-0`` so at most one byte of the body is
    requested.

    Returns:
        ``True`` when the request succeeds, or when the server answers
        ``416 Range Not Satisfiable``. A 416 means the server resolved the
        resource but could not serve byte 0, which is what an existing
        zero-length file produces, so the file is present.
        ``False`` for 401/403/404/410, which are treated as "absent / not
        fetchable".

    Raises:
        urllib.error.HTTPError: For any other HTTP error status.
    """
    req = urllib.request.Request(url, method="GET", headers={"Range": "bytes=0-0"})
    try:
        with urllib.request.urlopen(req, timeout=_HTTP_TIMEOUT_SECONDS):
            return True
    except urllib.error.HTTPError as exc:
        if exc.code == 416:
            # The object exists but is empty: byte 0 cannot be served.
            return True
        if exc.code in (401, 403, 404, 410):
            return False
        raise
def _open_remote_feed(base_url: str, files: list[str] | None) -> FeedHandle:
    """Build a :class:`FeedHandle` for the HTTP folder at *base_url*.

    The candidate file names are *files* when it is non-empty, otherwise the
    sorted contents of ``SUPPORTED_FILES``. Each candidate is joined onto
    *base_url* and probed with :func:`_http_exists`; names that are not found
    are left out of the handle entirely.

    For every name that is found, the handle gets:

    * an opener that fetches the file's text only when it is called, and
    * a ``FeedFileMeta`` holding the size reported by
      :func:`_http_content_length`, the file's URL, and a ``materialize``
      callback that streams the file to a destination path.
    """

    def _text_opener(u: str) -> Callable[[], TextIO]:
        # Bind the URL now; the fetch itself happens on call.
        return lambda: _http_get_text(u)

    def _file_writer(u: str) -> Callable[[str], None]:
        return lambda dest: _http_stream_to_file(u, dest)

    names = sorted(SUPPORTED_FILES) if not files else list(files)

    text_openers: LazyOpeners = {}
    file_meta: dict[str, FeedFileMeta] = {}
    for file_name in names:
        target = _join_url(base_url, file_name)
        if _http_exists(target):
            text_openers[file_name] = _text_opener(target)
            file_meta[file_name] = FeedFileMeta(
                size=_http_content_length(target),
                local_path=None,
                materialize=_file_writer(target),
                url=target,
            )
    return FeedHandle(text_openers, file_meta)
def _eligible_for_duckdb(
    file_name: str,
    pk_def: list[str],
    pk_is_explicit: bool,
) -> bool:
    """Return whether *file_name* may be diffed by the DuckDB backend.

    A file qualifies only when all of the following hold:

    * its primary key is explicit (*pk_is_explicit* is true),
    * the key definition *pk_def* is non-empty, and
    * ``get_optional_primary_key_columns(file_name)`` returns nothing truthy.

    Any file failing one of these checks is reported as not eligible.
    """
    has_simple_key = pk_is_explicit and bool(pk_def)
    if has_simple_key:
        return not get_optional_primary_key_columns(file_name)
    return False
""" - base_header_set = set(base_headers) - new_header_set = set(new_headers) - columns_added = [ - ColumnEntry(name=col, position=i + 1) - for i, col in enumerate(new_headers) - if col not in base_header_set - ] - columns_deleted = [ - ColumnEntry(name=col, position=i + 1) - for i, col in enumerate(base_headers) - if col not in new_header_set - ] - new_only_cols = [col for col in new_headers if col not in base_header_set] - union_columns: list[str] = base_headers + new_only_cols - return columns_added, columns_deleted, union_columns + if not use_duckdb or large_file_threshold_bytes is None: + return None + if not _eligible_for_duckdb(file_name, pk_def, pk_is_explicit): + return None + + base_size = base_meta.size if base_meta else None + new_size = new_meta.size if new_meta else None + if base_size is None or new_size is None: + return None # unknown size — safe default is the in-memory engine + if max(base_size, new_size) < large_file_threshold_bytes: + return None + + from . import engine_duckdb + + if not engine_duckdb.is_duckdb_available(): + _trace( + f" [{file_name}] large file but duckdb not installed — " + f"using in-memory engine" + ) + return None + + try: + with ( + _materialized_path(base_meta) as base_path, + _materialized_path(new_meta) as new_path, + ): + if base_path is None or new_path is None: + return None + _trace( + f" [{file_name}] large file " + f"({max(base_size, new_size):,} bytes) — using duckdb backend" + ) + return engine_duckdb.diff_modified_duckdb( + file_name=file_name, + base_path=base_path, + new_path=new_path, + pk_cols=pk_def, + row_changes_cap=row_changes_cap, + id_churn_threshold=id_churn_threshold, + not_compared_files=not_compared_files, + column_stats=column_stats, + ) + except Exception as exc: # pragma: no cover - defensive fallback + _trace( + f" [{file_name}] duckdb backend failed ({exc!r}) — " + f"falling back to in-memory engine" + ) + return None -def _scan_modifications( +def _build_missing_pk_not_compared( 
file_name: str, - common_keys: set[tuple], - base_index: dict[tuple, tuple[int, str]], - new_index: dict[tuple, tuple[int, str]], - base_headers: list[str], - new_headers: list[str], -) -> list[tuple[tuple, list[FieldChange], int, int]]: - """Scan rows present in both feeds and return those whose field values differ. - - Compares only columns shared between both headers to avoid false positives - when a column is added or removed. - - Returns a list of (pk_tuple, field_changes, base_line, new_line) for - every common row that has at least one changed field. - - Note: row reorders (same rows, different line positions) are silently - ignored — keys are compared as sets, so row order has no effect. + base_opener: Callable[[], TextIO], + new_opener: Callable[[], TextIO], + pk_cols: list[str], +) -> tuple[FileDiff, FileSummary]: + """Build the ``not_compared`` result for a file missing required PK columns. + + Re-reads the headers (and counts data rows) of both feeds to preserve + column-level differences and accurate row counts, then reports which side(s) + lack which mandatory primary-key columns. Used as the fallback when indexing + raises :class:`MissingPrimaryKeyError`, so one broken file no longer aborts + the whole diff. 
""" - shared_cols = [col for col in base_headers if col in set(new_headers)] - candidates: list[tuple[tuple, list[FieldChange], int, int]] = [] - - n = len(common_keys) - _trace(f" [{file_name}] scanning {n:,} common rows...") - t0 = time.monotonic() - for pk_tuple in common_keys: - b_line, b_raw = base_index[pk_tuple] - n_line, n_raw = new_index[pk_tuple] - b_dict = _parse_raw_line(b_raw, base_headers) - n_dict = _parse_raw_line(n_raw, new_headers) - field_changes = [ - FieldChange(field=col, base_value=b_dict[col], new_value=n_dict[col]) - for col in shared_cols - if _values_differ(b_dict.get(col, ""), n_dict.get(col, "")) - ] - if field_changes: - candidates.append((pk_tuple, field_changes, b_line, n_line)) + with base_opener() as f: + base_headers, base_count = _read_headers_and_count(f) + with new_opener() as f: + new_headers, new_count = _read_headers_and_count(f) - _trace( - f" [{file_name}] scan done in {time.monotonic() - t0:.1f}s — " - f"{len(candidates):,} modified" + missing_base = _missing_required_pk_columns(base_headers, pk_cols, file_name) + missing_new = _missing_required_pk_columns(new_headers, pk_cols, file_name) + reason = _missing_primary_key_reason(missing_base, missing_new) + _trace(f" [{file_name}] not compared — {reason.code}: {reason.message}") + + columns_added, columns_deleted, _ = _diff_columns(base_headers, new_headers) + return _build_not_compared_diff( + file_name=file_name, + reason=reason, + columns_added=columns_added, + columns_deleted=columns_deleted, + base_row_count=base_count, + new_row_count=new_count, + ) + + +def _build_duplicate_pk_not_compared( + file_name: str, + base_opener: Callable[[], TextIO], + new_opener: Callable[[], TextIO], + error: DuplicatePrimaryKeyError, +) -> tuple[FileDiff, FileSummary]: + """Build the ``not_compared`` result for a file with duplicate primary keys. + + Re-reads the headers (and counts data rows) of both feeds to preserve + column-level differences and accurate row counts. 
Used as the fallback when + indexing raises :class:`DuplicatePrimaryKeyError`, so one file with + ambiguous keys no longer aborts the whole diff. + """ + with base_opener() as f: + base_headers, base_count = _read_headers_and_count(f) + with new_opener() as f: + new_headers, new_count = _read_headers_and_count(f) + + reason = _duplicate_primary_key_reason( + error.primary_key, detail=error.detail, side=error.side + ) + _trace(f" [{file_name}] not compared — {reason.code}: {reason.message}") + + columns_added, columns_deleted, _ = _diff_columns(base_headers, new_headers) + return _build_not_compared_diff( + file_name=file_name, + reason=reason, + columns_added=columns_added, + columns_deleted=columns_deleted, + base_row_count=base_count, + new_row_count=new_count, ) - return candidates def _diff_file_modified( @@ -417,10 +664,18 @@ def _diff_file_modified( base_opener: Callable[[], TextIO], new_opener: Callable[[], TextIO], row_changes_cap: int | None, + id_churn_threshold: float, + not_compared_files: dict[str, str], + base_meta: FeedFileMeta | None = None, + new_meta: FeedFileMeta | None = None, + large_file_threshold_bytes: int | None = None, + use_duckdb: bool = False, + column_stats: bool = True, ) -> tuple[FileDiff, FileSummary]: """Compute the diff for a file present in both feeds.""" pk_def = get_primary_key(file_name) assert pk_def is not None # caller guarantees supported files only + pk_is_explicit = len(pk_def) > 0 # For files with an empty PK definition, use all base columns as the key. if len(pk_def) == 0: @@ -429,24 +684,58 @@ def _diff_file_modified( else: pk_cols = pk_def - # Build indexes (two streaming passes, one per file). 
- with base_opener() as f: - _trace(f" [{file_name}] indexing base feed...") - t0 = time.monotonic() - base_headers, base_index = _read_csv_index(f, pk_cols, file_name=file_name) - _trace( - f" [{file_name}] base index done: {len(base_index):,} " - f"rows in {time.monotonic() - t0:.1f}s" - ) + # Route very large files to the DuckDB backend (when eligible) so they can be + # diffed without holding every row in memory. On any failure, fall back to + # the in-memory engine below. + duckdb_result = _maybe_diff_modified_duckdb( + file_name=file_name, + pk_def=pk_def, + pk_is_explicit=pk_is_explicit, + base_meta=base_meta, + new_meta=new_meta, + large_file_threshold_bytes=large_file_threshold_bytes, + use_duckdb=use_duckdb, + row_changes_cap=row_changes_cap, + id_churn_threshold=id_churn_threshold, + not_compared_files=not_compared_files, + column_stats=column_stats, + ) + if duckdb_result is not None: + return duckdb_result + + # Build indexes (two streaming passes, one per file). A file that cannot be + # keyed/matched — because it is missing a required primary key column or has + # duplicate primary key values — is reported as not_compared (preserving + # column-level differences) instead of aborting the entire diff, so the rest + # of the feed is still compared. 
+ try: + with base_opener() as f: + _trace(f" [{file_name}] indexing base feed...") + t0 = time.monotonic() + base_headers, base_index = _read_csv_index( + f, pk_cols, file_name=file_name, side="base" + ) + _trace( + f" [{file_name}] base index done: {len(base_index):,} " + f"rows in {time.monotonic() - t0:.1f}s" + ) - with new_opener() as f: - _trace(f" [{file_name}] indexing new feed...") - t0 = time.monotonic() - new_headers, new_index = _read_csv_index(f, pk_cols, file_name=file_name) - _trace( - f" [{file_name}] new index done: {len(new_index):,} " - f"rows in {time.monotonic() - t0:.1f}s" + with new_opener() as f: + _trace(f" [{file_name}] indexing new feed...") + t0 = time.monotonic() + new_headers, new_index = _read_csv_index( + f, pk_cols, file_name=file_name, side="new" + ) + _trace( + f" [{file_name}] new index done: {len(new_index):,} " + f"rows in {time.monotonic() - t0:.1f}s" + ) + except MissingPrimaryKeyError: + return _build_missing_pk_not_compared( + file_name, base_opener, new_opener, pk_cols ) + except DuplicatePrimaryKeyError as exc: + return _build_duplicate_pk_not_compared(file_name, base_opener, new_opener, exc) # Column-level diff columns_added, columns_deleted, union_columns = _diff_columns( @@ -463,10 +752,55 @@ def _diff_file_modified( true_added = len(added_keys) true_deleted = len(deleted_keys) + # Generic "not compared" gate: when a file cannot be meaningfully diffed + # (here: regenerated primary keys), skip the expensive row scan and report + # the file as not_compared, preserving column-level differences. 
+ not_compared_reason = _detect_id_churn( + pk_cols=pk_cols, + pk_is_explicit=pk_is_explicit, + base_row_count=len(base_index), + new_row_count=len(new_index), + common_count=len(common_keys), + id_churn_threshold=id_churn_threshold, + ) + if not_compared_reason is not None: + _trace( + f" [{file_name}] not compared — {not_compared_reason.code}: " + f"{not_compared_reason.message}" + ) + return _build_not_compared_diff( + file_name=file_name, + reason=not_compared_reason, + columns_added=columns_added, + columns_deleted=columns_deleted, + base_row_count=len(base_index), + new_row_count=len(new_index), + ) + + # Foreign-key columns pointing at files that churned are unreliable here too: + # exclude them from the field-level comparison and report them as ignored. + ignored_columns, ignored_names = _compute_ignored_columns( + file_name, base_headers, new_headers, pk_cols, not_compared_files + ) + modified_candidates = _scan_modifications( - file_name, common_keys, base_index, new_index, base_headers, new_headers + file_name, + common_keys, + base_index, + new_index, + base_headers, + new_headers, + ignored_columns=ignored_names, ) true_modified = len(modified_candidates) + + # Per-column modification counts over *all* modified rows (independent of any + # row-changes cap), used to populate column_stats with true counts. 
+ column_mod_counts: dict[str, int] = {} + for _pk, field_changes, _b_line, _n_line in modified_candidates: + for fc in field_changes: + column_mod_counts[fc.field] = column_mod_counts.get(fc.field, 0) + 1 + _trace( f" [{file_name}] row diff summary — " f"added={true_added:,} " @@ -483,50 +817,49 @@ def _diff_file_modified( added_rows: list[RowAdded] = [] deleted_rows: list[RowDeleted] = [] modified_rows: list[RowModified] = [] - truncated: Truncated | None = None include_row_changes = row_changes_cap != 0 if include_row_changes: - cap = row_changes_cap # None = unlimited - - def _remaining(used: int) -> int | None: - return None if cap is None else max(0, cap - used) + added_limit, deleted_limit, modified_limit = _split_row_changes_cap( + row_changes_cap, true_added, true_deleted, true_modified + ) - # Fill added rows up to cap. - added_limit = _remaining(0) - for pk_tuple in list(added_keys)[:added_limit]: + # Fill added rows up to allocated cap (earliest rows first, by line). + added_order = sorted(added_keys, key=lambda k: new_index[k][0]) + for pk_tuple in added_order[:added_limit]: n_line, n_raw = new_index[pk_tuple] n_dict = _parse_raw_line(n_raw, new_headers) - identifier = {col: n_dict.get(col, "") for col in pk_cols} - raw_value = _compute_raw_value(n_dict, union_columns, new_header_set) + identifier, raw_value = _build_identifier_and_raw( + n_dict, pk_cols, union_columns, new_header_set + ) added_rows.append( RowAdded( identifier=identifier, raw_value=raw_value, new_line_number=n_line ) ) - # Fill deleted rows up to remaining cap. - deleted_limit = _remaining(len(added_rows)) - for pk_tuple in list(deleted_keys)[:deleted_limit]: + # Fill deleted rows up to allocated cap (earliest rows first, by line). 
+ deleted_order = sorted(deleted_keys, key=lambda k: base_index[k][0]) + for pk_tuple in deleted_order[:deleted_limit]: b_line, b_raw = base_index[pk_tuple] b_dict = _parse_raw_line(b_raw, base_headers) - identifier = {col: b_dict.get(col, "") for col in pk_cols} - raw_value = _compute_raw_value(b_dict, union_columns, base_header_set) + identifier, raw_value = _build_identifier_and_raw( + b_dict, pk_cols, union_columns, base_header_set + ) deleted_rows.append( RowDeleted( identifier=identifier, raw_value=raw_value, base_line_number=b_line ) ) - # Fill modified rows up to remaining cap. - modified_limit = _remaining(len(added_rows) + len(deleted_rows)) - for pk_tuple, field_changes, b_line, n_line in modified_candidates[ - :modified_limit - ]: + # Fill modified rows up to allocated cap (earliest rows first, by line). + modified_order = sorted(modified_candidates, key=lambda c: c[2]) + for pk_tuple, field_changes, b_line, n_line in modified_order[:modified_limit]: b_raw = base_index[pk_tuple][1] b_dict = _parse_raw_line(b_raw, base_headers) - identifier = {col: b_dict.get(col, "") for col in pk_cols} - raw_value = _compute_raw_value(b_dict, union_columns, base_header_set) + identifier, raw_value = _build_identifier_and_raw( + b_dict, pk_cols, union_columns, base_header_set + ) modified_rows.append( RowModified( identifier=identifier, @@ -537,44 +870,26 @@ def _remaining(used: int) -> int | None: ) ) - total_included = len(added_rows) + len(deleted_rows) + len(modified_rows) - total_true = true_added + true_deleted + true_modified - if cap is not None and total_true > cap: - truncated = Truncated( - is_truncated=True, omitted_count=total_true - total_included - ) - - row_changes: RowChanges | None = None - if include_row_changes: - # Use pk_cols for the primary_key field; for empty-pk files that means - # all base columns — which is correct (they form the composite key). 
- row_changes = RowChanges( - primary_key=pk_cols, - columns=union_columns, - added=added_rows, - deleted=deleted_rows, - modified=modified_rows, - ) - - file_diff = FileDiff( + return _assemble_modified_file_diff( file_name=file_name, - file_action="modified", + pk_cols=pk_cols, + union_columns=union_columns, columns_added=columns_added, columns_deleted=columns_deleted, - row_changes=row_changes, - truncated=truncated, - stats=FileStats( - total_rows_base=len(base_index), - total_rows_new=len(new_index), - columns_added_count=len(columns_added), - columns_deleted_count=len(columns_deleted), - rows_added_count=true_added, - rows_deleted_count=true_deleted, - rows_modified_count=true_modified, - ), + ignored_columns=ignored_columns, + added_rows=added_rows, + deleted_rows=deleted_rows, + modified_rows=modified_rows, + true_added=true_added, + true_deleted=true_deleted, + true_modified=true_modified, + total_base=len(base_index), + total_new=len(new_index), + cap=row_changes_cap, + include_row_changes=include_row_changes, + column_stats=column_stats, + column_mod_counts=column_mod_counts, ) - summary = FileSummary(file_name=file_name, status="modified") - return file_diff, summary # --------------------------------------------------------------------------- @@ -588,6 +903,11 @@ def diff_feeds( row_changes_cap_per_file: int | None = None, base_downloaded_at: datetime | None = None, new_downloaded_at: datetime | None = None, + id_churn_threshold: float = DEFAULT_ID_CHURN_THRESHOLD, + id_churn_thresholds: Mapping[str, float] | None = None, + files: Iterable[str] | None = None, + large_file_threshold_bytes: int | None = DEFAULT_LARGE_FILE_THRESHOLD_BYTES, + column_stats: bool = True, ) -> GtfsDiff: """Compare two GTFS feeds and return a structured :class:`GtfsDiff` result. @@ -598,11 +918,56 @@ def diff_feeds( * ``None`` — include all row changes (default). * ``0`` — omit all row-level detail (column diffs and counts still computed). 
- * ``N > 0`` — include up to *N* row changes per file (added first, then - deleted, then modified); a :class:`Truncated` record is - attached when the true count exceeds *N*. + * ``N > 0`` — include up to *N* row changes per file, with the budget + split fairly across the change types that have rows + (added / deleted / modified) so a little of each is + shown — one active type gets the whole cap, two split + it ~50/50, three ~33/33/33, with any leftover + redistributed to types that still have rows; a + :class:`Truncated` record is attached when the true + count exceeds *N*. base_downloaded_at: When the base feed was downloaded; defaults to *now*. new_downloaded_at: When the new feed was downloaded; defaults to *now*. + id_churn_threshold: + Global churn ratio (in ``[0.0, 1.0]``) above which a file is marked + ``not_compared`` because its primary keys appear to be regenerated + across versions. Applies to every file that has no more specific + override. + id_churn_thresholds: + Optional ``{file_name: threshold}`` mapping of caller-supplied + per-file overrides. These take precedence over both the built-in + :data:`gtfs_definitions.ID_CHURN_THRESHOLDS` defaults and + *id_churn_threshold*, letting callers tune individual files (e.g. + ``{"shapes.txt": 0.95}``) without mutating module state. + files: + Optional explicit list of GTFS file names to compare. For local + zip/directory feeds this *restricts* the comparison to just these + files. For ``http(s)://`` folder URLs it names the files to fetch + (e.g. ``["stops.txt", "trips.txt"]``); when omitted, every known + GTFS file (:data:`gtfs_definitions.SUPPORTED_FILES`) is probed for + existence and any that are present are compared. The same list is + applied to both the base and new feeds. + large_file_threshold_bytes: + Files whose larger side is at least this many (uncompressed) bytes + are routed to the DuckDB backend, which diffs them without holding + every row in memory. 
Defaults to + :data:`DEFAULT_LARGE_FILE_THRESHOLD_BYTES` (50 MB). Pass ``None`` to + disable DuckDB entirely (always use the in-memory engine). The + backend is used only when the file's size is cheaply known and its + primary key is a simple explicit key; otherwise the in-memory engine + is used. + column_stats: + When ``True`` (default), each modified file's ``stats.column_stats`` + is populated with per-column modification counts and percentages + (true counts, unaffected by *row_changes_cap_per_file*). Pass + ``False`` to omit the per-column breakdown from the output. The + file-level ``stats.rows_changed_percentage`` is always computed. + + Files are diffed in foreign-key dependency order (referenced files first). + When a referenced file is marked ``not_compared`` due to id churn, the + foreign-key column(s) pointing at it in any referencing file are excluded + from that file's field-level diff and reported under ``ignored_columns``. + The returned ``file_diffs`` / ``summary.files`` are sorted by file name. Returns: A fully-populated :class:`GtfsDiff` instance. @@ -615,11 +980,25 @@ def diff_feeds( file_summaries: list[FileSummary] = [] unsupported_files: list[UnsupportedFile] = [] - with _open_feed(base_path) as base_openers, _open_feed(new_path) as new_openers: + with ( + _open_feed(base_path, files) as base_handle, + _open_feed(new_path, files) as new_handle, + ): + base_openers = base_handle.openers + new_openers = new_handle.openers all_files = sorted(set(base_openers) | set(new_openers)) - _trace(f"Found {len(all_files)} file(s) to process: {', '.join(all_files)}") - - for file_name in all_files: + # Diff referenced (parent) files before the files that reference them so + # a parent's id_churn status is known when its children are compared. 
+ process_files = _processing_order(all_files) + _trace(f"Found {len(all_files)} file(s) to process: {', '.join(process_files)}") + + # Files marked not_compared because their key values are unreliable — + # regenerated identifiers (id_churn) or a missing required primary key. + # Foreign-key columns pointing at these files are ignored in any + # referencing file's diff. Maps file name -> not_compared reason code. + not_compared_files: dict[str, str] = {} + + for file_name in process_files: _trace(f"Processing {file_name}...") in_base = file_name in base_openers in_new = file_name in new_openers @@ -647,8 +1026,26 @@ def diff_feeds( base_opener=base_opener, new_opener=new_opener, row_changes_cap=row_changes_cap_per_file, + id_churn_threshold=get_id_churn_threshold( + file_name, + default=id_churn_threshold, + overrides=id_churn_thresholds, + ), + not_compared_files=not_compared_files, + base_meta=base_handle.meta.get(file_name), + new_meta=new_handle.meta.get(file_name), + large_file_threshold_bytes=large_file_threshold_bytes, + use_duckdb=large_file_threshold_bytes is not None, + column_stats=column_stats, ) + # Record any not_compared file (id_churn or missing primary key) so + # that files referencing this one can ignore the corresponding + # foreign-key column, the same way id-churn references are ignored. + reason = file_diff.not_compared_reason + if reason is not None: + not_compared_files[file_name] = reason.code + # Per spec: file_diffs[] contains only *changed* files. # Skip files present in both feeds with no actual changes. stats = file_diff.stats @@ -666,10 +1063,16 @@ def diff_feeds( file_diffs.append(file_diff) file_summaries.append(file_summary) + # Restore a deterministic, file-name-sorted output order (independent of the + # dependency-driven processing order). + file_diffs.sort(key=lambda fd: fd.file_name) + file_summaries.sort(key=lambda fs: fs.file_name) + # Build summary aggregates. 
files_added = sum(1 for s in file_summaries if s.status == "added") files_deleted = sum(1 for s in file_summaries if s.status == "deleted") files_modified = sum(1 for s in file_summaries if s.status == "modified") + files_not_compared = sum(1 for s in file_summaries if s.status == "not_compared") def _stat(attr: str) -> int: return sum(getattr(fd.stats, attr, 0) or 0 for fd in file_diffs if fd.stats) @@ -697,7 +1100,7 @@ def _stat(attr: str) -> int: files_added_count=files_added, files_deleted_count=files_deleted, files_modified_count=files_modified, - files_not_compared_count=0, + files_not_compared_count=files_not_compared, files=file_summaries, ) result = GtfsDiff(metadata=metadata, summary=summary, file_diffs=file_diffs) diff --git a/src/gtfs_diff/engine_duckdb.py b/src/gtfs_diff/engine_duckdb.py new file mode 100644 index 0000000..b188ed3 --- /dev/null +++ b/src/gtfs_diff/engine_duckdb.py @@ -0,0 +1,530 @@ +"""DuckDB-backed diff for very large GTFS files. + +This module mirrors the per-file "modified" diff produced by +:func:`gtfs_diff.engine._diff_file_modified`, but offloads the heavy +set-arithmetic and row scanning to DuckDB (a C++ engine that spills to disk) +so that files with millions of rows can be compared without loading every row +into a Python dict. + +It is intentionally only used for the *common* case of a file that exists in +both feeds and has a simple, fully-present, explicit primary key (the routing +guard in :mod:`gtfs_diff.engine` enforces this). All other cases — +added/deleted files, empty/optional primary keys, id-churn edge cases — stay on +the in-memory engine, whose behavior this module reuses helper-by-helper to +guarantee byte-identical output. 
+ +Parity is preserved by: + +* reading clean (BOM/whitespace-stripped) headers with the same reader the + in-memory engine uses, and aliasing DuckDB's positional columns to them; +* coalescing SQL ``NULL`` (DuckDB's representation of an empty CSV field) to + ``""`` everywhere, matching the in-memory padding behavior; +* using SQL only as a *superset* pre-filter for modified rows + (``IS DISTINCT FROM`` on the raw strings) and letting the in-memory + :func:`_values_differ` be the final arbiter (case-insensitive, trimmed, + numeric-aware); +* building the output models with the exact same helpers + (:func:`_compute_raw_value`, :func:`_compute_ignored_columns`, + :func:`_detect_id_churn`, :func:`_build_not_compared_diff`). +""" + +from __future__ import annotations + +import contextlib +import os +import shutil +import tempfile + +from .csv_utils import _is_url, _read_headers, _values_differ +from .diff_helpers import ( + _assemble_modified_file_diff, + _build_identifier_and_raw, + _build_not_compared_diff, + _compute_ignored_columns, + _detect_id_churn, + _diff_columns, + _duplicate_primary_key_reason, + _shared_columns, + _split_row_changes_cap, +) +from .models import ( + FieldChange, + FileDiff, + FileSummary, + RowAdded, + RowDeleted, + RowModified, +) +from .tracing import _trace + +# Read modest batches when streaming candidate rows so memory stays bounded +# regardless of how many rows differ. +_FETCH_BATCH = 10_000 + +# Environment variable that overrides the base directory DuckDB spills to when +# diffing large files. The system temp dir is often small (e.g. a few GB on +# ``/tmp``); operators comparing multi-gigabyte feeds can point this at a volume +# with enough room. Per-file spill subdirectories are still created beneath it +# and cleaned up after each file. +DUCKDB_TMPDIR_ENV = "GTFS_DIFF_DUCKDB_TMPDIR" + + +def _resolve_spill_base() -> str | None: + """Return the base directory for DuckDB's on-disk spill, or ``None``. 
+ + Honors the :data:`DUCKDB_TMPDIR_ENV` environment variable. A leading ``~`` is + expanded and the directory is created if it does not exist. When the variable + is unset or blank, ``None`` is returned so spill falls back to the system temp + directory (:func:`tempfile.mkdtemp` default). + + Raises: + OSError: If the override is set but the directory cannot be created + (e.g. a path component is a file, or permissions are insufficient). + Failing loudly is intentional: a misconfigured override should not be + silently ignored, which would send large spill to an unexpected disk. + """ + raw = os.environ.get(DUCKDB_TMPDIR_ENV) + if raw is None or not raw.strip(): + return None + base = os.path.expanduser(raw.strip()) + os.makedirs(base, exist_ok=True) + return base + + +def is_duckdb_available() -> bool: + """Return True if the optional ``duckdb`` dependency can be imported.""" + try: + import duckdb # noqa: F401 + except Exception: + return False + return True + + +def _q(identifier: str) -> str: + """Quote a SQL identifier (double-quote, escaping embedded double-quotes).""" + return '"' + identifier.replace('"', '""') + '"' + + +def _open_headers(path: str) -> list[str]: + """Read clean, stripped headers from a local CSV file (BOM-aware).""" + with open(path, encoding="utf-8-sig", newline="") as f: + return _read_headers(f) + + +def _read_headers_via_duckdb(con, path: str) -> list[str]: + """Read clean, stripped headers from a CSV *path* (local or URL) via DuckDB. + + Used for remote URLs so headers are obtained from a tiny ranged read through + the ``httpfs`` extension instead of downloading the whole file. The leading + BOM and surrounding whitespace are stripped to match :func:`_read_headers`. 
+ """ + probe = con.execute( + "SELECT * FROM read_csv(?, all_varchar=true, header=true, " + "null_padding=true, sample_size=-1) LIMIT 0", + [path], + ) + return [d[0].lstrip("\ufeff").strip() for d in probe.description] + + +def _create_table(con, table: str, path: str, clean_headers: list[str]) -> None: + """Load *path* into *table* in a single ``read_csv`` pass. + + The clean header names are imposed directly via DuckDB's ``column_names`` + parameter (with ``header=true`` so the file's own header row is skipped), so + no separate schema-probe read is needed. A 1-based ``__line`` column is added + (header is line 1, so the first data row is line 2) to reproduce the + in-memory engine's line numbers. Every value column is coalesced to ``''`` so + an empty CSV field never surfaces as NULL. + + *clean_headers* is derived from this same file's header (a local read or a + tiny remote probe), so its length matches the file's column count. + """ + select_cols = ", ".join(f"COALESCE({_q(c)}, '') AS {_q(c)}" for c in clean_headers) + con.execute( + f"CREATE TABLE {_q(table)} AS " + f"SELECT row_number() OVER () + 1 AS __line, {select_cols} " + f"FROM read_csv(?, all_varchar=true, header=true, " + f"column_names=?, null_padding=true, sample_size=-1)", + [path, clean_headers], + ) + + +def _has_duplicate_pk(con, table: str, pk_cols: list[str]) -> bool: + """Return True if *table* has more than one row for any primary-key value.""" + pk_list = ", ".join(_q(c) for c in pk_cols) + row = con.execute( + f"SELECT 1 FROM {_q(table)} GROUP BY {pk_list} HAVING COUNT(*) > 1 LIMIT 1" + ).fetchone() + return row is not None + + +def diff_modified_duckdb( + file_name: str, + base_path: str, + new_path: str, + pk_cols: list[str], + row_changes_cap: int | None, + id_churn_threshold: float, + not_compared_files: dict[str, str], + column_stats: bool = True, +) -> tuple[FileDiff, FileSummary]: + """Compute the "modified" diff for a large file using DuckDB. 
+ + The caller (``gtfs_diff.engine``) guarantees the file exists in both feeds + and that *pk_cols* is an explicit primary key fully present in both headers. + + ``base_path`` / ``new_path`` may be local filesystem paths *or* ``http(s)://`` + URLs. URLs are read in place via DuckDB's ``httpfs`` extension (HTTP range + requests), so the file is never fully downloaded by us. + + A file with a duplicate primary key is reported as ``not_compared`` (parity + with the in-memory engine) rather than raising, so one ambiguous file does + not abort the whole diff. + """ + import duckdb + + remote = _is_url(base_path) or _is_url(new_path) + + # Spill goes to a managed temp dir we delete ourselves, rather than DuckDB's + # default ``.tmp`` in the current working directory — large-file spill must + # never litter the caller's CWD and must be cleaned up after each file. The + # base directory honors the GTFS_DIFF_DUCKDB_TMPDIR override (see + # _resolve_spill_base); None falls back to the system temp dir. + spill_dir = tempfile.mkdtemp(prefix="gtfs_duckdb_", dir=_resolve_spill_base()) + con = None + try: + con = duckdb.connect() + con.execute("SET preserve_insertion_order=true") + con.execute("PRAGMA threads=1") # deterministic row_number() line numbers + con.execute("SET temp_directory=?", [spill_dir]) + if remote: + # Read remote files in place over HTTP range requests (no full + # download on our side — only the HEAD probes done by the engine). 
+ con.execute("INSTALL httpfs") + con.execute("LOAD httpfs") + + base_headers = ( + _read_headers_via_duckdb(con, base_path) + if _is_url(base_path) + else _open_headers(base_path) + ) + new_headers = ( + _read_headers_via_duckdb(con, new_path) + if _is_url(new_path) + else _open_headers(new_path) + ) + + columns_added, columns_deleted, union_columns = _diff_columns( + base_headers, new_headers + ) + base_header_set = set(base_headers) + new_header_set = set(new_headers) + + _trace(f" [{file_name}] (duckdb) loading base + new...") + _create_table(con, "base_t", base_path, base_headers) + _create_table(con, "new_t", new_path, new_headers) + + dup_base = _has_duplicate_pk(con, "base_t", pk_cols) + dup_new = _has_duplicate_pk(con, "new_t", pk_cols) + if dup_base or dup_new: + # Duplicate keys mean rows can't be uniquely matched. Report the file + # as not_compared (parity with the in-memory engine) instead of + # aborting the whole diff. Counts/headers are already loaded, so no + # re-read of the large file is needed. 
+ side = "both" if dup_base and dup_new else ("base" if dup_base else "new") + reason = _duplicate_primary_key_reason(pk_cols, side=side) + _trace( + f" [{file_name}] (duckdb) not compared — " + f"{reason.code}: {reason.message}" + ) + total_base = con.execute("SELECT COUNT(*) FROM base_t").fetchone()[0] + total_new = con.execute("SELECT COUNT(*) FROM new_t").fetchone()[0] + return _build_not_compared_diff( + file_name=file_name, + reason=reason, + columns_added=columns_added, + columns_deleted=columns_deleted, + base_row_count=total_base, + new_row_count=total_new, + ) + + total_base = con.execute("SELECT COUNT(*) FROM base_t").fetchone()[0] + total_new = con.execute("SELECT COUNT(*) FROM new_t").fetchone()[0] + + pk_join = " AND ".join(f"b.{_q(c)} = n.{_q(c)}" for c in pk_cols) + common_count = con.execute( + f"SELECT COUNT(*) FROM base_t b JOIN new_t n ON {pk_join}" + ).fetchone()[0] + true_added = total_new - common_count + true_deleted = total_base - common_count + + _trace( + f" [{file_name}] (duckdb) counts — base={total_base:,} new={total_new:,} " + f"common={common_count:,} added={true_added:,} deleted={true_deleted:,}" + ) + + # id-churn gate (identical policy to the in-memory engine). 
+ not_compared_reason = _detect_id_churn( + pk_cols=pk_cols, + pk_is_explicit=True, + base_row_count=total_base, + new_row_count=total_new, + common_count=common_count, + id_churn_threshold=id_churn_threshold, + ) + if not_compared_reason is not None: + _trace( + f" [{file_name}] (duckdb) not compared — " + f"{not_compared_reason.code}: {not_compared_reason.message}" + ) + return _build_not_compared_diff( + file_name=file_name, + reason=not_compared_reason, + columns_added=columns_added, + columns_deleted=columns_deleted, + base_row_count=total_base, + new_row_count=total_new, + ) + + ignored_columns, ignored_names = _compute_ignored_columns( + file_name, base_headers, new_headers, pk_cols, not_compared_files + ) + shared_cols = _shared_columns(base_headers, new_header_set, ignored_names) + + include_row_changes = row_changes_cap != 0 + cap = row_changes_cap + + added_rows: list[RowAdded] = [] + deleted_rows: list[RowDeleted] = [] + + # --- Modified: SQL pre-filters to raw-different rows; Python decides. 
--- + true_modified, modified_rows, column_mod_counts = _scan_modified( + con, + file_name=file_name, + pk_cols=pk_cols, + base_headers=base_headers, + shared_cols=shared_cols, + union_columns=union_columns, + base_header_set=base_header_set, + collect=include_row_changes, + cap=cap, + ) + + if include_row_changes: + added_limit, deleted_limit, modified_limit = _split_row_changes_cap( + cap, true_added, true_deleted, true_modified + ) + added_rows = _collect_added( + con, + pk_cols=pk_cols, + new_headers=new_headers, + union_columns=union_columns, + new_header_set=new_header_set, + limit=added_limit, + ) + deleted_rows = _collect_deleted( + con, + pk_cols=pk_cols, + base_headers=base_headers, + union_columns=union_columns, + base_header_set=base_header_set, + limit=deleted_limit, + ) + if modified_limit is not None: + modified_rows = modified_rows[:modified_limit] + finally: + # Drop the per-file tables so DuckDB releases their buffers, then close + # the connection (which also frees everything) and remove the spill dir. + # Tables are dropped explicitly so the intent is clear and so memory is + # reclaimed promptly even if the connection were ever reused. The spill + # dir is removed unconditionally — even if ``duckdb.connect()`` itself + # raised — so a failed connection never leaves an empty folder behind. 
+ if con is not None: + with contextlib.suppress(Exception): + con.execute("DROP TABLE IF EXISTS base_t") + con.execute("DROP TABLE IF EXISTS new_t") + con.close() + shutil.rmtree(spill_dir, ignore_errors=True) + + _trace( + f" [{file_name}] (duckdb) row diff summary — " + f"added={true_added:,} deleted={true_deleted:,} modified={true_modified:,}" + ) + + return _assemble_modified_file_diff( + file_name=file_name, + pk_cols=pk_cols, + union_columns=union_columns, + columns_added=columns_added, + columns_deleted=columns_deleted, + ignored_columns=ignored_columns, + added_rows=added_rows, + deleted_rows=deleted_rows, + modified_rows=modified_rows, + true_added=true_added, + true_deleted=true_deleted, + true_modified=true_modified, + total_base=total_base, + total_new=total_new, + cap=cap, + include_row_changes=include_row_changes, + column_stats=column_stats, + column_mod_counts=column_mod_counts, + ) + + +def _scan_modified( + con, + file_name: str, + pk_cols: list[str], + base_headers: list[str], + shared_cols: list[str], + union_columns: list[str], + base_header_set: set[str], + collect: bool, + cap: int | None, +) -> tuple[int, list[RowModified], dict[str, int]]: + """Stream raw-different common rows; apply ``_values_differ`` as final arbiter. + + Returns the *true* modified count, (when *collect*) up to *cap* RowModified + records, and per-column modification counts over the full modified set + (true counts, independent of *cap*). Streaming in batches keeps memory + bounded no matter how many rows changed. 
+ """ + if not shared_cols: + return 0, [], {} + + pk_join = " AND ".join(f"b.{_q(c)} = n.{_q(c)}" for c in pk_cols) + distinct_pred = " OR ".join( + f"b.{_q(c)} IS DISTINCT FROM n.{_q(c)}" for c in shared_cols + ) + + select_parts = ["b.__line AS b_line", "n.__line AS n_line"] + select_parts += [f"b.{_q(c)} AS {_q('b__' + c)}" for c in base_headers] + select_parts += [f"n.{_q(c)} AS {_q('n__' + c)}" for c in shared_cols] + sql = ( + f"SELECT {', '.join(select_parts)} " + f"FROM base_t b JOIN new_t n ON {pk_join} " + f"WHERE {distinct_pred}" + ) + + cur = con.execute(sql) + col_names = [d[0] for d in cur.description] + idx = {name: i for i, name in enumerate(col_names)} + + true_modified = 0 + modified_rows: list[RowModified] = [] + column_mod_counts: dict[str, int] = {} + while True: + batch = cur.fetchmany(_FETCH_BATCH) + if not batch: + break + for row in batch: + field_changes = [ + FieldChange( + field=col, + base_value=row[idx["b__" + col]], + new_value=row[idx["n__" + col]], + ) + for col in shared_cols + if _values_differ(row[idx["b__" + col]], row[idx["n__" + col]]) + ] + if not field_changes: + continue + true_modified += 1 + for fc in field_changes: + column_mod_counts[fc.field] = column_mod_counts.get(fc.field, 0) + 1 + if not collect or (cap is not None and len(modified_rows) >= cap): + continue + b_dict = {col: row[idx["b__" + col]] for col in base_headers} + identifier, raw_value = _build_identifier_and_raw( + b_dict, pk_cols, union_columns, base_header_set + ) + modified_rows.append( + RowModified( + identifier=identifier, + raw_value=raw_value, + base_line_number=row[idx["b_line"]], + new_line_number=row[idx["n_line"]], + field_changes=field_changes, + ) + ) + return true_modified, modified_rows, column_mod_counts + + +def _collect_added( + con, + pk_cols: list[str], + new_headers: list[str], + union_columns: list[str], + new_header_set: set[str], + limit: int | None, +) -> list[RowAdded]: + """Fetch up to *limit* rows present only in the new 
feed.""" + if limit == 0: + return [] + pk_eq = " AND ".join(f"b.{_q(c)} = n.{_q(c)}" for c in pk_cols) + cols = ", ".join(f"n.{_q(c)} AS {_q(c)}" for c in new_headers) + sql = ( + f"SELECT n.__line AS __line, {cols} FROM new_t n " + f"WHERE NOT EXISTS (SELECT 1 FROM base_t b WHERE {pk_eq})" + f" ORDER BY n.__line" + ) + if limit is not None: + sql += f" LIMIT {int(limit)}" + + cur = con.execute(sql) + idx = {d[0]: i for i, d in enumerate(cur.description)} + rows: list[RowAdded] = [] + for row in cur.fetchall(): + n_dict = {col: row[idx[col]] for col in new_headers} + identifier, raw_value = _build_identifier_and_raw( + n_dict, pk_cols, union_columns, new_header_set + ) + rows.append( + RowAdded( + identifier=identifier, + raw_value=raw_value, + new_line_number=row[idx["__line"]], + ) + ) + return rows + + +def _collect_deleted( + con, + pk_cols: list[str], + base_headers: list[str], + union_columns: list[str], + base_header_set: set[str], + limit: int | None, +) -> list[RowDeleted]: + """Fetch up to *limit* rows present only in the base feed.""" + if limit == 0: + return [] + pk_eq = " AND ".join(f"n.{_q(c)} = b.{_q(c)}" for c in pk_cols) + cols = ", ".join(f"b.{_q(c)} AS {_q(c)}" for c in base_headers) + sql = ( + f"SELECT b.__line AS __line, {cols} FROM base_t b " + f"WHERE NOT EXISTS (SELECT 1 FROM new_t n WHERE {pk_eq})" + f" ORDER BY b.__line" + ) + if limit is not None: + sql += f" LIMIT {int(limit)}" + + cur = con.execute(sql) + idx = {d[0]: i for i, d in enumerate(cur.description)} + rows: list[RowDeleted] = [] + for row in cur.fetchall(): + b_dict = {col: row[idx[col]] for col in base_headers} + identifier, raw_value = _build_identifier_and_raw( + b_dict, pk_cols, union_columns, base_header_set + ) + rows.append( + RowDeleted( + identifier=identifier, + raw_value=raw_value, + base_line_number=row[idx["__line"]], + ) + ) + return rows diff --git a/src/gtfs_diff/gtfs_definitions.py b/src/gtfs_diff/gtfs_definitions.py index 0190b51..7a42844 100644 --- 
a/src/gtfs_diff/gtfs_definitions.py +++ b/src/gtfs_diff/gtfs_definitions.py @@ -13,6 +13,8 @@ Callers can check ``get_primary_key`` returning None to detect unsupported files. """ +from collections.abc import Mapping + GTFS_PRIMARY_KEYS: dict[str, list[str]] = { # Core required files "agency.txt": ["agency_id"], @@ -65,11 +67,19 @@ "route_networks.txt": ["route_id"], # Fares v2 "fare_media.txt": ["fare_media_id"], - "fare_products.txt": ["fare_product_id"], - "fare_leg_rules.txt": ["leg_group_id"], # partial key, best effort + "fare_products.txt": ["fare_product_id", "rider_category_id", "fare_media_id"], + "fare_leg_rules.txt": [ + "network_id", + "from_area_id", + "to_area_id", + "from_timeframe_group_id", + "to_timeframe_group_id", + "fare_product_id", + ], "fare_transfer_rules.txt": [ "from_leg_group_id", "to_leg_group_id", + "fare_product_id", "transfer_count", "duration_limit", ], @@ -85,7 +95,240 @@ SUPPORTED_FILES: set[str] = set(GTFS_PRIMARY_KEYS) +# Primary-key columns that are only *conditionally* present in a file. When such +# a column is absent from a feed's headers it is treated as a null (empty) value +# for every row during the compare step — the column stays in the effective key +# so both feeds are compared against an identical key structure — rather than +# raising MissingPrimaryKeyError. Only a *mandatory* (always-"Required") missing +# primary-key column is an error. +# +# A primary-key column is listed here when the GTFS Schedule reference +# (https://gtfs.org/documentation/schedule/reference/) gives it any presence +# other than "Required" (i.e. Optional, Conditionally Required, Recommended, or +# Conditionally Forbidden). translations.txt is the canonical case: a translation +# is identified by EITHER ``record_id`` (optionally plus ``record_sub_id``) OR +# ``field_value`` — mutually exclusive, all conditionally required — so a given +# feed only carries the subset it uses. 
+OPTIONAL_PRIMARY_KEY_COLUMNS: dict[str, set[str]] = { + # agency_id: Conditionally Required (required only with multiple agencies). + "agency.txt": {"agency_id"}, + # route_id, origin_id, destination_id, contains_id: all Optional. + "fare_rules.txt": {"route_id", "origin_id", "destination_id", "contains_id"}, + # All six columns are Conditionally Required or Optional. + "transfers.txt": { + "from_stop_id", + "to_stop_id", + "from_route_id", + "to_route_id", + "from_trip_id", + "to_trip_id", + }, + # record_id, record_sub_id, field_value: all Conditionally Required. + "translations.txt": {"record_id", "record_sub_id", "field_value"}, + # attribution_id: Optional. + "attributions.txt": {"attribution_id"}, + # rider_category_id, fare_media_id: both Optional (fare_product_id is required). + "fare_products.txt": {"rider_category_id", "fare_media_id"}, + # network_id, from_area_id, to_area_id, from/to_timeframe_group_id: all Optional + # (fare_product_id is required). + "fare_leg_rules.txt": { + "network_id", + "from_area_id", + "to_area_id", + "from_timeframe_group_id", + "to_timeframe_group_id", + }, + # All five columns are Optional or Conditionally Forbidden. + "fare_transfer_rules.txt": { + "from_leg_group_id", + "to_leg_group_id", + "fare_product_id", + "transfer_count", + "duration_limit", + }, + # start_time, end_time: Conditionally Required. + "timeframes.txt": {"start_time", "end_time"}, +} + + def get_primary_key(file_name: str) -> list[str] | None: """Return the primary key columns for a supported GTFS file, or None if unsupported.""" return GTFS_PRIMARY_KEYS.get(file_name) + + +def get_optional_primary_key_columns(file_name: str) -> set[str]: + """Return the conditionally-present primary-key columns for *file_name*. 
+ + When absent from a feed's headers, these columns are treated as null (empty) + values for every row during the compare step — the full primary key is kept + so both feeds compare against an identical key structure — rather than + triggering MissingPrimaryKeyError. Returns an empty set for files whose + primary-key columns are all mandatory. + """ + return OPTIONAL_PRIMARY_KEY_COLUMNS.get(file_name, set()) + + +# --------------------------------------------------------------------------- +# Foreign-key relationships ("file hierarchy") +# --------------------------------------------------------------------------- +# +# Maps a file to its foreign keys: ``{column: (referenced_file, ...)}``. A column +# may reference more than one file (e.g. ``service_id`` is defined in either +# calendar.txt or calendar_dates.txt). These relationships serve two purposes: +# +# 1. Ordering — referenced ("parent") files are diffed before the files that +# reference them, so a parent's not_compared status is known in advance. +# 2. Ignored columns — when a parent file is not compared because its primary +# key is unreliable (id_churn), the regenerated key values also appear in +# the child's foreign-key column. Comparing that column would surface pure +# churn noise, so it is excluded from the diff and reported under +# ``ignored_columns`` instead. +# +# Only well-established GTFS relationships are listed. Self-references (e.g. +# stops.parent_station → stops) are intentionally omitted: they add cycles to the +# ordering graph and are moot, since a self-referencing file that churns is +# already reported as not_compared in full. 
GTFS_FOREIGN_KEYS: dict[str, dict[str, tuple[str, ...]]] = {
    "stops.txt": {
        "level_id": ("levels.txt",),
    },
    "routes.txt": {
        "agency_id": ("agency.txt",),
    },
    "trips.txt": {
        "route_id": ("routes.txt",),
        "service_id": ("calendar.txt", "calendar_dates.txt"),
        "shape_id": ("shapes.txt",),
    },
    "stop_times.txt": {
        "trip_id": ("trips.txt",),
        "stop_id": ("stops.txt",),
        "location_group_id": ("location_groups.txt",),
        "pickup_booking_rule_id": ("booking_rules.txt",),
        "drop_off_booking_rule_id": ("booking_rules.txt",),
    },
    "calendar_dates.txt": {
        "service_id": ("calendar.txt",),
    },
    "frequencies.txt": {
        "trip_id": ("trips.txt",),
    },
    "transfers.txt": {
        "from_stop_id": ("stops.txt",),
        "to_stop_id": ("stops.txt",),
        "from_route_id": ("routes.txt",),
        "to_route_id": ("routes.txt",),
        "from_trip_id": ("trips.txt",),
        "to_trip_id": ("trips.txt",),
    },
    "fare_attributes.txt": {
        "agency_id": ("agency.txt",),
    },
    "timeframes.txt": {
        "service_id": ("calendar.txt", "calendar_dates.txt"),
    },
    "fare_rules.txt": {
        "fare_id": ("fare_attributes.txt",),
        "route_id": ("routes.txt",),
    },
    "stop_areas.txt": {
        "area_id": ("areas.txt",),
        "stop_id": ("stops.txt",),
    },
    "route_networks.txt": {
        "network_id": ("networks.txt",),
        "route_id": ("routes.txt",),
    },
    "pathways.txt": {
        "from_stop_id": ("stops.txt",),
        "to_stop_id": ("stops.txt",),
    },
    "location_group_stops.txt": {
        "location_group_id": ("location_groups.txt",),
        "stop_id": ("stops.txt",),
    },
    "fare_products.txt": {
        "rider_category_id": ("rider_categories.txt",),
        "fare_media_id": ("fare_media.txt",),
    },
    "fare_leg_rules.txt": {
        "network_id": ("networks.txt",),
        "from_area_id": ("areas.txt",),
        "to_area_id": ("areas.txt",),
        "from_timeframe_group_id": ("timeframes.txt",),
        "to_timeframe_group_id": ("timeframes.txt",),
        "fare_product_id": ("fare_products.txt",),
    },
    "fare_transfer_rules.txt": {
        "fare_product_id": ("fare_products.txt",),
    },
}


def get_foreign_keys(file_name: str) -> dict[str, tuple[str, ...]]:
    """Return ``{column: (referenced_file, ...)}`` foreign keys for *file_name*.

    Returns an empty dict for files with no known foreign keys. The result is a
    new dict on every call (the tuple values are immutable), so callers may add
    or drop columns without altering ``GTFS_FOREIGN_KEYS``.
    """
    return dict(GTFS_FOREIGN_KEYS.get(file_name, {}))


# ---------------------------------------------------------------------------
# Primary-key churn ("generated id") detection
# ---------------------------------------------------------------------------
#
# Some GTFS producers regenerate primary-key values on every export (e.g.
# shape_id in shapes.txt, trip_id in trips.txt, service_id in calendar*.txt).
# When that happens a primary-key based comparison reports nearly every row as
# both added and deleted, which is misleading. The engine measures the "churn
# ratio" as the complement of the overlap coefficient — the fraction of the
# SMALLER feed's primary keys that have no match in the other feed:
#
#     churn_ratio = 1 - |common| / min(|base|, |new|)
#
# and, when it meets or exceeds the file's threshold, marks the file as
# ``not_compared`` with reason code ``id_churn`` instead of emitting a diff.
# Dividing by ``min`` (rather than ``max`` or the union) keeps the metric robust
# to bulk additions/deletions, which preserve a high overlap and so are NOT
# mistaken for regenerated ids. See docs/architecture.md for the rationale.
#
# Thresholds are expressed as a churn ratio in the range [0.0, 1.0]: 0.0 flags
# any unmatched key, 1.0 only flags a file whose keys are entirely disjoint.

# Default churn ratio at or above which a file is considered uncomparable. Chosen
# to be conservative so that ordinary large updates are still reported as a normal
# diff; only near-total key turnover (the signature of regenerated ids) trips it.
DEFAULT_ID_CHURN_THRESHOLD: float = 0.7

# Built-in per-file overrides for files whose primary keys are known to be
# volatile.
This is the project's baseline domain knowledge; callers can layer +# their own per-file overrides on top at call time (see ``get_id_churn_threshold`` +# and ``diff_feeds``) without mutating this module-level mapping. +ID_CHURN_THRESHOLDS: dict[str, float] = {} + +# Minimum number of rows required on *both* feed sides before id-churn detection +# runs. Below this, near-total key turnover is not a reliable signal of +# regenerated ids (it is just as likely an ordinary edit to a tiny file). +MIN_ROWS_FOR_ID_CHURN_DETECTION: int = 50 + + +def get_id_churn_threshold( + file_name: str, + default: float = DEFAULT_ID_CHURN_THRESHOLD, + overrides: Mapping[str, float] | None = None, +) -> float: + """Return the id-churn threshold for *file_name*. + + Resolution order, highest precedence first: + + 1. *overrides* — a caller-supplied ``{file_name: threshold}`` mapping (e.g. + passed to :func:`gtfs_diff.engine.diff_feeds`). Lets callers tune a single + file without touching module state. + 2. :data:`ID_CHURN_THRESHOLDS` — the project's built-in per-file defaults. + 3. *default* — the global fallback (``DEFAULT_ID_CHURN_THRESHOLD`` unless the + caller overrides the global threshold). 
+ """ + if overrides is not None and file_name in overrides: + return overrides[file_name] + return ID_CHURN_THRESHOLDS.get(file_name, default) diff --git a/src/gtfs_diff/tracing.py b/src/gtfs_diff/tracing.py new file mode 100644 index 0000000..3303c37 --- /dev/null +++ b/src/gtfs_diff/tracing.py @@ -0,0 +1,18 @@ +"""Lightweight progress tracing shared across the diff engine.""" + +from __future__ import annotations + +import sys +from datetime import datetime + + +def _trace(msg: str) -> None: + """Print a timestamped progress message with current RSS to stderr.""" + import psutil + + rss_mb = psutil.Process().memory_info().rss / 1024 / 1024 + print( + f"[gtfs-diff {datetime.now().strftime('%H:%M:%S')} {rss_mb:.0f}MB] {msg}", + file=sys.stderr, + flush=True, + ) diff --git a/tests/test_cli.py b/tests/test_cli.py index e78eece..aa006c0 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -7,6 +7,7 @@ from click.testing import CliRunner +from gtfs_diff import engine_duckdb from gtfs_diff.cli import main from tests.helpers import write_zip @@ -85,8 +86,13 @@ def test_cap_stored_in_metadata(self, tmp_path: Path): assert data["metadata"]["row_changes_cap_per_file"] == 5 -class TestMissingPrimaryKeyError: - def test_exits_nonzero_on_missing_pk_column(self, tmp_path: Path): +class TestMissingPrimaryKeyNotCompared: + @staticmethod + def _stops_file_diff(result): + data = json.loads(result.stdout) + return next(fd for fd in data["file_diffs"] if fd["file_name"] == "stops.txt") + + def test_succeeds_on_missing_pk_column_in_base(self, tmp_path: Path): base = write_zip( tmp_path / "base.zip", { @@ -102,28 +108,33 @@ def test_exits_nonzero_on_missing_pk_column(self, tmp_path: Path): ) runner = CliRunner() result = runner.invoke(main, [str(base), str(new)]) - assert result.exit_code == 1 + assert result.exit_code == 0, result.output + fd = self._stops_file_diff(result) + assert fd["file_action"] == "not_compared" + assert fd["not_compared_reason"]["code"] == 
"missing_primary_key" - def test_error_message_names_file_and_missing_column(self, tmp_path: Path): + def test_succeeds_on_missing_pk_column_in_new(self, tmp_path: Path): base = write_zip( tmp_path / "base.zip", { - "stops.txt": "stop_name,stop_lat,stop_lon\n" - "Stop One,1.0,2.0\n", # stop_id absent + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", }, ) new = write_zip( tmp_path / "new.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + "stops.txt": "stop_name,stop_lat,stop_lon\n" + "Stop One,1.0,2.0\n", # stop_id absent }, ) runner = CliRunner() result = runner.invoke(main, [str(base), str(new)]) - assert "stops.txt" in result.output - assert "stop_id" in result.output + assert result.exit_code == 0, result.output + fd = self._stops_file_diff(result) + assert fd["file_action"] == "not_compared" + assert fd["not_compared_reason"]["code"] == "missing_primary_key" - def test_error_message_includes_headers_found(self, tmp_path: Path): + def test_not_compared_reason_names_missing_column(self, tmp_path: Path): base = write_zip( tmp_path / "base.zip", { @@ -139,7 +150,11 @@ def test_error_message_includes_headers_found(self, tmp_path: Path): ) runner = CliRunner() result = runner.invoke(main, [str(base), str(new)]) - assert "stop_name" in result.output + assert result.exit_code == 0, result.output + fd = self._stops_file_diff(result) + assert fd["file_action"] == "not_compared" + assert fd["not_compared_reason"]["code"] == "missing_primary_key" + assert "stop_id" in fd["not_compared_reason"]["message"] class TestInvalidPath: @@ -169,3 +184,60 @@ def test_no_pretty_output_is_compact(self, tmp_path: Path): # Should be valid JSON even without pretty-printing data = json.loads(result.stdout) assert "metadata" in data + + +def _make_feed_dirs(tmp_path: Path): + base = tmp_path / "base_dir" + new = tmp_path / "new_dir" + base.mkdir() + new.mkdir() + (base / "stops.txt").write_text(TINY_BASE_FILES["stops.txt"], encoding="utf-8") + (new / 
"stops.txt").write_text(TINY_NEW_FILES["stops.txt"], encoding="utf-8") + return base, new + + +def _json_without_metadata(output: str) -> dict: + data = json.loads(output) + data.pop("metadata", None) + return data + + +class TestDuckDBOptions: + def test_no_duckdb_is_accepted_and_disables_backend( + self, tmp_path: Path, monkeypatch + ): + base, new = _make_feed_dirs(tmp_path) + + def fail_if_called(*args, **kwargs): + raise AssertionError("DuckDB should not be called") + + monkeypatch.setattr(engine_duckdb, "diff_modified_duckdb", fail_if_called) + result = CliRunner().invoke(main, [str(base), str(new), "--no-duckdb"]) + assert result.exit_code == 0, result.output + data = json.loads(result.stdout) + assert data["summary"]["total_changes"] == 1 + assert data["file_diffs"][0]["stats"]["rows_added_count"] == 1 + + def test_large_file_threshold_zero_is_accepted_and_uses_duckdb( + self, tmp_path: Path, monkeypatch + ): + base, new = _make_feed_dirs(tmp_path) + original = engine_duckdb.diff_modified_duckdb + calls = [] + + def record_call(*args, **kwargs): + calls.append(kwargs["file_name"]) + return original(*args, **kwargs) + + monkeypatch.setattr(engine_duckdb, "diff_modified_duckdb", record_call) + duck = CliRunner().invoke( + main, [str(base), str(new), "--large-file-threshold-mb", "0"] + ) + assert duck.exit_code == 0, duck.output + assert calls == ["stops.txt"] + + normal = CliRunner().invoke(main, [str(base), str(new), "--no-duckdb"]) + assert normal.exit_code == 0, normal.output + assert _json_without_metadata(duck.stdout) == _json_without_metadata( + normal.stdout + ) diff --git a/tests/test_engine.py b/tests/test_engine.py index 383ae6f..0c9bbac 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -2,11 +2,38 @@ from __future__ import annotations +import io +import json +import os +import urllib.error from pathlib import Path import pytest -from gtfs_diff.engine import MissingPrimaryKeyError, diff_feeds +from gtfs_diff import engine_duckdb +from 
gtfs_diff.csv_utils import DuplicatePrimaryKeyError as CsvDuplicatePrimaryKeyError +from gtfs_diff.diff_helpers import _duplicate_primary_key_reason, _split_row_changes_cap +from gtfs_diff.engine import ( + DuplicatePrimaryKeyError, + FeedFileMeta, + MissingPrimaryKeyError, + _eligible_for_duckdb, + _http_exists, + _http_exists_via_get, + _is_url, + _join_url, + _materialized_path, + _maybe_diff_modified_duckdb, + _open_remote_feed, + _read_csv_index, + diff_feeds, +) +from gtfs_diff.engine_duckdb import DUCKDB_TMPDIR_ENV, _resolve_spill_base +from gtfs_diff.gtfs_definitions import ( + get_foreign_keys, + get_optional_primary_key_columns, + get_primary_key, +) from gtfs_diff.models import GtfsDiff from tests.helpers import write_zip @@ -354,6 +381,264 @@ def test_swapped_column_order_is_not_a_change(self, tmp_path: Path): assert result.summary.total_changes == 0 +class _UrlopenResponse: + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + return False + + +class TestRemoteAndFileFilter: + def test_local_files_filter_limits_compared_files(self, tmp_path: Path): + base = tmp_path / "base" + new = tmp_path / "new" + base.mkdir() + new.mkdir() + + routes = "route_id,route_short_name\nR1,Route 1\n" + (base / "routes.txt").write_text(routes, encoding="utf-8") + (new / "routes.txt").write_text(routes, encoding="utf-8") + (base / "stops.txt").write_text( + STOPS_HEADER + "S1,Stop One,1.0,2.0\n", encoding="utf-8" + ) + (new / "stops.txt").write_text( + STOPS_HEADER + "S1,Stop One Renamed,1.0,2.0\n", encoding="utf-8" + ) + + routes_only = diff_feeds(base, new, files=["routes.txt"]) + assert "stops.txt" not in {fd.file_name for fd in routes_only.file_diffs} + + stops_only = diff_feeds(base, new, files=["stops.txt"]) + stops_diff = _get_file_diff(stops_only, "stops.txt") + assert stops_diff.file_action == "modified" + + def test_remote_modified_file(self, monkeypatch: pytest.MonkeyPatch): + store = { + "https://x/base/stops.txt": 
STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + "https://x/new/stops.txt": STOPS_HEADER + "S1,Stop One Renamed,1.0,2.0\n", + } + monkeypatch.setattr("gtfs_diff.engine._http_exists", lambda url: url in store) + monkeypatch.setattr( + "gtfs_diff.engine._http_get_text", lambda url: io.StringIO(store[url]) + ) + + result = diff_feeds("https://x/base", "https://x/new", files=["stops.txt"]) + + fd = _get_file_diff(result, "stops.txt") + assert fd.file_action == "modified" + assert len(fd.row_changes.modified) == 1 + mod = fd.row_changes.modified[0] + assert mod.identifier == {"stop_id": "S1"} + stop_name_change = next( + fc for fc in mod.field_changes if fc.field == "stop_name" + ) + assert stop_name_change.base_value == "Stop One" + assert stop_name_change.new_value == "Stop One Renamed" + + def test_remote_added_and_deleted_files(self, monkeypatch: pytest.MonkeyPatch): + store = { + "https://x/base/stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + "https://x/new/routes.txt": "route_id,route_short_name\nR1,Route 1\n", + } + monkeypatch.setattr("gtfs_diff.engine._http_exists", lambda url: url in store) + monkeypatch.setattr( + "gtfs_diff.engine._http_get_text", lambda url: io.StringIO(store[url]) + ) + + result = diff_feeds( + "https://x/base", + "https://x/new", + files=["stops.txt", "routes.txt"], + ) + + assert _get_file_diff(result, "routes.txt").file_action == "added" + assert _get_file_diff(result, "stops.txt").file_action == "deleted" + + def test_remote_without_files_probes_known_files( + self, monkeypatch: pytest.MonkeyPatch + ): + routes = "route_id,route_short_name\nR1,Route 1\n" + store = { + "https://x/base/stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + "https://x/new/stops.txt": STOPS_HEADER + "S1,Stop One Renamed,1.0,2.0\n", + "https://x/base/routes.txt": routes, + "https://x/new/routes.txt": routes, + } + monkeypatch.setattr("gtfs_diff.engine._http_exists", lambda url: url in store) + monkeypatch.setattr( + "gtfs_diff.engine._http_get_text", lambda 
url: io.StringIO(store[url]) + ) + + result = diff_feeds("https://x/base", "https://x/new") + + assert _get_file_diff(result, "stops.txt").file_action == "modified" + compared_files = {fd.file_name for fd in result.file_diffs} + assert "routes.txt" not in compared_files + + def test_remote_without_files_skips_absent_known_files( + self, monkeypatch: pytest.MonkeyPatch + ): + store = { + "https://x/base/stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + "https://x/new/stops.txt": STOPS_HEADER + "S1,Stop One Renamed,1.0,2.0\n", + } + monkeypatch.setattr("gtfs_diff.engine._http_exists", lambda url: url in store) + monkeypatch.setattr( + "gtfs_diff.engine._http_get_text", lambda url: io.StringIO(store[url]) + ) + + result = diff_feeds("https://x/base", "https://x/new") + + assert _get_file_diff(result, "stops.txt").file_action == "modified" + compared_files = {fd.file_name for fd in result.file_diffs} + assert "trips.txt" not in compared_files + + def test_remote_explicit_files_limits_probing( + self, monkeypatch: pytest.MonkeyPatch + ): + store = { + "https://x/base/stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + "https://x/new/stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + "https://x/base/routes.txt": "route_id,route_short_name\nR1,Route 1\n", + "https://x/new/routes.txt": ( + "route_id,route_short_name\nR1,Route 1 Renamed\n" + ), + } + probed: list[str] = [] + + def exists(url: str) -> bool: + probed.append(url) + return url in store + + monkeypatch.setattr("gtfs_diff.engine._http_exists", exists) + monkeypatch.setattr( + "gtfs_diff.engine._http_get_text", lambda url: io.StringIO(store[url]) + ) + + result = diff_feeds("https://x/base", "https://x/new", files=["stops.txt"]) + + assert result.file_diffs == [] + assert probed == ["https://x/base/stops.txt", "https://x/new/stops.txt"] + + def test_url_helpers(self): + assert _is_url("https://x") is True + assert _is_url("/tmp/x") is False + assert _join_url("https://x/base/", "stops.txt") == 
"https://x/base/stops.txt" + assert _join_url("https://x/base/", "/stops.txt") == "https://x/base/stops.txt" + + +class TestHttpExistsProbing: + @staticmethod + def _http_error(url: str, code: int, msg: str = "Error") -> urllib.error.HTTPError: + return urllib.error.HTTPError(url, code, msg, hdrs={}, fp=None) + + def test_head_200_returns_true(self, monkeypatch: pytest.MonkeyPatch): + calls: list[str] = [] + + def fake_urlopen(req, timeout): + calls.append(req.get_method()) + return _UrlopenResponse() + + monkeypatch.setattr("gtfs_diff.engine.urllib.request.urlopen", fake_urlopen) + + assert _http_exists("https://x/feed/stops.txt") is True + assert calls == ["HEAD"] + + def test_head_404_returns_false_without_get_fallback( + self, monkeypatch: pytest.MonkeyPatch + ): + calls: list[str] = [] + url = "https://x/feed/stops.txt" + + def fake_urlopen(req, timeout): + calls.append(req.get_method()) + raise self._http_error(url, 404, "Not Found") + + monkeypatch.setattr("gtfs_diff.engine.urllib.request.urlopen", fake_urlopen) + + assert _http_exists(url) is False + assert calls == ["HEAD"] + + def test_private_folder_missing_file_head_403_get_403_returns_false( + self, monkeypatch: pytest.MonkeyPatch + ): + calls: list[tuple[str, str | None]] = [] + url = "https://x/feed/missing.txt" + + def fake_urlopen(req, timeout): + calls.append((req.get_method(), req.get_header("Range"))) + raise self._http_error(url, 403, "Forbidden") + + monkeypatch.setattr("gtfs_diff.engine.urllib.request.urlopen", fake_urlopen) + + assert _http_exists(url) is False + assert calls == [("HEAD", None), ("GET", "bytes=0-0")] + + def test_head_disallowed_get_success_returns_true( + self, monkeypatch: pytest.MonkeyPatch + ): + calls: list[tuple[str, str | None]] = [] + url = "https://x/feed/stops.txt" + + def fake_urlopen(req, timeout): + method = req.get_method() + calls.append((method, req.get_header("Range"))) + if method == "HEAD": + raise self._http_error(url, 405, "Method Not Allowed") + assert 
method == "GET" + return _UrlopenResponse() + + monkeypatch.setattr("gtfs_diff.engine.urllib.request.urlopen", fake_urlopen) + + assert _http_exists(url) is True + assert calls == [("HEAD", None), ("GET", "bytes=0-0")] + + def test_head_401_get_401_returns_false(self, monkeypatch: pytest.MonkeyPatch): + calls: list[str] = [] + url = "https://x/feed/private.txt" + + def fake_urlopen(req, timeout): + calls.append(req.get_method()) + raise self._http_error(url, 401, "Unauthorized") + + monkeypatch.setattr("gtfs_diff.engine.urllib.request.urlopen", fake_urlopen) + + assert _http_exists(url) is False + assert calls == ["HEAD", "GET"] + + def test_gone_410_returns_false_for_head_and_ranged_get( + self, monkeypatch: pytest.MonkeyPatch + ): + calls: list[str] = [] + url = "https://x/feed/gone.txt" + + def fake_urlopen(req, timeout): + calls.append(req.get_method()) + raise self._http_error(url, 410, "Gone") + + monkeypatch.setattr("gtfs_diff.engine.urllib.request.urlopen", fake_urlopen) + + assert _http_exists(url) is False + assert _http_exists_via_get(url) is False + assert calls == ["HEAD", "GET"] + + def test_head_500_raises(self, monkeypatch: pytest.MonkeyPatch): + calls: list[str] = [] + url = "https://x/feed/stops.txt" + + def fake_urlopen(req, timeout): + calls.append(req.get_method()) + raise self._http_error(url, 500, "Internal Server Error") + + monkeypatch.setattr("gtfs_diff.engine.urllib.request.urlopen", fake_urlopen) + + with pytest.raises(urllib.error.HTTPError) as exc_info: + _http_exists(url) + assert exc_info.value.code == 500 + assert calls == ["HEAD"] + + # --------------------------------------------------------------------------- # Column-level tests # --------------------------------------------------------------------------- @@ -521,6 +806,68 @@ def test_truncated_omitted_count_correct(self, tmp_path: Path): assert fd.truncated.omitted_count == 2 +class TestSplitRowChangesCap: + """Unit tests for the fair cap allocator in diff_helpers.""" + + def 
test_none_cap_is_unlimited(self): + assert _split_row_changes_cap(None, 10, 10, 10) == (None, None, None) + + def test_single_active_type_gets_whole_cap(self): + assert _split_row_changes_cap(6, 20, 0, 0) == (6, 0, 0) + assert _split_row_changes_cap(6, 0, 20, 0) == (0, 6, 0) + assert _split_row_changes_cap(6, 0, 0, 20) == (0, 0, 6) + + def test_two_active_types_split_evenly(self): + assert _split_row_changes_cap(6, 20, 20, 0) == (3, 3, 0) + assert _split_row_changes_cap(6, 20, 0, 20) == (3, 0, 3) + + def test_three_active_types_split_evenly(self): + assert _split_row_changes_cap(9, 20, 20, 20) == (3, 3, 3) + + def test_indivisible_remainder_favours_earlier_types(self): + # cap 5 over 3 types: 1 each, remainder 2 → added, deleted. + assert _split_row_changes_cap(5, 20, 20, 20) == (2, 2, 1) + + def test_leftover_budget_redistributed(self): + # added only has 1 row; its unused share flows to the others. + assert _split_row_changes_cap(9, 1, 20, 20) == (1, 4, 4) + + def test_never_exceeds_true_counts(self): + a, d, m = _split_row_changes_cap(100, 2, 3, 4) + assert (a, d, m) == (2, 3, 4) + + def test_cap_larger_than_total_includes_all(self): + assert _split_row_changes_cap(50, 5, 5, 5) == (5, 5, 5) + + def test_no_changes_allocates_nothing(self): + assert _split_row_changes_cap(5, 0, 0, 0) == (0, 0, 0) + + +class TestCapFairSplit: + """Integration: the cap is shared across change types, not added-first.""" + + def test_each_change_type_represented(self, tmp_path: Path): + # 5 added, 5 deleted, 5 modified; cap = 6 → 2 of each (a little of all). 
+ base_rows = "".join(f"D{i},Del {i}\n" for i in range(5)) # deleted + base_rows += "".join(f"M{i},Base {i}\n" for i in range(5)) # modified base + new_rows = "".join(f"A{i},Add {i}\n" for i in range(5)) # added + new_rows += "".join(f"M{i},New {i}\n" for i in range(5)) # modified new + base = write_zip( + tmp_path / "base.zip", {"stops.txt": "stop_id,stop_name\n" + base_rows} + ) + new = write_zip( + tmp_path / "new.zip", {"stops.txt": "stop_id,stop_name\n" + new_rows} + ) + result = diff_feeds(base, new, row_changes_cap_per_file=6) + fd = _get_file_diff(result, "stops.txt") + rc = fd.row_changes + assert len(rc.added) == 2 + assert len(rc.deleted) == 2 + assert len(rc.modified) == 2 + assert fd.truncated is not None + assert fd.truncated.omitted_count == 9 # 15 true - 6 shown + + class TestCapNone: def test_cap_none_includes_all(self, tmp_path: Path): base = write_zip( @@ -546,10 +893,593 @@ def test_cap_none_includes_all(self, tmp_path: Path): # --------------------------------------------------------------------------- -class TestMissingPrimaryKeyError: - def test_missing_pk_column_in_base_raises(self, tmp_path: Path): - """diff_feeds raises MissingPrimaryKeyError when the base feed - is missing a required PK column.""" +class TestNotComparedIdChurn: + @staticmethod + def _shapes_csv(start: int, n: int) -> str: + header = "shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence\n" + rows = "".join(f"SHP{i},50.0,-5.0,{i}\n" for i in range(start, start + n)) + return header + rows + + def test_full_id_churn_marks_file_not_compared(self, tmp_path: Path): + # Every shape_id is regenerated → keys are entirely disjoint. 
+ base = write_zip(tmp_path / "base.zip", {"shapes.txt": self._shapes_csv(0, 60)}) + new = write_zip( + tmp_path / "new.zip", {"shapes.txt": self._shapes_csv(1000, 60)} + ) + result = diff_feeds(base, new) + fd = _get_file_diff(result, "shapes.txt") + assert fd.file_action == "not_compared" + assert fd.row_changes is None + assert fd.not_compared_reason is not None + assert fd.not_compared_reason.code == "id_churn" + + def test_not_compared_summary_status_and_count(self, tmp_path: Path): + base = write_zip(tmp_path / "base.zip", {"shapes.txt": self._shapes_csv(0, 60)}) + new = write_zip( + tmp_path / "new.zip", {"shapes.txt": self._shapes_csv(1000, 60)} + ) + result = diff_feeds(base, new) + fs = _get_file_summary(result, "shapes.txt") + assert fs.status == "not_compared" + assert result.summary.files_not_compared_count == 1 + + def test_not_compared_preserves_column_diffs(self, tmp_path: Path): + # New feed regenerates ids AND adds a column; column diff must survive. + base = write_zip(tmp_path / "base.zip", {"shapes.txt": self._shapes_csv(0, 60)}) + new_rows = "".join(f"SHP{i},50.0,-5.0,{i},1.5\n" for i in range(1000, 1060)) + new_csv = ( + "shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence," + "shape_dist_traveled\n" + new_rows + ) + new = write_zip(tmp_path / "new.zip", {"shapes.txt": new_csv}) + result = diff_feeds(base, new) + fd = _get_file_diff(result, "shapes.txt") + assert fd.file_action == "not_compared" + added_names = [c.name for c in fd.columns_added] + assert "shape_dist_traveled" in added_names + assert fd.stats.total_rows_base == 60 + assert fd.stats.total_rows_new == 60 + + def test_stable_ids_below_threshold_diffed_normally(self, tmp_path: Path): + # 60 rows, only 1 deleted + 1 added → overlap-coefficient churn ~1.7%. 
+ base = write_zip(tmp_path / "base.zip", {"shapes.txt": self._shapes_csv(0, 60)}) + new = write_zip(tmp_path / "new.zip", {"shapes.txt": self._shapes_csv(1, 60)}) + result = diff_feeds(base, new) + fd = _get_file_diff(result, "shapes.txt") + assert fd.file_action == "modified" + + def test_bulk_add_is_not_flagged_as_churn(self, tmp_path: Path): + # Base keys are a subset of new keys (file grew 3×). The overlap + # coefficient stays at 1.0 (churn 0) so this is diffed, not flagged — + # the key property that distinguishes it from Jaccard / ÷max. + base = write_zip(tmp_path / "base.zip", {"shapes.txt": self._shapes_csv(0, 60)}) + new = write_zip(tmp_path / "new.zip", {"shapes.txt": self._shapes_csv(0, 180)}) + result = diff_feeds(base, new) + fd = _get_file_diff(result, "shapes.txt") + assert fd.file_action == "modified" + + def test_small_files_are_not_flagged(self, tmp_path: Path): + # Disjoint keys but fewer rows than the detection minimum. + base = write_zip(tmp_path / "base.zip", {"shapes.txt": self._shapes_csv(0, 1)}) + new = write_zip(tmp_path / "new.zip", {"shapes.txt": self._shapes_csv(1000, 1)}) + result = diff_feeds(base, new) + fd = _get_file_diff(result, "shapes.txt") + assert fd.file_action == "modified" + + def test_threshold_override_disables_detection(self, tmp_path: Path): + base = write_zip(tmp_path / "base.zip", {"shapes.txt": self._shapes_csv(0, 60)}) + new = write_zip( + tmp_path / "new.zip", {"shapes.txt": self._shapes_csv(1000, 60)} + ) + # A threshold above 1.0 can never be reached → always diffed. 
+ result = diff_feeds(base, new, id_churn_threshold=1.01) + fd = _get_file_diff(result, "shapes.txt") + assert fd.file_action == "modified" + + def test_per_file_override_disables_detection(self, tmp_path: Path): + base = write_zip(tmp_path / "base.zip", {"shapes.txt": self._shapes_csv(0, 60)}) + new = write_zip( + tmp_path / "new.zip", {"shapes.txt": self._shapes_csv(1000, 60)} + ) + # Fully churned, but a per-file override raises shapes.txt out of reach. + result = diff_feeds(base, new, id_churn_thresholds={"shapes.txt": 1.01}) + fd = _get_file_diff(result, "shapes.txt") + assert fd.file_action == "modified" + + def test_per_file_override_beats_global_threshold(self, tmp_path: Path): + # ~33% churn: above the per-file override (0.1) but below the global + # (0.9). The per-file value must win → flagged not_compared. + base = write_zip(tmp_path / "base.zip", {"shapes.txt": self._shapes_csv(0, 60)}) + new = write_zip(tmp_path / "new.zip", {"shapes.txt": self._shapes_csv(20, 60)}) + result = diff_feeds( + base, + new, + id_churn_threshold=0.9, + id_churn_thresholds={"shapes.txt": 0.1}, + ) + fd = _get_file_diff(result, "shapes.txt") + assert fd.file_action == "not_compared" + assert fd.not_compared_reason.code == "id_churn" + + def test_per_file_override_only_affects_named_file(self, tmp_path: Path): + # Override targets trips.txt; shapes.txt still uses the global default. 
+ base = write_zip(tmp_path / "base.zip", {"shapes.txt": self._shapes_csv(0, 60)}) + new = write_zip( + tmp_path / "new.zip", {"shapes.txt": self._shapes_csv(1000, 60)} + ) + result = diff_feeds(base, new, id_churn_thresholds={"trips.txt": 1.01}) + fd = _get_file_diff(result, "shapes.txt") + assert fd.file_action == "not_compared" + + +# --------------------------------------------------------------------------- +# Foreign-key ignored columns (file hierarchy) +# --------------------------------------------------------------------------- + + +class TestForeignKeyIgnoredColumns: + @staticmethod + def _shapes_csv(start: int, n: int) -> str: + header = "shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence\n" + rows = "".join(f"SHP{i},50.0,-5.0,{i}\n" for i in range(start, start + n)) + return header + rows + + @staticmethod + def _trips_csv(shape_ids: list[str], changed_headsign: bool) -> str: + header = "trip_id,route_id,service_id,shape_id,trip_headsign\n" + rows = [] + for idx, shp in enumerate(shape_ids, start=1): + headsign = "Downtown" + if changed_headsign and idx == 1: + headsign = "Uptown" + rows.append(f"T{idx},R1,SVC1,{shp},{headsign}\n") + return header + "".join(rows) + + def test_fk_column_ignored_when_referenced_file_churns(self, tmp_path: Path): + # shapes.txt fully regenerates shape_id → not_compared. + # trips.txt keeps stable trip_id but its shape_id values also changed; + # that column must be ignored, leaving only the real headsign change. 
+ base = write_zip( + tmp_path / "base.zip", + { + "shapes.txt": self._shapes_csv(0, 60), + "trips.txt": self._trips_csv( + [f"SHP{i}" for i in range(5)], changed_headsign=False + ), + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "shapes.txt": self._shapes_csv(1000, 60), + "trips.txt": self._trips_csv( + [f"SHP{1000 + i}" for i in range(5)], changed_headsign=True + ), + }, + ) + result = diff_feeds(base, new) + + shapes = _get_file_diff(result, "shapes.txt") + assert shapes.file_action == "not_compared" + + trips = _get_file_diff(result, "trips.txt") + assert trips.file_action == "modified" + assert trips.ignored_columns is not None + ignored = {ic.column: ic.reason.code for ic in trips.ignored_columns} + assert ignored == {"shape_id": "references_not_compared_file"} + # Only the headsign change counts; shape_id churn is excluded. + assert trips.stats.rows_modified_count == 1 + changed_fields = { + fc.field for fc in trips.row_changes.modified[0].field_changes + } + assert changed_fields == {"trip_headsign"} + + def test_fk_column_ignored_when_referenced_file_missing_pk(self, tmp_path: Path): + assert "shapes.txt" in get_foreign_keys("trips.txt")["shape_id"] + base = write_zip( + tmp_path / "base.zip", + { + "shapes.txt": ( + "shape_pt_lat,shape_pt_lon,shape_pt_sequence\n" + "50.0,-5.0,1\n" + "51.0,-5.1,2\n" + ), + "trips.txt": self._trips_csv( + [f"SHP{i}" for i in range(5)], changed_headsign=False + ), + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "shapes.txt": ( + "shape_pt_lat,shape_pt_lon,shape_pt_sequence\n" + "50.0,-5.0,1\n" + "51.0,-5.1,2\n" + ), + "trips.txt": self._trips_csv( + [f"SHP{1000 + i}" for i in range(5)], changed_headsign=True + ), + }, + ) + result = diff_feeds(base, new) + + shapes = _get_file_diff(result, "shapes.txt") + assert shapes.file_action == "not_compared" + assert shapes.not_compared_reason.code == "missing_primary_key" + + trips = _get_file_diff(result, "trips.txt") + assert trips.file_action == "modified" + 
assert trips.ignored_columns is not None + ignored = {ic.column: ic.reason.code for ic in trips.ignored_columns} + assert ignored == {"shape_id": "references_not_compared_file"} + changed_fields = { + fc.field for fc in trips.row_changes.modified[0].field_changes + } + assert changed_fields == {"trip_headsign"} + + def test_whole_diff_continues_when_file_missing_pk(self, tmp_path: Path): + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": "stop_name,stop_lat,stop_lon\nStop One,1.0,2.0\n", + "routes.txt": "route_id,route_short_name\nR1,Route 1\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + "routes.txt": "route_id,route_short_name\nR1,Route One\n", + }, + ) + result = diff_feeds(base, new) + + stops = _get_file_diff(result, "stops.txt") + assert stops.file_action == "not_compared" + assert stops.not_compared_reason.code == "missing_primary_key" + assert _get_file_diff(result, "routes.txt").file_action == "modified" + + def test_fk_only_change_makes_file_unchanged(self, tmp_path: Path): + # trips.txt's ONLY difference is the churned shape_id → once ignored the + # file has no real change and is omitted from file_diffs entirely. + base = write_zip( + tmp_path / "base.zip", + { + "shapes.txt": self._shapes_csv(0, 60), + "trips.txt": self._trips_csv( + [f"SHP{i}" for i in range(5)], changed_headsign=False + ), + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "shapes.txt": self._shapes_csv(1000, 60), + "trips.txt": self._trips_csv( + [f"SHP{1000 + i}" for i in range(5)], changed_headsign=False + ), + }, + ) + result = diff_feeds(base, new) + names = [fd.file_name for fd in result.file_diffs] + assert "trips.txt" not in names + + def test_fk_column_not_ignored_when_parent_stable(self, tmp_path: Path): + # shapes.txt is unchanged (stable shape_id), so a shape_id edit in + # trips.txt is a real change and must NOT be ignored. 
+ shapes = self._shapes_csv(0, 60) + base = write_zip( + tmp_path / "base.zip", + { + "shapes.txt": shapes, + "trips.txt": self._trips_csv(["SHP0"], changed_headsign=False), + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "shapes.txt": shapes, + "trips.txt": self._trips_csv(["SHP1"], changed_headsign=False), + }, + ) + result = diff_feeds(base, new) + trips = _get_file_diff(result, "trips.txt") + assert trips.file_action == "modified" + assert trips.ignored_columns is None + changed_fields = { + fc.field for fc in trips.row_changes.modified[0].field_changes + } + assert "shape_id" in changed_fields + + +class TestProcessingOrder: + def test_parents_precede_children(self): + from gtfs_diff.engine import _processing_order + + order = _processing_order(["trips.txt", "shapes.txt", "stop_times.txt"]) + assert order.index("shapes.txt") < order.index("trips.txt") + assert order.index("trips.txt") < order.index("stop_times.txt") + + def test_missing_parent_is_ignored(self): + from gtfs_diff.engine import _processing_order + + # routes.txt absent; ordering still works and stays deterministic. 
+ order = _processing_order(["trips.txt", "shapes.txt"]) + assert order == ["shapes.txt", "trips.txt"] + + +# --------------------------------------------------------------------------- +# Missing primary key column +# --------------------------------------------------------------------------- + + +class TestOptionalPrimaryKeyColumns: + def test_translations_record_id_variant_pads_missing_pk_columns_with_null(self): + csv_text = ( + "table_name,field_name,language,record_id,translation\n" + "stops,stop_name,en,S1,Stop One\n" + "stops,stop_name,en,S2,Stop Two\n" + ) + + _, index = _read_csv_index( + io.StringIO(csv_text), + get_primary_key("translations.txt"), + "translations.txt", + ) + + assert ("stops", "stop_name", "en", "S1", "", "") in index + assert ("stops", "stop_name", "en", "S2", "", "") in index + assert len(index) == 2 + + def test_translations_field_value_variant_pads_missing_pk_columns_with_null(self): + csv_text = ( + "table_name,field_name,language,field_value,translation\n" + "stops,stop_name,en,Stop One,Arrêt Un\n" + "stops,stop_name,en,Stop Two,Arrêt Deux\n" + ) + + _, index = _read_csv_index( + io.StringIO(csv_text), + get_primary_key("translations.txt"), + "translations.txt", + ) + + assert ("stops", "stop_name", "en", "", "", "Stop One") in index + assert ("stops", "stop_name", "en", "", "", "Stop Two") in index + assert len(index) == 2 + + def test_translations_full_variant_uses_all_pk_columns(self): + csv_text = ( + "table_name,field_name,language,record_id,record_sub_id,field_value,translation\n" + "stop_times,stop_headsign,en,T1,1,Downtown,Centre-ville\n" + "stop_times,stop_headsign,en,T1,2,Uptown,Haut de la ville\n" + ) + + _, index = _read_csv_index( + io.StringIO(csv_text), + get_primary_key("translations.txt"), + "translations.txt", + ) + + assert ("stop_times", "stop_headsign", "en", "T1", "1", "Downtown") in index + assert ("stop_times", "stop_headsign", "en", "T1", "2", "Uptown") in index + assert len(index) == 2 + + def 
test_translations_missing_mandatory_column_still_raises(self): + csv_text = ( + "table_name,field_name,record_id,translation\nstops,stop_name,S1,Stop One\n" + ) + + with pytest.raises(MissingPrimaryKeyError) as exc_info: + _read_csv_index( + io.StringIO(csv_text), + get_primary_key("translations.txt"), + "translations.txt", + ) + + assert exc_info.value.file_name == "translations.txt" + assert exc_info.value.missing_columns == ["language"] + assert "record_id" not in exc_info.value.missing_columns + assert "record_sub_id" not in exc_info.value.missing_columns + assert "field_value" not in exc_info.value.missing_columns + assert exc_info.value.headers == [ + "table_name", + "field_name", + "record_id", + "translation", + ] + + def test_mandatory_pk_files_are_unaffected(self): + csv_text = "stop_name,stop_lat,stop_lon\nStop One,1.0,2.0\n" + + with pytest.raises(MissingPrimaryKeyError) as exc_info: + _read_csv_index( + io.StringIO(csv_text), + get_primary_key("stops.txt"), + "stops.txt", + ) + + assert exc_info.value.file_name == "stops.txt" + assert exc_info.value.missing_columns == ["stop_id"] + assert get_optional_primary_key_columns("stops.txt") == set() + assert get_optional_primary_key_columns("translations.txt") == { + "record_id", + "record_sub_id", + "field_value", + } + + def test_diff_feeds_with_translations_record_id_variant(self, tmp_path: Path): + base = write_zip( + tmp_path / "base.zip", + { + "translations.txt": ( + "table_name,field_name,language,record_id,translation\n" + "stops,stop_name,en,S1,Stop One\n" + ), + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "translations.txt": ( + "table_name,field_name,language,record_id,translation\n" + "stops,stop_name,en,S1,Stop 1\n" + ), + }, + ) + + result = diff_feeds(base, new) + + fd = _get_file_diff(result, "translations.txt") + assert fd.file_action == "modified" + assert fd.stats.rows_modified_count == 1 + + def test_diff_feeds_aligns_missing_optional_pk_with_empty_value( + self, tmp_path: 
Path + ): + base = write_zip( + tmp_path / "base.zip", + { + "translations.txt": ( + "table_name,field_name,language,record_id,record_sub_id,translation\n" + "stops,stop_name,en,S1,,Stop One\n" + ), + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "translations.txt": ( + "table_name,field_name,language,record_id,translation\n" + "stops,stop_name,en,S1,Stop 1\n" + ), + }, + ) + + result = diff_feeds(base, new) + + fd = _get_file_diff(result, "translations.txt") + assert fd.file_action == "modified" + assert fd.stats.rows_added_count == 0 + assert fd.stats.rows_deleted_count == 0 + assert fd.stats.rows_modified_count == 1 + + +class TestExpandedOptionalPrimaryKeys: + """Optional PK columns added from the GTFS reference (April 2026).""" + + def test_optional_pk_mapping_covers_conditionally_required_files(self): + assert get_optional_primary_key_columns("agency.txt") == {"agency_id"} + assert get_optional_primary_key_columns("fare_rules.txt") == { + "route_id", + "origin_id", + "destination_id", + "contains_id", + } + assert get_optional_primary_key_columns("attributions.txt") == { + "attribution_id" + } + assert get_optional_primary_key_columns("timeframes.txt") == { + "start_time", + "end_time", + } + assert get_optional_primary_key_columns("fare_products.txt") == { + "rider_category_id", + "fare_media_id", + } + # Files whose every PK column is "Required" stay mandatory. + assert get_optional_primary_key_columns("stops.txt") == set() + assert get_optional_primary_key_columns("stop_times.txt") == set() + + def test_primary_key_discrepancies_fixed_against_spec(self): + # Spec PK is composite; the project previously listed only the first column. 
+ assert get_primary_key("fare_products.txt") == [ + "fare_product_id", + "rider_category_id", + "fare_media_id", + ] + assert get_primary_key("fare_transfer_rules.txt") == [ + "from_leg_group_id", + "to_leg_group_id", + "fare_product_id", + "transfer_count", + "duration_limit", + ] + + def test_agency_without_agency_id_is_null_padded_not_raised(self): + # A single-agency feed may omit agency_id; it must not raise and the lone + # row is keyed on a null agency_id. + csv_text = ( + "agency_name,agency_url,agency_timezone\nMetro,https://m.example,UTC\n" + ) + headers, index = _read_csv_index( + io.StringIO(csv_text), get_primary_key("agency.txt"), "agency.txt" + ) + assert headers == ["agency_name", "agency_url", "agency_timezone"] + assert ("",) in index + assert len(index) == 1 + + def test_fare_products_aligns_when_optional_pk_columns_absent(self, tmp_path: Path): + # Neither feed carries rider_category_id / fare_media_id; both pad them to + # null, so the row aligns and a price change is reported as modified. + base = write_zip( + tmp_path / "base.zip", + { + "fare_products.txt": ( + "fare_product_id,fare_product_name,amount,currency\n" + "FP1,Single,2.50,USD\n" + ), + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "fare_products.txt": ( + "fare_product_id,fare_product_name,amount,currency\n" + "FP1,Single,3.00,USD\n" + ), + }, + ) + result = diff_feeds(base, new) + fd = _get_file_diff(result, "fare_products.txt") + assert fd.file_action == "modified" + assert fd.stats.rows_added_count == 0 + assert fd.stats.rows_deleted_count == 0 + assert fd.stats.rows_modified_count == 1 + + def test_optional_pk_column_not_added_to_reported_headers(self, tmp_path: Path): + # The injected null PK columns must affect only the compare step, never the + # reported columns/headers. 
+ base = write_zip( + tmp_path / "base.zip", + { + "fare_products.txt": ( + "fare_product_id,fare_product_name,amount,currency\n" + "FP1,Single,2.50,USD\n" + ), + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "fare_products.txt": ( + "fare_product_id,fare_product_name,amount,currency\n" + "FP1,Single,3.00,USD\n" + ), + }, + ) + result = diff_feeds(base, new) + fd = _get_file_diff(result, "fare_products.txt") + reported_cols = set(fd.row_changes.columns) + assert "rider_category_id" not in reported_cols + assert "fare_media_id" not in reported_cols + assert fd.columns_added == [] + assert fd.columns_deleted == [] + + +class TestMissingPrimaryKeyNotCompared: + def test_missing_pk_column_in_base_is_not_compared(self, tmp_path: Path): base = write_zip( tmp_path / "base.zip", { @@ -563,12 +1493,13 @@ def test_missing_pk_column_in_base_raises(self, tmp_path: Path): "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", }, ) - with pytest.raises(MissingPrimaryKeyError): - diff_feeds(base, new) + result = diff_feeds(base, new) + fd = _get_file_diff(result, "stops.txt") + assert fd.file_action == "not_compared" + assert fd.not_compared_reason is not None + assert fd.not_compared_reason.code == "missing_primary_key" - def test_missing_pk_column_in_new_raises(self, tmp_path: Path): - """diff_feeds raises MissingPrimaryKeyError when the new feed - is missing a required PK column.""" + def test_missing_pk_column_in_new_is_not_compared(self, tmp_path: Path): base = write_zip( tmp_path / "base.zip", { @@ -582,10 +1513,13 @@ def test_missing_pk_column_in_new_raises(self, tmp_path: Path): "Stop One,1.0,2.0\n", # stop_id absent }, ) - with pytest.raises(MissingPrimaryKeyError): - diff_feeds(base, new) + result = diff_feeds(base, new) + fd = _get_file_diff(result, "stops.txt") + assert fd.file_action == "not_compared" + assert fd.not_compared_reason is not None + assert fd.not_compared_reason.code == "missing_primary_key" - def test_exception_carries_file_name(self, tmp_path: 
Path): + def test_missing_pk_column_in_both_feeds_is_not_compared(self, tmp_path: Path): base = write_zip( tmp_path / "base.zip", { @@ -595,14 +1529,16 @@ def test_exception_carries_file_name(self, tmp_path: Path): new = write_zip( tmp_path / "new.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + "stops.txt": "stop_name,stop_lat,stop_lon\nStop One,1.0,2.0\n", }, ) - with pytest.raises(MissingPrimaryKeyError) as exc_info: - diff_feeds(base, new) - assert exc_info.value.file_name == "stops.txt" + result = diff_feeds(base, new) + fd = _get_file_diff(result, "stops.txt") + assert fd.file_action == "not_compared" + assert fd.not_compared_reason is not None + assert fd.not_compared_reason.code == "missing_primary_key" - def test_exception_carries_missing_columns(self, tmp_path: Path): + def test_not_compared_reason_names_missing_column(self, tmp_path: Path): base = write_zip( tmp_path / "base.zip", { @@ -615,26 +1551,36 @@ def test_exception_carries_missing_columns(self, tmp_path: Path): "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", }, ) - with pytest.raises(MissingPrimaryKeyError) as exc_info: - diff_feeds(base, new) - assert "stop_id" in exc_info.value.missing_columns + result = diff_feeds(base, new) + fd = _get_file_diff(result, "stops.txt") + assert fd.not_compared_reason is not None + assert "stop_id" in fd.not_compared_reason.message - def test_exception_carries_headers(self, tmp_path: Path): + def test_not_compared_stats_reflect_data_row_counts(self, tmp_path: Path): base = write_zip( tmp_path / "base.zip", { - "stops.txt": "stop_name,stop_lat,stop_lon\nStop One,1.0,2.0\n", + "stops.txt": ( + "stop_name,stop_lat,stop_lon\nStop One,1.0,2.0\nStop Two,3.0,4.0\n" + ), }, ) new = write_zip( tmp_path / "new.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + "stops.txt": ( + STOPS_HEADER + + "S1,Stop One,1.0,2.0\n" + + "S2,Stop Two,3.0,4.0\n" + + "S3,Stop Three,5.0,6.0\n" + ), }, ) - with pytest.raises(MissingPrimaryKeyError) as 
exc_info: - diff_feeds(base, new) - assert exc_info.value.headers == ["stop_name", "stop_lat", "stop_lon"] + result = diff_feeds(base, new) + fd = _get_file_diff(result, "stops.txt") + assert fd.file_action == "not_compared" + assert fd.stats.total_rows_base == 2 + assert fd.stats.total_rows_new == 3 # --------------------------------------------------------------------------- @@ -832,3 +1778,938 @@ def test_modified_row_line_numbers(self, tmp_path: Path): mod = fd.row_changes.modified[0] assert mod.base_line_number == 3 assert mod.new_line_number == 3 + + +# --------------------------------------------------------------------------- +# Change statistics +# --------------------------------------------------------------------------- + + +class TestChangeStats: + @staticmethod + def _diff_dirs( + tmp_path: Path, + name: str, + base_files: dict[str, str], + new_files: dict[str, str], + **kwargs, + ) -> GtfsDiff: + base = _write_feed_dir(tmp_path, f"base_{name}", base_files) + new = _write_feed_dir(tmp_path, f"new_{name}", new_files) + return diff_feeds(base, new, **kwargs) + + @staticmethod + def _column_stats_tuples(fd): + return [ + (stat.column, stat.modifications_count, stat.modifications_percentage) + for stat in fd.stats.column_stats + ] + + def test_basic_column_stats_counts_percentages_and_order(self, tmp_path: Path): + base = { + "stops.txt": "stop_id,stop_name,stop_lat,stop_lon,stop_desc\n" + + "S1,Alpha,1.0,2.0,First\n" + + "S2,Beta,3.0,4.0,Second\n" + + "S3,Gamma,5.0,6.0,Third\n" + } + new = { + "stops.txt": "stop_id,stop_name,stop_lat,stop_lon,stop_desc\n" + + "S1,Alpha Prime,1.1,2.0,First\n" + + "S2,Beta Prime,3.0,4.0,Second\n" + + "S3,Gamma,5.0,6.1,Third Prime\n" + } + result = self._diff_dirs(tmp_path, "basic_stats", base, new) + fd = _get_file_diff(result, "stops.txt") + + assert fd.row_changes.columns == [ + "stop_id", + "stop_name", + "stop_lat", + "stop_lon", + "stop_desc", + ] + assert self._column_stats_tuples(fd) == [ + ("stop_name", 2, 66.67), 
+ ("stop_lat", 1, 33.33), + ("stop_lon", 1, 33.33), + ("stop_desc", 1, 33.33), + ] + + def test_rows_changed_percentage_rounds_and_clamps(self, tmp_path: Path): + rounded = self._diff_dirs( + tmp_path, + "percentage_rounding", + { + "stops.txt": "stop_id,stop_name\n" + + "S1,Alpha\nS2,Beta\nS3,Gamma\nS4,Delta\nS5,Epsilon\n" + }, + { + "stops.txt": "stop_id,stop_name\n" + + "S1,Alpha\nS2,Beta Prime\nS4,Delta\nS5,Epsilon\n" + + "S6,Zeta\nS7,Eta\n" + }, + ) + rounded_fd = _get_file_diff(rounded, "stops.txt") + assert rounded_fd.stats.rows_added_count == 2 + assert rounded_fd.stats.rows_deleted_count == 1 + assert rounded_fd.stats.rows_modified_count == 1 + assert rounded_fd.stats.rows_changed_percentage == 66.67 + + clamped = self._diff_dirs( + tmp_path, + "percentage_clamping", + {"stops.txt": "stop_id,stop_name\nA,Alpha\nB,Beta\n"}, + {"stops.txt": "stop_id,stop_name\nC,Gamma\nD,Delta\n"}, + ) + clamped_fd = _get_file_diff(clamped, "stops.txt") + assert clamped_fd.stats.rows_added_count == 2 + assert clamped_fd.stats.rows_deleted_count == 2 + assert clamped_fd.stats.rows_modified_count == 0 + assert clamped_fd.stats.rows_changed_percentage == 100.0 + + def test_rows_changed_percentage_none_for_header_only_modified_file( + self, tmp_path: Path + ): + result = self._diff_dirs( + tmp_path, + "empty_modified", + {"stops.txt": "stop_id,stop_name\n"}, + {"stops.txt": "stop_id,stop_name,stop_desc\n"}, + ) + fd = _get_file_diff(result, "stops.txt") + + assert fd.file_action == "modified" + assert fd.stats.total_rows_base == 0 + assert fd.stats.total_rows_new == 0 + assert fd.stats.rows_changed_percentage is None + assert fd.stats.column_stats is None + + def test_column_stats_none_when_no_rows_are_modified(self, tmp_path: Path): + result = self._diff_dirs( + tmp_path, + "no_modified_rows", + {"stops.txt": "stop_id,stop_name\nS1,Alpha\nS2,Beta\n"}, + {"stops.txt": "stop_id,stop_name\nS1,Alpha\nS3,Gamma\n"}, + ) + fd = _get_file_diff(result, "stops.txt") + + assert 
fd.file_action == "modified" + assert fd.stats.rows_added_count == 1 + assert fd.stats.rows_deleted_count == 1 + assert fd.stats.rows_modified_count == 0 + assert fd.stats.column_stats is None + assert fd.stats.rows_changed_percentage == 100.0 + + def test_change_stats_are_independent_of_row_changes_cap(self, tmp_path: Path): + base_files = { + "stops.txt": "stop_id,stop_name,stop_lat\n" + + "S1,Alpha,1.0\n" + + "S2,Beta,2.0\n" + + "S3,Gamma,3.0\n" + + "S4,Delta,4.0\n" + } + new_files = { + "stops.txt": "stop_id,stop_name,stop_lat\n" + + "S1,Alpha Prime,1.1\n" + + "S2,Beta Prime,2.2\n" + + "S3,Gamma Prime,3.0\n" + + "S4,Delta,4.0\n" + + "S5,Epsilon,5.0\n" + } + base = _write_feed_dir(tmp_path, "base_cap_independence", base_files) + new = _write_feed_dir(tmp_path, "new_cap_independence", new_files) + + stats_by_cap = [] + for cap in (0, 1, None): + fd = _get_file_diff( + diff_feeds(base, new, row_changes_cap_per_file=cap), "stops.txt" + ) + stats_by_cap.append( + ( + fd.stats.rows_changed_percentage, + self._column_stats_tuples(fd), + ) + ) + + assert stats_by_cap == [ + ( + 80.0, + [("stop_name", 3, 100.0), ("stop_lat", 2, 66.67)], + ), + ( + 80.0, + [("stop_name", 3, 100.0), ("stop_lat", 2, 66.67)], + ), + ( + 80.0, + [("stop_name", 3, 100.0), ("stop_lat", 2, 66.67)], + ), + ] + + def test_column_stats_toggle_off_keeps_rows_changed_percentage( + self, tmp_path: Path + ): + result = self._diff_dirs( + tmp_path, + "toggle_off", + { + "stops.txt": "stop_id,stop_name\nS1,Alpha\n", + "routes.txt": "route_id,route_short_name\nR1,One\n", + }, + { + "stops.txt": "stop_id,stop_name\nS1,Alpha Prime\n", + "routes.txt": "route_id,route_short_name\nR1,One Prime\n", + }, + column_stats=False, + ) + + modified = [fd for fd in result.file_diffs if fd.file_action == "modified"] + assert {fd.file_name for fd in modified} == {"routes.txt", "stops.txt"} + for fd in modified: + assert fd.stats.column_stats is None + assert fd.stats.rows_changed_percentage is not None + + def 
test_change_stats_match_between_in_memory_and_duckdb(self, tmp_path: Path): + base = { + "stops.txt": "stop_id,stop_name,stop_lat,stop_lon\n" + + "S1,Alpha,1.0,2.0\n" + + "S2,Beta,3.0,4.0\n" + + "S3,Gamma,5.0,6.0\n" + } + new = { + "stops.txt": "stop_id,stop_name,stop_lat,stop_lon\n" + + "S1,Alpha Prime,1.1,2.0\n" + + "S2,Beta,3.0,4.1\n" + + "S4,Delta,7.0,8.0\n" + } + mem, duck = _assert_duckdb_parity( + tmp_path, base, new, row_changes_cap_per_file=1 + ) + mem_stats = _get_file_diff(mem, "stops.txt").stats + duck_stats = _get_file_diff(duck, "stops.txt").stats + + assert mem_stats.rows_changed_percentage == duck_stats.rows_changed_percentage + assert mem_stats.column_stats == duck_stats.column_stats + + def test_non_modified_files_leave_change_stats_unset(self, tmp_path: Path): + result = self._diff_dirs( + tmp_path, + "non_modified", + { + "stops.txt": "stop_id,stop_name\nS1,Alpha\n", + "routes.txt": "route_id,route_short_name\nR1,One\n", + }, + { + "stops.txt": "stop_id,stop_name\nS1,Alpha Prime\n", + "agency.txt": "agency_id,agency_name\nA1,Agency\n", + }, + ) + added = _get_file_diff(result, "agency.txt") + deleted = _get_file_diff(result, "routes.txt") + + assert added.file_action == "added" + assert added.stats.column_stats is None + assert added.stats.rows_changed_percentage is None + assert deleted.file_action == "deleted" + assert deleted.stats.column_stats is None + assert deleted.stats.rows_changed_percentage is None + + +# --------------------------------------------------------------------------- +# DuckDB backend parity and routing +# --------------------------------------------------------------------------- + + +def _write_feed_dir(tmp_path: Path, name: str, files: dict[str, str]) -> Path: + feed_dir = tmp_path / name + feed_dir.mkdir() + for file_name, content in files.items(): + (feed_dir / file_name).write_text(content, encoding="utf-8") + return feed_dir + + +def _sorted_diff(result: GtfsDiff) -> dict: + d = result.model_dump(mode="json", 
exclude_none=True) + d.pop("metadata", None) + for fd in d.get("file_diffs", []): + rc = fd.get("row_changes") + if rc: + for k in ("added", "deleted", "modified"): + rc[k].sort(key=lambda r: json.dumps(r, sort_keys=True)) + return d + + +def _diff_with_both_backends( + tmp_path: Path, + base_files: dict[str, str], + new_files: dict[str, str], + *, + row_changes_cap_per_file: int | None = None, +) -> tuple[GtfsDiff, GtfsDiff]: + base = _write_feed_dir(tmp_path, "base", base_files) + new = _write_feed_dir(tmp_path, "new", new_files) + mem = diff_feeds( + base, + new, + row_changes_cap_per_file=row_changes_cap_per_file, + large_file_threshold_bytes=None, + ) + duck = diff_feeds( + base, + new, + row_changes_cap_per_file=row_changes_cap_per_file, + large_file_threshold_bytes=0, + ) + return mem, duck + + +def _assert_duckdb_parity( + tmp_path: Path, + base_files: dict[str, str], + new_files: dict[str, str], + *, + row_changes_cap_per_file: int | None = None, +) -> tuple[GtfsDiff, GtfsDiff]: + mem, duck = _diff_with_both_backends( + tmp_path, + base_files, + new_files, + row_changes_cap_per_file=row_changes_cap_per_file, + ) + assert _sorted_diff(mem) == _sorted_diff(duck) + return mem, duck + + +class TestDuckDBBackend: + def test_duckdb_is_available(self): + assert engine_duckdb.is_duckdb_available() is True + + def test_stops_added_deleted_modified_parity(self, tmp_path: Path): + base = { + "stops.txt": STOPS_HEADER + + "S1,Alpha,1.0,2.0\n" + + "S2,Beta,3.0,4.0\n" + + "S3,Gamma,5.0,6.0\n" + } + new = { + "stops.txt": STOPS_HEADER + + "S1,Alpha Renamed,1.0,2.0\n" + + "S3,Gamma,5.0,6.0\n" + + "S4,Delta,7.0,8.0\n" + } + _assert_duckdb_parity(tmp_path, base, new) + + def test_numeric_equivalence_parity(self, tmp_path: Path): + base = {"stops.txt": STOPS_HEADER + "S1,Alpha,45.5,-73.55625\n"} + new = {"stops.txt": STOPS_HEADER + "S1,Alpha,45.5,-73.556250\n"} + _assert_duckdb_parity(tmp_path, base, new) + + def test_case_and_whitespace_equivalence_parity(self, tmp_path: 
Path): + base = {"stops.txt": "stop_id,stop_name,stop_desc\nS1,Echo, x \n"} + new = {"stops.txt": "stop_id,stop_name,stop_desc\nS1,ECHO,x\n"} + _assert_duckdb_parity(tmp_path, base, new) + + def test_quoted_embedded_commas_parity(self, tmp_path: Path): + base = {"stops.txt": 'stop_id,stop_name,stop_desc\nS1,"Beta, Inc",old\n'} + new = {"stops.txt": 'stop_id,stop_name,stop_desc\nS1,"Beta, Inc",new\n'} + _assert_duckdb_parity(tmp_path, base, new) + + def test_empty_fields_and_added_column_parity(self, tmp_path: Path): + base = {"stops.txt": "stop_id,stop_name\nS1,\nS2,Two\n"} + new = {"stops.txt": "stop_id,stop_name,stop_desc\nS1,,blank name\nS2,Two,\n"} + _assert_duckdb_parity(tmp_path, base, new) + + def test_bom_header_parity(self, tmp_path: Path): + base = {"stops.txt": "\ufeffstop_id,stop_name\nS1,Alpha\n"} + new = {"stops.txt": "\ufeffstop_id,stop_name\nS1,Alpha Prime\n"} + _assert_duckdb_parity(tmp_path, base, new) + + def test_composite_primary_key_parity(self, tmp_path: Path): + header = "trip_id,arrival_time,departure_time,stop_id,stop_sequence\n" + base = { + "stop_times.txt": header + + "T1,08:00:00,08:00:00,S1,1\n" + + "T1,08:05:00,08:05:00,S2,2\n" + + "T2,09:00:00,09:00:00,S3,1\n" + } + new = { + "stop_times.txt": header + + "T1,08:00:00,08:00:00,S1,1\n" + + "T1,08:06:00,08:06:00,S2,2\n" + + "T3,10:00:00,10:00:00,S4,1\n" + } + _assert_duckdb_parity(tmp_path, base, new) + + def test_modified_line_numbers_parity(self, tmp_path: Path): + base = {"stops.txt": "stop_id,stop_name\nS1,Alpha\nS2,Beta\nS3,Gamma\n"} + new = {"stops.txt": "stop_id,stop_name\nS1,Alpha\nS2,Beta Prime\nS3,Gamma\n"} + mem, duck = _assert_duckdb_parity(tmp_path, base, new) + for result in (mem, duck): + mod = _get_file_diff(result, "stops.txt").row_changes.modified[0] + assert mod.identifier == {"stop_id": "S2"} + assert mod.base_line_number == 3 + assert mod.new_line_number == 3 + + def test_id_churn_parity(self, tmp_path: Path): + base_rows = "".join(f"B{i},Base {i}\n" for i in 
range(50)) + new_rows = "".join(f"N{i},New {i}\n" for i in range(50)) + base = {"stops.txt": "stop_id,stop_name\n" + base_rows} + new = {"stops.txt": "stop_id,stop_name\n" + new_rows} + mem, duck = _assert_duckdb_parity(tmp_path, base, new) + for result in (mem, duck): + fd = _get_file_diff(result, "stops.txt") + assert fd.file_action == "not_compared" + assert fd.not_compared_reason is not None + assert fd.not_compared_reason.code == "id_churn" + + def test_cap_truncation_counts_and_stats_parity(self, tmp_path: Path): + base = { + "stops.txt": "stop_id,stop_name\n" + + "S1,Base One\nS2,Base Two\nS3,Base Three\nS4,Base Four\n" + } + new = { + "stops.txt": "stop_id,stop_name\n" + + "S1,New One\nS3,Base Three\nS5,New Five\nS6,New Six\n" + } + mem, duck = _assert_duckdb_parity( + tmp_path, base, new, row_changes_cap_per_file=2 + ) + mem_fd = _get_file_diff(mem, "stops.txt") + duck_fd = _get_file_diff(duck, "stops.txt") + assert mem_fd.truncated == duck_fd.truncated + assert mem_fd.truncated is not None + assert mem_fd.truncated.is_truncated is True + assert mem_fd.truncated.omitted_count == 3 + assert mem_fd.stats == duck_fd.stats + assert mem_fd.stats.rows_added_count == 2 + assert mem_fd.stats.rows_deleted_count == 2 + assert mem_fd.stats.rows_modified_count == 1 + assert mem_fd.stats.total_rows_base == 4 + assert mem_fd.stats.total_rows_new == 4 + + def test_cap_split_partial_modified_parity(self, tmp_path: Path): + # 10 modified rows, cap 3 → all 3 budgeted to modified; both engines must + # select the same earliest-by-line rows. 
+ base_rows = "".join(f"S{i},Base {i}\n" for i in range(10)) + new_rows = "".join(f"S{i},New {i}\n" for i in range(10)) + base = {"stops.txt": "stop_id,stop_name\n" + base_rows} + new = {"stops.txt": "stop_id,stop_name\n" + new_rows} + mem, duck = _assert_duckdb_parity( + tmp_path, base, new, row_changes_cap_per_file=3 + ) + mem_fd = _get_file_diff(mem, "stops.txt") + assert len(mem_fd.row_changes.modified) == 3 + assert [m.identifier for m in mem_fd.row_changes.modified] == [ + {"stop_id": "S0"}, + {"stop_id": "S1"}, + {"stop_id": "S2"}, + ] + + def test_cap_zero_omits_row_changes_but_keeps_stats_parity(self, tmp_path: Path): + base = {"stops.txt": "stop_id,stop_name\nS1,Alpha\n"} + new = {"stops.txt": "stop_id,stop_name\nS1,Alpha Prime\nS2,Beta\n"} + mem, duck = _assert_duckdb_parity( + tmp_path, base, new, row_changes_cap_per_file=0 + ) + for result in (mem, duck): + fd = _get_file_diff(result, "stops.txt") + assert fd.row_changes is None + assert fd.stats.rows_added_count == 1 + assert fd.stats.rows_modified_count == 1 + + def test_duplicate_primary_key_not_compared_in_both_backends(self, tmp_path: Path): + base = {"stops.txt": "stop_id,stop_name\nS1,Alpha\nS1,Duplicate\n"} + new = {"stops.txt": "stop_id,stop_name\nS1,Alpha\nS1,Duplicate\n"} + + mem, duck = _diff_with_both_backends(tmp_path, base, new) + for result in (mem, duck): + fd = _get_file_diff(result, "stops.txt") + assert fd.file_action == "not_compared" + assert fd.not_compared_reason is not None + assert fd.not_compared_reason.code == "duplicate_primary_key" + assert "stop_id" in fd.not_compared_reason.message + + mem_message = _get_file_diff(mem, "stops.txt").not_compared_reason.message + assert "found in the base feed" in mem_message + duck_message = _get_file_diff(duck, "stops.txt").not_compared_reason.message + assert "found in both the base and new feed" in duck_message + + def test_duplicate_primary_key_in_base_only_is_not_compared(self, tmp_path: Path): + base = {"stops.txt": 
"stop_id,stop_name\nS1,Alpha\nS1,Duplicate\n"} + new = {"stops.txt": "stop_id,stop_name\nS1,Alpha\n"} + + mem, duck = _diff_with_both_backends(tmp_path, base, new) + for result in (mem, duck): + fd = _get_file_diff(result, "stops.txt") + assert fd.file_action == "not_compared" + assert fd.not_compared_reason is not None + assert fd.not_compared_reason.code == "duplicate_primary_key" + message = fd.not_compared_reason.message + assert "found in the base feed" in message + assert "found in the new feed" not in message + + def test_duplicate_primary_key_in_new_only_is_not_compared(self, tmp_path: Path): + base = {"stops.txt": "stop_id,stop_name\nS1,Alpha\n"} + new = {"stops.txt": "stop_id,stop_name\nS1,Alpha\nS1,Duplicate\n"} + + mem, duck = _diff_with_both_backends(tmp_path, base, new) + for result in (mem, duck): + fd = _get_file_diff(result, "stops.txt") + assert fd.file_action == "not_compared" + assert fd.not_compared_reason is not None + assert fd.not_compared_reason.code == "duplicate_primary_key" + message = fd.not_compared_reason.message + assert "found in the new feed" in message + assert "found in the base feed" not in message + + def test_duplicate_primary_key_in_both_in_memory_short_circuits_on_base( + self, tmp_path: Path + ): + base = _write_feed_dir( + tmp_path, + "base", + {"stops.txt": "stop_id,stop_name\nS1,Alpha\nS1,Duplicate\n"}, + ) + new = _write_feed_dir( + tmp_path, + "new", + {"stops.txt": "stop_id,stop_name\nS1,Alpha\nS1,Duplicate\n"}, + ) + + result = diff_feeds(base, new, large_file_threshold_bytes=None) + + fd = _get_file_diff(result, "stops.txt") + assert fd.file_action == "not_compared" + assert fd.not_compared_reason is not None + assert fd.not_compared_reason.code == "duplicate_primary_key" + assert "found in the base feed" in fd.not_compared_reason.message + + def test_duplicate_primary_key_does_not_abort_whole_diff(self, tmp_path: Path): + base = { + "stops.txt": "stop_id,stop_name\nS1,Alpha\nS1,Duplicate\n", + "routes.txt": 
"route_id,route_short_name\nR1,One\n", + } + new = { + "stops.txt": "stop_id,stop_name\nS1,Alpha\n", + "routes.txt": "route_id,route_short_name\nR1,Route One\n", + } + + mem, duck = _diff_with_both_backends(tmp_path, base, new) + for result in (mem, duck): + stops = _get_file_diff(result, "stops.txt") + assert stops.file_action == "not_compared" + assert stops.not_compared_reason.code == "duplicate_primary_key" + assert _get_file_diff(result, "routes.txt").file_action == "modified" + + def test_duplicate_primary_key_summary_status_count_stats_and_columns( + self, tmp_path: Path + ): + base = { + "stops.txt": ( + "stop_id,stop_name,stop_lat\nS1,Alpha,1.0\nS1,Duplicate,1.1\n" + ) + } + new = { + "stops.txt": ( + "stop_id,stop_name,stop_lon,stop_desc\n" + "S1,Alpha,2.0,First\n" + "S2,Beta,3.0,Second\n" + "S3,Gamma,4.0,Third\n" + ) + } + + mem, duck = _diff_with_both_backends(tmp_path, base, new) + for result in (mem, duck): + fs = _get_file_summary(result, "stops.txt") + assert fs.status == "not_compared" + assert result.summary.files_not_compared_count == 1 + + fd = _get_file_diff(result, "stops.txt") + assert fd.file_action == "not_compared" + assert fd.stats.total_rows_base == 2 + assert fd.stats.total_rows_new == 3 + assert [c.name for c in fd.columns_added] == ["stop_lon", "stop_desc"] + assert [c.name for c in fd.columns_deleted] == ["stop_lat"] + + def test_fk_column_ignored_when_referenced_file_has_duplicate_pk( + self, tmp_path: Path + ): + assert "stops.txt" in get_foreign_keys("stop_times.txt")["stop_id"] + stop_times_header = ( + "trip_id,arrival_time,departure_time,stop_id,stop_sequence\n" + ) + base = { + "stops.txt": "stop_id,stop_name\nS1,Alpha\nS1,Duplicate\n", + "stop_times.txt": stop_times_header + "T1,08:00:00,08:00:00,S1,1\n", + } + new = { + "stops.txt": "stop_id,stop_name\nS1,Alpha\n", + "stop_times.txt": stop_times_header + "T1,08:05:00,08:00:00,S2,1\n", + } + + mem, duck = _diff_with_both_backends(tmp_path, base, new) + for result in (mem, 
duck): + stops = _get_file_diff(result, "stops.txt") + assert stops.file_action == "not_compared" + assert stops.not_compared_reason.code == "duplicate_primary_key" + + stop_times = _get_file_diff(result, "stop_times.txt") + assert stop_times.file_action == "modified" + assert stop_times.ignored_columns is not None + ignored = {ic.column: ic.reason.code for ic in stop_times.ignored_columns} + assert ignored == {"stop_id": "references_not_compared_file"} + changed_fields = { + fc.field for fc in stop_times.row_changes.modified[0].field_changes + } + assert changed_fields == {"arrival_time"} + + def test_duplicate_primary_key_reason_and_error_reexport(self): + reason = _duplicate_primary_key_reason(["stop_id"]) + assert reason.code == "duplicate_primary_key" + assert "stop_id" in reason.message + assert "found in the base or new feed" in reason.message + assert ( + "found in the base feed" + in _duplicate_primary_key_reason(["stop_id"], side="base").message + ) + assert ( + "found in the new feed" + in _duplicate_primary_key_reason(["stop_id"], side="new").message + ) + assert ( + "found in both the base and new feed" + in _duplicate_primary_key_reason(["stop_id"], side="both").message + ) + + error = DuplicatePrimaryKeyError( + "stops.txt", + ["stop_id"], + {"stop_id": "S1"}, + line_number=3, + first_line=2, + side="new", + ) + assert isinstance(error, ValueError) + assert issubclass(DuplicatePrimaryKeyError, ValueError) + assert DuplicatePrimaryKeyError is CsvDuplicatePrimaryKeyError + assert error.detail is not None + assert "stop_id" in error.detail + assert error.side == "new" + + def test_eligibility_rules(self): + assert not _eligible_for_duckdb( + "translations.txt", get_primary_key("translations.txt"), True + ) + assert _eligible_for_duckdb("stops.txt", get_primary_key("stops.txt"), True) + assert _eligible_for_duckdb( + "stop_times.txt", get_primary_key("stop_times.txt"), True + ) + assert not _eligible_for_duckdb("unknown.txt", [], False) + + def 
test_threshold_none_never_calls_duckdb_but_zero_does( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ): + base = _write_feed_dir( + tmp_path, "base", {"stops.txt": "stop_id,stop_name\nS1,Alpha\n"} + ) + new = _write_feed_dir( + tmp_path, "new", {"stops.txt": "stop_id,stop_name\nS1,Beta\n"} + ) + + original = engine_duckdb.diff_modified_duckdb + + def fail_if_called(*args, **kwargs): + raise AssertionError("DuckDB should not be called") + + monkeypatch.setattr(engine_duckdb, "diff_modified_duckdb", fail_if_called) + result = diff_feeds(base, new, large_file_threshold_bytes=None) + assert _get_file_diff(result, "stops.txt").file_action == "modified" + + calls = [] + monkeypatch.undo() + + def record_call(*args, **kwargs): + calls.append(kwargs["file_name"]) + return original(*args, **kwargs) + + monkeypatch.setattr(engine_duckdb, "diff_modified_duckdb", record_call) + result = diff_feeds(base, new, large_file_threshold_bytes=0) + assert _get_file_diff(result, "stops.txt").file_action == "modified" + assert calls == ["stops.txt"] + + def test_unknown_size_returns_none(self): + meta = FeedFileMeta(size=None, local_path="unused.txt") + assert ( + _maybe_diff_modified_duckdb( + file_name="stops.txt", + pk_def=get_primary_key("stops.txt"), + pk_is_explicit=True, + base_meta=meta, + new_meta=meta, + large_file_threshold_bytes=0, + use_duckdb=True, + row_changes_cap=None, + id_churn_threshold=0.7, + not_compared_files={}, + ) + is None + ) + + def test_duckdb_unavailable_falls_back_to_in_memory( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ): + base_files = {"stops.txt": "stop_id,stop_name\nS1,Alpha\n"} + new_files = {"stops.txt": "stop_id,stop_name\nS1,Beta\nS2,Gamma\n"} + base = _write_feed_dir(tmp_path, "base", base_files) + new = _write_feed_dir(tmp_path, "new", new_files) + mem = diff_feeds(base, new, large_file_threshold_bytes=None) + monkeypatch.setattr(engine_duckdb, "is_duckdb_available", lambda: False) + fallback = diff_feeds(base, new, 
large_file_threshold_bytes=0) + assert _sorted_diff(mem) == _sorted_diff(fallback) + + +class TestDuckDBSpillBase: + def test_unset_env_returns_none(self, monkeypatch: pytest.MonkeyPatch): + monkeypatch.delenv(DUCKDB_TMPDIR_ENV, raising=False) + + assert _resolve_spill_base() is None + + def test_empty_env_returns_none(self, monkeypatch: pytest.MonkeyPatch): + monkeypatch.setenv(DUCKDB_TMPDIR_ENV, "") + + assert _resolve_spill_base() is None + + def test_whitespace_env_returns_none(self, monkeypatch: pytest.MonkeyPatch): + monkeypatch.setenv(DUCKDB_TMPDIR_ENV, " ") + + assert _resolve_spill_base() is None + + def test_existing_directory_returns_stripped_path( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ): + monkeypatch.setenv(DUCKDB_TMPDIR_ENV, f" {tmp_path} ") + + assert _resolve_spill_base() == str(tmp_path) + + def test_nested_directory_is_created( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ): + spill_base = tmp_path / "nested" / "spill" + monkeypatch.setenv(DUCKDB_TMPDIR_ENV, str(spill_base)) + + assert _resolve_spill_base() == str(spill_base) + assert os.path.isdir(spill_base) + + def test_leading_tilde_is_expanded( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ): + fake_home = tmp_path / "home" + + def fake_expanduser(path: str) -> str: + return path.replace("~", str(fake_home), 1) + + monkeypatch.setattr(os.path, "expanduser", fake_expanduser) + monkeypatch.setenv(DUCKDB_TMPDIR_ENV, "~/some_subdir_unlikely") + + resolved = _resolve_spill_base() + + assert resolved is not None + assert not resolved.startswith("~") + assert resolved.startswith(str(fake_home)) + assert os.path.isdir(resolved) + + def test_spill_dir_removed_when_connect_fails( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ): + # Regression: if duckdb.connect() raises, the just-created spill dir must + # still be cleaned up rather than leaking an empty folder. 
+ spill_base = tmp_path / "spill" + spill_base.mkdir() + monkeypatch.setenv(DUCKDB_TMPDIR_ENV, str(spill_base)) + + import duckdb + + def boom(*args, **kwargs): + raise RuntimeError("connect failed") + + monkeypatch.setattr(duckdb, "connect", boom) + + with pytest.raises(RuntimeError, match="connect failed"): + engine_duckdb.diff_modified_duckdb( + file_name="stops.txt", + base_path=str(tmp_path / "missing_base.txt"), + new_path=str(tmp_path / "missing_new.txt"), + pk_cols=["stop_id"], + row_changes_cap=None, + id_churn_threshold=0.7, + not_compared_files={}, + ) + + # No gtfs_duckdb_* spill directory should remain under the base. + assert list(spill_base.glob("gtfs_duckdb_*")) == [] + + +class TestDuckDBRemoteUrl: + """The DuckDB backend reads remote files in place via httpfs (no download).""" + + def test_open_remote_feed_sets_url_on_meta(self, monkeypatch: pytest.MonkeyPatch): + monkeypatch.setattr("gtfs_diff.engine._http_exists", lambda url: True) + monkeypatch.setattr("gtfs_diff.engine._http_content_length", lambda url: 12_345) + handle = _open_remote_feed("https://x/base", ["stops.txt"]) + meta = handle.meta["stops.txt"] + assert meta.url == "https://x/base/stops.txt" + assert meta.local_path is None + assert meta.size == 12_345 + + def test_materialized_path_yields_url_without_downloading(self): + def boom(dest: str) -> None: + raise AssertionError("a URL must never be materialized to disk") + + meta = FeedFileMeta(size=1, url="https://x/base/stops.txt", materialize=boom) + with _materialized_path(meta) as path: + assert path == "https://x/base/stops.txt" + + def test_is_remote(self): + assert engine_duckdb._is_url("https://x/base/stops.txt") + assert engine_duckdb._is_url("http://x/base/stops.txt") + assert not engine_duckdb._is_url("/tmp/base/stops.txt") + + def test_read_headers_via_duckdb_strips_bom_and_whitespace(self, tmp_path: Path): + import duckdb + + p = tmp_path / "stops.txt" + p.write_text("\ufeff stop_id , stop_name \nS1,Alpha\n", 
encoding="utf-8") + con = duckdb.connect() + try: + assert engine_duckdb._read_headers_via_duckdb(con, str(p)) == [ + "stop_id", + "stop_name", + ] + finally: + con.close() + + def test_duckdb_receives_url_and_never_downloads( + self, monkeypatch: pytest.MonkeyPatch + ): + from gtfs_diff.models import FileDiff, FileStats, FileSummary + + monkeypatch.setattr("gtfs_diff.engine._http_exists", lambda url: True) + monkeypatch.setattr("gtfs_diff.engine._http_content_length", lambda url: 10_000) + + def no_download(url: str, dest: str) -> None: + raise AssertionError("remote DuckDB path must not download to a temp file") + + monkeypatch.setattr("gtfs_diff.engine._http_stream_to_file", no_download) + + captured: dict = {} + + def fake_duckdb(**kwargs): + captured.update(kwargs) + fd = FileDiff( + file_name=kwargs["file_name"], + file_action="modified", + columns_added=[], + columns_deleted=[], + stats=FileStats(columns_added_count=0, columns_deleted_count=0), + ) + return fd, FileSummary(file_name=kwargs["file_name"], status="modified") + + monkeypatch.setattr(engine_duckdb, "diff_modified_duckdb", fake_duckdb) + + diff_feeds( + "https://x/base", + "https://x/new", + files=["stops.txt"], + large_file_threshold_bytes=0, + ) + + # The raw URLs (not a staged temp path) are handed straight to DuckDB. + assert captured["base_path"] == "https://x/base/stops.txt" + assert captured["new_path"] == "https://x/new/stops.txt" + + def test_remote_duckdb_reads_via_httpfs_live(self, tmp_path: Path): + """End-to-end: DuckDB reads a real local HTTP URL via httpfs. + + Skipped when the httpfs extension cannot be installed/loaded (e.g. no + network and not already cached). 
+ """ + import functools + import http.server + import socketserver + import threading + + duckdb = pytest.importorskip("duckdb") + con = duckdb.connect() + try: + con.execute("INSTALL httpfs") + con.execute("LOAD httpfs") + except Exception: + pytest.skip("httpfs extension unavailable (no network / not cached)") + finally: + con.close() + + base = _write_feed_dir( + tmp_path, + "base", + {"stops.txt": "stop_id,stop_name\nS1,Alpha\nS2,Beta\nS3,Gamma\n"}, + ) + new = _write_feed_dir( + tmp_path, + "new", + {"stops.txt": "stop_id,stop_name\nS1,Alpha\nS2,Renamed\nS4,Delta\n"}, + ) + + handler = functools.partial( + http.server.SimpleHTTPRequestHandler, directory=str(tmp_path) + ) + httpd = socketserver.TCPServer(("127.0.0.1", 0), handler) + port = httpd.server_address[1] + thread = threading.Thread(target=httpd.serve_forever, daemon=True) + thread.start() + try: + remote = diff_feeds( + f"http://127.0.0.1:{port}/base", + f"http://127.0.0.1:{port}/new", + files=["stops.txt"], + large_file_threshold_bytes=0, + ) + finally: + httpd.shutdown() + + # Parity: reading the same files over httpfs must match the local diff. + local = diff_feeds(base, new, large_file_threshold_bytes=0) + assert _sorted_diff(remote) == _sorted_diff(local) + + +class TestDuckDBResourceCleanup: + """The DuckDB backend frees per-file tables and spill on completion.""" + + def test_no_spill_dir_leaks_and_no_tmp_in_cwd( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ): + import glob + import tempfile as _tempfile + + # Run from an isolated CWD so DuckDB's default ``.tmp`` spill dir would + # be visible here if temp_directory were not redirected. 
+ workdir = tmp_path / "cwd" + workdir.mkdir() + monkeypatch.chdir(workdir) + + base = _write_feed_dir( + tmp_path, "base", {"stops.txt": "stop_id,stop_name\nS1,A\nS2,B\n"} + ) + new = _write_feed_dir( + tmp_path, "new", {"stops.txt": "stop_id,stop_name\nS1,A\nS2,C\n"} + ) + + pattern = str(Path(_tempfile.gettempdir()) / "gtfs_duckdb_*") + before = set(glob.glob(pattern)) + + result = diff_feeds(base, new, large_file_threshold_bytes=0) + + assert _get_file_diff(result, "stops.txt").file_action == "modified" + # The managed spill directory must be removed after the diff. + assert set(glob.glob(pattern)) == before + # DuckDB's default CWD spill directory must never be created. + assert not (workdir / ".tmp").exists()