Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 148 additions & 14 deletions README.md

Large diffs are not rendered by default.

210 changes: 208 additions & 2 deletions docs/architecture.md

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ dependencies = [
"pydantic>=2.0",
"click>=8.0",
"psutil>=5.9",
"duckdb>=1.0",
]

[project.optional-dependencies]
Expand All @@ -19,6 +20,7 @@ dev = [
"pytest-cov",
"datamodel-code-generator[ruff]>=0.59",
"ruff>=0.11",
"duckdb>=1.0",
]

[project.scripts]
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
annotated-types==0.7.0
click==8.3.2
coverage==7.13.5
duckdb==1.5.3
iniconfig==2.3.0
packaging==26.1
pluggy==1.6.0
Expand Down
119 changes: 100 additions & 19 deletions src/gtfs_diff/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,24 @@

import click

from gtfs_diff.engine import MissingPrimaryKeyError, diff_feeds
from gtfs_diff.engine import _is_url, diff_feeds
from gtfs_diff.gtfs_definitions import DEFAULT_ID_CHURN_THRESHOLD


@click.command()
@click.version_option(version="0.1.0", prog_name="gtfs-diff-engine")
@click.argument("base_feed", type=click.Path(exists=True, path_type=Path))
@click.argument("new_feed", type=click.Path(exists=True, path_type=Path))
@click.argument("base_feed", type=str)
@click.argument("new_feed", type=str)
@click.option(
"--files",
default=None,
metavar="NAMES",
help=(
"Comma-separated list of GTFS files to compare, e.g. "
"'stops.txt,trips.txt'. Optional: for folder URLs, omitting it "
"probes all known GTFS files; for local feeds it restricts the comparison."
),
)
@click.option(
"--output",
"-o",
Expand Down Expand Up @@ -40,20 +51,88 @@
default=None,
help="ISO 8601 datetime for when new was downloaded.",
)
@click.option(
"--id-churn-threshold",
type=click.FloatRange(0.0, 1.0),
default=DEFAULT_ID_CHURN_THRESHOLD,
show_default=True,
help=(
"Primary-key churn ratio (0.0-1.0) above which a file is reported as "
"not_compared instead of diffed (detects regenerated ids)."
),
)
@click.option(
"--id-churn-threshold-for",
type=(str, click.FloatRange(0.0, 1.0)),
multiple=True,
metavar="FILENAME RATIO",
help=(
"Per-file id-churn threshold override; repeatable. Takes precedence "
"over --id-churn-threshold. Example: "
"--id-churn-threshold-for shapes.txt 0.95"
),
)
@click.option(
"--large-file-threshold-mb",
type=click.FloatRange(0.0),
default=50.0,
show_default=True,
help=(
"Files whose larger side is at least this many megabytes are diffed "
"with the built-in DuckDB backend (lower memory for very large files). "
"Use --no-duckdb to always use the in-memory engine."
),
)
@click.option(
"--no-duckdb",
is_flag=True,
default=False,
help="Disable the DuckDB backend; always use the in-memory engine.",
)
@click.option(
"--column-stats/--no-column-stats",
default=True,
help=(
"Include per-column modification counts and percentages in each "
"modified file's stats (default: on). The file-level "
"rows_changed_percentage is always computed."
),
)
def main(
base_feed: Path,
new_feed: Path,
base_feed: str,
new_feed: str,
files: str | None,
output: Path | None,
cap: int | None,
pretty: bool,
base_downloaded_at: str | None,
new_downloaded_at: str | None,
id_churn_threshold: float,
id_churn_threshold_for: tuple[tuple[str, float], ...],
large_file_threshold_mb: float,
no_duckdb: bool,
column_stats: bool,
) -> None:
"""Compare two GTFS feeds (zip or directory) and output a JSON diff.
"""Compare two GTFS feeds and output a JSON diff.

BASE_FEED: path to the base GTFS feed (zip or directory)\n
NEW_FEED: path to the new GTFS feed (zip or directory)
BASE_FEED: local path or http(s):// folder URL to the base GTFS feed\n
NEW_FEED: local path or http(s):// folder URL to the new GTFS feed\n
Use optional --files with a comma-separated GTFS file list. For URLs,
omitting --files auto-discovers known GTFS files.
"""
base_is_url = _is_url(base_feed)
new_is_url = _is_url(new_feed)

base_path: str | Path = base_feed if base_is_url else Path(base_feed)
new_path: str | Path = new_feed if new_is_url else Path(new_feed)

if isinstance(base_path, Path) and not base_path.exists():
click.echo(f"Error: {base_path} does not exist.", err=True)
sys.exit(1)
if isinstance(new_path, Path) and not new_path.exists():
click.echo(f"Error: {new_path} does not exist.", err=True)
sys.exit(1)

try:
base_dt = (
datetime.fromisoformat(base_downloaded_at) if base_downloaded_at else None
Expand All @@ -66,22 +145,24 @@ def main(
sys.exit(1)

try:
parsed_files = (
[f.strip() for f in files.split(",") if f.strip()] if files else None
)
large_file_threshold_bytes = (
None if no_duckdb else int(large_file_threshold_mb * 1024 * 1024)
)
result = diff_feeds(
base_path=base_feed,
new_path=new_feed,
base_path=base_path,
new_path=new_path,
row_changes_cap_per_file=cap,
base_downloaded_at=base_dt,
new_downloaded_at=new_dt,
id_churn_threshold=id_churn_threshold,
id_churn_thresholds=dict(id_churn_threshold_for),
files=parsed_files,
large_file_threshold_bytes=large_file_threshold_bytes,
column_stats=column_stats,
)
except MissingPrimaryKeyError as exc:
click.echo(
f"ERROR: Cannot process '{exc.file_name}' — "
f"required primary key column(s) {exc.missing_columns} "
f"are missing from the file headers.\n"
f"Headers found: {exc.headers}",
err=True,
)
sys.exit(1)
except Exception as exc:
click.echo(f"Error: {exc}", err=True)
sys.exit(1)
Expand Down
Loading