MobilityData · cka-y · Jun 11, 2026 · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026
diff --git a/README.md b/README.md
diff --git a/docs/architecture.md b/docs/architecture.md
diff --git a/pyproject.toml b/pyproject.toml
@@ -11,6 +11,7 @@ dependencies = [
     "pydantic>=2.0",
     "click>=8.0",
     "psutil>=5.9",
+    "duckdb>=1.0",
 ]
 
 [project.optional-dependencies]
@@ -19,6 +20,7 @@ dev = [
     "pytest-cov",
     "datamodel-code-generator[ruff]>=0.59",
     "ruff>=0.11",
+    "duckdb>=1.0",
 ]
 
 [project.scripts]

diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,7 @@
 annotated-types==0.7.0
 click==8.3.2
 coverage==7.13.5
+duckdb==1.5.3
 iniconfig==2.3.0
 packaging==26.1
 pluggy==1.6.0

diff --git a/src/gtfs_diff/cli.py b/src/gtfs_diff/cli.py
@@ -6,13 +6,24 @@
 
 import click
 
-from gtfs_diff.engine import MissingPrimaryKeyError, diff_feeds
+from gtfs_diff.engine import _is_url, diff_feeds
+from gtfs_diff.gtfs_definitions import DEFAULT_ID_CHURN_THRESHOLD
 
 
 @click.command()
 @click.version_option(version="0.1.0", prog_name="gtfs-diff-engine")
-@click.argument("base_feed", type=click.Path(exists=True, path_type=Path))
-@click.argument("new_feed", type=click.Path(exists=True, path_type=Path))
+@click.argument("base_feed", type=str)
+@click.argument("new_feed", type=str)
+@click.option(
+    "--files",
+    default=None,
+    metavar="NAMES",
+    help=(
+        "Comma-separated list of GTFS files to compare, e.g. "
+        "'stops.txt,trips.txt'. Optional: for folder URLs, omitting it "
+        "probes all known GTFS files; for local feeds it restricts the comparison."
+    ),
+)
 @click.option(
     "--output",
     "-o",
@@ -40,20 +51,88 @@
     default=None,
     help="ISO 8601 datetime for when new was downloaded.",
 )
+@click.option(
+    "--id-churn-threshold",
+    type=click.FloatRange(0.0, 1.0),
+    default=DEFAULT_ID_CHURN_THRESHOLD,
+    show_default=True,
+    help=(
+        "Primary-key churn ratio (0.0-1.0) above which a file is reported as "
+        "not_compared instead of diffed (detects regenerated ids)."
+    ),
+)
+@click.option(
+    "--id-churn-threshold-for",
+    type=(str, click.FloatRange(0.0, 1.0)),
+    multiple=True,
+    metavar="FILENAME RATIO",
+    help=(
+        "Per-file id-churn threshold override; repeatable. Takes precedence "
+        "over --id-churn-threshold. Example: "
+        "--id-churn-threshold-for shapes.txt 0.95"
+    ),
+)
+@click.option(
+    "--large-file-threshold-mb",
+    type=click.FloatRange(0.0),
+    default=50.0,
+    show_default=True,
+    help=(
+        "Files whose larger side is at least this many megabytes are diffed "
+        "with the built-in DuckDB backend (lower memory for very large files). "
+        "Use --no-duckdb to always use the in-memory engine."
+    ),
+)
+@click.option(
+    "--no-duckdb",
+    is_flag=True,
+    default=False,
+    help="Disable the DuckDB backend; always use the in-memory engine.",
+)
+@click.option(
+    "--column-stats/--no-column-stats",
+    default=True,
+    help=(
+        "Include per-column modification counts and percentages in each "
+        "modified file's stats (default: on). The file-level "
+        "rows_changed_percentage is always computed."
+    ),
+)
 def main(
-    base_feed: Path,
-    new_feed: Path,
+    base_feed: str,
+    new_feed: str,
+    files: str | None,
     output: Path | None,
     cap: int | None,
     pretty: bool,
     base_downloaded_at: str | None,
     new_downloaded_at: str | None,
+    id_churn_threshold: float,
+    id_churn_threshold_for: tuple[tuple[str, float], ...],
+    large_file_threshold_mb: float,
+    no_duckdb: bool,
+    column_stats: bool,
 ) -> None:
-    """Compare two GTFS feeds (zip or directory) and output a JSON diff.
+    """Compare two GTFS feeds and output a JSON diff.
 
-    BASE_FEED: path to the base GTFS feed (zip or directory)\n
-    NEW_FEED:  path to the new GTFS feed (zip or directory)
+    BASE_FEED: local path or http(s):// folder URL to the base GTFS feed\n
+    NEW_FEED:  local path or http(s):// folder URL to the new GTFS feed\n
+    Use optional --files with a comma-separated GTFS file list. For URLs,
+    omitting --files auto-discovers known GTFS files.
     """
+    base_is_url = _is_url(base_feed)
+    new_is_url = _is_url(new_feed)
+
+    base_path: str | Path = base_feed if base_is_url else Path(base_feed)
+    new_path: str | Path = new_feed if new_is_url else Path(new_feed)
+
+    if isinstance(base_path, Path) and not base_path.exists():
+        click.echo(f"Error: {base_path} does not exist.", err=True)
+        sys.exit(1)
+    if isinstance(new_path, Path) and not new_path.exists():
+        click.echo(f"Error: {new_path} does not exist.", err=True)
+        sys.exit(1)
+
     try:
         base_dt = (
             datetime.fromisoformat(base_downloaded_at) if base_downloaded_at else None
@@ -66,22 +145,24 @@ def main(
         sys.exit(1)
 
     try:
+        parsed_files = (
+            [f.strip() for f in files.split(",") if f.strip()] if files else None
+        )
+        large_file_threshold_bytes = (
+            None if no_duckdb else int(large_file_threshold_mb * 1024 * 1024)
+        )
         result = diff_feeds(
-            base_path=base_feed,
-            new_path=new_feed,
+            base_path=base_path,
+            new_path=new_path,
             row_changes_cap_per_file=cap,
             base_downloaded_at=base_dt,
             new_downloaded_at=new_dt,
+            id_churn_threshold=id_churn_threshold,
+            id_churn_thresholds=dict(id_churn_threshold_for),
+            files=parsed_files,
+            large_file_threshold_bytes=large_file_threshold_bytes,
+            column_stats=column_stats,
         )
-    except MissingPrimaryKeyError as exc:
-        click.echo(
-            f"ERROR: Cannot process '{exc.file_name}' — "
-            f"required primary key column(s) {exc.missing_columns} "
-            f"are missing from the file headers.\n"
-            f"Headers found: {exc.headers}",
-            err=True,
-        )
-        sys.exit(1)
     except Exception as exc:
         click.echo(f"Error: {exc}", err=True)
         sys.exit(1)