diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..f18f0f2 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,81 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + +jobs: + detect-changes: + name: Detect changed paths + runs-on: ubuntu-latest + permissions: + pull-requests: read + outputs: + src: ${{ steps.filter.outputs.src }} + models: ${{ steps.filter.outputs.models }} + steps: + - uses: actions/checkout@v6 + - uses: dorny/paths-filter@v4 + id: filter + with: + filters: | + src: + - 'src/**' + - 'tests/**' + - 'pyproject.toml' + models: + - 'src/gtfs_diff/schema.conf' + - 'scripts/generate_models.sh' + - 'src/gtfs_diff/models.py' + - '.github/workflows/ci.yml' + + test: + needs: + - detect-changes + - lint + name: Run tests on Python ${{ matrix.python-version }} + if: needs.detect-changes.outputs.src == 'true' + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10", "3.12", "3.14"] + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + - run: pip install -e '.[dev]' + - run: pytest --tb=short + + lint: + needs: detect-changes + name: Lint and format check + if: needs.detect-changes.outputs.src == 'true' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-python@v6 + with: + python-version: "3.12" + - run: pip install -e '.[dev]' + - run: ./scripts/lint.sh + + models-freshness: + needs: detect-changes + name: Check models.py is up to date + if: needs.detect-changes.outputs.models == 'true' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-python@v6 + with: + python-version: "3.12" + - run: pip install 'datamodel-code-generator[ruff]>=0.59' + - run: ./scripts/generate_models.sh + - name: Check models.py is up to date + run: | + if ! git diff --exit-code src/gtfs_diff/models.py; then + echo "::error::src/gtfs_diff/models.py is out of date with the schema. Run ./scripts/generate_models.sh to regenerate." + exit 1 + fi diff --git a/pyproject.toml b/pyproject.toml index e5c7f60..86fb16d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,8 @@ dependencies = [ dev = [ "pytest>=7.0", "pytest-cov", + "datamodel-code-generator[ruff]>=0.59", + "ruff>=0.11", ] [project.scripts] @@ -27,3 +29,21 @@ packages = ["src/gtfs_diff"] [tool.pytest.ini_options] testpaths = ["tests"] + +[tool.ruff] +target-version = "py310" +src = ["src", "tests"] +exclude = ["src/gtfs_diff/models.py"] + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "F", # pyflakes + "I", # isort + "UP", # pyupgrade + "B", # flake8-bugbear + "SIM", # flake8-simplify +] + +[tool.ruff.format] +exclude = ["src/gtfs_diff/models.py"] diff --git a/scripts/generate_models.sh b/scripts/generate_models.sh new file mode 100755 index 0000000..59a394a --- /dev/null +++ b/scripts/generate_models.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# Generate Pydantic v2 models from the GTFS Diff JSON Schema. +# +# Usage: +# ./scripts/generate_models.sh # use version from src/gtfs_diff/schema.conf +# ./scripts/generate_models.sh v2-rc1 # fetch a specific version from GitHub +# ./scripts/generate_models.sh /path/to/local.json # use a local file +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)" +OUTPUT="$REPO_ROOT/src/gtfs_diff/models.py" +SCHEMA_REPO="MobilityData/gtfs-diff" +SCHEMA_BRANCH="main" + +# --- Resolve input: argument > schema.conf ----------------------------------- +SCHEMA_CONF="$REPO_ROOT/src/gtfs_diff/schema.conf" +if [ $# -ge 1 ]; then + INPUT="$1" +elif [ -f "$SCHEMA_CONF" ]; then + # shellcheck source=../src/gtfs_diff/schema.conf + source "$SCHEMA_CONF" + INPUT="${SCHEMA_VERSION:?SCHEMA_VERSION not set in schema.conf}" + echo "Using version from schema.conf: $INPUT" +else + echo "Usage: $0 []" >&2 + echo " e.g. $0 v2-rc1" >&2 + echo " or set SCHEMA_VERSION in src/gtfs_diff/schema.conf" >&2 + exit 1 +fi + +# --- Resolve schema source --------------------------------------------------- +if [ -f "$INPUT" ]; then + SCHEMA_FILE="$INPUT" + echo "Using local schema: $SCHEMA_FILE" +else + VERSION="$INPUT" + SCHEMA_URL="https://raw.githubusercontent.com/$SCHEMA_REPO/$SCHEMA_BRANCH/spec/v2/json_schema/${VERSION}.json" + SCHEMA_FILE="$(mktemp "${TMPDIR:-/tmp}/gtfs_diff_schema_XXXXXX.json")" + trap 'rm -f "$SCHEMA_FILE"' EXIT + + echo "Fetching schema from $SCHEMA_URL ..." + curl -fsSL "$SCHEMA_URL" -o "$SCHEMA_FILE" +fi + +# --- Ensure datamodel-code-generator is available ---------------------------- +if ! command -v datamodel-codegen &>/dev/null; then + echo "Installing datamodel-code-generator ..." + pip install 'datamodel-code-generator[ruff]>=0.59' +fi + +# --- Generate ---------------------------------------------------------------- +echo "Generating models → $OUTPUT" +datamodel-codegen \ + --formatters ruff-format ruff-check \ + --input "$SCHEMA_FILE" \ + --input-file-type jsonschema \ + --output-model-type pydantic_v2.BaseModel \ + --target-python-version 3.10 \ + --use-standard-collections \ + --use-union-operator \ + --snake-case-field \ + --collapse-root-models \ + --enum-field-as-literal all \ + --field-constraints \ + --use-schema-description \ + --extra-fields allow \ + --class-name GtfsDiff \ + --output "$OUTPUT" + +# --- Post-process: replace header, append __all__ ---------------------------- +# Replace the codegen header with a clear auto-generated notice. +{ + echo "# AUTO-GENERATED — DO NOT EDIT" + echo "# This file is generated from the GTFS Diff JSON Schema." + echo "# To regenerate: ./scripts/generate_models.sh" + echo "# Schema source: https://github.com/$SCHEMA_REPO" + echo "" + # Strip the original codegen comment block (everything before the first blank line). + sed -n '/^$/,$p' "$OUTPUT" +} > "$OUTPUT.tmp" && mv "$OUTPUT.tmp" "$OUTPUT" + +# Collect class names and append __all__. +CLASSES=$(grep -oE '^class ([A-Za-z_][A-Za-z0-9_]*)' "$OUTPUT" | awk '{print $2}') +{ + echo "" + echo "" + echo "__all__ = [" + for cls in $CLASSES; do + echo " \"$cls\"," + done + echo "]" +} >> "$OUTPUT" + +COUNT=$(echo "$CLASSES" | wc -w | tr -d ' ') +echo "Done — $COUNT model(s) generated." diff --git a/scripts/lint-fix.sh b/scripts/lint-fix.sh new file mode 100755 index 0000000..2c450de --- /dev/null +++ b/scripts/lint-fix.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +# Auto-fix lint violations and reformat code. +# Usage: ./scripts/lint-fix.sh +set -euo pipefail + +echo "Fixing lint violations ..." +ruff check --fix src/ tests/ + +echo "Formatting ..." +ruff format src/ tests/ + +echo "Done." diff --git a/scripts/lint.sh b/scripts/lint.sh new file mode 100755 index 0000000..25a957f --- /dev/null +++ b/scripts/lint.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +# Check lint and formatting (exits non-zero on violations). +# Usage: ./scripts/lint.sh +set -euo pipefail + +echo "Checking lint rules ..." +ruff check src/ tests/ + +echo "Checking formatting ..." +ruff format --check src/ tests/ + +echo "All clean." diff --git a/src/gtfs_diff/cli.py b/src/gtfs_diff/cli.py index cb2fa35..5371563 100644 --- a/src/gtfs_diff/cli.py +++ b/src/gtfs_diff/cli.py @@ -13,16 +13,33 @@ @click.version_option(version="0.1.0", prog_name="gtfs-diff-engine") @click.argument("base_feed", type=click.Path(exists=True, path_type=Path)) @click.argument("new_feed", type=click.Path(exists=True, path_type=Path)) -@click.option("--output", "-o", type=click.Path(path_type=Path), default=None, - help="Write JSON output to FILE instead of stdout.") -@click.option("--cap", "-c", type=int, default=None, - help="Max row changes per file (0 = omit row-level detail).") -@click.option("--pretty/--no-pretty", default=True, - help="Pretty-print JSON (default: --pretty).") -@click.option("--base-downloaded-at", default=None, - help="ISO 8601 datetime for when base was downloaded.") -@click.option("--new-downloaded-at", default=None, - help="ISO 8601 datetime for when new was downloaded.") +@click.option( + "--output", + "-o", + type=click.Path(path_type=Path), + default=None, + help="Write JSON output to FILE instead of stdout.", +) +@click.option( + "--cap", + "-c", + type=int, + default=None, + help="Max row changes per file (0 = omit row-level detail).", +) +@click.option( + "--pretty/--no-pretty", default=True, help="Pretty-print JSON (default: --pretty)." +) +@click.option( + "--base-downloaded-at", + default=None, + help="ISO 8601 datetime for when base was downloaded.", +) +@click.option( + "--new-downloaded-at", + default=None, + help="ISO 8601 datetime for when new was downloaded.", +) def main( base_feed: Path, new_feed: Path, @@ -38,8 +55,12 @@ def main( NEW_FEED: path to the new GTFS feed (zip or directory) """ try: - base_dt = datetime.fromisoformat(base_downloaded_at) if base_downloaded_at else None - new_dt = datetime.fromisoformat(new_downloaded_at) if new_downloaded_at else None + base_dt = ( + datetime.fromisoformat(base_downloaded_at) if base_downloaded_at else None + ) + new_dt = ( + datetime.fromisoformat(new_downloaded_at) if new_downloaded_at else None + ) except ValueError as exc: click.echo(f"Error: {exc}", err=True) sys.exit(1) @@ -55,7 +76,8 @@ def main( except MissingPrimaryKeyError as exc: click.echo( f"ERROR: Cannot process '{exc.file_name}' — " - f"required primary key column(s) {exc.missing_columns} are missing from the file headers.\n" + f"required primary key column(s) {exc.missing_columns} " + f"are missing from the file headers.\n" f"Headers found: {exc.headers}", err=True, ) @@ -64,7 +86,10 @@ def main( click.echo(f"Error: {exc}", err=True) sys.exit(1) - json_str = result.model_dump_json(indent=2, exclude_none=True) if pretty else result.model_dump_json(exclude_none=True) + if pretty: + json_str = result.model_dump_json(indent=2, exclude_none=True) + else: + json_str = result.model_dump_json(exclude_none=True) if output is not None: try: diff --git a/src/gtfs_diff/engine.py b/src/gtfs_diff/engine.py index b273d08..ebced6c 100644 --- a/src/gtfs_diff/engine.py +++ b/src/gtfs_diff/engine.py @@ -6,20 +6,23 @@ (line_number, raw_csv_string) for every row in each file. For typical transit feeds this is fine. For very large feeds (stop_times.txt can exceed 10 M rows) a disk-backed index (e.g. SQLite) would be more appropriate; that is left as a -future optimisation. +future optimization. """ from __future__ import annotations +import configparser import csv import io import sys import time import zipfile +from collections.abc import Callable, Generator from contextlib import contextmanager from datetime import datetime, timezone +from importlib import resources from pathlib import Path -from typing import Callable, Generator, TextIO +from typing import TextIO from .gtfs_definitions import get_primary_key from .models import ( @@ -27,6 +30,7 @@ FeedSource, FieldChange, FileDiff, + FileStats, FileSummary, GtfsDiff, Metadata, @@ -39,27 +43,40 @@ UnsupportedFile, ) -SCHEMA_VERSION = "2.0" + +def _read_schema_version() -> str: + conf_text = resources.files("gtfs_diff").joinpath("schema.conf").read_text() + parser = configparser.ConfigParser() + parser.read_string("[default]\n" + conf_text) + return parser.get("default", "SCHEMA_VERSION") class MissingPrimaryKeyError(ValueError): """Raised when a required primary key column is absent from a file's headers.""" - def __init__(self, file_name: str, missing_columns: list[str], headers: list[str]) -> None: + def __init__( + self, file_name: str, missing_columns: list[str], headers: list[str] + ) -> None: self.file_name = file_name self.missing_columns = missing_columns self.headers = headers super().__init__( - f"'{file_name}': required primary key column(s) {missing_columns} " - f"not found in headers {headers}." + f"'{file_name}': required primary key column(s) " + f"{missing_columns} not found in headers {headers}." ) def _trace(msg: str) -> None: """Print a timestamped progress message with current RSS to stderr.""" import psutil + rss_mb = psutil.Process().memory_info().rss / 1024 / 1024 - print(f"[gtfs-diff {datetime.now().strftime('%H:%M:%S')} {rss_mb:.0f}MB] {msg}", file=sys.stderr, flush=True) + print( + f"[gtfs-diff {datetime.now().strftime('%H:%M:%S')} {rss_mb:.0f}MB] {msg}", + file=sys.stderr, + flush=True, + ) + # A "lazy opener" maps a filename (e.g. "stops.txt") to a zero-arg callable # that opens the file and returns a text stream. @@ -70,8 +87,9 @@ def _trace(msg: str) -> None: # Low-level CSV helpers # --------------------------------------------------------------------------- + def _row_to_csv(values: list[str]) -> str: - """Serialise a list of string values to a single CSV line (no trailing newline).""" + """Serialize a list of string values to a single CSV line (no trailing newline).""" buf = io.StringIO() writer = csv.writer(buf, lineterminator="") writer.writerow(values) @@ -134,13 +152,15 @@ def _read_csv_index( if len(row) < n: row = row + [""] * (n - len(row)) row_vals = row[:n] - row_dict = dict(zip(headers, row_vals)) + row_dict = dict(zip(headers, row_vals, strict=True)) pk_tuple = tuple(row_dict.get(col, "") for col in effective_pk) if pk_tuple in index: raise ValueError( - f"{file_name}: duplicate primary key {dict(zip(effective_pk, pk_tuple))} " - f"at line {line_num} (first seen at line {index[pk_tuple][0]})." + f"{file_name}: duplicate primary key " + f"{dict(zip(effective_pk, pk_tuple, strict=True))} " + f"at line {line_num} " + f"(first seen at line {index[pk_tuple][0]})." ) index[pk_tuple] = (line_num, _row_to_csv(row_vals)) @@ -157,7 +177,7 @@ def _parse_raw_line(raw_line: str, headers: list[str]) -> dict[str, str]: return {col: "" for col in headers} if len(row) < len(headers): row = row + [""] * (len(headers) - len(row)) - return dict(zip(headers, row)) + return dict(zip(headers, row, strict=True)) def _values_differ(a: str, b: str) -> bool: @@ -197,6 +217,7 @@ def _compute_raw_value( # Feed opener # --------------------------------------------------------------------------- + @contextmanager def _open_feed(path: str | Path) -> Generator[LazyOpeners, None, None]: """Open a GTFS feed (zip archive or directory) and yield lazy file openers. @@ -214,8 +235,10 @@ def _open_feed(path: str | Path) -> Generator[LazyOpeners, None, None]: if path.is_dir(): openers: LazyOpeners = {} for txt_file in sorted(path.glob("*.txt")): + def _make_opener(p: Path) -> Callable[[], TextIO]: return lambda: p.open(encoding="utf-8-sig") + openers[txt_file.name] = _make_opener(txt_file) yield openers @@ -232,8 +255,10 @@ def _make_opener(p: Path) -> Callable[[], TextIO]: openers = {} for basename, internal_path in name_map.items(): + def _make_opener(ip: str) -> Callable[[], TextIO]: # type: ignore[misc] return lambda: io.TextIOWrapper(zf.open(ip), encoding="utf-8-sig") + openers[basename] = _make_opener(internal_path) yield openers finally: @@ -249,6 +274,7 @@ def _make_opener(ip: str) -> Callable[[], TextIO]: # type: ignore[misc] # Per-file diff # --------------------------------------------------------------------------- + def _diff_file( file_name: str, base_opener: Callable[[], TextIO] | None, @@ -272,23 +298,18 @@ def _diff_file_added( with new_opener() as f: new_headers = _read_headers(f) columns_added = [ - ColumnEntry(name=col, position=i + 1) - for i, col in enumerate(new_headers) + ColumnEntry(name=col, position=i + 1) for i, col in enumerate(new_headers) ] file_diff = FileDiff( file_name=file_name, file_action="added", columns_added=columns_added, columns_deleted=[], - row_changes=None, - truncated=None, - ) - summary = FileSummary( - file_name=file_name, - status="added", - columns_added_count=len(columns_added), - columns_deleted_count=0, + stats=FileStats( + columns_added_count=len(columns_added), columns_deleted_count=0 + ), ) + summary = FileSummary(file_name=file_name, status="added") return file_diff, summary @@ -300,23 +321,18 @@ def _diff_file_deleted( with base_opener() as f: base_headers = _read_headers(f) columns_deleted = [ - ColumnEntry(name=col, position=i + 1) - for i, col in enumerate(base_headers) + ColumnEntry(name=col, position=i + 1) for i, col in enumerate(base_headers) ] file_diff = FileDiff( file_name=file_name, file_action="deleted", columns_added=[], columns_deleted=columns_deleted, - row_changes=None, - truncated=None, - ) - summary = FileSummary( - file_name=file_name, - status="deleted", - columns_added_count=0, - columns_deleted_count=len(columns_deleted), + stats=FileStats( + columns_added_count=0, columns_deleted_count=len(columns_deleted) + ), ) + summary = FileSummary(file_name=file_name, status="deleted") return file_diff, summary @@ -373,7 +389,8 @@ def _scan_modifications( shared_cols = [col for col in base_headers if col in set(new_headers)] candidates: list[tuple[tuple, list[FieldChange], int, int]] = [] - _trace(f" [{file_name}] scanning {len(common_keys):,} common rows for modifications...") + n = len(common_keys) + _trace(f" [{file_name}] scanning {n:,} common rows...") t0 = time.monotonic() for pk_tuple in common_keys: b_line, b_raw = base_index[pk_tuple] @@ -388,7 +405,10 @@ def _scan_modifications( if field_changes: candidates.append((pk_tuple, field_changes, b_line, n_line)) - _trace(f" [{file_name}] scan done in {time.monotonic()-t0:.1f}s — {len(candidates):,} modified") + _trace( + f" [{file_name}] scan done in {time.monotonic() - t0:.1f}s — " + f"{len(candidates):,} modified" + ) return candidates @@ -414,16 +434,24 @@ def _diff_file_modified( _trace(f" [{file_name}] indexing base feed...") t0 = time.monotonic() base_headers, base_index = _read_csv_index(f, pk_cols, file_name=file_name) - _trace(f" [{file_name}] base index done: {len(base_index):,} rows in {time.monotonic()-t0:.1f}s") + _trace( + f" [{file_name}] base index done: {len(base_index):,} " + f"rows in {time.monotonic() - t0:.1f}s" + ) with new_opener() as f: _trace(f" [{file_name}] indexing new feed...") t0 = time.monotonic() new_headers, new_index = _read_csv_index(f, pk_cols, file_name=file_name) - _trace(f" [{file_name}] new index done: {len(new_index):,} rows in {time.monotonic()-t0:.1f}s") + _trace( + f" [{file_name}] new index done: {len(new_index):,} " + f"rows in {time.monotonic() - t0:.1f}s" + ) # Column-level diff - columns_added, columns_deleted, union_columns = _diff_columns(base_headers, new_headers) + columns_added, columns_deleted, union_columns = _diff_columns( + base_headers, new_headers + ) # Row-level diff base_keys = set(base_index) @@ -439,7 +467,12 @@ def _diff_file_modified( file_name, common_keys, base_index, new_index, base_headers, new_headers ) true_modified = len(modified_candidates) - _trace(f" [{file_name}] row diff summary — added={true_added:,} deleted={true_deleted:,} modified={true_modified:,}") + _trace( + f" [{file_name}] row diff summary — " + f"added={true_added:,} " + f"deleted={true_deleted:,} " + f"modified={true_modified:,}" + ) # Determine row-changes output based on cap. # cap=0 means "summary counts only" — row_changes is omitted from the output @@ -467,7 +500,9 @@ def _remaining(used: int) -> int | None: identifier = {col: n_dict.get(col, "") for col in pk_cols} raw_value = _compute_raw_value(n_dict, union_columns, new_header_set) added_rows.append( - RowAdded(identifier=identifier, raw_value=raw_value, new_line_number=n_line) + RowAdded( + identifier=identifier, raw_value=raw_value, new_line_number=n_line + ) ) # Fill deleted rows up to remaining cap. @@ -478,16 +513,20 @@ def _remaining(used: int) -> int | None: identifier = {col: b_dict.get(col, "") for col in pk_cols} raw_value = _compute_raw_value(b_dict, union_columns, base_header_set) deleted_rows.append( - RowDeleted(identifier=identifier, raw_value=raw_value, base_line_number=b_line) + RowDeleted( + identifier=identifier, raw_value=raw_value, base_line_number=b_line + ) ) # Fill modified rows up to remaining cap. modified_limit = _remaining(len(added_rows) + len(deleted_rows)) - for pk_tuple, field_changes, b_line, n_line in modified_candidates[:modified_limit]: - n_raw = new_index[pk_tuple][1] - n_dict = _parse_raw_line(n_raw, new_headers) - identifier = {col: n_dict.get(col, "") for col in pk_cols} - raw_value = _compute_raw_value(n_dict, union_columns, new_header_set) + for pk_tuple, field_changes, b_line, n_line in modified_candidates[ + :modified_limit + ]: + b_raw = base_index[pk_tuple][1] + b_dict = _parse_raw_line(b_raw, base_headers) + identifier = {col: b_dict.get(col, "") for col in pk_cols} + raw_value = _compute_raw_value(b_dict, union_columns, base_header_set) modified_rows.append( RowModified( identifier=identifier, @@ -501,7 +540,9 @@ def _remaining(used: int) -> int | None: total_included = len(added_rows) + len(deleted_rows) + len(modified_rows) total_true = true_added + true_deleted + true_modified if cap is not None and total_true > cap: - truncated = Truncated(is_truncated=True, omitted_count=total_true - total_included) + truncated = Truncated( + is_truncated=True, omitted_count=total_true - total_included + ) row_changes: RowChanges | None = None if include_row_changes: @@ -522,16 +563,17 @@ def _remaining(used: int) -> int | None: columns_deleted=columns_deleted, row_changes=row_changes, truncated=truncated, + stats=FileStats( + total_rows_base=len(base_index), + total_rows_new=len(new_index), + columns_added_count=len(columns_added), + columns_deleted_count=len(columns_deleted), + rows_added_count=true_added, + rows_deleted_count=true_deleted, + rows_modified_count=true_modified, + ), ) - summary = FileSummary( - file_name=file_name, - status="modified", - columns_added_count=len(columns_added), - columns_deleted_count=len(columns_deleted), - rows_added_count=true_added, - rows_deleted_count=true_deleted, - rows_modified_count=true_modified, - ) + summary = FileSummary(file_name=file_name, status="modified") return file_diff, summary @@ -539,6 +581,7 @@ def _remaining(used: int) -> int | None: # Public API # --------------------------------------------------------------------------- + def diff_feeds( base_path: str | Path, new_path: str | Path, @@ -553,7 +596,8 @@ def diff_feeds( new_path: Path to the new GTFS feed — zip or directory. row_changes_cap_per_file: * ``None`` — include all row changes (default). - * ``0`` — omit all row-level detail (column diffs and counts still computed). + * ``0`` — omit all row-level detail (column diffs and counts + still computed). * ``N > 0`` — include up to *N* row changes per file (added first, then deleted, then modified); a :class:`Truncated` record is attached when the true count exceeds *N*. @@ -607,13 +651,15 @@ def diff_feeds( # Per spec: file_diffs[] contains only *changed* files. # Skip files present in both feeds with no actual changes. + stats = file_diff.stats if ( file_summary.status == "modified" - and not file_summary.columns_added_count - and not file_summary.columns_deleted_count - and not file_summary.rows_added_count - and not file_summary.rows_deleted_count - and not file_summary.rows_modified_count + and stats is not None + and not stats.columns_added_count + and not stats.columns_deleted_count + and not stats.rows_added_count + and not stats.rows_deleted_count + and not stats.rows_modified_count ): continue @@ -625,18 +671,21 @@ def diff_feeds( files_deleted = sum(1 for s in file_summaries if s.status == "deleted") files_modified = sum(1 for s in file_summaries if s.status == "modified") + def _stat(attr: str) -> int: + return sum(getattr(fd.stats, attr, 0) or 0 for fd in file_diffs if fd.stats) + total_changes = ( - sum(s.rows_added_count or 0 for s in file_summaries) - + sum(s.rows_deleted_count or 0 for s in file_summaries) - + sum(s.rows_modified_count or 0 for s in file_summaries) - + sum(s.columns_added_count or 0 for s in file_summaries) - + sum(s.columns_deleted_count or 0 for s in file_summaries) + _stat("rows_added_count") + + _stat("rows_deleted_count") + + _stat("rows_modified_count") + + _stat("columns_added_count") + + _stat("columns_deleted_count") + files_added + files_deleted ) metadata = Metadata( - schema_version=SCHEMA_VERSION, + schema_version=_read_schema_version(), generated_at=now, row_changes_cap_per_file=row_changes_cap_per_file, base_feed=FeedSource(source=str(base_path), downloaded_at=base_downloaded_at), @@ -648,8 +697,9 @@ def diff_feeds( files_added_count=files_added, files_deleted_count=files_deleted, files_modified_count=files_modified, + files_not_compared_count=0, files=file_summaries, ) result = GtfsDiff(metadata=metadata, summary=summary, file_diffs=file_diffs) _trace("diff_feeds complete") - return result \ No newline at end of file + return result diff --git a/src/gtfs_diff/gtfs_definitions.py b/src/gtfs_diff/gtfs_definitions.py index 1fd0853..0190b51 100644 --- a/src/gtfs_diff/gtfs_definitions.py +++ b/src/gtfs_diff/gtfs_definitions.py @@ -25,17 +25,37 @@ "calendar_dates.txt": ["service_id", "date"], # Fares v1 "fare_attributes.txt": ["fare_id"], - "fare_rules.txt": ["fare_id", "route_id", "origin_id", "destination_id", "contains_id"], + "fare_rules.txt": [ + "fare_id", + "route_id", + "origin_id", + "destination_id", + "contains_id", + ], # Shapes / frequencies / transfers "shapes.txt": ["shape_id", "shape_pt_sequence"], "frequencies.txt": ["trip_id", "start_time"], - "transfers.txt": ["from_stop_id", "to_stop_id", "from_route_id", "to_route_id", "from_trip_id", "to_trip_id"], + "transfers.txt": [ + "from_stop_id", + "to_stop_id", + "from_route_id", + "to_route_id", + "from_trip_id", + "to_trip_id", + ], # Pathways / levels "pathways.txt": ["pathway_id"], "levels.txt": ["level_id"], # Feed metadata "feed_info.txt": [], # single-row file, no primary key - "translations.txt": ["table_name", "field_name", "language", "record_id", "record_sub_id", "field_value"], + "translations.txt": [ + "table_name", + "field_name", + "language", + "record_id", + "record_sub_id", + "field_value", + ], "attributions.txt": ["attribution_id"], # Areas "areas.txt": ["area_id"], @@ -47,7 +67,12 @@ "fare_media.txt": ["fare_media_id"], "fare_products.txt": ["fare_product_id"], "fare_leg_rules.txt": ["leg_group_id"], # partial key, best effort - "fare_transfer_rules.txt": ["from_leg_group_id", "to_leg_group_id", "transfer_count", "duration_limit"], + "fare_transfer_rules.txt": [ + "from_leg_group_id", + "to_leg_group_id", + "transfer_count", + "duration_limit", + ], "timeframes.txt": ["timeframe_group_id", "start_time", "end_time", "service_id"], # Rider categories / booking "rider_categories.txt": ["rider_category_id"], @@ -61,5 +86,6 @@ def get_primary_key(file_name: str) -> list[str] | None: - """Return the primary key columns for a supported GTFS file, or None if unsupported.""" + """Return the primary key columns for a supported GTFS file, + or None if unsupported.""" return GTFS_PRIMARY_KEYS.get(file_name) diff --git a/src/gtfs_diff/models.py b/src/gtfs_diff/models.py index 6665e5e..b8eb026 100644 --- a/src/gtfs_diff/models.py +++ b/src/gtfs_diff/models.py @@ -1,178 +1,316 @@ -"""Pydantic v2 models for the GTFS Diff v2 output format.""" +# AUTO-GENERATED — DO NOT EDIT +# This file is generated from the GTFS Diff JSON Schema. +# To regenerate: ./scripts/generate_models.sh +# Schema source: https://github.com/MobilityData/gtfs-diff -from __future__ import annotations - -from datetime import datetime -from typing import Literal, Optional - -from pydantic import BaseModel, ConfigDict, Field - - -class ColumnEntry(BaseModel): - """A column that was added or deleted, with its name and original position.""" - - model_config = ConfigDict(populate_by_name=True) - - name: str - position: int = Field(..., ge=1) +from __future__ import annotations -class FeedSource(BaseModel): - """Identifies a GTFS feed by its source URL and the time it was downloaded.""" - - model_config = ConfigDict(populate_by_name=True) +from typing import Literal - source: str - downloaded_at: datetime +from pydantic import AwareDatetime, BaseModel, ConfigDict, Field class UnsupportedFile(BaseModel): - """A file present in one or both feeds that the diff engine does not support.""" - - model_config = ConfigDict(populate_by_name=True) + model_config = ConfigDict( + extra="allow", + ) + file_name: str = Field(..., description="File name as it appears in the archive.") + present_in: Literal["base", "new", "both"] = Field( + ..., description="Which archive(s) contain this file." + ) - file_name: str - present_in: Literal["base", "new", "both"] +class ColumnEntry(BaseModel): + model_config = ConfigDict( + extra="allow", + ) + name: str = Field(..., description="Column name.") + position: int = Field( + ..., description="1-based position of this column in the CSV header row.", ge=1 + ) -class Metadata(BaseModel): - """Top-level metadata describing the diff run and both feed sources.""" - - model_config = ConfigDict(populate_by_name=True) - schema_version: str - generated_at: datetime - row_changes_cap_per_file: Optional[int] = Field(None, ge=0) - base_feed: FeedSource - new_feed: FeedSource - unsupported_files: list[UnsupportedFile] +class FeedSource(BaseModel): + model_config = ConfigDict( + extra="allow", + ) + source: str = Field(..., description="URL or local path to the GTFS archive.") + downloaded_at: AwareDatetime = Field( + ..., description="ISO 8601 timestamp of when the feed was downloaded." + ) class FileSummary(BaseModel): - """High-level change counts for a single GTFS file.""" - - model_config = ConfigDict(populate_by_name=True) - - file_name: str - status: Literal["added", "deleted", "modified"] - columns_added_count: Optional[int] = Field(None, ge=0) - columns_deleted_count: Optional[int] = Field(None, ge=0) - rows_added_count: Optional[int] = Field(None, ge=0) - rows_deleted_count: Optional[int] = Field(None, ge=0) - rows_modified_count: Optional[int] = Field(None, ge=0) - - -class Summary(BaseModel): - """Aggregate change counts across all GTFS files in the diff.""" - - model_config = ConfigDict(populate_by_name=True) - - total_changes: int = Field(..., ge=0) - files_added_count: int = Field(..., ge=0) - files_deleted_count: int = Field(..., ge=0) - files_modified_count: int = Field(..., ge=0) - files: list[FileSummary] - - -class FieldChange(BaseModel): - """The before and after values for a single field within a modified row.""" + model_config = ConfigDict( + extra="allow", + ) + file_name: str = Field(..., description="Name of the GTFS file.") + status: Literal["added", "deleted", "modified", "not_compared"] = Field( + ..., description="File-level status." + ) - model_config = ConfigDict(populate_by_name=True) - field: str - base_value: str - new_value: str +class Truncated(BaseModel): + model_config = ConfigDict( + extra="allow", + ) + is_truncated: Literal[True] = Field(..., description="Always true when present.") + omitted_count: int = Field( + ..., description="Number of row changes omitted due to the cap.", ge=1 + ) + + +class NotComparedReason(BaseModel): + model_config = ConfigDict( + extra="allow", + ) + code: str = Field( + ..., + description='Machine-readable reason code (e.g. "id_churn", "missing_primary_key", "file_too_large").', + ) + message: str = Field( + ..., + description="Human-readable explanation of why the file or column was not compared.", + ) + + +class IgnoredColumn(BaseModel): + model_config = ConfigDict( + extra="allow", + ) + column: str = Field(..., description="The column name that was ignored.") + reason: NotComparedReason + + +class ColumnStat(BaseModel): + model_config = ConfigDict( + extra="allow", + ) + column: str = Field(..., description="The column name.") + modifications_count: int = Field( + ..., + description="Number of modified rows that had a change in this column.", + ge=0, + ) + modifications_percentage: float = Field( + ..., + description="modifications_count as a percentage of total modified rows.", + ge=0.0, + le=100.0, + ) class RowAdded(BaseModel): - """A row that exists only in the new feed.""" - - model_config = ConfigDict(populate_by_name=True) - - identifier: dict[str, str] - raw_value: str - new_line_number: int = Field(..., ge=1) + model_config = ConfigDict( + extra="allow", + ) + identifier: dict[str, str] = Field( + ..., description="Primary key values identifying this row." + ) + raw_value: str = Field( + ..., + description="The CSV row from the new file as a comma-separated string. Field order matches the columns array, which preserves the raw CSV column order of the base feed (new-feed-only columns appended).", + ) + new_line_number: int = Field( + ..., description="1-based line number of this row in the new CSV file.", ge=1 + ) class RowDeleted(BaseModel): - """A row that exists only in the base feed.""" + model_config = ConfigDict( + extra="allow", + ) + identifier: dict[str, str] = Field( + ..., description="Primary key values identifying this row." + ) + raw_value: str = Field( + ..., + description="The CSV row from the base file as a comma-separated string. Field order matches the columns array, which preserves the raw CSV column order of the base feed (new-feed-only columns appended).", + ) + base_line_number: int = Field( + ..., description="1-based line number of this row in the base CSV file.", ge=1 + ) - model_config = ConfigDict(populate_by_name=True) - identifier: dict[str, str] - raw_value: str - base_line_number: int = Field(..., ge=1) +class FieldChange(BaseModel): + model_config = ConfigDict( + extra="allow", + ) + field: str = Field(..., description="The column name that changed.") + base_value: str = Field(..., description="The value in the base feed.") + new_value: str = Field(..., description="The value in the new feed.") class RowModified(BaseModel): - """A row present in both feeds whose field values differ.""" + model_config = ConfigDict( + extra="allow", + ) + identifier: dict[str, str] = Field( + ..., description="Primary key values identifying this row." + ) + raw_value: str = Field( + ..., + description="The base CSV row as a comma-separated string. Field order matches the columns array, which preserves the raw CSV column order of the base feed (new-feed-only columns appended).", + ) + base_line_number: int = Field( + ..., description="1-based line number of this row in the base CSV file.", ge=1 + ) + new_line_number: int = Field( + ..., description="1-based line number of this row in the new CSV file.", ge=1 + ) + field_changes: list[FieldChange] = Field( + ..., description="List of field-level changes.", min_length=1 + ) - model_config = ConfigDict(populate_by_name=True) - - identifier: dict[str, str] - raw_value: str - base_line_number: int = Field(..., ge=1) - new_line_number: int = Field(..., ge=1) - field_changes: list[FieldChange] = Field(..., min_length=1) - - -class RowChanges(BaseModel): - """All row-level changes for a file, keyed by primary key columns.""" - model_config = ConfigDict(populate_by_name=True) - - primary_key: list[str] = Field(..., min_length=1) - columns: list[str] - added: list[RowAdded] - deleted: list[RowDeleted] - modified: list[RowModified] +class Metadata(BaseModel): + model_config = ConfigDict( + extra="allow", + ) + schema_version: str = Field(..., description="The version of the schema.") + generated_at: AwareDatetime = Field( + ..., description="ISO 8601 timestamp of when the diff was generated." + ) + row_changes_cap_per_file: int | None = Field( + ..., + description="Maximum number of row changes included per file in file_diffs. 0 means no row changes are included; null means no cap is applied.", + ge=0, + ) + base_feed: FeedSource + new_feed: FeedSource + unsupported_files: list[UnsupportedFile] -class Truncated(BaseModel): - """Indicates that row changes were capped and some were omitted.""" +class Summary(BaseModel): + model_config = ConfigDict( + extra="allow", + ) + total_changes: int = Field( + ..., description="Total number of changes across all files.", ge=0 + ) + files_added_count: int = Field(..., description="Number of files added.", ge=0) + files_deleted_count: int = Field(..., description="Number of files deleted.", ge=0) + files_modified_count: int = Field( + ..., description="Number of files modified.", ge=0 + ) + files_not_compared_count: int = Field( + ..., + description="Number of files that could not be meaningfully compared.", + ge=0, + ) + files: list[FileSummary] - model_config = ConfigDict(populate_by_name=True) - is_truncated: Literal[True] - omitted_count: int = Field(..., ge=1) +class RowChanges(BaseModel): + model_config = ConfigDict( + extra="allow", + ) + primary_key: list[str] = Field( + ..., description="Column(s) that uniquely identify a row.", min_length=1 + ) + columns: list[str] = Field( + ..., + description="Union of all columns across both base and new versions. Order matches the base feed's original column order; columns only in the new feed are appended.", + ) + added: list[RowAdded] = Field(..., description="Added rows (capped).") + deleted: list[RowDeleted] = Field(..., description="Deleted rows (capped).") + modified: list[RowModified] = Field(..., description="Modified rows (capped).") + + +class FileStats(BaseModel): + model_config = ConfigDict( + extra="allow", + ) + total_rows_base: int | None = Field( + None, description="Total number of rows in the base version of the file.", ge=0 + ) + total_rows_new: int | None = Field( + None, description="Total number of rows in the new version of the file.", ge=0 + ) + columns_added_count: int | None = Field( + None, description="Number of columns added.", ge=0 + ) + columns_deleted_count: int | None = Field( + None, description="Number of columns deleted.", ge=0 + ) + rows_added_count: int | None = Field( + None, description="True count of rows added.", ge=0 + ) + rows_deleted_count: int | None = Field( + None, description="True count of rows deleted.", ge=0 + ) + rows_modified_count: int | None = Field( + None, description="True count of rows modified.", ge=0 + ) + rows_changed_percentage: float | None = Field( + None, + description="Percentage of rows that were added, deleted, or modified relative to the larger of the two versions.", + ge=0.0, + le=100.0, + ) + column_stats: list[ColumnStat] | None = Field( + None, + description="Per-column modification statistics. Only covers modified rows.", + ) class FileDiff(BaseModel): - """Complete diff for a single GTFS file, including column and row changes.""" - - model_config = ConfigDict(populate_by_name=True) - - file_name: str - file_action: Literal["modified", "added", "deleted"] - columns_added: list[ColumnEntry] - columns_deleted: list[ColumnEntry] - row_changes: Optional[RowChanges] = None - truncated: Optional[Truncated] = None + model_config = ConfigDict( + extra="allow", + ) + file_name: str = Field(..., description="Name of the GTFS file.") + file_action: Literal["modified", "added", "deleted", "not_compared"] = Field( + ..., description="Action describing how this file changed." + ) + not_compared_reason: NotComparedReason | None = None + ignored_columns: list[IgnoredColumn] | None = Field( + None, + description="Columns excluded from the diff because their values are unreliable (e.g. they reference a file that was not compared).", + ) + columns_added: list[ColumnEntry] | None = Field( + None, + description="Columns added to this file, in the order they appear in the new file's CSV header. Each entry includes the column name and its 1-based position in that header.", + ) + columns_deleted: list[ColumnEntry] | None = Field( + None, + description="Columns deleted from this file, in the order they appeared in the base file's CSV header. Each entry includes the column name and its 1-based position in that header.", + ) + row_changes: RowChanges | None = None + truncated: Truncated | None = None + stats: FileStats | None = None class GtfsDiff(BaseModel): - """Root model for the GTFS Diff v2 output format.""" - - model_config = ConfigDict(populate_by_name=True) + """ + Schema for GTFS Diff v2 output: a single JSON document describing all differences between two GTFS archives. + """ + model_config = ConfigDict( + extra="allow", + ) metadata: Metadata summary: Summary file_diffs: list[FileDiff] __all__ = [ + "UnsupportedFile", "ColumnEntry", "FeedSource", - "UnsupportedFile", - "Metadata", "FileSummary", - "Summary", - "FieldChange", + "Truncated", + "NotComparedReason", + "IgnoredColumn", + "ColumnStat", "RowAdded", "RowDeleted", + "FieldChange", "RowModified", + "Metadata", + "Summary", "RowChanges", - "Truncated", + "FileStats", "FileDiff", "GtfsDiff", ] diff --git a/src/gtfs_diff/schema.conf b/src/gtfs_diff/schema.conf new file mode 100644 index 0000000..f64bca1 --- /dev/null +++ b/src/gtfs_diff/schema.conf @@ -0,0 +1,3 @@ +# Schema version used to generate src/gtfs_diff/models.py +# Run ./scripts/generate_models.sh to regenerate after changing this. +SCHEMA_VERSION=v2-rc1 diff --git a/tests/conftest.py b/tests/conftest.py index 7accda8..26ab1bc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,7 +8,6 @@ from tests.helpers import write_zip - # --------------------------------------------------------------------------- # Reusable fixtures # --------------------------------------------------------------------------- diff --git a/tests/test_cli.py b/tests/test_cli.py index 4f8164b..e78eece 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -5,11 +5,9 @@ import json from pathlib import Path -import pytest from click.testing import CliRunner from gtfs_diff.cli import main - from tests.helpers import write_zip STOPS_HEADER = "stop_id,stop_name,stop_lat,stop_lon\n" @@ -89,35 +87,56 @@ def test_cap_stored_in_metadata(self, tmp_path: Path): class TestMissingPrimaryKeyError: def test_exits_nonzero_on_missing_pk_column(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": "stop_name,stop_lat,stop_lon\nStop One,1.0,2.0\n", # stop_id absent - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": "stop_name,stop_lat,stop_lon\n" + "Stop One,1.0,2.0\n", # stop_id absent + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + }, + ) runner = CliRunner() result = runner.invoke(main, [str(base), str(new)]) assert result.exit_code == 1 def test_error_message_names_file_and_missing_column(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": "stop_name,stop_lat,stop_lon\nStop One,1.0,2.0\n", # stop_id absent - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": "stop_name,stop_lat,stop_lon\n" + "Stop One,1.0,2.0\n", # stop_id absent + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + }, + ) runner = CliRunner() result = runner.invoke(main, [str(base), str(new)]) assert "stops.txt" in result.output assert "stop_id" in result.output def test_error_message_includes_headers_found(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": "stop_name,stop_lat,stop_lon\nStop One,1.0,2.0\n", # stop_id absent - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": "stop_name,stop_lat,stop_lon\n" + "Stop One,1.0,2.0\n", # stop_id absent + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + }, + ) runner = CliRunner() result = runner.invoke(main, [str(base), str(new)]) assert "stop_name" in result.output diff --git a/tests/test_engine.py b/tests/test_engine.py index 35378f7..383ae6f 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -8,14 +8,13 @@ from gtfs_diff.engine import MissingPrimaryKeyError, diff_feeds from gtfs_diff.models import GtfsDiff - from tests.helpers import write_zip - # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- + def _get_file_diff(result: GtfsDiff, file_name: str): """Return the FileDiff for a given file name, or raise.""" for fd in result.file_diffs: @@ -38,15 +37,22 @@ def _get_file_summary(result: GtfsDiff, file_name: str): # File-level tests # --------------------------------------------------------------------------- + class TestFileAdded: def test_file_added(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - "routes.txt": "route_id,route_short_name\nR1,Route 1\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + "routes.txt": "route_id,route_short_name\nR1,Route 1\n", + }, + ) result = diff_feeds(base, new) fd = _get_file_diff(result, "routes.txt") assert fd.file_action == "added" @@ -56,13 +62,19 @@ def test_file_added(self, tmp_path: Path): assert "route_short_name" in column_names def test_file_added_summary_status(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - "routes.txt": "route_id,route_short_name\nR1,Route 1\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + "routes.txt": "route_id,route_short_name\nR1,Route 1\n", + }, + ) result = diff_feeds(base, new) fs = _get_file_summary(result, "routes.txt") assert fs.status == "added" @@ -71,13 +83,19 @@ def test_file_added_summary_status(self, tmp_path: Path): class TestFileDeleted: def test_file_deleted(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - "routes.txt": "route_id,route_short_name\nR1,Route 1\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + "routes.txt": "route_id,route_short_name\nR1,Route 1\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + }, + ) result = diff_feeds(base, new) fd = _get_file_diff(result, "routes.txt") assert fd.file_action == "deleted" @@ -86,13 +104,19 @@ def test_file_deleted(self, tmp_path: Path): assert "route_id" in column_names def test_file_deleted_summary_status(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - "routes.txt": "route_id,route_short_name\nR1,Route 1\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + "routes.txt": "route_id,route_short_name\nR1,Route 1\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + }, + ) result = diff_feeds(base, new) fs = _get_file_summary(result, "routes.txt") assert fs.status == "deleted" @@ -114,25 +138,40 @@ def test_identical_feeds(self, tmp_path: Path): # Row-level tests # --------------------------------------------------------------------------- + class TestRowsAdded: def test_rows_added_count(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\nS2,Stop Two,3.0,4.0\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": STOPS_HEADER + + "S1,Stop One,1.0,2.0\nS2,Stop Two,3.0,4.0\n", + }, + ) result = diff_feeds(base, new) - fs = _get_file_summary(result, "stops.txt") - assert fs.rows_added_count == 1 + fd = _get_file_diff(result, "stops.txt") + assert fd.stats.rows_added_count == 1 def test_rows_added_identifier(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\nS2,Stop Two,3.0,4.0\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": STOPS_HEADER + + "S1,Stop One,1.0,2.0\nS2,Stop Two,3.0,4.0\n", + }, + ) result = diff_feeds(base, new) fd = _get_file_diff(result, "stops.txt") assert len(fd.row_changes.added) == 1 @@ -141,23 +180,37 @@ def test_rows_added_identifier(self, tmp_path: Path): class TestRowsDeleted: def test_rows_deleted_count(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\nS2,Stop Two,3.0,4.0\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": STOPS_HEADER + + "S1,Stop One,1.0,2.0\nS2,Stop Two,3.0,4.0\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + }, + ) result = diff_feeds(base, new) - fs = _get_file_summary(result, "stops.txt") - assert fs.rows_deleted_count == 1 + fd = _get_file_diff(result, "stops.txt") + assert fd.stats.rows_deleted_count == 1 def test_rows_deleted_identifier(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\nS2,Stop Two,3.0,4.0\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": STOPS_HEADER + + "S1,Stop One,1.0,2.0\nS2,Stop Two,3.0,4.0\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + }, + ) result = diff_feeds(base, new) fd = _get_file_diff(result, "stops.txt") assert len(fd.row_changes.deleted) == 1 @@ -166,23 +219,35 @@ def test_rows_deleted_identifier(self, tmp_path: Path): class TestRowsModified: def test_rows_modified_count(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One Renamed,1.0,2.0\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One Renamed,1.0,2.0\n", + }, + ) result = diff_feeds(base, new) - fs = _get_file_summary(result, "stops.txt") - assert fs.rows_modified_count == 1 + fd = _get_file_diff(result, "stops.txt") + assert fd.stats.rows_modified_count == 1 def test_rows_modified_field_changes(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One Renamed,1.0,2.0\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One Renamed,1.0,2.0\n", + }, + ) result = diff_feeds(base, new) fd = _get_file_diff(result, "stops.txt") assert len(fd.row_changes.modified) == 1 @@ -190,7 +255,9 @@ def test_rows_modified_field_changes(self, tmp_path: Path): assert mod.identifier == {"stop_id": "S1"} field_names = [fc.field for fc in mod.field_changes] assert "stop_name" in field_names - stop_name_change = next(fc for fc in mod.field_changes if fc.field == "stop_name") + stop_name_change = next( + fc for fc in mod.field_changes if fc.field == "stop_name" + ) assert stop_name_change.base_value == "Stop One" assert stop_name_change.new_value == "Stop One Renamed" @@ -224,12 +291,20 @@ def test_swapped_row_order_is_not_a_change(self, tmp_path: Path): # Swapping two rows must not produce any diff. # NOTE: should a row reorder be reported as a structural change even when # no field values differ? This is currently an open design question. - base = write_zip(tmp_path / "base.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\nS2,Stop Two,3.0,4.0\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": STOPS_HEADER + "S2,Stop Two,3.0,4.0\nS1,Stop One,1.0,2.0\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": STOPS_HEADER + + "S1,Stop One,1.0,2.0\nS2,Stop Two,3.0,4.0\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": STOPS_HEADER + + "S2,Stop Two,3.0,4.0\nS1,Stop One,1.0,2.0\n", + }, + ) result = diff_feeds(base, new) assert result.file_diffs == [] assert result.summary.total_changes == 0 @@ -237,18 +312,22 @@ def test_swapped_row_order_is_not_a_change(self, tmp_path: Path): def test_trailing_zeros_in_coordinates_are_not_a_change(self, tmp_path: Path): # A producer may write '-73.55625' in one version and '-73.556250' in the # next. These are numerically identical and must not be reported as a diff. - base = write_zip(tmp_path / "base.zip", { - "shapes.txt": ( - "shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence\n" - "11071,45.518332,-73.55625,150001\n" - ), - }) - new = write_zip(tmp_path / "new.zip", { - "shapes.txt": ( - "shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence\n" - "11071,45.518332,-73.556250,150001\n" - ), - }) + base = write_zip( + tmp_path / "base.zip", + { + "shapes.txt": ( + "shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence\n11071,45.518332,-73.55625,150001\n" + ), + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "shapes.txt": ( + "shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence\n11071,45.518332,-73.556250,150001\n" + ), + }, + ) result = diff_feeds(base, new) assert result.file_diffs == [] assert result.summary.total_changes == 0 @@ -258,12 +337,18 @@ def test_swapped_column_order_is_not_a_change(self, tmp_path: Path): # Swapping two columns must not produce any diff. # NOTE: should a column reorder be reported as a structural change even when # no field values differ? This is currently an open design question. - base = write_zip(tmp_path / "base.zip", { - "stops.txt": "stop_id,stop_name,stop_lat\nS1,Stop One,1.0\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": "stop_name,stop_id,stop_lat\nStop One,S1,1.0\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": "stop_id,stop_name,stop_lat\nS1,Stop One,1.0\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": "stop_name,stop_id,stop_lat\nStop One,S1,1.0\n", + }, + ) result = diff_feeds(base, new) assert result.file_diffs == [] assert result.summary.total_changes == 0 @@ -273,26 +358,39 @@ def test_swapped_column_order_is_not_a_change(self, tmp_path: Path): # Column-level tests # --------------------------------------------------------------------------- + class TestColumnAdded: def test_column_added(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": "stop_id,stop_name\nS1,Stop One\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": "stop_id,stop_name,stop_desc\nS1,Stop One,A description\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": "stop_id,stop_name\nS1,Stop One\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": "stop_id,stop_name,stop_desc\nS1,Stop One,A description\n", + }, + ) result = diff_feeds(base, new) fd = _get_file_diff(result, "stops.txt") added_names = [c.name for c in fd.columns_added] assert "stop_desc" in added_names def test_column_added_position(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": "stop_id,stop_name\nS1,Stop One\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": "stop_id,stop_name,stop_desc\nS1,Stop One,A description\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": "stop_id,stop_name\nS1,Stop One\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": "stop_id,stop_name,stop_desc\nS1,Stop One,A description\n", + }, + ) result = diff_feeds(base, new) fd = _get_file_diff(result, "stops.txt") stop_desc_col = next(c for c in fd.columns_added if c.name == "stop_desc") @@ -301,24 +399,36 @@ def test_column_added_position(self, tmp_path: Path): class TestColumnDeleted: def test_column_deleted(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": "stop_id,stop_name,stop_desc\nS1,Stop One,A description\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": "stop_id,stop_name\nS1,Stop One\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": "stop_id,stop_name,stop_desc\nS1,Stop One,A description\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": "stop_id,stop_name\nS1,Stop One\n", + }, + ) result = diff_feeds(base, new) fd = _get_file_diff(result, "stops.txt") deleted_names = [c.name for c in fd.columns_deleted] assert "stop_desc" in deleted_names def test_column_deleted_position(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": "stop_id,stop_name,stop_desc\nS1,Stop One,A description\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": "stop_id,stop_name\nS1,Stop One\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": "stop_id,stop_name,stop_desc\nS1,Stop One,A description\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": "stop_id,stop_name\nS1,Stop One\n", + }, + ) result = diff_feeds(base, new) fd = _get_file_diff(result, "stops.txt") stop_desc_col = next(c for c in fd.columns_deleted if c.name == "stop_desc") @@ -329,6 +439,7 @@ def test_column_deleted_position(self, tmp_path: Path): # Cap tests # --------------------------------------------------------------------------- + def _make_stops_csv(n: int) -> str: header = "stop_id,stop_name\n" rows = "".join(f"S{i},Stop {i}\n" for i in range(1, n + 1)) @@ -337,12 +448,18 @@ def _make_stops_csv(n: int) -> str: class TestCapZero: def test_cap_zero_omits_row_changes(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": "stop_id,stop_name\nS1,Stop One\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": "stop_id,stop_name\nS1,Stop One\nS2,Stop Two\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": "stop_id,stop_name\nS1,Stop One\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": "stop_id,stop_name\nS1,Stop One\nS2,Stop Two\n", + }, + ) result = diff_feeds(base, new, row_changes_cap_per_file=0) fd = _get_file_diff(result, "stops.txt") assert fd.row_changes is None @@ -351,12 +468,19 @@ def test_cap_zero_omits_row_changes(self, tmp_path: Path): class TestCapLimits: def test_cap_limits_row_changes(self, tmp_path: Path): # 5 new rows added, cap = 3 → 3 included, omitted_count = 2 - base = write_zip(tmp_path / "base.zip", { - "stops.txt": "stop_id,stop_name\nS0,Stop Zero\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": "stop_id,stop_name\n" + "".join(f"S{i},Stop {i}\n" for i in range(1, 7)), - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": "stop_id,stop_name\nS0,Stop Zero\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": "stop_id,stop_name\n" + + "".join(f"S{i},Stop {i}\n" for i in range(1, 7)), + }, + ) result = diff_feeds(base, new, row_changes_cap_per_file=3) fd = _get_file_diff(result, "stops.txt") # Total included across added+deleted+modified <= 3 @@ -372,16 +496,25 @@ def test_cap_limits_row_changes(self, tmp_path: Path): def test_truncated_omitted_count_correct(self, tmp_path: Path): # 5 added rows, 0 deleted, 0 modified; cap = 3 → omitted = 2 - base = write_zip(tmp_path / "base.zip", { - "stops.txt": _make_stops_csv(0).replace("\n", "", 1), # header only - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": _make_stops_csv(0).replace("\n", "", 1), # header only + }, + ) # Write header-only base - base = write_zip(tmp_path / "base2.zip", { - "stops.txt": "stop_id,stop_name\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": _make_stops_csv(5), - }) + base = write_zip( + tmp_path / "base2.zip", + { + "stops.txt": "stop_id,stop_name\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": _make_stops_csv(5), + }, + ) result = diff_feeds(base, new, row_changes_cap_per_file=3) fd = _get_file_diff(result, "stops.txt") assert len(fd.row_changes.added) == 3 @@ -390,12 +523,18 @@ def test_truncated_omitted_count_correct(self, tmp_path: Path): class TestCapNone: def test_cap_none_includes_all(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": "stop_id,stop_name\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": _make_stops_csv(5), - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": "stop_id,stop_name\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": _make_stops_csv(5), + }, + ) result = diff_feeds(base, new, row_changes_cap_per_file=None) fd = _get_file_diff(result, "stops.txt") assert len(fd.row_changes.added) == 5 @@ -406,58 +545,93 @@ def test_cap_none_includes_all(self, tmp_path: Path): # Missing primary key column # --------------------------------------------------------------------------- + class TestMissingPrimaryKeyError: def test_missing_pk_column_in_base_raises(self, tmp_path: Path): - """diff_feeds raises MissingPrimaryKeyError when the base feed is missing a required PK column.""" - base = write_zip(tmp_path / "base.zip", { - "stops.txt": "stop_name,stop_lat,stop_lon\nStop One,1.0,2.0\n", # stop_id absent - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - }) + """diff_feeds raises MissingPrimaryKeyError when the base feed + is missing a required PK column.""" + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": "stop_name,stop_lat,stop_lon\n" + "Stop One,1.0,2.0\n", # stop_id absent + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + }, + ) with pytest.raises(MissingPrimaryKeyError): diff_feeds(base, new) def test_missing_pk_column_in_new_raises(self, tmp_path: Path): - """diff_feeds raises MissingPrimaryKeyError when the new feed is missing a required PK column.""" - base = write_zip(tmp_path / "base.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": "stop_name,stop_lat,stop_lon\nStop One,1.0,2.0\n", # stop_id absent - }) + """diff_feeds raises MissingPrimaryKeyError when the new feed + is missing a required PK column.""" + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": "stop_name,stop_lat,stop_lon\n" + "Stop One,1.0,2.0\n", # stop_id absent + }, + ) with pytest.raises(MissingPrimaryKeyError): diff_feeds(base, new) def test_exception_carries_file_name(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": "stop_name,stop_lat,stop_lon\nStop One,1.0,2.0\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": "stop_name,stop_lat,stop_lon\nStop One,1.0,2.0\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + }, + ) with pytest.raises(MissingPrimaryKeyError) as exc_info: diff_feeds(base, new) assert exc_info.value.file_name == "stops.txt" def test_exception_carries_missing_columns(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": "stop_name,stop_lat,stop_lon\nStop One,1.0,2.0\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": "stop_name,stop_lat,stop_lon\nStop One,1.0,2.0\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + }, + ) with pytest.raises(MissingPrimaryKeyError) as exc_info: diff_feeds(base, new) assert "stop_id" in exc_info.value.missing_columns def test_exception_carries_headers(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": "stop_name,stop_lat,stop_lon\nStop One,1.0,2.0\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": "stop_name,stop_lat,stop_lon\nStop One,1.0,2.0\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + }, + ) with pytest.raises(MissingPrimaryKeyError) as exc_info: diff_feeds(base, new) assert exc_info.value.headers == ["stop_name", "stop_lat", "stop_lon"] @@ -467,31 +641,48 @@ def test_exception_carries_headers(self, tmp_path: Path): # Unsupported files # --------------------------------------------------------------------------- + class TestUnsupportedFile: def test_unsupported_file_in_metadata(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - "custom_data.txt": "foo,bar\n1,2\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - "custom_data.txt": "foo,bar\n1,2\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + "custom_data.txt": "foo,bar\n1,2\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + "custom_data.txt": "foo,bar\n1,2\n", + }, + ) result = diff_feeds(base, new) unsupported_names = [u.file_name for u in result.metadata.unsupported_files] assert "custom_data.txt" in unsupported_names def test_unsupported_file_present_in_both(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - "custom_data.txt": "foo,bar\n1,2\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - "custom_data.txt": "foo,bar\n1,2\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + "custom_data.txt": "foo,bar\n1,2\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + "custom_data.txt": "foo,bar\n1,2\n", + }, + ) result = diff_feeds(base, new) - uf = next(u for u in result.metadata.unsupported_files if u.file_name == "custom_data.txt") + uf = next( + u + for u in result.metadata.unsupported_files + if u.file_name == "custom_data.txt" + ) assert uf.present_in == "both" @@ -499,35 +690,50 @@ def test_unsupported_file_present_in_both(self, tmp_path: Path): # Schema validation (model round-trip) # --------------------------------------------------------------------------- + class TestOutputMatchesSchema: def test_output_matches_schema(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\nS2,Stop Two,3.0,4.0\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": STOPS_HEADER + "S1,Stop One,1.0,2.0\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": STOPS_HEADER + + "S1,Stop One,1.0,2.0\nS2,Stop Two,3.0,4.0\n", + }, + ) result = diff_feeds(base, new) json_str = result.model_dump_json() restored = GtfsDiff.model_validate_json(json_str) assert restored.summary.total_changes >= 1 - assert restored.metadata.schema_version == "2.0" + assert restored.metadata.schema_version == "v2-rc1" # --------------------------------------------------------------------------- # raw_value column ordering # --------------------------------------------------------------------------- + class TestRawValueColumnOrder: def test_raw_value_has_empty_for_base_only_column(self, tmp_path: Path): # base has stop_lat; new does NOT have stop_lat (deleted column). # Added rows in new should have empty string for stop_lat in raw_value. - base = write_zip(tmp_path / "base.zip", { - "stops.txt": "stop_id,stop_name,stop_lat\nS1,Stop One,1.0\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": "stop_id,stop_name\nS1,Stop One\nS2,Stop Two\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": "stop_id,stop_name,stop_lat\nS1,Stop One,1.0\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": "stop_id,stop_name\nS1,Stop One\nS2,Stop Two\n", + }, + ) result = diff_feeds(base, new) fd = _get_file_diff(result, "stops.txt") # S2 is an added row; union_columns = [stop_id, stop_name, stop_lat] @@ -536,7 +742,9 @@ def test_raw_value_has_empty_for_base_only_column(self, tmp_path: Path): added = fd.row_changes.added[0] assert added.identifier == {"stop_id": "S2"} # Parse raw_value CSV - import csv, io + import csv + import io + row = next(csv.reader(io.StringIO(added.raw_value))) # union_columns = base_headers + new_only_cols = [stop_id, stop_name, stop_lat] # stop_lat not in new → empty @@ -544,12 +752,19 @@ def test_raw_value_has_empty_for_base_only_column(self, tmp_path: Path): def test_union_columns_order_base_first(self, tmp_path: Path): # New feed has an extra column; union_columns must be base_headers + new_only - base = write_zip(tmp_path / "base.zip", { - "stops.txt": "stop_id,stop_name\nS1,Stop One\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": "stop_id,stop_name,stop_desc\nS1,Stop One,Desc One\nS2,Stop Two,Desc Two\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": "stop_id,stop_name\nS1,Stop One\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": "stop_id,stop_name,stop_desc\nS1,Stop One,Desc One\n" + "S2,Stop Two,Desc Two\n", + }, + ) result = diff_feeds(base, new) fd = _get_file_diff(result, "stops.txt") assert fd.row_changes.columns == ["stop_id", "stop_name", "stop_desc"] @@ -559,39 +774,58 @@ def test_union_columns_order_base_first(self, tmp_path: Path): # Line numbers # --------------------------------------------------------------------------- + class TestLineNumbers: def test_added_row_line_number(self, tmp_path: Path): # Header = line 1, first data row = line 2, second data row = line 3 - base = write_zip(tmp_path / "base.zip", { - "stops.txt": "stop_id,stop_name\nS1,Stop One\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": "stop_id,stop_name\nS1,Stop One\nS2,Stop Two\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": "stop_id,stop_name\nS1,Stop One\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": "stop_id,stop_name\nS1,Stop One\nS2,Stop Two\n", + }, + ) result = diff_feeds(base, new) fd = _get_file_diff(result, "stops.txt") assert len(fd.row_changes.added) == 1 assert fd.row_changes.added[0].new_line_number == 3 def test_deleted_row_line_number(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": "stop_id,stop_name\nS1,Stop One\nS2,Stop Two\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": "stop_id,stop_name\nS1,Stop One\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": "stop_id,stop_name\nS1,Stop One\nS2,Stop Two\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": "stop_id,stop_name\nS1,Stop One\n", + }, + ) result = diff_feeds(base, new) fd = _get_file_diff(result, "stops.txt") assert len(fd.row_changes.deleted) == 1 assert fd.row_changes.deleted[0].base_line_number == 3 def test_modified_row_line_numbers(self, tmp_path: Path): - base = write_zip(tmp_path / "base.zip", { - "stops.txt": "stop_id,stop_name\nS1,Stop One\nS2,Stop Two\n", - }) - new = write_zip(tmp_path / "new.zip", { - "stops.txt": "stop_id,stop_name\nS1,Stop One\nS2,Stop Two RENAMED\n", - }) + base = write_zip( + tmp_path / "base.zip", + { + "stops.txt": "stop_id,stop_name\nS1,Stop One\nS2,Stop Two\n", + }, + ) + new = write_zip( + tmp_path / "new.zip", + { + "stops.txt": "stop_id,stop_name\nS1,Stop One\nS2,Stop Two RENAMED\n", + }, + ) result = diff_feeds(base, new) fd = _get_file_diff(result, "stops.txt") assert len(fd.row_changes.modified) == 1 diff --git a/tests/test_models.py b/tests/test_models.py index 00bf7af..1a0a7da 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -9,15 +9,14 @@ from gtfs_diff.models import ( ColumnEntry, + FeedSource, FieldChange, FileDiff, + FileStats, FileSummary, GtfsDiff, Metadata, - FeedSource, - RowAdded, RowChanges, - RowDeleted, RowModified, Summary, Truncated, @@ -31,6 +30,7 @@ # Helpers to build valid model instances # --------------------------------------------------------------------------- + def _feed_source(url: str = "http://example.com/feed.zip") -> FeedSource: return FeedSource(source=url, downloaded_at=NOW) @@ -67,6 +67,7 @@ def _summary(**kwargs) -> Summary: files_added_count=0, files_deleted_count=0, files_modified_count=0, + files_not_compared_count=0, files=[], ) defaults.update(kwargs) @@ -87,6 +88,7 @@ def _gtfs_diff(**kwargs) -> GtfsDiff: # GtfsDiff round-trip # --------------------------------------------------------------------------- + class TestGtfsDiffRoundTrip: def test_round_trip_empty(self): obj = _gtfs_diff() @@ -117,6 +119,7 @@ def test_round_trip_json(self): # ColumnEntry # --------------------------------------------------------------------------- + class TestColumnEntry: def test_valid(self): col = ColumnEntry(name="stop_id", position=1) @@ -140,6 +143,7 @@ def test_position_one_accepted(self): # RowChanges # --------------------------------------------------------------------------- + class TestRowChanges: def test_valid(self): rc = RowChanges( @@ -166,6 +170,7 @@ def test_empty_primary_key_rejected(self): # RowModified # --------------------------------------------------------------------------- + class TestRowModified: def test_valid(self): rm = RowModified( @@ -194,30 +199,41 @@ def test_empty_field_changes_rejected(self): # FileSummary # --------------------------------------------------------------------------- + class TestFileSummary: - def test_all_optional_counts_none(self): + def test_valid(self): fs = FileSummary(file_name="stops.txt", status="modified") - assert fs.rows_added_count is None - assert fs.rows_deleted_count is None - assert fs.rows_modified_count is None - assert fs.columns_added_count is None - assert fs.columns_deleted_count is None + assert fs.file_name == "stops.txt" + assert fs.status == "modified" + + def test_not_compared_status(self): + fs = FileSummary(file_name="stops.txt", status="not_compared") + assert fs.status == "not_compared" + + +class TestFileStats: + def test_all_optional_counts_none(self): + stats = FileStats() + assert stats.rows_added_count is None + assert stats.rows_deleted_count is None + assert stats.rows_modified_count is None + assert stats.columns_added_count is None + assert stats.columns_deleted_count is None def test_with_counts(self): - fs = FileSummary( - file_name="stops.txt", - status="modified", + stats = FileStats( rows_added_count=3, rows_deleted_count=1, rows_modified_count=0, ) - assert fs.rows_added_count == 3 + assert stats.rows_added_count == 3 # --------------------------------------------------------------------------- # Truncated # --------------------------------------------------------------------------- + class TestTruncated: def test_valid(self): t = Truncated(is_truncated=True, omitted_count=5) @@ -237,6 +253,7 @@ def test_omitted_count_must_be_positive(self): # UnsupportedFile # --------------------------------------------------------------------------- + class TestUnsupportedFile: @pytest.mark.parametrize("present_in", ["base", "new", "both"]) def test_valid_present_in(self, present_in: str):