From f442d41c3b94a1d5e277beffa04c61a50a82b11c Mon Sep 17 00:00:00 2001 From: jcpitre Date: Mon, 8 Jun 2026 14:55:05 -0400 Subject: [PATCH 01/21] Preliminary code. --- .../src/pipeline_tasks.py | 28 +- .../gtfs_change_tracker/function_config.json | 28 ++ .../gtfs_change_tracker/requirements.txt | 23 ++ .../gtfs_change_tracker/requirements_dev.txt | 2 + .../gtfs_change_tracker/src/main.py | 274 ++++++++++++++++++ .../gtfs_change_tracker/tests/__init__.py | 0 .../gtfs_change_tracker/tests/test_main.py | 237 +++++++++++++++ functions-python/helpers/utils.py | 34 +++ 8 files changed, 625 insertions(+), 1 deletion(-) create mode 100644 functions-python/gtfs_change_tracker/function_config.json create mode 100644 functions-python/gtfs_change_tracker/requirements.txt create mode 100644 functions-python/gtfs_change_tracker/requirements_dev.txt create mode 100644 functions-python/gtfs_change_tracker/src/main.py create mode 100644 functions-python/gtfs_change_tracker/tests/__init__.py create mode 100644 functions-python/gtfs_change_tracker/tests/test_main.py diff --git a/functions-python/batch_process_dataset/src/pipeline_tasks.py b/functions-python/batch_process_dataset/src/pipeline_tasks.py index 9e739d318..b86eba5b7 100644 --- a/functions-python/batch_process_dataset/src/pipeline_tasks.py +++ b/functions-python/batch_process_dataset/src/pipeline_tasks.py @@ -8,7 +8,11 @@ from shared.database.database import with_db_session from shared.database_gen.sqlacodegen_models import Gtfsdataset -from shared.helpers.utils import create_http_task, create_http_pmtiles_builder_task +from shared.helpers.utils import ( + create_http_task, + create_http_pmtiles_builder_task, + create_http_gtfs_change_tracker_task, +) def create_http_reverse_geolocation_processor_task( @@ -136,3 +140,25 @@ def create_pipeline_tasks(dataset: Gtfsdataset, db_session: Session) -> None: f"routes.txt file size : {routes_file.file_size_bytes} bytes" f" and changed files: {changed_files}" ) + + # Create GTFS change tracker task when a previous dataset exists + previous_dataset = ( + db_session.query(Gtfsdataset) + .filter( + Gtfsdataset.feed_id == dataset.feed_id, + Gtfsdataset.id != dataset.id, + ) + .order_by(Gtfsdataset.downloaded_at.desc()) + .first() + ) + if previous_dataset: + create_http_gtfs_change_tracker_task( + feed_id=dataset.feed_id, + previous_dataset_id=previous_dataset.id, + current_dataset_id=dataset.id, + ) + else: + logging.info( + "Skipping change tracker task for dataset %s: no previous dataset found.", + dataset_stable_id, + ) diff --git a/functions-python/gtfs_change_tracker/function_config.json b/functions-python/gtfs_change_tracker/function_config.json new file mode 100644 index 000000000..e9e6713a6 --- /dev/null +++ b/functions-python/gtfs_change_tracker/function_config.json @@ -0,0 +1,28 @@ +{ + "name": "gtfs-change-tracker", + "description": "Tracks changes between two GTFS datasets and stores a structured changelog in GCS and the database", + "entry_point": "gtfs_change_tracker", + "timeout": 540, + "memory": "4Gi", + "trigger_http": true, + "include_folders": ["helpers"], + "include_api_folders": ["database_gen", "database", "common"], + "environment_variables": [ + { + "key": "DATASETS_BUCKET_NAME" + }, + { + "key": "LOGGING_LEVEL" + } + ], + "secret_environment_variables": [ + { + "key": "FEEDS_DATABASE_URL" + } + ], + "ingress_settings": "ALLOW_INTERNAL_AND_GCLB", + "max_instance_request_concurrency": 1, + "max_instance_count": 10, + "min_instance_count": 0, + "available_cpu": 2 +} diff --git a/functions-python/gtfs_change_tracker/requirements.txt b/functions-python/gtfs_change_tracker/requirements.txt new file mode 100644 index 000000000..d97b27fd5 --- /dev/null +++ b/functions-python/gtfs_change_tracker/requirements.txt @@ -0,0 +1,23 @@ +# Common packages +functions-framework==3.* +google-cloud-logging +psycopg2-binary==2.9.6 +urllib3~=2.6.3 +requests~=2.33.1 +attrs~=23.1.0 +certifi~=2025.8.3 + +# SQL Alchemy and Geo Alchemy +SQLAlchemy==2.0.23 +geoalchemy2==0.14.7 + +# Google specific packages for this function +flask +google-cloud-storage +google-cloud-tasks + +# GTFS diff engine +gtfs-diff-engine==0.1.0 + +# Configuration +python-dotenv==1.2.2 diff --git a/functions-python/gtfs_change_tracker/requirements_dev.txt b/functions-python/gtfs_change_tracker/requirements_dev.txt new file mode 100644 index 000000000..9fbb0d627 --- /dev/null +++ b/functions-python/gtfs_change_tracker/requirements_dev.txt @@ -0,0 +1,2 @@ +pytest~=7.4.3 +requests-mock diff --git a/functions-python/gtfs_change_tracker/src/main.py b/functions-python/gtfs_change_tracker/src/main.py new file mode 100644 index 000000000..4cdba092a --- /dev/null +++ b/functions-python/gtfs_change_tracker/src/main.py @@ -0,0 +1,274 @@ +# +# MobilityData 2025 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This module provides the GtfsChangeTracker Cloud Function. It orchestrates change tracking +# between two consecutive GTFS datasets: downloads both from GCS, computes a structured diff +# using gtfs-diff-engine, uploads the changelog JSON to GCS, and persists the record in the +# gtfs_dataset_changelog database table. +import logging +import os +import tempfile + +import flask +import functions_framework +import requests +from google.cloud import storage +from gtfs_diff.engine import diff_feeds +from sqlalchemy.dialects.postgresql import insert +from sqlalchemy.orm import Session + +from shared.database.database import with_db_session +from shared.database_gen.sqlacodegen_models import ( + GtfsDatasetChangelog, + Gtfsdataset, +) +from shared.helpers.logger import get_logger, init_logger + +init_logger() + + +@functions_framework.http +def gtfs_change_tracker(request: flask.Request) -> dict: + """ + HTTP entrypoint for the GTFS change tracker function. + + Expects a JSON body with: + feed_id – DB id of the GTFS feed (FK to gtfsfeed.id) + previous_dataset_id – DB id of the previous Gtfsdataset + current_dataset_id – DB id of the current Gtfsdataset + """ + payload = request.get_json(silent=True) or {} + feed_id = payload.get("feed_id") + previous_dataset_id = payload.get("previous_dataset_id") + current_dataset_id = payload.get("current_dataset_id") + + if not (feed_id and previous_dataset_id and current_dataset_id): + return { + "status": "error", + "error": "feed_id, previous_dataset_id, and current_dataset_id are required.", + } + + bucket_name = os.getenv("DATASETS_BUCKET_NAME") + if not bucket_name: + return { + "status": "error", + "error": "DATASETS_BUCKET_NAME environment variable is not set.", + } + + try: + tracker = GtfsChangeTracker( + feed_id=feed_id, + previous_dataset_id=previous_dataset_id, + current_dataset_id=current_dataset_id, + bucket_name=bucket_name, + ) + result = tracker.run() + return {"status": "success", **result} + except Exception as e: + logging.exception( + "Failed to generate changelog for %s -> %s", + previous_dataset_id, + current_dataset_id, + ) + return { + "status": "error", + "error": f"Failed to generate changelog: {e}", + } + + +class GtfsChangeTracker: + """ + Orchestrates GTFS change tracking between two consecutive datasets. + + Steps: + 1. Resolve both datasets and the feed stable_id from the database. + 2. Download both dataset zip archives to a temporary directory. + 3. Compute a structured diff using gtfs-diff-engine. + 4. Upload the changelog JSON to GCS at two paths (shared changelogs index + and per-dataset location). + 5. Upsert a row in gtfs_dataset_changelog. + """ + + def __init__( + self, + feed_id: str, + previous_dataset_id: str, + current_dataset_id: str, + bucket_name: str, + ): + self.feed_id = feed_id + self.previous_dataset_id = previous_dataset_id + self.current_dataset_id = current_dataset_id + self.bucket_name = bucket_name + self.logger = get_logger(GtfsChangeTracker.__name__, current_dataset_id) + + def run(self) -> dict: + """Execute the full change-tracking pipeline.""" + prev_dataset, curr_dataset, feed_stable_id = self._resolve_datasets() + + self.logger.info( + "Computing diff for feed %s: %s -> %s", + feed_stable_id, + prev_dataset.stable_id, + curr_dataset.stable_id, + ) + + with tempfile.TemporaryDirectory(prefix="gtfs_change_tracker_") as tmpdir: + prev_zip = os.path.join(tmpdir, "previous.zip") + curr_zip = os.path.join(tmpdir, "current.zip") + + self._download_zip(prev_dataset.hosted_url, prev_zip) + self._download_zip(curr_dataset.hosted_url, curr_zip) + + diff_result = diff_feeds(prev_zip, curr_zip) + + changelog_json = diff_result.model_dump_json(indent=2).encode("utf-8") + changelog_url = self._upload_changelog( + changelog_json, + feed_stable_id, + prev_dataset.stable_id, + curr_dataset.stable_id, + ) + + diff_summary = diff_result.summary.model_dump() + self._save_changelog_record( + changelog_url=changelog_url, diff_summary=diff_summary + ) + + self.logger.info("Changelog stored at %s", changelog_url) + return { + "message": "Changelog generated successfully.", + "changelog_url": changelog_url, + } + + @with_db_session + def _resolve_datasets(self, db_session: Session = None) -> tuple: + """ + Load both Gtfsdataset rows and return (previous_dataset, current_dataset, feed_stable_id). + """ + prev_dataset = ( + db_session.query(Gtfsdataset) + .filter(Gtfsdataset.id == self.previous_dataset_id) + .one_or_none() + ) + if prev_dataset is None: + raise ValueError(f"Previous dataset not found: {self.previous_dataset_id}") + + curr_dataset = ( + db_session.query(Gtfsdataset) + .filter(Gtfsdataset.id == self.current_dataset_id) + .one_or_none() + ) + if curr_dataset is None: + raise ValueError(f"Current dataset not found: {self.current_dataset_id}") + + if not prev_dataset.hosted_url: + raise ValueError( + f"Previous dataset {self.previous_dataset_id} has no hosted_url." + ) + if not curr_dataset.hosted_url: + raise ValueError( + f"Current dataset {self.current_dataset_id} has no hosted_url." + ) + + feed_stable_id = curr_dataset.feed.stable_id + if not feed_stable_id: + raise ValueError(f"Feed {self.feed_id} has no stable_id.") + + # Detach from session before returning so objects can be used outside the session + db_session.expunge(prev_dataset) + db_session.expunge(curr_dataset) + + return prev_dataset, curr_dataset, feed_stable_id + + def _download_zip(self, url: str, local_path: str) -> None: + """Download a dataset zip from a URL to a local path.""" + self.logger.info("Downloading %s", url) + response = requests.get(url, stream=True, timeout=120) + response.raise_for_status() + with open(local_path, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + + def _upload_changelog( + self, + json_bytes: bytes, + feed_stable_id: str, + prev_stable_id: str, + curr_stable_id: str, + ) -> str: + """ + Upload the changelog JSON to GCS at two paths. + + Paths: + - /changelogs/__changelog.json + - //__changelog.json + + Returns the primary (changelogs/) URL. + """ + primary_blob_path = f"{feed_stable_id}/changelogs/{prev_stable_id}_{curr_stable_id}_changelog.json" + secondary_blob_path = f"{feed_stable_id}/{curr_stable_id}/{curr_stable_id}_{prev_stable_id}_changelog.json" + + bucket = storage.Client().bucket(self.bucket_name) + primary_url = None + + for blob_path in (primary_blob_path, secondary_blob_path): + blob = bucket.blob(blob_path) + blob.upload_from_string(json_bytes, content_type="application/json") + self.logger.info( + "Uploaded changelog to gs://%s/%s", self.bucket_name, blob_path + ) + if blob_path == primary_blob_path: + primary_url = ( + f"https://storage.googleapis.com/{self.bucket_name}/{blob_path}" + ) + + return primary_url + + @with_db_session + def _save_changelog_record( + self, + changelog_url: str, + diff_summary: dict, + db_session: Session = None, + ) -> None: + """ + Upsert a row into gtfs_dataset_changelog. + The UNIQUE constraint on (previous_dataset_id, current_dataset_id) ensures idempotency. + """ + stmt = ( + insert(GtfsDatasetChangelog) + .values( + feed_id=self.feed_id, + previous_dataset_id=self.previous_dataset_id, + current_dataset_id=self.current_dataset_id, + changelog_url=changelog_url, + diff_summary=diff_summary, + ) + .on_conflict_do_update( + constraint="gtfs_dataset_changelog_previous_current_key", + set_={ + "changelog_url": changelog_url, + "diff_summary": diff_summary, + "generated_at": GtfsDatasetChangelog.generated_at.default, + }, + ) + ) + db_session.execute(stmt) + db_session.commit() + self.logger.info( + "Saved changelog record for %s -> %s", + self.previous_dataset_id, + self.current_dataset_id, + ) diff --git a/functions-python/gtfs_change_tracker/tests/__init__.py b/functions-python/gtfs_change_tracker/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/functions-python/gtfs_change_tracker/tests/test_main.py b/functions-python/gtfs_change_tracker/tests/test_main.py new file mode 100644 index 000000000..aa09f1dd5 --- /dev/null +++ b/functions-python/gtfs_change_tracker/tests/test_main.py @@ -0,0 +1,237 @@ +# +# MobilityData 2025 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import unittest +from unittest.mock import MagicMock, patch + +import flask + +from main import GtfsChangeTracker, gtfs_change_tracker + + +def _make_dataset(id_, stable_id, hosted_url, feed_stable_id="mdb-1"): + dataset = MagicMock() + dataset.id = id_ + dataset.stable_id = stable_id + dataset.hosted_url = hosted_url + dataset.feed = MagicMock() + dataset.feed.stable_id = feed_stable_id + return dataset + + +class TestGtfsChangeTrackerHandler(unittest.TestCase): + """Tests for the HTTP handler function.""" + + def _request(self, payload): + with flask.Flask(__name__).test_request_context(json=payload): + return gtfs_change_tracker(flask.request) + + def test_missing_all_params(self): + result = self._request({}) + self.assertEqual(result["status"], "error") + self.assertIn("required", result["error"]) + + def test_missing_one_param(self): + result = self._request({"feed_id": "f1", "previous_dataset_id": "p1"}) + self.assertEqual(result["status"], "error") + + def test_missing_bucket_env(self): + os.environ.pop("DATASETS_BUCKET_NAME", None) + result = self._request( + {"feed_id": "f1", "previous_dataset_id": "p1", "current_dataset_id": "c1"} + ) + self.assertEqual(result["status"], "error") + self.assertIn("DATASETS_BUCKET_NAME", result["error"]) + + @patch("main.GtfsChangeTracker") + def test_success(self, mock_tracker_cls): + os.environ["DATASETS_BUCKET_NAME"] = "test-bucket" + instance = MagicMock() + instance.run.return_value = { + "message": "Changelog generated successfully.", + "changelog_url": "https://storage.googleapis.com/test-bucket/mdb-1/changelogs/p1_c1_changelog.json", + } + mock_tracker_cls.return_value = instance + + result = self._request( + {"feed_id": "f1", "previous_dataset_id": "p1", "current_dataset_id": "c1"} + ) + self.assertEqual(result["status"], "success") + self.assertIn("changelog_url", result) + mock_tracker_cls.assert_called_once_with( + feed_id="f1", + previous_dataset_id="p1", + current_dataset_id="c1", + bucket_name="test-bucket", + ) + + @patch("main.GtfsChangeTracker") + def test_exception_returns_error(self, mock_tracker_cls): + os.environ["DATASETS_BUCKET_NAME"] = "test-bucket" + instance = MagicMock() + instance.run.side_effect = Exception("something went wrong") + mock_tracker_cls.return_value = instance + + result = self._request( + {"feed_id": "f1", "previous_dataset_id": "p1", "current_dataset_id": "c1"} + ) + self.assertEqual(result["status"], "error") + self.assertIn("something went wrong", result["error"]) + + +class TestGtfsChangeTrackerRun(unittest.TestCase): + """Tests for GtfsChangeTracker.run() with mocked collaborators.""" + + def setUp(self): + os.environ["DATASETS_BUCKET_NAME"] = "test-bucket" + self.tracker = GtfsChangeTracker( + feed_id="feed-uuid", + previous_dataset_id="prev-uuid", + current_dataset_id="curr-uuid", + bucket_name="test-bucket", + ) + + @patch("main.GtfsChangeTracker._save_changelog_record") + @patch("main.GtfsChangeTracker._upload_changelog") + @patch("main.GtfsChangeTracker._download_zip") + @patch("main.GtfsChangeTracker._resolve_datasets") + def test_run_happy_path(self, mock_resolve, mock_download, mock_upload, mock_save): + prev_ds = _make_dataset( + "prev-uuid", "mdb-1-20240101", "https://example.com/prev.zip" + ) + curr_ds = _make_dataset( + "curr-uuid", "mdb-1-20240201", "https://example.com/curr.zip" + ) + mock_resolve.return_value = (prev_ds, curr_ds, "mdb-1") + + fake_summary = MagicMock() + fake_summary.model_dump.return_value = {"total_changes": 42} + fake_diff = MagicMock() + fake_diff.summary = fake_summary + fake_diff.model_dump_json.return_value = ( + '{"metadata": {}, "summary": {}, "file_diffs": []}' + ) + + changelog_url = ( + "https://storage.googleapis.com/test-bucket/mdb-1/changelogs/" + "mdb-1-20240101_mdb-1-20240201_changelog.json" + ) + mock_upload.return_value = changelog_url + + with patch("main.diff_feeds", return_value=fake_diff, create=True) as mock_diff: + result = self.tracker.run() + + mock_resolve.assert_called_once() + self.assertEqual(mock_download.call_count, 2) + mock_diff.assert_called_once() + mock_upload.assert_called_once_with( + fake_diff.model_dump_json.return_value.encode("utf-8"), + "mdb-1", + "mdb-1-20240101", + "mdb-1-20240201", + ) + mock_save.assert_called_once_with( + changelog_url=changelog_url, + diff_summary={"total_changes": 42}, + ) + self.assertEqual(result["changelog_url"], changelog_url) + + @patch("main.GtfsChangeTracker._resolve_datasets") + def test_run_raises_when_resolve_fails(self, mock_resolve): + mock_resolve.side_effect = ValueError("Previous dataset not found: prev-uuid") + with self.assertRaises(ValueError): + self.tracker.run() + + +class TestDownloadZip(unittest.TestCase): + def setUp(self): + self.tracker = GtfsChangeTracker( + feed_id="f", + previous_dataset_id="p", + current_dataset_id="c", + bucket_name="bucket", + ) + + @patch("main.requests.get") + def test_download_writes_chunks(self, mock_get): + mock_response = MagicMock() + mock_response.iter_content.return_value = [b"chunk1", b"chunk2"] + mock_response.raise_for_status.return_value = None + mock_get.return_value = mock_response + + import tempfile + import os + + with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp: + tmp_path = tmp.name + try: + self.tracker._download_zip("https://example.com/feed.zip", tmp_path) + with open(tmp_path, "rb") as f: + content = f.read() + self.assertEqual(content, b"chunk1chunk2") + finally: + os.unlink(tmp_path) + + @patch("main.requests.get") + def test_download_raises_on_http_error(self, mock_get): + mock_response = MagicMock() + mock_response.raise_for_status.side_effect = Exception("404 Not Found") + mock_get.return_value = mock_response + + with self.assertRaises(Exception): + self.tracker._download_zip("https://example.com/missing.zip", "/tmp/x.zip") + + +class TestUploadChangelog(unittest.TestCase): + def setUp(self): + self.tracker = GtfsChangeTracker( + feed_id="f", + previous_dataset_id="p", + current_dataset_id="c", + bucket_name="my-bucket", + ) + + @patch("main.storage.Client") + def test_uploads_two_blobs_and_returns_primary_url(self, mock_storage_cls): + mock_bucket = MagicMock() + mock_blob = MagicMock() + mock_storage_cls.return_value.bucket.return_value = mock_bucket + mock_bucket.blob.return_value = mock_blob + + url = self.tracker._upload_changelog( + b'{"data": 1}', "mdb-1", "mdb-1-20240101", "mdb-1-20240201" + ) + + self.assertEqual(mock_bucket.blob.call_count, 2) + upload_calls = mock_blob.upload_from_string.call_args_list + self.assertEqual(len(upload_calls), 2) + for c in upload_calls: + self.assertEqual( + c.kwargs.get("content_type") or c.args[1], "application/json" + ) + + expected_primary = ( + "https://storage.googleapis.com/my-bucket/mdb-1/changelogs/" + "mdb-1-20240101_mdb-1-20240201_changelog.json" + ) + self.assertEqual(url, expected_primary) + + blob_paths = [c.args[0] for c in mock_bucket.blob.call_args_list] + self.assertIn( + "mdb-1/changelogs/mdb-1-20240101_mdb-1-20240201_changelog.json", blob_paths + ) + self.assertIn( + "mdb-1/mdb-1-20240201/mdb-1-20240201_mdb-1-20240101_changelog.json", + blob_paths, + ) diff --git a/functions-python/helpers/utils.py b/functions-python/helpers/utils.py index bb66b4049..04badcacf 100644 --- a/functions-python/helpers/utils.py +++ b/functions-python/helpers/utils.py @@ -535,6 +535,40 @@ def create_http_pmtiles_builder_task( ) +def create_http_gtfs_change_tracker_task( + feed_id: str, + previous_dataset_id: str, + current_dataset_id: str, +) -> None: + """ + Create a Cloud Task to run the gtfs-change-tracker function for a pair of datasets. + """ + from google.cloud import tasks_v2 + import json + + client = tasks_v2.CloudTasksClient() + body = json.dumps( + { + "feed_id": feed_id, + "previous_dataset_id": previous_dataset_id, + "current_dataset_id": current_dataset_id, + } + ).encode() + queue_name = os.getenv("GTFS_CHANGE_TRACKER_QUEUE") + project_id = os.getenv("PROJECT_ID") + gcp_region = os.getenv("GCP_REGION") + gcp_env = os.getenv("ENVIRONMENT") + + create_http_task( + client, + body, + f"https://{gcp_region}-{project_id}.cloudfunctions.net/gtfs-change-tracker-{gcp_env}", + project_id, + gcp_region, + queue_name, + ) + + def get_execution_id(json_payload: dict, stable_id: Optional[str]) -> str: """ Extracts the execution_id from the JSON payload. From fdafd7b50d0aa8417741f1a697075810a50bc10a Mon Sep 17 00:00:00 2001 From: jcpitre Date: Mon, 8 Jun 2026 15:12:44 -0400 Subject: [PATCH 02/21] Read pre-extracted GTFS files from GCS instead of downloading the dataset zip --- .../gtfs_change_tracker/requirements.txt | 1 - .../gtfs_change_tracker/src/main.py | 68 +++++++++++------ .../gtfs_change_tracker/tests/test_main.py | 76 +++++++++++-------- 3 files changed, 89 insertions(+), 56 deletions(-) diff --git a/functions-python/gtfs_change_tracker/requirements.txt b/functions-python/gtfs_change_tracker/requirements.txt index d97b27fd5..4a385b07f 100644 --- a/functions-python/gtfs_change_tracker/requirements.txt +++ b/functions-python/gtfs_change_tracker/requirements.txt @@ -3,7 +3,6 @@ functions-framework==3.* google-cloud-logging psycopg2-binary==2.9.6 urllib3~=2.6.3 -requests~=2.33.1 attrs~=23.1.0 certifi~=2025.8.3 diff --git a/functions-python/gtfs_change_tracker/src/main.py b/functions-python/gtfs_change_tracker/src/main.py index 4cdba092a..77dd64d35 100644 --- a/functions-python/gtfs_change_tracker/src/main.py +++ b/functions-python/gtfs_change_tracker/src/main.py @@ -13,16 +13,16 @@ # limitations under the License. # # This module provides the GtfsChangeTracker Cloud Function. It orchestrates change tracking -# between two consecutive GTFS datasets: downloads both from GCS, computes a structured diff -# using gtfs-diff-engine, uploads the changelog JSON to GCS, and persists the record in the -# gtfs_dataset_changelog database table. +# between two consecutive GTFS datasets: reads the pre-extracted GTFS files from GCS (uploaded +# by batch_process_dataset at //extracted/), computes a +# structured diff using gtfs-diff-engine, uploads the changelog JSON to GCS, and persists the +# record in the gtfs_dataset_changelog database table. import logging import os import tempfile import flask import functions_framework -import requests from google.cloud import storage from gtfs_diff.engine import diff_feeds from sqlalchemy.dialects.postgresql import insert @@ -93,7 +93,8 @@ class GtfsChangeTracker: Steps: 1. Resolve both datasets and the feed stable_id from the database. - 2. Download both dataset zip archives to a temporary directory. + 2. Download the pre-extracted GTFS files from GCS for both datasets + (//extracted/). 3. Compute a structured diff using gtfs-diff-engine. 4. Upload the changelog JSON to GCS at two paths (shared changelogs index and per-dataset location). @@ -125,13 +126,19 @@ def run(self) -> dict: ) with tempfile.TemporaryDirectory(prefix="gtfs_change_tracker_") as tmpdir: - prev_zip = os.path.join(tmpdir, "previous.zip") - curr_zip = os.path.join(tmpdir, "current.zip") + prev_dir = os.path.join(tmpdir, "previous") + curr_dir = os.path.join(tmpdir, "current") + os.makedirs(prev_dir) + os.makedirs(curr_dir) - self._download_zip(prev_dataset.hosted_url, prev_zip) - self._download_zip(curr_dataset.hosted_url, curr_zip) + self._download_extracted_files( + feed_stable_id, prev_dataset.stable_id, prev_dir + ) + self._download_extracted_files( + feed_stable_id, curr_dataset.stable_id, curr_dir + ) - diff_result = diff_feeds(prev_zip, curr_zip) + diff_result = diff_feeds(prev_dir, curr_dir) changelog_json = diff_result.model_dump_json(indent=2).encode("utf-8") changelog_url = self._upload_changelog( @@ -173,13 +180,13 @@ def _resolve_datasets(self, db_session: Session = None) -> tuple: if curr_dataset is None: raise ValueError(f"Current dataset not found: {self.current_dataset_id}") - if not prev_dataset.hosted_url: + if not prev_dataset.stable_id: raise ValueError( - f"Previous dataset {self.previous_dataset_id} has no hosted_url." + f"Previous dataset {self.previous_dataset_id} has no stable_id." ) - if not curr_dataset.hosted_url: + if not curr_dataset.stable_id: raise ValueError( - f"Current dataset {self.current_dataset_id} has no hosted_url." + f"Current dataset {self.current_dataset_id} has no stable_id." ) feed_stable_id = curr_dataset.feed.stable_id @@ -192,14 +199,31 @@ def _resolve_datasets(self, db_session: Session = None) -> tuple: return prev_dataset, curr_dataset, feed_stable_id - def _download_zip(self, url: str, local_path: str) -> None: - """Download a dataset zip from a URL to a local path.""" - self.logger.info("Downloading %s", url) - response = requests.get(url, stream=True, timeout=120) - response.raise_for_status() - with open(local_path, "wb") as f: - for chunk in response.iter_content(chunk_size=8192): - f.write(chunk) + def _download_extracted_files( + self, feed_stable_id: str, dataset_stable_id: str, dest_dir: str + ) -> None: + """ + Download all pre-extracted GTFS files from GCS to a local directory. + + batch_process_dataset uploads each file from the dataset ZIP individually to: + //extracted/ + + This avoids re-downloading and re-unzipping the full archive. + """ + prefix = f"{feed_stable_id}/{dataset_stable_id}/extracted/" + bucket = storage.Client().bucket(self.bucket_name) + blobs = list(bucket.list_blobs(prefix=prefix)) + if not blobs: + raise ValueError( + f"No extracted files found in GCS at gs://{self.bucket_name}/{prefix}" + ) + for blob in blobs: + filename = os.path.basename(blob.name) + if not filename: + continue + local_path = os.path.join(dest_dir, filename) + blob.download_to_filename(local_path) + self.logger.debug("Downloaded gs://%s/%s", self.bucket_name, blob.name) def _upload_changelog( self, diff --git a/functions-python/gtfs_change_tracker/tests/test_main.py b/functions-python/gtfs_change_tracker/tests/test_main.py index aa09f1dd5..2ce4fa677 100644 --- a/functions-python/gtfs_change_tracker/tests/test_main.py +++ b/functions-python/gtfs_change_tracker/tests/test_main.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import os +import tempfile import unittest from unittest.mock import MagicMock, patch @@ -104,7 +105,7 @@ def setUp(self): @patch("main.GtfsChangeTracker._save_changelog_record") @patch("main.GtfsChangeTracker._upload_changelog") - @patch("main.GtfsChangeTracker._download_zip") + @patch("main.GtfsChangeTracker._download_extracted_files") @patch("main.GtfsChangeTracker._resolve_datasets") def test_run_happy_path(self, mock_resolve, mock_download, mock_upload, mock_save): prev_ds = _make_dataset( @@ -134,6 +135,12 @@ def test_run_happy_path(self, mock_resolve, mock_download, mock_upload, mock_sav mock_resolve.assert_called_once() self.assertEqual(mock_download.call_count, 2) + # Verify correct dataset stable_ids are passed to the downloader + download_calls = mock_download.call_args_list + self.assertEqual(download_calls[0].args[0], "mdb-1") + self.assertEqual(download_calls[0].args[1], "mdb-1-20240101") + self.assertEqual(download_calls[1].args[0], "mdb-1") + self.assertEqual(download_calls[1].args[1], "mdb-1-20240201") mock_diff.assert_called_once() mock_upload.assert_called_once_with( fake_diff.model_dump_json.return_value.encode("utf-8"), @@ -154,43 +161,46 @@ def test_run_raises_when_resolve_fails(self, mock_resolve): self.tracker.run() -class TestDownloadZip(unittest.TestCase): +class TestDownloadExtractedFiles(unittest.TestCase): def setUp(self): self.tracker = GtfsChangeTracker( feed_id="f", previous_dataset_id="p", current_dataset_id="c", - bucket_name="bucket", - ) - - @patch("main.requests.get") - def test_download_writes_chunks(self, mock_get): - mock_response = MagicMock() - mock_response.iter_content.return_value = [b"chunk1", b"chunk2"] - mock_response.raise_for_status.return_value = None - mock_get.return_value = mock_response - - import tempfile - import os - - with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp: - tmp_path = tmp.name - try: - self.tracker._download_zip("https://example.com/feed.zip", tmp_path) - with open(tmp_path, "rb") as f: - content = f.read() - self.assertEqual(content, b"chunk1chunk2") - finally: - os.unlink(tmp_path) - - @patch("main.requests.get") - def test_download_raises_on_http_error(self, mock_get): - mock_response = MagicMock() - mock_response.raise_for_status.side_effect = Exception("404 Not Found") - mock_get.return_value = mock_response - - with self.assertRaises(Exception): - self.tracker._download_zip("https://example.com/missing.zip", "/tmp/x.zip") + bucket_name="my-bucket", + ) + + @patch("main.storage.Client") + def test_downloads_all_blobs_to_dest_dir(self, mock_storage_cls): + mock_bucket = MagicMock() + mock_storage_cls.return_value.bucket.return_value = mock_bucket + + blob1 = MagicMock() + blob1.name = "mdb-1/mdb-1-20240101/extracted/stops.txt" + blob2 = MagicMock() + blob2.name = "mdb-1/mdb-1-20240101/extracted/routes.txt" + mock_bucket.list_blobs.return_value = [blob1, blob2] + + with tempfile.TemporaryDirectory() as dest: + self.tracker._download_extracted_files("mdb-1", "mdb-1-20240101", dest) + + mock_bucket.list_blobs.assert_called_once_with( + prefix="mdb-1/mdb-1-20240101/extracted/" + ) + self.assertEqual(blob1.download_to_filename.call_count, 1) + self.assertEqual(blob2.download_to_filename.call_count, 1) + # Verify destination paths use just the basename + self.assertIn("stops.txt", blob1.download_to_filename.call_args.args[0]) + self.assertIn("routes.txt", blob2.download_to_filename.call_args.args[0]) + + @patch("main.storage.Client") + def test_raises_when_no_files_in_gcs(self, mock_storage_cls): + mock_bucket = MagicMock() + mock_storage_cls.return_value.bucket.return_value = mock_bucket + mock_bucket.list_blobs.return_value = [] + + with self.assertRaises(ValueError, msg="No extracted files found"): + self.tracker._download_extracted_files("mdb-1", "mdb-1-20240101", "/tmp") class TestUploadChangelog(unittest.TestCase): From 2d5a130bbd07e71e8e1ca8de52b49846de567aa9 Mon Sep 17 00:00:00 2001 From: jcpitre Date: Mon, 8 Jun 2026 15:24:03 -0400 Subject: [PATCH 03/21] Added terraform config --- infra/functions-python/main.tf | 82 +++++++++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 2 deletions(-) diff --git a/infra/functions-python/main.tf b/infra/functions-python/main.tf index 07a4a757f..238e2eb5d 100644 --- a/infra/functions-python/main.tf +++ b/infra/functions-python/main.tf @@ -65,6 +65,9 @@ locals { function_pmtiles_builder_config = jsondecode(file("${path.module}/../../functions-python/pmtiles_builder/function_config.json")) function_pmtiles_builder_zip = "${path.module}/../../functions-python/pmtiles_builder/.dist/pmtiles_builder.zip" + + function_gtfs_change_tracker_config = jsondecode(file("${path.module}/../../functions-python/gtfs_change_tracker/function_config.json")) + function_gtfs_change_tracker_zip = "${path.module}/../../functions-python/gtfs_change_tracker/.dist/gtfs_change_tracker.zip" } locals { @@ -76,7 +79,8 @@ locals { local.function_update_feed_status_config.secret_environment_variables, local.function_export_csv_config.secret_environment_variables, local.function_tasks_executor_config.secret_environment_variables, - local.function_pmtiles_builder_config.secret_environment_variables + local.function_pmtiles_builder_config.secret_environment_variables, + local.function_gtfs_change_tracker_config.secret_environment_variables ) # Remove duplicates by key, keeping the first occurrence @@ -222,6 +226,13 @@ resource "google_storage_bucket_object" "pmtiles_builder_zip" { source = local.function_pmtiles_builder_zip } +# 16. GTFS Change Tracker +resource "google_storage_bucket_object" "gtfs_change_tracker_zip" { + bucket = google_storage_bucket.functions_bucket.name + name = "gtfs-change-tracker-${substr(filebase64sha256(local.function_gtfs_change_tracker_zip), 0, 10)}.zip" + source = local.function_gtfs_change_tracker_zip +} + # Web app revalidation secret resource "google_secret_manager_secret" "web_app_revalidate_secret" { project = var.project_id @@ -1034,6 +1045,24 @@ resource "google_cloudfunctions2_function_iam_member" "pmtiles_builder_invoker" member = "serviceAccount:${google_service_account.functions_service_account.email}" } +# Grant execution permission to batchfunctions service account to the gtfs_change_tracker function +resource "google_cloudfunctions2_function_iam_member" "gtfs_change_tracker_invoker_batch_sa" { + project = var.project_id + location = var.gcp_region + cloud_function = google_cloudfunctions2_function.gtfs_change_tracker.name + role = "roles/cloudfunctions.invoker" + member = "serviceAccount:${local.batchfunctions_sa_email}" +} + +# Grant execution permission to the functions service account to the gtfs_change_tracker function +resource "google_cloudfunctions2_function_iam_member" "gtfs_change_tracker_invoker" { + project = var.project_id + location = var.gcp_region + cloud_function = google_cloudfunctions2_function.gtfs_change_tracker.name + role = "roles/cloudfunctions.invoker" + member = "serviceAccount:${google_service_account.functions_service_account.email}" +} + # 13.3 functions/reverse_geolocation - batch cloud function resource "google_cloudfunctions2_function" "reverse_geolocation_batch" { name = "${local.function_reverse_geolocation_config.name}-batch" @@ -1166,7 +1195,7 @@ resource "google_cloudfunctions2_function" "tasks_executor" { } } -# Grant execution permission to bathcfunctions service account to the tasks_executor function +# Grant execution permission to batchfunctions service account to the tasks_executor function resource "google_cloudfunctions2_function_iam_member" "tasks_executor_invoker" { project = var.project_id location = var.gcp_region @@ -1227,6 +1256,55 @@ resource "google_cloudfunctions2_function" "pmtiles_builder" { } } + +# 16. functions/gtfs_change_tracker cloud function +resource "google_cloudfunctions2_function" "gtfs_change_tracker" { + name = "${local.function_gtfs_change_tracker_config.name}-${var.environment}" + project = var.project_id + description = local.function_gtfs_change_tracker_config.description + location = var.gcp_region + depends_on = [google_secret_manager_secret_iam_member.secret_iam_member] + + build_config { + runtime = var.python_runtime + entry_point = local.function_gtfs_change_tracker_config.entry_point + source { + storage_source { + bucket = google_storage_bucket.functions_bucket.name + object = google_storage_bucket_object.gtfs_change_tracker_zip.name + } + } + } + service_config { + environment_variables = { + ENVIRONMENT = var.environment + PROJECT_ID = var.project_id + GCP_REGION = var.gcp_region + DATASETS_BUCKET_NAME = "${var.datasets_bucket_name}-${var.environment}" + } + available_memory = local.function_gtfs_change_tracker_config.memory + timeout_seconds = local.function_gtfs_change_tracker_config.timeout + available_cpu = local.function_gtfs_change_tracker_config.available_cpu + max_instance_request_concurrency = local.function_gtfs_change_tracker_config.max_instance_request_concurrency + max_instance_count = local.function_gtfs_change_tracker_config.max_instance_count + min_instance_count = local.function_gtfs_change_tracker_config.min_instance_count + service_account_email = google_service_account.functions_service_account.email + ingress_settings = local.function_gtfs_change_tracker_config.ingress_settings + vpc_connector = data.google_vpc_access_connector.vpc_connector.id + vpc_connector_egress_settings = "PRIVATE_RANGES_ONLY" + + dynamic "secret_environment_variables" { + for_each = local.function_gtfs_change_tracker_config.secret_environment_variables + content { + key = secret_environment_variables.value["key"] + project_id = var.project_id + secret = lookup(secret_environment_variables.value, "secret", "${upper(var.environment)}_${secret_environment_variables.value["key"]}") + version = "latest" + } + } + } +} + # Create the Pub/Sub topic used for publishing messages about rebuilding missing bounding boxes resource "google_pubsub_topic" "rebuild_missing_bounding_boxes" { name = "rebuild-bounding-boxes-topic" From 03969b06b45e079d080577194ae0fd6dcaede5e5 Mon Sep 17 00:00:00 2001 From: jcpitre Date: Tue, 9 Jun 2026 15:46:55 -0400 Subject: [PATCH 04/21] Mounted buckets on file system in terraform --- .../gtfs_change_tracker/src/main.py | 152 ++++++++---------- .../gtfs_change_tracker/tests/test_main.py | 100 +++++------- infra/batch/main.tf | 31 ++++ infra/functions-python/main.tf | 14 ++ 4 files changed, 150 insertions(+), 147 deletions(-) diff --git a/functions-python/gtfs_change_tracker/src/main.py b/functions-python/gtfs_change_tracker/src/main.py index 77dd64d35..1e6fac6a0 100644 --- a/functions-python/gtfs_change_tracker/src/main.py +++ b/functions-python/gtfs_change_tracker/src/main.py @@ -19,7 +19,6 @@ # record in the gtfs_dataset_changelog database table. import logging import os -import tempfile import flask import functions_framework @@ -66,12 +65,14 @@ def gtfs_change_tracker(request: flask.Request) -> dict: "error": "DATASETS_BUCKET_NAME environment variable is not set.", } + bucket_mount = os.getenv("DATASETS_BUCKET_MOUNT", "/mobilitydata-datasets") try: tracker = GtfsChangeTracker( feed_id=feed_id, previous_dataset_id=previous_dataset_id, current_dataset_id=current_dataset_id, bucket_name=bucket_name, + bucket_mount=bucket_mount, ) result = tracker.run() return {"status": "success", **result} @@ -93,8 +94,8 @@ class GtfsChangeTracker: Steps: 1. Resolve both datasets and the feed stable_id from the database. - 2. Download the pre-extracted GTFS files from GCS for both datasets - (//extracted/). + 2. Locate the pre-extracted GTFS files on the mounted GCS bucket filesystem + (///extracted/). 3. Compute a structured diff using gtfs-diff-engine. 4. Upload the changelog JSON to GCS at two paths (shared changelogs index and per-dataset location). @@ -107,11 +108,13 @@ def __init__( previous_dataset_id: str, current_dataset_id: str, bucket_name: str, + bucket_mount: str, ): self.feed_id = feed_id self.previous_dataset_id = previous_dataset_id self.current_dataset_id = current_dataset_id self.bucket_name = bucket_name + self.bucket_mount = bucket_mount self.logger = get_logger(GtfsChangeTracker.__name__, current_dataset_id) def run(self) -> dict: @@ -125,20 +128,10 @@ def run(self) -> dict: curr_dataset.stable_id, ) - with tempfile.TemporaryDirectory(prefix="gtfs_change_tracker_") as tmpdir: - prev_dir = os.path.join(tmpdir, "previous") - curr_dir = os.path.join(tmpdir, "current") - os.makedirs(prev_dir) - os.makedirs(curr_dir) + prev_dir = self._extracted_dir(feed_stable_id, prev_dataset.stable_id) + curr_dir = self._extracted_dir(feed_stable_id, curr_dataset.stable_id) - self._download_extracted_files( - feed_stable_id, prev_dataset.stable_id, prev_dir - ) - self._download_extracted_files( - feed_stable_id, curr_dataset.stable_id, curr_dir - ) - - diff_result = diff_feeds(prev_dir, curr_dir) + diff_result = diff_feeds(prev_dir, curr_dir) changelog_json = diff_result.model_dump_json(indent=2).encode("utf-8") changelog_url = self._upload_changelog( @@ -199,66 +192,44 @@ def _resolve_datasets(self, db_session: Session = None) -> tuple: return prev_dataset, curr_dataset, feed_stable_id - def _download_extracted_files( - self, feed_stable_id: str, dataset_stable_id: str, dest_dir: str - ) -> None: + def _extracted_dir(self, feed_stable_id: str, dataset_stable_id: str) -> str: """ - Download all pre-extracted GTFS files from GCS to a local directory. + Return the path to the pre-extracted GTFS files on the mounted bucket filesystem. - batch_process_dataset uploads each file from the dataset ZIP individually to: + batch_process_dataset uploads each file to: //extracted/ - - This avoids re-downloading and re-unzipping the full archive. + which appears on the mount at: + ///extracted/ """ - prefix = f"{feed_stable_id}/{dataset_stable_id}/extracted/" - bucket = storage.Client().bucket(self.bucket_name) - blobs = list(bucket.list_blobs(prefix=prefix)) - if not blobs: - raise ValueError( - f"No extracted files found in GCS at gs://{self.bucket_name}/{prefix}" - ) - for blob in blobs: - filename = os.path.basename(blob.name) - if not filename: - continue - local_path = os.path.join(dest_dir, filename) - blob.download_to_filename(local_path) - self.logger.debug("Downloaded gs://%s/%s", self.bucket_name, blob.name) + path = os.path.join( + self.bucket_mount, feed_stable_id, dataset_stable_id, "extracted" + ) + if not os.path.isdir(path): + raise ValueError(f"Extracted files not found on mounted bucket at {path}") + self.logger.debug("Using extracted dir from mounted bucket: %s", path) + return path def _upload_changelog( self, json_bytes: bytes, feed_stable_id: str, - prev_stable_id: str, - curr_stable_id: str, + prev_dataset_id: str, + curr_dataset_id: str, ) -> str: """ - Upload the changelog JSON to GCS at two paths. - - Paths: - - /changelogs/__changelog.json - - //__changelog.json + Upload the changelog JSON to GCS at: + //__changelog.json - Returns the primary (changelogs/) URL. + Returns the GCS public URL. """ - primary_blob_path = f"{feed_stable_id}/changelogs/{prev_stable_id}_{curr_stable_id}_changelog.json" - secondary_blob_path = f"{feed_stable_id}/{curr_stable_id}/{curr_stable_id}_{prev_stable_id}_changelog.json" - + blob_path = f"{feed_stable_id}/{curr_dataset_id}/{curr_dataset_id}_{prev_dataset_id}_changelog.json" bucket = storage.Client().bucket(self.bucket_name) - primary_url = None - - for blob_path in (primary_blob_path, secondary_blob_path): - blob = bucket.blob(blob_path) - blob.upload_from_string(json_bytes, content_type="application/json") - self.logger.info( - "Uploaded changelog to gs://%s/%s", self.bucket_name, blob_path - ) - if blob_path == primary_blob_path: - primary_url = ( - f"https://storage.googleapis.com/{self.bucket_name}/{blob_path}" - ) - - return primary_url + blob = bucket.blob(blob_path) + blob.upload_from_string(json_bytes, content_type="application/json") + self.logger.info( + "Uploaded changelog to gs://%s/%s", self.bucket_name, blob_path + ) + return f"https://storage.googleapis.com/{self.bucket_name}/{blob_path}" @with_db_session def _save_changelog_record( @@ -271,28 +242,39 @@ def _save_changelog_record( Upsert a row into gtfs_dataset_changelog. The UNIQUE constraint on (previous_dataset_id, current_dataset_id) ensures idempotency. """ - stmt = ( - insert(GtfsDatasetChangelog) - .values( - feed_id=self.feed_id, - previous_dataset_id=self.previous_dataset_id, - current_dataset_id=self.current_dataset_id, - changelog_url=changelog_url, - diff_summary=diff_summary, + # TODO: remove this flag and always write to DB once testing is complete + if os.getenv("CHANGELOG_DB_WRITE_ENABLED", "false").lower() == "true": + stmt = ( + insert(GtfsDatasetChangelog) + .values( + feed_id=self.feed_id, + previous_dataset_id=self.previous_dataset_id, + current_dataset_id=self.current_dataset_id, + changelog_url=changelog_url, + diff_summary=diff_summary, + ) + .on_conflict_do_update( + constraint="gtfs_dataset_changelog_previous_current_key", + set_={ + "changelog_url": changelog_url, + "diff_summary": diff_summary, + "generated_at": GtfsDatasetChangelog.generated_at.default, + }, + ) ) - .on_conflict_do_update( - constraint="gtfs_dataset_changelog_previous_current_key", - set_={ - "changelog_url": changelog_url, - "diff_summary": diff_summary, - "generated_at": GtfsDatasetChangelog.generated_at.default, - }, + db_session.execute(stmt) + db_session.commit() + self.logger.info( + "Saved changelog record for %s -> %s", + self.previous_dataset_id, + self.current_dataset_id, + ) + else: + self.logger.info( + "[TEMP] Would upsert gtfs_dataset_changelog: feed_id=%s previous=%s current=%s url=%s summary=%s", + self.feed_id, + self.previous_dataset_id, + self.current_dataset_id, + changelog_url, + diff_summary, ) - ) - db_session.execute(stmt) - db_session.commit() - self.logger.info( - "Saved changelog record for %s -> %s", - self.previous_dataset_id, - self.current_dataset_id, - ) diff --git a/functions-python/gtfs_change_tracker/tests/test_main.py b/functions-python/gtfs_change_tracker/tests/test_main.py index 2ce4fa677..f16e4b226 100644 --- a/functions-python/gtfs_change_tracker/tests/test_main.py +++ b/functions-python/gtfs_change_tracker/tests/test_main.py @@ -61,7 +61,7 @@ def test_success(self, mock_tracker_cls): instance = MagicMock() instance.run.return_value = { "message": "Changelog generated successfully.", - "changelog_url": "https://storage.googleapis.com/test-bucket/mdb-1/changelogs/p1_c1_changelog.json", + "changelog_url": "https://storage.googleapis.com/test-bucket/mdb-1/c1/c1_p1_changelog.json", } mock_tracker_cls.return_value = instance @@ -75,6 +75,7 @@ def test_success(self, mock_tracker_cls): previous_dataset_id="p1", current_dataset_id="c1", bucket_name="test-bucket", + bucket_mount="/mobilitydata-datasets", ) @patch("main.GtfsChangeTracker") @@ -101,13 +102,16 @@ def setUp(self): previous_dataset_id="prev-uuid", current_dataset_id="curr-uuid", bucket_name="test-bucket", + bucket_mount="/mobilitydata-datasets", ) @patch("main.GtfsChangeTracker._save_changelog_record") @patch("main.GtfsChangeTracker._upload_changelog") - @patch("main.GtfsChangeTracker._download_extracted_files") + @patch("main.GtfsChangeTracker._extracted_dir") @patch("main.GtfsChangeTracker._resolve_datasets") - def test_run_happy_path(self, mock_resolve, mock_download, mock_upload, mock_save): + def test_run_happy_path( + self, mock_resolve, mock_extracted_dir, mock_upload, mock_save + ): prev_ds = _make_dataset( "prev-uuid", "mdb-1-20240101", "https://example.com/prev.zip" ) @@ -115,6 +119,10 @@ def test_run_happy_path(self, mock_resolve, mock_download, mock_upload, mock_sav "curr-uuid", "mdb-1-20240201", "https://example.com/curr.zip" ) mock_resolve.return_value = (prev_ds, curr_ds, "mdb-1") + mock_extracted_dir.side_effect = [ + "/mobilitydata-datasets/mdb-1/mdb-1-20240101/extracted", + "/mobilitydata-datasets/mdb-1/mdb-1-20240201/extracted", + ] fake_summary = MagicMock() fake_summary.model_dump.return_value = {"total_changes": 42} @@ -125,8 +133,8 @@ def test_run_happy_path(self, mock_resolve, mock_download, mock_upload, mock_sav ) changelog_url = ( - "https://storage.googleapis.com/test-bucket/mdb-1/changelogs/" - "mdb-1-20240101_mdb-1-20240201_changelog.json" + "https://storage.googleapis.com/test-bucket/mdb-1/" + "mdb-1-20240201/mdb-1-20240201_mdb-1-20240101_changelog.json" ) mock_upload.return_value = changelog_url @@ -134,13 +142,10 @@ def test_run_happy_path(self, mock_resolve, mock_download, mock_upload, mock_sav result = self.tracker.run() mock_resolve.assert_called_once() - self.assertEqual(mock_download.call_count, 2) - # Verify correct dataset stable_ids are passed to the downloader - download_calls = mock_download.call_args_list - self.assertEqual(download_calls[0].args[0], "mdb-1") - self.assertEqual(download_calls[0].args[1], "mdb-1-20240101") - self.assertEqual(download_calls[1].args[0], "mdb-1") - self.assertEqual(download_calls[1].args[1], "mdb-1-20240201") + self.assertEqual(mock_extracted_dir.call_count, 2) + extracted_calls = mock_extracted_dir.call_args_list + self.assertEqual(extracted_calls[0].args, ("mdb-1", "mdb-1-20240101")) + self.assertEqual(extracted_calls[1].args, ("mdb-1", "mdb-1-20240201")) mock_diff.assert_called_once() mock_upload.assert_called_once_with( fake_diff.model_dump_json.return_value.encode("utf-8"), @@ -161,46 +166,27 @@ def test_run_raises_when_resolve_fails(self, mock_resolve): self.tracker.run() -class TestDownloadExtractedFiles(unittest.TestCase): +class TestExtractedDir(unittest.TestCase): def setUp(self): self.tracker = GtfsChangeTracker( feed_id="f", previous_dataset_id="p", current_dataset_id="c", bucket_name="my-bucket", + bucket_mount="/mobilitydata-datasets", ) - @patch("main.storage.Client") - def test_downloads_all_blobs_to_dest_dir(self, mock_storage_cls): - mock_bucket = MagicMock() - mock_storage_cls.return_value.bucket.return_value = mock_bucket + def test_returns_correct_path_when_dir_exists(self): + with tempfile.TemporaryDirectory() as mount: + extracted = os.path.join(mount, "mdb-1", "mdb-1-20240101", "extracted") + os.makedirs(extracted) + self.tracker.bucket_mount = mount + result = self.tracker._extracted_dir("mdb-1", "mdb-1-20240101") + self.assertEqual(result, extracted) - blob1 = MagicMock() - blob1.name = "mdb-1/mdb-1-20240101/extracted/stops.txt" - blob2 = MagicMock() - blob2.name = "mdb-1/mdb-1-20240101/extracted/routes.txt" - mock_bucket.list_blobs.return_value = [blob1, blob2] - - with tempfile.TemporaryDirectory() as dest: - self.tracker._download_extracted_files("mdb-1", "mdb-1-20240101", dest) - - mock_bucket.list_blobs.assert_called_once_with( - prefix="mdb-1/mdb-1-20240101/extracted/" - ) - self.assertEqual(blob1.download_to_filename.call_count, 1) - self.assertEqual(blob2.download_to_filename.call_count, 1) - # Verify destination paths use just the basename - self.assertIn("stops.txt", blob1.download_to_filename.call_args.args[0]) - self.assertIn("routes.txt", blob2.download_to_filename.call_args.args[0]) - - @patch("main.storage.Client") - def test_raises_when_no_files_in_gcs(self, mock_storage_cls): - mock_bucket = MagicMock() - mock_storage_cls.return_value.bucket.return_value = mock_bucket - mock_bucket.list_blobs.return_value = [] - - with self.assertRaises(ValueError, msg="No extracted files found"): - self.tracker._download_extracted_files("mdb-1", "mdb-1-20240101", "/tmp") + def test_raises_when_dir_not_found(self): + with self.assertRaises(ValueError, msg="Extracted files not found"): + self.tracker._extracted_dir("mdb-1", "mdb-1-20240101") class TestUploadChangelog(unittest.TestCase): @@ -210,10 +196,11 @@ def setUp(self): previous_dataset_id="p", current_dataset_id="c", bucket_name="my-bucket", + bucket_mount="/mobilitydata-datasets", ) @patch("main.storage.Client") - def test_uploads_two_blobs_and_returns_primary_url(self, mock_storage_cls): + def test_uploads_one_blob_and_returns_url(self, mock_storage_cls): mock_bucket = MagicMock() mock_blob = MagicMock() mock_storage_cls.return_value.bucket.return_value = mock_bucket @@ -223,25 +210,14 @@ def test_uploads_two_blobs_and_returns_primary_url(self, mock_storage_cls): b'{"data": 1}', "mdb-1", "mdb-1-20240101", "mdb-1-20240201" ) - self.assertEqual(mock_bucket.blob.call_count, 2) - upload_calls = mock_blob.upload_from_string.call_args_list - self.assertEqual(len(upload_calls), 2) - for c in upload_calls: - self.assertEqual( - c.kwargs.get("content_type") or c.args[1], "application/json" - ) - - expected_primary = ( - "https://storage.googleapis.com/my-bucket/mdb-1/changelogs/" - "mdb-1-20240101_mdb-1-20240201_changelog.json" + mock_bucket.blob.assert_called_once_with( + "mdb-1/mdb-1-20240201/mdb-1-20240201_mdb-1-20240101_changelog.json" ) - self.assertEqual(url, expected_primary) - - blob_paths = [c.args[0] for c in mock_bucket.blob.call_args_list] - self.assertIn( - "mdb-1/changelogs/mdb-1-20240101_mdb-1-20240201_changelog.json", blob_paths + mock_blob.upload_from_string.assert_called_once_with( + b'{"data": 1}', content_type="application/json" ) - self.assertIn( + self.assertEqual( + url, + "https://storage.googleapis.com/my-bucket/" "mdb-1/mdb-1-20240201/mdb-1-20240201_mdb-1-20240101_changelog.json", - blob_paths, ) diff --git a/infra/batch/main.tf b/infra/batch/main.tf index 3c6d8083a..7c58653d1 100644 --- a/infra/batch/main.tf +++ b/infra/batch/main.tf @@ -44,6 +44,8 @@ locals { deployment_timestamp = formatdate("YYYYMMDDhhmmss", timestamp()) function_pmtiles_builder_config = jsondecode(file("${path.module}/../../functions-python/pmtiles_builder/function_config.json")) + + function_gtfs_change_tracker_config = jsondecode(file("${path.module}/../../functions-python/gtfs_change_tracker/function_config.json")) } data "google_vpc_access_connector" "vpc_connector" { @@ -272,6 +274,26 @@ resource "google_cloud_tasks_queue" "pmtiles_builder_task_queue" { } } +# Task queue to invoke gtfs_change_tracker function +resource "google_cloud_tasks_queue" "gtfs_change_tracker_task_queue" { + project = var.project_id + location = var.gcp_region + name = "gtfs-change-tracker-queue-${var.environment}-${local.deployment_timestamp}" + + rate_limits { + max_concurrent_dispatches = 10 + max_dispatches_per_second = 1 + } + + retry_config { + # Retries span ~10 minutes: initial try + 2 retries at 120s then 240s + max_attempts = 3 + min_backoff = "120s" + max_backoff = "240s" + max_doublings = 1 + } +} + # Batch process dataset function resource "google_cloudfunctions2_function" "pubsub_function" { @@ -310,6 +332,7 @@ resource "google_cloudfunctions2_function" "pubsub_function" { MATERIALIZED_VIEW_QUEUE = google_cloud_tasks_queue.refresh_materialized_view_task_queue.name PMTILES_BUILDER_QUEUE = google_cloud_tasks_queue.pmtiles_builder_task_queue.name REVERSE_GEOLOCATION_QUEUE = "reverse-geolocation-processor-task-queue" + GTFS_CHANGE_TRACKER_QUEUE = google_cloud_tasks_queue.gtfs_change_tracker_task_queue.name WEB_REVALIDATION_QUEUE = google_cloud_tasks_queue.web_revalidation_task_queue.name } dynamic "secret_environment_variables" { @@ -468,4 +491,12 @@ resource "google_cloud_run_service_iam_member" "pmtiles_builder_invoker" { service = "${local.function_pmtiles_builder_config.name}-${var.environment}" role = "roles/run.invoker" member = "serviceAccount:${google_service_account.functions_service_account.email}" +} + +resource "google_cloud_run_service_iam_member" "gtfs_change_tracker_invoker" { + project = var.project_id + location = var.gcp_region + service = "${local.function_gtfs_change_tracker_config.name}-${var.environment}" + role = "roles/run.invoker" + member = "serviceAccount:${google_service_account.functions_service_account.email}" } \ No newline at end of file diff --git a/infra/functions-python/main.tf b/infra/functions-python/main.tf index 238e2eb5d..5d41c679c 100644 --- a/infra/functions-python/main.tf +++ b/infra/functions-python/main.tf @@ -1281,6 +1281,7 @@ resource "google_cloudfunctions2_function" "gtfs_change_tracker" { PROJECT_ID = var.project_id GCP_REGION = var.gcp_region DATASETS_BUCKET_NAME = "${var.datasets_bucket_name}-${var.environment}" + DATASETS_BUCKET_MOUNT = "/mobilitydata-datasets" } available_memory = local.function_gtfs_change_tracker_config.memory timeout_seconds = local.function_gtfs_change_tracker_config.timeout @@ -1293,6 +1294,11 @@ resource "google_cloudfunctions2_function" "gtfs_change_tracker" { vpc_connector = data.google_vpc_access_connector.vpc_connector.id vpc_connector_egress_settings = "PRIVATE_RANGES_ONLY" + volume_mounts { + name = "datasets-bucket" + mount_path = "/mobilitydata-datasets" + } + dynamic "secret_environment_variables" { for_each = local.function_gtfs_change_tracker_config.secret_environment_variables content { @@ -1303,6 +1309,14 @@ resource "google_cloudfunctions2_function" "gtfs_change_tracker" { } } } + + volumes { + name = "datasets-bucket" + cloud_storage { + bucket = "${var.datasets_bucket_name}-${var.environment}" + read_only = true + } + } } # Create the Pub/Sub topic used for publishing messages about rebuilding missing bounding boxes From 806dc290b33c9fd83e64946fc79b5f6bee0c52a7 Mon Sep 17 00:00:00 2001 From: jcpitre Date: Tue, 9 Jun 2026 16:46:11 -0400 Subject: [PATCH 05/21] Mount GCS bucket on gtfs_change_tracker via terraform_data workaround --- infra/functions-python/main.tf | 52 +++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 16 deletions(-) diff --git a/infra/functions-python/main.tf b/infra/functions-python/main.tf index 5d41c679c..f7859f7df 100644 --- a/infra/functions-python/main.tf +++ b/infra/functions-python/main.tf @@ -1277,10 +1277,10 @@ resource "google_cloudfunctions2_function" "gtfs_change_tracker" { } service_config { environment_variables = { - ENVIRONMENT = var.environment - PROJECT_ID = var.project_id - GCP_REGION = var.gcp_region - DATASETS_BUCKET_NAME = "${var.datasets_bucket_name}-${var.environment}" + ENVIRONMENT = var.environment + PROJECT_ID = var.project_id + GCP_REGION = var.gcp_region + DATASETS_BUCKET_NAME = "${var.datasets_bucket_name}-${var.environment}" DATASETS_BUCKET_MOUNT = "/mobilitydata-datasets" } available_memory = local.function_gtfs_change_tracker_config.memory @@ -1294,11 +1294,6 @@ resource "google_cloudfunctions2_function" "gtfs_change_tracker" { vpc_connector = data.google_vpc_access_connector.vpc_connector.id vpc_connector_egress_settings = "PRIVATE_RANGES_ONLY" - volume_mounts { - name = "datasets-bucket" - mount_path = "/mobilitydata-datasets" - } - dynamic "secret_environment_variables" { for_each = local.function_gtfs_change_tracker_config.secret_environment_variables content { @@ -1309,14 +1304,39 @@ resource "google_cloudfunctions2_function" "gtfs_change_tracker" { } } } +} - volumes { - name = "datasets-bucket" - cloud_storage { - bucket = "${var.datasets_bucket_name}-${var.environment}" - read_only = true - } - } +# google_cloudfunctions2_function does not expose volume mounts in its schema. +# This terraform_data resource mounts the datasets GCS bucket on the underlying Cloud Run service +# after the function is deployed, using `gcloud run services update`. +resource "terraform_data" "gtfs_change_tracker_gcs_mount" { + triggers_replace = { + function_name = google_cloudfunctions2_function.gtfs_change_tracker.name + bucket = "${var.datasets_bucket_name}-${var.environment}" + region = var.gcp_region + project = var.project_id + } + + provisioner "local-exec" { + command = <<-EOT + MOUNTS=$(gcloud run services describe ${google_cloudfunctions2_function.gtfs_change_tracker.name} \ + --project ${var.project_id} \ + --region ${var.gcp_region} \ + --format='value(spec.template.spec.volumes[].name)' 2>/dev/null) + if echo "$MOUNTS" | grep -q "datasets-bucket"; then + echo "GCS volume already mounted, skipping." + else + gcloud run services update ${google_cloudfunctions2_function.gtfs_change_tracker.name} \ + --project ${var.project_id} \ + --region ${var.gcp_region} \ + --add-volume name=datasets-bucket,type=cloud-storage,bucket=${var.datasets_bucket_name}-${var.environment},readonly=true \ + --add-volume-mount volume=datasets-bucket,mount-path=/mobilitydata-datasets \ + --quiet + fi + EOT + } + + depends_on = [google_cloudfunctions2_function.gtfs_change_tracker] } # Create the Pub/Sub topic used for publishing messages about rebuilding missing bounding boxes From 4489e2440a62e9841357476eabbf1b805e4f6b7c Mon Sep 17 00:00:00 2001 From: jcpitre Date: Tue, 9 Jun 2026 22:36:15 -0400 Subject: [PATCH 06/21] Allowed ALL ingress --- functions-python/gtfs_change_tracker/function_config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/functions-python/gtfs_change_tracker/function_config.json b/functions-python/gtfs_change_tracker/function_config.json index e9e6713a6..8508b3cc5 100644 --- a/functions-python/gtfs_change_tracker/function_config.json +++ b/functions-python/gtfs_change_tracker/function_config.json @@ -20,7 +20,7 @@ "key": "FEEDS_DATABASE_URL" } ], - "ingress_settings": "ALLOW_INTERNAL_AND_GCLB", + "ingress_settings": "ALLOW_ALL", "max_instance_request_concurrency": 1, "max_instance_count": 10, "min_instance_count": 0, From 5b1177f2d562cba6dbab802251ef24cafa49f9e2 Mon Sep 17 00:00:00 2001 From: jcpitre Date: Tue, 9 Jun 2026 23:12:29 -0400 Subject: [PATCH 07/21] Change ids to stable_ids --- .../gtfs_change_tracker/src/main.py | 120 ++++++++++-------- .../gtfs_change_tracker/tests/test_main.py | 60 ++++++--- 2 files changed, 105 insertions(+), 75 deletions(-) diff --git a/functions-python/gtfs_change_tracker/src/main.py b/functions-python/gtfs_change_tracker/src/main.py index 1e6fac6a0..a1e490885 100644 --- a/functions-python/gtfs_change_tracker/src/main.py +++ b/functions-python/gtfs_change_tracker/src/main.py @@ -43,19 +43,21 @@ def gtfs_change_tracker(request: flask.Request) -> dict: HTTP entrypoint for the GTFS change tracker function. Expects a JSON body with: - feed_id – DB id of the GTFS feed (FK to gtfsfeed.id) - previous_dataset_id – DB id of the previous Gtfsdataset - current_dataset_id – DB id of the current Gtfsdataset + feed_stable_id – stable_id of the GTFS feed + previous_dataset_stable_id – stable_id of the previous Gtfsdataset + current_dataset_stable_id – stable_id of the current Gtfsdataset """ payload = request.get_json(silent=True) or {} - feed_id = payload.get("feed_id") - previous_dataset_id = payload.get("previous_dataset_id") - current_dataset_id = payload.get("current_dataset_id") + feed_stable_id = payload.get("feed_stable_id") + previous_dataset_stable_id = payload.get("previous_dataset_stable_id") + current_dataset_stable_id = payload.get("current_dataset_stable_id") - if not (feed_id and previous_dataset_id and current_dataset_id): + if not ( + feed_stable_id and previous_dataset_stable_id and current_dataset_stable_id + ): return { "status": "error", - "error": "feed_id, previous_dataset_id, and current_dataset_id are required.", + "error": "feed_stable_id, previous_dataset_stable_id, and current_dataset_stable_id are required.", } bucket_name = os.getenv("DATASETS_BUCKET_NAME") @@ -68,9 +70,9 @@ def gtfs_change_tracker(request: flask.Request) -> dict: bucket_mount = os.getenv("DATASETS_BUCKET_MOUNT", "/mobilitydata-datasets") try: tracker = GtfsChangeTracker( - feed_id=feed_id, - previous_dataset_id=previous_dataset_id, - current_dataset_id=current_dataset_id, + feed_stable_id=feed_stable_id, + previous_dataset_stable_id=previous_dataset_stable_id, + current_dataset_stable_id=current_dataset_stable_id, bucket_name=bucket_name, bucket_mount=bucket_mount, ) @@ -79,8 +81,8 @@ def gtfs_change_tracker(request: flask.Request) -> dict: except Exception as e: logging.exception( "Failed to generate changelog for %s -> %s", - previous_dataset_id, - current_dataset_id, + previous_dataset_stable_id, + current_dataset_stable_id, ) return { "status": "error", @@ -93,57 +95,64 @@ class GtfsChangeTracker: Orchestrates GTFS change tracking between two consecutive datasets. Steps: - 1. Resolve both datasets and the feed stable_id from the database. + 1. Resolve both datasets from the database using their stable_ids. 2. Locate the pre-extracted GTFS files on the mounted GCS bucket filesystem (///extracted/). 3. Compute a structured diff using gtfs-diff-engine. - 4. Upload the changelog JSON to GCS at two paths (shared changelogs index - and per-dataset location). + 4. Upload the changelog JSON to GCS. 5. Upsert a row in gtfs_dataset_changelog. """ def __init__( self, - feed_id: str, - previous_dataset_id: str, - current_dataset_id: str, + feed_stable_id: str, + previous_dataset_stable_id: str, + current_dataset_stable_id: str, bucket_name: str, bucket_mount: str, ): - self.feed_id = feed_id - self.previous_dataset_id = previous_dataset_id - self.current_dataset_id = current_dataset_id + self.feed_stable_id = feed_stable_id + self.previous_dataset_stable_id = previous_dataset_stable_id + self.current_dataset_stable_id = current_dataset_stable_id self.bucket_name = bucket_name self.bucket_mount = bucket_mount - self.logger = get_logger(GtfsChangeTracker.__name__, current_dataset_id) + self.logger = get_logger(GtfsChangeTracker.__name__, current_dataset_stable_id) def run(self) -> dict: """Execute the full change-tracking pipeline.""" - prev_dataset, curr_dataset, feed_stable_id = self._resolve_datasets() + prev_dataset, curr_dataset = self._resolve_datasets() self.logger.info( "Computing diff for feed %s: %s -> %s", - feed_stable_id, - prev_dataset.stable_id, - curr_dataset.stable_id, + self.feed_stable_id, + self.previous_dataset_stable_id, + self.current_dataset_stable_id, ) - prev_dir = self._extracted_dir(feed_stable_id, prev_dataset.stable_id) - curr_dir = self._extracted_dir(feed_stable_id, curr_dataset.stable_id) + prev_dir = self._extracted_dir( + self.feed_stable_id, self.previous_dataset_stable_id + ) + curr_dir = self._extracted_dir( + self.feed_stable_id, self.current_dataset_stable_id + ) diff_result = diff_feeds(prev_dir, curr_dir) changelog_json = diff_result.model_dump_json(indent=2).encode("utf-8") changelog_url = self._upload_changelog( changelog_json, - feed_stable_id, - prev_dataset.stable_id, - curr_dataset.stable_id, + self.feed_stable_id, + self.previous_dataset_stable_id, + self.current_dataset_stable_id, ) diff_summary = diff_result.summary.model_dump() self._save_changelog_record( - changelog_url=changelog_url, diff_summary=diff_summary + feed_uuid=curr_dataset.feed.id, + prev_dataset_uuid=prev_dataset.id, + curr_dataset_uuid=curr_dataset.id, + changelog_url=changelog_url, + diff_summary=diff_summary, ) self.logger.info("Changelog stored at %s", changelog_url) @@ -155,42 +164,38 @@ def run(self) -> dict: @with_db_session def _resolve_datasets(self, db_session: Session = None) -> tuple: """ - Load both Gtfsdataset rows and return (previous_dataset, current_dataset, feed_stable_id). + Load both Gtfsdataset rows by stable_id and return (previous_dataset, current_dataset). """ prev_dataset = ( db_session.query(Gtfsdataset) - .filter(Gtfsdataset.id == self.previous_dataset_id) + .filter(Gtfsdataset.stable_id == self.previous_dataset_stable_id) .one_or_none() ) if prev_dataset is None: - raise ValueError(f"Previous dataset not found: {self.previous_dataset_id}") + raise ValueError( + f"Previous dataset not found: {self.previous_dataset_stable_id}" + ) curr_dataset = ( db_session.query(Gtfsdataset) - .filter(Gtfsdataset.id == self.current_dataset_id) + .filter(Gtfsdataset.stable_id == self.current_dataset_stable_id) .one_or_none() ) if curr_dataset is None: - raise ValueError(f"Current dataset not found: {self.current_dataset_id}") - - if not prev_dataset.stable_id: raise ValueError( - f"Previous dataset {self.previous_dataset_id} has no stable_id." + f"Current dataset not found: {self.current_dataset_stable_id}" ) - if not curr_dataset.stable_id: + + if curr_dataset.feed.stable_id != self.feed_stable_id: raise ValueError( - f"Current dataset {self.current_dataset_id} has no stable_id." + f"Dataset {self.current_dataset_stable_id} does not belong to feed {self.feed_stable_id}." ) - feed_stable_id = curr_dataset.feed.stable_id - if not feed_stable_id: - raise ValueError(f"Feed {self.feed_id} has no stable_id.") - # Detach from session before returning so objects can be used outside the session db_session.expunge(prev_dataset) db_session.expunge(curr_dataset) - return prev_dataset, curr_dataset, feed_stable_id + return prev_dataset, curr_dataset def _extracted_dir(self, feed_stable_id: str, dataset_stable_id: str) -> str: """ @@ -234,6 +239,9 @@ def _upload_changelog( @with_db_session def _save_changelog_record( self, + feed_uuid: str, + prev_dataset_uuid: str, + curr_dataset_uuid: str, changelog_url: str, diff_summary: dict, db_session: Session = None, @@ -247,9 +255,9 @@ def _save_changelog_record( stmt = ( insert(GtfsDatasetChangelog) .values( - feed_id=self.feed_id, - previous_dataset_id=self.previous_dataset_id, - current_dataset_id=self.current_dataset_id, + feed_id=feed_uuid, + previous_dataset_id=prev_dataset_uuid, + current_dataset_id=curr_dataset_uuid, changelog_url=changelog_url, diff_summary=diff_summary, ) @@ -266,15 +274,15 @@ def _save_changelog_record( db_session.commit() self.logger.info( "Saved changelog record for %s -> %s", - self.previous_dataset_id, - self.current_dataset_id, + self.previous_dataset_stable_id, + self.current_dataset_stable_id, ) else: self.logger.info( "[TEMP] Would upsert gtfs_dataset_changelog: feed_id=%s previous=%s current=%s url=%s summary=%s", - self.feed_id, - self.previous_dataset_id, - self.current_dataset_id, + feed_uuid, + prev_dataset_uuid, + curr_dataset_uuid, changelog_url, diff_summary, ) diff --git a/functions-python/gtfs_change_tracker/tests/test_main.py b/functions-python/gtfs_change_tracker/tests/test_main.py index f16e4b226..74d5bffc2 100644 --- a/functions-python/gtfs_change_tracker/tests/test_main.py +++ b/functions-python/gtfs_change_tracker/tests/test_main.py @@ -21,12 +21,15 @@ from main import GtfsChangeTracker, gtfs_change_tracker -def _make_dataset(id_, stable_id, hosted_url, feed_stable_id="mdb-1"): +def _make_dataset( + id_, stable_id, hosted_url, feed_stable_id="mdb-1", feed_id="feed-uuid" +): dataset = MagicMock() dataset.id = id_ dataset.stable_id = stable_id dataset.hosted_url = hosted_url dataset.feed = MagicMock() + dataset.feed.id = feed_id dataset.feed.stable_id = feed_stable_id return dataset @@ -44,13 +47,19 @@ def test_missing_all_params(self): self.assertIn("required", result["error"]) def test_missing_one_param(self): - result = self._request({"feed_id": "f1", "previous_dataset_id": "p1"}) + result = self._request( + {"feed_stable_id": "mdb-1", "previous_dataset_stable_id": "mdb-1-20240101"} + ) self.assertEqual(result["status"], "error") def test_missing_bucket_env(self): os.environ.pop("DATASETS_BUCKET_NAME", None) result = self._request( - {"feed_id": "f1", "previous_dataset_id": "p1", "current_dataset_id": "c1"} + { + "feed_stable_id": "mdb-1", + "previous_dataset_stable_id": "mdb-1-20240101", + "current_dataset_stable_id": "mdb-1-20240201", + } ) self.assertEqual(result["status"], "error") self.assertIn("DATASETS_BUCKET_NAME", result["error"]) @@ -66,14 +75,18 @@ def test_success(self, mock_tracker_cls): mock_tracker_cls.return_value = instance result = self._request( - {"feed_id": "f1", "previous_dataset_id": "p1", "current_dataset_id": "c1"} + { + "feed_stable_id": "mdb-1", + "previous_dataset_stable_id": "mdb-1-20240101", + "current_dataset_stable_id": "mdb-1-20240201", + } ) self.assertEqual(result["status"], "success") self.assertIn("changelog_url", result) mock_tracker_cls.assert_called_once_with( - feed_id="f1", - previous_dataset_id="p1", - current_dataset_id="c1", + feed_stable_id="mdb-1", + previous_dataset_stable_id="mdb-1-20240101", + current_dataset_stable_id="mdb-1-20240201", bucket_name="test-bucket", bucket_mount="/mobilitydata-datasets", ) @@ -86,7 +99,11 @@ def test_exception_returns_error(self, mock_tracker_cls): mock_tracker_cls.return_value = instance result = self._request( - {"feed_id": "f1", "previous_dataset_id": "p1", "current_dataset_id": "c1"} + { + "feed_stable_id": "mdb-1", + "previous_dataset_stable_id": "mdb-1-20240101", + "current_dataset_stable_id": "mdb-1-20240201", + } ) self.assertEqual(result["status"], "error") self.assertIn("something went wrong", result["error"]) @@ -98,9 +115,9 @@ class TestGtfsChangeTrackerRun(unittest.TestCase): def setUp(self): os.environ["DATASETS_BUCKET_NAME"] = "test-bucket" self.tracker = GtfsChangeTracker( - feed_id="feed-uuid", - previous_dataset_id="prev-uuid", - current_dataset_id="curr-uuid", + feed_stable_id="mdb-1", + previous_dataset_stable_id="mdb-1-20240101", + current_dataset_stable_id="mdb-1-20240201", bucket_name="test-bucket", bucket_mount="/mobilitydata-datasets", ) @@ -118,7 +135,7 @@ def test_run_happy_path( curr_ds = _make_dataset( "curr-uuid", "mdb-1-20240201", "https://example.com/curr.zip" ) - mock_resolve.return_value = (prev_ds, curr_ds, "mdb-1") + mock_resolve.return_value = (prev_ds, curr_ds) mock_extracted_dir.side_effect = [ "/mobilitydata-datasets/mdb-1/mdb-1-20240101/extracted", "/mobilitydata-datasets/mdb-1/mdb-1-20240201/extracted", @@ -154,6 +171,9 @@ def test_run_happy_path( "mdb-1-20240201", ) mock_save.assert_called_once_with( + feed_uuid="feed-uuid", + prev_dataset_uuid="prev-uuid", + curr_dataset_uuid="curr-uuid", changelog_url=changelog_url, diff_summary={"total_changes": 42}, ) @@ -161,7 +181,9 @@ def test_run_happy_path( @patch("main.GtfsChangeTracker._resolve_datasets") def test_run_raises_when_resolve_fails(self, mock_resolve): - mock_resolve.side_effect = ValueError("Previous dataset not found: prev-uuid") + mock_resolve.side_effect = ValueError( + "Previous dataset not found: mdb-1-20240101" + ) with self.assertRaises(ValueError): self.tracker.run() @@ -169,9 +191,9 @@ def test_run_raises_when_resolve_fails(self, mock_resolve): class TestExtractedDir(unittest.TestCase): def setUp(self): self.tracker = GtfsChangeTracker( - feed_id="f", - previous_dataset_id="p", - current_dataset_id="c", + feed_stable_id="mdb-1", + previous_dataset_stable_id="mdb-1-20240101", + current_dataset_stable_id="mdb-1-20240201", bucket_name="my-bucket", bucket_mount="/mobilitydata-datasets", ) @@ -192,9 +214,9 @@ def test_raises_when_dir_not_found(self): class TestUploadChangelog(unittest.TestCase): def setUp(self): self.tracker = GtfsChangeTracker( - feed_id="f", - previous_dataset_id="p", - current_dataset_id="c", + feed_stable_id="mdb-1", + previous_dataset_stable_id="mdb-1-20240101", + current_dataset_stable_id="mdb-1-20240201", bucket_name="my-bucket", bucket_mount="/mobilitydata-datasets", ) From 55b9942a64c8a4ab05a398fdc6e85f1ccd8fa060 Mon Sep 17 00:00:00 2001 From: jcpitre Date: Wed, 10 Jun 2026 09:59:46 -0400 Subject: [PATCH 08/21] Use stable IDs in payload and return plain UUIDs from _resolve_datasets --- .../gtfs_change_tracker/src/main.py | 17 ++--- .../gtfs_change_tracker/tests/test_main.py | 70 +++++++++++++++++-- 2 files changed, 70 insertions(+), 17 deletions(-) diff --git a/functions-python/gtfs_change_tracker/src/main.py b/functions-python/gtfs_change_tracker/src/main.py index a1e490885..92ac0ce3e 100644 --- a/functions-python/gtfs_change_tracker/src/main.py +++ b/functions-python/gtfs_change_tracker/src/main.py @@ -120,7 +120,7 @@ def __init__( def run(self) -> dict: """Execute the full change-tracking pipeline.""" - prev_dataset, curr_dataset = self._resolve_datasets() + prev_dataset_uuid, curr_dataset_uuid, feed_uuid = self._resolve_datasets() self.logger.info( "Computing diff for feed %s: %s -> %s", @@ -148,9 +148,9 @@ def run(self) -> dict: diff_summary = diff_result.summary.model_dump() self._save_changelog_record( - feed_uuid=curr_dataset.feed.id, - prev_dataset_uuid=prev_dataset.id, - curr_dataset_uuid=curr_dataset.id, + feed_uuid=feed_uuid, + prev_dataset_uuid=prev_dataset_uuid, + curr_dataset_uuid=curr_dataset_uuid, changelog_url=changelog_url, diff_summary=diff_summary, ) @@ -164,7 +164,8 @@ def run(self) -> dict: @with_db_session def _resolve_datasets(self, db_session: Session = None) -> tuple: """ - Load both Gtfsdataset rows by stable_id and return (previous_dataset, current_dataset). + Validate both datasets exist and belong to the given feed. + Returns (prev_dataset_uuid, curr_dataset_uuid, feed_uuid) as plain strings. """ prev_dataset = ( db_session.query(Gtfsdataset) @@ -191,11 +192,7 @@ def _resolve_datasets(self, db_session: Session = None) -> tuple: f"Dataset {self.current_dataset_stable_id} does not belong to feed {self.feed_stable_id}." ) - # Detach from session before returning so objects can be used outside the session - db_session.expunge(prev_dataset) - db_session.expunge(curr_dataset) - - return prev_dataset, curr_dataset + return prev_dataset.id, curr_dataset.id, curr_dataset.feed.id def _extracted_dir(self, feed_stable_id: str, dataset_stable_id: str) -> str: """ diff --git a/functions-python/gtfs_change_tracker/tests/test_main.py b/functions-python/gtfs_change_tracker/tests/test_main.py index 74d5bffc2..a7435f434 100644 --- a/functions-python/gtfs_change_tracker/tests/test_main.py +++ b/functions-python/gtfs_change_tracker/tests/test_main.py @@ -129,13 +129,7 @@ def setUp(self): def test_run_happy_path( self, mock_resolve, mock_extracted_dir, mock_upload, mock_save ): - prev_ds = _make_dataset( - "prev-uuid", "mdb-1-20240101", "https://example.com/prev.zip" - ) - curr_ds = _make_dataset( - "curr-uuid", "mdb-1-20240201", "https://example.com/curr.zip" - ) - mock_resolve.return_value = (prev_ds, curr_ds) + mock_resolve.return_value = ("prev-uuid", "curr-uuid", "feed-uuid") mock_extracted_dir.side_effect = [ "/mobilitydata-datasets/mdb-1/mdb-1-20240101/extracted", "/mobilitydata-datasets/mdb-1/mdb-1-20240201/extracted", @@ -188,6 +182,68 @@ def test_run_raises_when_resolve_fails(self, mock_resolve): self.tracker.run() +class TestResolveDatasets(unittest.TestCase): + """Tests for _resolve_datasets — verifies plain string UUIDs are returned.""" + + def setUp(self): + self.tracker = GtfsChangeTracker( + feed_stable_id="mdb-1", + previous_dataset_stable_id="mdb-1-20240101", + current_dataset_stable_id="mdb-1-20240201", + bucket_name="my-bucket", + bucket_mount="/mobilitydata-datasets", + ) + + def _mock_session_with_datasets(self): + prev_ds = MagicMock() + prev_ds.id = "prev-uuid" + prev_ds.stable_id = "mdb-1-20240101" + + curr_ds = MagicMock() + curr_ds.id = "curr-uuid" + curr_ds.stable_id = "mdb-1-20240201" + curr_ds.feed.id = "feed-uuid" + curr_ds.feed.stable_id = "mdb-1" + + mock_session = MagicMock() + mock_session.query.return_value.filter.return_value.one_or_none.side_effect = [ + prev_ds, + curr_ds, + ] + return mock_session + + @patch("main.with_db_session", lambda f: f) + def test_returns_plain_string_uuids(self): + """_resolve_datasets must return str UUIDs, not ORM objects. + If ORM objects were returned, accessing lazy attributes after session + close would raise DetachedInstanceError in production.""" + mock_session = self._mock_session_with_datasets() + result = self.tracker._resolve_datasets(db_session=mock_session) + prev_uuid, curr_uuid, feed_uuid = result + self.assertIsInstance(prev_uuid, str, "prev_uuid must be a plain string") + self.assertIsInstance(curr_uuid, str, "curr_uuid must be a plain string") + self.assertIsInstance(feed_uuid, str, "feed_uuid must be a plain string") + self.assertEqual(prev_uuid, "prev-uuid") + self.assertEqual(curr_uuid, "curr-uuid") + self.assertEqual(feed_uuid, "feed-uuid") + + @patch("main.with_db_session", lambda f: f) + def test_raises_when_feed_mismatch(self): + prev_ds = MagicMock() + prev_ds.id = "prev-uuid" + curr_ds = MagicMock() + curr_ds.id = "curr-uuid" + curr_ds.feed.stable_id = "mdb-999" # wrong feed + + mock_session = MagicMock() + mock_session.query.return_value.filter.return_value.one_or_none.side_effect = [ + prev_ds, + curr_ds, + ] + with self.assertRaises(ValueError, msg="should reject dataset from wrong feed"): + self.tracker._resolve_datasets(db_session=mock_session) + + class TestExtractedDir(unittest.TestCase): def setUp(self): self.tracker = GtfsChangeTracker( From dcdf833f9d83bc07ceae334d298e917a339bed73 Mon Sep 17 00:00:00 2001 From: jcpitre Date: Wed, 10 Jun 2026 21:51:33 -0400 Subject: [PATCH 09/21] Idempotency check, memory limiting, and retry suppression --- .../gtfs_change_tracker/function_config.json | 3 + .../gtfs_change_tracker/src/main.py | 125 +++++++++++------- .../gtfs_change_tracker/tests/test_main.py | 100 +++++++++----- infra/functions-python/main.tf | 22 ++- 4 files changed, 159 insertions(+), 91 deletions(-) diff --git a/functions-python/gtfs_change_tracker/function_config.json b/functions-python/gtfs_change_tracker/function_config.json index 8508b3cc5..3d04400b7 100644 --- a/functions-python/gtfs_change_tracker/function_config.json +++ b/functions-python/gtfs_change_tracker/function_config.json @@ -13,6 +13,9 @@ }, { "key": "LOGGING_LEVEL" + }, + { + "key": "GTFS_DIFF_DUCKDB_TMPDIR" } ], "secret_environment_variables": [ diff --git a/functions-python/gtfs_change_tracker/src/main.py b/functions-python/gtfs_change_tracker/src/main.py index 92ac0ce3e..1d9780385 100644 --- a/functions-python/gtfs_change_tracker/src/main.py +++ b/functions-python/gtfs_change_tracker/src/main.py @@ -27,14 +27,17 @@ from sqlalchemy.dialects.postgresql import insert from sqlalchemy.orm import Session +from shared.common.gcp_memory_utils import limit_gcp_memory from shared.database.database import with_db_session from shared.database_gen.sqlacodegen_models import ( GtfsDatasetChangelog, Gtfsdataset, ) from shared.helpers.logger import get_logger, init_logger +from shared.helpers.runtime_metrics import track_metrics init_logger() +limit_gcp_memory(os.getenv("GTFS_DIFF_DUCKDB_TMPDIR", "/tmp/in-memory")) @functions_framework.http @@ -43,51 +46,62 @@ def gtfs_change_tracker(request: flask.Request) -> dict: HTTP entrypoint for the GTFS change tracker function. Expects a JSON body with: - feed_stable_id – stable_id of the GTFS feed - previous_dataset_stable_id – stable_id of the previous Gtfsdataset - current_dataset_stable_id – stable_id of the current Gtfsdataset + feed_stable_id – stable_id of the GTFS feed + base_dataset_stable_id – stable_id of the previous Gtfsdataset + new_dataset_stable_id – stable_id of the current Gtfsdataset + + Always returns HTTP 200 — errors are reported in the response body. + This prevents GCP from retrying failures: we cannot distinguish transient from + permanent errors (e.g. a DB blip vs the DB being down), so retrying would only + waste resources. The idempotency check in run() makes explicit reruns safe. """ payload = request.get_json(silent=True) or {} feed_stable_id = payload.get("feed_stable_id") - previous_dataset_stable_id = payload.get("previous_dataset_stable_id") - current_dataset_stable_id = payload.get("current_dataset_stable_id") + base_dataset_stable_id = payload.get("base_dataset_stable_id") + new_dataset_stable_id = payload.get("new_dataset_stable_id") - if not ( - feed_stable_id and previous_dataset_stable_id and current_dataset_stable_id - ): - return { - "status": "error", - "error": "feed_stable_id, previous_dataset_stable_id, and current_dataset_stable_id are required.", - } + if not (feed_stable_id and base_dataset_stable_id and new_dataset_stable_id): + return flask.make_response( + { + "status": "error", + "error": "feed_stable_id, base_dataset_stable_id, and new_dataset_stable_id are required.", + }, + 200, + ) bucket_name = os.getenv("DATASETS_BUCKET_NAME") if not bucket_name: - return { - "status": "error", - "error": "DATASETS_BUCKET_NAME environment variable is not set.", - } + return flask.make_response( + { + "status": "error", + "error": "DATASETS_BUCKET_NAME environment variable is not set.", + }, + 200, + ) - bucket_mount = os.getenv("DATASETS_BUCKET_MOUNT", "/mobilitydata-datasets") + bucket_mount = os.getenv("DATASETS_BUCKET_MOUNT", "/tmp/mobilitydata-datasets") try: tracker = GtfsChangeTracker( feed_stable_id=feed_stable_id, - previous_dataset_stable_id=previous_dataset_stable_id, - current_dataset_stable_id=current_dataset_stable_id, + base_dataset_stable_id=base_dataset_stable_id, + new_dataset_stable_id=new_dataset_stable_id, bucket_name=bucket_name, bucket_mount=bucket_mount, ) result = tracker.run() - return {"status": "success", **result} + return flask.make_response({"status": "success", **result}, 200) except Exception as e: + # We cannot reliably distinguish transient from permanent errors, so we always + # return HTTP 200 to suppress GCP retries. If a specific exception type is + # identified as safely retriable in the future, catch it here and return 500. logging.exception( "Failed to generate changelog for %s -> %s", - previous_dataset_stable_id, - current_dataset_stable_id, + base_dataset_stable_id, + new_dataset_stable_id, + ) + return flask.make_response( + {"status": "error", "error": f"Failed to generate changelog: {e}"}, 200 ) - return { - "status": "error", - "error": f"Failed to generate changelog: {e}", - } class GtfsChangeTracker: @@ -106,35 +120,46 @@ class GtfsChangeTracker: def __init__( self, feed_stable_id: str, - previous_dataset_stable_id: str, - current_dataset_stable_id: str, + base_dataset_stable_id: str, + new_dataset_stable_id: str, bucket_name: str, bucket_mount: str, ): self.feed_stable_id = feed_stable_id - self.previous_dataset_stable_id = previous_dataset_stable_id - self.current_dataset_stable_id = current_dataset_stable_id + self.base_dataset_stable_id = base_dataset_stable_id + self.new_dataset_stable_id = new_dataset_stable_id self.bucket_name = bucket_name self.bucket_mount = bucket_mount - self.logger = get_logger(GtfsChangeTracker.__name__, current_dataset_stable_id) + self.logger = get_logger(GtfsChangeTracker.__name__, new_dataset_stable_id) + @track_metrics(metrics=("time", "memory", "cpu")) def run(self) -> dict: """Execute the full change-tracking pipeline.""" + # Idempotency: if the changelog already exists in GCS, skip recomputing. + changelog_blob_path = ( + f"{self.feed_stable_id}/{self.new_dataset_stable_id}/" + f"{self.new_dataset_stable_id}_{self.base_dataset_stable_id}_changelog.json" + ) + blob = storage.Client().bucket(self.bucket_name).blob(changelog_blob_path) + if blob.exists(): + changelog_url = f"https://storage.googleapis.com/{self.bucket_name}/{changelog_blob_path}" + self.logger.info("Changelog already exists, skipping: %s", changelog_url) + return { + "message": "Changelog already exists.", + "changelog_url": changelog_url, + } + prev_dataset_uuid, curr_dataset_uuid, feed_uuid = self._resolve_datasets() self.logger.info( "Computing diff for feed %s: %s -> %s", self.feed_stable_id, - self.previous_dataset_stable_id, - self.current_dataset_stable_id, + self.base_dataset_stable_id, + self.new_dataset_stable_id, ) - prev_dir = self._extracted_dir( - self.feed_stable_id, self.previous_dataset_stable_id - ) - curr_dir = self._extracted_dir( - self.feed_stable_id, self.current_dataset_stable_id - ) + prev_dir = self._extracted_dir(self.feed_stable_id, self.base_dataset_stable_id) + curr_dir = self._extracted_dir(self.feed_stable_id, self.new_dataset_stable_id) diff_result = diff_feeds(prev_dir, curr_dir) @@ -142,8 +167,8 @@ def run(self) -> dict: changelog_url = self._upload_changelog( changelog_json, self.feed_stable_id, - self.previous_dataset_stable_id, - self.current_dataset_stable_id, + self.base_dataset_stable_id, + self.new_dataset_stable_id, ) diff_summary = diff_result.summary.model_dump() @@ -169,27 +194,25 @@ def _resolve_datasets(self, db_session: Session = None) -> tuple: """ prev_dataset = ( db_session.query(Gtfsdataset) - .filter(Gtfsdataset.stable_id == self.previous_dataset_stable_id) + .filter(Gtfsdataset.stable_id == self.base_dataset_stable_id) .one_or_none() ) if prev_dataset is None: raise ValueError( - f"Previous dataset not found: {self.previous_dataset_stable_id}" + f"Previous dataset not found: {self.base_dataset_stable_id}" ) curr_dataset = ( db_session.query(Gtfsdataset) - .filter(Gtfsdataset.stable_id == self.current_dataset_stable_id) + .filter(Gtfsdataset.stable_id == self.new_dataset_stable_id) .one_or_none() ) if curr_dataset is None: - raise ValueError( - f"Current dataset not found: {self.current_dataset_stable_id}" - ) + raise ValueError(f"Current dataset not found: {self.new_dataset_stable_id}") if curr_dataset.feed.stable_id != self.feed_stable_id: raise ValueError( - f"Dataset {self.current_dataset_stable_id} does not belong to feed {self.feed_stable_id}." + f"Dataset {self.new_dataset_stable_id} does not belong to feed {self.feed_stable_id}." ) return prev_dataset.id, curr_dataset.id, curr_dataset.feed.id @@ -248,7 +271,7 @@ def _save_changelog_record( The UNIQUE constraint on (previous_dataset_id, current_dataset_id) ensures idempotency. """ # TODO: remove this flag and always write to DB once testing is complete - if os.getenv("CHANGELOG_DB_WRITE_ENABLED", "false").lower() == "true": + if os.getenv("CHANGELOG_DB_WRITE_ENABLED", "true").lower() == "true": stmt = ( insert(GtfsDatasetChangelog) .values( @@ -271,8 +294,8 @@ def _save_changelog_record( db_session.commit() self.logger.info( "Saved changelog record for %s -> %s", - self.previous_dataset_stable_id, - self.current_dataset_stable_id, + self.base_dataset_stable_id, + self.new_dataset_stable_id, ) else: self.logger.info( diff --git a/functions-python/gtfs_change_tracker/tests/test_main.py b/functions-python/gtfs_change_tracker/tests/test_main.py index a7435f434..5afcb5050 100644 --- a/functions-python/gtfs_change_tracker/tests/test_main.py +++ b/functions-python/gtfs_change_tracker/tests/test_main.py @@ -37,36 +37,44 @@ def _make_dataset( class TestGtfsChangeTrackerHandler(unittest.TestCase): """Tests for the HTTP handler function.""" + app = flask.Flask(__name__) + def _request(self, payload): - with flask.Flask(__name__).test_request_context(json=payload): - return gtfs_change_tracker(flask.request) + with self.app.test_request_context(json=payload): + response = gtfs_change_tracker(flask.request) + with self.app.app_context(): + return response.status_code, response.get_json() def test_missing_all_params(self): - result = self._request({}) - self.assertEqual(result["status"], "error") - self.assertIn("required", result["error"]) + status, body = self._request({}) + self.assertEqual(status, 200) + self.assertEqual(body["status"], "error") + self.assertIn("required", body["error"]) def test_missing_one_param(self): - result = self._request( - {"feed_stable_id": "mdb-1", "previous_dataset_stable_id": "mdb-1-20240101"} + status, body = self._request( + {"feed_stable_id": "mdb-1", "base_dataset_stable_id": "mdb-1-20240101"} ) - self.assertEqual(result["status"], "error") + self.assertEqual(status, 200) + self.assertEqual(body["status"], "error") def test_missing_bucket_env(self): os.environ.pop("DATASETS_BUCKET_NAME", None) - result = self._request( + status, body = self._request( { "feed_stable_id": "mdb-1", - "previous_dataset_stable_id": "mdb-1-20240101", - "current_dataset_stable_id": "mdb-1-20240201", + "base_dataset_stable_id": "mdb-1-20240101", + "new_dataset_stable_id": "mdb-1-20240201", } ) - self.assertEqual(result["status"], "error") - self.assertIn("DATASETS_BUCKET_NAME", result["error"]) + self.assertEqual(status, 200) + self.assertEqual(body["status"], "error") + self.assertIn("DATASETS_BUCKET_NAME", body["error"]) @patch("main.GtfsChangeTracker") def test_success(self, mock_tracker_cls): os.environ["DATASETS_BUCKET_NAME"] = "test-bucket" + os.environ["DATASETS_BUCKET_MOUNT"] = "/mobilitydata-datasets" instance = MagicMock() instance.run.return_value = { "message": "Changelog generated successfully.", @@ -74,39 +82,42 @@ def test_success(self, mock_tracker_cls): } mock_tracker_cls.return_value = instance - result = self._request( + status, body = self._request( { "feed_stable_id": "mdb-1", - "previous_dataset_stable_id": "mdb-1-20240101", - "current_dataset_stable_id": "mdb-1-20240201", + "base_dataset_stable_id": "mdb-1-20240101", + "new_dataset_stable_id": "mdb-1-20240201", } ) - self.assertEqual(result["status"], "success") - self.assertIn("changelog_url", result) + self.assertEqual(status, 200) + self.assertEqual(body["status"], "success") + self.assertIn("changelog_url", body) mock_tracker_cls.assert_called_once_with( feed_stable_id="mdb-1", - previous_dataset_stable_id="mdb-1-20240101", - current_dataset_stable_id="mdb-1-20240201", + base_dataset_stable_id="mdb-1-20240101", + new_dataset_stable_id="mdb-1-20240201", bucket_name="test-bucket", bucket_mount="/mobilitydata-datasets", ) @patch("main.GtfsChangeTracker") - def test_exception_returns_error(self, mock_tracker_cls): + def test_exception_returns_200_with_error(self, mock_tracker_cls): + """All exceptions return HTTP 200 to suppress GCP retries.""" os.environ["DATASETS_BUCKET_NAME"] = "test-bucket" instance = MagicMock() instance.run.side_effect = Exception("something went wrong") mock_tracker_cls.return_value = instance - result = self._request( + status, body = self._request( { "feed_stable_id": "mdb-1", - "previous_dataset_stable_id": "mdb-1-20240101", - "current_dataset_stable_id": "mdb-1-20240201", + "base_dataset_stable_id": "mdb-1-20240101", + "new_dataset_stable_id": "mdb-1-20240201", } ) - self.assertEqual(result["status"], "error") - self.assertIn("something went wrong", result["error"]) + self.assertEqual(status, 200) + self.assertEqual(body["status"], "error") + self.assertIn("something went wrong", body["error"]) class TestGtfsChangeTrackerRun(unittest.TestCase): @@ -116,19 +127,23 @@ def setUp(self): os.environ["DATASETS_BUCKET_NAME"] = "test-bucket" self.tracker = GtfsChangeTracker( feed_stable_id="mdb-1", - previous_dataset_stable_id="mdb-1-20240101", - current_dataset_stable_id="mdb-1-20240201", + base_dataset_stable_id="mdb-1-20240101", + new_dataset_stable_id="mdb-1-20240201", bucket_name="test-bucket", bucket_mount="/mobilitydata-datasets", ) + @patch("main.storage.Client") @patch("main.GtfsChangeTracker._save_changelog_record") @patch("main.GtfsChangeTracker._upload_changelog") @patch("main.GtfsChangeTracker._extracted_dir") @patch("main.GtfsChangeTracker._resolve_datasets") def test_run_happy_path( - self, mock_resolve, mock_extracted_dir, mock_upload, mock_save + self, mock_resolve, mock_extracted_dir, mock_upload, mock_save, mock_storage ): + mock_storage.return_value.bucket.return_value.blob.return_value.exists.return_value = ( + False + ) mock_resolve.return_value = ("prev-uuid", "curr-uuid", "feed-uuid") mock_extracted_dir.side_effect = [ "/mobilitydata-datasets/mdb-1/mdb-1-20240101/extracted", @@ -173,14 +188,27 @@ def test_run_happy_path( ) self.assertEqual(result["changelog_url"], changelog_url) + @patch("main.storage.Client") @patch("main.GtfsChangeTracker._resolve_datasets") - def test_run_raises_when_resolve_fails(self, mock_resolve): + def test_run_raises_when_resolve_fails(self, mock_resolve, mock_storage): + mock_storage.return_value.bucket.return_value.blob.return_value.exists.return_value = ( + False + ) mock_resolve.side_effect = ValueError( "Previous dataset not found: mdb-1-20240101" ) with self.assertRaises(ValueError): self.tracker.run() + @patch("main.storage.Client") + def test_run_skips_when_changelog_exists(self, mock_storage): + mock_storage.return_value.bucket.return_value.blob.return_value.exists.return_value = ( + True + ) + result = self.tracker.run() + self.assertIn("already exists", result["message"]) + self.assertIn("changelog_url", result) + class TestResolveDatasets(unittest.TestCase): """Tests for _resolve_datasets — verifies plain string UUIDs are returned.""" @@ -188,8 +216,8 @@ class TestResolveDatasets(unittest.TestCase): def setUp(self): self.tracker = GtfsChangeTracker( feed_stable_id="mdb-1", - previous_dataset_stable_id="mdb-1-20240101", - current_dataset_stable_id="mdb-1-20240201", + base_dataset_stable_id="mdb-1-20240101", + new_dataset_stable_id="mdb-1-20240201", bucket_name="my-bucket", bucket_mount="/mobilitydata-datasets", ) @@ -248,8 +276,8 @@ class TestExtractedDir(unittest.TestCase): def setUp(self): self.tracker = GtfsChangeTracker( feed_stable_id="mdb-1", - previous_dataset_stable_id="mdb-1-20240101", - current_dataset_stable_id="mdb-1-20240201", + base_dataset_stable_id="mdb-1-20240101", + new_dataset_stable_id="mdb-1-20240201", bucket_name="my-bucket", bucket_mount="/mobilitydata-datasets", ) @@ -271,8 +299,8 @@ class TestUploadChangelog(unittest.TestCase): def setUp(self): self.tracker = GtfsChangeTracker( feed_stable_id="mdb-1", - previous_dataset_stable_id="mdb-1-20240101", - current_dataset_stable_id="mdb-1-20240201", + base_dataset_stable_id="mdb-1-20240101", + new_dataset_stable_id="mdb-1-20240201", bucket_name="my-bucket", bucket_mount="/mobilitydata-datasets", ) diff --git a/infra/functions-python/main.tf b/infra/functions-python/main.tf index f7859f7df..473310f6b 100644 --- a/infra/functions-python/main.tf +++ b/infra/functions-python/main.tf @@ -1282,6 +1282,7 @@ resource "google_cloudfunctions2_function" "gtfs_change_tracker" { GCP_REGION = var.gcp_region DATASETS_BUCKET_NAME = "${var.datasets_bucket_name}-${var.environment}" DATASETS_BUCKET_MOUNT = "/mobilitydata-datasets" + GTFS_DIFF_DUCKDB_TMPDIR = "/tmp/in-memory" } available_memory = local.function_gtfs_change_tracker_config.memory timeout_seconds = local.function_gtfs_change_tracker_config.timeout @@ -1307,8 +1308,8 @@ resource "google_cloudfunctions2_function" "gtfs_change_tracker" { } # google_cloudfunctions2_function does not expose volume mounts in its schema. -# This terraform_data resource mounts the datasets GCS bucket on the underlying Cloud Run service -# after the function is deployed, using `gcloud run services update`. +# This terraform_data resource mounts both the datasets GCS bucket and an in-memory tmpfs +# on the underlying Cloud Run service after the function is deployed. resource "terraform_data" "gtfs_change_tracker_gcs_mount" { triggers_replace = { function_name = google_cloudfunctions2_function.gtfs_change_tracker.name @@ -1323,14 +1324,27 @@ resource "terraform_data" "gtfs_change_tracker_gcs_mount" { --project ${var.project_id} \ --region ${var.gcp_region} \ --format='value(spec.template.spec.volumes[].name)' 2>/dev/null) + + ARGS="" if echo "$MOUNTS" | grep -q "datasets-bucket"; then echo "GCS volume already mounted, skipping." else + ARGS="$ARGS --add-volume name=datasets-bucket,type=cloud-storage,bucket=${var.datasets_bucket_name}-${var.environment},readonly=true" + ARGS="$ARGS --add-volume-mount volume=datasets-bucket,mount-path=/mobilitydata-datasets" + fi + + if echo "$MOUNTS" | grep -q "in-memory"; then + echo "In-memory volume already mounted, skipping." + else + ARGS="$ARGS --add-volume name=in-memory,type=in-memory,size-limit=2Gi" + ARGS="$ARGS --add-volume-mount volume=in-memory,mount-path=/tmp/in-memory" + fi + + if [ -n "$ARGS" ]; then gcloud run services update ${google_cloudfunctions2_function.gtfs_change_tracker.name} \ --project ${var.project_id} \ --region ${var.gcp_region} \ - --add-volume name=datasets-bucket,type=cloud-storage,bucket=${var.datasets_bucket_name}-${var.environment},readonly=true \ - --add-volume-mount volume=datasets-bucket,mount-path=/mobilitydata-datasets \ + $ARGS \ --quiet fi EOT From ffe92d200897dc840fc12b895adba3b046060277 Mon Sep 17 00:00:00 2001 From: jcpitre Date: Wed, 10 Jun 2026 22:13:59 -0400 Subject: [PATCH 10/21] Added dry_run and allow_overwrite payload parameters. --- .../gtfs_change_tracker/src/main.py | 26 ++++- .../gtfs_change_tracker/tests/test_main.py | 100 ++++++++++++++++++ 2 files changed, 122 insertions(+), 4 deletions(-) diff --git a/functions-python/gtfs_change_tracker/src/main.py b/functions-python/gtfs_change_tracker/src/main.py index 1d9780385..b9a53c83b 100644 --- a/functions-python/gtfs_change_tracker/src/main.py +++ b/functions-python/gtfs_change_tracker/src/main.py @@ -47,8 +47,10 @@ def gtfs_change_tracker(request: flask.Request) -> dict: Expects a JSON body with: feed_stable_id – stable_id of the GTFS feed - base_dataset_stable_id – stable_id of the previous Gtfsdataset - new_dataset_stable_id – stable_id of the current Gtfsdataset + base_dataset_stable_id – stable_id of the base (previous) Gtfsdataset + new_dataset_stable_id – stable_id of the new (current) Gtfsdataset + allow_overwrite – (optional, default false) overwrite existing changelog + dry_run – (optional, default false) compute diff but skip GCS upload and DB write Always returns HTTP 200 — errors are reported in the response body. This prevents GCP from retrying failures: we cannot distinguish transient from @@ -59,6 +61,8 @@ def gtfs_change_tracker(request: flask.Request) -> dict: feed_stable_id = payload.get("feed_stable_id") base_dataset_stable_id = payload.get("base_dataset_stable_id") new_dataset_stable_id = payload.get("new_dataset_stable_id") + allow_overwrite = bool(payload.get("allow_overwrite", False)) + dry_run = bool(payload.get("dry_run", False)) if not (feed_stable_id and base_dataset_stable_id and new_dataset_stable_id): return flask.make_response( @@ -87,6 +91,8 @@ def gtfs_change_tracker(request: flask.Request) -> dict: new_dataset_stable_id=new_dataset_stable_id, bucket_name=bucket_name, bucket_mount=bucket_mount, + allow_overwrite=allow_overwrite, + dry_run=dry_run, ) result = tracker.run() return flask.make_response({"status": "success", **result}, 200) @@ -124,24 +130,29 @@ def __init__( new_dataset_stable_id: str, bucket_name: str, bucket_mount: str, + allow_overwrite: bool = False, + dry_run: bool = False, ): self.feed_stable_id = feed_stable_id self.base_dataset_stable_id = base_dataset_stable_id self.new_dataset_stable_id = new_dataset_stable_id self.bucket_name = bucket_name self.bucket_mount = bucket_mount + self.allow_overwrite = allow_overwrite + self.dry_run = dry_run self.logger = get_logger(GtfsChangeTracker.__name__, new_dataset_stable_id) @track_metrics(metrics=("time", "memory", "cpu")) def run(self) -> dict: """Execute the full change-tracking pipeline.""" - # Idempotency: if the changelog already exists in GCS, skip recomputing. changelog_blob_path = ( f"{self.feed_stable_id}/{self.new_dataset_stable_id}/" f"{self.new_dataset_stable_id}_{self.base_dataset_stable_id}_changelog.json" ) blob = storage.Client().bucket(self.bucket_name).blob(changelog_blob_path) - if blob.exists(): + + # Idempotency: skip if changelog already exists, unless allow_overwrite is set. + if not self.allow_overwrite and blob.exists(): changelog_url = f"https://storage.googleapis.com/{self.bucket_name}/{changelog_blob_path}" self.logger.info("Changelog already exists, skipping: %s", changelog_url) return { @@ -163,6 +174,13 @@ def run(self) -> dict: diff_result = diff_feeds(prev_dir, curr_dir) + if self.dry_run: + self.logger.info("Dry run — skipping GCS upload and DB write.") + return { + "message": "Dry run completed. Diff computed but not persisted.", + "summary": diff_result.summary.model_dump(), + } + changelog_json = diff_result.model_dump_json(indent=2).encode("utf-8") changelog_url = self._upload_changelog( changelog_json, diff --git a/functions-python/gtfs_change_tracker/tests/test_main.py b/functions-python/gtfs_change_tracker/tests/test_main.py index 5afcb5050..ccd32c1b8 100644 --- a/functions-python/gtfs_change_tracker/tests/test_main.py +++ b/functions-python/gtfs_change_tracker/tests/test_main.py @@ -98,6 +98,35 @@ def test_success(self, mock_tracker_cls): new_dataset_stable_id="mdb-1-20240201", bucket_name="test-bucket", bucket_mount="/mobilitydata-datasets", + allow_overwrite=False, + dry_run=False, + ) + + @patch("main.GtfsChangeTracker") + def test_allow_overwrite_and_dry_run_passed(self, mock_tracker_cls): + os.environ["DATASETS_BUCKET_NAME"] = "test-bucket" + os.environ["DATASETS_BUCKET_MOUNT"] = "/mobilitydata-datasets" + instance = MagicMock() + instance.run.return_value = {"message": "Dry run completed.", "summary": {}} + mock_tracker_cls.return_value = instance + + self._request( + { + "feed_stable_id": "mdb-1", + "base_dataset_stable_id": "mdb-1-20240101", + "new_dataset_stable_id": "mdb-1-20240201", + "allow_overwrite": True, + "dry_run": True, + } + ) + mock_tracker_cls.assert_called_once_with( + feed_stable_id="mdb-1", + base_dataset_stable_id="mdb-1-20240101", + new_dataset_stable_id="mdb-1-20240201", + bucket_name="test-bucket", + bucket_mount="/mobilitydata-datasets", + allow_overwrite=True, + dry_run=True, ) @patch("main.GtfsChangeTracker") @@ -209,6 +238,77 @@ def test_run_skips_when_changelog_exists(self, mock_storage): self.assertIn("already exists", result["message"]) self.assertIn("changelog_url", result) + @patch("main.storage.Client") + @patch("main.GtfsChangeTracker._save_changelog_record") + @patch("main.GtfsChangeTracker._upload_changelog") + @patch("main.GtfsChangeTracker._extracted_dir") + @patch("main.GtfsChangeTracker._resolve_datasets") + def test_allow_overwrite_skips_existence_check( + self, mock_resolve, mock_extracted_dir, mock_upload, mock_save, mock_storage + ): + """allow_overwrite=True should proceed even when the blob exists.""" + mock_storage.return_value.bucket.return_value.blob.return_value.exists.return_value = ( + True + ) + mock_resolve.return_value = ("prev-uuid", "curr-uuid", "feed-uuid") + mock_extracted_dir.side_effect = [ + "/mobilitydata-datasets/mdb-1/mdb-1-20240101/extracted", + "/mobilitydata-datasets/mdb-1/mdb-1-20240201/extracted", + ] + fake_diff = MagicMock() + fake_diff.summary.model_dump.return_value = {} + fake_diff.model_dump_json.return_value = "{}" + mock_upload.return_value = ( + "https://storage.googleapis.com/test-bucket/changelog.json" + ) + + tracker = GtfsChangeTracker( + feed_stable_id="mdb-1", + base_dataset_stable_id="mdb-1-20240101", + new_dataset_stable_id="mdb-1-20240201", + bucket_name="test-bucket", + bucket_mount="/mobilitydata-datasets", + allow_overwrite=True, + ) + with patch("main.diff_feeds", return_value=fake_diff, create=True): + result = tracker.run() + self.assertIn("generated successfully", result["message"]) + mock_upload.assert_called_once() + + @patch("main.storage.Client") + @patch("main.GtfsChangeTracker._upload_changelog") + @patch("main.GtfsChangeTracker._extracted_dir") + @patch("main.GtfsChangeTracker._resolve_datasets") + def test_dry_run_skips_upload_and_db( + self, mock_resolve, mock_extracted_dir, mock_upload, mock_storage + ): + """dry_run=True should compute the diff but not upload or write to DB.""" + mock_storage.return_value.bucket.return_value.blob.return_value.exists.return_value = ( + False + ) + mock_resolve.return_value = ("prev-uuid", "curr-uuid", "feed-uuid") + mock_extracted_dir.side_effect = [ + "/mobilitydata-datasets/mdb-1/mdb-1-20240101/extracted", + "/mobilitydata-datasets/mdb-1/mdb-1-20240201/extracted", + ] + fake_diff = MagicMock() + fake_diff.summary.model_dump.return_value = {"total_changes": 5} + + tracker = GtfsChangeTracker( + feed_stable_id="mdb-1", + base_dataset_stable_id="mdb-1-20240101", + new_dataset_stable_id="mdb-1-20240201", + bucket_name="test-bucket", + bucket_mount="/mobilitydata-datasets", + dry_run=True, + ) + with patch("main.diff_feeds", return_value=fake_diff, create=True): + result = tracker.run() + + self.assertIn("Dry run", result["message"]) + self.assertIn("summary", result) + mock_upload.assert_not_called() + class TestResolveDatasets(unittest.TestCase): """Tests for _resolve_datasets — verifies plain string UUIDs are returned.""" From 8e35bd67db74016f653ddd3c03a32d8a4df8766d Mon Sep 17 00:00:00 2001 From: jcpitre Date: Wed, 10 Jun 2026 22:19:20 -0400 Subject: [PATCH 11/21] Flipped allow_overwrite to disallow_overwrite --- .../gtfs_change_tracker/src/main.py | 14 +++--- .../gtfs_change_tracker/tests/test_main.py | 50 ++++++++++++------- 2 files changed, 40 insertions(+), 24 deletions(-) diff --git a/functions-python/gtfs_change_tracker/src/main.py b/functions-python/gtfs_change_tracker/src/main.py index b9a53c83b..33b86e836 100644 --- a/functions-python/gtfs_change_tracker/src/main.py +++ b/functions-python/gtfs_change_tracker/src/main.py @@ -49,7 +49,7 @@ def gtfs_change_tracker(request: flask.Request) -> dict: feed_stable_id – stable_id of the GTFS feed base_dataset_stable_id – stable_id of the base (previous) Gtfsdataset new_dataset_stable_id – stable_id of the new (current) Gtfsdataset - allow_overwrite – (optional, default false) overwrite existing changelog + disallow_overwrite – (optional, default false) skip if changelog already exists dry_run – (optional, default false) compute diff but skip GCS upload and DB write Always returns HTTP 200 — errors are reported in the response body. @@ -61,7 +61,7 @@ def gtfs_change_tracker(request: flask.Request) -> dict: feed_stable_id = payload.get("feed_stable_id") base_dataset_stable_id = payload.get("base_dataset_stable_id") new_dataset_stable_id = payload.get("new_dataset_stable_id") - allow_overwrite = bool(payload.get("allow_overwrite", False)) + disallow_overwrite = bool(payload.get("disallow_overwrite", False)) dry_run = bool(payload.get("dry_run", False)) if not (feed_stable_id and base_dataset_stable_id and new_dataset_stable_id): @@ -91,7 +91,7 @@ def gtfs_change_tracker(request: flask.Request) -> dict: new_dataset_stable_id=new_dataset_stable_id, bucket_name=bucket_name, bucket_mount=bucket_mount, - allow_overwrite=allow_overwrite, + disallow_overwrite=disallow_overwrite, dry_run=dry_run, ) result = tracker.run() @@ -130,7 +130,7 @@ def __init__( new_dataset_stable_id: str, bucket_name: str, bucket_mount: str, - allow_overwrite: bool = False, + disallow_overwrite: bool = False, dry_run: bool = False, ): self.feed_stable_id = feed_stable_id @@ -138,7 +138,7 @@ def __init__( self.new_dataset_stable_id = new_dataset_stable_id self.bucket_name = bucket_name self.bucket_mount = bucket_mount - self.allow_overwrite = allow_overwrite + self.disallow_overwrite = disallow_overwrite self.dry_run = dry_run self.logger = get_logger(GtfsChangeTracker.__name__, new_dataset_stable_id) @@ -151,8 +151,8 @@ def run(self) -> dict: ) blob = storage.Client().bucket(self.bucket_name).blob(changelog_blob_path) - # Idempotency: skip if changelog already exists, unless allow_overwrite is set. - if not self.allow_overwrite and blob.exists(): + # Idempotency: skip if changelog already exists and disallow_overwrite is set. + if self.disallow_overwrite and blob.exists(): changelog_url = f"https://storage.googleapis.com/{self.bucket_name}/{changelog_blob_path}" self.logger.info("Changelog already exists, skipping: %s", changelog_url) return { diff --git a/functions-python/gtfs_change_tracker/tests/test_main.py b/functions-python/gtfs_change_tracker/tests/test_main.py index ccd32c1b8..e68b82550 100644 --- a/functions-python/gtfs_change_tracker/tests/test_main.py +++ b/functions-python/gtfs_change_tracker/tests/test_main.py @@ -98,12 +98,12 @@ def test_success(self, mock_tracker_cls): new_dataset_stable_id="mdb-1-20240201", bucket_name="test-bucket", bucket_mount="/mobilitydata-datasets", - allow_overwrite=False, + disallow_overwrite=False, dry_run=False, ) @patch("main.GtfsChangeTracker") - def test_allow_overwrite_and_dry_run_passed(self, mock_tracker_cls): + def test_disallow_overwrite_and_dry_run_passed(self, mock_tracker_cls): os.environ["DATASETS_BUCKET_NAME"] = "test-bucket" os.environ["DATASETS_BUCKET_MOUNT"] = "/mobilitydata-datasets" instance = MagicMock() @@ -115,7 +115,7 @@ def test_allow_overwrite_and_dry_run_passed(self, mock_tracker_cls): "feed_stable_id": "mdb-1", "base_dataset_stable_id": "mdb-1-20240101", "new_dataset_stable_id": "mdb-1-20240201", - "allow_overwrite": True, + "disallow_overwrite": True, "dry_run": True, } ) @@ -125,7 +125,7 @@ def test_allow_overwrite_and_dry_run_passed(self, mock_tracker_cls): new_dataset_stable_id="mdb-1-20240201", bucket_name="test-bucket", bucket_mount="/mobilitydata-datasets", - allow_overwrite=True, + disallow_overwrite=True, dry_run=True, ) @@ -234,19 +234,44 @@ def test_run_skips_when_changelog_exists(self, mock_storage): mock_storage.return_value.bucket.return_value.blob.return_value.exists.return_value = ( True ) - result = self.tracker.run() + tracker = GtfsChangeTracker( + feed_stable_id="mdb-1", + base_dataset_stable_id="mdb-1-20240101", + new_dataset_stable_id="mdb-1-20240201", + bucket_name="test-bucket", + bucket_mount="/mobilitydata-datasets", + disallow_overwrite=True, + ) + result = tracker.run() self.assertIn("already exists", result["message"]) self.assertIn("changelog_url", result) + @patch("main.storage.Client") + def test_disallow_overwrite_skips_when_exists(self, mock_storage): + """disallow_overwrite=True should skip when the blob already exists.""" + mock_storage.return_value.bucket.return_value.blob.return_value.exists.return_value = ( + True + ) + tracker = GtfsChangeTracker( + feed_stable_id="mdb-1", + base_dataset_stable_id="mdb-1-20240101", + new_dataset_stable_id="mdb-1-20240201", + bucket_name="test-bucket", + bucket_mount="/mobilitydata-datasets", + disallow_overwrite=True, + ) + result = tracker.run() + self.assertIn("already exists", result["message"]) + @patch("main.storage.Client") @patch("main.GtfsChangeTracker._save_changelog_record") @patch("main.GtfsChangeTracker._upload_changelog") @patch("main.GtfsChangeTracker._extracted_dir") @patch("main.GtfsChangeTracker._resolve_datasets") - def test_allow_overwrite_skips_existence_check( + def test_overwrite_proceeds_when_exists_by_default( self, mock_resolve, mock_extracted_dir, mock_upload, mock_save, mock_storage ): - """allow_overwrite=True should proceed even when the blob exists.""" + """By default (disallow_overwrite=False) the changelog is overwritten even if it exists.""" mock_storage.return_value.bucket.return_value.blob.return_value.exists.return_value = ( True ) @@ -261,17 +286,8 @@ def test_allow_overwrite_skips_existence_check( mock_upload.return_value = ( "https://storage.googleapis.com/test-bucket/changelog.json" ) - - tracker = GtfsChangeTracker( - feed_stable_id="mdb-1", - base_dataset_stable_id="mdb-1-20240101", - new_dataset_stable_id="mdb-1-20240201", - bucket_name="test-bucket", - bucket_mount="/mobilitydata-datasets", - allow_overwrite=True, - ) with patch("main.diff_feeds", return_value=fake_diff, create=True): - result = tracker.run() + result = self.tracker.run() self.assertIn("generated successfully", result["message"]) mock_upload.assert_called_once() From 8d8f164a8a7ad761736448696f39849681544bba Mon Sep 17 00:00:00 2001 From: jcpitre Date: Thu, 11 Jun 2026 09:15:46 -0400 Subject: [PATCH 12/21] Added README.md --- .../gtfs_change_tracker/README.md | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 functions-python/gtfs_change_tracker/README.md diff --git a/functions-python/gtfs_change_tracker/README.md b/functions-python/gtfs_change_tracker/README.md new file mode 100644 index 000000000..3f8ea3e81 --- /dev/null +++ b/functions-python/gtfs_change_tracker/README.md @@ -0,0 +1,82 @@ +# GTFS Change Tracker + +This function computes a structured diff between two consecutive GTFS datasets and stores the resulting changelog in GCS and the database. + +The function reads pre-extracted GTFS files from a GCS-mounted bucket (uploaded by `batch_process_dataset`), runs the diff engine, uploads the changelog JSON to GCS, and upserts a row in `gtfs_dataset_changelog`. + +## Usage + +The function receives the following request: +``` +{ + "feed_stable_id": str, – stable_id of the GTFS feed + "base_dataset_stable_id": str, – stable_id of the base (older) dataset + "new_dataset_stable_id": str, – stable_id of the new (recent) dataset + "disallow_overwrite": bool (optional), – skip if changelog already exists (default: false) + "dry_run": bool (optional) – compute diff but skip GCS upload and DB write (default: false) +} +``` + +Example: +```json +{ + "feed_stable_id": "mdb-2142", + "base_dataset_stable_id": "mdb-2142-202502251658", + "new_dataset_stable_id": "mdb-2142-202507081652" +} +``` + +Example curl call: +```bash +curl -X POST https:// \ + -H "Authorization: Bearer $(gcloud auth print-identity-token)" \ + -H "Content-Type: application/json" \ + -d '{ + "feed_stable_id": "mdb-2142", + "base_dataset_stable_id": "mdb-2142-202502251658", + "new_dataset_stable_id": "mdb-2142-202507081652" + }' +``` + +### `disallow_overwrite` +By default the function will overwrite an existing changelog for the same dataset pair. Set `disallow_overwrite: true` to skip execution if a changelog already exists in GCS. + +### `dry_run` +When `dry_run: true`, the diff is computed and a summary is returned in the response, but nothing is written to GCS or the database. Useful for validating that the extracted files are present and the diff engine runs correctly. + +## Response + +Success: +```json +{ + "status": "success", + "message": "Changelog generated successfully.", + "changelog_url": "https://storage.googleapis.com////..." +} +``` + +Dry run: +```json +{ + "status": "success", + "message": "Dry run completed. Diff computed but not persisted.", + "summary": { + "total_changes": 42, + "files_added_count": 0, + "files_deleted_count": 0, + "files_modified_count": 3, + ... + } +} +``` + +The function always returns HTTP 200, including on errors. Errors are reported in the response body under `"status": "error"`. This prevents GCP from retrying failures where re-running with the same parameters would produce the same result. + +## GCP environment variables + +- `DATASETS_BUCKET_NAME`: The GCS bucket where datasets are stored (required). Must include the environment suffix, e.g. `mobilitydata-datasets-dev`. +- `DATASETS_BUCKET_MOUNT`: Mount path for the GCS bucket (default: `/mobilitydata-datasets`). +- `GTFS_DIFF_DUCKDB_TMPDIR`: Mount path for the in-memory tmpfs used by the diff engine (default: `/tmp/in-memory`). Used by `limit_gcp_memory` to compute the available process memory and set `RLIMIT_AS`, preventing silent OOM kills. +- `MEMORY_MARGIN_MB`: Safety margin in MiB subtracted from the memory limit before setting `RLIMIT_AS` (default: `200`). +- `CHANGELOG_DB_WRITE_ENABLED`: Set to `true` to enable writing changelog records to the database (default: `false`). +- `LOGGING_LEVEL`: Log level (default: `INFO`). From c0320e6d5f007df571f5e9139637dc3d5260734b Mon Sep 17 00:00:00 2001 From: jcpitre Date: Thu, 11 Jun 2026 09:49:04 -0400 Subject: [PATCH 13/21] Corrected some tests --- .../batch_process_dataset/src/pipeline_tasks.py | 6 +++--- .../tests/test_batch_process_dataset_main.py | 3 ++- functions-python/helpers/utils.py | 12 ++++++------ 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/functions-python/batch_process_dataset/src/pipeline_tasks.py b/functions-python/batch_process_dataset/src/pipeline_tasks.py index b86eba5b7..2783bcef5 100644 --- a/functions-python/batch_process_dataset/src/pipeline_tasks.py +++ b/functions-python/batch_process_dataset/src/pipeline_tasks.py @@ -153,9 +153,9 @@ def create_pipeline_tasks(dataset: Gtfsdataset, db_session: Session) -> None: ) if previous_dataset: create_http_gtfs_change_tracker_task( - feed_id=dataset.feed_id, - previous_dataset_id=previous_dataset.id, - current_dataset_id=dataset.id, + feed_stable_id=stable_id, + base_dataset_stable_id=previous_dataset.stable_id, + new_dataset_stable_id=dataset_stable_id, ) else: logging.info( diff --git a/functions-python/batch_process_dataset/tests/test_batch_process_dataset_main.py b/functions-python/batch_process_dataset/tests/test_batch_process_dataset_main.py index 784c40612..327d85e7a 100644 --- a/functions-python/batch_process_dataset/tests/test_batch_process_dataset_main.py +++ b/functions-python/batch_process_dataset/tests/test_batch_process_dataset_main.py @@ -449,8 +449,9 @@ def mock_remove_side_effect(path): @patch.dict( os.environ, {"FEEDS_CREDENTIALS": '{"test_stable_id": "test_credentials"}'} ) + @patch("pipeline_tasks.create_http_gtfs_change_tracker_task") @with_db_session(db_url=default_db_url) - def test_process(self, db_session): + def test_process(self, mock_gtfs_change_tracker_task, db_session): feeds = db_session.query(Gtfsfeed).all() feed_id = feeds[0].id diff --git a/functions-python/helpers/utils.py b/functions-python/helpers/utils.py index 04badcacf..5d9448f2f 100644 --- a/functions-python/helpers/utils.py +++ b/functions-python/helpers/utils.py @@ -536,9 +536,9 @@ def create_http_pmtiles_builder_task( def create_http_gtfs_change_tracker_task( - feed_id: str, - previous_dataset_id: str, - current_dataset_id: str, + feed_stable_id: str, + base_dataset_stable_id: str, + new_dataset_stable_id: str, ) -> None: """ Create a Cloud Task to run the gtfs-change-tracker function for a pair of datasets. @@ -549,9 +549,9 @@ def create_http_gtfs_change_tracker_task( client = tasks_v2.CloudTasksClient() body = json.dumps( { - "feed_id": feed_id, - "previous_dataset_id": previous_dataset_id, - "current_dataset_id": current_dataset_id, + "feed_stable_id": feed_stable_id, + "base_dataset_stable_id": base_dataset_stable_id, + "new_dataset_stable_id": new_dataset_stable_id, } ).encode() queue_name = os.getenv("GTFS_CHANGE_TRACKER_QUEUE") From c9ccd42b98c2328c94db8cacf19078854a8710e2 Mon Sep 17 00:00:00 2001 From: jcpitre Date: Thu, 11 Jun 2026 10:33:08 -0400 Subject: [PATCH 14/21] Gated the call to the change tracker based on the presence in the gtfs_dateset_changelog. --- .../src/pipeline_tasks.py | 30 +++++++++++++++---- functions-python/helpers/utils.py | 1 + 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/functions-python/batch_process_dataset/src/pipeline_tasks.py b/functions-python/batch_process_dataset/src/pipeline_tasks.py index 2783bcef5..3ac4f22e8 100644 --- a/functions-python/batch_process_dataset/src/pipeline_tasks.py +++ b/functions-python/batch_process_dataset/src/pipeline_tasks.py @@ -7,7 +7,7 @@ from sqlalchemy.orm import Session from shared.database.database import with_db_session -from shared.database_gen.sqlacodegen_models import Gtfsdataset +from shared.database_gen.sqlacodegen_models import Gtfsdataset, GtfsDatasetChangelog from shared.helpers.utils import ( create_http_task, create_http_pmtiles_builder_task, @@ -152,11 +152,31 @@ def create_pipeline_tasks(dataset: Gtfsdataset, db_session: Session) -> None: .first() ) if previous_dataset: - create_http_gtfs_change_tracker_task( - feed_stable_id=stable_id, - base_dataset_stable_id=previous_dataset.stable_id, - new_dataset_stable_id=dataset_stable_id, + # Check the DB for an existing changelog record rather than the GCS blob presence. + # The unique constraint on (previous_dataset_id, current_dataset_id) makes this the + # authoritative idempotency check. GCS blob presence could be used instead, but that + # would require an extra API call and could miss cases where the blob exists but the + # DB record does not (or vice versa). + changelog_exists = ( + db_session.query(GtfsDatasetChangelog) + .filter_by( + previous_dataset_id=previous_dataset.id, + current_dataset_id=dataset.id, + ) + .first() + is not None ) + if changelog_exists: + logging.info( + "Skipping change tracker task for dataset %s: changelog already exists.", + dataset_stable_id, + ) + else: + create_http_gtfs_change_tracker_task( + feed_stable_id=stable_id, + base_dataset_stable_id=previous_dataset.stable_id, + new_dataset_stable_id=dataset_stable_id, + ) else: logging.info( "Skipping change tracker task for dataset %s: no previous dataset found.", diff --git a/functions-python/helpers/utils.py b/functions-python/helpers/utils.py index 5d9448f2f..133f55357 100644 --- a/functions-python/helpers/utils.py +++ b/functions-python/helpers/utils.py @@ -552,6 +552,7 @@ def create_http_gtfs_change_tracker_task( "feed_stable_id": feed_stable_id, "base_dataset_stable_id": base_dataset_stable_id, "new_dataset_stable_id": new_dataset_stable_id, + "disallow_overwrite": True, } ).encode() queue_name = os.getenv("GTFS_CHANGE_TRACKER_QUEUE") From 9f2b47ea8dffa083f595fa4289efd5450eeb5de9 Mon Sep 17 00:00:00 2001 From: jcpitre Date: Thu, 11 Jun 2026 10:51:35 -0400 Subject: [PATCH 15/21] Adjusted the memory allocation --- functions-python/gtfs_change_tracker/function_config.json | 2 +- infra/functions-python/main.tf | 2 +- infra/functions-python/vars.tf | 7 ++++++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/functions-python/gtfs_change_tracker/function_config.json b/functions-python/gtfs_change_tracker/function_config.json index 3d04400b7..5e41e4bb8 100644 --- a/functions-python/gtfs_change_tracker/function_config.json +++ b/functions-python/gtfs_change_tracker/function_config.json @@ -3,7 +3,7 @@ "description": "Tracks changes between two GTFS datasets and stores a structured changelog in GCS and the database", "entry_point": "gtfs_change_tracker", "timeout": 540, - "memory": "4Gi", + "memory": "8Gi", "trigger_http": true, "include_folders": ["helpers"], "include_api_folders": ["database_gen", "database", "common"], diff --git a/infra/functions-python/main.tf b/infra/functions-python/main.tf index 473310f6b..a6b3d6858 100644 --- a/infra/functions-python/main.tf +++ b/infra/functions-python/main.tf @@ -1336,7 +1336,7 @@ resource "terraform_data" "gtfs_change_tracker_gcs_mount" { if echo "$MOUNTS" | grep -q "in-memory"; then echo "In-memory volume already mounted, skipping." else - ARGS="$ARGS --add-volume name=in-memory,type=in-memory,size-limit=2Gi" + ARGS="$ARGS --add-volume name=in-memory,type=in-memory,size-limit=${var.gtfs_change_tracker_in_memory_size}" ARGS="$ARGS --add-volume-mount volume=in-memory,mount-path=/tmp/in-memory" fi diff --git a/infra/functions-python/vars.tf b/infra/functions-python/vars.tf index f70df30c7..4cab92d8c 100644 --- a/infra/functions-python/vars.tf +++ b/infra/functions-python/vars.tf @@ -134,4 +134,9 @@ variable "brevo_api_announcements_list_id" { type = string description = "Brevo list ID for API announcements" default = "" -} \ No newline at end of file +} +variable "gtfs_change_tracker_in_memory_size" { + type = string + description = "Size limit for the gtfs_change_tracker in-memory tmpfs volume" + default = "3Gi" +} From 6ecedd0b524908d09131787b806be3ed2125419f Mon Sep 17 00:00:00 2001 From: jcpitre Date: Thu, 11 Jun 2026 13:35:35 -0400 Subject: [PATCH 16/21] Made sure the base dataset is from the right feed. --- .../gtfs_change_tracker/src/main.py | 4 ++++ .../gtfs_change_tracker/tests/test_main.py | 17 +++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/functions-python/gtfs_change_tracker/src/main.py b/functions-python/gtfs_change_tracker/src/main.py index 33b86e836..960df1840 100644 --- a/functions-python/gtfs_change_tracker/src/main.py +++ b/functions-python/gtfs_change_tracker/src/main.py @@ -219,6 +219,10 @@ def _resolve_datasets(self, db_session: Session = None) -> tuple: raise ValueError( f"Previous dataset not found: {self.base_dataset_stable_id}" ) + if prev_dataset.feed.stable_id != self.feed_stable_id: + raise ValueError( + f"Dataset {self.base_dataset_stable_id} does not belong to feed {self.feed_stable_id}." + ) curr_dataset = ( db_session.query(Gtfsdataset) diff --git a/functions-python/gtfs_change_tracker/tests/test_main.py b/functions-python/gtfs_change_tracker/tests/test_main.py index e68b82550..e92f43113 100644 --- a/functions-python/gtfs_change_tracker/tests/test_main.py +++ b/functions-python/gtfs_change_tracker/tests/test_main.py @@ -342,6 +342,8 @@ def _mock_session_with_datasets(self): prev_ds = MagicMock() prev_ds.id = "prev-uuid" prev_ds.stable_id = "mdb-1-20240101" + prev_ds.feed.id = "feed-uuid" + prev_ds.feed.stable_id = "mdb-1" curr_ds = MagicMock() curr_ds.id = "curr-uuid" @@ -371,6 +373,21 @@ def test_returns_plain_string_uuids(self): self.assertEqual(curr_uuid, "curr-uuid") self.assertEqual(feed_uuid, "feed-uuid") + @patch("main.with_db_session", lambda f: f) + def test_raises_when_prev_feed_mismatch(self): + prev_ds = MagicMock() + prev_ds.id = "prev-uuid" + prev_ds.feed.stable_id = "mdb-999" # wrong feed + + mock_session = MagicMock() + mock_session.query.return_value.filter.return_value.one_or_none.side_effect = [ + prev_ds, + ] + with self.assertRaises( + ValueError, msg="should reject prev dataset from wrong feed" + ): + self.tracker._resolve_datasets(db_session=mock_session) + @patch("main.with_db_session", lambda f: f) def test_raises_when_feed_mismatch(self): prev_ds = MagicMock() From fcec71abb1175b6d8bd4154efd37c9f14adbd461 Mon Sep 17 00:00:00 2001 From: jcpitre Date: Thu, 11 Jun 2026 14:16:10 -0400 Subject: [PATCH 17/21] Remove CHANGELOG_DB_WRITE_ENABLED flag (covered by dry_run) --- .../gtfs_change_tracker/README.md | 1 - .../gtfs_change_tracker/src/main.py | 57 ++++++++----------- infra/functions-python/main.tf | 3 +- 3 files changed, 25 insertions(+), 36 deletions(-) diff --git a/functions-python/gtfs_change_tracker/README.md b/functions-python/gtfs_change_tracker/README.md index 3f8ea3e81..cb7f46293 100644 --- a/functions-python/gtfs_change_tracker/README.md +++ b/functions-python/gtfs_change_tracker/README.md @@ -78,5 +78,4 @@ The function always returns HTTP 200, including on errors. Errors are reported i - `DATASETS_BUCKET_MOUNT`: Mount path for the GCS bucket (default: `/mobilitydata-datasets`). - `GTFS_DIFF_DUCKDB_TMPDIR`: Mount path for the in-memory tmpfs used by the diff engine (default: `/tmp/in-memory`). Used by `limit_gcp_memory` to compute the available process memory and set `RLIMIT_AS`, preventing silent OOM kills. - `MEMORY_MARGIN_MB`: Safety margin in MiB subtracted from the memory limit before setting `RLIMIT_AS` (default: `200`). -- `CHANGELOG_DB_WRITE_ENABLED`: Set to `true` to enable writing changelog records to the database (default: `false`). - `LOGGING_LEVEL`: Log level (default: `INFO`). diff --git a/functions-python/gtfs_change_tracker/src/main.py b/functions-python/gtfs_change_tracker/src/main.py index 960df1840..b976a2540 100644 --- a/functions-python/gtfs_change_tracker/src/main.py +++ b/functions-python/gtfs_change_tracker/src/main.py @@ -292,39 +292,28 @@ def _save_changelog_record( Upsert a row into gtfs_dataset_changelog. The UNIQUE constraint on (previous_dataset_id, current_dataset_id) ensures idempotency. """ - # TODO: remove this flag and always write to DB once testing is complete - if os.getenv("CHANGELOG_DB_WRITE_ENABLED", "true").lower() == "true": - stmt = ( - insert(GtfsDatasetChangelog) - .values( - feed_id=feed_uuid, - previous_dataset_id=prev_dataset_uuid, - current_dataset_id=curr_dataset_uuid, - changelog_url=changelog_url, - diff_summary=diff_summary, - ) - .on_conflict_do_update( - constraint="gtfs_dataset_changelog_previous_current_key", - set_={ - "changelog_url": changelog_url, - "diff_summary": diff_summary, - "generated_at": GtfsDatasetChangelog.generated_at.default, - }, - ) + stmt = ( + insert(GtfsDatasetChangelog) + .values( + feed_id=feed_uuid, + previous_dataset_id=prev_dataset_uuid, + current_dataset_id=curr_dataset_uuid, + changelog_url=changelog_url, + diff_summary=diff_summary, ) - db_session.execute(stmt) - db_session.commit() - self.logger.info( - "Saved changelog record for %s -> %s", - self.base_dataset_stable_id, - self.new_dataset_stable_id, - ) - else: - self.logger.info( - "[TEMP] Would upsert gtfs_dataset_changelog: feed_id=%s previous=%s current=%s url=%s summary=%s", - feed_uuid, - prev_dataset_uuid, - curr_dataset_uuid, - changelog_url, - diff_summary, + .on_conflict_do_update( + constraint="gtfs_dataset_changelog_previous_current_key", + set_={ + "changelog_url": changelog_url, + "diff_summary": diff_summary, + "generated_at": GtfsDatasetChangelog.generated_at.default, + }, ) + ) + db_session.execute(stmt) + db_session.commit() + self.logger.info( + "Saved changelog record for %s -> %s", + self.base_dataset_stable_id, + self.new_dataset_stable_id, + ) diff --git a/infra/functions-python/main.tf b/infra/functions-python/main.tf index a6b3d6858..d3993035e 100644 --- a/infra/functions-python/main.tf +++ b/infra/functions-python/main.tf @@ -1282,7 +1282,8 @@ resource "google_cloudfunctions2_function" "gtfs_change_tracker" { GCP_REGION = var.gcp_region DATASETS_BUCKET_NAME = "${var.datasets_bucket_name}-${var.environment}" DATASETS_BUCKET_MOUNT = "/mobilitydata-datasets" - GTFS_DIFF_DUCKDB_TMPDIR = "/tmp/in-memory" + # GTFS_DIFF_DUCKDB_TMPDIR: directs DuckDB spill files to the in-memory volume. + GTFS_DIFF_DUCKDB_TMPDIR = "/tmp/in-memory" } available_memory = local.function_gtfs_change_tracker_config.memory timeout_seconds = local.function_gtfs_change_tracker_config.timeout From a2254ec6bdbc0c90ca941a3cbfedef880caea9ed Mon Sep 17 00:00:00 2001 From: jcpitre Date: Thu, 11 Jun 2026 14:18:17 -0400 Subject: [PATCH 18/21] Made the uploaded report public --- functions-python/gtfs_change_tracker/src/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/functions-python/gtfs_change_tracker/src/main.py b/functions-python/gtfs_change_tracker/src/main.py index b976a2540..76c44d6eb 100644 --- a/functions-python/gtfs_change_tracker/src/main.py +++ b/functions-python/gtfs_change_tracker/src/main.py @@ -273,6 +273,7 @@ def _upload_changelog( bucket = storage.Client().bucket(self.bucket_name) blob = bucket.blob(blob_path) blob.upload_from_string(json_bytes, content_type="application/json") + blob.make_public() self.logger.info( "Uploaded changelog to gs://%s/%s", self.bucket_name, blob_path ) From 842094113710d1222a8bc7311501b894b455b834 Mon Sep 17 00:00:00 2001 From: jcpitre Date: Fri, 12 Jun 2026 12:14:52 -0400 Subject: [PATCH 19/21] improve error logging, add payload to error response, and fix upsert timestamp --- .../gtfs_change_tracker/requirements.txt | 2 +- functions-python/gtfs_change_tracker/src/main.py | 13 ++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/functions-python/gtfs_change_tracker/requirements.txt b/functions-python/gtfs_change_tracker/requirements.txt index 4a385b07f..9bfc2b25a 100644 --- a/functions-python/gtfs_change_tracker/requirements.txt +++ b/functions-python/gtfs_change_tracker/requirements.txt @@ -16,7 +16,7 @@ google-cloud-storage google-cloud-tasks # GTFS diff engine -gtfs-diff-engine==0.1.0 +gtfs-diff-engine # Configuration python-dotenv==1.2.2 diff --git a/functions-python/gtfs_change_tracker/src/main.py b/functions-python/gtfs_change_tracker/src/main.py index 76c44d6eb..533572941 100644 --- a/functions-python/gtfs_change_tracker/src/main.py +++ b/functions-python/gtfs_change_tracker/src/main.py @@ -26,6 +26,7 @@ from gtfs_diff.engine import diff_feeds from sqlalchemy.dialects.postgresql import insert from sqlalchemy.orm import Session +from sqlalchemy.sql import func from shared.common.gcp_memory_utils import limit_gcp_memory from shared.database.database import with_db_session @@ -101,12 +102,18 @@ def gtfs_change_tracker(request: flask.Request) -> dict: # return HTTP 200 to suppress GCP retries. If a specific exception type is # identified as safely retriable in the future, catch it here and return 500. logging.exception( - "Failed to generate changelog for %s -> %s", + "Failed to generate changelog for feed=%s base=%s new=%s", + feed_stable_id, base_dataset_stable_id, new_dataset_stable_id, ) return flask.make_response( - {"status": "error", "error": f"Failed to generate changelog: {e}"}, 200 + { + "status": "error", + "error": f"Failed to generate changelog: {e}", + "payload": payload, + }, + 200, ) @@ -307,7 +314,7 @@ def _save_changelog_record( set_={ "changelog_url": changelog_url, "diff_summary": diff_summary, - "generated_at": GtfsDatasetChangelog.generated_at.default, + "generated_at": func.now(), }, ) ) From a1fa432b61fa0207bf72ba7302d71e81fb5aba55 Mon Sep 17 00:00:00 2001 From: jcpitre Date: Mon, 15 Jun 2026 13:08:45 -0400 Subject: [PATCH 20/21] Changed the name to gtfs-datasets-comparer --- .../src/pipeline_tasks.py | 4 +- .../tests/test_batch_process_dataset_main.py | 4 +- .../gtfs_change_tracker/function_config.json | 2 +- functions-python/helpers/utils.py | 6 +- infra/batch/main.tf | 14 ++-- infra/functions-python/main.tf | 64 +++++++++---------- infra/functions-python/vars.tf | 4 +- 7 files changed, 49 insertions(+), 49 deletions(-) diff --git a/functions-python/batch_process_dataset/src/pipeline_tasks.py b/functions-python/batch_process_dataset/src/pipeline_tasks.py index 3ac4f22e8..5012b65eb 100644 --- a/functions-python/batch_process_dataset/src/pipeline_tasks.py +++ b/functions-python/batch_process_dataset/src/pipeline_tasks.py @@ -11,7 +11,7 @@ from shared.helpers.utils import ( create_http_task, create_http_pmtiles_builder_task, - create_http_gtfs_change_tracker_task, + create_http_gtfs_datasets_comparer_task, ) @@ -172,7 +172,7 @@ def create_pipeline_tasks(dataset: Gtfsdataset, db_session: Session) -> None: dataset_stable_id, ) else: - create_http_gtfs_change_tracker_task( + create_http_gtfs_datasets_comparer_task( feed_stable_id=stable_id, base_dataset_stable_id=previous_dataset.stable_id, new_dataset_stable_id=dataset_stable_id, diff --git a/functions-python/batch_process_dataset/tests/test_batch_process_dataset_main.py b/functions-python/batch_process_dataset/tests/test_batch_process_dataset_main.py index 327d85e7a..4bdd8901c 100644 --- a/functions-python/batch_process_dataset/tests/test_batch_process_dataset_main.py +++ b/functions-python/batch_process_dataset/tests/test_batch_process_dataset_main.py @@ -449,9 +449,9 @@ def mock_remove_side_effect(path): @patch.dict( os.environ, {"FEEDS_CREDENTIALS": '{"test_stable_id": "test_credentials"}'} ) - @patch("pipeline_tasks.create_http_gtfs_change_tracker_task") + @patch("pipeline_tasks.create_http_gtfs_datasets_comparer_task") @with_db_session(db_url=default_db_url) - def test_process(self, mock_gtfs_change_tracker_task, db_session): + def test_process(self, mock_gtfs_datasets_comparer_task, db_session): feeds = db_session.query(Gtfsfeed).all() feed_id = feeds[0].id diff --git a/functions-python/gtfs_change_tracker/function_config.json b/functions-python/gtfs_change_tracker/function_config.json index 5e41e4bb8..b772ed458 100644 --- a/functions-python/gtfs_change_tracker/function_config.json +++ b/functions-python/gtfs_change_tracker/function_config.json @@ -1,5 +1,5 @@ { - "name": "gtfs-change-tracker", + "name": "gtfs-datasets-comparer", "description": "Tracks changes between two GTFS datasets and stores a structured changelog in GCS and the database", "entry_point": "gtfs_change_tracker", "timeout": 540, diff --git a/functions-python/helpers/utils.py b/functions-python/helpers/utils.py index 133f55357..bb7914bfc 100644 --- a/functions-python/helpers/utils.py +++ b/functions-python/helpers/utils.py @@ -535,13 +535,13 @@ def create_http_pmtiles_builder_task( ) -def create_http_gtfs_change_tracker_task( +def create_http_gtfs_datasets_comparer_task( feed_stable_id: str, base_dataset_stable_id: str, new_dataset_stable_id: str, ) -> None: """ - Create a Cloud Task to run the gtfs-change-tracker function for a pair of datasets. + Create a Cloud Task to run the gtfs-datasets-comparer function for a pair of datasets. """ from google.cloud import tasks_v2 import json @@ -563,7 +563,7 @@ def create_http_gtfs_change_tracker_task( create_http_task( client, body, - f"https://{gcp_region}-{project_id}.cloudfunctions.net/gtfs-change-tracker-{gcp_env}", + f"https://{gcp_region}-{project_id}.cloudfunctions.net/gtfs-datasets-comparer-{gcp_env}", project_id, gcp_region, queue_name, diff --git a/infra/batch/main.tf b/infra/batch/main.tf index 7c58653d1..e559facb3 100644 --- a/infra/batch/main.tf +++ b/infra/batch/main.tf @@ -45,7 +45,7 @@ locals { function_pmtiles_builder_config = jsondecode(file("${path.module}/../../functions-python/pmtiles_builder/function_config.json")) - function_gtfs_change_tracker_config = jsondecode(file("${path.module}/../../functions-python/gtfs_change_tracker/function_config.json")) + function_gtfs_datasets_comparer_config = jsondecode(file("${path.module}/../../functions-python/gtfs_datasets_comparer/function_config.json")) } data "google_vpc_access_connector" "vpc_connector" { @@ -274,11 +274,11 @@ resource "google_cloud_tasks_queue" "pmtiles_builder_task_queue" { } } -# Task queue to invoke gtfs_change_tracker function -resource "google_cloud_tasks_queue" "gtfs_change_tracker_task_queue" { +# Task queue to invoke gtfs_datasets_comparer function +resource "google_cloud_tasks_queue" "gtfs_datasets_comparer_task_queue" { project = var.project_id location = var.gcp_region - name = "gtfs-change-tracker-queue-${var.environment}-${local.deployment_timestamp}" + name = "gtfs-datasets-comparer-queue-${var.environment}-${local.deployment_timestamp}" rate_limits { max_concurrent_dispatches = 10 @@ -332,7 +332,7 @@ resource "google_cloudfunctions2_function" "pubsub_function" { MATERIALIZED_VIEW_QUEUE = google_cloud_tasks_queue.refresh_materialized_view_task_queue.name PMTILES_BUILDER_QUEUE = google_cloud_tasks_queue.pmtiles_builder_task_queue.name REVERSE_GEOLOCATION_QUEUE = "reverse-geolocation-processor-task-queue" - GTFS_CHANGE_TRACKER_QUEUE = google_cloud_tasks_queue.gtfs_change_tracker_task_queue.name + GTFS_CHANGE_TRACKER_QUEUE = google_cloud_tasks_queue.gtfs_datasets_comparer_task_queue.name WEB_REVALIDATION_QUEUE = google_cloud_tasks_queue.web_revalidation_task_queue.name } dynamic "secret_environment_variables" { @@ -493,10 +493,10 @@ resource "google_cloud_run_service_iam_member" "pmtiles_builder_invoker" { member = "serviceAccount:${google_service_account.functions_service_account.email}" } -resource "google_cloud_run_service_iam_member" "gtfs_change_tracker_invoker" { +resource "google_cloud_run_service_iam_member" "gtfs_datasets_comparer_invoker" { project = var.project_id location = var.gcp_region - service = "${local.function_gtfs_change_tracker_config.name}-${var.environment}" + service = "${local.function_gtfs_datasets_comparer_config.name}-${var.environment}" role = "roles/run.invoker" member = "serviceAccount:${google_service_account.functions_service_account.email}" } \ No newline at end of file diff --git a/infra/functions-python/main.tf b/infra/functions-python/main.tf index d3993035e..f6fe26c51 100644 --- a/infra/functions-python/main.tf +++ b/infra/functions-python/main.tf @@ -66,8 +66,8 @@ locals { function_pmtiles_builder_config = jsondecode(file("${path.module}/../../functions-python/pmtiles_builder/function_config.json")) function_pmtiles_builder_zip = "${path.module}/../../functions-python/pmtiles_builder/.dist/pmtiles_builder.zip" - function_gtfs_change_tracker_config = jsondecode(file("${path.module}/../../functions-python/gtfs_change_tracker/function_config.json")) - function_gtfs_change_tracker_zip = "${path.module}/../../functions-python/gtfs_change_tracker/.dist/gtfs_change_tracker.zip" + function_gtfs_datasets_comparer_config = jsondecode(file("${path.module}/../../functions-python/gtfs_datasets_comparer/function_config.json")) + function_gtfs_datasets_comparer_zip = "${path.module}/../../functions-python/gtfs_datasets_comparer/.dist/gtfs_datasets_comparer.zip" } locals { @@ -80,7 +80,7 @@ locals { local.function_export_csv_config.secret_environment_variables, local.function_tasks_executor_config.secret_environment_variables, local.function_pmtiles_builder_config.secret_environment_variables, - local.function_gtfs_change_tracker_config.secret_environment_variables + local.function_gtfs_datasets_comparer_config.secret_environment_variables ) # Remove duplicates by key, keeping the first occurrence @@ -227,10 +227,10 @@ resource "google_storage_bucket_object" "pmtiles_builder_zip" { } # 16. GTFS Change Tracker -resource "google_storage_bucket_object" "gtfs_change_tracker_zip" { +resource "google_storage_bucket_object" "gtfs_datasets_comparer_zip" { bucket = google_storage_bucket.functions_bucket.name - name = "gtfs-change-tracker-${substr(filebase64sha256(local.function_gtfs_change_tracker_zip), 0, 10)}.zip" - source = local.function_gtfs_change_tracker_zip + name = "gtfs-datasets-comparer-${substr(filebase64sha256(local.function_gtfs_datasets_comparer_zip), 0, 10)}.zip" + source = local.function_gtfs_datasets_comparer_zip } # Web app revalidation secret @@ -1045,20 +1045,20 @@ resource "google_cloudfunctions2_function_iam_member" "pmtiles_builder_invoker" member = "serviceAccount:${google_service_account.functions_service_account.email}" } -# Grant execution permission to batchfunctions service account to the gtfs_change_tracker function -resource "google_cloudfunctions2_function_iam_member" "gtfs_change_tracker_invoker_batch_sa" { +# Grant execution permission to batchfunctions service account to the gtfs_datasets_comparer function +resource "google_cloudfunctions2_function_iam_member" "gtfs_datasets_comparer_invoker_batch_sa" { project = var.project_id location = var.gcp_region - cloud_function = google_cloudfunctions2_function.gtfs_change_tracker.name + cloud_function = google_cloudfunctions2_function.gtfs_datasets_comparer.name role = "roles/cloudfunctions.invoker" member = "serviceAccount:${local.batchfunctions_sa_email}" } -# Grant execution permission to the functions service account to the gtfs_change_tracker function -resource "google_cloudfunctions2_function_iam_member" "gtfs_change_tracker_invoker" { +# Grant execution permission to the functions service account to the gtfs_datasets_comparer function +resource "google_cloudfunctions2_function_iam_member" "gtfs_datasets_comparer_invoker" { project = var.project_id location = var.gcp_region - cloud_function = google_cloudfunctions2_function.gtfs_change_tracker.name + cloud_function = google_cloudfunctions2_function.gtfs_datasets_comparer.name role = "roles/cloudfunctions.invoker" member = "serviceAccount:${google_service_account.functions_service_account.email}" } @@ -1257,21 +1257,21 @@ resource "google_cloudfunctions2_function" "pmtiles_builder" { } -# 16. functions/gtfs_change_tracker cloud function -resource "google_cloudfunctions2_function" "gtfs_change_tracker" { - name = "${local.function_gtfs_change_tracker_config.name}-${var.environment}" +# 16. functions/gtfs_datasets_comparer cloud function +resource "google_cloudfunctions2_function" "gtfs_datasets_comparer" { + name = "${local.function_gtfs_datasets_comparer_config.name}-${var.environment}" project = var.project_id - description = local.function_gtfs_change_tracker_config.description + description = local.function_gtfs_datasets_comparer_config.description location = var.gcp_region depends_on = [google_secret_manager_secret_iam_member.secret_iam_member] build_config { runtime = var.python_runtime - entry_point = local.function_gtfs_change_tracker_config.entry_point + entry_point = local.function_gtfs_datasets_comparer_config.entry_point source { storage_source { bucket = google_storage_bucket.functions_bucket.name - object = google_storage_bucket_object.gtfs_change_tracker_zip.name + object = google_storage_bucket_object.gtfs_datasets_comparer_zip.name } } } @@ -1285,19 +1285,19 @@ resource "google_cloudfunctions2_function" "gtfs_change_tracker" { # GTFS_DIFF_DUCKDB_TMPDIR: directs DuckDB spill files to the in-memory volume. GTFS_DIFF_DUCKDB_TMPDIR = "/tmp/in-memory" } - available_memory = local.function_gtfs_change_tracker_config.memory - timeout_seconds = local.function_gtfs_change_tracker_config.timeout - available_cpu = local.function_gtfs_change_tracker_config.available_cpu - max_instance_request_concurrency = local.function_gtfs_change_tracker_config.max_instance_request_concurrency - max_instance_count = local.function_gtfs_change_tracker_config.max_instance_count - min_instance_count = local.function_gtfs_change_tracker_config.min_instance_count + available_memory = local.function_gtfs_datasets_comparer_config.memory + timeout_seconds = local.function_gtfs_datasets_comparer_config.timeout + available_cpu = local.function_gtfs_datasets_comparer_config.available_cpu + max_instance_request_concurrency = local.function_gtfs_datasets_comparer_config.max_instance_request_concurrency + max_instance_count = local.function_gtfs_datasets_comparer_config.max_instance_count + min_instance_count = local.function_gtfs_datasets_comparer_config.min_instance_count service_account_email = google_service_account.functions_service_account.email - ingress_settings = local.function_gtfs_change_tracker_config.ingress_settings + ingress_settings = local.function_gtfs_datasets_comparer_config.ingress_settings vpc_connector = data.google_vpc_access_connector.vpc_connector.id vpc_connector_egress_settings = "PRIVATE_RANGES_ONLY" dynamic "secret_environment_variables" { - for_each = local.function_gtfs_change_tracker_config.secret_environment_variables + for_each = local.function_gtfs_datasets_comparer_config.secret_environment_variables content { key = secret_environment_variables.value["key"] project_id = var.project_id @@ -1311,9 +1311,9 @@ resource "google_cloudfunctions2_function" "gtfs_change_tracker" { # google_cloudfunctions2_function does not expose volume mounts in its schema. # This terraform_data resource mounts both the datasets GCS bucket and an in-memory tmpfs # on the underlying Cloud Run service after the function is deployed. -resource "terraform_data" "gtfs_change_tracker_gcs_mount" { +resource "terraform_data" "gtfs_datasets_comparer_gcs_mount" { triggers_replace = { - function_name = google_cloudfunctions2_function.gtfs_change_tracker.name + function_name = google_cloudfunctions2_function.gtfs_datasets_comparer.name bucket = "${var.datasets_bucket_name}-${var.environment}" region = var.gcp_region project = var.project_id @@ -1321,7 +1321,7 @@ resource "terraform_data" "gtfs_change_tracker_gcs_mount" { provisioner "local-exec" { command = <<-EOT - MOUNTS=$(gcloud run services describe ${google_cloudfunctions2_function.gtfs_change_tracker.name} \ + MOUNTS=$(gcloud run services describe ${google_cloudfunctions2_function.gtfs_datasets_comparer.name} \ --project ${var.project_id} \ --region ${var.gcp_region} \ --format='value(spec.template.spec.volumes[].name)' 2>/dev/null) @@ -1337,12 +1337,12 @@ resource "terraform_data" "gtfs_change_tracker_gcs_mount" { if echo "$MOUNTS" | grep -q "in-memory"; then echo "In-memory volume already mounted, skipping." else - ARGS="$ARGS --add-volume name=in-memory,type=in-memory,size-limit=${var.gtfs_change_tracker_in_memory_size}" + ARGS="$ARGS --add-volume name=in-memory,type=in-memory,size-limit=${var.gtfs_datasets_comparer_in_memory_size}" ARGS="$ARGS --add-volume-mount volume=in-memory,mount-path=/tmp/in-memory" fi if [ -n "$ARGS" ]; then - gcloud run services update ${google_cloudfunctions2_function.gtfs_change_tracker.name} \ + gcloud run services update ${google_cloudfunctions2_function.gtfs_datasets_comparer.name} \ --project ${var.project_id} \ --region ${var.gcp_region} \ $ARGS \ @@ -1351,7 +1351,7 @@ resource "terraform_data" "gtfs_change_tracker_gcs_mount" { EOT } - depends_on = [google_cloudfunctions2_function.gtfs_change_tracker] + depends_on = [google_cloudfunctions2_function.gtfs_datasets_comparer] } # Create the Pub/Sub topic used for publishing messages about rebuilding missing bounding boxes diff --git a/infra/functions-python/vars.tf b/infra/functions-python/vars.tf index 4cab92d8c..156b60abc 100644 --- a/infra/functions-python/vars.tf +++ b/infra/functions-python/vars.tf @@ -135,8 +135,8 @@ variable "brevo_api_announcements_list_id" { description = "Brevo list ID for API announcements" default = "" } -variable "gtfs_change_tracker_in_memory_size" { +variable "gtfs_datasets_comparer_in_memory_size" { type = string - description = "Size limit for the gtfs_change_tracker in-memory tmpfs volume" + description = "Size limit for the gtfs_datasets_comparer in-memory tmpfs volume" default = "3Gi" } From 358f6adec30d3750ad165447347c9e786b907b3b Mon Sep 17 00:00:00 2001 From: jcpitre Date: Mon, 15 Jun 2026 13:30:52 -0400 Subject: [PATCH 21/21] Changed the folder name to gtfs-datasets-comparer --- .../README.md | 0 .../function_config.json | 2 +- .../requirements.txt | 0 .../requirements_dev.txt | 0 .../src/main.py | 12 ++--- .../tests/__init__.py | 0 .../tests/test_main.py | 54 +++++++++---------- 7 files changed, 34 insertions(+), 34 deletions(-) rename functions-python/{gtfs_change_tracker => gtfs_datasets_comparer}/README.md (100%) rename functions-python/{gtfs_change_tracker => gtfs_datasets_comparer}/function_config.json (94%) rename functions-python/{gtfs_change_tracker => gtfs_datasets_comparer}/requirements.txt (100%) rename functions-python/{gtfs_change_tracker => gtfs_datasets_comparer}/requirements_dev.txt (100%) rename functions-python/{gtfs_change_tracker => gtfs_datasets_comparer}/src/main.py (97%) rename functions-python/{gtfs_change_tracker => gtfs_datasets_comparer}/tests/__init__.py (100%) rename functions-python/{gtfs_change_tracker => gtfs_datasets_comparer}/tests/test_main.py (92%) diff --git a/functions-python/gtfs_change_tracker/README.md b/functions-python/gtfs_datasets_comparer/README.md similarity index 100% rename from functions-python/gtfs_change_tracker/README.md rename to functions-python/gtfs_datasets_comparer/README.md diff --git a/functions-python/gtfs_change_tracker/function_config.json b/functions-python/gtfs_datasets_comparer/function_config.json similarity index 94% rename from functions-python/gtfs_change_tracker/function_config.json rename to functions-python/gtfs_datasets_comparer/function_config.json index b772ed458..a2e36b0f2 100644 --- a/functions-python/gtfs_change_tracker/function_config.json +++ b/functions-python/gtfs_datasets_comparer/function_config.json @@ -1,7 +1,7 @@ { "name": "gtfs-datasets-comparer", "description": "Tracks changes between two GTFS datasets and stores a structured changelog in GCS and the database", - "entry_point": "gtfs_change_tracker", + "entry_point": "gtfs_datasets_comparer", "timeout": 540, "memory": "8Gi", "trigger_http": true, diff --git a/functions-python/gtfs_change_tracker/requirements.txt b/functions-python/gtfs_datasets_comparer/requirements.txt similarity index 100% rename from functions-python/gtfs_change_tracker/requirements.txt rename to functions-python/gtfs_datasets_comparer/requirements.txt diff --git a/functions-python/gtfs_change_tracker/requirements_dev.txt b/functions-python/gtfs_datasets_comparer/requirements_dev.txt similarity index 100% rename from functions-python/gtfs_change_tracker/requirements_dev.txt rename to functions-python/gtfs_datasets_comparer/requirements_dev.txt diff --git a/functions-python/gtfs_change_tracker/src/main.py b/functions-python/gtfs_datasets_comparer/src/main.py similarity index 97% rename from functions-python/gtfs_change_tracker/src/main.py rename to functions-python/gtfs_datasets_comparer/src/main.py index 533572941..4cac88714 100644 --- a/functions-python/gtfs_change_tracker/src/main.py +++ b/functions-python/gtfs_datasets_comparer/src/main.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # -# This module provides the GtfsChangeTracker Cloud Function. It orchestrates change tracking +# This module provides the GtfsDatasetsComparer Cloud Function. It orchestrates change tracking # between two consecutive GTFS datasets: reads the pre-extracted GTFS files from GCS (uploaded # by batch_process_dataset at //extracted/), computes a # structured diff using gtfs-diff-engine, uploads the changelog JSON to GCS, and persists the @@ -42,9 +42,9 @@ @functions_framework.http -def gtfs_change_tracker(request: flask.Request) -> dict: +def gtfs_datasets_comparer(request: flask.Request) -> dict: """ - HTTP entrypoint for the GTFS change tracker function. + HTTP entrypoint for the GTFS datasets comparer function. Expects a JSON body with: feed_stable_id – stable_id of the GTFS feed @@ -86,7 +86,7 @@ def gtfs_change_tracker(request: flask.Request) -> dict: bucket_mount = os.getenv("DATASETS_BUCKET_MOUNT", "/tmp/mobilitydata-datasets") try: - tracker = GtfsChangeTracker( + tracker = GtfsDatasetsComparer( feed_stable_id=feed_stable_id, base_dataset_stable_id=base_dataset_stable_id, new_dataset_stable_id=new_dataset_stable_id, @@ -117,7 +117,7 @@ def gtfs_change_tracker(request: flask.Request) -> dict: ) -class GtfsChangeTracker: +class GtfsDatasetsComparer: """ Orchestrates GTFS change tracking between two consecutive datasets. @@ -147,7 +147,7 @@ def __init__( self.bucket_mount = bucket_mount self.disallow_overwrite = disallow_overwrite self.dry_run = dry_run - self.logger = get_logger(GtfsChangeTracker.__name__, new_dataset_stable_id) + self.logger = get_logger(GtfsDatasetsComparer.__name__, new_dataset_stable_id) @track_metrics(metrics=("time", "memory", "cpu")) def run(self) -> dict: diff --git a/functions-python/gtfs_change_tracker/tests/__init__.py b/functions-python/gtfs_datasets_comparer/tests/__init__.py similarity index 100% rename from functions-python/gtfs_change_tracker/tests/__init__.py rename to functions-python/gtfs_datasets_comparer/tests/__init__.py diff --git a/functions-python/gtfs_change_tracker/tests/test_main.py b/functions-python/gtfs_datasets_comparer/tests/test_main.py similarity index 92% rename from functions-python/gtfs_change_tracker/tests/test_main.py rename to functions-python/gtfs_datasets_comparer/tests/test_main.py index e92f43113..69d8a2721 100644 --- a/functions-python/gtfs_change_tracker/tests/test_main.py +++ b/functions-python/gtfs_datasets_comparer/tests/test_main.py @@ -18,7 +18,7 @@ import flask -from main import GtfsChangeTracker, gtfs_change_tracker +from main import GtfsDatasetsComparer, gtfs_datasets_comparer def _make_dataset( @@ -34,14 +34,14 @@ def _make_dataset( return dataset -class TestGtfsChangeTrackerHandler(unittest.TestCase): +class TestGtfsDatasetsComparerHandler(unittest.TestCase): """Tests for the HTTP handler function.""" app = flask.Flask(__name__) def _request(self, payload): with self.app.test_request_context(json=payload): - response = gtfs_change_tracker(flask.request) + response = gtfs_datasets_comparer(flask.request) with self.app.app_context(): return response.status_code, response.get_json() @@ -71,7 +71,7 @@ def test_missing_bucket_env(self): self.assertEqual(body["status"], "error") self.assertIn("DATASETS_BUCKET_NAME", body["error"]) - @patch("main.GtfsChangeTracker") + @patch("main.GtfsDatasetsComparer") def test_success(self, mock_tracker_cls): os.environ["DATASETS_BUCKET_NAME"] = "test-bucket" os.environ["DATASETS_BUCKET_MOUNT"] = "/mobilitydata-datasets" @@ -102,7 +102,7 @@ def test_success(self, mock_tracker_cls): dry_run=False, ) - @patch("main.GtfsChangeTracker") + @patch("main.GtfsDatasetsComparer") def test_disallow_overwrite_and_dry_run_passed(self, mock_tracker_cls): os.environ["DATASETS_BUCKET_NAME"] = "test-bucket" os.environ["DATASETS_BUCKET_MOUNT"] = "/mobilitydata-datasets" @@ -129,7 +129,7 @@ def test_disallow_overwrite_and_dry_run_passed(self, mock_tracker_cls): dry_run=True, ) - @patch("main.GtfsChangeTracker") + @patch("main.GtfsDatasetsComparer") def test_exception_returns_200_with_error(self, mock_tracker_cls): """All exceptions return HTTP 200 to suppress GCP retries.""" os.environ["DATASETS_BUCKET_NAME"] = "test-bucket" @@ -149,12 +149,12 @@ def test_exception_returns_200_with_error(self, mock_tracker_cls): self.assertIn("something went wrong", body["error"]) -class TestGtfsChangeTrackerRun(unittest.TestCase): - """Tests for GtfsChangeTracker.run() with mocked collaborators.""" +class TestGtfsDatasetsComparerRun(unittest.TestCase): + """Tests for GtfsDatasetsComparer.run() with mocked collaborators.""" def setUp(self): os.environ["DATASETS_BUCKET_NAME"] = "test-bucket" - self.tracker = GtfsChangeTracker( + self.tracker = GtfsDatasetsComparer( feed_stable_id="mdb-1", base_dataset_stable_id="mdb-1-20240101", new_dataset_stable_id="mdb-1-20240201", @@ -163,10 +163,10 @@ def setUp(self): ) @patch("main.storage.Client") - @patch("main.GtfsChangeTracker._save_changelog_record") - @patch("main.GtfsChangeTracker._upload_changelog") - @patch("main.GtfsChangeTracker._extracted_dir") - @patch("main.GtfsChangeTracker._resolve_datasets") + @patch("main.GtfsDatasetsComparer._save_changelog_record") + @patch("main.GtfsDatasetsComparer._upload_changelog") + @patch("main.GtfsDatasetsComparer._extracted_dir") + @patch("main.GtfsDatasetsComparer._resolve_datasets") def test_run_happy_path( self, mock_resolve, mock_extracted_dir, mock_upload, mock_save, mock_storage ): @@ -218,7 +218,7 @@ def test_run_happy_path( self.assertEqual(result["changelog_url"], changelog_url) @patch("main.storage.Client") - @patch("main.GtfsChangeTracker._resolve_datasets") + @patch("main.GtfsDatasetsComparer._resolve_datasets") def test_run_raises_when_resolve_fails(self, mock_resolve, mock_storage): mock_storage.return_value.bucket.return_value.blob.return_value.exists.return_value = ( False @@ -234,7 +234,7 @@ def test_run_skips_when_changelog_exists(self, mock_storage): mock_storage.return_value.bucket.return_value.blob.return_value.exists.return_value = ( True ) - tracker = GtfsChangeTracker( + tracker = GtfsDatasetsComparer( feed_stable_id="mdb-1", base_dataset_stable_id="mdb-1-20240101", new_dataset_stable_id="mdb-1-20240201", @@ -252,7 +252,7 @@ def test_disallow_overwrite_skips_when_exists(self, mock_storage): mock_storage.return_value.bucket.return_value.blob.return_value.exists.return_value = ( True ) - tracker = GtfsChangeTracker( + tracker = GtfsDatasetsComparer( feed_stable_id="mdb-1", base_dataset_stable_id="mdb-1-20240101", new_dataset_stable_id="mdb-1-20240201", @@ -264,10 +264,10 @@ def test_disallow_overwrite_skips_when_exists(self, mock_storage): self.assertIn("already exists", result["message"]) @patch("main.storage.Client") - @patch("main.GtfsChangeTracker._save_changelog_record") - @patch("main.GtfsChangeTracker._upload_changelog") - @patch("main.GtfsChangeTracker._extracted_dir") - @patch("main.GtfsChangeTracker._resolve_datasets") + @patch("main.GtfsDatasetsComparer._save_changelog_record") + @patch("main.GtfsDatasetsComparer._upload_changelog") + @patch("main.GtfsDatasetsComparer._extracted_dir") + @patch("main.GtfsDatasetsComparer._resolve_datasets") def test_overwrite_proceeds_when_exists_by_default( self, mock_resolve, mock_extracted_dir, mock_upload, mock_save, mock_storage ): @@ -292,9 +292,9 @@ def test_overwrite_proceeds_when_exists_by_default( mock_upload.assert_called_once() @patch("main.storage.Client") - @patch("main.GtfsChangeTracker._upload_changelog") - @patch("main.GtfsChangeTracker._extracted_dir") - @patch("main.GtfsChangeTracker._resolve_datasets") + @patch("main.GtfsDatasetsComparer._upload_changelog") + @patch("main.GtfsDatasetsComparer._extracted_dir") + @patch("main.GtfsDatasetsComparer._resolve_datasets") def test_dry_run_skips_upload_and_db( self, mock_resolve, mock_extracted_dir, mock_upload, mock_storage ): @@ -310,7 +310,7 @@ def test_dry_run_skips_upload_and_db( fake_diff = MagicMock() fake_diff.summary.model_dump.return_value = {"total_changes": 5} - tracker = GtfsChangeTracker( + tracker = GtfsDatasetsComparer( feed_stable_id="mdb-1", base_dataset_stable_id="mdb-1-20240101", new_dataset_stable_id="mdb-1-20240201", @@ -330,7 +330,7 @@ class TestResolveDatasets(unittest.TestCase): """Tests for _resolve_datasets — verifies plain string UUIDs are returned.""" def setUp(self): - self.tracker = GtfsChangeTracker( + self.tracker = GtfsDatasetsComparer( feed_stable_id="mdb-1", base_dataset_stable_id="mdb-1-20240101", new_dataset_stable_id="mdb-1-20240201", @@ -407,7 +407,7 @@ def test_raises_when_feed_mismatch(self): class TestExtractedDir(unittest.TestCase): def setUp(self): - self.tracker = GtfsChangeTracker( + self.tracker = GtfsDatasetsComparer( feed_stable_id="mdb-1", base_dataset_stable_id="mdb-1-20240101", new_dataset_stable_id="mdb-1-20240201", @@ -430,7 +430,7 @@ def test_raises_when_dir_not_found(self): class TestUploadChangelog(unittest.TestCase): def setUp(self): - self.tracker = GtfsChangeTracker( + self.tracker = GtfsDatasetsComparer( feed_stable_id="mdb-1", base_dataset_stable_id="mdb-1-20240101", new_dataset_stable_id="mdb-1-20240201",