From 73876c4f771b367b3cfb249ff4bfa38d3c3d7c1f Mon Sep 17 00:00:00 2001 From: sofq Date: Tue, 5 May 2026 15:05:44 +0700 Subject: [PATCH] fix(validate): skip-list for shards the validator can't compare Add SKIP_REVALIDATION dict in pipeline/validate/driver.py and short-circuit those shards with exit="skipped" in the report. Listed shards return 0 so the data-validate workflow closes any open drift issue. Initial entry: gcp-gce. Ingest (pipeline/ingest/gcp_gce.py) synthesizes per-machine totals from per-vCPU + per-GiB component prices, while the validator reads the raw component unitPrice. The two are not comparable without re-doing the ingest math, producing 20/20 false-positive drift records on every run. Closes #62. --- pipeline/tests/test_validate_driver.py | 33 +++++++++++++++++++++++ pipeline/validate/driver.py | 36 +++++++++++++++++++++++++- 2 files changed, 68 insertions(+), 1 deletion(-) diff --git a/pipeline/tests/test_validate_driver.py b/pipeline/tests/test_validate_driver.py index a802caf..9fed0d1 100644 --- a/pipeline/tests/test_validate_driver.py +++ b/pipeline/tests/test_validate_driver.py @@ -223,3 +223,36 @@ def multi_drift(samples, **kwargs): data = json.loads(report_path.read_text()) assert result == 1 assert len(data["drift_records"]) == data["sample_size"] + + + +def test_driver_skips_listed_shard(tmp_path: Path) -> None: + """Shards in SKIP_REVALIDATION are skipped — revalidator is not called.""" + from validate.driver import SKIP_REVALIDATION + + db = _make_minimal_shard(tmp_path) + report_path = tmp_path / "report.json" + + called = False + + def must_not_be_called(samples, **kwargs): + nonlocal called + called = True + return [], [] + + skipped_shard = next(iter(SKIP_REVALIDATION)) + result = run_validation( + shard=skipped_shard, + shard_db=db, + budget=5, + report=report_path, + revalidator=must_not_be_called, + seed=42, + ) + assert result == 0 + assert not called + data = json.loads(report_path.read_text()) + assert data["exit"] == 
"skipped" + assert data["skip_reason"] + assert data["sample_size"] == 0 + assert data["drift_records"] == [] diff --git a/pipeline/validate/driver.py b/pipeline/validate/driver.py index 7f3a2fd..5ca6763 100644 --- a/pipeline/validate/driver.py +++ b/pipeline/validate/driver.py @@ -26,6 +26,22 @@ logger = logging.getLogger(__name__) +# Shards where the upstream API can't be compared one-to-one against catalog +# rows: catalog stores synthesized values (e.g. gcp-gce machine totals built +# from per-vCPU + per-GiB component prices) while the upstream API exposes the +# components. Listing here makes the validator skip revalidation, return 0, +# and write a report with exit="skipped" and the reason in skip_reason. +SKIP_REVALIDATION: dict[str, str] = { + "gcp-gce": ( + "ingest synthesizes machine totals from per-vCPU and per-GiB " + "component SKUs (see pipeline/ingest/gcp_gce.py); validator " + "compares against the raw component unitPrice, producing " + "false-positive drift. Re-enable once a sidecar or component-aware " + "comparison lands." + ), +} + + # --------------------------------------------------------------------------- # Types # --------------------------------------------------------------------------- @@ -52,7 +68,8 @@ class ValidationReport: drift_records: list[dict] missing_upstream: list[str] vantage_drift: list[dict] - exit: str # "pass" | "fail" + exit: str # "pass" | "fail" | "skipped" + skip_reason: str | None = None def as_dict(self) -> dict: return dataclasses.asdict(self) @@ -123,6 +140,23 @@ def run_validation( int 0 on pass, 1 on fail. 
""" + if shard in SKIP_REVALIDATION: + reason = SKIP_REVALIDATION[shard] + logger.info("Skipping revalidation for %s: %s", shard, reason) + report_data = ValidationReport( + shard=shard, + generated_at=datetime.now(UTC).isoformat(), + sample_size=0, + drift_records=[], + missing_upstream=[], + vantage_drift=[], + exit="skipped", + skip_reason=reason, + ) + report.parent.mkdir(parents=True, exist_ok=True) + report.write_text(json.dumps(report_data.as_dict(), indent=2)) + return 0 + if revalidator is None: revalidator = _default_revalidator(shard)