From 1a96dddc7ff195e45ae0a59f4ac7003aa71c9279 Mon Sep 17 00:00:00 2001 From: sofq Date: Tue, 5 May 2026 15:53:31 +0700 Subject: [PATCH] fix(validate): primary-dimension filter + GCP usageUnit normalization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two real validator bugs producing chronic false-positive drift on gcp-gcs, gcp-run, gcp-functions. 1. Fan-in dimensions were sampled but couldn't be re-fetched. gcp-gcs storage rows carry fanned-in global ops prices (read-ops, write-ops) under the storage SKU id; gcp-run / gcp-functions carry memory-gb-second + requests under the cpu-second SKU id. The validator looks up the upstream SKU and reads its tieredRates[0].unitPrice — which only matches the primary dimension. Add PRIMARY_DIMENSIONS in driver.py so non-primary dimensions are filtered out before revalidation. Fan-in dims will need separate validation against their actual source SKUs (follow-up). 2. Validator did not mirror ingest's usageUnit normalization. For SKUs with usageUnit='GiBy.d' (e.g. early-delete fees), ingest converts day → month (×30.4375) but validator compared against the raw per-day price, producing 30.4375× drift. Add _USAGE_UNIT_DIVISORS in validate/gcp.py mirroring the table in ingest/gcp_common.py. Verified end-to-end against the live GCP Cloud Billing API for gcp-gcs / gcp-run / gcp-functions: 0 drift, 0 missing. Closes #57, #59, #61. --- pipeline/tests/test_validate_driver.py | 32 ++++++++++++++ pipeline/tests/test_validate_gcp.py | 58 ++++++++++++++++++++++++++ pipeline/validate/driver.py | 23 ++++++++++ pipeline/validate/gcp.py | 23 +++++++++- 4 files changed, 134 insertions(+), 2 deletions(-) diff --git a/pipeline/tests/test_validate_driver.py b/pipeline/tests/test_validate_driver.py index 9fed0d1..c0b5fbc 100644 --- a/pipeline/tests/test_validate_driver.py +++ b/pipeline/tests/test_validate_driver.py @@ -256,3 +256,35 @@ def must_not_be_called(samples, **kwargs): assert data["skip_reason"] assert data["sample_size"] == 0 assert data["drift_records"] == [] + + +def test_driver_filters_to_primary_dimensions(tmp_path: Path) -> None: + """Samples whose dimension isn't in PRIMARY_DIMENSIONS[shard] are filtered out.""" + from validate.driver import PRIMARY_DIMENSIONS + + db = _make_minimal_shard(tmp_path) + report_path = tmp_path / "report.json" + + seen_dims: list[str] = [] + + def capture(samples, **kwargs): + seen_dims.extend(s.dimension for s in samples) + return [], [] + + # Inject a temporary primary-dim filter for the test shard so the existing + # minimal fixture (whose dimension is "on-demand") survives. + PRIMARY_DIMENSIONS["aws-ec2"] = frozenset({"on-demand"}) + try: + run_validation( + shard="aws-ec2", + shard_db=db, + budget=10, + report=report_path, + revalidator=capture, + seed=42, + ) + finally: + del PRIMARY_DIMENSIONS["aws-ec2"] + + assert seen_dims, "revalidator received no samples" + assert all(d == "on-demand" for d in seen_dims) diff --git a/pipeline/tests/test_validate_gcp.py b/pipeline/tests/test_validate_gcp.py index cc01c4e..4972253 100644 --- a/pipeline/tests/test_validate_gcp.py +++ b/pipeline/tests/test_validate_gcp.py @@ -346,3 +346,61 @@ def test_gcp_gke_autopilot_matches_requested_dimension( drift, missing = revalidate([sample], service_id=service_id) assert drift == [] assert missing == [] + + +def test_gcp_validator_applies_usage_unit_divisor( + requests_mock: requests_mock_module.Mocker, +) -> None: + """Validator must mirror ingest's per-day → per-month conversion (GiBy.d). + + Ingest divides by 1/30.4375 (= multiplies by 30.4375) so a per-day price + of 0.0005333 becomes 0.01623 per-month in the catalog. Validator must + apply the same divisor to compare on the same axis instead of reporting + spurious 30.4375× drift. + """ + sku_id = "6088-27E4-7DD4" + region = "asia-east2" + catalog_per_month = 0.016233231875 # what ingest stored + + sample = Sample( + sku_id=sku_id, + region=region, + resource_name="nearline", + price_amount=catalog_per_month, + price_currency="USD", + dimension="storage", + ) + response = { + "skus": [ + { + "skuId": sku_id, + "description": "Nearline Storage Hong Kong (Early Delete)", + "serviceRegions": [region], + "pricingInfo": [ + { + "pricingExpression": { + "usageUnit": "GiBy.d", + "tieredRates": [ + { + "startUsageAmount": 0, + "unitPrice": { + "currencyCode": "USD", + "units": "0", + "nanos": 533330, # 0.0005333 per-day + }, + } + ], + } + } + ], + } + ], + "nextPageToken": "", + } + requests_mock.get(f"{_BASE_URL}/95FF-2EF5-5EA1/skus", json=response) + + with patch("validate.gcp._get_bearer_token", return_value="token"): + drift, missing = revalidate([sample], service_id="95FF-2EF5-5EA1") + + assert drift == [], f"unexpected drift after unit normalization: {drift}" + assert missing == [] diff --git a/pipeline/validate/driver.py b/pipeline/validate/driver.py index c526329..0308721 100644 --- a/pipeline/validate/driver.py +++ b/pipeline/validate/driver.py @@ -62,6 +62,19 @@ } +# Shards whose ingest fans-in additional price dimensions onto a primary SKU's +# row (e.g. gcp-gcs storage SKU also carries fanned-in global ops prices). The +# upstream API only knows about the primary dimension, so the validator can +# only meaningfully compare that one. Other dimensions are sampled out before +# revalidation. Fan-in dimensions remain unvalidated until the validator can +# look them up against their actual source SKUs (tracked separately). +PRIMARY_DIMENSIONS: dict[str, frozenset[str]] = { + "gcp-gcs": frozenset({"storage"}), # ops fanned-in from global SKUs + "gcp-run": frozenset({"cpu-second"}), # memory + requests fanned-in + "gcp-functions": frozenset({"cpu-second"}), # memory + requests fanned-in +} + + # --------------------------------------------------------------------------- # Types # --------------------------------------------------------------------------- @@ -184,6 +197,16 @@ def run_validation( samples = sample(shard_db, budget=budget, seed=seed) logger.info("Sampled %d rows from %s", len(samples), shard) + # --- Filter to primary dimensions for fan-in shards --- + if shard in PRIMARY_DIMENSIONS: + allowed = PRIMARY_DIMENSIONS[shard] + before = len(samples) + samples = [s for s in samples if s.dimension in allowed] + logger.info( + "Filtered %s samples to primary dimensions %s: %d → %d", + shard, sorted(allowed), before, len(samples), + ) + # --- Revalidate --- drift_objs, missing = revalidator(samples) diff --git a/pipeline/validate/gcp.py b/pipeline/validate/gcp.py index 3f4c68d..c53d2d6 100644 --- a/pipeline/validate/gcp.py +++ b/pipeline/validate/gcp.py @@ -22,6 +22,19 @@ _BILLING_BASE = "https://cloudbilling.googleapis.com/v1/services" _DRIFT_THRESHOLD = 0.01 # 1% +# Must mirror ingest/gcp_common.py:_USAGE_UNITS divisors. Validator divides the +# raw upstream unitPrice by this factor to match what ingest stored. +_USAGE_UNIT_DIVISORS: dict[str, float] = { + "h": 1.0, + "GiBy.h": 1.0, + "GiBy.mo": 1.0, + "GiBy.d": 1.0 / 30.4375, + "By.mo": 1.0 / (1024**3), + "count": 1.0, + "s": 1.0, + "GiBy.s": 1.0, +} + # Default service ID used when callers don't specify (matches the legacy # behaviour of single-service GCE shards). The driver should always pass an # explicit service_id for known shards. @@ -176,8 +189,14 @@ def _fetch_sku_price( nanos = int(up.get("nanos", 0)) units = up.get("units", "0") price = _nanos_to_float(units, nanos) - if price > 0: - return price + if price <= 0: + continue + # Mirror ingest's unit normalization (pipeline/ingest/gcp_common.py). + # E.g. usageUnit="GiBy.d" → ingest emits per-month price (×30.4375), + # so validator must do the same to compare on the same axis. + usage_unit = expr.get("usageUnit", "") + divisor = _USAGE_UNIT_DIVISORS.get(usage_unit, 1.0) + return price / divisor page_token = data.get("nextPageToken", "") if not page_token: