From 1a96dddc7ff195e45ae0a59f4ac7003aa71c9279 Mon Sep 17 00:00:00 2001
From: sofq <hoanghongquan93@gmail.com>
Date: Tue, 5 May 2026 15:53:31 +0700
Subject: [PATCH] fix(validate): primary-dimension filter + GCP usageUnit
 normalization
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two real validator bugs producing chronic false-positive drift on
gcp-gcs, gcp-run, gcp-functions.

1. Fan-in dimensions were sampled but couldn't be re-fetched.
   gcp-gcs storage rows carry fanned-in global ops prices (read-ops,
   write-ops) under the storage SKU id; gcp-run / gcp-functions
   carry memory-gb-second + requests under the cpu-second SKU id.
   The validator looks up the upstream SKU and reads its
   tieredRates[0].unitPrice — which only matches the primary
   dimension. Add PRIMARY_DIMENSIONS in driver.py so non-primary
   dimensions are filtered out before revalidation. Fan-in dims will
   need separate validation against their actual source SKUs (follow-up).

2. Validator did not mirror ingest's usageUnit normalization.
   For SKUs with usageUnit='GiBy.d' (e.g. early-delete fees), ingest
   converts day → month (×30.4375) but validator compared against the
   raw per-day price, producing 30.4375× drift. Add
   _USAGE_UNIT_DIVISORS in validate/gcp.py mirroring the table in
   ingest/gcp_common.py.

Verified end-to-end against the live GCP Cloud Billing API for
gcp-gcs / gcp-run / gcp-functions: 0 drift, 0 missing.

Closes #57, #59, #61.
---
 pipeline/tests/test_validate_driver.py | 32 ++++++++++++++
 pipeline/tests/test_validate_gcp.py    | 58 ++++++++++++++++++++++++++
 pipeline/validate/driver.py            | 23 ++++++++++
 pipeline/validate/gcp.py               | 23 +++++++++-
 4 files changed, 134 insertions(+), 2 deletions(-)

diff --git a/pipeline/tests/test_validate_driver.py b/pipeline/tests/test_validate_driver.py
index 9fed0d1..c0b5fbc 100644
--- a/pipeline/tests/test_validate_driver.py
+++ b/pipeline/tests/test_validate_driver.py
@@ -256,3 +256,35 @@ def must_not_be_called(samples, **kwargs):
     assert data["skip_reason"]
     assert data["sample_size"] == 0
     assert data["drift_records"] == []
+
+
+def test_driver_filters_to_primary_dimensions(tmp_path: Path) -> None:
+    """Samples whose dimension isn't in PRIMARY_DIMENSIONS[shard] are filtered out."""
+    from validate.driver import PRIMARY_DIMENSIONS
+
+    db = _make_minimal_shard(tmp_path)
+    report_path = tmp_path / "report.json"
+
+    seen_dims: list[str] = []
+
+    def capture(samples, **kwargs):
+        seen_dims.extend(s.dimension for s in samples)
+        return [], []
+
+    # Inject a temporary primary-dim filter for the test shard so the existing
+    # minimal fixture (whose dimension is "on-demand") survives.
+    PRIMARY_DIMENSIONS["aws-ec2"] = frozenset({"on-demand"})
+    try:
+        run_validation(
+            shard="aws-ec2",
+            shard_db=db,
+            budget=10,
+            report=report_path,
+            revalidator=capture,
+            seed=42,
+        )
+    finally:
+        del PRIMARY_DIMENSIONS["aws-ec2"]
+
+    assert seen_dims, "revalidator received no samples"
+    assert all(d == "on-demand" for d in seen_dims)
diff --git a/pipeline/tests/test_validate_gcp.py b/pipeline/tests/test_validate_gcp.py
index cc01c4e..4972253 100644
--- a/pipeline/tests/test_validate_gcp.py
+++ b/pipeline/tests/test_validate_gcp.py
@@ -346,3 +346,61 @@ def test_gcp_gke_autopilot_matches_requested_dimension(
         drift, missing = revalidate([sample], service_id=service_id)
     assert drift == []
     assert missing == []
+
+
+def test_gcp_validator_applies_usage_unit_divisor(
+    requests_mock: requests_mock_module.Mocker,
+) -> None:
+    """Validator must mirror ingest's per-day → per-month conversion (GiBy.d).
+
+    Ingest divides by 1/30.4375 (= multiplies by 30.4375) so a per-day price
+    of 0.0005333 becomes 0.01623 per-month in the catalog. Validator must
+    apply the same divisor to compare on the same axis instead of reporting
+    spurious 30.4375× drift.
+    """
+    sku_id = "6088-27E4-7DD4"
+    region = "asia-east2"
+    catalog_per_month = 0.016233231875  # what ingest stored
+
+    sample = Sample(
+        sku_id=sku_id,
+        region=region,
+        resource_name="nearline",
+        price_amount=catalog_per_month,
+        price_currency="USD",
+        dimension="storage",
+    )
+    response = {
+        "skus": [
+            {
+                "skuId": sku_id,
+                "description": "Nearline Storage Hong Kong (Early Delete)",
+                "serviceRegions": [region],
+                "pricingInfo": [
+                    {
+                        "pricingExpression": {
+                            "usageUnit": "GiBy.d",
+                            "tieredRates": [
+                                {
+                                    "startUsageAmount": 0,
+                                    "unitPrice": {
+                                        "currencyCode": "USD",
+                                        "units": "0",
+                                        "nanos": 533330,  # 0.0005333 per-day
+                                    },
+                                }
+                            ],
+                        }
+                    }
+                ],
+            }
+        ],
+        "nextPageToken": "",
+    }
+    requests_mock.get(f"{_BASE_URL}/95FF-2EF5-5EA1/skus", json=response)
+
+    with patch("validate.gcp._get_bearer_token", return_value="token"):
+        drift, missing = revalidate([sample], service_id="95FF-2EF5-5EA1")
+
+    assert drift == [], f"unexpected drift after unit normalization: {drift}"
+    assert missing == []
diff --git a/pipeline/validate/driver.py b/pipeline/validate/driver.py
index c526329..0308721 100644
--- a/pipeline/validate/driver.py
+++ b/pipeline/validate/driver.py
@@ -62,6 +62,19 @@
 }
 
 
+# Shards whose ingest fans-in additional price dimensions onto a primary SKU's
+# row (e.g. gcp-gcs storage SKU also carries fanned-in global ops prices). The
+# upstream API only knows about the primary dimension, so the validator can
+# only meaningfully compare that one. Other dimensions are sampled out before
+# revalidation. Fan-in dimensions remain unvalidated until the validator can
+# look them up against their actual source SKUs (tracked separately).
+PRIMARY_DIMENSIONS: dict[str, frozenset[str]] = {
+    "gcp-gcs": frozenset({"storage"}),       # ops fanned-in from global SKUs
+    "gcp-run": frozenset({"cpu-second"}),    # memory + requests fanned-in
+    "gcp-functions": frozenset({"cpu-second"}),  # memory + requests fanned-in
+}
+
+
 # ---------------------------------------------------------------------------
 # Types
 # ---------------------------------------------------------------------------
@@ -184,6 +197,16 @@ def run_validation(
     samples = sample(shard_db, budget=budget, seed=seed)
     logger.info("Sampled %d rows from %s", len(samples), shard)
 
+    # --- Filter to primary dimensions for fan-in shards ---
+    if shard in PRIMARY_DIMENSIONS:
+        allowed = PRIMARY_DIMENSIONS[shard]
+        before = len(samples)
+        samples = [s for s in samples if s.dimension in allowed]
+        logger.info(
+            "Filtered %s samples to primary dimensions %s: %d → %d",
+            shard, sorted(allowed), before, len(samples),
+        )
+
     # --- Revalidate ---
     drift_objs, missing = revalidator(samples)
 
diff --git a/pipeline/validate/gcp.py b/pipeline/validate/gcp.py
index 3f4c68d..c53d2d6 100644
--- a/pipeline/validate/gcp.py
+++ b/pipeline/validate/gcp.py
@@ -22,6 +22,19 @@
 _BILLING_BASE = "https://cloudbilling.googleapis.com/v1/services"
 _DRIFT_THRESHOLD = 0.01  # 1%
 
+# Must mirror ingest/gcp_common.py:_USAGE_UNITS divisors. Validator divides the
+# raw upstream unitPrice by this factor to match what ingest stored.
+_USAGE_UNIT_DIVISORS: dict[str, float] = {
+    "h": 1.0,
+    "GiBy.h": 1.0,
+    "GiBy.mo": 1.0,
+    "GiBy.d": 1.0 / 30.4375,
+    "By.mo": 1.0 / (1024**3),
+    "count": 1.0,
+    "s": 1.0,
+    "GiBy.s": 1.0,
+}
+
 # Default service ID used when callers don't specify (matches the legacy
 # behaviour of single-service GCE shards). The driver should always pass an
 # explicit service_id for known shards.
@@ -176,8 +189,14 @@ def _fetch_sku_price(
             nanos = int(up.get("nanos", 0))
             units = up.get("units", "0")
             price = _nanos_to_float(units, nanos)
-            if price > 0:
-                return price
+            if price <= 0:
+                continue
+            # Mirror ingest's unit normalization (pipeline/ingest/gcp_common.py).
+            # E.g. usageUnit="GiBy.d" → ingest emits per-month price (×30.4375),
+            # so validator must do the same to compare on the same axis.
+            usage_unit = expr.get("usageUnit", "")
+            divisor = _USAGE_UNIT_DIVISORS.get(usage_unit, 1.0)
+            return price / divisor
 
         page_token = data.get("nextPageToken", "")
         if not page_token: