Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions pipeline/tests/test_validate_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,3 +256,35 @@ def must_not_be_called(samples, **kwargs):
assert data["skip_reason"]
assert data["sample_size"] == 0
assert data["drift_records"] == []


def test_driver_filters_to_primary_dimensions(tmp_path: Path) -> None:
    """Samples whose dimension isn't in PRIMARY_DIMENSIONS[shard] are filtered out.

    A temporary ``PRIMARY_DIMENSIONS`` entry is registered for the test shard,
    the driver is run with a capturing revalidator, and every sample that
    reaches the revalidator must carry the allowed dimension.
    """
    from unittest import mock

    from validate.driver import PRIMARY_DIMENSIONS

    db = _make_minimal_shard(tmp_path)
    report_path = tmp_path / "report.json"

    seen_dims: list[str] = []

    def capture(samples, **kwargs):
        # Record what the driver hands to the revalidator; report no drift
        # and no missing SKUs so the run completes normally.
        seen_dims.extend(s.dimension for s in samples)
        return [], []

    # Inject a temporary primary-dim filter for the test shard so the existing
    # minimal fixture (whose dimension is "on-demand") survives.
    #
    # patch.dict snapshots the mapping and restores it on exit, even if
    # run_validation raises. A plain assign-then-del would permanently drop a
    # real "aws-ec2" entry should one ever be added to PRIMARY_DIMENSIONS,
    # leaking state into every test that runs afterwards.
    with mock.patch.dict(
        PRIMARY_DIMENSIONS, {"aws-ec2": frozenset({"on-demand"})}
    ):
        run_validation(
            shard="aws-ec2",
            shard_db=db,
            budget=10,
            report=report_path,
            revalidator=capture,
            seed=42,
        )

    assert seen_dims, "revalidator received no samples"
    assert all(d == "on-demand" for d in seen_dims)
58 changes: 58 additions & 0 deletions pipeline/tests/test_validate_gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,3 +346,61 @@ def test_gcp_gke_autopilot_matches_requested_dimension(
drift, missing = revalidate([sample], service_id=service_id)
assert drift == []
assert missing == []


def test_gcp_validator_applies_usage_unit_divisor(
    requests_mock: requests_mock_module.Mocker,
) -> None:
    """A per-day upstream price must be normalized before the drift check.

    For ``usageUnit == "GiBy.d"`` ingest converts the per-day price to a
    per-month one (divide by 1/30.4375, i.e. multiply by 30.4375), so an
    upstream price of 0.00053333 is stored in the catalog as roughly 0.01623.
    The validator has to apply the same divisor; otherwise it compares values
    on different axes and reports a spurious 30.4375x drift.
    """
    service_id = "95FF-2EF5-5EA1"
    nearline_sku = "6088-27E4-7DD4"
    hong_kong = "asia-east2"
    stored_monthly_price = 0.016233231875  # value ingest wrote to the catalog

    sample = Sample(
        sku_id=nearline_sku,
        region=hong_kong,
        resource_name="nearline",
        price_amount=stored_monthly_price,
        price_currency="USD",
        dimension="storage",
    )

    # Build the mocked Cloud Billing payload from the inside out.
    per_day_price = {
        "currencyCode": "USD",
        "units": "0",
        "nanos": 533330,  # 0.00053333 per GiB-day
    }
    pricing_expression = {
        "usageUnit": "GiBy.d",
        "tieredRates": [{"startUsageAmount": 0, "unitPrice": per_day_price}],
    }
    sku_entry = {
        "skuId": nearline_sku,
        "description": "Nearline Storage Hong Kong (Early Delete)",
        "serviceRegions": [hong_kong],
        "pricingInfo": [{"pricingExpression": pricing_expression}],
    }
    payload = {"skus": [sku_entry], "nextPageToken": ""}
    requests_mock.get(f"{_BASE_URL}/{service_id}/skus", json=payload)

    with patch("validate.gcp._get_bearer_token", return_value="token"):
        drift, missing = revalidate([sample], service_id=service_id)

    assert drift == [], f"unexpected drift after unit normalization: {drift}"
    assert missing == []
23 changes: 23 additions & 0 deletions pipeline/validate/driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,19 @@
}


# Some shards' ingest fans additional price dimensions in onto the row of a
# primary SKU (gcp-gcs, for example: the storage SKU row also carries ops
# prices fanned in from global SKUs). The upstream API only knows about the
# primary dimension, so that is the only one the validator can meaningfully
# compare; samples for every other dimension are filtered out before
# revalidation. Fan-in dimensions stay unvalidated until the validator can
# look them up against their actual source SKUs (tracked separately).
PRIMARY_DIMENSIONS: dict[str, frozenset[str]] = {
    # ops fanned-in from global SKUs
    "gcp-gcs": frozenset(["storage"]),
    # memory + requests fanned-in
    "gcp-run": frozenset(["cpu-second"]),
    # memory + requests fanned-in
    "gcp-functions": frozenset(["cpu-second"]),
}


# ---------------------------------------------------------------------------
# Types
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -184,6 +197,16 @@ def run_validation(
samples = sample(shard_db, budget=budget, seed=seed)
logger.info("Sampled %d rows from %s", len(samples), shard)

# --- Filter to primary dimensions for fan-in shards ---
if shard in PRIMARY_DIMENSIONS:
allowed = PRIMARY_DIMENSIONS[shard]
before = len(samples)
samples = [s for s in samples if s.dimension in allowed]
logger.info(
"Filtered %s samples to primary dimensions %s: %d → %d",
shard, sorted(allowed), before, len(samples),
)

# --- Revalidate ---
drift_objs, missing = revalidator(samples)

Expand Down
23 changes: 21 additions & 2 deletions pipeline/validate/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,19 @@
_BILLING_BASE = "https://cloudbilling.googleapis.com/v1/services"
_DRIFT_THRESHOLD = 0.01 # 1%

# Keep in sync with the divisors in ingest/gcp_common.py:_USAGE_UNITS. The
# validator divides the raw upstream unitPrice by the factor for its usageUnit
# so the result is on the same axis as the price ingest stored.
_USAGE_UNIT_DIVISORS: dict[str, float] = {
    # Units compared as-is (divisor of 1).
    **dict.fromkeys(("h", "GiBy.h", "GiBy.mo"), 1.0),
    # Per-day price: dividing by 1/30.4375 multiplies it by 30.4375.
    "GiBy.d": 1.0 / 30.4375,
    # Per-byte price: dividing by 1/1024**3 multiplies it by 1024**3.
    "By.mo": 1.0 / (1024**3),
    # Remaining units compared as-is (divisor of 1).
    **dict.fromkeys(("count", "s", "GiBy.s"), 1.0),
}

# Default service ID used when callers don't specify (matches the legacy
# behaviour of single-service GCE shards). The driver should always pass an
# explicit service_id for known shards.
Expand Down Expand Up @@ -176,8 +189,14 @@ def _fetch_sku_price(
nanos = int(up.get("nanos", 0))
units = up.get("units", "0")
price = _nanos_to_float(units, nanos)
if price > 0:
return price
if price <= 0:
continue
# Mirror ingest's unit normalization (pipeline/ingest/gcp_common.py).
# E.g. usageUnit="GiBy.d" → ingest emits per-month price (×30.4375),
# so validator must do the same to compare on the same axis.
usage_unit = expr.get("usageUnit", "")
divisor = _USAGE_UNIT_DIVISORS.get(usage_unit, 1.0)
return price / divisor

page_token = data.get("nextPageToken", "")
if not page_token:
Expand Down
Loading