From 4af8cbbac1dd1488e47c9989da948983058b3aac Mon Sep 17 00:00:00 2001
From: Anthony Volk
Date: Tue, 14 Apr 2026 22:37:36 +0200
Subject: [PATCH 01/25] Add traceability and scope fingerprinting

---
 modal_app/local_area.py                       | 137 +++++++---
 modal_app/pipeline.py                         |  27 +-
 .../calibration/local_h5/__init__.py          |   2 +-
 .../calibration/local_h5/fingerprinting.py    | 247 ++++++++++++++++++
 .../calibration/publish_local_area.py         |  72 ++---
 .../fixtures/test_local_h5_fingerprinting.py  | 164 ++++++++++++
 .../test_local_h5_fingerprinting.py           |  80 ++++++
 tests/unit/fixtures/test_modal_local_area.py  |  16 ++
 tests/unit/test_pipeline.py                   |  35 +++
 9 files changed, 696 insertions(+), 84 deletions(-)
 create mode 100644 policyengine_us_data/calibration/local_h5/fingerprinting.py
 create mode 100644 tests/unit/calibration/fixtures/test_local_h5_fingerprinting.py
 create mode 100644 tests/unit/calibration/test_local_h5_fingerprinting.py

diff --git a/modal_app/local_area.py b/modal_app/local_area.py
index 0beafee5c..e5045582d 100644
--- a/modal_app/local_area.py
+++ b/modal_app/local_area.py
@@ -29,6 +29,10 @@
 from modal_app.images import cpu_image as image  # noqa: E402
 from modal_app.resilience import reconcile_run_dir_fingerprint  # noqa: E402
+from policyengine_us_data.calibration.local_h5.fingerprinting import (  # noqa: E402
+    FingerprintingService,
+    PublishingInputBundle,
+)
 from policyengine_us_data.calibration.local_h5.partitioning import (  # noqa: E402
     partition_weighted_work_items,
 )
@@ -311,6 +315,65 @@ def get_version() -> str:
     return pyproject["project"]["version"]
 
 
+def _build_publishing_input_bundle(
+    *,
+    weights_path: Path,
+    dataset_path: Path,
+    db_path: Path | None,
+    geography_path: Path | None,
+    calibration_package_path: Path | None,
+    run_config_path: Path | None,
+    run_id: str,
+    version: str,
+    n_clones: int | None,
+    seed: int,
+    legacy_blocks_path: Path | None = None,
+) -> PublishingInputBundle:
+    """Build the normalized coordinator input bundle for one publish scope."""
+
+    return PublishingInputBundle(
+        weights_path=weights_path,
+        source_dataset_path=dataset_path,
+        target_db_path=db_path,
+        exact_geography_path=geography_path,
+        calibration_package_path=calibration_package_path,
+        run_config_path=run_config_path,
+        run_id=run_id,
+        version=version,
+        n_clones=n_clones,
+        seed=seed,
+        legacy_blocks_path=legacy_blocks_path,
+    )
+
+
+def _resolve_scope_fingerprint(
+    *,
+    inputs: PublishingInputBundle,
+    scope: str,
+    expected_fingerprint: str = "",
+) -> str:
+    """Compute the scope fingerprint while preserving pinned resume values."""
+
+    service = FingerprintingService()
+    traceability = service.build_traceability(inputs=inputs, scope=scope)
+    computed_fingerprint = service.compute_scope_fingerprint(traceability)
+    if expected_fingerprint:
+        if expected_fingerprint != computed_fingerprint:
+            print(
+                "WARNING: Pinned fingerprint differs from current "
+                f"{scope} scope fingerprint. "
" + "Preserving pinned value for backward-compatible resume.\n" + f" Pinned: {expected_fingerprint}\n" + f" Current: {computed_fingerprint}" + ) + else: + print( + f"Using pinned fingerprint from pipeline: {expected_fingerprint}" + ) + return expected_fingerprint + return computed_fingerprint + + def partition_work( work_items: List[Dict], num_workers: int, @@ -836,45 +899,26 @@ def coordinate_publish( validate = False # Fingerprint-based cache invalidation - if expected_fingerprint: - fingerprint = expected_fingerprint - print(f"Using pinned fingerprint from pipeline: {fingerprint}") - else: - geography_path_expr = ( - f'Path("{geography_path}")' if geography_path.exists() else "None" - ) - package_path_expr = ( - f'Path("{calibration_package_path}")' - if calibration_package_path.exists() - else "None" - ) - fp_result = subprocess.run( - _python_cmd( - "-c", - f""" -from pathlib import Path -from policyengine_us_data.calibration.publish_local_area import ( - compute_input_fingerprint, -) -print( - compute_input_fingerprint( - Path("{weights_path}"), - Path("{dataset_path}"), - {n_clones}, + fingerprint_inputs = _build_publishing_input_bundle( + weights_path=weights_path, + dataset_path=dataset_path, + db_path=db_path, + geography_path=geography_path, + calibration_package_path=( + calibration_package_path if calibration_package_path.exists() else None + ), + run_config_path=config_json_path if config_json_path.exists() else None, + run_id=run_id, + version=version, + n_clones=n_clones, seed=42, - geography_path={geography_path_expr}, - calibration_package_path={package_path_expr}, + legacy_blocks_path=artifacts / "stacked_blocks.npy", + ) + fingerprint = _resolve_scope_fingerprint( + inputs=fingerprint_inputs, + scope="regional", + expected_fingerprint=expected_fingerprint, ) -) -""", - ), - capture_output=True, - text=True, - env=os.environ.copy(), - ) - if fp_result.returncode != 0: - raise RuntimeError(f"Failed to compute fingerprint: {fp_result.stderr}") - fingerprint = fp_result.stdout.strip() reconcile_action = reconcile_run_dir_fingerprint(run_dir, fingerprint) if reconcile_action == "resume": print(f"Inputs unchanged ({fingerprint}), resuming...") @@ -1064,6 +1108,7 @@ def coordinate_national_publish( n_clones: int = 430, validate: bool = True, run_id: str = "", + expected_fingerprint: str = "", ) -> Dict: """Build and upload a national US.h5 from national weights.""" setup_gcp_credentials() @@ -1123,6 +1168,23 @@ def coordinate_national_publish( "geography_assignment.npz": "national_geography_assignment.npz", }, ) + fingerprint_inputs = _build_publishing_input_bundle( + weights_path=weights_path, + dataset_path=dataset_path, + db_path=db_path, + geography_path=geography_path, + calibration_package_path=None, + run_config_path=config_json_path if config_json_path.exists() else None, + run_id=run_id, + version=version, + n_clones=n_clones, + seed=42, + ) + fingerprint = _resolve_scope_fingerprint( + inputs=fingerprint_inputs, + scope="national", + expected_fingerprint=expected_fingerprint, + ) run_dir = staging_dir / run_id run_dir.mkdir(parents=True, exist_ok=True) @@ -1224,6 +1286,7 @@ def coordinate_national_publish( f"{version}. Run main_national_promote to publish." 
), "run_id": run_id, + "fingerprint": fingerprint, "national_validation": national_validation_output, } diff --git a/modal_app/pipeline.py b/modal_app/pipeline.py index 94e20d9e6..c96a428d3 100644 --- a/modal_app/pipeline.py +++ b/modal_app/pipeline.py @@ -109,12 +109,26 @@ class RunMetadata: error: Optional[str] = None resume_history: list = field(default_factory=list) fingerprint: Optional[str] = None + regional_fingerprint: Optional[str] = None + national_fingerprint: Optional[str] = None + + def __post_init__(self) -> None: + if self.regional_fingerprint is None and self.fingerprint is not None: + self.regional_fingerprint = self.fingerprint + if self.fingerprint is None and self.regional_fingerprint is not None: + self.fingerprint = self.regional_fingerprint def to_dict(self) -> dict: - return asdict(self) + data = asdict(self) + if data.get("fingerprint") is None and data.get("regional_fingerprint") is not None: + data["fingerprint"] = data["regional_fingerprint"] + return data @classmethod def from_dict(cls, data: dict) -> "RunMetadata": + data = dict(data) + if data.get("regional_fingerprint") is None and data.get("fingerprint") is not None: + data["regional_fingerprint"] = data["fingerprint"] return cls(**data) @@ -1000,7 +1014,9 @@ def run_pipeline( n_clones=n_clones, validate=True, run_id=run_id, - expected_fingerprint=meta.fingerprint or "", + expected_fingerprint=( + meta.regional_fingerprint or meta.fingerprint or "" + ), ) print(f" → coordinate_publish fc: {regional_h5_handle.object_id}") @@ -1012,6 +1028,7 @@ def run_pipeline( n_clones=n_clones, validate=True, run_id=run_id, + expected_fingerprint=meta.national_fingerprint or "", ) print( f" → coordinate_national_publish fc: {national_h5_handle.object_id}" @@ -1036,6 +1053,7 @@ def run_pipeline( if isinstance(regional_h5_result, dict) and regional_h5_result.get( "fingerprint" ): + meta.regional_fingerprint = regional_h5_result["fingerprint"] meta.fingerprint = regional_h5_result["fingerprint"] write_run_meta(meta, pipeline_volume) @@ -1049,6 +1067,11 @@ def run_pipeline( else national_h5_result ) print(f" National H5: {national_msg}") + if isinstance(national_h5_result, dict) and national_h5_result.get( + "fingerprint" + ): + meta.national_fingerprint = national_h5_result["fingerprint"] + write_run_meta(meta, pipeline_volume) # ── Aggregate validation results ── _write_validation_diagnostics( diff --git a/policyengine_us_data/calibration/local_h5/__init__.py b/policyengine_us_data/calibration/local_h5/__init__.py index f69663eb0..96ec7258f 100644 --- a/policyengine_us_data/calibration/local_h5/__init__.py +++ b/policyengine_us_data/calibration/local_h5/__init__.py @@ -3,5 +3,5 @@ Modules in this package should land only when they become active runtime seams rather than speculative placeholders. The current early slices introduce ``partitioning.py``, ``requests.py``, ``area_catalog.py``, -and ``geography_loader.py``. +``fingerprinting.py``, and ``geography_loader.py``. 
""" diff --git a/policyengine_us_data/calibration/local_h5/fingerprinting.py b/policyengine_us_data/calibration/local_h5/fingerprinting.py new file mode 100644 index 000000000..6bff37af0 --- /dev/null +++ b/policyengine_us_data/calibration/local_h5/fingerprinting.py @@ -0,0 +1,247 @@ +"""Coordinator-owned provenance and resumability logic for local H5 publication.""" + +from __future__ import annotations + +import hashlib +import json +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Literal, Mapping + +from .geography_loader import CalibrationGeographyLoader + +FingerprintScope = Literal["regional", "national"] + + +@dataclass(frozen=True) +class PublishingInputBundle: + """File-system and run metadata needed to publish one H5 scope.""" + + weights_path: Path + source_dataset_path: Path + target_db_path: Path | None + exact_geography_path: Path | None + calibration_package_path: Path | None + run_config_path: Path | None + run_id: str + version: str + n_clones: int | None + seed: int + legacy_blocks_path: Path | None = None + + +@dataclass(frozen=True) +class ArtifactIdentity: + """Stable identity for one input artifact used by traceability and resume.""" + + logical_name: str + path: Path | None + sha256: str | None + size_bytes: int | None = None + metadata: Mapping[str, Any] = field(default_factory=dict) + + +@dataclass(frozen=True) +class TraceabilityBundle: + """Full provenance record for one publish scope.""" + + scope: FingerprintScope + weights: ArtifactIdentity + source_dataset: ArtifactIdentity + exact_geography: ArtifactIdentity | None = None + target_db: ArtifactIdentity | None = None + calibration_package: ArtifactIdentity | None = None + run_config: ArtifactIdentity | None = None + code_version: Mapping[str, Any] = field(default_factory=dict) + model_build: Mapping[str, Any] = field(default_factory=dict) + metadata: Mapping[str, Any] = field(default_factory=dict) + + def resumability_material(self) -> Mapping[str, Any]: + """Return the normalized subset that controls staged-output validity.""" + + geography_sha = None + if self.exact_geography is not None: + geography_sha = self.exact_geography.metadata.get("canonical_sha256") + if geography_sha is None: + geography_sha = self.exact_geography.sha256 + + return { + "scope": self.scope, + "weights_sha256": self.weights.sha256, + "source_dataset_sha256": self.source_dataset.sha256, + "exact_geography_sha256": geography_sha, + "target_db_sha256": ( + self.target_db.sha256 if self.target_db is not None else None + ), + "n_clones": self.metadata.get("n_clones"), + "seed": self.metadata.get("seed"), + "policyengine_us_locked_version": self.model_build.get("locked_version"), + "policyengine_us_git_commit": self.model_build.get("git_commit"), + } + + +class FingerprintingService: + """Build traceability bundles and derive scope fingerprints from them.""" + + def __init__( + self, + *, + geography_loader: CalibrationGeographyLoader | None = None, + ) -> None: + self._geography_loader = geography_loader or CalibrationGeographyLoader() + + def build_traceability( + self, + *, + inputs: PublishingInputBundle, + scope: FingerprintScope, + ) -> TraceabilityBundle: + """Build a traceability bundle from current publish inputs.""" + + run_config_payload = self._load_json(inputs.run_config_path) + return TraceabilityBundle( + scope=scope, + weights=self._build_artifact_identity("weights", inputs.weights_path), + source_dataset=self._build_artifact_identity( + "source_dataset", + 
+                inputs.source_dataset_path,
+            ),
+            exact_geography=self._build_geography_identity(inputs),
+            target_db=self._build_optional_artifact_identity(
+                "target_db",
+                inputs.target_db_path,
+            ),
+            calibration_package=self._build_optional_artifact_identity(
+                "calibration_package",
+                inputs.calibration_package_path,
+            ),
+            run_config=self._build_optional_artifact_identity(
+                "run_config",
+                inputs.run_config_path,
+            ),
+            code_version=self._extract_code_version(run_config_payload),
+            model_build=self._extract_model_build(run_config_payload),
+            metadata={
+                "run_id": inputs.run_id,
+                "version": inputs.version,
+                "n_clones": inputs.n_clones,
+                "seed": inputs.seed,
+            },
+        )
+
+    def compute_scope_fingerprint(self, traceability: TraceabilityBundle) -> str:
+        """Hash normalized resumability material into a short scope fingerprint."""
+
+        payload = json.dumps(
+            traceability.resumability_material(),
+            sort_keys=True,
+            separators=(",", ":"),
+        ).encode()
+        return hashlib.sha256(payload).hexdigest()[:16]
+
+    def _build_artifact_identity(
+        self,
+        logical_name: str,
+        path: Path,
+        *,
+        metadata: Mapping[str, Any] | None = None,
+    ) -> ArtifactIdentity:
+        actual_path = Path(path)
+        if not actual_path.exists():
+            raise FileNotFoundError(f"Expected {logical_name} artifact at {actual_path}")
+        return ArtifactIdentity(
+            logical_name=logical_name,
+            path=actual_path,
+            sha256=self._sha256_file(actual_path),
+            size_bytes=actual_path.stat().st_size,
+            metadata=dict(metadata or {}),
+        )
+
+    def _build_optional_artifact_identity(
+        self,
+        logical_name: str,
+        path: Path | None,
+    ) -> ArtifactIdentity | None:
+        if path is None:
+            return None
+        actual_path = Path(path)
+        if not actual_path.exists():
+            return None
+        return self._build_artifact_identity(logical_name, actual_path)
+
+    def _build_geography_identity(
+        self,
+        inputs: PublishingInputBundle,
+    ) -> ArtifactIdentity | None:
+        resolved = self._geography_loader.resolve_source(
+            weights_path=inputs.weights_path,
+            geography_path=inputs.exact_geography_path,
+            blocks_path=inputs.legacy_blocks_path,
+            calibration_package_path=inputs.calibration_package_path,
+        )
+        if resolved is None:
+            return None
+
+        metadata = {
+            "source_kind": resolved.kind,
+            "canonical_sha256": self._geography_loader.compute_canonical_checksum(
+                weights_path=inputs.weights_path,
+                n_records=self._infer_n_records(inputs.source_dataset_path),
+                n_clones=inputs.n_clones,
+                geography_path=inputs.exact_geography_path,
+                blocks_path=inputs.legacy_blocks_path,
+                calibration_package_path=inputs.calibration_package_path,
+            ),
+        }
+        return self._build_artifact_identity(
+            "exact_geography",
+            resolved.path,
+            metadata=metadata,
+        )
+
+    def _extract_code_version(self, run_config_payload: Mapping[str, Any]) -> dict[str, Any]:
+        return {
+            "git_commit": run_config_payload.get("git_commit"),
+            "git_branch": run_config_payload.get("git_branch"),
+            "git_dirty": run_config_payload.get("git_dirty"),
+        }
+
+    def _extract_model_build(self, run_config_payload: Mapping[str, Any]) -> dict[str, Any]:
+        return {
+            "locked_version": run_config_payload.get("package_version"),
+            "git_commit": run_config_payload.get("git_commit"),
+        }
+
+    def _load_json(self, path: Path | None) -> Mapping[str, Any]:
+        if path is None:
+            return {}
+        actual_path = Path(path)
+        if not actual_path.exists():
+            return {}
+        with open(actual_path) as handle:
+            return json.load(handle)
+
+    def _sha256_file(self, path: Path) -> str:
+        digest = hashlib.sha256()
+        with open(path, "rb") as handle:
+            for chunk in iter(lambda: handle.read(1 << 20), b""):
+                digest.update(chunk)
+        return f"sha256:{digest.hexdigest()}"
+
+    def _infer_n_records(self, source_dataset_path: Path) -> int:
+        import h5py
+
+        with h5py.File(source_dataset_path, "r") as handle:
+            if "person" not in handle:
+                raise ValueError(
+                    f"Unable to infer n_records from {source_dataset_path}: "
+                    "missing 'person' entity"
+                )
+            person_group = handle["person"]
+            first_dataset_name = next(iter(person_group.keys()), None)
+            if first_dataset_name is None:
+                raise ValueError(
+                    f"Unable to infer n_records from {source_dataset_path}: "
+                    "'person' entity is empty"
+                )
+            return int(len(person_group[first_dataset_name]))
diff --git a/policyengine_us_data/calibration/publish_local_area.py b/policyengine_us_data/calibration/publish_local_area.py
index b1946a8f3..785fbafc8 100644
--- a/policyengine_us_data/calibration/publish_local_area.py
+++ b/policyengine_us_data/calibration/publish_local_area.py
@@ -11,12 +11,15 @@
 import json
 import shutil
-
 import numpy as np
 from pathlib import Path
 from typing import List, Optional
 
 from policyengine_us import Microsimulation
+from policyengine_us_data.calibration.local_h5.fingerprinting import (
+    FingerprintingService,
+    PublishingInputBundle,
+)
 from policyengine_us_data.calibration.local_h5.geography_loader import (
     CalibrationGeographyLoader,
 )
@@ -48,8 +51,6 @@
 META_FILE = WORK_DIR / "checkpoint_meta.json"
-
-
 def compute_input_fingerprint(
     weights_path: Path,
     dataset_path: Path,
@@ -57,50 +58,33 @@ def compute_input_fingerprint(
     seed: int = 42,
     geography_path: Optional[Path] = None,
     blocks_path: Optional[Path] = None,
+    target_db_path: Optional[Path] = None,
+    run_config_path: Optional[Path] = None,
     calibration_package_path: Optional[Path] = None,
+    scope: str = "regional",
 ) -> str:
-    import hashlib
-
-    def _update_hash_from_file(h: "hashlib._Hash", path: Path) -> None:
-        with open(path, "rb") as f:
-            while chunk := f.read(8192):
-                h.update(chunk)
-
-    def _infer_n_records() -> int:
-        if n_clones is not None:
-            weights = np.load(weights_path, mmap_mode="r")
-            if len(weights) % n_clones == 0:
-                return len(weights) // n_clones
-        sim = Microsimulation(dataset=str(dataset_path))
-        return len(sim.calculate("household_id", map_to="household").values)
-
-    loader = CalibrationGeographyLoader()
-    h = hashlib.sha256()
-    for p in [weights_path, dataset_path]:
-        _update_hash_from_file(h, p)
-
-    resolved = loader.resolve_source(
-        weights_path=weights_path,
-        geography_path=geography_path,
-        blocks_path=blocks_path,
-        calibration_package_path=calibration_package_path,
+    service = FingerprintingService()
+    inputs = PublishingInputBundle(
+        weights_path=Path(weights_path),
+        source_dataset_path=Path(dataset_path),
+        target_db_path=Path(target_db_path) if target_db_path is not None else None,
+        exact_geography_path=(
+            Path(geography_path) if geography_path is not None else None
+        ),
+        calibration_package_path=(
+            Path(calibration_package_path)
+            if calibration_package_path is not None
+            else None
+        ),
+        run_config_path=Path(run_config_path) if run_config_path is not None else None,
+        run_id="",
+        version="",
+        n_clones=n_clones,
+        seed=seed,
+        legacy_blocks_path=Path(blocks_path) if blocks_path is not None else None,
     )
-    if resolved is not None:
-        n_records = _infer_n_records()
-        h.update(f"geography_source:{resolved.kind}".encode())
-        h.update(
-            loader.compute_canonical_checksum(
-                weights_path=weights_path,
-                n_records=n_records,
-                n_clones=n_clones,
-                geography_path=geography_path,
-                blocks_path=blocks_path,
-                calibration_package_path=calibration_package_path,
-            ).encode()
-        )
-    else:
-        h.update(f"legacy_regeneration:{n_clones}:{seed}".encode())
-    return h.hexdigest()[:16]
+    traceability = service.build_traceability(inputs=inputs, scope=scope)
+    return service.compute_scope_fingerprint(traceability)
 
 
 def load_calibration_geography(
diff --git a/tests/unit/calibration/fixtures/test_local_h5_fingerprinting.py b/tests/unit/calibration/fixtures/test_local_h5_fingerprinting.py
new file mode 100644
index 000000000..d8bba2148
--- /dev/null
+++ b/tests/unit/calibration/fixtures/test_local_h5_fingerprinting.py
@@ -0,0 +1,164 @@
+"""Fixture helpers for ``test_local_h5_fingerprinting.py``."""
+
+from __future__ import annotations
+
+import importlib.util
+import json
+import sys
+from pathlib import Path
+from types import ModuleType
+
+import h5py
+import numpy as np
+
+from tests.unit.calibration.fixtures.test_local_h5_geography_loader import (
+    write_saved_geography,
+)
+
+__test__ = False
+
+
+def _ensure_package(name: str, path: Path) -> None:
+    """Register a synthetic package so relative imports resolve locally."""
+
+    package = sys.modules.get(name)
+    if package is None:
+        package = ModuleType(name)
+        package.__path__ = [str(path)]
+        sys.modules[name] = package
+        return
+    package.__path__ = [str(path)]
+
+
+def _load_module(name: str, path: Path):
+    """Load one module from disk under a specific fully-qualified name."""
+
+    sys.modules.pop(name, None)
+    spec = importlib.util.spec_from_file_location(name, path)
+    module = importlib.util.module_from_spec(spec)
+    assert spec is not None
+    assert spec.loader is not None
+    sys.modules[name] = module
+    spec.loader.exec_module(module)
+    return module
+
+
+def load_fingerprinting_exports():
+    """Load the local H5 fingerprinting module under a synthetic package name."""
+
+    repo_root = Path(__file__).resolve().parents[4]
+    local_h5_root = (
+        repo_root
+        / "policyengine_us_data"
+        / "calibration"
+        / "local_h5"
+    )
+    calibration_root = repo_root / "policyengine_us_data" / "calibration"
+    storage_root = repo_root / "policyengine_us_data" / "storage"
+    package_name = "local_h5_fingerprinting_fixture"
+    policyengine_package = "policyengine_us_data"
+    calibration_package = "policyengine_us_data.calibration"
+
+    for name in list(sys.modules):
+        if (
+            name == package_name
+            or name.startswith(f"{package_name}.")
+            or name == policyengine_package
+            or name.startswith(f"{policyengine_package}.")
+        ):
+            sys.modules.pop(name, None)
+
+    _ensure_package(package_name, local_h5_root)
+    _ensure_package(policyengine_package, repo_root / "policyengine_us_data")
+    _ensure_package(calibration_package, calibration_root)
+    _load_module(
+        "policyengine_us_data.storage",
+        storage_root / "__init__.py",
+    )
+    _load_module(
+        "policyengine_us_data.calibration.clone_and_assign",
+        calibration_root / "clone_and_assign.py",
+    )
+    _load_module(
+        f"{package_name}.geography_loader",
+        local_h5_root / "geography_loader.py",
+    )
+    module = _load_module(
+        f"{package_name}.fingerprinting",
+        local_h5_root / "fingerprinting.py",
+    )
+    return {
+        "module": module,
+        "ArtifactIdentity": module.ArtifactIdentity,
+        "FingerprintingService": module.FingerprintingService,
+        "PublishingInputBundle": module.PublishingInputBundle,
+        "TraceabilityBundle": module.TraceabilityBundle,
+    }
+
+
+def write_source_dataset(path: Path, *, n_records: int) -> None:
+    """Write a minimal HDF5 dataset with a ``person`` entity."""
+
+    with h5py.File(path, "w") as handle:
+        person = handle.create_group("person")
person.create_dataset("person_id", data=np.arange(n_records, dtype=np.int32)) + + +def write_run_config(path: Path, *, package_version: str = "1.0.0") -> None: + """Write a minimal run-config payload with provenance fields.""" + + payload = { + "git_commit": "deadbeefcafebabe", + "git_branch": "main", + "git_dirty": False, + "package_version": package_version, + } + path.write_text(json.dumps(payload)) + + +def write_artifact_file(path: Path, content: bytes) -> None: + """Write one small binary artifact for traceability tests.""" + + path.write_bytes(content) + + +def make_publishing_inputs( + bundle_cls, + *, + tmp_path: Path, + n_records: int = 2, + n_clones: int = 2, + seed: int = 42, + package_version: str = "1.0.0", +): + """Create a fully-populated publishing input bundle for tests.""" + + tmp_path.mkdir(parents=True, exist_ok=True) + weights_path = tmp_path / "calibration_weights.npy" + dataset_path = tmp_path / "source.h5" + db_path = tmp_path / "policy_data.db" + geography_path = tmp_path / "geography_assignment.npz" + run_config_path = tmp_path / "unified_run_config.json" + + np.save(weights_path, np.array([1.0, 2.0, 3.0])) + write_source_dataset(dataset_path, n_records=n_records) + write_artifact_file(db_path, b"fake-db") + write_saved_geography( + geography_path, + n_records=n_records, + n_clones=n_clones, + ) + write_run_config(run_config_path, package_version=package_version) + + return bundle_cls( + weights_path=weights_path, + source_dataset_path=dataset_path, + target_db_path=db_path, + exact_geography_path=geography_path, + calibration_package_path=None, + run_config_path=run_config_path, + run_id="run-123", + version="1.2.3", + n_clones=n_clones, + seed=seed, + ) diff --git a/tests/unit/calibration/test_local_h5_fingerprinting.py b/tests/unit/calibration/test_local_h5_fingerprinting.py new file mode 100644 index 000000000..180e242f1 --- /dev/null +++ b/tests/unit/calibration/test_local_h5_fingerprinting.py @@ -0,0 +1,80 @@ +from tests.unit.calibration.fixtures.test_local_h5_fingerprinting import ( + load_fingerprinting_exports, + make_publishing_inputs, +) + + +exports = load_fingerprinting_exports() +FingerprintingService = exports["FingerprintingService"] +PublishingInputBundle = exports["PublishingInputBundle"] + + +def test_build_traceability_captures_artifact_identity_and_metadata(tmp_path): + inputs = make_publishing_inputs(PublishingInputBundle, tmp_path=tmp_path) + + service = FingerprintingService() + traceability = service.build_traceability(inputs=inputs, scope="regional") + + assert traceability.scope == "regional" + assert traceability.weights.path == inputs.weights_path + assert traceability.weights.sha256.startswith("sha256:") + assert traceability.source_dataset.sha256.startswith("sha256:") + assert traceability.exact_geography is not None + assert traceability.exact_geography.metadata["source_kind"] == "saved_geography" + assert traceability.exact_geography.metadata["canonical_sha256"].startswith( + "sha256:" + ) + assert traceability.target_db is not None + assert traceability.model_build["locked_version"] == "1.0.0" + assert traceability.metadata["n_clones"] == 2 + assert traceability.metadata["seed"] == 42 + + +def test_scope_fingerprint_differs_between_regional_and_national(tmp_path): + inputs = make_publishing_inputs(PublishingInputBundle, tmp_path=tmp_path) + + service = FingerprintingService() + regional = service.compute_scope_fingerprint( + service.build_traceability(inputs=inputs, scope="regional") + ) + national = 
service.compute_scope_fingerprint( + service.build_traceability(inputs=inputs, scope="national") + ) + + assert regional != national + + +def test_scope_fingerprint_is_stable_for_identical_inputs(tmp_path): + inputs = make_publishing_inputs(PublishingInputBundle, tmp_path=tmp_path) + + service = FingerprintingService() + first = service.compute_scope_fingerprint( + service.build_traceability(inputs=inputs, scope="regional") + ) + second = service.compute_scope_fingerprint( + service.build_traceability(inputs=inputs, scope="regional") + ) + + assert first == second + + +def test_scope_fingerprint_changes_when_relevant_provenance_changes(tmp_path): + first_inputs = make_publishing_inputs( + PublishingInputBundle, + tmp_path=tmp_path / "first", + ) + second_inputs = make_publishing_inputs( + PublishingInputBundle, + tmp_path=tmp_path / "second", + ) + second_inputs.target_db_path.write_bytes(b"changed-db") + + service = FingerprintingService() + first = service.compute_scope_fingerprint( + service.build_traceability(inputs=first_inputs, scope="regional") + ) + second = service.compute_scope_fingerprint( + service.build_traceability(inputs=second_inputs, scope="regional") + ) + + assert first != second diff --git a/tests/unit/fixtures/test_modal_local_area.py b/tests/unit/fixtures/test_modal_local_area.py index 377e879ae..935da8d6e 100644 --- a/tests/unit/fixtures/test_modal_local_area.py +++ b/tests/unit/fixtures/test_modal_local_area.py @@ -41,6 +41,9 @@ def load_local_area_module(): fake_partitioning = ModuleType( "policyengine_us_data.calibration.local_h5.partitioning" ) + fake_fingerprinting = ModuleType( + "policyengine_us_data.calibration.local_h5.fingerprinting" + ) fake_policyengine.__path__ = [] fake_calibration.__path__ = [] fake_local_h5.__path__ = [] @@ -71,6 +74,16 @@ def decorator(func): fake_resilience = ModuleType("modal_app.resilience") fake_resilience.reconcile_run_dir_fingerprint = lambda *args, **kwargs: None fake_partitioning.partition_weighted_work_items = lambda *args, **kwargs: [] + fake_fingerprinting.PublishingInputBundle = object + + class _FakeFingerprintingService: + def build_traceability(self, *args, **kwargs): + return object() + + def compute_scope_fingerprint(self, *args, **kwargs): + return "fake-fingerprint" + + fake_fingerprinting.FingerprintingService = _FakeFingerprintingService with _patched_module_registry( { @@ -80,6 +93,9 @@ def decorator(func): "policyengine_us_data": fake_policyengine, "policyengine_us_data.calibration": fake_calibration, "policyengine_us_data.calibration.local_h5": fake_local_h5, + "policyengine_us_data.calibration.local_h5.fingerprinting": ( + fake_fingerprinting + ), "policyengine_us_data.calibration.local_h5.partitioning": ( fake_partitioning ), diff --git a/tests/unit/test_pipeline.py b/tests/unit/test_pipeline.py index c287f5940..72c545676 100644 --- a/tests/unit/test_pipeline.py +++ b/tests/unit/test_pipeline.py @@ -66,6 +66,23 @@ def test_from_dict(self): assert meta.status == "completed" assert meta.step_timings["build_datasets"]["status"] == "completed" + def test_from_dict_maps_legacy_fingerprint_to_regional_scope(self): + meta = RunMetadata.from_dict( + { + "run_id": "test", + "branch": "main", + "sha": "abc12345deadbeef", + "version": "1.72.3", + "start_time": "2026-03-19T12:00:00Z", + "status": "running", + "fingerprint": "legacy-fingerprint", + } + ) + + assert meta.fingerprint == "legacy-fingerprint" + assert meta.regional_fingerprint == "legacy-fingerprint" + assert meta.national_fingerprint is None + def 
test_roundtrip(self): meta = RunMetadata( run_id="1.72.3_abc12345_20260319_120000", @@ -82,6 +99,24 @@ def test_roundtrip(self): assert roundtripped.status == meta.status assert roundtripped.error == meta.error + def test_to_dict_keeps_legacy_fingerprint_alias_in_sync(self): + meta = RunMetadata( + run_id="test", + branch="main", + sha="abc", + version="1.0.0", + start_time="now", + status="running", + regional_fingerprint="regional-fp", + national_fingerprint="national-fp", + ) + + payload = meta.to_dict() + + assert payload["fingerprint"] == "regional-fp" + assert payload["regional_fingerprint"] == "regional-fp" + assert payload["national_fingerprint"] == "national-fp" + def test_step_timings_default_empty(self): meta = RunMetadata( run_id="test", From 85ac71b7ccf1d7e973135736f9cdc65f81c9b846 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Thu, 16 Apr 2026 01:19:44 +0200 Subject: [PATCH 02/25] Tighten local H5 fingerprinting boundary --- modal_app/local_area.py | 2 - modal_app/pipeline.py | 7 -- .../calibration/local_h5/fingerprinting.py | 43 ++++---- .../fixtures/test_local_h5_fingerprinting.py | 99 +++++-------------- .../test_local_h5_fingerprinting.py | 18 ++++ tests/unit/test_pipeline.py | 3 - 6 files changed, 69 insertions(+), 103 deletions(-) diff --git a/modal_app/local_area.py b/modal_app/local_area.py index e5045582d..97499f08b 100644 --- a/modal_app/local_area.py +++ b/modal_app/local_area.py @@ -1108,7 +1108,6 @@ def coordinate_national_publish( n_clones: int = 430, validate: bool = True, run_id: str = "", - expected_fingerprint: str = "", ) -> Dict: """Build and upload a national US.h5 from national weights.""" setup_gcp_credentials() @@ -1183,7 +1182,6 @@ def coordinate_national_publish( fingerprint = _resolve_scope_fingerprint( inputs=fingerprint_inputs, scope="national", - expected_fingerprint=expected_fingerprint, ) run_dir = staging_dir / run_id run_dir.mkdir(parents=True, exist_ok=True) diff --git a/modal_app/pipeline.py b/modal_app/pipeline.py index c96a428d3..ed34f3905 100644 --- a/modal_app/pipeline.py +++ b/modal_app/pipeline.py @@ -110,7 +110,6 @@ class RunMetadata: resume_history: list = field(default_factory=list) fingerprint: Optional[str] = None regional_fingerprint: Optional[str] = None - national_fingerprint: Optional[str] = None def __post_init__(self) -> None: if self.regional_fingerprint is None and self.fingerprint is not None: @@ -1028,7 +1027,6 @@ def run_pipeline( n_clones=n_clones, validate=True, run_id=run_id, - expected_fingerprint=meta.national_fingerprint or "", ) print( f" → coordinate_national_publish fc: {national_h5_handle.object_id}" @@ -1067,11 +1065,6 @@ def run_pipeline( else national_h5_result ) print(f" National H5: {national_msg}") - if isinstance(national_h5_result, dict) and national_h5_result.get( - "fingerprint" - ): - meta.national_fingerprint = national_h5_result["fingerprint"] - write_run_meta(meta, pipeline_volume) # ── Aggregate validation results ── _write_validation_diagnostics( diff --git a/policyengine_us_data/calibration/local_h5/fingerprinting.py b/policyengine_us_data/calibration/local_h5/fingerprinting.py index 6bff37af0..8f401e582 100644 --- a/policyengine_us_data/calibration/local_h5/fingerprinting.py +++ b/policyengine_us_data/calibration/local_h5/fingerprinting.py @@ -186,7 +186,11 @@ def _build_geography_identity( "source_kind": resolved.kind, "canonical_sha256": self._geography_loader.compute_canonical_checksum( weights_path=inputs.weights_path, - n_records=self._infer_n_records(inputs.source_dataset_path), + 
n_records=self._infer_n_records( + weights_path=inputs.weights_path, + source_dataset_path=inputs.source_dataset_path, + n_clones=inputs.n_clones, + ), n_clones=inputs.n_clones, geography_path=inputs.exact_geography_path, blocks_path=inputs.legacy_blocks_path, @@ -228,20 +232,23 @@ def _sha256_file(self, path: Path) -> str: digest.update(chunk) return f"sha256:{digest.hexdigest()}" - def _infer_n_records(self, source_dataset_path: Path) -> int: - import h5py - - with h5py.File(source_dataset_path, "r") as handle: - if "person" not in handle: - raise ValueError( - f"Unable to infer n_records from {source_dataset_path}: " - "missing 'person' entity" - ) - person_group = handle["person"] - first_dataset_name = next(iter(person_group.keys()), None) - if first_dataset_name is None: - raise ValueError( - f"Unable to infer n_records from {source_dataset_path}: " - "'person' entity is empty" - ) - return int(len(person_group[first_dataset_name])) + def _infer_n_records( + self, + *, + weights_path: Path, + source_dataset_path: Path, + n_clones: int | None, + ) -> int: + if n_clones is not None: + import numpy as np + + weights = np.load(weights_path, mmap_mode="r") + if len(weights) % n_clones == 0: + return int(len(weights) // n_clones) + + from policyengine_us import Microsimulation + + simulation = Microsimulation(dataset=str(source_dataset_path)) + return int( + len(simulation.calculate("household_id", map_to="household").values) + ) diff --git a/tests/unit/calibration/fixtures/test_local_h5_fingerprinting.py b/tests/unit/calibration/fixtures/test_local_h5_fingerprinting.py index d8bba2148..2ecffd000 100644 --- a/tests/unit/calibration/fixtures/test_local_h5_fingerprinting.py +++ b/tests/unit/calibration/fixtures/test_local_h5_fingerprinting.py @@ -2,11 +2,9 @@ from __future__ import annotations -import importlib.util +import importlib import json -import sys from pathlib import Path -from types import ModuleType import h5py import numpy as np @@ -17,91 +15,41 @@ __test__ = False +_FINGERPRINTING_EXPORTS = None -def _ensure_package(name: str, path: Path) -> None: - """Register a synthetic package so relative imports resolve locally.""" - package = sys.modules.get(name) - if package is None: - package = ModuleType(name) - package.__path__ = [str(path)] - sys.modules[name] = package - return - package.__path__ = [str(path)] - - -def _load_module(name: str, path: Path): - """Load one module from disk under a specific fully-qualified name.""" - - sys.modules.pop(name, None) - spec = importlib.util.spec_from_file_location(name, path) - module = importlib.util.module_from_spec(spec) - assert spec is not None - assert spec.loader is not None - sys.modules[name] = module - spec.loader.exec_module(module) - return module +def load_fingerprinting_exports(): + """Load the fingerprinting module without replacing shared package modules.""" + global _FINGERPRINTING_EXPORTS + if _FINGERPRINTING_EXPORTS is not None: + return _FINGERPRINTING_EXPORTS -def load_fingerprinting_exports(): - """Load the local H5 fingerprinting module under a synthetic package name.""" - - repo_root = Path(__file__).resolve().parents[4] - local_h5_root = ( - repo_root - / "policyengine_us_data" - / "calibration" - / "local_h5" - ) - calibration_root = repo_root / "policyengine_us_data" / "calibration" - storage_root = repo_root / "policyengine_us_data" / "storage" - package_name = "local_h5_fingerprinting_fixture" - policyengine_package = "policyengine_us_data" - calibration_package = "policyengine_us_data.calibration" - - for name 
in list(sys.modules): - if ( - name == package_name - or name.startswith(f"{package_name}.") - or name == policyengine_package - or name.startswith(f"{policyengine_package}.") - ): - sys.modules.pop(name, None) - - _ensure_package(package_name, local_h5_root) - _ensure_package(policyengine_package, repo_root / "policyengine_us_data") - _ensure_package(calibration_package, calibration_root) - _load_module( - "policyengine_us_data.storage", - storage_root / "__init__.py", + module = importlib.import_module( + "policyengine_us_data.calibration.local_h5.fingerprinting" ) - _load_module( - "policyengine_us_data.calibration.clone_and_assign", - calibration_root / "clone_and_assign.py", - ) - _load_module( - f"{package_name}.geography_loader", - local_h5_root / "geography_loader.py", - ) - module = _load_module( - f"{package_name}.fingerprinting", - local_h5_root / "fingerprinting.py", - ) - return { + _FINGERPRINTING_EXPORTS = { "module": module, "ArtifactIdentity": module.ArtifactIdentity, "FingerprintingService": module.FingerprintingService, "PublishingInputBundle": module.PublishingInputBundle, "TraceabilityBundle": module.TraceabilityBundle, } + return _FINGERPRINTING_EXPORTS -def write_source_dataset(path: Path, *, n_records: int) -> None: +def write_source_dataset( + path: Path, + *, + n_records: int, + person_records: int | None = None, +) -> None: """Write a minimal HDF5 dataset with a ``person`` entity.""" + person_count = person_records if person_records is not None else n_records with h5py.File(path, "w") as handle: person = handle.create_group("person") - person.create_dataset("person_id", data=np.arange(n_records, dtype=np.int32)) + person.create_dataset("person_id", data=np.arange(person_count, dtype=np.int32)) def write_run_config(path: Path, *, package_version: str = "1.0.0") -> None: @@ -127,6 +75,7 @@ def make_publishing_inputs( *, tmp_path: Path, n_records: int = 2, + person_records: int | None = None, n_clones: int = 2, seed: int = 42, package_version: str = "1.0.0", @@ -140,8 +89,12 @@ def make_publishing_inputs( geography_path = tmp_path / "geography_assignment.npz" run_config_path = tmp_path / "unified_run_config.json" - np.save(weights_path, np.array([1.0, 2.0, 3.0])) - write_source_dataset(dataset_path, n_records=n_records) + np.save(weights_path, np.arange(n_records * n_clones, dtype=float) + 1.0) + write_source_dataset( + dataset_path, + n_records=n_records, + person_records=person_records, + ) write_artifact_file(db_path, b"fake-db") write_saved_geography( geography_path, diff --git a/tests/unit/calibration/test_local_h5_fingerprinting.py b/tests/unit/calibration/test_local_h5_fingerprinting.py index 180e242f1..66f288738 100644 --- a/tests/unit/calibration/test_local_h5_fingerprinting.py +++ b/tests/unit/calibration/test_local_h5_fingerprinting.py @@ -78,3 +78,21 @@ def test_scope_fingerprint_changes_when_relevant_provenance_changes(tmp_path): ) assert first != second + + +def test_traceability_uses_weight_derived_household_count_for_geography(tmp_path): + inputs = make_publishing_inputs( + PublishingInputBundle, + tmp_path=tmp_path, + n_records=2, + person_records=5, + n_clones=2, + ) + + service = FingerprintingService() + traceability = service.build_traceability(inputs=inputs, scope="regional") + + assert traceability.exact_geography is not None + assert traceability.exact_geography.metadata["canonical_sha256"].startswith( + "sha256:" + ) diff --git a/tests/unit/test_pipeline.py b/tests/unit/test_pipeline.py index 72c545676..5e769caaf 100644 --- 
a/tests/unit/test_pipeline.py +++ b/tests/unit/test_pipeline.py @@ -81,7 +81,6 @@ def test_from_dict_maps_legacy_fingerprint_to_regional_scope(self): assert meta.fingerprint == "legacy-fingerprint" assert meta.regional_fingerprint == "legacy-fingerprint" - assert meta.national_fingerprint is None def test_roundtrip(self): meta = RunMetadata( @@ -108,14 +107,12 @@ def test_to_dict_keeps_legacy_fingerprint_alias_in_sync(self): start_time="now", status="running", regional_fingerprint="regional-fp", - national_fingerprint="national-fp", ) payload = meta.to_dict() assert payload["fingerprint"] == "regional-fp" assert payload["regional_fingerprint"] == "regional-fp" - assert payload["national_fingerprint"] == "national-fp" def test_step_timings_default_empty(self): meta = RunMetadata( From 68f92233bec467e06550f07134437b6addf32c9b Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Sat, 25 Apr 2026 01:02:05 +0200 Subject: [PATCH 03/25] Add local H5 traceability tests --- .github/workflows/pr.yaml | 14 ++ tests/integration/local_h5/fixtures.py | 203 ++++++++++++++++++ .../test_modal_local_area_traceability.py | 64 ++++++ .../local_h5/test_traceability_contract.py | 88 ++++++++ .../test_worker_script_tiny_fixture.py | 115 ++++++++++ .../test_local_h5_fingerprinting.py | 51 +++++ tests/unit/fixtures/test_modal_local_area.py | 88 ++++---- tests/unit/test_modal_local_area.py | 163 ++++++++++++++ tests/unit/test_pipeline.py | 36 +++- 9 files changed, 780 insertions(+), 42 deletions(-) create mode 100644 tests/integration/local_h5/fixtures.py create mode 100644 tests/integration/local_h5/test_modal_local_area_traceability.py create mode 100644 tests/integration/local_h5/test_traceability_contract.py create mode 100644 tests/integration/local_h5/test_worker_script_tiny_fixture.py diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 797ca19b0..fccc63078 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -86,6 +86,19 @@ jobs: env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} + local-h5-integration-tests: + runs-on: ubuntu-latest + needs: [check-fork, lint, unit-tests] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.14" + - uses: astral-sh/setup-uv@v5 + - run: uv sync --dev + - name: Run local H5 integration tests + run: uv run pytest --noconftest tests/integration/local_h5/ -v + optimized-integration-tests: runs-on: ubuntu-latest needs: @@ -95,6 +108,7 @@ jobs: lint, check-changelog, unit-tests, + local-h5-integration-tests, smoke-test, docs-build, ] diff --git a/tests/integration/local_h5/fixtures.py b/tests/integration/local_h5/fixtures.py new file mode 100644 index 000000000..3edf0f020 --- /dev/null +++ b/tests/integration/local_h5/fixtures.py @@ -0,0 +1,203 @@ +"""Shared tiny-artifact fixtures for local H5 integration tests.""" + +from __future__ import annotations + +import json +import pickle +import shutil +import sqlite3 +from dataclasses import dataclass +from functools import lru_cache +from pathlib import Path + +import numpy as np + +from policyengine_us_data.calibration.clone_and_assign import ( + GeographyAssignment, + save_geography, +) +from policyengine_us_data.calibration.local_h5.requests import ( + AreaBuildRequest, + AreaFilter, +) + +FIXTURE_DATASET_PATH = Path(__file__).resolve().parents[1] / "test_fixture_50hh.h5" +DISTRICT_GEOID = "3701" +COUNTY_FIPS = "37183" +STATE_FIPS = 37 +N_CLONES = 1 +SEED = 42 +VERSION = "0.0.0" + + +@dataclass(frozen=True) +class LocalH5Artifacts: + dataset_path: 
Path + weights_path: Path + db_path: Path + run_config_path: Path + geography_path: Path + calibration_package_path: Path + geography: GeographyAssignment + n_records: int + n_clones: int + + +@lru_cache(maxsize=1) +def fixture_household_count() -> int: + from policyengine_us import Microsimulation + + sim = Microsimulation(dataset=str(FIXTURE_DATASET_PATH)) + try: + return int(len(sim.calculate("household_id", map_to="household").values)) + finally: + del sim + + +def base_geography(*, n_records: int, n_clones: int = N_CLONES) -> GeographyAssignment: + total_rows = n_records * n_clones + block_geoids = np.array( + [f"{COUNTY_FIPS}{i:06d}{i:04d}"[:15] for i in range(total_rows)], + dtype="U15", + ) + return GeographyAssignment( + block_geoid=block_geoids, + cd_geoid=np.full(total_rows, DISTRICT_GEOID, dtype="U4"), + county_fips=np.full(total_rows, COUNTY_FIPS, dtype="U5"), + state_fips=np.full(total_rows, STATE_FIPS, dtype=np.int32), + n_records=n_records, + n_clones=n_clones, + ) + + +def seed_local_h5_artifacts( + tmp_path: Path, + *, + n_clones: int = N_CLONES, +) -> LocalH5Artifacts: + artifact_dir = tmp_path / "artifacts" + if artifact_dir.exists(): + shutil.rmtree(artifact_dir) + artifact_dir.mkdir(parents=True, exist_ok=True) + + dataset_path = artifact_dir / "source.h5" + weights_path = artifact_dir / "calibration_weights.npy" + db_path = artifact_dir / "policy_data.db" + run_config_path = artifact_dir / "unified_run_config.json" + geography_path = artifact_dir / "geography_assignment.npz" + calibration_package_path = artifact_dir / "calibration_package.pkl" + + shutil.copy2(FIXTURE_DATASET_PATH, dataset_path) + n_records = fixture_household_count() + np.save(weights_path, np.ones(n_records * n_clones, dtype=np.float32)) + + geography = base_geography(n_records=n_records, n_clones=n_clones) + save_geography(geography, geography_path) + + with open(calibration_package_path, "wb") as handle: + pickle.dump( + { + "block_geoid": geography.block_geoid, + "cd_geoid": geography.cd_geoid, + "metadata": { + "git_commit": "deadbeefcafebabe", + "git_branch": "main", + "git_dirty": False, + "package_version": VERSION, + }, + }, + handle, + protocol=pickle.HIGHEST_PROTOCOL, + ) + + conn = sqlite3.connect(db_path) + try: + conn.execute( + """ + CREATE TABLE stratum_constraints ( + stratum_id INTEGER, + constraint_variable TEXT, + value TEXT + ) + """ + ) + conn.execute( + """ + INSERT INTO stratum_constraints (stratum_id, constraint_variable, value) + VALUES (?, ?, ?) 
+ """, + (1, "congressional_district_geoid", DISTRICT_GEOID), + ) + conn.commit() + finally: + conn.close() + + run_config_path.write_text( + json.dumps( + { + "git_commit": "deadbeefcafebabe", + "git_branch": "main", + "git_dirty": False, + "package_version": VERSION, + } + ) + ) + + return LocalH5Artifacts( + dataset_path=dataset_path, + weights_path=weights_path, + db_path=db_path, + run_config_path=run_config_path, + geography_path=geography_path, + calibration_package_path=calibration_package_path, + geography=geography, + n_records=n_records, + n_clones=n_clones, + ) + + +def build_request( + area_type: str, *, geography: GeographyAssignment +) -> AreaBuildRequest: + if area_type == "district": + return AreaBuildRequest( + area_type="district", + area_id="NC-01", + display_name="NC-01", + output_relative_path="districts/NC-01.h5", + filters=( + AreaFilter( + geography_field="cd_geoid", + op="in", + value=(DISTRICT_GEOID,), + ), + ), + validation_geo_level="district", + validation_geographic_ids=(DISTRICT_GEOID,), + ) + if area_type == "state": + return AreaBuildRequest( + area_type="state", + area_id="NC", + display_name="NC", + output_relative_path="states/NC.h5", + filters=( + AreaFilter( + geography_field="cd_geoid", + op="in", + value=(DISTRICT_GEOID,), + ), + ), + validation_geo_level="state", + validation_geographic_ids=(str(STATE_FIPS),), + ) + if area_type == "national": + return AreaBuildRequest( + area_type="national", + area_id="US", + display_name="US", + output_relative_path="national/US.h5", + validation_geo_level="national", + validation_geographic_ids=("US",), + ) + raise ValueError(f"Unsupported area_type for test fixture: {area_type}") diff --git a/tests/integration/local_h5/test_modal_local_area_traceability.py b/tests/integration/local_h5/test_modal_local_area_traceability.py new file mode 100644 index 000000000..13d86ad30 --- /dev/null +++ b/tests/integration/local_h5/test_modal_local_area_traceability.py @@ -0,0 +1,64 @@ +from policyengine_us_data.calibration.local_h5.fingerprinting import ( + FingerprintingService, +) + +from tests.integration.local_h5.fixtures import SEED, VERSION, seed_local_h5_artifacts +from tests.unit.fixtures.test_modal_local_area import load_local_area_module + + +def test_local_area_helpers_match_publish_traceability_contract(tmp_path): + local_area = load_local_area_module(stub_policyengine=False) + artifacts = seed_local_h5_artifacts(tmp_path) + + inputs = local_area._build_publishing_input_bundle( + weights_path=artifacts.weights_path, + dataset_path=artifacts.dataset_path, + db_path=artifacts.db_path, + geography_path=artifacts.geography_path, + calibration_package_path=artifacts.calibration_package_path, + run_config_path=artifacts.run_config_path, + run_id="run-123", + version=VERSION, + n_clones=artifacts.n_clones, + seed=SEED, + ) + + helper_fingerprint = local_area._resolve_scope_fingerprint( + inputs=inputs, + scope="regional", + ) + service = FingerprintingService() + service_fingerprint = service.compute_scope_fingerprint( + service.build_traceability(inputs=inputs, scope="regional") + ) + + assert helper_fingerprint == service_fingerprint + + +def test_local_area_scope_helper_distinguishes_regional_and_national(tmp_path): + local_area = load_local_area_module(stub_policyengine=False) + artifacts = seed_local_h5_artifacts(tmp_path) + + inputs = local_area._build_publishing_input_bundle( + weights_path=artifacts.weights_path, + dataset_path=artifacts.dataset_path, + db_path=artifacts.db_path, + 
geography_path=artifacts.geography_path, + calibration_package_path=artifacts.calibration_package_path, + run_config_path=artifacts.run_config_path, + run_id="run-123", + version=VERSION, + n_clones=artifacts.n_clones, + seed=SEED, + ) + + regional = local_area._resolve_scope_fingerprint( + inputs=inputs, + scope="regional", + ) + national = local_area._resolve_scope_fingerprint( + inputs=inputs, + scope="national", + ) + + assert regional != national diff --git a/tests/integration/local_h5/test_traceability_contract.py b/tests/integration/local_h5/test_traceability_contract.py new file mode 100644 index 000000000..65fa0b678 --- /dev/null +++ b/tests/integration/local_h5/test_traceability_contract.py @@ -0,0 +1,88 @@ +from policyengine_us_data.calibration.local_h5.fingerprinting import ( + FingerprintingService, + PublishingInputBundle, +) + +from tests.integration.local_h5.fixtures import SEED, VERSION, seed_local_h5_artifacts + + +def _fingerprint_for(*, inputs, scope: str = "regional") -> str: + service = FingerprintingService() + return service.compute_scope_fingerprint( + service.build_traceability(inputs=inputs, scope=scope) + ) + + +def test_saved_geography_bundle_builds_traceability_with_stable_fingerprint(tmp_path): + artifacts = seed_local_h5_artifacts(tmp_path) + inputs = PublishingInputBundle( + weights_path=artifacts.weights_path, + source_dataset_path=artifacts.dataset_path, + target_db_path=artifacts.db_path, + exact_geography_path=artifacts.geography_path, + calibration_package_path=None, + run_config_path=artifacts.run_config_path, + run_id="run-123", + version=VERSION, + n_clones=artifacts.n_clones, + seed=SEED, + ) + + first = _fingerprint_for(inputs=inputs) + second = _fingerprint_for(inputs=inputs) + + assert first == second + + +def test_package_geography_bundle_builds_traceability_with_stable_fingerprint(tmp_path): + artifacts = seed_local_h5_artifacts(tmp_path) + inputs = PublishingInputBundle( + weights_path=artifacts.weights_path, + source_dataset_path=artifacts.dataset_path, + target_db_path=artifacts.db_path, + exact_geography_path=None, + calibration_package_path=artifacts.calibration_package_path, + run_config_path=artifacts.run_config_path, + run_id="run-123", + version=VERSION, + n_clones=artifacts.n_clones, + seed=SEED, + ) + + first = _fingerprint_for(inputs=inputs) + second = _fingerprint_for(inputs=inputs) + + assert first == second + + +def test_saved_and_package_geography_share_the_same_resumability_identity(tmp_path): + artifacts = seed_local_h5_artifacts(tmp_path) + saved_inputs = PublishingInputBundle( + weights_path=artifacts.weights_path, + source_dataset_path=artifacts.dataset_path, + target_db_path=artifacts.db_path, + exact_geography_path=artifacts.geography_path, + calibration_package_path=None, + run_config_path=artifacts.run_config_path, + run_id="run-123", + version=VERSION, + n_clones=artifacts.n_clones, + seed=SEED, + ) + package_inputs = PublishingInputBundle( + weights_path=artifacts.weights_path, + source_dataset_path=artifacts.dataset_path, + target_db_path=artifacts.db_path, + exact_geography_path=None, + calibration_package_path=artifacts.calibration_package_path, + run_config_path=artifacts.run_config_path, + run_id="run-123", + version=VERSION, + n_clones=artifacts.n_clones, + seed=SEED, + ) + + saved_fingerprint = _fingerprint_for(inputs=saved_inputs) + package_fingerprint = _fingerprint_for(inputs=package_inputs) + + assert saved_fingerprint == package_fingerprint diff --git 
a/tests/integration/local_h5/test_worker_script_tiny_fixture.py b/tests/integration/local_h5/test_worker_script_tiny_fixture.py new file mode 100644 index 000000000..12b6a0426 --- /dev/null +++ b/tests/integration/local_h5/test_worker_script_tiny_fixture.py @@ -0,0 +1,115 @@ +from __future__ import annotations + +import json +import subprocess +import sys +from pathlib import Path + +import pytest + +from tests.integration.local_h5.fixtures import ( + build_request, + seed_local_h5_artifacts, +) + +pytest.importorskip("scipy") +pytest.importorskip("spm_calculator") + + +def _run_worker( + *, + request, + artifacts, + output_dir: Path, + use_saved_geography: bool = False, + use_package_geography: bool = False, +) -> dict: + cmd = [ + sys.executable, + "-m", + "modal_app.worker_script", + "--requests-json", + json.dumps([request.to_dict()]), + "--weights-path", + str(artifacts.weights_path), + "--dataset-path", + str(artifacts.dataset_path), + "--db-path", + str(artifacts.db_path), + "--output-dir", + str(output_dir), + "--n-clones", + str(artifacts.n_clones), + "--no-validate", + ] + if use_saved_geography: + cmd.extend(["--geography-path", str(artifacts.geography_path)]) + if use_package_geography: + cmd.extend( + [ + "--calibration-package-path", + str(artifacts.calibration_package_path), + ] + ) + + result = subprocess.run( + cmd, + capture_output=True, + text=True, + check=True, + ) + return json.loads(result.stdout) + + +def test_worker_builds_district_h5_from_saved_geography(tmp_path): + artifacts = seed_local_h5_artifacts(tmp_path / "district") + request = build_request("district", geography=artifacts.geography) + output_dir = tmp_path / "district-out" + + result = _run_worker( + request=request, + artifacts=artifacts, + output_dir=output_dir, + use_saved_geography=True, + ) + + assert result["failed"] == [] + assert result["errors"] == [] + assert result["completed"] == [f"district:{request.area_id}"] + assert (output_dir / request.output_relative_path).exists() + + +def test_worker_builds_state_h5_from_package_geography(tmp_path): + artifacts = seed_local_h5_artifacts(tmp_path / "state") + request = build_request("state", geography=artifacts.geography) + output_dir = tmp_path / "state-out" + + result = _run_worker( + request=request, + artifacts=artifacts, + output_dir=output_dir, + use_package_geography=True, + ) + + assert result["failed"] == [] + assert result["errors"] == [] + assert result["completed"] == [f"state:{request.area_id}"] + assert (output_dir / request.output_relative_path).exists() + + +def test_worker_builds_national_h5_from_package_geography(tmp_path): + artifacts = seed_local_h5_artifacts(tmp_path / "national") + request = build_request("national", geography=artifacts.geography) + output_dir = tmp_path / "national-out" + + result = _run_worker( + request=request, + artifacts=artifacts, + output_dir=output_dir, + use_package_geography=True, + ) + + assert result["failed"] == [] + assert result["errors"] == [] + assert result["completed"] == ["national:US"] + assert (output_dir / request.output_relative_path).exists() diff --git a/tests/unit/calibration/test_local_h5_fingerprinting.py b/tests/unit/calibration/test_local_h5_fingerprinting.py index 66f288738..08d2b593b 100644 --- a/tests/unit/calibration/test_local_h5_fingerprinting.py +++ b/tests/unit/calibration/test_local_h5_fingerprinting.py @@ -96,3 +96,54 @@ def test_traceability_uses_weight_derived_household_count_for_geography(tmp_path assert 
traceability.exact_geography.metadata["canonical_sha256"].startswith( "sha256:" ) + + +def test_resumability_material_prefers_canonical_geography_checksum(tmp_path): + inputs = make_publishing_inputs(PublishingInputBundle, tmp_path=tmp_path) + + service = FingerprintingService() + traceability = service.build_traceability(inputs=inputs, scope="regional") + resumability = traceability.resumability_material() + + assert traceability.exact_geography is not None + assert ( + resumability["exact_geography_sha256"] + == traceability.exact_geography.metadata["canonical_sha256"] + ) + + +def test_traceability_handles_missing_optional_artifacts(tmp_path): + inputs = make_publishing_inputs(PublishingInputBundle, tmp_path=tmp_path) + standalone_weights_path = tmp_path / "standalone" / "weights.npy" + standalone_weights_path.parent.mkdir(parents=True, exist_ok=True) + standalone_weights_path.write_bytes(inputs.weights_path.read_bytes()) + inputs = PublishingInputBundle( + weights_path=standalone_weights_path, + source_dataset_path=inputs.source_dataset_path, + target_db_path=None, + exact_geography_path=None, + calibration_package_path=None, + run_config_path=None, + run_id=inputs.run_id, + version=inputs.version, + n_clones=inputs.n_clones, + seed=inputs.seed, + legacy_blocks_path=None, + ) + + service = FingerprintingService() + traceability = service.build_traceability(inputs=inputs, scope="regional") + + assert traceability.target_db is None + assert traceability.exact_geography is None + assert traceability.calibration_package is None + assert traceability.run_config is None + assert traceability.code_version == { + "git_commit": None, + "git_branch": None, + "git_dirty": None, + } + assert traceability.model_build == { + "locked_version": None, + "git_commit": None, + } diff --git a/tests/unit/fixtures/test_modal_local_area.py b/tests/unit/fixtures/test_modal_local_area.py index 935da8d6e..db9d0e621 100644 --- a/tests/unit/fixtures/test_modal_local_area.py +++ b/tests/unit/fixtures/test_modal_local_area.py @@ -31,22 +31,10 @@ def _patched_module_registry(overrides: dict[str, ModuleType]): sys.modules[name] = module -def load_local_area_module(): +def load_local_area_module(*, stub_policyengine: bool = True): """Import `modal_app.local_area` with scoped fake Modal dependencies.""" fake_modal = ModuleType("modal") - fake_policyengine = ModuleType("policyengine_us_data") - fake_calibration = ModuleType("policyengine_us_data.calibration") - fake_local_h5 = ModuleType("policyengine_us_data.calibration.local_h5") - fake_partitioning = ModuleType( - "policyengine_us_data.calibration.local_h5.partitioning" - ) - fake_fingerprinting = ModuleType( - "policyengine_us_data.calibration.local_h5.fingerprinting" - ) - fake_policyengine.__path__ = [] - fake_calibration.__path__ = [] - fake_local_h5.__path__ = [] class _FakeApp: def __init__(self, *args, **kwargs): @@ -73,32 +61,50 @@ def decorator(func): fake_resilience = ModuleType("modal_app.resilience") fake_resilience.reconcile_run_dir_fingerprint = lambda *args, **kwargs: None - fake_partitioning.partition_weighted_work_items = lambda *args, **kwargs: [] - fake_fingerprinting.PublishingInputBundle = object - - class _FakeFingerprintingService: - def build_traceability(self, *args, **kwargs): - return object() - - def compute_scope_fingerprint(self, *args, **kwargs): - return "fake-fingerprint" - - fake_fingerprinting.FingerprintingService = _FakeFingerprintingService - - with _patched_module_registry( - { - "modal": fake_modal, - "modal_app.images": 
fake_images, - "modal_app.resilience": fake_resilience, - "policyengine_us_data": fake_policyengine, - "policyengine_us_data.calibration": fake_calibration, - "policyengine_us_data.calibration.local_h5": fake_local_h5, - "policyengine_us_data.calibration.local_h5.fingerprinting": ( - fake_fingerprinting - ), - "policyengine_us_data.calibration.local_h5.partitioning": ( - fake_partitioning - ), - } - ): + + overrides = { + "modal": fake_modal, + "modal_app.images": fake_images, + "modal_app.resilience": fake_resilience, + } + + if stub_policyengine: + fake_policyengine = ModuleType("policyengine_us_data") + fake_calibration = ModuleType("policyengine_us_data.calibration") + fake_local_h5 = ModuleType("policyengine_us_data.calibration.local_h5") + fake_partitioning = ModuleType( + "policyengine_us_data.calibration.local_h5.partitioning" + ) + fake_fingerprinting = ModuleType( + "policyengine_us_data.calibration.local_h5.fingerprinting" + ) + fake_policyengine.__path__ = [] + fake_calibration.__path__ = [] + fake_local_h5.__path__ = [] + fake_partitioning.partition_weighted_work_items = lambda *args, **kwargs: [] + fake_fingerprinting.PublishingInputBundle = object + + class _FakeFingerprintingService: + def build_traceability(self, *args, **kwargs): + return object() + + def compute_scope_fingerprint(self, *args, **kwargs): + return "fake-fingerprint" + + fake_fingerprinting.FingerprintingService = _FakeFingerprintingService + overrides.update( + { + "policyengine_us_data": fake_policyengine, + "policyengine_us_data.calibration": fake_calibration, + "policyengine_us_data.calibration.local_h5": fake_local_h5, + "policyengine_us_data.calibration.local_h5.fingerprinting": ( + fake_fingerprinting + ), + "policyengine_us_data.calibration.local_h5.partitioning": ( + fake_partitioning + ), + } + ) + + with _patched_module_registry(overrides): return importlib.import_module("modal_app.local_area") diff --git a/tests/unit/test_modal_local_area.py b/tests/unit/test_modal_local_area.py index 0e3cd9fd6..e8128db71 100644 --- a/tests/unit/test_modal_local_area.py +++ b/tests/unit/test_modal_local_area.py @@ -1,3 +1,5 @@ +from pathlib import Path + from tests.unit.fixtures.test_modal_local_area import load_local_area_module @@ -28,3 +30,164 @@ def test_build_promote_publish_script_finalizes_complete_release(): assert "should_finalize_local_area_release" in script assert "create_tag=should_finalize" in script assert "upload_manifest(" in script + + +def test_build_publishing_input_bundle_preserves_traceability_inputs(): + local_area = load_local_area_module(stub_policyengine=False) + + bundle = local_area._build_publishing_input_bundle( + weights_path=Path("/tmp/calibration_weights.npy"), + dataset_path=Path("/tmp/source.h5"), + db_path=Path("/tmp/policy_data.db"), + geography_path=Path("/tmp/geography_assignment.npz"), + calibration_package_path=Path("/tmp/calibration_package.pkl"), + run_config_path=Path("/tmp/unified_run_config.json"), + run_id="run-123", + version="1.2.3", + n_clones=4, + seed=42, + legacy_blocks_path=Path("/tmp/stacked_blocks.npy"), + ) + + assert bundle.weights_path == Path("/tmp/calibration_weights.npy") + assert bundle.source_dataset_path == Path("/tmp/source.h5") + assert bundle.target_db_path == Path("/tmp/policy_data.db") + assert bundle.exact_geography_path == Path("/tmp/geography_assignment.npz") + assert bundle.calibration_package_path == Path("/tmp/calibration_package.pkl") + assert bundle.run_config_path == Path("/tmp/unified_run_config.json") + assert bundle.run_id == 
"run-123" + assert bundle.version == "1.2.3" + assert bundle.n_clones == 4 + assert bundle.seed == 42 + assert bundle.legacy_blocks_path == Path("/tmp/stacked_blocks.npy") + + +def test_resolve_scope_fingerprint_computes_when_no_pin(monkeypatch): + local_area = load_local_area_module(stub_policyengine=False) + + seen = {} + + class FakeFingerprintingService: + def build_traceability(self, *, inputs, scope): + seen["inputs"] = inputs + seen["scope"] = scope + return {"scope": scope, "run_id": inputs.run_id} + + def compute_scope_fingerprint(self, traceability): + seen["traceability"] = traceability + return "computed-fingerprint" + + monkeypatch.setattr( + local_area, + "FingerprintingService", + FakeFingerprintingService, + ) + + bundle = local_area._build_publishing_input_bundle( + weights_path=Path("/tmp/calibration_weights.npy"), + dataset_path=Path("/tmp/source.h5"), + db_path=None, + geography_path=None, + calibration_package_path=None, + run_config_path=None, + run_id="run-123", + version="1.2.3", + n_clones=2, + seed=42, + ) + + fingerprint = local_area._resolve_scope_fingerprint( + inputs=bundle, + scope="regional", + ) + + assert fingerprint == "computed-fingerprint" + assert seen["inputs"] == bundle + assert seen["scope"] == "regional" + assert seen["traceability"] == {"scope": "regional", "run_id": "run-123"} + + +def test_resolve_scope_fingerprint_preserves_matching_pin(monkeypatch, capsys): + local_area = load_local_area_module(stub_policyengine=False) + + class FakeFingerprintingService: + def build_traceability(self, *, inputs, scope): + return scope + + def compute_scope_fingerprint(self, traceability): + return "pinned-fingerprint" + + monkeypatch.setattr( + local_area, + "FingerprintingService", + FakeFingerprintingService, + ) + + bundle = local_area._build_publishing_input_bundle( + weights_path=Path("/tmp/calibration_weights.npy"), + dataset_path=Path("/tmp/source.h5"), + db_path=None, + geography_path=None, + calibration_package_path=None, + run_config_path=None, + run_id="run-123", + version="1.2.3", + n_clones=2, + seed=42, + ) + + fingerprint = local_area._resolve_scope_fingerprint( + inputs=bundle, + scope="regional", + expected_fingerprint="pinned-fingerprint", + ) + + captured = capsys.readouterr() + assert fingerprint == "pinned-fingerprint" + assert "Using pinned fingerprint from pipeline" in captured.out + + +def test_resolve_scope_fingerprint_warns_and_preserves_mismatched_pin( + monkeypatch, capsys +): + local_area = load_local_area_module(stub_policyengine=False) + + class FakeFingerprintingService: + def build_traceability(self, *, inputs, scope): + return scope + + def compute_scope_fingerprint(self, traceability): + return "computed-fingerprint" + + monkeypatch.setattr( + local_area, + "FingerprintingService", + FakeFingerprintingService, + ) + + bundle = local_area._build_publishing_input_bundle( + weights_path=Path("/tmp/calibration_weights.npy"), + dataset_path=Path("/tmp/source.h5"), + db_path=None, + geography_path=None, + calibration_package_path=None, + run_config_path=None, + run_id="run-123", + version="1.2.3", + n_clones=2, + seed=42, + ) + + fingerprint = local_area._resolve_scope_fingerprint( + inputs=bundle, + scope="national", + expected_fingerprint="legacy-fingerprint", + ) + + captured = capsys.readouterr() + assert fingerprint == "legacy-fingerprint" + assert "Pinned fingerprint differs from current national scope fingerprint" in ( + captured.out + ) + assert "legacy-fingerprint" in captured.out + assert "computed-fingerprint" in 
captured.out diff --git a/tests/unit/test_pipeline.py b/tests/unit/test_pipeline.py index 5e769caaf..2d126e71f 100644 --- a/tests/unit/test_pipeline.py +++ b/tests/unit/test_pipeline.py @@ -10,7 +10,7 @@ modal = pytest.importorskip("modal") -from modal_app.pipeline import ( +from modal_app.pipeline import ( # noqa: E402 RunMetadata, _build_diagnostics_upload_script, _step_completed, @@ -82,6 +82,23 @@ def test_from_dict_maps_legacy_fingerprint_to_regional_scope(self): assert meta.fingerprint == "legacy-fingerprint" assert meta.regional_fingerprint == "legacy-fingerprint" + def test_from_dict_keeps_explicit_regional_fingerprint_when_both_present(self): + meta = RunMetadata.from_dict( + { + "run_id": "test", + "branch": "main", + "sha": "abc12345deadbeef", + "version": "1.72.3", + "start_time": "2026-03-19T12:00:00Z", + "status": "running", + "fingerprint": "legacy-fingerprint", + "regional_fingerprint": "regional-fingerprint", + } + ) + + assert meta.fingerprint == "legacy-fingerprint" + assert meta.regional_fingerprint == "regional-fingerprint" + def test_roundtrip(self): meta = RunMetadata( run_id="1.72.3_abc12345_20260319_120000", @@ -114,6 +131,23 @@ def test_to_dict_keeps_legacy_fingerprint_alias_in_sync(self): assert payload["fingerprint"] == "regional-fp" assert payload["regional_fingerprint"] == "regional-fp" + def test_to_dict_preserves_distinct_explicit_regional_fingerprint(self): + meta = RunMetadata( + run_id="test", + branch="main", + sha="abc", + version="1.0.0", + start_time="now", + status="running", + fingerprint="legacy-fp", + regional_fingerprint="regional-fp", + ) + + payload = meta.to_dict() + + assert payload["fingerprint"] == "legacy-fp" + assert payload["regional_fingerprint"] == "regional-fp" + def test_step_timings_default_empty(self): meta = RunMetadata( run_id="test", From 9a33855186e16aae0c36d297a428230577a4e5d3 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Tue, 28 Apr 2026 16:29:02 +0200 Subject: [PATCH 04/25] Refresh PR3b formatting and changelog --- changelog.d/760.added.md | 1 + modal_app/local_area.py | 4 +--- modal_app/pipeline.py | 10 ++++++++-- .../calibration/local_h5/fingerprinting.py | 16 ++++++++++------ .../calibration/publish_local_area.py | 2 ++ 5 files changed, 22 insertions(+), 11 deletions(-) create mode 100644 changelog.d/760.added.md diff --git a/changelog.d/760.added.md b/changelog.d/760.added.md new file mode 100644 index 000000000..bff3b7190 --- /dev/null +++ b/changelog.d/760.added.md @@ -0,0 +1 @@ +Added local H5 traceability metadata and scope fingerprinting for calibration artifacts. 
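The `RunMetadata` fingerprint tests above and the `modal_app/pipeline.py` hunk just below encode the same two-way aliasing rule between the legacy `fingerprint` field and the scoped `regional_fingerprint` field. A minimal runnable sketch of that rule, reduced to just the two fields (`RunMetadataSketch` is a hypothetical name; the real dataclass also carries run id, branch, sha, status, and timing fields):

```python
from dataclasses import asdict, dataclass


@dataclass
class RunMetadataSketch:
    fingerprint: str | None = None
    regional_fingerprint: str | None = None

    def to_dict(self) -> dict:
        data = asdict(self)
        # Backfill the legacy alias only when it was never set explicitly,
        # so a distinct pinned legacy value survives a round trip unchanged.
        if (
            data.get("fingerprint") is None
            and data.get("regional_fingerprint") is not None
        ):
            data["fingerprint"] = data["regional_fingerprint"]
        return data

    @classmethod
    def from_dict(cls, payload: dict) -> "RunMetadataSketch":
        fields = {
            key: payload.get(key)
            for key in ("fingerprint", "regional_fingerprint")
        }
        # Legacy payloads carried only "fingerprint"; map it onto the
        # regional scope unless an explicit regional value is present.
        if (
            fields.get("regional_fingerprint") is None
            and fields.get("fingerprint") is not None
        ):
            fields["regional_fingerprint"] = fields["fingerprint"]
        return cls(**fields)


# Legacy payload: the lone fingerprint becomes the regional fingerprint.
assert RunMetadataSketch.from_dict({"fingerprint": "legacy"}).regional_fingerprint == "legacy"
# Explicit distinct values round-trip unchanged in both directions.
both = RunMetadataSketch(fingerprint="legacy-fp", regional_fingerprint="regional-fp")
assert both.to_dict() == {"fingerprint": "legacy-fp", "regional_fingerprint": "regional-fp"}
```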
diff --git a/modal_app/local_area.py b/modal_app/local_area.py index 97499f08b..6ea07cdca 100644 --- a/modal_app/local_area.py +++ b/modal_app/local_area.py @@ -367,9 +367,7 @@ def _resolve_scope_fingerprint( f" Current: {computed_fingerprint}" ) else: - print( - f"Using pinned fingerprint from pipeline: {expected_fingerprint}" - ) + print(f"Using pinned fingerprint from pipeline: {expected_fingerprint}") return expected_fingerprint return computed_fingerprint diff --git a/modal_app/pipeline.py b/modal_app/pipeline.py index ed34f3905..478d0e073 100644 --- a/modal_app/pipeline.py +++ b/modal_app/pipeline.py @@ -119,14 +119,20 @@ def __post_init__(self) -> None: def to_dict(self) -> dict: data = asdict(self) - if data.get("fingerprint") is None and data.get("regional_fingerprint") is not None: + if ( + data.get("fingerprint") is None + and data.get("regional_fingerprint") is not None + ): data["fingerprint"] = data["regional_fingerprint"] return data @classmethod def from_dict(cls, data: dict) -> "RunMetadata": data = dict(data) - if data.get("regional_fingerprint") is None and data.get("fingerprint") is not None: + if ( + data.get("regional_fingerprint") is None + and data.get("fingerprint") is not None + ): data["regional_fingerprint"] = data["fingerprint"] return cls(**data) diff --git a/policyengine_us_data/calibration/local_h5/fingerprinting.py b/policyengine_us_data/calibration/local_h5/fingerprinting.py index 8f401e582..f141ac28f 100644 --- a/policyengine_us_data/calibration/local_h5/fingerprinting.py +++ b/policyengine_us_data/calibration/local_h5/fingerprinting.py @@ -148,7 +148,9 @@ def _build_artifact_identity( ) -> ArtifactIdentity: actual_path = Path(path) if not actual_path.exists(): - raise FileNotFoundError(f"Expected {logical_name} artifact at {actual_path}") + raise FileNotFoundError( + f"Expected {logical_name} artifact at {actual_path}" + ) return ArtifactIdentity( logical_name=logical_name, path=actual_path, @@ -203,14 +205,18 @@ def _build_geography_identity( metadata=metadata, ) - def _extract_code_version(self, run_config_payload: Mapping[str, Any]) -> dict[str, Any]: + def _extract_code_version( + self, run_config_payload: Mapping[str, Any] + ) -> dict[str, Any]: return { "git_commit": run_config_payload.get("git_commit"), "git_branch": run_config_payload.get("git_branch"), "git_dirty": run_config_payload.get("git_dirty"), } - def _extract_model_build(self, run_config_payload: Mapping[str, Any]) -> dict[str, Any]: + def _extract_model_build( + self, run_config_payload: Mapping[str, Any] + ) -> dict[str, Any]: return { "locked_version": run_config_payload.get("package_version"), "git_commit": run_config_payload.get("git_commit"), @@ -249,6 +255,4 @@ def _infer_n_records( from policyengine_us import Microsimulation simulation = Microsimulation(dataset=str(source_dataset_path)) - return int( - len(simulation.calculate("household_id", map_to="household").values) - ) + return int(len(simulation.calculate("household_id", map_to="household").values)) diff --git a/policyengine_us_data/calibration/publish_local_area.py b/policyengine_us_data/calibration/publish_local_area.py index 785fbafc8..e42d7a030 100644 --- a/policyengine_us_data/calibration/publish_local_area.py +++ b/policyengine_us_data/calibration/publish_local_area.py @@ -51,6 +51,8 @@ META_FILE = WORK_DIR / "checkpoint_meta.json" + + def compute_input_fingerprint( weights_path: Path, dataset_path: Path, From e50bc742d34bfea273303833740076705efb72c4 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Tue, 28 Apr 2026 
16:37:59 +0200 Subject: [PATCH 05/25] Fix publish fingerprint test doubles --- tests/unit/calibration/test_publish_local_area.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/unit/calibration/test_publish_local_area.py b/tests/unit/calibration/test_publish_local_area.py index d4ebf685b..e6b5aa4d5 100644 --- a/tests/unit/calibration/test_publish_local_area.py +++ b/tests/unit/calibration/test_publish_local_area.py @@ -80,7 +80,10 @@ def test_compute_input_fingerprint_uses_loader_canonical_geography_identity( monkeypatch.setattr( "policyengine_us_data.calibration.publish_local_area.CalibrationGeographyLoader.resolve_source", - lambda self, **kwargs: SimpleNamespace(kind="saved_geography"), + lambda self, **kwargs: SimpleNamespace( + kind="saved_geography", + path=kwargs["geography_path"], + ), ) monkeypatch.setattr( "policyengine_us_data.calibration.publish_local_area.CalibrationGeographyLoader.compute_canonical_checksum", @@ -118,7 +121,10 @@ def test_compute_input_fingerprint_passes_calibration_package_path_to_loader( def fake_resolve_source(self, **kwargs): seen["resolve"] = kwargs - return SimpleNamespace(kind="calibration_package") + return SimpleNamespace( + kind="calibration_package", + path=kwargs["calibration_package_path"], + ) def fake_compute_canonical_checksum(self, **kwargs): seen["checksum"] = kwargs From 6780eddcd810b42510321dc6ae2e89f9057b8705 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Tue, 28 Apr 2026 19:50:00 +0200 Subject: [PATCH 06/25] Add test layout quality guards --- .github/copilot-instructions.md | 7 + .github/workflows/pr.yaml | 12 + AGENTS.md | 14 + CLAUDE.md | 10 +- changelog.d/test-quality-guards.changed.md | 1 + docs/engineering/skills/README.md | 13 + docs/engineering/skills/testing.md | 54 +++ .../tests/test_release_manifest.py | 318 ------------------ scripts/__init__.py | 1 + scripts/guards/__init__.py | 1 + scripts/guards/test_layout.py | 160 +++++++++ scripts/run_quality_guards.py | 38 +++ tests/integration/local_h5/__init__.py | 1 + .../test_modal_local_area_traceability.py | 2 +- tests/integration/test_xw_consistency.py | 2 +- tests/support/__init__.py | 1 + .../modal_local_area.py} | 2 +- tests/unit/test_modal_local_area.py | 2 +- .../test_refresh_local_agi_state_targets.py | 2 +- .../tests => tests/unit}/test_trace_tro.py | 0 tests/unit/version_manifest/__init__.py | 1 + tests/{ => unit/version_manifest}/conftest.py | 44 +-- tests/unit/version_manifest/support.py | 23 ++ .../test_version_manifest.py | 2 +- 24 files changed, 351 insertions(+), 360 deletions(-) create mode 100644 .github/copilot-instructions.md create mode 100644 AGENTS.md create mode 100644 changelog.d/test-quality-guards.changed.md create mode 100644 docs/engineering/skills/README.md create mode 100644 docs/engineering/skills/testing.md delete mode 100644 policyengine_us_data/tests/test_release_manifest.py create mode 100644 scripts/__init__.py create mode 100644 scripts/guards/__init__.py create mode 100644 scripts/guards/test_layout.py create mode 100644 scripts/run_quality_guards.py create mode 100644 tests/integration/local_h5/__init__.py create mode 100644 tests/support/__init__.py rename tests/{unit/fixtures/test_modal_local_area.py => support/modal_local_area.py} (98%) rename tests/{ => unit}/test_refresh_local_agi_state_targets.py (98%) rename {policyengine_us_data/tests => tests/unit}/test_trace_tro.py (100%) create mode 100644 tests/unit/version_manifest/__init__.py rename tests/{ => unit/version_manifest}/conftest.py (62%) create 
mode 100644 tests/unit/version_manifest/support.py rename tests/unit/{ => version_manifest}/test_version_manifest.py (99%) diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 000000000..3969bdb47 --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,7 @@ +# Copilot Instructions + +Follow the repository's canonical engineering skills under +`docs/engineering/skills/`. + +For tests, read `docs/engineering/skills/testing.md` before adding, moving, or +reviewing test files. Do not duplicate or override that testing guidance here. diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index fccc63078..34feb266a 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -43,6 +43,18 @@ jobs: - run: pip install ruff>=0.9.0 - run: ruff format --check . + quality-guards: + name: Quality guards + runs-on: ubuntu-latest + needs: check-fork + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.14" + - name: Run quality guards + run: python scripts/run_quality_guards.py + check-changelog: runs-on: ubuntu-latest needs: check-fork diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 000000000..87120ff4b --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,14 @@ +# Codex Instructions + +These instructions apply repository-wide. + +## Skills system + +Canonical AI-facing engineering skills live under `docs/engineering/skills/`. +Use those files as the source of truth across Codex, Claude, Copilot, and other +AI tools. + +When adding, moving, or reviewing tests, read +`docs/engineering/skills/testing.md`. Do not put pytest files under +`policyengine_us_data/tests/`, do not import from `tests.conftest`, and do not +import helpers across test lanes. diff --git a/CLAUDE.md b/CLAUDE.md index 699744940..d9bfbf642 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -7,25 +7,33 @@ ## Testing +Canonical testing guidance lives in `docs/engineering/skills/testing.md`. If +this file conflicts with that skill, follow the skill and update this adapter. + ### Running Tests - `make test-unit` - Run unit tests only (fast, no data dependencies) - `make test-integration` - Run integration tests (requires built H5 datasets) - `make test` - Run all tests - `pytest tests/unit/ -v` - Unit tests directly - `pytest tests/integration/test_cps.py -v` - Specific integration test +- `python scripts/run_quality_guards.py` - Run layout/import quality guards ### Test Organization -Tests are in the top-level `tests/` directory, split into two sub-directories: +Tests are in the top-level `tests/` directory, split into these sub-directories: - **`tests/unit/`** — Self-contained tests that use synthetic data, mocks, patches, or checked-in fixtures. Run in seconds with no external dependencies. - `unit/datasets/` — unit tests for dataset code - `unit/calibration/` — unit tests for calibration code - **`tests/integration/`** — Tests that require built H5 datasets, HuggingFace downloads, Microsimulation objects, or database ETL. Named after the dataset they test. +- **`tests/optimized/`** — Tests that exercise deployed Modal/staging seams. 
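Lane boundaries also constrain imports, as the placement rules below spell out. A short illustration using the helper move from this patch series (the commented line shows the pre-move layout, which the guard now flags):

```python
# Flagged: an integration test importing a helper from the unit lane.
# from tests.unit.fixtures.test_modal_local_area import load_local_area_module

# Allowed: the helper lives in the lane-neutral support package.
from tests.support.modal_local_area import load_local_area_module
```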
### Test Placement Rules +- **NEVER** put pytest files under `policyengine_us_data/tests/`; CI does not collect that tree - **NEVER** put tests that require H5 files or Microsimulation in `unit/` - **NEVER** put tests that use only synthetic data or mocks in `integration/` +- **NEVER** import from `tests.conftest`; fixtures are discovered automatically and helper functions belong in local support modules +- **NEVER** import helpers across test lanes, such as `tests.unit` from an integration test - Integration test files are named after their dataset dependency: `test_cps.py` tests `cps_2024.h5` - Sanity checks (value ranges, population counts) belong in the per-dataset integration test file, not in a separate sanity file - When adding a new integration test, add it to the existing per-dataset file if one exists diff --git a/changelog.d/test-quality-guards.changed.md b/changelog.d/test-quality-guards.changed.md new file mode 100644 index 000000000..ae2d8afa9 --- /dev/null +++ b/changelog.d/test-quality-guards.changed.md @@ -0,0 +1 @@ +Add quality guards for test layout and document the testing skill for AI tooling. diff --git a/docs/engineering/skills/README.md b/docs/engineering/skills/README.md new file mode 100644 index 000000000..253d28031 --- /dev/null +++ b/docs/engineering/skills/README.md @@ -0,0 +1,13 @@ +# Engineering Skills + +This directory is the canonical source for AI-facing engineering rules. + +Tool-specific instruction files such as `AGENTS.md`, `CLAUDE.md`, and +`.github/copilot-instructions.md` should point here instead of duplicating +implementation-specific guidance. When a rule changes, update the skill here +first, then keep adapters thin. + +Current skills: + +- `testing.md`: test layout, fixture scope, helper placement, and quality guard + expectations. diff --git a/docs/engineering/skills/testing.md b/docs/engineering/skills/testing.md new file mode 100644 index 000000000..69a4d71e9 --- /dev/null +++ b/docs/engineering/skills/testing.md @@ -0,0 +1,54 @@ +# Testing Skill + +Use this skill whenever adding, moving, or reviewing tests. + +## Canonical Layout + +- Put unit tests under `tests/unit/`. +- Put data-dependent or runtime integration tests under `tests/integration/`. +- Put deployed Modal/staging tests under `tests/optimized/`. +- Do not add pytest files under `policyengine_us_data/tests/`; CI does not + collect that tree. + +## Fixtures And Helpers + +- Keep root `tests/conftest.py` empty or very lightweight. It must not import + cloud clients, Modal, Hugging Face, PolicyEngine runtime-heavy modules, or + package modules that transitively import those dependencies. +- Put domain-specific fixtures in the narrowest `conftest.py` that covers the + tests that use them. +- Put reusable helper functions in a local `support.py`, a local fixture module, + or `tests/support/`. +- Do not import from `tests.conftest`; pytest discovers fixtures automatically. +- Do not import across test lanes, for example from `tests.integration` into + `tests.unit` or from `tests.unit` into `tests.integration`. Move shared helpers + to `tests/support/` or colocate them with the tests. + +## Dependency Boundaries + +- Unit tests should not require real network credentials, Modal, Hugging Face, + or GCS. Mock those seams. +- Integration tests may require built data or heavier runtime setup, but should + be explicit about those requirements and skip cleanly when local artifacts are + unavailable. 
+- CI should run tests in an environment where project dependencies are installed + with `uv sync --dev` or an equivalent full test dependency install. A full + install is required, but it is not a substitute for fixture isolation. + +## Quality Guards + +Run this before opening or updating a PR: + +```bash +python scripts/run_quality_guards.py +``` + +The current guard enforces: + +- No package-internal pytest files under `policyengine_us_data/tests/`. +- No pytest files outside the approved top-level test lanes. +- No imports from `tests.conftest`. +- No imports across test lanes. + +When adding a new guard, register it in `scripts/run_quality_guards.py` so CI +continues to expose a single `Quality guards` job. diff --git a/policyengine_us_data/tests/test_release_manifest.py b/policyengine_us_data/tests/test_release_manifest.py deleted file mode 100644 index 0f0b8f9df..000000000 --- a/policyengine_us_data/tests/test_release_manifest.py +++ /dev/null @@ -1,318 +0,0 @@ -import hashlib -from io import BytesIO -from pathlib import Path -from unittest.mock import MagicMock, patch - -from huggingface_hub import CommitOperationAdd - -from policyengine_us_data.utils.data_upload import upload_files_to_hf -from policyengine_us_data.utils.data_upload import publish_release_manifest_to_hf -from policyengine_us_data.utils.release_manifest import ( - RELEASE_MANIFEST_SCHEMA_VERSION, - build_release_manifest, -) -from policyengine_us_data.utils.trace_tro import TRACE_TRO_FILENAME - - -def _write_file(path: Path, content: bytes) -> Path: - path.parent.mkdir(parents=True, exist_ok=True) - path.write_bytes(content) - return path - - -def _sha256(content: bytes) -> str: - return hashlib.sha256(content).hexdigest() - - -def test_build_release_manifest_tracks_uploaded_artifacts(tmp_path): - national_bytes = b"national-dataset" - state_bytes = b"state-dataset" - national_path = _write_file( - tmp_path / "enhanced_cps_2024.h5", - national_bytes, - ) - state_path = _write_file(tmp_path / "AL.h5", state_bytes) - - manifest = build_release_manifest( - files_with_repo_paths=[ - (national_path, "enhanced_cps_2024.h5"), - (state_path, "states/AL.h5"), - ], - version="1.73.0", - repo_id="policyengine/policyengine-us-data", - model_package_version="1.634.4", - model_package_git_sha="deadbeef", - model_package_data_build_fingerprint="sha256:fingerprint", - created_at="2026-04-10T12:00:00Z", - ) - - assert manifest["data_package"] == { - "name": "policyengine-us-data", - "version": "1.73.0", - } - assert manifest["schema_version"] == RELEASE_MANIFEST_SCHEMA_VERSION - assert manifest["compatible_model_packages"] == [ - { - "name": "policyengine-us", - "specifier": "==1.634.4", - } - ] - assert manifest["build"] == { - "build_id": "policyengine-us-data-1.73.0", - "built_at": "2026-04-10T12:00:00Z", - "built_with_model_package": { - "name": "policyengine-us", - "version": "1.634.4", - "git_sha": "deadbeef", - "data_build_fingerprint": "sha256:fingerprint", - }, - } - assert manifest["default_datasets"] == {"national": "enhanced_cps_2024"} - - assert manifest["artifacts"]["enhanced_cps_2024"] == { - "kind": "microdata", - "path": "enhanced_cps_2024.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.73.0", - "sha256": _sha256(national_bytes), - "size_bytes": len(national_bytes), - } - assert manifest["artifacts"]["states/AL"] == { - "kind": "microdata", - "path": "states/AL.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.73.0", - "sha256": _sha256(state_bytes), - "size_bytes": 
len(state_bytes), - } - - -def test_build_release_manifest_merges_existing_release_same_version(tmp_path): - district_bytes = b"district-dataset" - district_path = _write_file(tmp_path / "NC-01.h5", district_bytes) - - existing_manifest = { - "data_package": { - "name": "policyengine-us-data", - "version": "1.73.0", - }, - "compatible_model_packages": [ - { - "name": "policyengine-us", - "specifier": "==1.634.4", - } - ], - "default_datasets": {"national": "enhanced_cps_2024"}, - "created_at": "2026-04-09T12:00:00Z", - "artifacts": { - "enhanced_cps_2024": { - "kind": "microdata", - "path": "enhanced_cps_2024.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.73.0", - "sha256": "abc", - "size_bytes": 123, - } - }, - } - - manifest = build_release_manifest( - files_with_repo_paths=[(district_path, "districts/NC-01.h5")], - version="1.73.0", - repo_id="policyengine/policyengine-us-data", - model_package_version="1.634.4", - model_package_git_sha="deadbeef", - model_package_data_build_fingerprint="sha256:fingerprint", - existing_manifest=existing_manifest, - created_at="2026-04-10T12:00:00Z", - ) - - assert set(manifest["artifacts"]) == {"enhanced_cps_2024", "districts/NC-01"} - assert manifest["default_datasets"] == {"national": "enhanced_cps_2024"} - assert manifest["build"] == { - "build_id": "policyengine-us-data-1.73.0", - "built_at": "2026-04-10T12:00:00Z", - "built_with_model_package": { - "name": "policyengine-us", - "version": "1.634.4", - "git_sha": "deadbeef", - "data_build_fingerprint": "sha256:fingerprint", - }, - } - assert manifest["artifacts"]["districts/NC-01"]["sha256"] == _sha256(district_bytes) - - -def test_upload_files_to_hf_adds_release_manifest_operations(tmp_path): - dataset_path = _write_file( - tmp_path / "enhanced_cps_2024.h5", - b"national-dataset", - ) - - mock_api = MagicMock() - mock_api.create_commit.return_value = MagicMock(oid="commit-sha") - - with ( - patch("policyengine_us_data.utils.data_upload.HfApi", return_value=mock_api), - patch( - "policyengine_us_data.utils.data_upload.load_release_manifest_from_hf", - return_value=None, - ), - patch( - "policyengine_us_data.utils.data_upload._get_model_package_build_metadata", - return_value={ - "version": "1.634.4", - "git_sha": "deadbeef", - "data_build_fingerprint": "sha256:fingerprint", - }, - ), - patch.dict( - "policyengine_us_data.utils.data_upload.os.environ", - {"HUGGING_FACE_TOKEN": "token"}, - clear=False, - ), - ): - upload_files_to_hf( - files=[dataset_path], - version="1.73.0", - ) - - operations = mock_api.create_commit.call_args.kwargs["operations"] - operation_paths = [operation.path_in_repo for operation in operations] - - assert "enhanced_cps_2024.h5" in operation_paths - assert "release_manifest.json" in operation_paths - assert "releases/1.73.0/release_manifest.json" in operation_paths - assert TRACE_TRO_FILENAME in operation_paths - assert f"releases/1.73.0/{TRACE_TRO_FILENAME}" in operation_paths - - release_ops = [ - operation - for operation in operations - if operation.path_in_repo.endswith("release_manifest.json") - ] - assert len(release_ops) == 2 - for operation in release_ops: - assert isinstance(operation, CommitOperationAdd) - assert isinstance(operation.path_or_fileobj, BytesIO) - - trace_ops = [ - operation - for operation in operations - if operation.path_in_repo.endswith(".jsonld") - ] - assert len(trace_ops) == 2 - for operation in trace_ops: - assert isinstance(operation, CommitOperationAdd) - assert isinstance(operation.path_or_fileobj, BytesIO) - - -def 
test_upload_files_to_hf_does_not_tag_until_finalize(tmp_path): - dataset_path = _write_file( - tmp_path / "enhanced_cps_2024.h5", - b"national-dataset", - ) - - mock_api = MagicMock() - mock_api.create_commit.return_value = MagicMock(oid="commit-sha") - - with ( - patch("policyengine_us_data.utils.data_upload.HfApi", return_value=mock_api), - patch( - "policyengine_us_data.utils.data_upload.load_release_manifest_from_hf", - return_value=None, - ), - patch( - "policyengine_us_data.utils.data_upload._get_model_package_build_metadata", - return_value={ - "version": "1.634.4", - "git_sha": "deadbeef", - "data_build_fingerprint": "sha256:fingerprint", - }, - ), - patch.dict( - "policyengine_us_data.utils.data_upload.os.environ", - {"HUGGING_FACE_TOKEN": "token"}, - clear=False, - ), - ): - upload_files_to_hf( - files=[dataset_path], - version="1.73.0", - create_tag=False, - ) - - mock_api.create_tag.assert_not_called() - - -def test_publish_release_manifest_to_hf_can_finalize_and_tag(tmp_path): - state_path = _write_file( - tmp_path / "AL.h5", - b"state-dataset", - ) - - mock_api = MagicMock() - mock_api.create_commit.return_value = MagicMock(oid="final-commit-sha") - existing_manifest = { - "schema_version": RELEASE_MANIFEST_SCHEMA_VERSION, - "data_package": { - "name": "policyengine-us-data", - "version": "1.73.0", - }, - "compatible_model_packages": [], - "default_datasets": {"national": "enhanced_cps_2024"}, - "created_at": "2026-04-10T12:00:00Z", - "build": { - "build_id": "policyengine-us-data-1.73.0", - "built_at": "2026-04-10T12:00:00Z", - }, - "artifacts": { - "enhanced_cps_2024": { - "kind": "microdata", - "path": "enhanced_cps_2024.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.73.0", - "sha256": "abc", - "size_bytes": 123, - } - }, - } - - with ( - patch("policyengine_us_data.utils.data_upload.HfApi", return_value=mock_api), - patch( - "policyengine_us_data.utils.data_upload.load_release_manifest_from_hf", - side_effect=lambda *args, **kwargs: ( - None if kwargs.get("revision") == "1.73.0" else existing_manifest - ), - ), - patch( - "policyengine_us_data.utils.data_upload._get_model_package_build_metadata", - return_value={ - "version": "1.634.4", - "git_sha": "deadbeef", - "data_build_fingerprint": "sha256:fingerprint", - }, - ), - patch.dict( - "policyengine_us_data.utils.data_upload.os.environ", - {"HUGGING_FACE_TOKEN": "token"}, - clear=False, - ), - ): - manifest = publish_release_manifest_to_hf( - [(state_path, "states/AL.h5")], - version="1.73.0", - create_tag=True, - ) - - mock_api.create_tag.assert_called_once() - assert manifest["build"] == { - "build_id": "policyengine-us-data-1.73.0", - "built_at": "2026-04-10T12:00:00Z", - "built_with_model_package": { - "name": "policyengine-us", - "version": "1.634.4", - "git_sha": "deadbeef", - "data_build_fingerprint": "sha256:fingerprint", - }, - } diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 000000000..e3d9df4d4 --- /dev/null +++ b/scripts/__init__.py @@ -0,0 +1 @@ +"""Repository maintenance scripts.""" diff --git a/scripts/guards/__init__.py b/scripts/guards/__init__.py new file mode 100644 index 000000000..2b896a044 --- /dev/null +++ b/scripts/guards/__init__.py @@ -0,0 +1 @@ +"""Quality guard implementations.""" diff --git a/scripts/guards/test_layout.py b/scripts/guards/test_layout.py new file mode 100644 index 000000000..aef7a425f --- /dev/null +++ b/scripts/guards/test_layout.py @@ -0,0 +1,160 @@ +"""Guardrails for pytest layout and test helper imports.""" + +from 
__future__ import annotations + +import ast +import subprocess +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parents[2] +TEST_LANES = { + "tests/unit": Path("tests/unit"), + "tests/integration": Path("tests/integration"), + "tests/optimized": Path("tests/optimized"), +} +ALLOWED_TEST_ROOTS = tuple(TEST_LANES.values()) +PYTEST_FILE_PREFIX = "test_" +PYTEST_FILE_SUFFIX = "_test.py" + + +def _git_files() -> list[Path]: + try: + result = subprocess.run( + [ + "git", + "ls-files", + "--cached", + "--others", + "--exclude-standard", + ], + cwd=REPO_ROOT, + check=True, + capture_output=True, + text=True, + ) + except (FileNotFoundError, subprocess.CalledProcessError): + return [ + path.relative_to(REPO_ROOT) + for path in REPO_ROOT.rglob("*") + if path.is_file() + ] + + return [ + Path(line) + for line in result.stdout.splitlines() + if line and (REPO_ROOT / line).is_file() + ] + + +def _is_pytest_file(path: Path) -> bool: + return path.suffix == ".py" and ( + path.name.startswith(PYTEST_FILE_PREFIX) + or path.name.endswith(PYTEST_FILE_SUFFIX) + ) + + +def _is_under(path: Path, parent: Path) -> bool: + return path == parent or parent in path.parents + + +def _test_lane(path: Path) -> str | None: + for name, root in TEST_LANES.items(): + if _is_under(path, root): + return name + return None + + +def _module_root(module: str) -> str | None: + for name in TEST_LANES: + if module == name.replace("/", ".") or module.startswith( + f"{name.replace('/', '.')}." + ): + return name + return None + + +def _check_test_placement(files: list[Path]) -> list[str]: + violations = [] + for path in files: + if not _is_pytest_file(path): + continue + + if _is_under(path, Path("policyengine_us_data/tests")): + violations.append( + f"{path}: package-internal tests are not collected by CI; " + "move tests under tests/unit, tests/integration, or tests/optimized." + ) + continue + + if path.parts and path.parts[0] == "tests": + if not any(_is_under(path, root) for root in ALLOWED_TEST_ROOTS): + violations.append( + f"{path}: pytest files under tests/ must live under " + "tests/unit, tests/integration, or tests/optimized." + ) + + return violations + + +def _check_test_imports(files: list[Path]) -> list[str]: + violations = [] + for path in files: + if path.suffix != ".py" or not _is_under(path, Path("tests")): + continue + + source = (REPO_ROOT / path).read_text(encoding="utf-8") + try: + tree = ast.parse(source, filename=str(path)) + except SyntaxError as exc: + violations.append(f"{path}: could not parse Python source: {exc}") + continue + + current_lane = _test_lane(path) + for node in ast.walk(tree): + module_names: list[str] = [] + if isinstance(node, ast.ImportFrom) and node.module: + module_names.append(node.module) + elif isinstance(node, ast.Import): + module_names.extend(alias.name for alias in node.names) + + for module in module_names: + if module == "tests.conftest" or module.startswith("tests.conftest."): + violations.append( + f"{path}: import from {module!r} couples tests to global " + "pytest setup; move helpers into a local support module." + ) + continue + + imported_lane = _module_root(module) + if imported_lane and imported_lane != current_lane: + violations.append( + f"{path}: imports {module!r} across test lanes; move shared " + "helpers to tests/support or colocate them with the tests." 
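+                        # Illustration (hypothetical modules): importing
+                        # "tests.unit.test_pipeline" from a file under
+                        # tests/integration/ resolves to different lanes and
+                        # is flagged, while "tests.support.*" resolves to no
+                        # lane and may be imported from any lane.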
+ ) + + return violations + + +def check() -> list[str]: + files = _git_files() + return [ + *_check_test_placement(files), + *_check_test_imports(files), + ] + + +def main() -> int: + violations = check() + if not violations: + print("test-layout guard passed") + return 0 + + print("test-layout guard failed:") + for violation in violations: + print(f" - {violation}") + return 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/run_quality_guards.py b/scripts/run_quality_guards.py new file mode 100644 index 000000000..135b8772b --- /dev/null +++ b/scripts/run_quality_guards.py @@ -0,0 +1,38 @@ +"""Run repository quality guards.""" + +from __future__ import annotations + +import sys +from collections.abc import Callable +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[1] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from scripts.guards import test_layout # noqa: E402 + + +Guard = tuple[str, Callable[[], list[str]]] + +GUARDS: tuple[Guard, ...] = (("test-layout", test_layout.check),) + + +def main() -> int: + failed = False + for name, check in GUARDS: + violations = check() + if not violations: + print(f"{name}: passed") + continue + + failed = True + print(f"{name}: failed") + for violation in violations: + print(f" - {violation}") + + return 1 if failed else 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/integration/local_h5/__init__.py b/tests/integration/local_h5/__init__.py new file mode 100644 index 000000000..e911efbea --- /dev/null +++ b/tests/integration/local_h5/__init__.py @@ -0,0 +1 @@ +"""Local H5 integration tests.""" diff --git a/tests/integration/local_h5/test_modal_local_area_traceability.py b/tests/integration/local_h5/test_modal_local_area_traceability.py index 13d86ad30..d95dde2bc 100644 --- a/tests/integration/local_h5/test_modal_local_area_traceability.py +++ b/tests/integration/local_h5/test_modal_local_area_traceability.py @@ -3,7 +3,7 @@ ) from tests.integration.local_h5.fixtures import SEED, VERSION, seed_local_h5_artifacts -from tests.unit.fixtures.test_modal_local_area import load_local_area_module +from tests.support.modal_local_area import load_local_area_module def test_local_area_helpers_match_publish_traceability_contract(tmp_path): diff --git a/tests/integration/test_xw_consistency.py b/tests/integration/test_xw_consistency.py index 49c60dd88..e6ee9e101 100644 --- a/tests/integration/test_xw_consistency.py +++ b/tests/integration/test_xw_consistency.py @@ -6,7 +6,7 @@ from any optimizer behavior. 
Usage: - pytest policyengine_us_data/tests/test_calibration/test_xw_consistency.py -v + pytest tests/integration/test_xw_consistency.py -v """ import tempfile diff --git a/tests/support/__init__.py b/tests/support/__init__.py new file mode 100644 index 000000000..38361eaf5 --- /dev/null +++ b/tests/support/__init__.py @@ -0,0 +1 @@ +"""Shared test support helpers.""" diff --git a/tests/unit/fixtures/test_modal_local_area.py b/tests/support/modal_local_area.py similarity index 98% rename from tests/unit/fixtures/test_modal_local_area.py rename to tests/support/modal_local_area.py index db9d0e621..683528e42 100644 --- a/tests/unit/fixtures/test_modal_local_area.py +++ b/tests/support/modal_local_area.py @@ -1,4 +1,4 @@ -"""Fixture helpers for `test_modal_local_area.py`.""" +"""Helpers for importing `modal_app.local_area` in tests.""" import importlib import sys diff --git a/tests/unit/test_modal_local_area.py b/tests/unit/test_modal_local_area.py index e8128db71..2e745846a 100644 --- a/tests/unit/test_modal_local_area.py +++ b/tests/unit/test_modal_local_area.py @@ -1,6 +1,6 @@ from pathlib import Path -from tests.unit.fixtures.test_modal_local_area import load_local_area_module +from tests.support.modal_local_area import load_local_area_module def test_build_promote_national_publish_script_imports_version_manifest_helpers(): diff --git a/tests/test_refresh_local_agi_state_targets.py b/tests/unit/test_refresh_local_agi_state_targets.py similarity index 98% rename from tests/test_refresh_local_agi_state_targets.py rename to tests/unit/test_refresh_local_agi_state_targets.py index 9b59c056d..c036e1caf 100644 --- a/tests/test_refresh_local_agi_state_targets.py +++ b/tests/unit/test_refresh_local_agi_state_targets.py @@ -5,7 +5,7 @@ import pandas as pd -REPO_ROOT = Path(__file__).resolve().parent.parent +REPO_ROOT = Path(__file__).resolve().parents[2] PACKAGE_ROOT = REPO_ROOT / "policyengine_us_data" MODULE_PATH = ( PACKAGE_ROOT diff --git a/policyengine_us_data/tests/test_trace_tro.py b/tests/unit/test_trace_tro.py similarity index 100% rename from policyengine_us_data/tests/test_trace_tro.py rename to tests/unit/test_trace_tro.py diff --git a/tests/unit/version_manifest/__init__.py b/tests/unit/version_manifest/__init__.py new file mode 100644 index 000000000..8baf56aff --- /dev/null +++ b/tests/unit/version_manifest/__init__.py @@ -0,0 +1 @@ +"""Version manifest unit tests.""" diff --git a/tests/conftest.py b/tests/unit/version_manifest/conftest.py similarity index 62% rename from tests/conftest.py rename to tests/unit/version_manifest/conftest.py index fc97e2882..452224741 100644 --- a/tests/conftest.py +++ b/tests/unit/version_manifest/conftest.py @@ -1,12 +1,17 @@ -"""Shared fixtures and helpers for version manifest tests.""" +"""Shared fixtures for version manifest tests.""" -from __future__ import annotations - -import json from unittest.mock import MagicMock import pytest +from policyengine_us_data.utils.version_manifest import ( + HFVersionInfo, + GCSVersionInfo, + VersionManifest, + VersionRegistry, +) +from policyengine_us_data.utils.policyengine import PolicyEngineUSBuildInfo + # -- Fixtures ------------------------------------------------------ @@ -21,8 +26,6 @@ def sample_generations() -> dict[str, int]: @pytest.fixture def sample_hf_info() -> HFVersionInfo: - from policyengine_us_data.utils.version_manifest import HFVersionInfo - return HFVersionInfo( repo="policyengine/policyengine-us-data", commit="abc123def456", @@ -31,8 +34,6 @@ def sample_hf_info() -> HFVersionInfo: 
@pytest.fixture def sample_policyengine_us_info() -> PolicyEngineUSBuildInfo: - from policyengine_us_data.utils.policyengine import PolicyEngineUSBuildInfo - return PolicyEngineUSBuildInfo( version="1.587.0", locked_version="1.587.0", @@ -47,11 +48,6 @@ def sample_manifest( sample_hf_info: HFVersionInfo, sample_policyengine_us_info: PolicyEngineUSBuildInfo, ) -> VersionManifest: - from policyengine_us_data.utils.version_manifest import ( - GCSVersionInfo, - VersionManifest, - ) - return VersionManifest( version="1.72.3", created_at="2026-03-10T14:30:00Z", @@ -69,8 +65,6 @@ def sample_registry( sample_manifest: VersionManifest, ) -> VersionRegistry: """A registry with one version entry.""" - from policyengine_us_data.utils.version_manifest import VersionRegistry - return VersionRegistry( current="1.72.3", versions=[sample_manifest], @@ -82,23 +76,3 @@ def mock_bucket() -> MagicMock: bucket = MagicMock() bucket.name = "policyengine-us-data" return bucket - - -# -- Helpers ------------------------------------------------------- - - -def make_mock_blob(generation: int) -> MagicMock: - blob = MagicMock() - blob.generation = generation - return blob - - -def setup_bucket_with_registry( - bucket: MagicMock, - registry: VersionRegistry, -) -> None: - """Configure a mock bucket to serve a registry.""" - registry_json = json.dumps(registry.to_dict()) - blob = MagicMock() - blob.download_as_text.return_value = registry_json - bucket.blob.return_value = blob diff --git a/tests/unit/version_manifest/support.py b/tests/unit/version_manifest/support.py new file mode 100644 index 000000000..591a6484b --- /dev/null +++ b/tests/unit/version_manifest/support.py @@ -0,0 +1,23 @@ +"""Support helpers for version manifest unit tests.""" + +import json +from unittest.mock import MagicMock + +from policyengine_us_data.utils.version_manifest import VersionRegistry + + +def make_mock_blob(generation: int) -> MagicMock: + blob = MagicMock() + blob.generation = generation + return blob + + +def setup_bucket_with_registry( + bucket: MagicMock, + registry: VersionRegistry, +) -> None: + """Configure a mock bucket to serve a registry.""" + registry_json = json.dumps(registry.to_dict()) + blob = MagicMock() + blob.download_as_text.return_value = registry_json + bucket.blob.return_value = blob diff --git a/tests/unit/test_version_manifest.py b/tests/unit/version_manifest/test_version_manifest.py similarity index 99% rename from tests/unit/test_version_manifest.py rename to tests/unit/version_manifest/test_version_manifest.py index 7e46f16c6..1c2eede0f 100644 --- a/tests/unit/test_version_manifest.py +++ b/tests/unit/version_manifest/test_version_manifest.py @@ -20,7 +20,7 @@ get_data_manifest, get_data_version, ) -from tests.conftest import ( +from .support import ( make_mock_blob, setup_bucket_with_registry, ) From 36daa155dad963469689d2a9035d6e3ea2a9585e Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Tue, 28 Apr 2026 23:57:12 +0200 Subject: [PATCH 07/25] Fix trace TRO schema test path --- tests/unit/test_trace_tro.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/unit/test_trace_tro.py b/tests/unit/test_trace_tro.py index fafd714c5..327e64412 100644 --- a/tests/unit/test_trace_tro.py +++ b/tests/unit/test_trace_tro.py @@ -1,4 +1,5 @@ import hashlib +import importlib.resources import json from pathlib import Path @@ -219,8 +220,9 @@ def test_tro_validates_against_shipped_schema(tmp_path): tro = build_trace_tro_from_release_manifest(manifest) - schema_path = ( - 
Path(__file__).resolve().parents[1] / "schemas" / "trace_tro.schema.json" + schema = json.loads( + importlib.resources.files("policyengine_us_data") + .joinpath("schemas/trace_tro.schema.json") + .read_text() ) - schema = json.loads(schema_path.read_text()) jsonschema.validate(instance=tro, schema=schema) From d57ec175e227655b9b6ef3cb54f1c42d3db2a792 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Wed, 29 Apr 2026 00:08:21 +0200 Subject: [PATCH 08/25] Rename PR linear integration check --- .github/workflows/pr.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 34feb266a..d6f505f67 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -205,6 +205,7 @@ jobs: fi integration-tests: + name: Linear integration tests runs-on: ubuntu-latest needs: [check-fork, lint, decide-test-scope] if: needs.decide-test-scope.outputs.run_integration == 'true' From a77023a94480f18d2c4c974eb14c0846d3f354c2 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Wed, 29 Apr 2026 00:20:21 +0200 Subject: [PATCH 09/25] Rename PR build datasets check --- .github/workflows/pr.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index d6f505f67..5a5c4a9d4 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -205,7 +205,7 @@ jobs: fi integration-tests: - name: Linear integration tests + name: Build datasets runs-on: ubuntu-latest needs: [check-fork, lint, decide-test-scope] if: needs.decide-test-scope.outputs.run_integration == 'true' @@ -222,7 +222,7 @@ jobs: with: python-version: "3.14" - run: pip install modal - - name: Build datasets and run integration tests on Modal + - name: Build datasets run: | STAGE_ARGS="" if git diff --name-only origin/main...HEAD | grep -qx 'pyproject.toml'; then From a47e1b81f6e4d02c9b6d853e5741b682fe09836f Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Wed, 29 Apr 2026 00:58:50 +0200 Subject: [PATCH 10/25] Add tiny pipeline workspace scaffold --- tests/support/pipeline_workspace.py | 174 +++++++++++++++++++++ tests/unit/test_tiny_pipeline_workspace.py | 64 ++++++++ 2 files changed, 238 insertions(+) create mode 100644 tests/support/pipeline_workspace.py create mode 100644 tests/unit/test_tiny_pipeline_workspace.py diff --git a/tests/support/pipeline_workspace.py b/tests/support/pipeline_workspace.py new file mode 100644 index 000000000..d8eb2e600 --- /dev/null +++ b/tests/support/pipeline_workspace.py @@ -0,0 +1,174 @@ +"""Shared fixture-scale pipeline workspace helpers. + +This module is intentionally test-only. Production code must not import it. +It defines the canonical tiny pipeline directory and artifact names that local +integration tests and Modal test harnesses can share as we move coverage +upstream from H5-only seams toward dataset build stages 1-5. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import ClassVar + +__test__ = False + + +STAGE_ARTIFACTS: dict[str, tuple[str, ...]] = { + "stage_1": ( + "uprating_factors.csv", + "acs_2022.h5", + "irs_puf_2015.h5", + ), + "stage_2": ( + "cps_2024.h5", + "puf_2024.h5", + ), + "stage_3": ("extended_cps_2024.h5",), + "stage_4": ( + "enhanced_cps_2024.h5", + "stratified_extended_cps_2024.h5", + ), + "stage_5": ( + "source_imputed_stratified_extended_cps_2024.h5", + "source_imputed_stratified_extended_cps.h5", + "small_enhanced_cps_2024.h5", + "sparse_enhanced_cps_2024.h5", + ), + "calibration": ( + "policy_data.db", + "calibration_package.pkl", + "calibration_weights.npy", + "geography_assignment.npz", + "unified_run_config.json", + ), + "h5_outputs": ( + "states/AL.h5", + "districts/NC-01.h5", + "national/US.h5", + ), +} + + +@dataclass(frozen=True) +class TinyPipelineWorkspace: + """Canonical on-disk layout for fixture-scale pipeline tests.""" + + root: Path + + STAGE_NAMES: ClassVar[tuple[str, ...]] = tuple(STAGE_ARTIFACTS) + TOP_LEVEL_DIRS: ClassVar[tuple[str, ...]] = ( + "inputs", + "stage_1", + "stage_2", + "stage_3", + "stage_4", + "stage_5", + "calibration", + "h5", + ) + H5_DIRS: ClassVar[tuple[str, ...]] = ( + "outputs", + "staging", + "diagnostics", + "manifests", + ) + + @classmethod + def create(cls, root: Path) -> "TinyPipelineWorkspace": + """Create an empty canonical workspace under ``root``.""" + + workspace = cls(root=root) + workspace.materialize() + return workspace + + @property + def inputs(self) -> Path: + return self.root / "inputs" + + @property + def stage_1(self) -> Path: + return self.root / "stage_1" + + @property + def stage_2(self) -> Path: + return self.root / "stage_2" + + @property + def stage_3(self) -> Path: + return self.root / "stage_3" + + @property + def stage_4(self) -> Path: + return self.root / "stage_4" + + @property + def stage_5(self) -> Path: + return self.root / "stage_5" + + @property + def calibration(self) -> Path: + return self.root / "calibration" + + @property + def h5(self) -> Path: + return self.root / "h5" + + @property + def h5_outputs(self) -> Path: + return self.h5 / "outputs" + + @property + def h5_staging(self) -> Path: + return self.h5 / "staging" + + @property + def h5_diagnostics(self) -> Path: + return self.h5 / "diagnostics" + + @property + def h5_manifests(self) -> Path: + return self.h5 / "manifests" + + def materialize(self) -> None: + """Create all canonical directories without writing artifacts.""" + + for dirname in self.TOP_LEVEL_DIRS: + (self.root / dirname).mkdir(parents=True, exist_ok=True) + for dirname in self.H5_DIRS: + (self.h5 / dirname).mkdir(parents=True, exist_ok=True) + + def stage_dir(self, stage: str) -> Path: + """Return the directory for a known stage name.""" + + if stage == "h5_outputs": + return self.h5_outputs + if stage not in STAGE_ARTIFACTS: + raise KeyError(f"Unknown tiny pipeline stage: {stage}") + return self.root / stage + + def artifact_path(self, stage: str, relative_path: str) -> Path: + """Return an artifact path and ensure nested parent dirs exist.""" + + path = self.stage_dir(stage) / relative_path + path.parent.mkdir(parents=True, exist_ok=True) + return path + + def expected_artifacts(self, stage: str) -> tuple[Path, ...]: + """Return expected artifact paths for a known stage.""" + + if stage not in STAGE_ARTIFACTS: + raise KeyError(f"Unknown tiny pipeline stage: {stage}") + return tuple( + self.artifact_path(stage, 
relative_path) + for relative_path in STAGE_ARTIFACTS[stage] + ) + + def all_expected_artifacts(self) -> dict[str, tuple[Path, ...]]: + """Return every currently defined expected artifact path by stage.""" + + return { + stage: self.expected_artifacts(stage) + for stage in STAGE_ARTIFACTS + } diff --git a/tests/unit/test_tiny_pipeline_workspace.py b/tests/unit/test_tiny_pipeline_workspace.py new file mode 100644 index 000000000..da2aec7da --- /dev/null +++ b/tests/unit/test_tiny_pipeline_workspace.py @@ -0,0 +1,64 @@ +from pathlib import Path + +import pytest + +from tests.support.pipeline_workspace import ( + STAGE_ARTIFACTS, + TinyPipelineWorkspace, +) + + +def test_tiny_pipeline_workspace_creates_canonical_directories(tmp_path): + workspace = TinyPipelineWorkspace.create(tmp_path / "tiny-pipeline") + + expected_dirs = [ + workspace.inputs, + workspace.stage_1, + workspace.stage_2, + workspace.stage_3, + workspace.stage_4, + workspace.stage_5, + workspace.calibration, + workspace.h5_outputs, + workspace.h5_staging, + workspace.h5_diagnostics, + workspace.h5_manifests, + ] + + assert all(path.is_dir() for path in expected_dirs) + + +def test_tiny_pipeline_workspace_resolves_expected_artifacts(tmp_path): + workspace = TinyPipelineWorkspace.create(tmp_path / "tiny-pipeline") + + assert workspace.expected_artifacts("stage_1") == ( + workspace.stage_1 / "uprating_factors.csv", + workspace.stage_1 / "acs_2022.h5", + workspace.stage_1 / "irs_puf_2015.h5", + ) + assert workspace.expected_artifacts("h5_outputs") == ( + workspace.h5_outputs / "states" / "AL.h5", + workspace.h5_outputs / "districts" / "NC-01.h5", + workspace.h5_outputs / "national" / "US.h5", + ) + + # Nested expected artifact paths should be immediately writable by later + # fixture builders. 
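+    # artifact_path() runs mkdir(parents=True) on each parent, so e.g. the
+    # h5/outputs/states/ directory already exists before AL.h5 is written.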
+ for path in workspace.expected_artifacts("h5_outputs"): + assert path.parent.is_dir() + + +def test_tiny_pipeline_workspace_rejects_unknown_stage(tmp_path): + workspace = TinyPipelineWorkspace.create(tmp_path / "tiny-pipeline") + + with pytest.raises(KeyError, match="Unknown tiny pipeline stage"): + workspace.stage_dir("not-a-stage") + + +def test_tiny_pipeline_workspace_exposes_all_declared_artifacts(tmp_path): + workspace = TinyPipelineWorkspace.create(tmp_path / "tiny-pipeline") + + artifacts = workspace.all_expected_artifacts() + + assert set(artifacts) == set(STAGE_ARTIFACTS) + assert all(isinstance(path, Path) for paths in artifacts.values() for path in paths) From a14dacfefa69cf94219f7aab43f3e86beb0fce82 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Wed, 29 Apr 2026 01:24:48 +0200 Subject: [PATCH 11/25] Move local H5 tests into optimized suite --- .github/workflows/pr.yaml | 32 ++++++------------- tests/integration/local_h5/__init__.py | 1 - tests/optimized/local_h5/__init__.py | 1 + .../local_h5/fixtures.py | 6 ++-- .../test_modal_local_area_traceability.py | 2 +- .../local_h5/test_traceability_contract.py | 2 +- .../test_worker_script_tiny_fixture.py | 2 +- 7 files changed, 18 insertions(+), 28 deletions(-) delete mode 100644 tests/integration/local_h5/__init__.py create mode 100644 tests/optimized/local_h5/__init__.py rename tests/{integration => optimized}/local_h5/fixtures.py (96%) rename tests/{integration => optimized}/local_h5/test_modal_local_area_traceability.py (95%) rename tests/{integration => optimized}/local_h5/test_traceability_contract.py (97%) rename tests/{integration => optimized}/local_h5/test_worker_script_tiny_fixture.py (98%) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 5a5c4a9d4..0ade07c7a 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -98,19 +98,6 @@ jobs: env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} - local-h5-integration-tests: - runs-on: ubuntu-latest - needs: [check-fork, lint, unit-tests] - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: - python-version: "3.14" - - uses: astral-sh/setup-uv@v5 - - run: uv sync --dev - - name: Run local H5 integration tests - run: uv run pytest --noconftest tests/integration/local_h5/ -v - optimized-integration-tests: runs-on: ubuntu-latest needs: @@ -120,7 +107,6 @@ jobs: lint, check-changelog, unit-tests, - local-h5-integration-tests, smoke-test, docs-build, ] @@ -139,23 +125,25 @@ jobs: - uses: actions/setup-python@v5 with: python-version: "3.14" + - uses: astral-sh/setup-uv@v5 + - run: uv sync --dev - name: Install optimized test deps - run: pip install modal pytest numpy pandas + run: uv pip install modal - name: Ensure PR Modal environment exists - run: python .github/scripts/ensure_modal_environment.py + run: uv run python .github/scripts/ensure_modal_environment.py - name: Sync Modal secrets to PR environment - run: python .github/scripts/sync_modal_secrets.py + run: uv run python .github/scripts/sync_modal_secrets.py - name: Deploy Modal pipeline app to PR staging - run: modal deploy --env="${MODAL_ENVIRONMENT}" modal_app/pipeline.py + run: uv run modal deploy --env="${MODAL_ENVIRONMENT}" modal_app/pipeline.py - name: Deploy Modal local-area app to PR staging - run: modal deploy --env="${MODAL_ENVIRONMENT}" modal_app/local_area.py + run: uv run modal deploy --env="${MODAL_ENVIRONMENT}" modal_app/local_area.py - name: Deploy Modal H5 test harness to PR staging - run: modal deploy --env="${MODAL_ENVIRONMENT}" 
modal_app/h5_test_harness.py + run: uv run modal deploy --env="${MODAL_ENVIRONMENT}" modal_app/h5_test_harness.py - name: Run optimized integration tests against PR staging - run: python -m pytest tests/optimized/ -v + run: uv run pytest tests/optimized/ -v - name: Cleanup PR Modal environment if: always() - run: python .github/scripts/delete_modal_environment.py + run: uv run python .github/scripts/delete_modal_environment.py smoke-test: runs-on: ubuntu-latest diff --git a/tests/integration/local_h5/__init__.py b/tests/integration/local_h5/__init__.py deleted file mode 100644 index e911efbea..000000000 --- a/tests/integration/local_h5/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Local H5 integration tests.""" diff --git a/tests/optimized/local_h5/__init__.py b/tests/optimized/local_h5/__init__.py new file mode 100644 index 000000000..99c72228c --- /dev/null +++ b/tests/optimized/local_h5/__init__.py @@ -0,0 +1 @@ +"""Optimized local H5 integration tests.""" diff --git a/tests/integration/local_h5/fixtures.py b/tests/optimized/local_h5/fixtures.py similarity index 96% rename from tests/integration/local_h5/fixtures.py rename to tests/optimized/local_h5/fixtures.py index 3edf0f020..7918f9cd0 100644 --- a/tests/integration/local_h5/fixtures.py +++ b/tests/optimized/local_h5/fixtures.py @@ -1,4 +1,4 @@ -"""Shared tiny-artifact fixtures for local H5 integration tests.""" +"""Shared tiny-artifact fixtures for optimized local H5 integration tests.""" from __future__ import annotations @@ -21,7 +21,9 @@ AreaFilter, ) -FIXTURE_DATASET_PATH = Path(__file__).resolve().parents[1] / "test_fixture_50hh.h5" +FIXTURE_DATASET_PATH = ( + Path(__file__).resolve().parents[2] / "integration" / "test_fixture_50hh.h5" +) DISTRICT_GEOID = "3701" COUNTY_FIPS = "37183" STATE_FIPS = 37 diff --git a/tests/integration/local_h5/test_modal_local_area_traceability.py b/tests/optimized/local_h5/test_modal_local_area_traceability.py similarity index 95% rename from tests/integration/local_h5/test_modal_local_area_traceability.py rename to tests/optimized/local_h5/test_modal_local_area_traceability.py index d95dde2bc..a943b8dcf 100644 --- a/tests/integration/local_h5/test_modal_local_area_traceability.py +++ b/tests/optimized/local_h5/test_modal_local_area_traceability.py @@ -2,7 +2,7 @@ FingerprintingService, ) -from tests.integration.local_h5.fixtures import SEED, VERSION, seed_local_h5_artifacts +from tests.optimized.local_h5.fixtures import SEED, VERSION, seed_local_h5_artifacts from tests.support.modal_local_area import load_local_area_module diff --git a/tests/integration/local_h5/test_traceability_contract.py b/tests/optimized/local_h5/test_traceability_contract.py similarity index 97% rename from tests/integration/local_h5/test_traceability_contract.py rename to tests/optimized/local_h5/test_traceability_contract.py index 65fa0b678..b6b599082 100644 --- a/tests/integration/local_h5/test_traceability_contract.py +++ b/tests/optimized/local_h5/test_traceability_contract.py @@ -3,7 +3,7 @@ PublishingInputBundle, ) -from tests.integration.local_h5.fixtures import SEED, VERSION, seed_local_h5_artifacts +from tests.optimized.local_h5.fixtures import SEED, VERSION, seed_local_h5_artifacts def _fingerprint_for(*, inputs, scope: str = "regional") -> str: diff --git a/tests/integration/local_h5/test_worker_script_tiny_fixture.py b/tests/optimized/local_h5/test_worker_script_tiny_fixture.py similarity index 98% rename from tests/integration/local_h5/test_worker_script_tiny_fixture.py rename to 
tests/optimized/local_h5/test_worker_script_tiny_fixture.py index 12b6a0426..6ad30fb4e 100644 --- a/tests/integration/local_h5/test_worker_script_tiny_fixture.py +++ b/tests/optimized/local_h5/test_worker_script_tiny_fixture.py @@ -7,7 +7,7 @@ import pytest -from tests.integration.local_h5.fixtures import ( +from tests.optimized.local_h5.fixtures import ( build_request, seed_local_h5_artifacts, ) From 980fb7de6ae69125557293bca9374521d08cb98b Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Wed, 29 Apr 2026 01:30:10 +0200 Subject: [PATCH 12/25] Restore optimized test dependency install --- .github/workflows/pr.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 0ade07c7a..cf7d8173e 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -128,7 +128,7 @@ jobs: - uses: astral-sh/setup-uv@v5 - run: uv sync --dev - name: Install optimized test deps - run: uv pip install modal + run: uv pip install modal pytest numpy pandas - name: Ensure PR Modal environment exists run: uv run python .github/scripts/ensure_modal_environment.py - name: Sync Modal secrets to PR environment From 831fbceeb65528fbcd5a5093d96252cf1b4f5320 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Wed, 29 Apr 2026 01:45:00 +0200 Subject: [PATCH 13/25] Clarify PR and push dataset workflow labels --- .github/workflows/pr.yaml | 4 ++-- .github/workflows/push.yaml | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index cf7d8173e..9f712acc4 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -193,7 +193,7 @@ jobs: fi integration-tests: - name: Build datasets + name: Build datasets and run integration tests on Modal runs-on: ubuntu-latest needs: [check-fork, lint, decide-test-scope] if: needs.decide-test-scope.outputs.run_integration == 'true' @@ -210,7 +210,7 @@ jobs: with: python-version: "3.14" - run: pip install modal - - name: Build datasets + - name: Build datasets and run integration tests on Modal run: | STAGE_ARGS="" if git diff --name-only origin/main...HEAD | grep -qx 'pyproject.toml'; then diff --git a/.github/workflows/push.yaml b/.github/workflows/push.yaml index 4a0cec793..005a6eaad 100644 --- a/.github/workflows/push.yaml +++ b/.github/workflows/push.yaml @@ -12,9 +12,9 @@ jobs: - run: pip install ruff>=0.9.0 - run: ruff format --check . 
- # ── Build and linear integration tests ────────────────────── - build-and-linear-integration-tests: - name: Build and linear integration tests + # ── Dataset build ─────────────────────────────────────────── + build-datasets: + name: Build datasets runs-on: ubuntu-latest needs: lint if: github.event.head_commit.message != 'Update package version' @@ -29,7 +29,7 @@ jobs: with: python-version: "3.14" - run: pip install modal - - name: Run linear integration tests on Modal + - name: Build datasets on Modal run: | modal run --env="${MODAL_ENVIRONMENT}" modal_app/data_build.py \ --upload \ From 6d01f7e466f24a0576102b828aa794af0d7c262d Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Wed, 29 Apr 2026 18:41:11 +0200 Subject: [PATCH 14/25] Add fixture-backed Stage 1 artifacts --- tests/support/tiny_stage_1.py | 274 ++++++++++++++++++++++ tests/unit/test_tiny_stage_1_artifacts.py | 69 ++++++ 2 files changed, 343 insertions(+) create mode 100644 tests/support/tiny_stage_1.py create mode 100644 tests/unit/test_tiny_stage_1_artifacts.py diff --git a/tests/support/tiny_stage_1.py b/tests/support/tiny_stage_1.py new file mode 100644 index 000000000..e7ce068d1 --- /dev/null +++ b/tests/support/tiny_stage_1.py @@ -0,0 +1,274 @@ +"""Fixture-backed Stage 1 artifacts for tiny pipeline integration tests.""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path + +import h5py +import numpy as np +import pandas as pd + +from tests.support.pipeline_workspace import TinyPipelineWorkspace + +__test__ = False + + +UPRATING_YEARS = tuple(range(2020, 2035)) +UPRATING_VARIABLES = ( + "employment_income", + "self_employment_income", + "social_security", + "household_weight", + "population", +) + +ACS_PERSON_ARRAYS = ( + "person_id", + "person_household_id", + "person_spm_unit_id", + "person_tax_unit_id", + "person_family_id", + "person_marital_unit_id", + "age", + "is_male", + "employment_income", + "self_employment_income", + "social_security", + "taxable_private_pension_income", + "is_household_head", + "rent", + "real_estate_taxes", +) + +ACS_HOUSEHOLD_ARRAYS = ( + "household_id", + "spm_unit_id", + "tax_unit_id", + "family_id", + "marital_unit_id", + "household_weight", + "tenure_type", + "household_vehicles_owned", + "state_fips", + "household_state_fips", +) + +PUF_CORE_COLUMNS = ( + "RECID", + "S006", + "MARS", + "DSI", + "EIC", + "XTOT", + "E00200", +) + +PUF_DEMOGRAPHIC_COLUMNS = ( + "RECID", + "AGEDP1", + "AGEDP2", + "AGEDP3", + "AGERANGE", + "EARNSPLIT", + "GENDER", +) + +_PUF_ZERO_COLUMNS = ( + "E00300", + "E00400", + "E00600", + "E00650", + "E00700", + "E00800", + "E00900", + "E01100", + "E01200", + "E01400", + "E01500", + "E01700", + "E02100", + "E02300", + "E02400", + "E03150", + "E03210", + "E03220", + "E03230", + "E03240", + "E03270", + "E03290", + "E03300", + "E03400", + "E03500", + "E07240", + "E07260", + "E07300", + "E07400", + "E07600", + "E09700", + "E09800", + "E09900", + "E11200", + "E17500", + "E18400", + "E18500", + "E19200", + "E19800", + "E20100", + "E20400", + "E20500", + "E24515", + "E24518", + "E25850", + "E25860", + "E25920", + "E25940", + "E25960", + "E25980", + "E26180", + "E26190", + "E26390", + "E26400", + "E27200", + "E30400", + "E30500", + "E32800", + "E58990", + "E62900", + "E87521", + "P08000", + "P22250", + "P23250", + "T27800", +) + + +@dataclass(frozen=True) +class Stage1Artifacts: + """Paths written by the fixture-backed Stage 1 builder.""" + + uprating_factors_path: Path + acs_path: Path + irs_puf_path: Path + + def 
as_tuple(self) -> tuple[Path, Path, Path]: + return ( + self.uprating_factors_path, + self.acs_path, + self.irs_puf_path, + ) + + +def create_stage_1_artifacts(workspace: TinyPipelineWorkspace) -> Stage1Artifacts: + """Write deterministic Stage 1 artifacts into ``workspace``.""" + + artifacts = Stage1Artifacts( + uprating_factors_path=workspace.artifact_path( + "stage_1", "uprating_factors.csv" + ), + acs_path=workspace.artifact_path("stage_1", "acs_2022.h5"), + irs_puf_path=workspace.artifact_path("stage_1", "irs_puf_2015.h5"), + ) + + write_tiny_uprating_factors(artifacts.uprating_factors_path) + write_tiny_acs(artifacts.acs_path) + write_tiny_irs_puf(artifacts.irs_puf_path) + + return artifacts + + +def write_tiny_uprating_factors(path: Path) -> None: + """Write a production-shaped uprating factor table with tiny values.""" + + year_offsets = np.array(UPRATING_YEARS) - UPRATING_YEARS[0] + growth_rates = { + "employment_income": 0.030, + "self_employment_income": 0.025, + "social_security": 0.020, + "household_weight": 0.010, + "population": 0.005, + } + table = pd.DataFrame( + { + year: { + variable: round(1 + growth_rates[variable] * offset, 3) + for variable in UPRATING_VARIABLES + } + for year, offset in zip(UPRATING_YEARS, year_offsets) + } + ) + table.index.name = "Variable" + table.to_csv(path) + + +def write_tiny_acs(path: Path) -> None: + """Write a minimal ACS array H5 compatible with Stage 1 contracts.""" + + person_household_id = np.array([1, 1, 2], dtype=np.int64) + household_id = np.array([1, 2], dtype=np.int64) + + arrays = { + "person_id": np.array([1, 2, 3], dtype=np.int64), + "household_id": household_id, + "spm_unit_id": household_id, + "tax_unit_id": household_id, + "family_id": household_id, + "marital_unit_id": household_id, + "person_household_id": person_household_id, + "person_spm_unit_id": person_household_id, + "person_tax_unit_id": person_household_id, + "person_family_id": person_household_id, + "person_marital_unit_id": person_household_id, + "household_weight": np.array([120.0, 80.0], dtype=np.float32), + "age": np.array([40, 38, 10], dtype=np.int16), + "is_male": np.array([True, False, True], dtype=np.bool_), + "employment_income": np.array([55_000, 35_000, 0], dtype=np.float32), + "self_employment_income": np.array([0, 5_000, 0], dtype=np.float32), + "social_security": np.array([0, 0, 0], dtype=np.float32), + "taxable_private_pension_income": np.array([0, 0, 0], dtype=np.float32), + "is_household_head": np.array([True, False, True], dtype=np.bool_), + "rent": np.array([0, 0, 14_400], dtype=np.float32), + "real_estate_taxes": np.array([2_400, 0, 0], dtype=np.float32), + "tenure_type": np.array([b"OWNED_WITH_MORTGAGE", b"RENTED"]), + "household_vehicles_owned": np.array([2, 1], dtype=np.int16), + "state_fips": np.array([37, 37], dtype=np.int16), + "household_state_fips": np.array([37, 37], dtype=np.int16), + } + + with h5py.File(path, mode="w") as h5: + h5.attrs["fixture_scale"] = True + h5.attrs["source"] = "tests.support.tiny_stage_1" + for name, values in arrays.items(): + h5.create_dataset(name, data=values) + + +def write_tiny_irs_puf(path: Path) -> None: + """Write minimal raw IRS PUF tables with production table names.""" + + puf = pd.DataFrame( + { + "RECID": [1001, 1002, 1003], + "S006": [12_000, 8_000, 5_000], + "MARS": [2, 1, 4], + "DSI": [0, 0, 0], + "EIC": [0, 1, 0], + "XTOT": [2, 1, 3], + "E00200": [90_000, 45_000, 30_000], + **{column: [0, 0, 0] for column in _PUF_ZERO_COLUMNS}, + } + ) + demographics = pd.DataFrame( + { + "RECID": [1001, 
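+            # Same three RECIDs as the core puf table above, so Stage 2's
+            # one-to-one merge on RECID has an exact match for every record.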
1002, 1003], + "AGEDP1": [0, 0, 1], + "AGEDP2": [0, 0, 0], + "AGEDP3": [0, 0, 0], + "AGERANGE": [4, 3, 5], + "EARNSPLIT": [2, 0, 0], + "GENDER": [1, 2, 2], + } + ) + + with pd.HDFStore(path, mode="w") as store: + store.put("puf", puf, format="table") + store.put("puf_demographics", demographics, format="table") diff --git a/tests/unit/test_tiny_stage_1_artifacts.py b/tests/unit/test_tiny_stage_1_artifacts.py new file mode 100644 index 000000000..34236c04c --- /dev/null +++ b/tests/unit/test_tiny_stage_1_artifacts.py @@ -0,0 +1,69 @@ +import h5py +import pandas as pd + +from tests.support.pipeline_workspace import TinyPipelineWorkspace +from tests.support.tiny_stage_1 import ( + ACS_HOUSEHOLD_ARRAYS, + ACS_PERSON_ARRAYS, + PUF_CORE_COLUMNS, + PUF_DEMOGRAPHIC_COLUMNS, + UPRATING_VARIABLES, + UPRATING_YEARS, + create_stage_1_artifacts, +) + + +def test_create_stage_1_artifacts_writes_declared_workspace_outputs(tmp_path): + workspace = TinyPipelineWorkspace.create(tmp_path / "tiny-pipeline") + + artifacts = create_stage_1_artifacts(workspace) + + assert artifacts.as_tuple() == workspace.expected_artifacts("stage_1") + assert all(path.exists() for path in artifacts.as_tuple()) + + +def test_tiny_uprating_factors_have_production_table_shape(tmp_path): + workspace = TinyPipelineWorkspace.create(tmp_path / "tiny-pipeline") + artifacts = create_stage_1_artifacts(workspace) + + factors = pd.read_csv(artifacts.uprating_factors_path, index_col="Variable") + + assert tuple(factors.index) == UPRATING_VARIABLES + assert tuple(map(int, factors.columns)) == UPRATING_YEARS + assert factors[["2020", "2024", "2034"]].notna().all().all() + assert (factors["2020"] == 1.0).all() + assert ( + factors.loc["employment_income", "2034"] + > factors.loc["employment_income", "2020"] + ) + + +def test_tiny_acs_artifact_has_minimal_array_contract(tmp_path): + workspace = TinyPipelineWorkspace.create(tmp_path / "tiny-pipeline") + artifacts = create_stage_1_artifacts(workspace) + + with h5py.File(artifacts.acs_path, mode="r") as acs: + assert set(ACS_PERSON_ARRAYS).issubset(acs.keys()) + assert set(ACS_HOUSEHOLD_ARRAYS).issubset(acs.keys()) + assert bool(acs.attrs["fixture_scale"]) is True + assert len(acs["person_id"]) == 3 + assert len(acs["person_household_id"]) == 3 + assert len(acs["household_id"]) == 2 + assert len(acs["household_weight"]) == 2 + assert acs["tenure_type"].dtype.kind == "S" + + +def test_tiny_irs_puf_artifact_has_raw_table_contract(tmp_path): + workspace = TinyPipelineWorkspace.create(tmp_path / "tiny-pipeline") + artifacts = create_stage_1_artifacts(workspace) + + with pd.HDFStore(artifacts.irs_puf_path, mode="r") as store: + assert set(store.keys()) == {"/puf", "/puf_demographics"} + puf = store["puf"] + demographics = store["puf_demographics"] + + assert set(PUF_CORE_COLUMNS).issubset(puf.columns) + assert set(PUF_DEMOGRAPHIC_COLUMNS).issubset(demographics.columns) + assert len(puf) == 3 + assert len(demographics) == 3 + assert set(puf["RECID"]) == set(demographics["RECID"]) From aec326285f60e53d28636d1e9fe20ca0b4e3d8b4 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Wed, 29 Apr 2026 18:48:04 +0200 Subject: [PATCH 15/25] Add fixture-backed Stage 2 artifacts --- tests/support/tiny_stage_2.py | 334 ++++++++++++++++++++++ tests/unit/test_tiny_stage_2_artifacts.py | 112 ++++++++ 2 files changed, 446 insertions(+) create mode 100644 tests/support/tiny_stage_2.py create mode 100644 tests/unit/test_tiny_stage_2_artifacts.py diff --git a/tests/support/tiny_stage_2.py b/tests/support/tiny_stage_2.py new 
file mode 100644 index 000000000..681fb8c54 --- /dev/null +++ b/tests/support/tiny_stage_2.py @@ -0,0 +1,334 @@ +"""Fixture-backed Stage 2 artifacts for tiny pipeline integration tests.""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Iterable + +import h5py +import numpy as np +import pandas as pd + +from tests.support.pipeline_workspace import TinyPipelineWorkspace + +__test__ = False + + +STAGE_2_PERIOD = 2024 +PERIOD_KEY = str(STAGE_2_PERIOD) + +ID_VARIABLES = ( + "person_id", + "tax_unit_id", + "marital_unit_id", + "spm_unit_id", + "family_id", + "household_id", + "person_tax_unit_id", + "person_marital_unit_id", + "person_spm_unit_id", + "person_family_id", + "person_household_id", +) + +PERSON_LEVEL_VARIABLES = ( + "person_id", + "person_tax_unit_id", + "person_marital_unit_id", + "person_spm_unit_id", + "person_family_id", + "person_household_id", + "age", + "is_male", + "employment_income", + "self_employment_income", + "social_security", + "taxable_private_pension_income", + "taxable_interest_income", + "tax_exempt_interest_income", + "qualified_dividend_income", + "non_qualified_dividend_income", + "rent", + "real_estate_taxes", + "deductible_mortgage_interest", + "is_tax_unit_head", + "is_tax_unit_spouse", + "is_tax_unit_dependent", +) + +GROUP_LEVEL_VARIABLES = ( + "tax_unit_id", + "marital_unit_id", + "spm_unit_id", + "family_id", + "household_id", + "household_weight", + "filing_status", + "state_fips", + "household_state_fips", + "tenure_type", + "household_vehicles_owned", +) + +CPS_REQUIRED_VARIABLES = tuple( + dict.fromkeys((*ID_VARIABLES, *PERSON_LEVEL_VARIABLES, *GROUP_LEVEL_VARIABLES)) +) +PUF_REQUIRED_VARIABLES = CPS_REQUIRED_VARIABLES + + +@dataclass(frozen=True) +class Stage2Artifacts: + """Paths written by the fixture-backed Stage 2 builder.""" + + cps_path: Path + puf_path: Path + + def as_tuple(self) -> tuple[Path, Path]: + return (self.cps_path, self.puf_path) + + +def create_stage_2_artifacts(workspace: TinyPipelineWorkspace) -> Stage2Artifacts: + """Write deterministic CPS and PUF artifacts from Stage 1 inputs.""" + + stage_1 = _stage_1_paths(workspace) + _require_paths(stage_1.values()) + + artifacts = Stage2Artifacts( + cps_path=workspace.artifact_path("stage_2", "cps_2024.h5"), + puf_path=workspace.artifact_path("stage_2", "puf_2024.h5"), + ) + + uprating = pd.read_csv(stage_1["uprating"], index_col="Variable") + write_tiny_cps( + artifacts.cps_path, + acs_path=stage_1["acs"], + uprating=uprating, + ) + write_tiny_puf( + artifacts.puf_path, + irs_puf_path=stage_1["irs_puf"], + uprating=uprating, + ) + + return artifacts + + +def write_tiny_cps( + path: Path, + *, + acs_path: Path, + uprating: pd.DataFrame, +) -> None: + """Create a tiny CPS-like array dataset from the tiny ACS artifact.""" + + with h5py.File(acs_path, mode="r") as acs: + employment_growth = _uprating_factor(uprating, "employment_income") + self_employment_growth = _uprating_factor(uprating, "self_employment_income") + social_security_growth = _uprating_factor(uprating, "social_security") + weight_growth = _uprating_factor(uprating, "household_weight") + + person_household_id = acs["person_household_id"][:].astype(np.int64) + household_id = acs["household_id"][:].astype(np.int64) + person_count = len(person_household_id) + household_count = len(household_id) + + arrays = { + "person_id": acs["person_id"][:].astype(np.int64), + "tax_unit_id": household_id, + "marital_unit_id": household_id, + "spm_unit_id": 
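+        # As with the tax and marital units above, SPM units and families
+        # collapse onto the household in the tiny CPS: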
household_id, + "family_id": household_id, + "household_id": household_id, + "person_tax_unit_id": person_household_id, + "person_marital_unit_id": person_household_id, + "person_spm_unit_id": person_household_id, + "person_family_id": person_household_id, + "person_household_id": person_household_id, + "household_weight": acs["household_weight"][:] * weight_growth, + "age": acs["age"][:], + "is_male": acs["is_male"][:], + "employment_income": acs["employment_income"][:] * employment_growth, + "self_employment_income": ( + acs["self_employment_income"][:] * self_employment_growth + ), + "social_security": acs["social_security"][:] * social_security_growth, + "taxable_private_pension_income": acs["taxable_private_pension_income"][:], + "taxable_interest_income": np.array([100, 50, 0], dtype=np.float32), + "tax_exempt_interest_income": np.array([25, 0, 0], dtype=np.float32), + "qualified_dividend_income": np.array([40, 10, 0], dtype=np.float32), + "non_qualified_dividend_income": np.array([10, 5, 0], dtype=np.float32), + "rent": acs["rent"][:], + "real_estate_taxes": acs["real_estate_taxes"][:], + "deductible_mortgage_interest": np.array([1_800, 0, 0], dtype=np.float32), + "is_tax_unit_head": np.array([True, False, True], dtype=np.bool_), + "is_tax_unit_spouse": np.array([False, True, False], dtype=np.bool_), + "is_tax_unit_dependent": np.array([False, False, True], dtype=np.bool_), + "filing_status": np.array([b"JOINT", b"HEAD_OF_HOUSEHOLD"]), + "state_fips": acs["state_fips"][:].astype(np.int32), + "household_state_fips": acs["household_state_fips"][:].astype(np.int32), + "tenure_type": acs["tenure_type"][:], + "household_vehicles_owned": acs["household_vehicles_owned"][:], + } + + _assert_lengths( + arrays, + person_count=person_count, + household_count=household_count, + ) + _write_period_h5( + path, + arrays, + attrs={ + "fixture_scale": True, + "source": "tests.support.tiny_stage_2", + "source_stage_1_acs": acs_path.name, + "time_period": STAGE_2_PERIOD, + }, + ) + + +def write_tiny_puf( + path: Path, + *, + irs_puf_path: Path, + uprating: pd.DataFrame, +) -> None: + """Create a tiny PUF-like array dataset from the tiny raw IRS PUF.""" + + with pd.HDFStore(irs_puf_path, mode="r") as store: + puf = store["puf"] + demographics = store["puf_demographics"] + + raw = puf.merge(demographics, on="RECID", validate="one_to_one") + record_id = raw["RECID"].to_numpy(dtype=np.int64) + person_id = record_id * 100 + 1 + employment_growth = _uprating_factor(uprating, "employment_income") + weight_growth = _uprating_factor(uprating, "household_weight") + person_count = len(raw) + household_count = len(raw) + + arrays = { + "person_id": person_id, + "tax_unit_id": record_id, + "marital_unit_id": person_id, + "spm_unit_id": record_id, + "family_id": record_id, + "household_id": record_id, + "person_tax_unit_id": record_id, + "person_marital_unit_id": person_id, + "person_spm_unit_id": record_id, + "person_family_id": record_id, + "person_household_id": record_id, + "household_weight": raw["S006"].to_numpy(dtype=np.float32) + / 100 + * weight_growth, + "age": _decode_age_range(raw["AGERANGE"].to_numpy(dtype=np.int16)), + "is_male": raw["GENDER"].to_numpy(dtype=np.int16) == 1, + "employment_income": raw["E00200"].to_numpy(dtype=np.float32) + * employment_growth, + "self_employment_income": raw["E00900"].to_numpy(dtype=np.float32), + "social_security": raw["E02400"].to_numpy(dtype=np.float32), + "taxable_private_pension_income": raw["E01700"].to_numpy(dtype=np.float32), + "taxable_interest_income": 
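+        # Raw PUF E-codes map onto model variables here: E00300/E00400 are
+        # taxable/tax-exempt interest, and qualified dividends (E00650) are a
+        # subset of total dividends (E00600), hence the subtraction below.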
raw["E00300"].to_numpy(dtype=np.float32), + "tax_exempt_interest_income": raw["E00400"].to_numpy(dtype=np.float32), + "qualified_dividend_income": raw["E00650"].to_numpy(dtype=np.float32), + "non_qualified_dividend_income": ( + raw["E00600"].to_numpy(dtype=np.float32) + - raw["E00650"].to_numpy(dtype=np.float32) + ), + "rent": np.zeros(person_count, dtype=np.float32), + "real_estate_taxes": raw["E18500"].to_numpy(dtype=np.float32), + "deductible_mortgage_interest": raw["E19200"].to_numpy(dtype=np.float32), + "is_tax_unit_head": np.ones(person_count, dtype=np.bool_), + "is_tax_unit_spouse": np.zeros(person_count, dtype=np.bool_), + "is_tax_unit_dependent": np.zeros(person_count, dtype=np.bool_), + "filing_status": _filing_status(raw["MARS"].to_numpy(dtype=np.int16)), + "state_fips": np.array([37, 6, 48], dtype=np.int32), + "household_state_fips": np.array([37, 6, 48], dtype=np.int32), + "tenure_type": np.array([b"OWNED_WITH_MORTGAGE", b"RENTED", b"NONE"]), + "household_vehicles_owned": np.array([2, 1, 0], dtype=np.int16), + } + + _assert_lengths( + arrays, + person_count=person_count, + household_count=household_count, + ) + _write_period_h5( + path, + arrays, + attrs={ + "fixture_scale": True, + "source": "tests.support.tiny_stage_2", + "source_stage_1_irs_puf": irs_puf_path.name, + "time_period": STAGE_2_PERIOD, + }, + ) + + +def _stage_1_paths(workspace: TinyPipelineWorkspace) -> dict[str, Path]: + return { + "uprating": workspace.stage_1 / "uprating_factors.csv", + "acs": workspace.stage_1 / "acs_2022.h5", + "irs_puf": workspace.stage_1 / "irs_puf_2015.h5", + } + + +def _require_paths(paths: Iterable[Path]) -> None: + missing = [path for path in paths if not path.exists()] + if missing: + missing_list = ", ".join(str(path) for path in missing) + raise FileNotFoundError(f"Missing Stage 1 artifact(s): {missing_list}") + + +def _uprating_factor(uprating: pd.DataFrame, variable: str) -> float: + return float(uprating.loc[variable, PERIOD_KEY]) + + +def _write_period_h5( + path: Path, + arrays: dict[str, np.ndarray], + *, + attrs: dict[str, object], +) -> None: + with h5py.File(path, mode="w") as h5: + for key, value in attrs.items(): + h5.attrs[key] = value + for variable, values in arrays.items(): + h5.create_group(variable).create_dataset(PERIOD_KEY, data=values) + + +def _assert_lengths( + arrays: dict[str, np.ndarray], + *, + person_count: int, + household_count: int, +) -> None: + for variable in PERSON_LEVEL_VARIABLES: + assert len(arrays[variable]) == person_count, variable + for variable in GROUP_LEVEL_VARIABLES: + assert len(arrays[variable]) == household_count, variable + + +def _decode_age_range(age_range: np.ndarray) -> np.ndarray: + age_by_range = { + 1: 18, + 2: 26, + 3: 35, + 4: 45, + 5: 55, + 6: 65, + 7: 80, + } + return np.array([age_by_range.get(int(value), 40) for value in age_range]) + + +def _filing_status(mars: np.ndarray) -> np.ndarray: + status_by_mars = { + 1: b"SINGLE", + 2: b"JOINT", + 3: b"SEPARATE", + 4: b"HEAD_OF_HOUSEHOLD", + } + return np.array([status_by_mars[int(value)] for value in mars]) diff --git a/tests/unit/test_tiny_stage_2_artifacts.py b/tests/unit/test_tiny_stage_2_artifacts.py new file mode 100644 index 000000000..9a7ccdb11 --- /dev/null +++ b/tests/unit/test_tiny_stage_2_artifacts.py @@ -0,0 +1,112 @@ +import h5py +import numpy as np +import pandas as pd +import pytest + +from tests.support.pipeline_workspace import TinyPipelineWorkspace +from tests.support.tiny_stage_1 import create_stage_1_artifacts +from tests.support.tiny_stage_2 import ( + 
CPS_REQUIRED_VARIABLES, + PERIOD_KEY, + PUF_REQUIRED_VARIABLES, + STAGE_2_PERIOD, + create_stage_2_artifacts, +) + + +def _load_period_arrays(path): + with h5py.File(path, mode="r") as h5: + return {name: h5[name][PERIOD_KEY][:] for name in h5.keys()} + + +def _assert_period_grouped_contract(path, required_variables): + with h5py.File(path, mode="r") as h5: + assert bool(h5.attrs["fixture_scale"]) is True + assert h5.attrs["time_period"] == STAGE_2_PERIOD + assert set(required_variables).issubset(h5.keys()) + for variable in required_variables: + assert PERIOD_KEY in h5[variable] + + +def _assert_identity_contract(arrays): + assert len(np.unique(arrays["person_id"])) == len(arrays["person_id"]) + assert set(arrays["person_household_id"]).issubset(set(arrays["household_id"])) + assert set(arrays["person_tax_unit_id"]).issubset(set(arrays["tax_unit_id"])) + assert set(arrays["person_spm_unit_id"]).issubset(set(arrays["spm_unit_id"])) + assert (arrays["household_weight"] > 0).all() + + +def test_create_stage_2_artifacts_requires_stage_1_outputs(tmp_path): + workspace = TinyPipelineWorkspace.create(tmp_path / "tiny-pipeline") + + with pytest.raises(FileNotFoundError, match="Missing Stage 1 artifact"): + create_stage_2_artifacts(workspace) + + +def test_create_stage_2_artifacts_writes_declared_workspace_outputs(tmp_path): + workspace = TinyPipelineWorkspace.create(tmp_path / "tiny-pipeline") + create_stage_1_artifacts(workspace) + + artifacts = create_stage_2_artifacts(workspace) + + assert artifacts.as_tuple() == workspace.expected_artifacts("stage_2") + assert all(path.exists() for path in artifacts.as_tuple()) + + +def test_tiny_cps_artifact_has_period_grouped_array_contract(tmp_path): + workspace = TinyPipelineWorkspace.create(tmp_path / "tiny-pipeline") + create_stage_1_artifacts(workspace) + artifacts = create_stage_2_artifacts(workspace) + + _assert_period_grouped_contract(artifacts.cps_path, CPS_REQUIRED_VARIABLES) + arrays = _load_period_arrays(artifacts.cps_path) + + assert len(arrays["person_id"]) == 3 + assert len(arrays["household_id"]) == 2 + assert len(arrays["household_weight"]) == 2 + assert arrays["filing_status"].dtype.kind == "S" + _assert_identity_contract(arrays) + + +def test_tiny_puf_artifact_has_period_grouped_array_contract(tmp_path): + workspace = TinyPipelineWorkspace.create(tmp_path / "tiny-pipeline") + create_stage_1_artifacts(workspace) + artifacts = create_stage_2_artifacts(workspace) + + _assert_period_grouped_contract(artifacts.puf_path, PUF_REQUIRED_VARIABLES) + arrays = _load_period_arrays(artifacts.puf_path) + + assert len(arrays["person_id"]) == 3 + assert len(arrays["household_id"]) == 3 + assert len(arrays["household_weight"]) == 3 + assert arrays["filing_status"].dtype.kind == "S" + _assert_identity_contract(arrays) + + +def test_stage_2_cps_uses_stage_1_acs_and_uprating_inputs(tmp_path): + workspace = TinyPipelineWorkspace.create(tmp_path / "tiny-pipeline") + stage_1 = create_stage_1_artifacts(workspace) + artifacts = create_stage_2_artifacts(workspace) + + factors = pd.read_csv(stage_1.uprating_factors_path, index_col="Variable") + expected_growth = factors.loc["employment_income", PERIOD_KEY] + with h5py.File(stage_1.acs_path, mode="r") as acs: + expected = acs["employment_income"][:] * expected_growth + + arrays = _load_period_arrays(artifacts.cps_path) + np.testing.assert_allclose(arrays["employment_income"], expected) + + +def test_stage_2_puf_uses_stage_1_raw_puf_and_uprating_inputs(tmp_path): + workspace = TinyPipelineWorkspace.create(tmp_path / 
"tiny-pipeline") + stage_1 = create_stage_1_artifacts(workspace) + artifacts = create_stage_2_artifacts(workspace) + + factors = pd.read_csv(stage_1.uprating_factors_path, index_col="Variable") + expected_growth = factors.loc["employment_income", PERIOD_KEY] + with pd.HDFStore(stage_1.irs_puf_path, mode="r") as store: + raw_puf = store["puf"] + expected = raw_puf["E00200"].to_numpy(dtype=np.float32) * expected_growth + + arrays = _load_period_arrays(artifacts.puf_path) + np.testing.assert_allclose(arrays["employment_income"], expected) From 301be103ba5899765441b329a2a202adcf63d84e Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Wed, 29 Apr 2026 18:56:17 +0200 Subject: [PATCH 16/25] Add fixture-backed Stage 3 artifact --- tests/support/tiny_stage_3.py | 330 ++++++++++++++++++++++ tests/unit/test_tiny_stage_3_artifacts.py | 110 ++++++++ 2 files changed, 440 insertions(+) create mode 100644 tests/support/tiny_stage_3.py create mode 100644 tests/unit/test_tiny_stage_3_artifacts.py diff --git a/tests/support/tiny_stage_3.py b/tests/support/tiny_stage_3.py new file mode 100644 index 000000000..22f4a040a --- /dev/null +++ b/tests/support/tiny_stage_3.py @@ -0,0 +1,330 @@ +"""Fixture-backed Stage 3 artifacts for tiny pipeline integration tests.""" + +from __future__ import annotations + +import hashlib +from dataclasses import dataclass +from pathlib import Path +from typing import Iterable + +import h5py +import numpy as np + +from tests.support.pipeline_workspace import TinyPipelineWorkspace +from tests.support.tiny_stage_2 import ( + GROUP_LEVEL_VARIABLES, + PERIOD_KEY, + PERSON_LEVEL_VARIABLES, + STAGE_2_PERIOD, +) + +__test__ = False + + +STAGE_3_PERIOD = STAGE_2_PERIOD + +STAGE_3_PERSON_VARIABLES = tuple( + dict.fromkeys( + ( + *PERSON_LEVEL_VARIABLES, + "employment_income_before_lsr", + "pre_tax_contributions", + "weekly_hours_worked", + "hours_worked_last_week", + "is_hispanic", + "cps_race", + "detailed_occupation_recode", + "treasury_tipped_occupation_code", + "is_puf_clone", + ) + ) +) + +STAGE_3_GROUP_VARIABLES = tuple( + dict.fromkeys( + ( + *GROUP_LEVEL_VARIABLES, + "tax_unit_count_dependents", + "tax_unit_is_joint", + "spm_unit_total_income_reported", + "spm_unit_net_income_reported", + "spm_unit_spm_threshold", + "spm_unit_capped_housing_subsidy_reported", + "snap_reported", + "tanf_reported", + "ssi_reported", + "household_is_puf_clone", + ) + ) +) + +EXTENDED_CPS_REQUIRED_VARIABLES = tuple( + dict.fromkeys((*STAGE_3_PERSON_VARIABLES, *STAGE_3_GROUP_VARIABLES)) +) + +STAGE_4_INPUT_VARIABLES = ( + "person_id", + "household_id", + "person_household_id", + "household_weight", + "employment_income", + "employment_income_before_lsr", + "self_employment_income", + "social_security", + "taxable_private_pension_income", + "state_fips", + "tax_unit_count_dependents", + "tax_unit_is_joint", + "spm_unit_total_income_reported", + "spm_unit_net_income_reported", + "spm_unit_spm_threshold", + "is_puf_clone", +) + + +@dataclass(frozen=True) +class Stage3Artifacts: + """Paths written by the fixture-backed Stage 3 builder.""" + + extended_cps_path: Path + + def as_tuple(self) -> tuple[Path]: + return (self.extended_cps_path,) + + +def create_stage_3_artifacts(workspace: TinyPipelineWorkspace) -> Stage3Artifacts: + """Write deterministic extended CPS artifact from Stage 2 outputs.""" + + stage_2 = _stage_2_paths(workspace) + _require_paths(stage_2.values()) + + artifacts = Stage3Artifacts( + extended_cps_path=workspace.artifact_path("stage_3", "extended_cps_2024.h5") + ) + 
write_tiny_extended_cps( + artifacts.extended_cps_path, + cps_path=stage_2["cps"], + puf_path=stage_2["puf"], + ) + + return artifacts + + +def write_tiny_extended_cps( + path: Path, + *, + cps_path: Path, + puf_path: Path, +) -> None: + """Create a tiny extended CPS by appending PUF clone rows to CPS rows.""" + + cps = _load_period_arrays(cps_path) + puf = _load_period_arrays(puf_path) + + arrays = _concatenate_common_arrays(cps, puf) + arrays.update( + _extended_person_arrays(arrays, cps_person_count=len(cps["person_id"])) + ) + arrays.update( + _extended_group_arrays( + arrays, + cps_household_count=len(cps["household_id"]), + ) + ) + + _assert_lengths(arrays) + _write_period_h5( + path, + arrays, + attrs={ + "fixture_scale": True, + "source": "tests.support.tiny_stage_3", + "source_stage_2_cps": cps_path.name, + "source_stage_2_puf": puf_path.name, + "time_period": STAGE_3_PERIOD, + }, + ) + + +def stage_3_artifact_digest(path: Path) -> str: + """Return a deterministic content digest for a Stage 3 H5 artifact.""" + + digest = hashlib.sha256() + with h5py.File(path, mode="r") as h5: + for key in sorted(h5.attrs): + digest.update(str(key).encode("utf-8")) + digest.update(str(h5.attrs[key]).encode("utf-8")) + for variable in sorted(h5.keys()): + values = h5[variable][PERIOD_KEY][:] + digest.update(variable.encode("utf-8")) + digest.update(str(values.dtype).encode("utf-8")) + digest.update(str(values.shape).encode("utf-8")) + digest.update(np.ascontiguousarray(values).tobytes()) + return digest.hexdigest() + + +def _stage_2_paths(workspace: TinyPipelineWorkspace) -> dict[str, Path]: + return { + "cps": workspace.stage_2 / "cps_2024.h5", + "puf": workspace.stage_2 / "puf_2024.h5", + } + + +def _require_paths(paths: Iterable[Path]) -> None: + missing = [path for path in paths if not path.exists()] + if missing: + missing_list = ", ".join(str(path) for path in missing) + raise FileNotFoundError(f"Missing Stage 2 artifact(s): {missing_list}") + + +def _load_period_arrays(path: Path) -> dict[str, np.ndarray]: + with h5py.File(path, mode="r") as h5: + return {variable: h5[variable][PERIOD_KEY][:] for variable in h5.keys()} + + +def _concatenate_common_arrays( + cps: dict[str, np.ndarray], + puf: dict[str, np.ndarray], +) -> dict[str, np.ndarray]: + return { + variable: np.concatenate([cps[variable], puf[variable]]) + for variable in sorted(set(cps) & set(puf)) + } + + +def _extended_person_arrays( + arrays: dict[str, np.ndarray], + *, + cps_person_count: int, +) -> dict[str, np.ndarray]: + person_count = len(arrays["person_id"]) + puf_person_count = person_count - cps_person_count + employment_income = arrays["employment_income"].astype(np.float32) + + return { + "employment_income_before_lsr": employment_income.copy(), + "pre_tax_contributions": np.round(employment_income * 0.04, 2).astype( + np.float32 + ), + "weekly_hours_worked": np.where(employment_income > 0, 40, 0).astype(np.int16), + "hours_worked_last_week": np.where(employment_income > 0, 40, 0).astype( + np.int16 + ), + "is_hispanic": _resize_pattern( + [False, True, False, False, False, True], + person_count, + dtype=np.bool_, + ), + "cps_race": _resize_pattern([1, 2, 1, 1, 1, 2], person_count, dtype=np.int16), + "detailed_occupation_recode": _resize_pattern( + [10, 20, 0, 30, 20, 10], + person_count, + dtype=np.int16, + ), + "treasury_tipped_occupation_code": _resize_pattern( + [0, 1, 0, 0, 0, 1], + person_count, + dtype=np.int16, + ), + "is_puf_clone": np.concatenate( + [ + np.zeros(cps_person_count, dtype=np.bool_), + 
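+                # CPS rows come first in every concatenated array, so the
+                # clone flag is simply zeros followed by ones in row order.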
np.ones(puf_person_count, dtype=np.bool_), + ] + ), + } + + +def _extended_group_arrays( + arrays: dict[str, np.ndarray], + *, + cps_household_count: int, +) -> dict[str, np.ndarray]: + household_count = len(arrays["household_id"]) + puf_household_count = household_count - cps_household_count + tax_unit_count_dependents = _count_dependents_by_tax_unit(arrays) + total_income = _sum_person_values_by_group( + group_ids=arrays["spm_unit_id"], + person_group_ids=arrays["person_spm_unit_id"], + person_values=( + arrays["employment_income"].astype(np.float32) + + arrays["self_employment_income"].astype(np.float32) + + arrays["social_security"].astype(np.float32) + ), + ) + + return { + "tax_unit_count_dependents": tax_unit_count_dependents, + "tax_unit_is_joint": arrays["filing_status"] == b"JOINT", + "spm_unit_total_income_reported": total_income.astype(np.float32), + "spm_unit_net_income_reported": np.round(total_income * 0.85, 2).astype( + np.float32 + ), + "spm_unit_spm_threshold": ( + 25_000 + tax_unit_count_dependents.astype(np.float32) * 5_000 + ).astype(np.float32), + "spm_unit_capped_housing_subsidy_reported": np.where( + arrays["tenure_type"] == b"RENTED", + 1_200, + 0, + ).astype(np.float32), + "snap_reported": np.where(total_income < 50_000, 1_000, 0).astype(np.float32), + "tanf_reported": np.zeros(household_count, dtype=np.float32), + "ssi_reported": np.zeros(household_count, dtype=np.float32), + "household_is_puf_clone": np.concatenate( + [ + np.zeros(cps_household_count, dtype=np.bool_), + np.ones(puf_household_count, dtype=np.bool_), + ] + ), + } + + +def _count_dependents_by_tax_unit(arrays: dict[str, np.ndarray]) -> np.ndarray: + dependents = arrays["is_tax_unit_dependent"].astype(bool) + return np.array( + [ + dependents[arrays["person_tax_unit_id"] == tax_unit_id].sum() + for tax_unit_id in arrays["tax_unit_id"] + ], + dtype=np.int16, + ) + + +def _sum_person_values_by_group( + *, + group_ids: np.ndarray, + person_group_ids: np.ndarray, + person_values: np.ndarray, +) -> np.ndarray: + return np.array( + [person_values[person_group_ids == group_id].sum() for group_id in group_ids], + dtype=np.float32, + ) + + +def _resize_pattern(values: list[object], length: int, *, dtype) -> np.ndarray: + repeats = int(np.ceil(length / len(values))) + return np.resize(np.array(values * repeats, dtype=dtype), length) + + +def _write_period_h5( + path: Path, + arrays: dict[str, np.ndarray], + *, + attrs: dict[str, object], +) -> None: + with h5py.File(path, mode="w") as h5: + for key, value in attrs.items(): + h5.attrs[key] = value + for variable in sorted(arrays): + h5.create_group(variable).create_dataset(PERIOD_KEY, data=arrays[variable]) + + +def _assert_lengths(arrays: dict[str, np.ndarray]) -> None: + person_count = len(arrays["person_id"]) + household_count = len(arrays["household_id"]) + for variable in STAGE_3_PERSON_VARIABLES: + assert len(arrays[variable]) == person_count, variable + for variable in STAGE_3_GROUP_VARIABLES: + assert len(arrays[variable]) == household_count, variable diff --git a/tests/unit/test_tiny_stage_3_artifacts.py b/tests/unit/test_tiny_stage_3_artifacts.py new file mode 100644 index 000000000..104218bb2 --- /dev/null +++ b/tests/unit/test_tiny_stage_3_artifacts.py @@ -0,0 +1,110 @@ +import h5py +import numpy as np +import pytest + +from tests.support.pipeline_workspace import TinyPipelineWorkspace +from tests.support.tiny_stage_1 import create_stage_1_artifacts +from tests.support.tiny_stage_2 import PERIOD_KEY, create_stage_2_artifacts +from 
tests.support.tiny_stage_3 import ( + EXTENDED_CPS_REQUIRED_VARIABLES, + STAGE_3_PERIOD, + STAGE_4_INPUT_VARIABLES, + create_stage_3_artifacts, + stage_3_artifact_digest, +) + + +def _load_period_arrays(path): + with h5py.File(path, mode="r") as h5: + return {name: h5[name][PERIOD_KEY][:] for name in h5.keys()} + + +def _create_stage_3_workspace(root): + workspace = TinyPipelineWorkspace.create(root / "tiny-pipeline") + create_stage_1_artifacts(workspace) + create_stage_2_artifacts(workspace) + return workspace + + +def test_create_stage_3_artifacts_requires_stage_2_outputs(tmp_path): + workspace = TinyPipelineWorkspace.create(tmp_path / "tiny-pipeline") + + with pytest.raises(FileNotFoundError, match="Missing Stage 2 artifact"): + create_stage_3_artifacts(workspace) + + +def test_create_stage_3_artifacts_writes_declared_workspace_output(tmp_path): + workspace = _create_stage_3_workspace(tmp_path) + + artifacts = create_stage_3_artifacts(workspace) + + assert artifacts.as_tuple() == workspace.expected_artifacts("stage_3") + assert artifacts.extended_cps_path.exists() + + +def test_tiny_extended_cps_has_required_period_grouped_variables(tmp_path): + workspace = _create_stage_3_workspace(tmp_path) + artifacts = create_stage_3_artifacts(workspace) + + with h5py.File(artifacts.extended_cps_path, mode="r") as extended: + assert bool(extended.attrs["fixture_scale"]) is True + assert extended.attrs["time_period"] == STAGE_3_PERIOD + assert set(EXTENDED_CPS_REQUIRED_VARIABLES).issubset(extended.keys()) + for variable in EXTENDED_CPS_REQUIRED_VARIABLES: + assert PERIOD_KEY in extended[variable] + + +def test_tiny_extended_cps_combines_cps_and_puf_rows(tmp_path): + workspace = _create_stage_3_workspace(tmp_path) + artifacts = create_stage_3_artifacts(workspace) + + arrays = _load_period_arrays(artifacts.extended_cps_path) + + assert len(arrays["person_id"]) == 6 + assert len(arrays["household_id"]) == 5 + assert len(np.unique(arrays["person_id"])) == 6 + assert len(np.unique(arrays["household_id"])) == 5 + assert arrays["is_puf_clone"].tolist() == [ + False, + False, + False, + True, + True, + True, + ] + assert arrays["household_is_puf_clone"].tolist() == [ + False, + False, + True, + True, + True, + ] + + +def test_tiny_extended_cps_derives_stage_4_contract_variables(tmp_path): + workspace = _create_stage_3_workspace(tmp_path) + artifacts = create_stage_3_artifacts(workspace) + + arrays = _load_period_arrays(artifacts.extended_cps_path) + + assert set(STAGE_4_INPUT_VARIABLES).issubset(arrays) + np.testing.assert_allclose( + arrays["employment_income_before_lsr"], + arrays["employment_income"], + ) + assert (arrays["pre_tax_contributions"] >= 0).all() + assert (arrays["spm_unit_total_income_reported"] >= 0).all() + assert (arrays["spm_unit_net_income_reported"] >= 0).all() + assert (arrays["spm_unit_spm_threshold"] > 0).all() + + +def test_tiny_extended_cps_digest_is_stable_for_same_inputs(tmp_path): + workspace_a = _create_stage_3_workspace(tmp_path / "a") + workspace_b = _create_stage_3_workspace(tmp_path / "b") + + artifact_a = create_stage_3_artifacts(workspace_a) + artifact_b = create_stage_3_artifacts(workspace_b) + + assert stage_3_artifact_digest( + artifact_a.extended_cps_path + ) == stage_3_artifact_digest(artifact_b.extended_cps_path) From db2b771a5cdf92505e7cb2c7f22e574a4f27b4ea Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Wed, 29 Apr 2026 19:02:07 +0200 Subject: [PATCH 17/25] Format tiny pipeline workspace helper --- tests/support/pipeline_workspace.py | 5 +---- 1 file changed, 1 
insertion(+), 4 deletions(-) diff --git a/tests/support/pipeline_workspace.py b/tests/support/pipeline_workspace.py index d8eb2e600..916e90eba 100644 --- a/tests/support/pipeline_workspace.py +++ b/tests/support/pipeline_workspace.py @@ -168,7 +168,4 @@ def expected_artifacts(self, stage: str) -> tuple[Path, ...]: def all_expected_artifacts(self) -> dict[str, tuple[Path, ...]]: """Return every currently defined expected artifact path by stage.""" - return { - stage: self.expected_artifacts(stage) - for stage in STAGE_ARTIFACTS - } + return {stage: self.expected_artifacts(stage) for stage in STAGE_ARTIFACTS} From 204066a1891d0b2747d9e2636e0105bf8db1cb5a Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Wed, 29 Apr 2026 19:11:52 +0200 Subject: [PATCH 18/25] Move tiny pipeline tests to integration --- tests/integration/support/__init__.py | 1 + tests/{ => integration}/support/pipeline_workspace.py | 0 tests/{ => integration}/support/tiny_stage_1.py | 4 ++-- tests/{ => integration}/support/tiny_stage_2.py | 6 +++--- tests/{ => integration}/support/tiny_stage_3.py | 6 +++--- .../{unit => integration}/test_tiny_pipeline_workspace.py | 2 +- .../{unit => integration}/test_tiny_stage_1_artifacts.py | 4 ++-- .../{unit => integration}/test_tiny_stage_2_artifacts.py | 6 +++--- .../{unit => integration}/test_tiny_stage_3_artifacts.py | 8 ++++---- 9 files changed, 19 insertions(+), 18 deletions(-) create mode 100644 tests/integration/support/__init__.py rename tests/{ => integration}/support/pipeline_workspace.py (100%) rename tests/{ => integration}/support/tiny_stage_1.py (97%) rename tests/{ => integration}/support/tiny_stage_2.py (98%) rename tests/{ => integration}/support/tiny_stage_3.py (98%) rename tests/{unit => integration}/test_tiny_pipeline_workspace.py (97%) rename tests/{unit => integration}/test_tiny_stage_1_artifacts.py (94%) rename tests/{unit => integration}/test_tiny_stage_2_artifacts.py (95%) rename tests/{unit => integration}/test_tiny_stage_3_artifacts.py (92%) diff --git a/tests/integration/support/__init__.py b/tests/integration/support/__init__.py new file mode 100644 index 000000000..5182418f1 --- /dev/null +++ b/tests/integration/support/__init__.py @@ -0,0 +1 @@ +"""Shared support helpers for fixture-scale integration tests.""" diff --git a/tests/support/pipeline_workspace.py b/tests/integration/support/pipeline_workspace.py similarity index 100% rename from tests/support/pipeline_workspace.py rename to tests/integration/support/pipeline_workspace.py diff --git a/tests/support/tiny_stage_1.py b/tests/integration/support/tiny_stage_1.py similarity index 97% rename from tests/support/tiny_stage_1.py rename to tests/integration/support/tiny_stage_1.py index e7ce068d1..117ba078d 100644 --- a/tests/support/tiny_stage_1.py +++ b/tests/integration/support/tiny_stage_1.py @@ -9,7 +9,7 @@ import numpy as np import pandas as pd -from tests.support.pipeline_workspace import TinyPipelineWorkspace +from tests.integration.support.pipeline_workspace import TinyPipelineWorkspace __test__ = False @@ -237,7 +237,7 @@ def write_tiny_acs(path: Path) -> None: with h5py.File(path, mode="w") as h5: h5.attrs["fixture_scale"] = True - h5.attrs["source"] = "tests.support.tiny_stage_1" + h5.attrs["source"] = "tests.integration.support.tiny_stage_1" for name, values in arrays.items(): h5.create_dataset(name, data=values) diff --git a/tests/support/tiny_stage_2.py b/tests/integration/support/tiny_stage_2.py similarity index 98% rename from tests/support/tiny_stage_2.py rename to 
tests/integration/support/tiny_stage_2.py index 681fb8c54..6daf0c422 100644 --- a/tests/support/tiny_stage_2.py +++ b/tests/integration/support/tiny_stage_2.py @@ -10,7 +10,7 @@ import numpy as np import pandas as pd -from tests.support.pipeline_workspace import TinyPipelineWorkspace +from tests.integration.support.pipeline_workspace import TinyPipelineWorkspace __test__ = False @@ -181,7 +181,7 @@ def write_tiny_cps( arrays, attrs={ "fixture_scale": True, - "source": "tests.support.tiny_stage_2", + "source": "tests.integration.support.tiny_stage_2", "source_stage_1_acs": acs_path.name, "time_period": STAGE_2_PERIOD, }, @@ -260,7 +260,7 @@ def write_tiny_puf( arrays, attrs={ "fixture_scale": True, - "source": "tests.support.tiny_stage_2", + "source": "tests.integration.support.tiny_stage_2", "source_stage_1_irs_puf": irs_puf_path.name, "time_period": STAGE_2_PERIOD, }, diff --git a/tests/support/tiny_stage_3.py b/tests/integration/support/tiny_stage_3.py similarity index 98% rename from tests/support/tiny_stage_3.py rename to tests/integration/support/tiny_stage_3.py index 22f4a040a..755f24853 100644 --- a/tests/support/tiny_stage_3.py +++ b/tests/integration/support/tiny_stage_3.py @@ -10,8 +10,8 @@ import h5py import numpy as np -from tests.support.pipeline_workspace import TinyPipelineWorkspace -from tests.support.tiny_stage_2 import ( +from tests.integration.support.pipeline_workspace import TinyPipelineWorkspace +from tests.integration.support.tiny_stage_2 import ( GROUP_LEVEL_VARIABLES, PERIOD_KEY, PERSON_LEVEL_VARIABLES, @@ -138,7 +138,7 @@ def write_tiny_extended_cps( arrays, attrs={ "fixture_scale": True, - "source": "tests.support.tiny_stage_3", + "source": "tests.integration.support.tiny_stage_3", "source_stage_2_cps": cps_path.name, "source_stage_2_puf": puf_path.name, "time_period": STAGE_3_PERIOD, diff --git a/tests/unit/test_tiny_pipeline_workspace.py b/tests/integration/test_tiny_pipeline_workspace.py similarity index 97% rename from tests/unit/test_tiny_pipeline_workspace.py rename to tests/integration/test_tiny_pipeline_workspace.py index da2aec7da..49b53ed56 100644 --- a/tests/unit/test_tiny_pipeline_workspace.py +++ b/tests/integration/test_tiny_pipeline_workspace.py @@ -2,7 +2,7 @@ import pytest -from tests.support.pipeline_workspace import ( +from tests.integration.support.pipeline_workspace import ( STAGE_ARTIFACTS, TinyPipelineWorkspace, ) diff --git a/tests/unit/test_tiny_stage_1_artifacts.py b/tests/integration/test_tiny_stage_1_artifacts.py similarity index 94% rename from tests/unit/test_tiny_stage_1_artifacts.py rename to tests/integration/test_tiny_stage_1_artifacts.py index 34236c04c..6a9bb8de2 100644 --- a/tests/unit/test_tiny_stage_1_artifacts.py +++ b/tests/integration/test_tiny_stage_1_artifacts.py @@ -1,8 +1,8 @@ import h5py import pandas as pd -from tests.support.pipeline_workspace import TinyPipelineWorkspace -from tests.support.tiny_stage_1 import ( +from tests.integration.support.pipeline_workspace import TinyPipelineWorkspace +from tests.integration.support.tiny_stage_1 import ( ACS_HOUSEHOLD_ARRAYS, ACS_PERSON_ARRAYS, PUF_CORE_COLUMNS, diff --git a/tests/unit/test_tiny_stage_2_artifacts.py b/tests/integration/test_tiny_stage_2_artifacts.py similarity index 95% rename from tests/unit/test_tiny_stage_2_artifacts.py rename to tests/integration/test_tiny_stage_2_artifacts.py index 9a7ccdb11..5179bb268 100644 --- a/tests/unit/test_tiny_stage_2_artifacts.py +++ b/tests/integration/test_tiny_stage_2_artifacts.py @@ -3,9 +3,9 @@ import pandas as pd import 
pytest -from tests.support.pipeline_workspace import TinyPipelineWorkspace -from tests.support.tiny_stage_1 import create_stage_1_artifacts -from tests.support.tiny_stage_2 import ( +from tests.integration.support.pipeline_workspace import TinyPipelineWorkspace +from tests.integration.support.tiny_stage_1 import create_stage_1_artifacts +from tests.integration.support.tiny_stage_2 import ( CPS_REQUIRED_VARIABLES, PERIOD_KEY, PUF_REQUIRED_VARIABLES, diff --git a/tests/unit/test_tiny_stage_3_artifacts.py b/tests/integration/test_tiny_stage_3_artifacts.py similarity index 92% rename from tests/unit/test_tiny_stage_3_artifacts.py rename to tests/integration/test_tiny_stage_3_artifacts.py index 104218bb2..f19ee029f 100644 --- a/tests/unit/test_tiny_stage_3_artifacts.py +++ b/tests/integration/test_tiny_stage_3_artifacts.py @@ -2,10 +2,10 @@ import numpy as np import pytest -from tests.support.pipeline_workspace import TinyPipelineWorkspace -from tests.support.tiny_stage_1 import create_stage_1_artifacts -from tests.support.tiny_stage_2 import PERIOD_KEY, create_stage_2_artifacts -from tests.support.tiny_stage_3 import ( +from tests.integration.support.pipeline_workspace import TinyPipelineWorkspace +from tests.integration.support.tiny_stage_1 import create_stage_1_artifacts +from tests.integration.support.tiny_stage_2 import PERIOD_KEY, create_stage_2_artifacts +from tests.integration.support.tiny_stage_3 import ( EXTENDED_CPS_REQUIRED_VARIABLES, STAGE_3_PERIOD, STAGE_4_INPUT_VARIABLES, From 010d1aba0eb24302f5b1c5e0e2b5b7eb0b0ddfa1 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Wed, 29 Apr 2026 19:32:57 +0200 Subject: [PATCH 19/25] Move target integration tests under integration --- .github/workflows/pr.yaml | 19 ++++++++++++++----- docs/engineering/skills/testing.md | 6 +++--- scripts/guards/test_layout.py | 5 ++--- tests/integration/local_h5/__init__.py | 1 + .../local_h5/fixtures.py | 2 +- .../test_modal_local_area_traceability.py | 5 ++++- .../local_h5/test_traceability_contract.py | 5 ++++- .../test_worker_script_tiny_fixture.py | 4 +++- .../test_modal_h5_pipeline_e2e.py | 2 +- .../test_modal_pipeline_seams.py | 2 +- tests/optimized/local_h5/__init__.py | 1 - 11 files changed, 34 insertions(+), 18 deletions(-) create mode 100644 tests/integration/local_h5/__init__.py rename tests/{optimized => integration}/local_h5/fixtures.py (98%) rename tests/{optimized => integration}/local_h5/test_modal_local_area_traceability.py (93%) rename tests/{optimized => integration}/local_h5/test_traceability_contract.py (95%) rename tests/{optimized => integration}/local_h5/test_worker_script_tiny_fixture.py (97%) rename tests/{optimized => integration}/test_modal_h5_pipeline_e2e.py (98%) rename tests/{optimized => integration}/test_modal_pipeline_seams.py (96%) delete mode 100644 tests/optimized/local_h5/__init__.py diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 9f712acc4..04cfcc64e 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -98,7 +98,7 @@ jobs: env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} - optimized-integration-tests: + target-integration-tests: runs-on: ubuntu-latest needs: [ @@ -119,7 +119,7 @@ jobs: MODAL_APP_NAME: policyengine-us-data-pipeline MODAL_LOCAL_AREA_APP_NAME: policyengine-us-data-local-area MODAL_H5_TEST_HARNESS_APP_NAME: policyengine-us-data-h5-test-harness - name: Optimized integration tests (PR staging) + name: Target integration tests steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 @@ -127,7 +127,7 @@ 
jobs: python-version: "3.14" - uses: astral-sh/setup-uv@v5 - run: uv sync --dev - - name: Install optimized test deps + - name: Install integration test deps run: uv pip install modal pytest numpy pandas - name: Ensure PR Modal environment exists run: uv run python .github/scripts/ensure_modal_environment.py @@ -139,8 +139,17 @@ jobs: run: uv run modal deploy --env="${MODAL_ENVIRONMENT}" modal_app/local_area.py - name: Deploy Modal H5 test harness to PR staging run: uv run modal deploy --env="${MODAL_ENVIRONMENT}" modal_app/h5_test_harness.py - - name: Run optimized integration tests against PR staging - run: uv run pytest tests/optimized/ -v + - name: Run target integration tests + run: > + uv run pytest + tests/integration/test_tiny_pipeline_workspace.py + tests/integration/test_tiny_stage_1_artifacts.py + tests/integration/test_tiny_stage_2_artifacts.py + tests/integration/test_tiny_stage_3_artifacts.py + tests/integration/local_h5/ + tests/integration/test_modal_pipeline_seams.py + tests/integration/test_modal_h5_pipeline_e2e.py + -v - name: Cleanup PR Modal environment if: always() run: uv run python .github/scripts/delete_modal_environment.py diff --git a/docs/engineering/skills/testing.md b/docs/engineering/skills/testing.md index 69a4d71e9..fc562a377 100644 --- a/docs/engineering/skills/testing.md +++ b/docs/engineering/skills/testing.md @@ -5,8 +5,8 @@ Use this skill whenever adding, moving, or reviewing tests. ## Canonical Layout - Put unit tests under `tests/unit/`. -- Put data-dependent or runtime integration tests under `tests/integration/`. -- Put deployed Modal/staging tests under `tests/optimized/`. +- Put data-dependent, runtime, deployed Modal, and staging integration tests + under `tests/integration/`. - Do not add pytest files under `policyengine_us_data/tests/`; CI does not collect that tree. @@ -46,7 +46,7 @@ python scripts/run_quality_guards.py The current guard enforces: - No package-internal pytest files under `policyengine_us_data/tests/`. -- No pytest files outside the approved top-level test lanes. +- No pytest files outside `tests/unit/` and `tests/integration/`. - No imports from `tests.conftest`. - No imports across test lanes. diff --git a/scripts/guards/test_layout.py b/scripts/guards/test_layout.py index aef7a425f..f2d985f65 100644 --- a/scripts/guards/test_layout.py +++ b/scripts/guards/test_layout.py @@ -11,7 +11,6 @@ TEST_LANES = { "tests/unit": Path("tests/unit"), "tests/integration": Path("tests/integration"), - "tests/optimized": Path("tests/optimized"), } ALLOWED_TEST_ROOTS = tuple(TEST_LANES.values()) PYTEST_FILE_PREFIX = "test_" @@ -83,7 +82,7 @@ def _check_test_placement(files: list[Path]) -> list[str]: if _is_under(path, Path("policyengine_us_data/tests")): violations.append( f"{path}: package-internal tests are not collected by CI; " - "move tests under tests/unit, tests/integration, or tests/optimized." + "move tests under tests/unit or tests/integration." ) continue @@ -91,7 +90,7 @@ def _check_test_placement(files: list[Path]) -> list[str]: if not any(_is_under(path, root) for root in ALLOWED_TEST_ROOTS): violations.append( f"{path}: pytest files under tests/ must live under " - "tests/unit, tests/integration, or tests/optimized." + "tests/unit or tests/integration." 
) return violations diff --git a/tests/integration/local_h5/__init__.py b/tests/integration/local_h5/__init__.py new file mode 100644 index 000000000..e911efbea --- /dev/null +++ b/tests/integration/local_h5/__init__.py @@ -0,0 +1 @@ +"""Local H5 integration tests.""" diff --git a/tests/optimized/local_h5/fixtures.py b/tests/integration/local_h5/fixtures.py similarity index 98% rename from tests/optimized/local_h5/fixtures.py rename to tests/integration/local_h5/fixtures.py index 7918f9cd0..8e71dc0cd 100644 --- a/tests/optimized/local_h5/fixtures.py +++ b/tests/integration/local_h5/fixtures.py @@ -1,4 +1,4 @@ -"""Shared tiny-artifact fixtures for optimized local H5 integration tests.""" +"""Shared tiny-artifact fixtures for local H5 integration tests.""" from __future__ import annotations diff --git a/tests/optimized/local_h5/test_modal_local_area_traceability.py b/tests/integration/local_h5/test_modal_local_area_traceability.py similarity index 93% rename from tests/optimized/local_h5/test_modal_local_area_traceability.py rename to tests/integration/local_h5/test_modal_local_area_traceability.py index a943b8dcf..6c6f22dc1 100644 --- a/tests/optimized/local_h5/test_modal_local_area_traceability.py +++ b/tests/integration/local_h5/test_modal_local_area_traceability.py @@ -1,10 +1,13 @@ from policyengine_us_data.calibration.local_h5.fingerprinting import ( FingerprintingService, ) +import pytest -from tests.optimized.local_h5.fixtures import SEED, VERSION, seed_local_h5_artifacts +from tests.integration.local_h5.fixtures import SEED, VERSION, seed_local_h5_artifacts from tests.support.modal_local_area import load_local_area_module +pytestmark = pytest.mark.integration + def test_local_area_helpers_match_publish_traceability_contract(tmp_path): local_area = load_local_area_module(stub_policyengine=False) diff --git a/tests/optimized/local_h5/test_traceability_contract.py b/tests/integration/local_h5/test_traceability_contract.py similarity index 95% rename from tests/optimized/local_h5/test_traceability_contract.py rename to tests/integration/local_h5/test_traceability_contract.py index b6b599082..e1187cc3d 100644 --- a/tests/optimized/local_h5/test_traceability_contract.py +++ b/tests/integration/local_h5/test_traceability_contract.py @@ -2,8 +2,11 @@ FingerprintingService, PublishingInputBundle, ) +import pytest -from tests.optimized.local_h5.fixtures import SEED, VERSION, seed_local_h5_artifacts +from tests.integration.local_h5.fixtures import SEED, VERSION, seed_local_h5_artifacts + +pytestmark = pytest.mark.integration def _fingerprint_for(*, inputs, scope: str = "regional") -> str: diff --git a/tests/optimized/local_h5/test_worker_script_tiny_fixture.py b/tests/integration/local_h5/test_worker_script_tiny_fixture.py similarity index 97% rename from tests/optimized/local_h5/test_worker_script_tiny_fixture.py rename to tests/integration/local_h5/test_worker_script_tiny_fixture.py index 6ad30fb4e..aeab4b22c 100644 --- a/tests/optimized/local_h5/test_worker_script_tiny_fixture.py +++ b/tests/integration/local_h5/test_worker_script_tiny_fixture.py @@ -7,11 +7,13 @@ import pytest -from tests.optimized.local_h5.fixtures import ( +from tests.integration.local_h5.fixtures import ( build_request, seed_local_h5_artifacts, ) +pytestmark = pytest.mark.integration + pytest.importorskip("scipy") pytest.importorskip("spm_calculator") diff --git a/tests/optimized/test_modal_h5_pipeline_e2e.py b/tests/integration/test_modal_h5_pipeline_e2e.py similarity index 98% rename from 
tests/optimized/test_modal_h5_pipeline_e2e.py rename to tests/integration/test_modal_h5_pipeline_e2e.py index d5b931e8a..c7109aafe 100644 --- a/tests/optimized/test_modal_h5_pipeline_e2e.py +++ b/tests/integration/test_modal_h5_pipeline_e2e.py @@ -26,7 +26,7 @@ def _require_modal_tokens() -> None: if not (os.environ.get("MODAL_TOKEN_ID") and os.environ.get("MODAL_TOKEN_SECRET")): - pytest.skip("Modal credentials are required for optimized H5 tests") + pytest.skip("Modal credentials are required for deployed H5 integration tests") def _function(app_name: str, function_name: str): diff --git a/tests/optimized/test_modal_pipeline_seams.py b/tests/integration/test_modal_pipeline_seams.py similarity index 96% rename from tests/optimized/test_modal_pipeline_seams.py rename to tests/integration/test_modal_pipeline_seams.py index 6c5b49b77..ac13629cc 100644 --- a/tests/optimized/test_modal_pipeline_seams.py +++ b/tests/integration/test_modal_pipeline_seams.py @@ -1,4 +1,4 @@ -"""Optimized integration tests for the deployed Modal pipeline app. +"""Integration tests for the deployed Modal pipeline app. These tests focus on image/runtime seams rather than the full data build. They verify that the deployed pipeline image can boot, import critical diff --git a/tests/optimized/local_h5/__init__.py b/tests/optimized/local_h5/__init__.py deleted file mode 100644 index 99c72228c..000000000 --- a/tests/optimized/local_h5/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Optimized local H5 integration tests.""" From fa296df3ddc718f0e896b1a54a572bb643164827 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Wed, 29 Apr 2026 20:05:41 +0200 Subject: [PATCH 20/25] Add fixture-backed Stage 4 artifacts --- .github/workflows/pr.yaml | 1 + tests/integration/support/tiny_stage_4.py | 280 ++++++++++++++++++ .../test_tiny_stage_4_artifacts.py | 129 ++++++++ 3 files changed, 410 insertions(+) create mode 100644 tests/integration/support/tiny_stage_4.py create mode 100644 tests/integration/test_tiny_stage_4_artifacts.py diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 04cfcc64e..746296043 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -146,6 +146,7 @@ jobs: tests/integration/test_tiny_stage_1_artifacts.py tests/integration/test_tiny_stage_2_artifacts.py tests/integration/test_tiny_stage_3_artifacts.py + tests/integration/test_tiny_stage_4_artifacts.py tests/integration/local_h5/ tests/integration/test_modal_pipeline_seams.py tests/integration/test_modal_h5_pipeline_e2e.py diff --git a/tests/integration/support/tiny_stage_4.py b/tests/integration/support/tiny_stage_4.py new file mode 100644 index 000000000..e036a525a --- /dev/null +++ b/tests/integration/support/tiny_stage_4.py @@ -0,0 +1,280 @@ +"""Fixture-backed Stage 4 artifacts for tiny pipeline integration tests.""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Iterable + +import h5py +import numpy as np + +from tests.integration.support.pipeline_workspace import TinyPipelineWorkspace +from tests.integration.support.tiny_stage_2 import PERIOD_KEY +from tests.integration.support.tiny_stage_3 import ( + EXTENDED_CPS_REQUIRED_VARIABLES, + STAGE_3_GROUP_VARIABLES, + STAGE_3_PERSON_VARIABLES, + STAGE_3_PERIOD, +) + +__test__ = False + + +STAGE_4_PERIOD = STAGE_3_PERIOD + +ENHANCED_PERSON_VARIABLES = tuple( + dict.fromkeys( + ( + *STAGE_3_PERSON_VARIABLES, + "tip_income", + "ssn_card_type", + "immigration_status_str", + "taxpayer_id_type", + "has_tin", + 
"has_itin", + "has_valid_ssn", + ) + ) +) + +ENHANCED_GROUP_VARIABLES = tuple( + dict.fromkeys((*STAGE_3_GROUP_VARIABLES, "takes_up_aca_if_eligible")) +) + +ENHANCED_CPS_REQUIRED_VARIABLES = tuple( + dict.fromkeys((*ENHANCED_PERSON_VARIABLES, *ENHANCED_GROUP_VARIABLES)) +) +STRATIFIED_CPS_REQUIRED_VARIABLES = EXTENDED_CPS_REQUIRED_VARIABLES + + +@dataclass(frozen=True) +class Stage4Artifacts: + """Paths written by the fixture-backed Stage 4 builder.""" + + enhanced_cps_path: Path + stratified_extended_cps_path: Path + + def as_tuple(self) -> tuple[Path, Path]: + return (self.enhanced_cps_path, self.stratified_extended_cps_path) + + +def create_stage_4_artifacts(workspace: TinyPipelineWorkspace) -> Stage4Artifacts: + """Write deterministic enhanced and stratified CPS artifacts.""" + + extended_cps_path = workspace.stage_3 / "extended_cps_2024.h5" + _require_paths((extended_cps_path,)) + + artifacts = Stage4Artifacts( + enhanced_cps_path=workspace.artifact_path("stage_4", "enhanced_cps_2024.h5"), + stratified_extended_cps_path=workspace.artifact_path( + "stage_4", + "stratified_extended_cps_2024.h5", + ), + ) + + write_tiny_enhanced_cps( + artifacts.enhanced_cps_path, + extended_cps_path=extended_cps_path, + ) + write_tiny_stratified_extended_cps( + artifacts.stratified_extended_cps_path, + extended_cps_path=extended_cps_path, + ) + + return artifacts + + +def write_tiny_enhanced_cps(path: Path, *, extended_cps_path: Path) -> None: + """Create a tiny enhanced CPS artifact without running calibration.""" + + arrays = _load_period_arrays(extended_cps_path) + arrays["household_weight"] = _calibrated_household_weights(arrays) + arrays.update(_enhanced_person_arrays(arrays)) + arrays.update(_enhanced_group_arrays(arrays)) + + _assert_enhanced_lengths(arrays) + _write_period_h5( + path, + arrays, + attrs={ + "fixture_scale": True, + "source": "tests.integration.support.tiny_stage_4", + "source_stage_3_extended_cps": extended_cps_path.name, + "time_period": STAGE_4_PERIOD, + "stage_4_artifact": "enhanced_cps", + }, + ) + + +def write_tiny_stratified_extended_cps( + path: Path, + *, + extended_cps_path: Path, +) -> None: + """Create a tiny stratified extended CPS subset from Stage 3 output.""" + + arrays = _load_period_arrays(extended_cps_path) + selected_household_ids = _select_representative_household_ids(arrays) + stratified = _subset_by_household_ids(arrays, selected_household_ids) + + _assert_stratified_lengths(stratified) + _write_period_h5( + path, + stratified, + attrs={ + "fixture_scale": True, + "source": "tests.integration.support.tiny_stage_4", + "source_stage_3_extended_cps": extended_cps_path.name, + "time_period": STAGE_4_PERIOD, + "stage_4_artifact": "stratified_extended_cps", + "selected_household_ids": selected_household_ids, + }, + ) + + +def _require_paths(paths: Iterable[Path]) -> None: + missing = [path for path in paths if not path.exists()] + if missing: + missing_list = ", ".join(str(path) for path in missing) + raise FileNotFoundError(f"Missing Stage 3 artifact(s): {missing_list}") + + +def _load_period_arrays(path: Path) -> dict[str, np.ndarray]: + with h5py.File(path, mode="r") as h5: + return {variable: h5[variable][PERIOD_KEY][:] for variable in h5.keys()} + + +def _calibrated_household_weights(arrays: dict[str, np.ndarray]) -> np.ndarray: + weights = arrays["household_weight"].astype(np.float32) + income = arrays["spm_unit_total_income_reported"].astype(np.float32) + income_rank = np.argsort(np.argsort(income)).astype(np.float32) + center = income_rank.mean() + scale = 
1.0 + (income_rank - center) * 0.04 + return np.round(weights * scale, 2).astype(np.float32) + + +def _enhanced_person_arrays(arrays: dict[str, np.ndarray]) -> dict[str, np.ndarray]: + person_count = len(arrays["person_id"]) + ssn_card_type = _resize_pattern( + [ + b"CITIZEN", + b"CITIZEN", + b"NONE", + b"NON_CITIZEN_VALID_EAD", + b"OTHER_NON_CITIZEN", + b"CITIZEN", + ], + person_count, + dtype="S32", + ) + has_valid_ssn = ssn_card_type == b"CITIZEN" + has_tin = has_valid_ssn | (ssn_card_type == b"OTHER_NON_CITIZEN") + + return { + "tip_income": np.where( + arrays["treasury_tipped_occupation_code"].astype(np.int16) > 0, + arrays["employment_income"].astype(np.float32) * 0.08, + 0, + ).astype(np.float32), + "ssn_card_type": ssn_card_type, + "immigration_status_str": np.where( + ssn_card_type == b"NONE", + b"UNDOCUMENTED", + b"CITIZEN", + ).astype("S32"), + "taxpayer_id_type": np.where( + has_valid_ssn, + b"VALID_SSN", + np.where(has_tin, b"OTHER_TIN", b"NONE"), + ).astype("S16"), + "has_tin": has_tin.astype(np.bool_), + "has_itin": has_tin.astype(np.bool_), + "has_valid_ssn": has_valid_ssn.astype(np.bool_), + } + + +def _enhanced_group_arrays(arrays: dict[str, np.ndarray]) -> dict[str, np.ndarray]: + group_count = len(arrays["household_id"]) + return { + "takes_up_aca_if_eligible": _resize_pattern( + [True, False, True, True, False], + group_count, + dtype=np.bool_, + ) + } + + +def _select_representative_household_ids( + arrays: dict[str, np.ndarray], +) -> np.ndarray: + household_ids = arrays["household_id"].astype(np.int64) + income = arrays["spm_unit_total_income_reported"].astype(np.float32) + ordered = household_ids[np.argsort(income)] + candidates = [ordered[0], ordered[len(ordered) // 2], ordered[-1]] + + is_puf = arrays["household_is_puf_clone"].astype(bool) + if not np.isin(household_ids[is_puf], candidates).any(): + candidates.append(household_ids[is_puf][0]) + if not np.isin(household_ids[~is_puf], candidates).any(): + candidates.append(household_ids[~is_puf][0]) + + selected = np.array(list(dict.fromkeys(int(value) for value in candidates))) + return selected.astype(np.int64) + + +def _subset_by_household_ids( + arrays: dict[str, np.ndarray], + household_ids: np.ndarray, +) -> dict[str, np.ndarray]: + source_household_ids = arrays["household_id"] + household_mask = np.isin(source_household_ids, household_ids) + person_mask = np.isin(arrays["person_household_id"], household_ids) + person_count = len(arrays["person_id"]) + household_count = len(source_household_ids) + + subset = {} + for variable, values in arrays.items(): + if len(values) == person_count: + subset[variable] = values[person_mask] + elif len(values) == household_count: + subset[variable] = values[household_mask] + else: + raise ValueError(f"Cannot infer entity level for {variable}") + return subset + + +def _resize_pattern(values: list[object], length: int, *, dtype) -> np.ndarray: + repeats = int(np.ceil(length / len(values))) + return np.resize(np.array(values * repeats, dtype=dtype), length) + + +def _write_period_h5( + path: Path, + arrays: dict[str, np.ndarray], + *, + attrs: dict[str, object], +) -> None: + with h5py.File(path, mode="w") as h5: + for key, value in attrs.items(): + h5.attrs[key] = value + for variable in sorted(arrays): + h5.create_group(variable).create_dataset(PERIOD_KEY, data=arrays[variable]) + + +def _assert_enhanced_lengths(arrays: dict[str, np.ndarray]) -> None: + person_count = len(arrays["person_id"]) + household_count = len(arrays["household_id"]) + for variable in 
ENHANCED_PERSON_VARIABLES: + assert len(arrays[variable]) == person_count, variable + for variable in ENHANCED_GROUP_VARIABLES: + assert len(arrays[variable]) == household_count, variable + + +def _assert_stratified_lengths(arrays: dict[str, np.ndarray]) -> None: + person_count = len(arrays["person_id"]) + household_count = len(arrays["household_id"]) + for variable in STAGE_3_PERSON_VARIABLES: + assert len(arrays[variable]) == person_count, variable + for variable in STAGE_3_GROUP_VARIABLES: + assert len(arrays[variable]) == household_count, variable diff --git a/tests/integration/test_tiny_stage_4_artifacts.py b/tests/integration/test_tiny_stage_4_artifacts.py new file mode 100644 index 000000000..c290c8722 --- /dev/null +++ b/tests/integration/test_tiny_stage_4_artifacts.py @@ -0,0 +1,129 @@ +import h5py +import numpy as np +import pytest + +from tests.integration.support.pipeline_workspace import TinyPipelineWorkspace +from tests.integration.support.tiny_stage_1 import create_stage_1_artifacts +from tests.integration.support.tiny_stage_2 import PERIOD_KEY, create_stage_2_artifacts +from tests.integration.support.tiny_stage_3 import create_stage_3_artifacts +from tests.integration.support.tiny_stage_4 import ( + ENHANCED_CPS_REQUIRED_VARIABLES, + STAGE_4_PERIOD, + STRATIFIED_CPS_REQUIRED_VARIABLES, + create_stage_4_artifacts, +) + + +def _load_period_arrays(path): + with h5py.File(path, mode="r") as h5: + return {name: h5[name][PERIOD_KEY][:] for name in h5.keys()} + + +def _create_stage_4_workspace(root): + workspace = TinyPipelineWorkspace.create(root / "tiny-pipeline") + create_stage_1_artifacts(workspace) + create_stage_2_artifacts(workspace) + create_stage_3_artifacts(workspace) + return workspace + + +def _assert_period_grouped_contract(path, required_variables, artifact_name): + with h5py.File(path, mode="r") as h5: + assert bool(h5.attrs["fixture_scale"]) is True + assert h5.attrs["time_period"] == STAGE_4_PERIOD + assert h5.attrs["stage_4_artifact"] == artifact_name + assert set(required_variables).issubset(h5.keys()) + for variable in required_variables: + assert PERIOD_KEY in h5[variable] + + +def test_create_stage_4_artifacts_requires_stage_3_output(tmp_path): + workspace = TinyPipelineWorkspace.create(tmp_path / "tiny-pipeline") + + with pytest.raises(FileNotFoundError, match="Missing Stage 3 artifact"): + create_stage_4_artifacts(workspace) + + +def test_create_stage_4_artifacts_writes_declared_workspace_outputs(tmp_path): + workspace = _create_stage_4_workspace(tmp_path) + + artifacts = create_stage_4_artifacts(workspace) + + assert artifacts.as_tuple() == workspace.expected_artifacts("stage_4") + assert all(path.exists() for path in artifacts.as_tuple()) + + +def test_tiny_enhanced_cps_has_required_schema_and_weights(tmp_path): + workspace = _create_stage_4_workspace(tmp_path) + artifacts = create_stage_4_artifacts(workspace) + + _assert_period_grouped_contract( + artifacts.enhanced_cps_path, + ENHANCED_CPS_REQUIRED_VARIABLES, + "enhanced_cps", + ) + enhanced = _load_period_arrays(artifacts.enhanced_cps_path) + extended = _load_period_arrays(workspace.stage_3 / "extended_cps_2024.h5") + + assert len(enhanced["household_weight"]) == len(extended["household_weight"]) + assert (enhanced["household_weight"] > 0).all() + assert not np.array_equal( + enhanced["household_weight"], + extended["household_weight"], + ) + + +def test_tiny_enhanced_cps_carries_identification_and_tip_contract(tmp_path): + workspace = _create_stage_4_workspace(tmp_path) + artifacts = 
create_stage_4_artifacts(workspace) + + arrays = _load_period_arrays(artifacts.enhanced_cps_path) + ssn_card_type = arrays["ssn_card_type"].astype(str) + taxpayer_id_type = arrays["taxpayer_id_type"].astype(str) + + assert arrays["tip_income"].shape == arrays["person_id"].shape + assert arrays["tip_income"].sum() > 0 + np.testing.assert_array_equal(arrays["has_itin"], arrays["has_tin"]) + np.testing.assert_array_equal( + arrays["has_valid_ssn"], + taxpayer_id_type == "VALID_SSN", + ) + np.testing.assert_array_equal( + arrays["has_tin"], + taxpayer_id_type != "NONE", + ) + np.testing.assert_array_equal( + arrays["has_valid_ssn"][ssn_card_type == "NONE"], False + ) + + +def test_tiny_stratified_cps_has_required_schema_and_representative_rows(tmp_path): + workspace = _create_stage_4_workspace(tmp_path) + artifacts = create_stage_4_artifacts(workspace) + + _assert_period_grouped_contract( + artifacts.stratified_extended_cps_path, + STRATIFIED_CPS_REQUIRED_VARIABLES, + "stratified_extended_cps", + ) + arrays = _load_period_arrays(artifacts.stratified_extended_cps_path) + + assert len(arrays["household_id"]) == 3 + assert len(arrays["person_id"]) == 4 + assert set(arrays["person_household_id"]).issubset(set(arrays["household_id"])) + assert arrays["household_is_puf_clone"].any() + assert (~arrays["household_is_puf_clone"]).any() + assert arrays["is_puf_clone"].any() + assert (~arrays["is_puf_clone"]).any() + + +def test_tiny_stratified_cps_preserves_low_middle_and_high_income_rows(tmp_path): + workspace = _create_stage_4_workspace(tmp_path) + artifacts = create_stage_4_artifacts(workspace) + + arrays = _load_period_arrays(artifacts.stratified_extended_cps_path) + income = arrays["spm_unit_total_income_reported"] + + assert income.min() == 0 + assert income.max() >= 100_000 + assert len(np.unique(income)) == len(income) From eccdaf22e52dfc282725a4564a7334ba774ecb53 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Wed, 29 Apr 2026 20:15:18 +0200 Subject: [PATCH 21/25] Add fixture-backed Stage 5 artifacts --- .github/workflows/pr.yaml | 1 + tests/integration/support/tiny_stage_5.py | 328 ++++++++++++++++++ .../test_tiny_stage_5_artifacts.py | 129 +++++++ 3 files changed, 458 insertions(+) create mode 100644 tests/integration/support/tiny_stage_5.py create mode 100644 tests/integration/test_tiny_stage_5_artifacts.py diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 746296043..5a8698d98 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -147,6 +147,7 @@ jobs: tests/integration/test_tiny_stage_2_artifacts.py tests/integration/test_tiny_stage_3_artifacts.py tests/integration/test_tiny_stage_4_artifacts.py + tests/integration/test_tiny_stage_5_artifacts.py tests/integration/local_h5/ tests/integration/test_modal_pipeline_seams.py tests/integration/test_modal_h5_pipeline_e2e.py diff --git a/tests/integration/support/tiny_stage_5.py b/tests/integration/support/tiny_stage_5.py new file mode 100644 index 000000000..824a16c96 --- /dev/null +++ b/tests/integration/support/tiny_stage_5.py @@ -0,0 +1,328 @@ +"""Fixture-backed Stage 5 artifacts for tiny pipeline integration tests.""" + +from __future__ import annotations + +import shutil +from dataclasses import dataclass +from pathlib import Path +from typing import Iterable + +import h5py +import numpy as np + +from tests.integration.support.pipeline_workspace import TinyPipelineWorkspace +from tests.integration.support.tiny_stage_2 import PERIOD_KEY +from tests.integration.support.tiny_stage_4 import ( + 
ENHANCED_CPS_REQUIRED_VARIABLES, + STAGE_4_PERIOD, + STRATIFIED_CPS_REQUIRED_VARIABLES, +) + +__test__ = False + + +STAGE_5_PERIOD = STAGE_4_PERIOD + +SOURCE_IMPUTED_PERSON_VARIABLES = ( + "tip_income", + "hourly_wage", + "is_paid_hourly", + "is_union_member_or_covered", +) +SOURCE_IMPUTED_HOUSEHOLD_VARIABLES = ( + "pre_subsidy_rent", + "bank_account_assets", + "stock_assets", + "bond_assets", + "household_vehicles_value", + "net_worth", + "auto_loan_balance", + "auto_loan_interest", +) +SOURCE_IMPUTED_REQUIRED_VARIABLES = tuple( + dict.fromkeys( + ( + *STRATIFIED_CPS_REQUIRED_VARIABLES, + *SOURCE_IMPUTED_PERSON_VARIABLES, + *SOURCE_IMPUTED_HOUSEHOLD_VARIABLES, + ) + ) +) +SMALL_ENHANCED_REQUIRED_VARIABLES = ENHANCED_CPS_REQUIRED_VARIABLES +SPARSE_ENHANCED_REQUIRED_VARIABLES = ENHANCED_CPS_REQUIRED_VARIABLES + + +@dataclass(frozen=True) +class Stage5Artifacts: + """Paths written by the fixture-backed Stage 5 builder.""" + + source_imputed_path: Path + source_imputed_alias_path: Path + small_enhanced_cps_path: Path + sparse_enhanced_cps_path: Path + + def as_tuple(self) -> tuple[Path, Path, Path, Path]: + return ( + self.source_imputed_path, + self.source_imputed_alias_path, + self.small_enhanced_cps_path, + self.sparse_enhanced_cps_path, + ) + + +def create_stage_5_artifacts(workspace: TinyPipelineWorkspace) -> Stage5Artifacts: + """Write deterministic Stage 5 artifacts from Stage 4 outputs.""" + + stage_4_paths = _stage_4_paths(workspace) + _require_paths(stage_4_paths.values()) + + artifacts = Stage5Artifacts( + source_imputed_path=workspace.artifact_path( + "stage_5", + "source_imputed_stratified_extended_cps_2024.h5", + ), + source_imputed_alias_path=workspace.artifact_path( + "stage_5", + "source_imputed_stratified_extended_cps.h5", + ), + small_enhanced_cps_path=workspace.artifact_path( + "stage_5", + "small_enhanced_cps_2024.h5", + ), + sparse_enhanced_cps_path=workspace.artifact_path( + "stage_5", + "sparse_enhanced_cps_2024.h5", + ), + ) + + write_tiny_source_imputed_stratified_cps( + artifacts.source_imputed_path, + stratified_extended_cps_path=stage_4_paths["stratified"], + ) + shutil.copy2(artifacts.source_imputed_path, artifacts.source_imputed_alias_path) + write_tiny_small_enhanced_cps( + artifacts.small_enhanced_cps_path, + enhanced_cps_path=stage_4_paths["enhanced"], + ) + write_tiny_sparse_enhanced_cps( + artifacts.sparse_enhanced_cps_path, + enhanced_cps_path=stage_4_paths["enhanced"], + ) + + return artifacts + + +def write_tiny_source_imputed_stratified_cps( + path: Path, + *, + stratified_extended_cps_path: Path, +) -> None: + """Create a tiny source-imputed stratified CPS artifact.""" + + arrays = _load_period_arrays(stratified_extended_cps_path) + arrays.update(_source_imputed_person_arrays(arrays)) + arrays.update(_source_imputed_household_arrays(arrays)) + + _assert_source_imputed_lengths(arrays) + _write_period_h5( + path, + arrays, + attrs={ + "fixture_scale": True, + "source": "tests.integration.support.tiny_stage_5", + "source_stage_4_stratified": stratified_extended_cps_path.name, + "time_period": STAGE_5_PERIOD, + "stage_5_artifact": "source_imputed_stratified_extended_cps", + }, + ) + + +def write_tiny_small_enhanced_cps(path: Path, *, enhanced_cps_path: Path) -> None: + """Create a deterministic tiny subsample of the enhanced CPS artifact.""" + + arrays = _load_period_arrays(enhanced_cps_path) + selected_household_ids = arrays["household_id"].astype(np.int64)[:2] + subset = _subset_by_household_ids(arrays, selected_household_ids) + + 
_assert_enhanced_lengths( + subset, required_variables=SMALL_ENHANCED_REQUIRED_VARIABLES + ) + _write_period_h5( + path, + subset, + attrs={ + "fixture_scale": True, + "source": "tests.integration.support.tiny_stage_5", + "source_stage_4_enhanced": enhanced_cps_path.name, + "time_period": STAGE_5_PERIOD, + "stage_5_artifact": "small_enhanced_cps", + "selected_household_ids": selected_household_ids, + }, + ) + + +def write_tiny_sparse_enhanced_cps(path: Path, *, enhanced_cps_path: Path) -> None: + """Create a sparse enhanced CPS artifact by retaining non-zero-weight rows.""" + + arrays = _load_period_arrays(enhanced_cps_path) + household_ids = arrays["household_id"].astype(np.int64) + household_weight = arrays["household_weight"].astype(np.float32) + selected_household_ids = household_ids[ + household_weight >= np.median(household_weight) + ] + if len(selected_household_ids) == len(household_ids): + selected_household_ids = household_ids[np.argsort(household_weight)[-3:]] + + subset = _subset_by_household_ids(arrays, selected_household_ids) + positive_mask = subset["household_weight"] > 0 + subset = _subset_by_household_ids(subset, subset["household_id"][positive_mask]) + + _assert_enhanced_lengths( + subset, + required_variables=SPARSE_ENHANCED_REQUIRED_VARIABLES, + ) + _write_period_h5( + path, + subset, + attrs={ + "fixture_scale": True, + "source": "tests.integration.support.tiny_stage_5", + "source_stage_4_enhanced": enhanced_cps_path.name, + "time_period": STAGE_5_PERIOD, + "stage_5_artifact": "sparse_enhanced_cps", + "selected_household_ids": selected_household_ids, + }, + ) + + +def _stage_4_paths(workspace: TinyPipelineWorkspace) -> dict[str, Path]: + return { + "enhanced": workspace.stage_4 / "enhanced_cps_2024.h5", + "stratified": workspace.stage_4 / "stratified_extended_cps_2024.h5", + } + + +def _require_paths(paths: Iterable[Path]) -> None: + missing = [path for path in paths if not path.exists()] + if missing: + missing_list = ", ".join(str(path) for path in missing) + raise FileNotFoundError(f"Missing Stage 4 artifact(s): {missing_list}") + + +def _load_period_arrays(path: Path) -> dict[str, np.ndarray]: + with h5py.File(path, mode="r") as h5: + return {variable: h5[variable][PERIOD_KEY][:] for variable in h5.keys()} + + +def _source_imputed_person_arrays( + arrays: dict[str, np.ndarray], +) -> dict[str, np.ndarray]: + employment_income = arrays["employment_income"].astype(np.float32) + hours = arrays["weekly_hours_worked"].astype(np.float32) + annual_hours = np.maximum(hours * 52, 1) + hourly_wage = np.where(hours > 0, employment_income / annual_hours, 0).astype( + np.float32 + ) + + return { + "tip_income": np.where( + arrays["treasury_tipped_occupation_code"].astype(np.int16) > 0, + np.round(employment_income * 0.08, 2), + 0, + ).astype(np.float32), + "hourly_wage": np.round(hourly_wage, 2).astype(np.float32), + "is_paid_hourly": (hours > 0) & (hourly_wage < 60), + "is_union_member_or_covered": _resize_pattern( + [False, True, False, False], + len(arrays["person_id"]), + dtype=np.bool_, + ), + } + + +def _source_imputed_household_arrays( + arrays: dict[str, np.ndarray], +) -> dict[str, np.ndarray]: + income = arrays["spm_unit_total_income_reported"].astype(np.float32) + vehicles_owned = arrays["household_vehicles_owned"].astype(np.float32) + bank_account_assets = np.round(np.maximum(income * 0.06, 250), 2).astype(np.float32) + stock_assets = np.round(np.where(income > 80_000, income * 0.35, income * 0.05), 2) + bond_assets = np.round(np.where(income > 50_000, income * 
0.03, 0), 2) + vehicle_value = np.round(vehicles_owned * 8_000, 2) + auto_loan_balance = np.round(vehicles_owned * 2_500, 2) + + return { + "pre_subsidy_rent": arrays["rent"].astype(np.float32).copy(), + "bank_account_assets": bank_account_assets.astype(np.float32), + "stock_assets": stock_assets.astype(np.float32), + "bond_assets": bond_assets.astype(np.float32), + "household_vehicles_value": vehicle_value.astype(np.float32), + "net_worth": ( + bank_account_assets + + stock_assets + + bond_assets + + vehicle_value + - auto_loan_balance + ).astype(np.float32), + "auto_loan_balance": auto_loan_balance.astype(np.float32), + "auto_loan_interest": np.round(auto_loan_balance * 0.07, 2).astype(np.float32), + } + + +def _subset_by_household_ids( + arrays: dict[str, np.ndarray], + household_ids: np.ndarray, +) -> dict[str, np.ndarray]: + source_household_ids = arrays["household_id"] + household_mask = np.isin(source_household_ids, household_ids) + person_mask = np.isin(arrays["person_household_id"], household_ids) + person_count = len(arrays["person_id"]) + household_count = len(source_household_ids) + + subset = {} + for variable, values in arrays.items(): + if len(values) == person_count: + subset[variable] = values[person_mask] + elif len(values) == household_count: + subset[variable] = values[household_mask] + else: + raise ValueError(f"Cannot infer entity level for {variable}") + return subset + + +def _resize_pattern(values: list[object], length: int, *, dtype) -> np.ndarray: + repeats = int(np.ceil(length / len(values))) + return np.resize(np.array(values * repeats, dtype=dtype), length) + + +def _write_period_h5( + path: Path, + arrays: dict[str, np.ndarray], + *, + attrs: dict[str, object], +) -> None: + with h5py.File(path, mode="w") as h5: + for key, value in attrs.items(): + h5.attrs[key] = value + for variable in sorted(arrays): + h5.create_group(variable).create_dataset(PERIOD_KEY, data=arrays[variable]) + + +def _assert_source_imputed_lengths(arrays: dict[str, np.ndarray]) -> None: + person_count = len(arrays["person_id"]) + household_count = len(arrays["household_id"]) + for variable in SOURCE_IMPUTED_REQUIRED_VARIABLES: + length = len(arrays[variable]) + assert length in {person_count, household_count}, variable + + +def _assert_enhanced_lengths( + arrays: dict[str, np.ndarray], + *, + required_variables: tuple[str, ...], +) -> None: + person_count = len(arrays["person_id"]) + household_count = len(arrays["household_id"]) + for variable in required_variables: + length = len(arrays[variable]) + assert length in {person_count, household_count}, variable diff --git a/tests/integration/test_tiny_stage_5_artifacts.py b/tests/integration/test_tiny_stage_5_artifacts.py new file mode 100644 index 000000000..e7ad96f7d --- /dev/null +++ b/tests/integration/test_tiny_stage_5_artifacts.py @@ -0,0 +1,129 @@ +import h5py +import numpy as np +import pytest + +from tests.integration.support.pipeline_workspace import TinyPipelineWorkspace +from tests.integration.support.tiny_stage_1 import create_stage_1_artifacts +from tests.integration.support.tiny_stage_2 import PERIOD_KEY, create_stage_2_artifacts +from tests.integration.support.tiny_stage_3 import create_stage_3_artifacts +from tests.integration.support.tiny_stage_4 import create_stage_4_artifacts +from tests.integration.support.tiny_stage_5 import ( + SMALL_ENHANCED_REQUIRED_VARIABLES, + SOURCE_IMPUTED_REQUIRED_VARIABLES, + SPARSE_ENHANCED_REQUIRED_VARIABLES, + STAGE_5_PERIOD, + create_stage_5_artifacts, +) + + +def 
_load_period_arrays(path): + with h5py.File(path, mode="r") as h5: + return {name: h5[name][PERIOD_KEY][:] for name in h5.keys()} + + +def _create_stage_5_workspace(root): + workspace = TinyPipelineWorkspace.create(root / "tiny-pipeline") + create_stage_1_artifacts(workspace) + create_stage_2_artifacts(workspace) + create_stage_3_artifacts(workspace) + create_stage_4_artifacts(workspace) + return workspace + + +def _assert_period_grouped_contract(path, required_variables, artifact_name): + with h5py.File(path, mode="r") as h5: + assert bool(h5.attrs["fixture_scale"]) is True + assert h5.attrs["time_period"] == STAGE_5_PERIOD + assert h5.attrs["stage_5_artifact"] == artifact_name + assert set(required_variables).issubset(h5.keys()) + for variable in required_variables: + assert PERIOD_KEY in h5[variable] + + +def test_create_stage_5_artifacts_requires_stage_4_outputs(tmp_path): + workspace = TinyPipelineWorkspace.create(tmp_path / "tiny-pipeline") + + with pytest.raises(FileNotFoundError, match="Missing Stage 4 artifact"): + create_stage_5_artifacts(workspace) + + +def test_create_stage_5_artifacts_writes_declared_workspace_outputs(tmp_path): + workspace = _create_stage_5_workspace(tmp_path) + + artifacts = create_stage_5_artifacts(workspace) + + assert artifacts.as_tuple() == workspace.expected_artifacts("stage_5") + assert all(path.exists() for path in artifacts.as_tuple()) + + +def test_source_imputed_stratified_cps_has_alias_and_source_contract(tmp_path): + workspace = _create_stage_5_workspace(tmp_path) + artifacts = create_stage_5_artifacts(workspace) + + _assert_period_grouped_contract( + artifacts.source_imputed_path, + SOURCE_IMPUTED_REQUIRED_VARIABLES, + "source_imputed_stratified_extended_cps", + ) + assert ( + artifacts.source_imputed_alias_path.read_bytes() + == artifacts.source_imputed_path.read_bytes() + ) + + +def test_source_imputed_stratified_cps_adds_expected_imputations(tmp_path): + workspace = _create_stage_5_workspace(tmp_path) + artifacts = create_stage_5_artifacts(workspace) + + arrays = _load_period_arrays(artifacts.source_imputed_path) + + assert arrays["tip_income"].shape == arrays["person_id"].shape + assert arrays["hourly_wage"].shape == arrays["person_id"].shape + assert arrays["is_paid_hourly"].dtype == np.bool_ + assert arrays["is_union_member_or_covered"].dtype == np.bool_ + assert arrays["tip_income"].sum() > 0 + assert arrays["bank_account_assets"].shape == arrays["household_id"].shape + assert arrays["net_worth"].shape == arrays["household_id"].shape + assert (arrays["bank_account_assets"] >= 0).all() + assert (arrays["net_worth"] >= 0).all() + np.testing.assert_allclose(arrays["pre_subsidy_rent"], arrays["rent"]) + + +def test_small_enhanced_cps_is_subset_with_enhanced_contract(tmp_path): + workspace = _create_stage_5_workspace(tmp_path) + artifacts = create_stage_5_artifacts(workspace) + + _assert_period_grouped_contract( + artifacts.small_enhanced_cps_path, + SMALL_ENHANCED_REQUIRED_VARIABLES, + "small_enhanced_cps", + ) + small = _load_period_arrays(artifacts.small_enhanced_cps_path) + enhanced = _load_period_arrays(workspace.stage_4 / "enhanced_cps_2024.h5") + + assert len(small["household_id"]) == 2 + assert len(small["person_id"]) < len(enhanced["person_id"]) + assert set(small["person_household_id"]).issubset(set(small["household_id"])) + assert (small["household_weight"] > 0).all() + assert set(small["taxpayer_id_type"].astype(str)).issubset( + {"VALID_SSN", "OTHER_TIN", "NONE"} + ) + + +def 
test_sparse_enhanced_cps_keeps_only_positive_weight_subset(tmp_path): + workspace = _create_stage_5_workspace(tmp_path) + artifacts = create_stage_5_artifacts(workspace) + + _assert_period_grouped_contract( + artifacts.sparse_enhanced_cps_path, + SPARSE_ENHANCED_REQUIRED_VARIABLES, + "sparse_enhanced_cps", + ) + sparse = _load_period_arrays(artifacts.sparse_enhanced_cps_path) + enhanced = _load_period_arrays(workspace.stage_4 / "enhanced_cps_2024.h5") + + assert 0 < len(sparse["household_id"]) < len(enhanced["household_id"]) + assert len(sparse["person_id"]) < len(enhanced["person_id"]) + assert (sparse["household_weight"] > 0).all() + assert set(sparse["person_household_id"]).issubset(set(sparse["household_id"])) + assert set(sparse["household_id"]).issubset(set(enhanced["household_id"])) From 2d917ade13e19c0b8243418ecf548f8c21fb6417 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Wed, 29 Apr 2026 20:21:49 +0200 Subject: [PATCH 22/25] Add local fixture-scale pipeline E2E --- .github/workflows/pr.yaml | 1 + tests/integration/support/tiny_pipeline.py | 129 ++++++++++++++++ tests/integration/test_tiny_pipeline_e2e.py | 159 ++++++++++++++++++++ 3 files changed, 289 insertions(+) create mode 100644 tests/integration/support/tiny_pipeline.py create mode 100644 tests/integration/test_tiny_pipeline_e2e.py diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 5a8698d98..72b915803 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -148,6 +148,7 @@ jobs: tests/integration/test_tiny_stage_3_artifacts.py tests/integration/test_tiny_stage_4_artifacts.py tests/integration/test_tiny_stage_5_artifacts.py + tests/integration/test_tiny_pipeline_e2e.py tests/integration/local_h5/ tests/integration/test_modal_pipeline_seams.py tests/integration/test_modal_h5_pipeline_e2e.py diff --git a/tests/integration/support/tiny_pipeline.py b/tests/integration/support/tiny_pipeline.py new file mode 100644 index 000000000..a9c480efe --- /dev/null +++ b/tests/integration/support/tiny_pipeline.py @@ -0,0 +1,129 @@ +"""Fixture-scale pipeline composition helpers for integration tests.""" + +from __future__ import annotations + +import hashlib +from dataclasses import dataclass +from pathlib import Path + +import h5py +import numpy as np + +from tests.integration.support.pipeline_workspace import TinyPipelineWorkspace +from tests.integration.support.tiny_stage_1 import ( + Stage1Artifacts, + create_stage_1_artifacts, +) +from tests.integration.support.tiny_stage_2 import ( + Stage2Artifacts, + create_stage_2_artifacts, +) +from tests.integration.support.tiny_stage_3 import ( + Stage3Artifacts, + create_stage_3_artifacts, +) +from tests.integration.support.tiny_stage_4 import ( + Stage4Artifacts, + create_stage_4_artifacts, +) +from tests.integration.support.tiny_stage_5 import ( + Stage5Artifacts, + create_stage_5_artifacts, +) + +__test__ = False + + +@dataclass(frozen=True) +class TinyPipelineArtifacts: + """Artifacts emitted by one fixture-scale Stage 1-5 pipeline run.""" + + stage_1: Stage1Artifacts + stage_2: Stage2Artifacts + stage_3: Stage3Artifacts + stage_4: Stage4Artifacts + stage_5: Stage5Artifacts + + def by_stage(self) -> dict[str, tuple[Path, ...]]: + return { + "stage_1": self.stage_1.as_tuple(), + "stage_2": self.stage_2.as_tuple(), + "stage_3": self.stage_3.as_tuple(), + "stage_4": self.stage_4.as_tuple(), + "stage_5": self.stage_5.as_tuple(), + } + + +def create_tiny_pipeline_artifacts( + workspace: TinyPipelineWorkspace, +) -> TinyPipelineArtifacts: + """Run the 
fixture-backed Stage 1-5 pipeline into one workspace.""" + + stage_1 = create_stage_1_artifacts(workspace) + stage_2 = create_stage_2_artifacts(workspace) + stage_3 = create_stage_3_artifacts(workspace) + stage_4 = create_stage_4_artifacts(workspace) + stage_5 = create_stage_5_artifacts(workspace) + return TinyPipelineArtifacts( + stage_1=stage_1, + stage_2=stage_2, + stage_3=stage_3, + stage_4=stage_4, + stage_5=stage_5, + ) + + +def artifact_content_digest(path: Path) -> str: + """Return a stable digest for a tiny pipeline artifact.""" + + digest = hashlib.sha256() + if h5py.is_hdf5(path): + with h5py.File(path, mode="r") as h5: + _digest_h5_object(digest, "/", h5) + return digest.hexdigest() + + digest.update(path.read_bytes()) + return digest.hexdigest() + + +def stage_content_digests( + artifacts: TinyPipelineArtifacts, + *, + stages: tuple[str, ...] = ("stage_3", "stage_4", "stage_5"), +) -> dict[str, dict[str, str]]: + """Return stable content digests by stage and artifact filename.""" + + by_stage = artifacts.by_stage() + return { + stage: {path.name: artifact_content_digest(path) for path in by_stage[stage]} + for stage in stages + } + + +def _digest_h5_object( + digest: hashlib._Hash, + name: str, + obj: h5py.Dataset | h5py.Group | h5py.File, +) -> None: + digest.update(name.encode("utf-8")) + digest.update(type(obj).__name__.encode("utf-8")) + for attr_name in sorted(obj.attrs): + digest.update(attr_name.encode("utf-8")) + digest.update(_normalise_h5_value(obj.attrs[attr_name])) + + if isinstance(obj, h5py.Dataset): + values = obj[()] + digest.update(str(values.dtype).encode("utf-8")) + digest.update(str(values.shape).encode("utf-8")) + digest.update(np.ascontiguousarray(values).tobytes()) + return + + for child_name in sorted(obj.keys()): + _digest_h5_object(digest, f"{name}/{child_name}", obj[child_name]) + + +def _normalise_h5_value(value: object) -> bytes: + array = np.asarray(value) + if array.shape == (): + return repr(array.item()).encode("utf-8") + return repr(array.tolist()).encode("utf-8") diff --git a/tests/integration/test_tiny_pipeline_e2e.py b/tests/integration/test_tiny_pipeline_e2e.py new file mode 100644 index 000000000..187698d64 --- /dev/null +++ b/tests/integration/test_tiny_pipeline_e2e.py @@ -0,0 +1,159 @@ +import socket +from pathlib import Path + +import h5py +import pytest + +from tests.integration.support.pipeline_workspace import TinyPipelineWorkspace +from tests.integration.support.tiny_pipeline import ( + create_tiny_pipeline_artifacts, + stage_content_digests, +) +from tests.integration.support.tiny_stage_2 import ( + CPS_REQUIRED_VARIABLES, + PERIOD_KEY, + PUF_REQUIRED_VARIABLES, +) +from tests.integration.support.tiny_stage_3 import EXTENDED_CPS_REQUIRED_VARIABLES +from tests.integration.support.tiny_stage_4 import ( + ENHANCED_CPS_REQUIRED_VARIABLES, + STRATIFIED_CPS_REQUIRED_VARIABLES, +) +from tests.integration.support.tiny_stage_5 import ( + SMALL_ENHANCED_REQUIRED_VARIABLES, + SOURCE_IMPUTED_REQUIRED_VARIABLES, + SPARSE_ENHANCED_REQUIRED_VARIABLES, +) + + +def _block_network(monkeypatch: pytest.MonkeyPatch) -> None: + def fail_network(*_args, **_kwargs): + raise AssertionError("fixture-scale pipeline must not open network sockets") + + monkeypatch.setattr(socket, "socket", fail_network) + + +def _assert_declared_artifacts_exist( + workspace: TinyPipelineWorkspace, + artifacts_by_stage: dict[str, tuple[Path, ...]], +) -> None: + for stage, paths in artifacts_by_stage.items(): + assert paths == workspace.expected_artifacts(stage) + assert 
all(path.exists() for path in paths) + + +def _assert_period_grouped_h5(path: Path, required_variables: tuple[str, ...]) -> None: + with h5py.File(path, mode="r") as h5: + assert bool(h5.attrs["fixture_scale"]) is True + assert set(required_variables).issubset(h5.keys()) + for variable in required_variables: + assert PERIOD_KEY in h5[variable] + + +def _assert_consumed_source( + path: Path, + expected_attrs: dict[str, str], +) -> None: + with h5py.File(path, mode="r") as h5: + for attr, value in expected_attrs.items(): + assert h5.attrs[attr] == value + + +def _build_pipeline(root: Path): + workspace = TinyPipelineWorkspace.create(root / "tiny-pipeline") + artifacts = create_tiny_pipeline_artifacts(workspace) + return workspace, artifacts + + +def test_fixture_scale_pipeline_builds_stage_1_through_5_without_network( + tmp_path, + monkeypatch, +): + _block_network(monkeypatch) + + workspace, artifacts = _build_pipeline(tmp_path) + + _assert_declared_artifacts_exist(workspace, artifacts.by_stage()) + + +def test_fixture_scale_pipeline_outputs_required_handoff_schemas(tmp_path): + _workspace, artifacts = _build_pipeline(tmp_path) + + _assert_period_grouped_h5(artifacts.stage_2.cps_path, CPS_REQUIRED_VARIABLES) + _assert_period_grouped_h5(artifacts.stage_2.puf_path, PUF_REQUIRED_VARIABLES) + _assert_period_grouped_h5( + artifacts.stage_3.extended_cps_path, + EXTENDED_CPS_REQUIRED_VARIABLES, + ) + _assert_period_grouped_h5( + artifacts.stage_4.enhanced_cps_path, + ENHANCED_CPS_REQUIRED_VARIABLES, + ) + _assert_period_grouped_h5( + artifacts.stage_4.stratified_extended_cps_path, + STRATIFIED_CPS_REQUIRED_VARIABLES, + ) + _assert_period_grouped_h5( + artifacts.stage_5.source_imputed_path, + SOURCE_IMPUTED_REQUIRED_VARIABLES, + ) + _assert_period_grouped_h5( + artifacts.stage_5.small_enhanced_cps_path, + SMALL_ENHANCED_REQUIRED_VARIABLES, + ) + _assert_period_grouped_h5( + artifacts.stage_5.sparse_enhanced_cps_path, + SPARSE_ENHANCED_REQUIRED_VARIABLES, + ) + + +def test_fixture_scale_pipeline_records_stage_handoffs(tmp_path): + _workspace, artifacts = _build_pipeline(tmp_path) + + _assert_consumed_source( + artifacts.stage_2.cps_path, + {"source_stage_1_acs": "acs_2022.h5"}, + ) + _assert_consumed_source( + artifacts.stage_2.puf_path, + {"source_stage_1_irs_puf": "irs_puf_2015.h5"}, + ) + _assert_consumed_source( + artifacts.stage_3.extended_cps_path, + { + "source_stage_2_cps": "cps_2024.h5", + "source_stage_2_puf": "puf_2024.h5", + }, + ) + for path in artifacts.stage_4.as_tuple(): + _assert_consumed_source( + path, {"source_stage_3_extended_cps": "extended_cps_2024.h5"} + ) + _assert_consumed_source( + artifacts.stage_5.source_imputed_path, + {"source_stage_4_stratified": "stratified_extended_cps_2024.h5"}, + ) + _assert_consumed_source( + artifacts.stage_5.small_enhanced_cps_path, + {"source_stage_4_enhanced": "enhanced_cps_2024.h5"}, + ) + _assert_consumed_source( + artifacts.stage_5.sparse_enhanced_cps_path, + {"source_stage_4_enhanced": "enhanced_cps_2024.h5"}, + ) + + +def test_fixture_scale_pipeline_stage_digests_are_stable(tmp_path): + _workspace_a, artifacts_a = _build_pipeline(tmp_path / "a") + _workspace_b, artifacts_b = _build_pipeline(tmp_path / "b") + + assert stage_content_digests(artifacts_a) == stage_content_digests(artifacts_b) + + +def test_fixture_scale_pipeline_source_imputed_alias_matches_versioned_output(tmp_path): + _workspace, artifacts = _build_pipeline(tmp_path) + + assert ( + artifacts.stage_5.source_imputed_alias_path.read_bytes() + == 
artifacts.stage_5.source_imputed_path.read_bytes() + ) From 4448407202206159b22e893c07788b9382469537 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Wed, 29 Apr 2026 20:25:11 +0200 Subject: [PATCH 23/25] Run target integration tests alongside unit checks --- .github/workflows/pr.yaml | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 72b915803..5af546ccc 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -100,16 +100,7 @@ jobs: target-integration-tests: runs-on: ubuntu-latest - needs: - [ - check-fork, - check-lock-freshness, - lint, - check-changelog, - unit-tests, - smoke-test, - docs-build, - ] + needs: [check-fork, lint] env: MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }} MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }} From b4b6963e96f9e748bef385b589f2a3399db271b8 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Wed, 29 Apr 2026 22:37:09 +0200 Subject: [PATCH 24/25] Add local H5 continuation integration test --- .github/workflows/pr.yaml | 1 + .../integration/support/pipeline_workspace.py | 2 +- tests/integration/support/tiny_h5.py | 290 ++++++++++++++++++ tests/integration/support/tiny_stage_3.py | 8 +- tests/integration/support/tiny_stage_5.py | 79 +++-- .../integration/test_tiny_pipeline_h5_e2e.py | 114 +++++++ .../test_tiny_pipeline_workspace.py | 2 +- .../test_tiny_stage_5_artifacts.py | 8 +- 8 files changed, 471 insertions(+), 33 deletions(-) create mode 100644 tests/integration/support/tiny_h5.py create mode 100644 tests/integration/test_tiny_pipeline_h5_e2e.py diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 5af546ccc..1a8c299f0 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -140,6 +140,7 @@ jobs: tests/integration/test_tiny_stage_4_artifacts.py tests/integration/test_tiny_stage_5_artifacts.py tests/integration/test_tiny_pipeline_e2e.py + tests/integration/test_tiny_pipeline_h5_e2e.py tests/integration/local_h5/ tests/integration/test_modal_pipeline_seams.py tests/integration/test_modal_h5_pipeline_e2e.py diff --git a/tests/integration/support/pipeline_workspace.py b/tests/integration/support/pipeline_workspace.py index 916e90eba..f8d56e31c 100644 --- a/tests/integration/support/pipeline_workspace.py +++ b/tests/integration/support/pipeline_workspace.py @@ -44,7 +44,7 @@ "unified_run_config.json", ), "h5_outputs": ( - "states/AL.h5", + "states/NC.h5", "districts/NC-01.h5", "national/US.h5", ), diff --git a/tests/integration/support/tiny_h5.py b/tests/integration/support/tiny_h5.py new file mode 100644 index 000000000..b48f1656d --- /dev/null +++ b/tests/integration/support/tiny_h5.py @@ -0,0 +1,290 @@ +"""Fixture-scale H5 continuation helpers for integration tests.""" + +from __future__ import annotations + +import json +import pickle +import sqlite3 +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path + +import numpy as np + +from policyengine_us_data.calibration.clone_and_assign import ( + GeographyAssignment, + save_geography, +) +from policyengine_us_data.calibration.local_h5.fingerprinting import ( + PublishingInputBundle, +) +from policyengine_us_data.calibration.local_h5.requests import ( + AreaBuildRequest, + AreaFilter, +) +from tests.integration.support.pipeline_workspace import TinyPipelineWorkspace +from tests.integration.support.tiny_pipeline import TinyPipelineArtifacts + +__test__ = False + + +DISTRICT_GEOID = "3701" +COUNTY_FIPS = "37183" +STATE_CODE = "NC" 
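+# The geography constants pin the tiny fixture to North Carolina's 1st
+# district: DISTRICT_GEOID "3701" is state FIPS 37 (North Carolina) plus
+# district 01, and COUNTY_FIPS "37183" is a county inside that same state.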
+STATE_FIPS = 37 +N_CLONES = 1 +SEED = 42 +VERSION = "0.0.0" + + +@dataclass(frozen=True) +class TinyH5Artifacts: + """Artifacts needed to continue a tiny Stage 5 build into H5 generation.""" + + dataset_path: Path + weights_path: Path + db_path: Path + run_config_path: Path + geography_path: Path + calibration_package_path: Path + geography: GeographyAssignment + n_records: int + n_clones: int + + +def create_tiny_h5_artifacts( + workspace: TinyPipelineWorkspace, + pipeline_artifacts: TinyPipelineArtifacts, + *, + n_clones: int = N_CLONES, +) -> TinyH5Artifacts: + """Seed calibration inputs using the shared tiny Stage 5 dataset.""" + + dataset_path = pipeline_artifacts.stage_5.source_imputed_alias_path + n_records = _household_count(dataset_path) + + weights_path = workspace.artifact_path("calibration", "calibration_weights.npy") + db_path = workspace.artifact_path("calibration", "policy_data.db") + run_config_path = workspace.artifact_path("calibration", "unified_run_config.json") + geography_path = workspace.artifact_path("calibration", "geography_assignment.npz") + calibration_package_path = workspace.artifact_path( + "calibration", + "calibration_package.pkl", + ) + + np.save(weights_path, np.ones(n_records * n_clones, dtype=np.float32)) + + geography = base_geography(n_records=n_records, n_clones=n_clones) + save_geography(geography, geography_path) + _write_calibration_package(calibration_package_path, geography=geography) + _write_policy_data_db(db_path) + run_config_path.write_text(json.dumps(_run_metadata())) + + return TinyH5Artifacts( + dataset_path=dataset_path, + weights_path=weights_path, + db_path=db_path, + run_config_path=run_config_path, + geography_path=geography_path, + calibration_package_path=calibration_package_path, + geography=geography, + n_records=n_records, + n_clones=n_clones, + ) + + +def base_geography(*, n_records: int, n_clones: int = N_CLONES) -> GeographyAssignment: + """Create one deterministic NC-01 geography assignment.""" + + total_rows = n_records * n_clones + block_geoids = np.array( + [f"{COUNTY_FIPS}{i:06d}{i:04d}"[:15] for i in range(total_rows)], + dtype="U15", + ) + return GeographyAssignment( + block_geoid=block_geoids, + cd_geoid=np.full(total_rows, DISTRICT_GEOID, dtype="U4"), + county_fips=np.full(total_rows, COUNTY_FIPS, dtype="U5"), + state_fips=np.full(total_rows, STATE_FIPS, dtype=np.int32), + n_records=n_records, + n_clones=n_clones, + ) + + +def build_h5_request(area_type: str) -> AreaBuildRequest: + """Return a typed worker request for the tiny H5 fixture geography.""" + + if area_type == "district": + return AreaBuildRequest( + area_type="district", + area_id=f"{STATE_CODE}-01", + display_name=f"{STATE_CODE}-01", + output_relative_path=f"districts/{STATE_CODE}-01.h5", + filters=( + AreaFilter( + geography_field="cd_geoid", + op="in", + value=(DISTRICT_GEOID,), + ), + ), + validation_geo_level="district", + validation_geographic_ids=(DISTRICT_GEOID,), + ) + if area_type == "state": + return AreaBuildRequest( + area_type="state", + area_id=STATE_CODE, + display_name=STATE_CODE, + output_relative_path=f"states/{STATE_CODE}.h5", + filters=( + AreaFilter( + geography_field="cd_geoid", + op="in", + value=(DISTRICT_GEOID,), + ), + ), + validation_geo_level="state", + validation_geographic_ids=(str(STATE_FIPS),), + ) + if area_type == "national": + return AreaBuildRequest( + area_type="national", + area_id="US", + display_name="US", + output_relative_path="national/US.h5", + validation_geo_level="national", + 
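+            # Unlike the district and state branches above, the national
+            # request sets no geography filters, so it should cover every
+            # record in the tiny dataset.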
validation_geographic_ids=("US",), + ) + raise ValueError(f"Unsupported tiny H5 request type: {area_type}") + + +def build_publishing_input_bundle( + artifacts: TinyH5Artifacts, + *, + run_id: str, + scope: str, +) -> PublishingInputBundle: + """Build the same traceability input shape used by local H5 publication.""" + + return PublishingInputBundle( + weights_path=artifacts.weights_path, + source_dataset_path=artifacts.dataset_path, + target_db_path=artifacts.db_path, + exact_geography_path=artifacts.geography_path, + calibration_package_path=( + artifacts.calibration_package_path if scope == "regional" else None + ), + run_config_path=artifacts.run_config_path, + run_id=run_id, + version=VERSION, + n_clones=artifacts.n_clones, + seed=SEED, + ) + + +def run_local_h5_worker( + *, + requests: tuple[AreaBuildRequest, ...], + artifacts: TinyH5Artifacts, + output_dir: Path, + use_saved_geography: bool, + use_package_geography: bool, +) -> dict: + """Run the real local H5 worker subprocess for tiny fixture requests.""" + + cmd = [ + sys.executable, + "-m", + "modal_app.worker_script", + "--requests-json", + json.dumps([request.to_dict() for request in requests]), + "--weights-path", + str(artifacts.weights_path), + "--dataset-path", + str(artifacts.dataset_path), + "--db-path", + str(artifacts.db_path), + "--output-dir", + str(output_dir), + "--n-clones", + str(artifacts.n_clones), + "--no-validate", + ] + if use_saved_geography: + cmd.extend(["--geography-path", str(artifacts.geography_path)]) + if use_package_geography: + cmd.extend( + [ + "--calibration-package-path", + str(artifacts.calibration_package_path), + ] + ) + + result = subprocess.run( + cmd, + capture_output=True, + text=True, + check=True, + ) + return json.loads(result.stdout) + + +def _household_count(dataset_path: Path) -> int: + from policyengine_us import Microsimulation + + sim = Microsimulation(dataset=str(dataset_path)) + try: + return int(len(sim.calculate("household_id", map_to="household").values)) + finally: + del sim + + +def _run_metadata() -> dict[str, object]: + return { + "git_commit": "deadbeefcafebabe", + "git_branch": "main", + "git_dirty": False, + "package_version": VERSION, + } + + +def _write_calibration_package( + path: Path, + *, + geography: GeographyAssignment, +) -> None: + with open(path, "wb") as handle: + pickle.dump( + { + "block_geoid": geography.block_geoid, + "cd_geoid": geography.cd_geoid, + "metadata": _run_metadata(), + }, + handle, + protocol=pickle.HIGHEST_PROTOCOL, + ) + + +def _write_policy_data_db(path: Path) -> None: + conn = sqlite3.connect(path) + try: + conn.execute( + """ + CREATE TABLE stratum_constraints ( + stratum_id INTEGER, + constraint_variable TEXT, + value TEXT + ) + """ + ) + conn.execute( + """ + INSERT INTO stratum_constraints (stratum_id, constraint_variable, value) + VALUES (?, ?, ?) 
+            """,
+            (1, "congressional_district_geoid", DISTRICT_GEOID),
+        )
+        conn.commit()
+    finally:
+        conn.close()

diff --git a/tests/integration/support/tiny_stage_3.py b/tests/integration/support/tiny_stage_3.py
index 755f24853..e4947fb80 100644
--- a/tests/integration/support/tiny_stage_3.py
+++ b/tests/integration/support/tiny_stage_3.py
@@ -35,6 +35,8 @@
         "cps_race",
         "detailed_occupation_recode",
         "treasury_tipped_occupation_code",
+        "tanf_reported",
+        "ssi_reported",
         "is_puf_clone",
     )
 )
@@ -51,8 +53,6 @@
         "spm_unit_spm_threshold",
         "spm_unit_capped_housing_subsidy_reported",
         "snap_reported",
-        "tanf_reported",
-        "ssi_reported",
         "household_is_puf_clone",
     )
 )
@@ -226,6 +226,8 @@ def _extended_person_arrays(
             person_count,
             dtype=np.int16,
         ),
+        "tanf_reported": np.zeros(person_count, dtype=np.float32),
+        "ssi_reported": np.zeros(person_count, dtype=np.float32),
         "is_puf_clone": np.concatenate(
             [
                 np.zeros(cps_person_count, dtype=np.bool_),
@@ -269,8 +271,6 @@ def _extended_group_arrays(
             0,
         ).astype(np.float32),
         "snap_reported": np.where(total_income < 50_000, 1_000, 0).astype(np.float32),
-        "tanf_reported": np.zeros(household_count, dtype=np.float32),
-        "ssi_reported": np.zeros(household_count, dtype=np.float32),
         "household_is_puf_clone": np.concatenate(
             [
                 np.zeros(cps_household_count, dtype=np.bool_),

diff --git a/tests/integration/support/tiny_stage_5.py b/tests/integration/support/tiny_stage_5.py
index 824a16c96..59ba55d54 100644
--- a/tests/integration/support/tiny_stage_5.py
+++ b/tests/integration/support/tiny_stage_5.py
@@ -25,15 +25,13 @@

 SOURCE_IMPUTED_PERSON_VARIABLES = (
     "tip_income",
-    "hourly_wage",
-    "is_paid_hourly",
-    "is_union_member_or_covered",
-)
-SOURCE_IMPUTED_HOUSEHOLD_VARIABLES = (
     "pre_subsidy_rent",
     "bank_account_assets",
     "stock_assets",
     "bond_assets",
+    "is_paid_hourly",
+)
+SOURCE_IMPUTED_HOUSEHOLD_VARIABLES = (
     "household_vehicles_value",
     "net_worth",
     "auto_loan_balance",
@@ -219,10 +217,7 @@ def _source_imputed_person_arrays(
 ) -> dict[str, np.ndarray]:
     employment_income = arrays["employment_income"].astype(np.float32)
     hours = arrays["weekly_hours_worked"].astype(np.float32)
-    annual_hours = np.maximum(hours * 52, 1)
-    hourly_wage = np.where(hours > 0, employment_income / annual_hours, 0).astype(
-        np.float32
-    )
+    household_assets = _source_imputed_household_asset_inputs(arrays)

     return {
         "tip_income": np.where(
@@ -230,32 +225,38 @@
             np.round(employment_income * 0.08, 2),
             0,
         ).astype(np.float32),
-        "hourly_wage": np.round(hourly_wage, 2).astype(np.float32),
-        "is_paid_hourly": (hours > 0) & (hourly_wage < 60),
-        "is_union_member_or_covered": _resize_pattern(
-            [False, True, False, False],
-            len(arrays["person_id"]),
-            dtype=np.bool_,
+        "pre_subsidy_rent": _household_values_to_person(
+            arrays,
+            arrays["rent"].astype(np.float32),
         ),
+        "bank_account_assets": _household_values_to_person(
+            arrays,
+            household_assets["bank_account_assets"],
+        ),
+        "stock_assets": _household_values_to_person(
+            arrays,
+            household_assets["stock_assets"],
+        ),
+        "bond_assets": _household_values_to_person(
+            arrays,
+            household_assets["bond_assets"],
+        ),
+        "is_paid_hourly": hours > 0,
     }


 def _source_imputed_household_arrays(
     arrays: dict[str, np.ndarray],
 ) -> dict[str, np.ndarray]:
-    income = arrays["spm_unit_total_income_reported"].astype(np.float32)
+    household_assets = _source_imputed_household_asset_inputs(arrays)
+    bank_account_assets = household_assets["bank_account_assets"]
+    stock_assets = household_assets["stock_assets"]
+    bond_assets = household_assets["bond_assets"]
     vehicles_owned = arrays["household_vehicles_owned"].astype(np.float32)
-    bank_account_assets = np.round(np.maximum(income * 0.06, 250), 2).astype(np.float32)
-    stock_assets = np.round(np.where(income > 80_000, income * 0.35, income * 0.05), 2)
-    bond_assets = np.round(np.where(income > 50_000, income * 0.03, 0), 2)
     vehicle_value = np.round(vehicles_owned * 8_000, 2)
     auto_loan_balance = np.round(vehicles_owned * 2_500, 2)

     return {
-        "pre_subsidy_rent": arrays["rent"].astype(np.float32).copy(),
-        "bank_account_assets": bank_account_assets.astype(np.float32),
-        "stock_assets": stock_assets.astype(np.float32),
-        "bond_assets": bond_assets.astype(np.float32),
         "household_vehicles_value": vehicle_value.astype(np.float32),
         "net_worth": (
             bank_account_assets
@@ -269,6 +270,38 @@
     }


+def _source_imputed_household_asset_inputs(
+    arrays: dict[str, np.ndarray],
+) -> dict[str, np.ndarray]:
+    income = arrays["spm_unit_total_income_reported"].astype(np.float32)
+    return {
+        "bank_account_assets": np.round(np.maximum(income * 0.06, 250), 2).astype(
+            np.float32
+        ),
+        "stock_assets": np.round(
+            np.where(income > 80_000, income * 0.35, income * 0.05),
+            2,
+        ).astype(np.float32),
+        "bond_assets": np.round(np.where(income > 50_000, income * 0.03, 0), 2).astype(
+            np.float32
+        ),
+    }
+
+
+def _household_values_to_person(
+    arrays: dict[str, np.ndarray],
+    household_values: np.ndarray,
+) -> np.ndarray:
+    household_id_to_value = dict(zip(arrays["household_id"], household_values))
+    return np.array(
+        [
+            household_id_to_value[household_id]
+            for household_id in arrays["person_household_id"]
+        ],
+        dtype=np.asarray(household_values).dtype,
+    )
+
+
 def _subset_by_household_ids(
     arrays: dict[str, np.ndarray],
     household_ids: np.ndarray,
diff --git a/tests/integration/test_tiny_pipeline_h5_e2e.py b/tests/integration/test_tiny_pipeline_h5_e2e.py
new file mode 100644
index 000000000..f23752a92
--- /dev/null
+++ b/tests/integration/test_tiny_pipeline_h5_e2e.py
@@ -0,0 +1,114 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+import h5py
+import pytest
+
+from policyengine_us_data.calibration.local_h5.fingerprinting import (
+    FingerprintingService,
+)
+from policyengine_us_data.utils.manifest import generate_manifest, verify_manifest
+from tests.integration.support.pipeline_workspace import TinyPipelineWorkspace
+from tests.integration.support.tiny_h5 import (
+    build_h5_request,
+    build_publishing_input_bundle,
+    create_tiny_h5_artifacts,
+    run_local_h5_worker,
+)
+from tests.integration.support.tiny_pipeline import create_tiny_pipeline_artifacts
+
+pytestmark = pytest.mark.integration
+
+pytest.importorskip("scipy")
+pytest.importorskip("spm_calculator")
+
+
+def test_tiny_pipeline_stage_5_outputs_continue_into_local_h5s(tmp_path):
+    workspace = TinyPipelineWorkspace.create(tmp_path / "tiny-pipeline")
+    pipeline_artifacts = create_tiny_pipeline_artifacts(workspace)
+    h5_artifacts = create_tiny_h5_artifacts(workspace, pipeline_artifacts)
+    run_id = "tiny-run-001"
+    run_dir = workspace.h5_staging / run_id
+
+    requests = (
+        build_h5_request("district"),
+        build_h5_request("state"),
+        build_h5_request("national"),
+    )
+
+    result = run_local_h5_worker(
+        requests=requests,
+        artifacts=h5_artifacts,
+        output_dir=run_dir,
+        use_saved_geography=True,
+        use_package_geography=False,
+    )
+
+    assert result["failed"] == []
+    assert result["errors"] == []
+    assert result["completed"] == ["district:NC-01", "state:NC", "national:US"]
+
+    for request in requests:
+        h5_path = run_dir / request.output_relative_path
+        assert h5_path.exists()
+        _assert_h5_contract(h5_path)
+
+    manifest = generate_manifest(
+        workspace.h5_staging,
+        run_id,
+        version="0.0.0",
+        categories=["states", "districts", "national"],
+    )
+    verification = verify_manifest(workspace.h5_staging, manifest, subdir=run_id)
+
+    assert sorted(manifest["files"]) == [
+        "districts/NC-01.h5",
+        "national/US.h5",
+        "states/NC.h5",
+    ]
+    assert manifest["totals"]["states"] == 1
+    assert manifest["totals"]["districts"] == 1
+    assert manifest["totals"]["national"] == 1
+    assert verification == {
+        "valid": True,
+        "missing": [],
+        "checksum_mismatch": [],
+        "verified": 3,
+    }
+
+    fingerprints = _scope_fingerprints(h5_artifacts, run_id=run_id)
+    assert set(fingerprints) == {"regional", "national"}
+    assert fingerprints["regional"] != fingerprints["national"]
+    assert all(len(value) == 16 for value in fingerprints.values())
+
+
+def _assert_h5_contract(path: Path) -> None:
+    with h5py.File(path, mode="r") as h5:
+        for variable in (
+            "household_id",
+            "person_id",
+            "household_weight",
+            "state_fips",
+            "congressional_district_geoid",
+        ):
+            assert variable in h5
+            assert "2024" in h5[variable]
+            assert len(h5[variable]["2024"]) > 0
+
+
+def _scope_fingerprints(h5_artifacts, *, run_id: str) -> dict[str, str]:
+    service = FingerprintingService()
+    fingerprints = {}
+    for scope in ("regional", "national"):
+        inputs = build_publishing_input_bundle(
+            h5_artifacts,
+            run_id=run_id,
+            scope=scope,
+        )
+        traceability = service.build_traceability(inputs=inputs, scope=scope)
+        assert traceability.metadata["run_id"] == run_id
+        assert traceability.weights.path == h5_artifacts.weights_path
+        assert traceability.source_dataset.path == h5_artifacts.dataset_path
+        fingerprints[scope] = service.compute_scope_fingerprint(traceability)
+    return fingerprints
diff --git a/tests/integration/test_tiny_pipeline_workspace.py b/tests/integration/test_tiny_pipeline_workspace.py
index 49b53ed56..5ccd77cf7 100644
--- a/tests/integration/test_tiny_pipeline_workspace.py
+++ b/tests/integration/test_tiny_pipeline_workspace.py
@@ -37,7 +37,7 @@ def test_tiny_pipeline_workspace_resolves_expected_artifacts(tmp_path):
         workspace.stage_1 / "irs_puf_2015.h5",
     )
     assert workspace.expected_artifacts("h5_outputs") == (
-        workspace.h5_outputs / "states" / "AL.h5",
+        workspace.h5_outputs / "states" / "NC.h5",
         workspace.h5_outputs / "districts" / "NC-01.h5",
         workspace.h5_outputs / "national" / "US.h5",
     )
diff --git a/tests/integration/test_tiny_stage_5_artifacts.py b/tests/integration/test_tiny_stage_5_artifacts.py
index e7ad96f7d..0ccf4cab1 100644
--- a/tests/integration/test_tiny_stage_5_artifacts.py
+++ b/tests/integration/test_tiny_stage_5_artifacts.py
@@ -78,15 +78,15 @@ def test_source_imputed_stratified_cps_adds_expected_imputations(tmp_path):
     arrays = _load_period_arrays(artifacts.source_imputed_path)

     assert arrays["tip_income"].shape == arrays["person_id"].shape
-    assert arrays["hourly_wage"].shape == arrays["person_id"].shape
+    assert arrays["pre_subsidy_rent"].shape == arrays["person_id"].shape
+    assert arrays["bank_account_assets"].shape == arrays["person_id"].shape
+    assert arrays["stock_assets"].shape == arrays["person_id"].shape
+    assert arrays["bond_assets"].shape == arrays["person_id"].shape
     assert arrays["is_paid_hourly"].dtype == np.bool_
-    assert arrays["is_union_member_or_covered"].dtype == np.bool_
     assert arrays["tip_income"].sum() > 0
-    assert arrays["bank_account_assets"].shape == arrays["household_id"].shape
     assert arrays["net_worth"].shape == arrays["household_id"].shape
     assert (arrays["bank_account_assets"] >= 0).all()
     assert (arrays["net_worth"] >= 0).all()
-    np.testing.assert_allclose(arrays["pre_subsidy_rent"], arrays["rent"])


 def test_small_enhanced_cps_is_subset_with_enhanced_contract(tmp_path):

From e8a37fb09042fea4737ce391375bce1b4bd63ec7 Mon Sep 17 00:00:00 2001
From: Anthony Volk
Date: Wed, 29 Apr 2026 22:53:01 +0200
Subject: [PATCH 25/25] Strengthen Modal runtime seam checks

---
 modal_app/pipeline.py | 65 ++++++++++++++++---
 .../integration/test_modal_pipeline_seams.py | 26 ++++++++
 2 files changed, 82 insertions(+), 9 deletions(-)

diff --git a/modal_app/pipeline.py b/modal_app/pipeline.py
index 478d0e073..e23ed24a8 100644
--- a/modal_app/pipeline.py
+++ b/modal_app/pipeline.py
@@ -482,6 +482,17 @@ def verify_runtime_seams() -> dict:
     import importlib

     repo_root = "/root/policyengine-us-data"
+    expected_files = (
+        "pyproject.toml",
+        "uv.lock",
+        "modal_app/worker_script.py",
+        "modal_app/local_area.py",
+        "modal_app/h5_test_harness.py",
+        "modal_app/fixtures/h5_cases.py",
+        "tests/integration/test_fixture_50hh.h5",
+        "policyengine_us_data/calibration/target_config.yaml",
+        "policyengine_us_data/calibration/target_config_full.yaml",
+    )
     result = {
         "interpreter": {
             "parent": sys.executable,
@@ -489,38 +500,74 @@
         "imports": {},
         "subprocess": {},
         "paths": {
+            "cwd": os.getcwd(),
             "repo_root_exists": os.path.isdir(repo_root),
+            "working_directory_is_repo_root": os.getcwd() == repo_root,
             "target_config_exists": os.path.exists(
                 f"{repo_root}/policyengine_us_data/calibration/target_config.yaml"
             ),
+            "expected_files": {
+                rel_path: os.path.exists(f"{repo_root}/{rel_path}")
+                for rel_path in expected_files
+            },
         },
     }
+    result["paths"]["all_expected_files_exist"] = all(
+        result["paths"]["expected_files"].values()
+    )

     for module_name in (
-        "pandas",
+        "google.cloud.storage",
         "h5py",
-        "policyengine_us_data",
+        "huggingface_hub",
+        "modal_app.fixtures.h5_cases",
+        "modal_app.h5_test_harness",
+        "modal_app.local_area",
+        "modal_app.remote_calibration_runner",
         "modal_app.worker_script",
+        "numpy",
+        "pandas",
+        "policyengine_us",
+        "policyengine_us_data",
+        "spm_calculator",
+        "sqlalchemy",
     ):
-        imported = importlib.import_module(module_name)
-        result["imports"][module_name] = {
-            "ok": True,
-            "version": getattr(imported, "__version__", None),
-        }
+        try:
+            imported = importlib.import_module(module_name)
+            result["imports"][module_name] = {
+                "ok": True,
+                "version": getattr(imported, "__version__", None),
+            }
+        except Exception as exc:
+            result["imports"][module_name] = {
+                "ok": False,
+                "error": repr(exc),
+            }

     child_python = subprocess.run(
-        _python_cmd("-c", "import sys; print(sys.executable)"),
+        _python_cmd(
+            "-c",
+            (
+                "import json, os, sys; "
+                "print(json.dumps({'executable': sys.executable, 'cwd': os.getcwd()}))"
+            ),
+        ),
         capture_output=True,
         text=True,
         check=True,
         cwd=repo_root,
     )
-    child_exec = child_python.stdout.strip()
+    child_runtime = json.loads(child_python.stdout)
+    child_exec = child_runtime["executable"]
     result["interpreter"]["child"] = child_exec
+    result["interpreter"]["child_cwd"] = child_runtime["cwd"]
     result["interpreter"]["child_matches_parent"] = child_exec == sys.executable
+    result["interpreter"]["child_cwd_is_repo_root"] = child_runtime["cwd"] == repo_root

     for name, cmd in {
+        "worker_import": _python_cmd("-c", "import modal_app.worker_script"),
         "worker_help": _python_cmd("-m", "modal_app.worker_script", "--help"),
"modal_app.worker_script", "--help"), + "local_area_import": _python_cmd("-c", "import modal_app.local_area"), "calibration_help": _python_cmd( "-m", "policyengine_us_data.calibration.unified_calibration", diff --git a/tests/integration/test_modal_pipeline_seams.py b/tests/integration/test_modal_pipeline_seams.py index ac13629cc..eb7842f18 100644 --- a/tests/integration/test_modal_pipeline_seams.py +++ b/tests/integration/test_modal_pipeline_seams.py @@ -35,17 +35,43 @@ def test_pipeline_image_runtime_seams(): assert result["paths"]["repo_root_exists"] is True assert result["paths"]["target_config_exists"] is True + assert result["paths"]["working_directory_is_repo_root"] is True + assert result["paths"]["all_expected_files_exist"] is True + assert result["paths"]["expected_files"] == { + "pyproject.toml": True, + "uv.lock": True, + "modal_app/worker_script.py": True, + "modal_app/local_area.py": True, + "modal_app/h5_test_harness.py": True, + "modal_app/fixtures/h5_cases.py": True, + "tests/integration/test_fixture_50hh.h5": True, + "policyengine_us_data/calibration/target_config.yaml": True, + "policyengine_us_data/calibration/target_config_full.yaml": True, + } for module_name in ( + "google.cloud.storage", "pandas", "h5py", + "huggingface_hub", + "modal_app.fixtures.h5_cases", + "modal_app.h5_test_harness", + "modal_app.local_area", + "modal_app.remote_calibration_runner", + "numpy", + "policyengine_us", "policyengine_us_data", "modal_app.worker_script", + "spm_calculator", + "sqlalchemy", ): assert result["imports"][module_name]["ok"] is True assert result["interpreter"]["child_matches_parent"] is True + assert result["interpreter"]["child_cwd_is_repo_root"] is True + assert result["subprocess"]["worker_import"]["returncode"] == 0 assert result["subprocess"]["worker_help"]["returncode"] == 0 + assert result["subprocess"]["local_area_import"]["returncode"] == 0 assert result["subprocess"]["calibration_help"]["returncode"] == 0 checkpoint_policy = result["calibration_optimizer_checkpoint_policy"] assert checkpoint_policy["runner_exposes_checkpoint_name"] is False