PolicyEngine · MaxGhenis · Apr 26, 2026 · Apr 26, 2026 · Apr 26, 2026
diff --git a/.gitignore b/.gitignore
@@ -22,6 +22,7 @@ node_modules
 !population_by_state.csv
 !aca_spending_and_enrollment_2024.csv
 !aca_spending_and_enrollment_2025.csv
+!policyengine_us_data/storage/calibration_targets/acs_housing_costs_2024.csv
 !real_estate_taxes_by_state_acs.csv
 !snap_state.csv
 !age_state.csv

diff --git a/changelog.d/831.changed.md b/changelog.d/831.changed.md
@@ -0,0 +1 @@
+Refined national ECPS calibration targets to remove circular survey/SPM constructs while keeping defensible rent, property tax, childcare, private-transfer balance constraints, structured EITC-by-AGI-and-child-count SOI targets, and taxable-filer AGI/count targets by AGI band and filing status. Added a national target parity manifest utility to classify legacy `build_loss_matrix()` labels against structured `policy_data.db` target rows.
diff --git a/policyengine_us_data/calibration/calibration_utils.py b/policyengine_us_data/calibration/calibration_utils.py
@@ -268,6 +268,11 @@ def apply_op(values: np.ndarray, op: str, val: str) -> np.ndarray:
     if values.dtype.kind == "S" and isinstance(parsed, str):
         parsed = parsed.encode()
 
+    if op == "in":
+        allowed = [part.strip() for part in val.split("|")]
+        if values.dtype.kind == "S":
+            allowed = [part.encode() for part in allowed]
+        return np.isin(values, allowed)
     if op in ("==", "="):
         return values == parsed
     if op == ">":

diff --git a/policyengine_us_data/calibration/target_config.yaml b/policyengine_us_data/calibration/target_config.yaml
@@ -92,6 +92,18 @@ include:
   - variable: adjusted_gross_income
     geo_level: national
     domain_variable: adjusted_gross_income
+  - variable: tax_unit_count
+    geo_level: national
+    domain_variable: adjusted_gross_income,income_tax_before_credits
+  - variable: adjusted_gross_income
+    geo_level: national
+    domain_variable: adjusted_gross_income,income_tax_before_credits
+  - variable: tax_unit_count
+    geo_level: national
+    domain_variable: adjusted_gross_income,filing_status,income_tax_before_credits
+  - variable: adjusted_gross_income
+    geo_level: national
+    domain_variable: adjusted_gross_income,filing_status,income_tax_before_credits
 
   # === NATIONAL — wealth target (Federal Reserve SCF, no filer filter) ===
   - variable: net_worth
@@ -108,6 +120,8 @@ include:
     geo_level: national
   - variable: child_support_received
     geo_level: national
+  - variable: childcare_expenses
+    geo_level: national
   - variable: eitc
     geo_level: national
   - variable: health_insurance_premiums_without_medicare_part_b
@@ -171,6 +185,9 @@ include:
   - variable: eitc
     geo_level: national
     domain_variable: eitc_child_count
+  - variable: eitc
+    geo_level: national
+    domain_variable: adjusted_gross_income,eitc,eitc_child_count
   - variable: net_capital_gains
     geo_level: national
     domain_variable: net_capital_gains
@@ -206,6 +223,9 @@ include:
   - variable: tax_unit_count
     geo_level: national
     domain_variable: eitc_child_count
+  - variable: tax_unit_count
+    geo_level: national
+    domain_variable: adjusted_gross_income,eitc,eitc_child_count
   # Restore old loss.py's ACA enrollment count target.
   - variable: person_count
     geo_level: national

diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py
@@ -1,7 +1,9 @@
 from policyengine_core.data import Dataset
 import pandas as pd
 from policyengine_us_data.utils import (
+    ABSOLUTE_ERROR_SCALE_TARGETS,
     build_loss_matrix,
+    get_target_error_normalisation,
     HardConcrete,
     print_reweighting_diagnostics,
     set_seeds,
@@ -113,6 +115,10 @@ def reweight(
 ):
     target_names = np.array(loss_matrix.columns)
     is_national = loss_matrix.columns.str.startswith("nation/")
+    numerator_shift_np, error_denominator_np = get_target_error_normalisation(
+        target_names,
+        targets_array,
+    )
     loss_matrix = torch.tensor(loss_matrix.values, dtype=torch.float32)
     nation_normalisation_factor = is_national * (1 / is_national.sum())
     state_normalisation_factor = ~is_national * (1 / (~is_national).sum())
@@ -121,6 +127,8 @@ def reweight(
     )
     normalisation_factor = torch.tensor(normalisation_factor, dtype=torch.float32)
     targets_array = torch.tensor(targets_array, dtype=torch.float32)
+    numerator_shift = torch.tensor(numerator_shift_np, dtype=torch.float32)
+    error_denominator = torch.tensor(error_denominator_np, dtype=torch.float32)
 
     inv_mean_normalisation = 1 / np.mean(normalisation_factor.numpy())
 
@@ -132,7 +140,9 @@ def loss(weights):
         estimate = weights @ loss_matrix
         if torch.isnan(estimate).any():
             raise ValueError("Estimate contains NaNs")
-        rel_error = (((estimate - targets_array) + 1) / (targets_array + 1)) ** 2
+        rel_error = (
+            (estimate - targets_array + numerator_shift) / error_denominator
+        ) ** 2
         rel_error_normalized = inv_mean_normalisation * rel_error * normalisation_factor
         if torch.isnan(rel_error_normalized).any():
             raise ValueError("Relative error contains NaNs")
@@ -176,7 +186,10 @@ def loss(weights):
             )
             df["epoch"] = i
             df["error"] = df.estimate - df.target
-            df["rel_error"] = df.error / df.target
+            df["error_denominator"] = error_denominator.detach().numpy()
+            df["rel_error"] = (
+                df.error + numerator_shift.detach().numpy()
+            ) / df.error_denominator
             df["abs_error"] = df.error.abs()
             df["rel_abs_error"] = df.rel_error.abs()
             df["loss"] = df.rel_abs_error**2
@@ -203,6 +216,7 @@ def loss(weights):
         loss_matrix,
         targets_array,
         "L0 Sparse Solution",
+        target_names=target_names,
     )
 
     return final_weights_sparse
@@ -248,7 +262,12 @@ def generate(self):
         # Run the optimization procedure to get (close to) minimum loss weights
         for year in range(self.start_year, self.end_year + 1):
             loss_matrix, targets_array = build_loss_matrix(self.input_dataset, year)
-            zero_mask = np.isclose(targets_array, 0.0, atol=0.1)
+            scaled_zero_target_mask = loss_matrix.columns.isin(
+                ABSOLUTE_ERROR_SCALE_TARGETS.keys()
+            )
+            zero_mask = np.isclose(targets_array, 0.0, atol=0.1) & (
+                ~scaled_zero_target_mask
+            )
             bad_mask = loss_matrix.columns.isin(bad_targets)
             keep_mask_bool = ~(zero_mask | bad_mask)
             keep_idx = np.where(keep_mask_bool)[0]

diff --git a/policyengine_us_data/db/DATABASE_GUIDE.md b/policyengine_us_data/db/DATABASE_GUIDE.md
@@ -52,6 +52,24 @@ rm -f policyengine_us_data/storage/calibration/policy_data.db
 make database
 ```
 
+### Legacy National Target Parity
+
+The legacy national Enhanced CPS pipeline still builds labels through
+`policyengine_us_data.utils.loss.build_loss_matrix()`. To audit whether those
+labels correspond to structured rows in `policy_data.db`, build a parity
+manifest:
+
+```bash
+python -m policyengine_us_data.utils.national_target_parity \
+  --dataset-path policyengine_us_data/storage/enhanced_cps_2024.h5 \
+  --target-db policyengine_us_data/storage/calibration/policy_data.db \
+  --period 2024 \
+  --output national_target_parity.json
+```
+
+The manifest classifies each national loss label as either `matched` (with the
+corresponding target ID) or `legacy_only` (with an explicit reason).
+
 ## Database Schema
 
 ### Core Tables

diff --git a/policyengine_us_data/db/create_field_valid_values.py b/policyengine_us_data/db/create_field_valid_values.py
@@ -45,6 +45,7 @@ def populate_field_valid_values(session: Session) -> None:
     operation_values = [
         ("operation", "==", "Equals"),
         ("operation", "!=", "Not equals"),
+        ("operation", "in", "In pipe-delimited set"),
         ("operation", ">", "Greater than"),
         ("operation", ">=", "Greater than or equal"),
         ("operation", "<", "Less than"),
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Refined national ECPS calibration targets to remove circular survey/SPM constructs while keeping defensible rent, property tax, childcare, private-transfer balance constraints, structured EITC-by-AGI-and-child-count SOI targets, and taxable-filer AGI/count targets by AGI band and filing status. Added a national target parity manifest utility to classify legacy `build_loss_matrix()` labels against structured `policy_data.db` target rows.