Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ node_modules
!population_by_state.csv
!aca_spending_and_enrollment_2024.csv
!aca_spending_and_enrollment_2025.csv
!policyengine_us_data/storage/calibration_targets/acs_housing_costs_2024.csv
!real_estate_taxes_by_state_acs.csv
!snap_state.csv
!age_state.csv
Expand Down
1 change: 1 addition & 0 deletions changelog.d/831.changed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Refined national ECPS calibration targets to remove circular survey/SPM constructs while keeping defensible rent, property tax, childcare, and private-transfer balance constraints; structured EITC-by-AGI-and-child-count SOI targets; and taxable-filer AGI/count targets by AGI band and filing status. Added a national target parity manifest utility to classify legacy `build_loss_matrix()` labels against structured `policy_data.db` target rows.
5 changes: 5 additions & 0 deletions policyengine_us_data/calibration/calibration_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,11 @@ def apply_op(values: np.ndarray, op: str, val: str) -> np.ndarray:
if values.dtype.kind == "S" and isinstance(parsed, str):
parsed = parsed.encode()

if op == "in":
allowed = [part.strip() for part in val.split("|")]
if values.dtype.kind == "S":
allowed = [part.encode() for part in allowed]
return np.isin(values, allowed)
if op in ("==", "="):
return values == parsed
if op == ">":
Expand Down
20 changes: 20 additions & 0 deletions policyengine_us_data/calibration/target_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,18 @@ include:
- variable: adjusted_gross_income
geo_level: national
domain_variable: adjusted_gross_income
- variable: tax_unit_count
geo_level: national
domain_variable: adjusted_gross_income,income_tax_before_credits
- variable: adjusted_gross_income
geo_level: national
domain_variable: adjusted_gross_income,income_tax_before_credits
- variable: tax_unit_count
geo_level: national
domain_variable: adjusted_gross_income,filing_status,income_tax_before_credits
- variable: adjusted_gross_income
geo_level: national
domain_variable: adjusted_gross_income,filing_status,income_tax_before_credits

# === NATIONAL — wealth target (Federal Reserve SCF, no filer filter) ===
- variable: net_worth
Expand All @@ -108,6 +120,8 @@ include:
geo_level: national
- variable: child_support_received
geo_level: national
- variable: childcare_expenses
geo_level: national
- variable: eitc
geo_level: national
- variable: health_insurance_premiums_without_medicare_part_b
Expand Down Expand Up @@ -171,6 +185,9 @@ include:
- variable: eitc
geo_level: national
domain_variable: eitc_child_count
- variable: eitc
geo_level: national
domain_variable: adjusted_gross_income,eitc,eitc_child_count
- variable: net_capital_gains
geo_level: national
domain_variable: net_capital_gains
Expand Down Expand Up @@ -206,6 +223,9 @@ include:
- variable: tax_unit_count
geo_level: national
domain_variable: eitc_child_count
- variable: tax_unit_count
geo_level: national
domain_variable: adjusted_gross_income,eitc,eitc_child_count
# Restore old loss.py's ACA enrollment count target.
- variable: person_count
geo_level: national
Expand Down
25 changes: 22 additions & 3 deletions policyengine_us_data/datasets/cps/enhanced_cps.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from policyengine_core.data import Dataset
import pandas as pd
from policyengine_us_data.utils import (
ABSOLUTE_ERROR_SCALE_TARGETS,
build_loss_matrix,
get_target_error_normalisation,
HardConcrete,
print_reweighting_diagnostics,
set_seeds,
Expand Down Expand Up @@ -113,6 +115,10 @@ def reweight(
):
target_names = np.array(loss_matrix.columns)
is_national = loss_matrix.columns.str.startswith("nation/")
numerator_shift_np, error_denominator_np = get_target_error_normalisation(
target_names,
targets_array,
)
loss_matrix = torch.tensor(loss_matrix.values, dtype=torch.float32)
nation_normalisation_factor = is_national * (1 / is_national.sum())
state_normalisation_factor = ~is_national * (1 / (~is_national).sum())
Expand All @@ -121,6 +127,8 @@ def reweight(
)
normalisation_factor = torch.tensor(normalisation_factor, dtype=torch.float32)
targets_array = torch.tensor(targets_array, dtype=torch.float32)
numerator_shift = torch.tensor(numerator_shift_np, dtype=torch.float32)
error_denominator = torch.tensor(error_denominator_np, dtype=torch.float32)

inv_mean_normalisation = 1 / np.mean(normalisation_factor.numpy())

Expand All @@ -132,7 +140,9 @@ def loss(weights):
estimate = weights @ loss_matrix
if torch.isnan(estimate).any():
raise ValueError("Estimate contains NaNs")
rel_error = (((estimate - targets_array) + 1) / (targets_array + 1)) ** 2
rel_error = (
(estimate - targets_array + numerator_shift) / error_denominator
) ** 2
rel_error_normalized = inv_mean_normalisation * rel_error * normalisation_factor
if torch.isnan(rel_error_normalized).any():
raise ValueError("Relative error contains NaNs")
Expand Down Expand Up @@ -176,7 +186,10 @@ def loss(weights):
)
df["epoch"] = i
df["error"] = df.estimate - df.target
df["rel_error"] = df.error / df.target
df["error_denominator"] = error_denominator.detach().numpy()
df["rel_error"] = (
df.error + numerator_shift.detach().numpy()
) / df.error_denominator
df["abs_error"] = df.error.abs()
df["rel_abs_error"] = df.rel_error.abs()
df["loss"] = df.rel_abs_error**2
Expand All @@ -203,6 +216,7 @@ def loss(weights):
loss_matrix,
targets_array,
"L0 Sparse Solution",
target_names=target_names,
)

return final_weights_sparse
Expand Down Expand Up @@ -248,7 +262,12 @@ def generate(self):
# Run the optimization procedure to get (close to) minimum loss weights
for year in range(self.start_year, self.end_year + 1):
loss_matrix, targets_array = build_loss_matrix(self.input_dataset, year)
zero_mask = np.isclose(targets_array, 0.0, atol=0.1)
scaled_zero_target_mask = loss_matrix.columns.isin(
ABSOLUTE_ERROR_SCALE_TARGETS.keys()
)
zero_mask = np.isclose(targets_array, 0.0, atol=0.1) & (
~scaled_zero_target_mask
)
bad_mask = loss_matrix.columns.isin(bad_targets)
keep_mask_bool = ~(zero_mask | bad_mask)
keep_idx = np.where(keep_mask_bool)[0]
Expand Down
18 changes: 18 additions & 0 deletions policyengine_us_data/db/DATABASE_GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,24 @@ rm -f policyengine_us_data/storage/calibration/policy_data.db
make database
```

### Legacy National Target Parity

The legacy national Enhanced CPS pipeline still builds labels through
`policyengine_us_data.utils.loss.build_loss_matrix()`. To audit whether those
labels correspond to structured rows in `policy_data.db`, build a parity
manifest:

```bash
python -m policyengine_us_data.utils.national_target_parity \
--dataset-path policyengine_us_data/storage/enhanced_cps_2024.h5 \
--target-db policyengine_us_data/storage/calibration/policy_data.db \
--period 2024 \
--output national_target_parity.json
```

Each national loss label is classified either as `matched`, with a target ID,
or as `legacy_only`, with an explicit reason.

## Database Schema

### Core Tables
Expand Down
1 change: 1 addition & 0 deletions policyengine_us_data/db/create_field_valid_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def populate_field_valid_values(session: Session) -> None:
operation_values = [
("operation", "==", "Equals"),
("operation", "!=", "Not equals"),
("operation", "in", "In pipe-delimited set"),
("operation", ">", "Greater than"),
("operation", ">=", "Greater than or equal"),
("operation", "<", "Less than"),
Expand Down
Loading
Loading