diff --git a/.gitignore b/.gitignore
index 0a174f2fa..9607a08ef 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,6 +22,7 @@ node_modules
 !population_by_state.csv
 !aca_spending_and_enrollment_2024.csv
 !aca_spending_and_enrollment_2025.csv
+!policyengine_us_data/storage/calibration_targets/acs_housing_costs_2024.csv
 !real_estate_taxes_by_state_acs.csv
 !snap_state.csv
 !age_state.csv
diff --git a/changelog.d/831.changed.md b/changelog.d/831.changed.md
new file mode 100644
index 000000000..0c031900d
--- /dev/null
+++ b/changelog.d/831.changed.md
@@ -0,0 +1 @@
+Refined national ECPS calibration targets to remove circular survey/SPM constructs while keeping defensible rent, property-tax, childcare, and private-transfer balance constraints; structured EITC-by-AGI-and-child-count SOI targets; and taxable-filer AGI/count targets by AGI band and filing status. Added a national target parity manifest utility that classifies legacy `build_loss_matrix()` labels against structured `policy_data.db` target rows.
diff --git a/policyengine_us_data/calibration/calibration_utils.py b/policyengine_us_data/calibration/calibration_utils.py
index 4ffd4c3a9..9840f0354 100644
--- a/policyengine_us_data/calibration/calibration_utils.py
+++ b/policyengine_us_data/calibration/calibration_utils.py
@@ -268,6 +268,11 @@ def apply_op(values: np.ndarray, op: str, val: str) -> np.ndarray:
     if values.dtype.kind == "S" and isinstance(parsed, str):
         parsed = parsed.encode()
 
+    if op == "in":
+        allowed = [part.strip() for part in val.split("|")]
+        if values.dtype.kind == "S":
+            allowed = [part.encode() for part in allowed]
+        return np.isin(values, allowed)
     if op in ("==", "="):
         return values == parsed
     if op == ">":
diff --git a/policyengine_us_data/calibration/target_config.yaml b/policyengine_us_data/calibration/target_config.yaml
index 154dcf878..48c90dbe2 100644
--- a/policyengine_us_data/calibration/target_config.yaml
+++ b/policyengine_us_data/calibration/target_config.yaml
@@ -92,6 +92,18 @@ include:
   - variable: adjusted_gross_income
     geo_level: national
     domain_variable: adjusted_gross_income
+  - variable: tax_unit_count
+    geo_level: national
+    domain_variable: adjusted_gross_income,income_tax_before_credits
+  - variable: adjusted_gross_income
+    geo_level: national
+    domain_variable: adjusted_gross_income,income_tax_before_credits
+  - variable: tax_unit_count
+    geo_level: national
+    domain_variable: adjusted_gross_income,filing_status,income_tax_before_credits
+  - variable: adjusted_gross_income
+    geo_level: national
+    domain_variable: adjusted_gross_income,filing_status,income_tax_before_credits
 
   # === NATIONAL — wealth target (Federal Reserve SCF, no filer filter) ===
   - variable: net_worth
@@ -108,6 +120,8 @@ include:
     geo_level: national
   - variable: child_support_received
     geo_level: national
+  - variable: childcare_expenses
+    geo_level: national
   - variable: eitc
     geo_level: national
   - variable: health_insurance_premiums_without_medicare_part_b
@@ -171,6 +185,9 @@ include:
   - variable: eitc
     geo_level: national
     domain_variable: eitc_child_count
+  - variable: eitc
+    geo_level: national
+    domain_variable: adjusted_gross_income,eitc,eitc_child_count
   - variable: net_capital_gains
     geo_level: national
     domain_variable: net_capital_gains
@@ -206,6 +223,9 @@ include:
   - variable: tax_unit_count
     geo_level: national
     domain_variable: eitc_child_count
+  - variable: tax_unit_count
+    geo_level: national
+    domain_variable: adjusted_gross_income,eitc,eitc_child_count
   # Restore old loss.py's ACA enrollment count target.
- variable: person_count geo_level: national diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 57455617a..9d053d4c3 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -1,7 +1,9 @@ from policyengine_core.data import Dataset import pandas as pd from policyengine_us_data.utils import ( + ABSOLUTE_ERROR_SCALE_TARGETS, build_loss_matrix, + get_target_error_normalisation, HardConcrete, print_reweighting_diagnostics, set_seeds, @@ -113,6 +115,10 @@ def reweight( ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") + numerator_shift_np, error_denominator_np = get_target_error_normalisation( + target_names, + targets_array, + ) loss_matrix = torch.tensor(loss_matrix.values, dtype=torch.float32) nation_normalisation_factor = is_national * (1 / is_national.sum()) state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) @@ -121,6 +127,8 @@ def reweight( ) normalisation_factor = torch.tensor(normalisation_factor, dtype=torch.float32) targets_array = torch.tensor(targets_array, dtype=torch.float32) + numerator_shift = torch.tensor(numerator_shift_np, dtype=torch.float32) + error_denominator = torch.tensor(error_denominator_np, dtype=torch.float32) inv_mean_normalisation = 1 / np.mean(normalisation_factor.numpy()) @@ -132,7 +140,9 @@ def loss(weights): estimate = weights @ loss_matrix if torch.isnan(estimate).any(): raise ValueError("Estimate contains NaNs") - rel_error = (((estimate - targets_array) + 1) / (targets_array + 1)) ** 2 + rel_error = ( + (estimate - targets_array + numerator_shift) / error_denominator + ) ** 2 rel_error_normalized = inv_mean_normalisation * rel_error * normalisation_factor if torch.isnan(rel_error_normalized).any(): raise ValueError("Relative error contains NaNs") @@ -176,7 +186,10 @@ def loss(weights): ) df["epoch"] = i df["error"] = df.estimate - df.target - df["rel_error"] = df.error / df.target + df["error_denominator"] = error_denominator.detach().numpy() + df["rel_error"] = ( + df.error + numerator_shift.detach().numpy() + ) / df.error_denominator df["abs_error"] = df.error.abs() df["rel_abs_error"] = df.rel_error.abs() df["loss"] = df.rel_abs_error**2 @@ -203,6 +216,7 @@ def loss(weights): loss_matrix, targets_array, "L0 Sparse Solution", + target_names=target_names, ) return final_weights_sparse @@ -248,7 +262,12 @@ def generate(self): # Run the optimization procedure to get (close to) minimum loss weights for year in range(self.start_year, self.end_year + 1): loss_matrix, targets_array = build_loss_matrix(self.input_dataset, year) - zero_mask = np.isclose(targets_array, 0.0, atol=0.1) + scaled_zero_target_mask = loss_matrix.columns.isin( + ABSOLUTE_ERROR_SCALE_TARGETS.keys() + ) + zero_mask = np.isclose(targets_array, 0.0, atol=0.1) & ( + ~scaled_zero_target_mask + ) bad_mask = loss_matrix.columns.isin(bad_targets) keep_mask_bool = ~(zero_mask | bad_mask) keep_idx = np.where(keep_mask_bool)[0] diff --git a/policyengine_us_data/db/DATABASE_GUIDE.md b/policyengine_us_data/db/DATABASE_GUIDE.md index a7830f591..7f9f951a7 100644 --- a/policyengine_us_data/db/DATABASE_GUIDE.md +++ b/policyengine_us_data/db/DATABASE_GUIDE.md @@ -52,6 +52,24 @@ rm -f policyengine_us_data/storage/calibration/policy_data.db make database ``` +### Legacy National Target Parity + +The legacy national Enhanced CPS pipeline still builds labels through 
+`policyengine_us_data.utils.loss.build_loss_matrix()`. To audit whether those
+labels correspond to structured rows in `policy_data.db`, build a parity
+manifest:
+
+```bash
+python -m policyengine_us_data.utils.national_target_parity \
+    --dataset-path policyengine_us_data/storage/enhanced_cps_2024.h5 \
+    --target-db policyengine_us_data/storage/calibration/policy_data.db \
+    --period 2024 \
+    --output national_target_parity.json
+```
+
+Each national loss label is classified as `matched` (with a target ID),
+`ambiguous` (multiple candidate rows), `db_missing` (no structured row was
+found), or `legacy_only` (with an explicit reason).
+
 ## Database Schema
 
 ### Core Tables
diff --git a/policyengine_us_data/db/create_field_valid_values.py b/policyengine_us_data/db/create_field_valid_values.py
index 1d2b8e704..6795132bc 100644
--- a/policyengine_us_data/db/create_field_valid_values.py
+++ b/policyengine_us_data/db/create_field_valid_values.py
@@ -45,6 +45,7 @@ def populate_field_valid_values(session: Session) -> None:
     operation_values = [
         ("operation", "==", "Equals"),
         ("operation", "!=", "Not equals"),
+        ("operation", "in", "In pipe-delimited set"),
         ("operation", ">", "Greater than"),
         ("operation", ">=", "Greater than or equal"),
         ("operation", "<", "Less than"),
diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py
index 2c386420b..ef43951af 100644
--- a/policyengine_us_data/db/etl_irs_soi.py
+++ b/policyengine_us_data/db/etl_irs_soi.py
@@ -6,7 +6,7 @@
 
 from sqlmodel import Session, create_engine, select
 
-from policyengine_us_data.storage import STORAGE_FOLDER
+from policyengine_us_data.storage import CALIBRATION_FOLDER, STORAGE_FOLDER
 from policyengine_us_data.db.create_database_tables import (
     Stratum,
     StratumConstraint,
@@ -30,7 +30,11 @@
     cache_path,
     save_bytes,
 )
-from policyengine_us_data.utils.soi import get_tracked_soi_row
+from policyengine_us_data.utils.soi import (
+    get_tracked_soi_row,
+    load_tracked_soi_targets,
+    select_best_tracked_soi_rows,
+)
 from policyengine_us_data.storage.calibration_targets.pull_soi_targets import (
     STATE_ABBR_TO_FIPS,
 )
@@ -163,6 +167,20 @@ def _skip_coarse_state_agi_person_count_target(geo_type: str, agi_stub: int) ->
 }
 
 CTC_GEOGRAPHY_TARGET_VARIABLES = ("refundable_ctc", "non_refundable_ctc")
+EITC_AGI_CHILD_TARGET_SOURCE_YEAR = 2022
+SOI_TAXABLE_AGI_TARGET_VARIABLES = {
+    "adjusted_gross_income": "adjusted_gross_income",
+    "count": "tax_unit_count",
+}
+SOI_FILING_STATUS_CONSTRAINTS = {
+    "Single": ("==", "SINGLE"),
+    "Head of Household": ("==", "HEAD_OF_HOUSEHOLD"),
+    "Married Filing Separately": ("==", "SEPARATE"),
+    "Married Filing Jointly/Surviving Spouse": (
+        "in",
+        "JOINT|SURVIVING_SPOUSE",
+    ),
+}
 
 
 def create_records(df, breakdown_variable, target_variable):
@@ -450,6 +468,49 @@ def get_national_geography_soi_target(
     return _get_national_geography_soi_target_from_year(variable, geography_year)
 
 
+def _get_state_geography_soi_targets_from_year(
+    variable: str,
+    geography_year: int,
+) -> list[dict]:
+    spec = _get_geography_file_aggregate_target_spec(variable)
+    code = spec["code"]
+
+    raw_df = extract_soi_data(geography_year)
+    state_rows = raw_df[(raw_df["STATE"] != "US") & (raw_df["agi_stub"] == 0)]
+    if "CONG_DISTRICT" in state_rows.columns:
+        state_rows = state_rows[state_rows["CONG_DISTRICT"] == 0]
+    if state_rows.empty:
+        raise ValueError(
+            f"IRS geography SOI file for {geography_year} is missing state rows "
+            f"for {variable}"
+        )
+
+    targets = []
+    for row in state_rows.itertuples(index=False):
+        targets.append(
+            {
+                "variable": variable,
+                "source_year": geography_year,
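+                # Descriptive note (inferred from the scaling below): the SOI
+                # geography file reports N{code} as a return count and A{code}
+                # as an amount in thousands of dollars, hence * 1_000.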
"state_code": row.STATE, + "count": float(getattr(row, f"N{code}")), + "amount": float(getattr(row, f"A{code}")) * 1_000, + } + ) + + return sorted(targets, key=lambda target: target["state_code"]) + + +def get_state_geography_soi_targets( + variable: str, + dataset_year: int, + *, + lag: int = IRS_SOI_LAG_YEARS, +) -> list[dict]: + """Return state count and amount targets from the IRS geography file.""" + geography_year = get_geography_soi_year(dataset_year, lag=lag) + return _get_state_geography_soi_targets_from_year(variable, geography_year) + + def get_national_geography_soi_agi_targets( variable: str, dataset_year: int, @@ -588,6 +649,138 @@ def _get_or_create_national_agi_domain_stratum( return stratum +def _get_or_create_national_eitc_agi_child_stratum( + session: Session, + national_filer_stratum_id: int, + *, + count_children: int, + agi_lower_bound: float, + agi_upper_bound: float, +) -> Stratum: + if count_children < 3: + child_operation = "==" + child_value = str(count_children) + child_note = f"EITC child count = {count_children}" + else: + child_operation = ">" + child_value = "2" + child_note = "EITC child count > 2" + + note = ( + "National EITC filers, " + f"{child_note}, AGI >= {agi_lower_bound}, AGI < {agi_upper_bound}" + ) + stratum = session.exec( + select(Stratum).where( + Stratum.parent_stratum_id == national_filer_stratum_id, + Stratum.notes == note, + ) + ).first() + if stratum: + return stratum + + stratum = Stratum( + parent_stratum_id=national_filer_stratum_id, + notes=note, + ) + stratum.constraints_rel.extend( + [ + StratumConstraint( + constraint_variable="tax_unit_is_filer", + operation="==", + value="1", + ), + StratumConstraint( + constraint_variable="eitc", + operation=">", + value="0", + ), + StratumConstraint( + constraint_variable="eitc_child_count", + operation=child_operation, + value=child_value, + ), + StratumConstraint( + constraint_variable="adjusted_gross_income", + operation=">=", + value=str(agi_lower_bound), + ), + StratumConstraint( + constraint_variable="adjusted_gross_income", + operation="<", + value=str(agi_upper_bound), + ), + ] + ) + session.add(stratum) + session.flush() + return stratum + + +def _get_or_create_national_taxable_agi_filing_status_stratum( + session: Session, + national_filer_stratum_id: int, + *, + agi_lower_bound: float, + agi_upper_bound: float, + filing_status: str, +) -> Stratum: + note = f"National taxable filers, AGI >= {agi_lower_bound}, AGI < {agi_upper_bound}" + filing_constraint = SOI_FILING_STATUS_CONSTRAINTS.get(filing_status) + if filing_constraint is not None: + note += f", filing status = {filing_status}" + + stratum = session.exec( + select(Stratum).where( + Stratum.parent_stratum_id == national_filer_stratum_id, + Stratum.notes == note, + ) + ).first() + if stratum: + return stratum + + constraints = [ + StratumConstraint( + constraint_variable="tax_unit_is_filer", + operation="==", + value="1", + ), + StratumConstraint( + constraint_variable="income_tax_before_credits", + operation=">", + value="0", + ), + StratumConstraint( + constraint_variable="adjusted_gross_income", + operation=">=", + value=str(agi_lower_bound), + ), + StratumConstraint( + constraint_variable="adjusted_gross_income", + operation="<", + value=str(agi_upper_bound), + ), + ] + if filing_constraint is not None: + operation, value = filing_constraint + constraints.append( + StratumConstraint( + constraint_variable="filing_status", + operation=operation, + value=value, + ) + ) + + stratum = Stratum( + 
parent_stratum_id=national_filer_stratum_id, + notes=note, + ) + stratum.constraints_rel.extend(constraints) + session.add(stratum) + session.flush() + return stratum + + def load_national_geography_ctc_targets( session: Session, national_filer_stratum_id: int, geography_year: int ) -> None: @@ -664,6 +857,113 @@ def load_national_geography_ctc_agi_targets( ) +def load_national_eitc_agi_child_targets( + session: Session, + national_filer_stratum_id: int, + *, + source_year: int = EITC_AGI_CHILD_TARGET_SOURCE_YEAR, +) -> None: + """Create national EITC targets by AGI bucket and qualifying children. + + The source CSV is IRS SOI Publication 1304 Table 2.5. It is also used by + the PE-native Enhanced CPS loss matrix, so keeping it in ``policy_data.db`` + lets downstream diagnostics map legacy PE-native labels to structured + target rows. + """ + path = CALIBRATION_FOLDER / "eitc_by_agi_and_children.csv" + if not path.exists(): + return + + df = pd.read_csv(path, comment="#") + df["agi_lower"] = df["agi_lower"].astype(float) + df["agi_upper"] = df["agi_upper"].astype(float) + + for row in df.itertuples(index=False): + count_children = int(row.count_children) + agi_lower = float(row.agi_lower) + agi_upper = float(row.agi_upper) + returns = float(row.returns) + amount = float(row.amount) + + if returns == 0 and amount == 0: + continue + + stratum = _get_or_create_national_eitc_agi_child_stratum( + session, + national_filer_stratum_id, + count_children=count_children, + agi_lower_bound=agi_lower, + agi_upper_bound=agi_upper, + ) + notes = ( + "IRS SOI Publication 1304 Table 2.5 EITC target " + f"(source year {source_year}, count_children={count_children}, " + f"agi_lower={agi_lower}, agi_upper={agi_upper})" + ) + if returns != 0: + _upsert_target( + session, + stratum_id=stratum.stratum_id, + variable="tax_unit_count", + period=source_year, + value=returns, + source="IRS SOI", + notes=notes, + ) + if amount != 0: + _upsert_target( + session, + stratum_id=stratum.stratum_id, + variable="eitc", + period=source_year, + value=amount, + source="IRS SOI", + notes=notes, + ) + + +def load_national_taxable_agi_filing_status_targets( + session: Session, + national_filer_stratum_id: int, + target_year: int, +) -> None: + """Create taxable-filer AGI/count targets by AGI band and filing status. + + These rows mirror the broad IRS SOI labels used by the legacy national + loss matrix, including the combined married-joint/surviving-spouse cell. 
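+
+    The combined cell maps onto a single stratum whose ``filing_status``
+    constraint uses the ``in`` operation with value ``JOINT|SURVIVING_SPOUSE``
+    (see ``SOI_FILING_STATUS_CONSTRAINTS``), rather than two separate strata.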
+ """ + soi = select_best_tracked_soi_rows(load_tracked_soi_targets(), target_year) + rows = soi[ + soi["Variable"].isin(SOI_TAXABLE_AGI_TARGET_VARIABLES) + & (soi["Taxable only"]) + & (soi["AGI upper bound"] > 10_000) + ].copy() + + for _, row in rows.iterrows(): + variable = row["Variable"] + target_variable = SOI_TAXABLE_AGI_TARGET_VARIABLES[variable] + stratum = _get_or_create_national_taxable_agi_filing_status_stratum( + session, + national_filer_stratum_id, + agi_lower_bound=float(row["AGI lower bound"]), + agi_upper_bound=float(row["AGI upper bound"]), + filing_status=row["Filing status"], + ) + notes = ( + f"Publication 1304 {row['SOI table']} taxable AGI/filing-status " + f"target (source year {int(row['Year'])}, row {int(row['XLSX row'])})" + ) + _upsert_target( + session, + stratum_id=stratum.stratum_id, + variable=target_variable, + period=int(row["Year"]), + value=float(row["Value"]), + source="IRS SOI", + notes=notes, + ) + + def load_national_workbook_soi_targets( session: Session, national_filer_stratum_id: int, target_year: int ) -> None: @@ -1085,6 +1385,7 @@ def load_soi_data(long_dfs, year, national_year: Optional[int] = None): load_national_geography_ctc_targets(session, filer_strata["national"], year) load_national_geography_ctc_agi_targets(session, filer_strata["national"], year) + load_national_eitc_agi_child_targets(session, filer_strata["national"]) if national_year is not None: load_national_workbook_soi_targets( @@ -1092,6 +1393,11 @@ def load_soi_data(long_dfs, year, national_year: Optional[int] = None): filer_strata["national"], national_year, ) + load_national_taxable_agi_filing_status_targets( + session, + filer_strata["national"], + national_year, + ) load_national_fine_agi_targets(session, filer_strata["national"], national_year) load_state_fine_agi_targets(session, filer_strata, year) diff --git a/policyengine_us_data/db/etl_national_targets.py b/policyengine_us_data/db/etl_national_targets.py index a671daedb..a86f8fd4a 100644 --- a/policyengine_us_data/db/etl_national_targets.py +++ b/policyengine_us_data/db/etl_national_targets.py @@ -112,20 +112,6 @@ def extract_national_targets(year: int = DEFAULT_YEAR): tax_expenditure_targets = [{**target} for target in raw_tax_expenditure_targets] direct_sum_targets = [ - { - "variable": "alimony_income", - "value": 13e9, - "source": "Survey-reported (post-TCJA grandfathered)", - "notes": "Alimony received - survey reported, not tax-filer restricted", - "year": 2024, - }, - { - "variable": "alimony_expense", - "value": 13e9, - "source": "Survey-reported (post-TCJA grandfathered)", - "notes": "Alimony paid - survey reported, not tax-filer restricted", - "year": 2024, - }, { "variable": "medicaid", "value": 871.7e9, @@ -140,20 +126,6 @@ def extract_national_targets(year: int = DEFAULT_YEAR): "notes": "Total household net worth", "year": 2024, }, - { - "variable": "health_insurance_premiums_without_medicare_part_b", - "value": 385e9, - "source": "MEPS/NHEA", - "notes": "Health insurance premiums excluding Medicare Part B", - "year": 2024, - }, - { - "variable": "other_medical_expenses", - "value": 278e9, - "source": "MEPS/NHEA", - "notes": "Out-of-pocket medical expenses", - "year": 2024, - }, { "variable": "medicare_part_b_premiums", "value": get_beneficiary_paid_medicare_part_b_premiums_target(2024), @@ -162,52 +134,24 @@ def extract_national_targets(year: int = DEFAULT_YEAR): "year": 2024, }, { - "variable": "over_the_counter_health_expenses", - "value": 72e9, - "source": "Consumer Expenditure Survey", - "notes": 
"OTC health products and supplies", - "year": 2024, - }, - { - "variable": "child_support_expense", - "value": 33e9, - "source": "Census Bureau", - "notes": "Child support payments", - "year": 2024, - }, - { - "variable": "child_support_received", - "value": 33e9, - "source": "Census Bureau", - "notes": "Child support received", - "year": 2024, - }, - { - "variable": "spm_unit_capped_work_childcare_expenses", - "value": 348e9, - "source": "Census Bureau SPM", - "notes": "Work and childcare expenses for SPM", - "year": 2024, - }, - { - "variable": "spm_unit_capped_housing_subsidy", - "value": 35e9, - "source": "HUD/Census", - "notes": "Housing subsidies", + "variable": "rent", + "value": 764_925_694_800, + "source": "Census ACS 2024 1-year table B25060", + "notes": "Sum of state aggregate contract rent, annualized from monthly ACS aggregate contract rent", "year": 2024, }, { "variable": "real_estate_taxes", - "value": 500e9, - "source": "Census Bureau", - "notes": "Property taxes paid", + "value": 370_014_207_400, + "source": "Census ACS 2024 1-year table B25090", + "notes": "Sum of state aggregate real estate taxes paid by owner-occupied housing units", "year": 2024, }, { - "variable": "rent", - "value": 735e9, - "source": "Census Bureau/BLS", - "notes": "Rental payments", + "variable": "childcare_expenses", + "value": 63_092e6, + "source": "BLS Consumer Expenditure Surveys CE LABSTAT", + "notes": "Series CXU670320LB0101M aggregate expenditure: babysitting, childcare, daycare, preschool", "year": 2024, }, { diff --git a/policyengine_us_data/storage/calibration_targets/acs_housing_costs_2024.csv b/policyengine_us_data/storage/calibration_targets/acs_housing_costs_2024.csv new file mode 100644 index 000000000..4954e0f14 --- /dev/null +++ b/policyengine_us_data/storage/calibration_targets/acs_housing_costs_2024.csv @@ -0,0 +1,52 @@ +state_code,state_fips,annual_contract_rent,real_estate_taxes +AK,02,1350681600,664772900 +AL,01,5761773600,1537253700 +AR,05,3760575600,1167041400 +AZ,04,16849603200,4320807000 +CA,06,143291068800,52872735400 +CO,08,17072544000,5750527500 +CT,09,8116260000,7275184600 +DC,11,4602276000,778233300 +DE,10,1652836800,656213100 +FL,12,57303682800,24312484700 +GA,13,21304225200,8707748600 +HI,15,4073208000,981165300 +IA,19,4069554000,3234507400 +ID,16,3091480800,1222009800 +IL,17,24729199200,21262263300 +IN,18,9115561200,4242347000 +KS,20,4246785600,2863525400 +KY,21,5821017600,2434868700 +LA,22,5928199200,1822794700 +MA,25,21342618000,12097297000 +MD,24,14212159200,7520628800 +ME,23,2153030400,1668939000 +MI,26,13242972000,10402220500 +MN,27,9724164000,6501643100 +MO,29,8718777600,4428280300 +MS,28,3018102000,1026895200 +MT,30,1873186800,1018759800 +NC,37,20318032800,7550042500 +ND,38,1474936800,608757100 +NE,31,3199722000,2283083400 +NH,33,2585438400,2900421200 +NJ,34,25845276000,22119447000 +NM,35,2917616400,1218092800 +NV,32,8914724400,2031449700 +NY,36,71916831600,32203085100 +OH,39,17617650000,12129649100 +OK,40,5521292400,2206132700 +OR,41,10933761600,4917685900 +PA,42,22028415600,14303332700 +RI,44,2401389600,1519517700 +SC,45,7908846000,2768317200 +SD,46,1274104800,825527300 +TN,47,12780411600,3724735100 +TX,48,67268908800,34936256600 +UT,49,6183264000,2346772700 +VA,51,20114900400,8760836100 +VT,50,1119537600,1171089500 +WA,53,23878054800,10671295800 +WI,55,10165308000,6958356700 +WV,54,1337834400,584045200 +WY,56,793893600,505130800 diff --git a/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py 
b/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py index 16e92ea01..2c310b89e 100644 --- a/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py +++ b/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py @@ -3,26 +3,13 @@ from policyengine_us_data.storage import CALIBRATION_FOLDER """ -Hardcoded targets for the year 2024 from CPS-derived statistics and other sources. Include medical expenses, sum of SPM thresholds, and child support expenses. +Hardcoded targets for the year 2024 from administrative and +authoritative aggregate sources. """ HARD_CODED_TOTALS = { - "health_insurance_premiums_without_medicare_part_b": 385e9, - "other_medical_expenses": 278e9, "medicare_part_b_premiums": 112e9, - "over_the_counter_health_expenses": 72e9, - "spm_unit_spm_threshold": 3_945e9, - "child_support_expense": 33e9, - "child_support_received": 33e9, - "spm_unit_capped_work_childcare_expenses": 348e9, - "spm_unit_capped_housing_subsidy": 35e9, "tanf": 7_788_317_474.55, - # Alimony could be targeted via SOI - "alimony_income": 13e9, - "alimony_expense": 13e9, - # Rough estimate, not CPS derived - "real_estate_taxes": 500e9, # Rough estimate between 350bn and 600bn total property tax collections - "rent": 735e9, # ACS total uprated by CPI # Table 5A from https://www.irs.gov/statistics/soi-tax-stats-individual-information-return-form-w2-statistics # shows $38,316,190,000 in Box 7: Social security tips (2018) # Wages and salaries grew 32% from 2018 to 2023: https://fred.stlouisfed.org/graph/?g=1J0CC diff --git a/policyengine_us_data/storage/calibration_targets/refresh_acs_housing_cost_targets.py b/policyengine_us_data/storage/calibration_targets/refresh_acs_housing_cost_targets.py new file mode 100644 index 000000000..7e85422a7 --- /dev/null +++ b/policyengine_us_data/storage/calibration_targets/refresh_acs_housing_cost_targets.py @@ -0,0 +1,75 @@ +import csv +import json +from urllib.request import urlopen + +from policyengine_us_data.storage import CALIBRATION_FOLDER +from policyengine_us_data.storage.calibration_targets.pull_soi_targets import ( + STATE_ABBR_TO_FIPS, +) + + +YEAR = 2024 +ACS_DATASET = "acs/acs1" +STATE_FIPS_TO_ABBR = { + fips: state_code for state_code, fips in STATE_ABBR_TO_FIPS.items() +} + + +def fetch_acs_housing_cost_targets(year: int = YEAR) -> list[dict]: + """Fetch ACS state rent and property-tax aggregates. + + B25060 is aggregate monthly contract rent for renter-occupied units + paying cash rent. We annualize it to match the yearly `rent` variable. + B25090 is aggregate real estate taxes paid by owner-occupied units. 
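+
+    Every cell in the Census API response arrives as a string, so both
+    aggregates are cast to float; rows whose FIPS code has no entry in
+    ``STATE_ABBR_TO_FIPS`` are skipped.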
+ """ + variables = "NAME,B25060_001E,B25090_001E" + url = ( + f"https://api.census.gov/data/{year}/{ACS_DATASET}?get={variables}&for=state:*" + ) + with urlopen(url) as response: + rows = json.load(response) + + header = rows[0] + column_index = {column: index for index, column in enumerate(header)} + + targets = [] + for row in rows[1:]: + state_fips = row[column_index["state"]] + state_code = STATE_FIPS_TO_ABBR.get(state_fips) + if state_code is None: + continue + + monthly_contract_rent = float(row[column_index["B25060_001E"]]) + real_estate_taxes = float(row[column_index["B25090_001E"]]) + targets.append( + { + "state_code": state_code, + "state_fips": state_fips, + "annual_contract_rent": int(monthly_contract_rent * 12), + "real_estate_taxes": int(real_estate_taxes), + } + ) + + return sorted(targets, key=lambda target: target["state_code"]) + + +def main() -> None: + targets = fetch_acs_housing_cost_targets() + output_path = CALIBRATION_FOLDER / f"acs_housing_costs_{YEAR}.csv" + with output_path.open("w", newline="") as output: + writer = csv.DictWriter( + output, + fieldnames=[ + "state_code", + "state_fips", + "annual_contract_rent", + "real_estate_taxes", + ], + lineterminator="\n", + ) + writer.writeheader() + writer.writerows(targets) + + +if __name__ == "__main__": + main() diff --git a/policyengine_us_data/utils/__init__.py b/policyengine_us_data/utils/__init__.py index 70db28d75..b2a964d6b 100644 --- a/policyengine_us_data/utils/__init__.py +++ b/policyengine_us_data/utils/__init__.py @@ -9,8 +9,10 @@ ) __all__ = [ + "ABSOLUTE_ERROR_SCALE_TARGETS", "HardConcrete", "build_loss_matrix", + "get_target_error_normalisation", "print_reweighting_diagnostics", "set_seeds", ] diff --git a/policyengine_us_data/utils/constraint_validation.py b/policyengine_us_data/utils/constraint_validation.py index f533739cb..275954b07 100644 --- a/policyengine_us_data/utils/constraint_validation.py +++ b/policyengine_us_data/utils/constraint_validation.py @@ -7,7 +7,7 @@ Validation Rules: 1. 
Operation Compatibility (per constraint_variable): - - `==` and `!=` must be alone (cannot combine with other operations) + - `==`, `!=`, and `in` must be alone (cannot combine with other operations) - `>` and `>=` cannot coexist (conflicting lower bounds) - `<` and `<=` cannot coexist (conflicting upper bounds) - `>` or `>=` can combine with `<` or `<=` to form valid ranges @@ -37,7 +37,7 @@ class ConstraintValidationError(Exception): # Operation compatibility groups -EQUALITY_OPS = {"==", "!="} +EQUALITY_OPS = {"==", "!=", "in"} LOWER_BOUND_OPS = {">", ">="} UPPER_BOUND_OPS = {"<", "<="} RANGE_OPS = LOWER_BOUND_OPS | UPPER_BOUND_OPS @@ -88,13 +88,12 @@ def _validate_variable_constraints( def _check_operation_compatibility(var_name: str, operations: set) -> None: """Check that operations on a variable are compatible.""" has_equality = bool(operations & EQUALITY_OPS) - has_range = bool(operations & RANGE_OPS) - # Equality ops must be alone + # Equality/set-membership ops must be alone if has_equality: if len(operations) > 1: raise ConstraintValidationError( - f"{var_name}: '==' or '!=' cannot combine with other " + f"{var_name}: '==', '!=', or 'in' cannot combine with other " f"operations, found: {operations}" ) diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index ce71696cc..41a64ac2d 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -15,7 +15,10 @@ from policyengine_us_data.utils.cms_medicare import ( get_beneficiary_paid_medicare_part_b_premiums_target, ) -from policyengine_us_data.db.etl_irs_soi import get_national_geography_soi_target +from policyengine_us_data.db.etl_irs_soi import ( + get_national_geography_soi_target, + get_state_geography_soi_targets, +) from policyengine_core.reforms import Reform from policyengine_us_data.utils.soi import pe_to_soi, get_soi @@ -27,24 +30,10 @@ # database so this dict can be deleted. See PR #488. HARD_CODED_TOTALS = { - "health_insurance_premiums_without_medicare_part_b": 385e9, - "other_medical_expenses": 278e9, "medicare_part_b_premiums": get_beneficiary_paid_medicare_part_b_premiums_target( 2024 ), - "over_the_counter_health_expenses": 72e9, - "spm_unit_spm_threshold": 3_945e9, - "child_support_expense": 33e9, - "child_support_received": 33e9, - "spm_unit_capped_work_childcare_expenses": 348e9, - "spm_unit_capped_housing_subsidy": 35e9, "tanf": 7_788_317_474.55, - # Alimony could be targeted via SOI - "alimony_income": 13e9, - "alimony_expense": 13e9, - # Rough estimate, not CPS derived - "real_estate_taxes": 500e9, # Rough estimate between 350bn and 600bn total property tax collections - "rent": 735e9, # ACS total uprated by CPI # Table 5A from https://www.irs.gov/statistics/soi-tax-stats-individual-information-return-form-w2-statistics # shows $38,316,190,000 in Box 7: Social security tips (2018) # Wages and salaries grew 32% from 2018 to 2023: https://fred.stlouisfed.org/graph/?g=1J0CC @@ -110,6 +99,35 @@ ], } +AGE_BUCKETED_HEALTH_TARGETS = ("medicare_part_b_premiums",) + +BLS_CE_TOTALS = { + # BLS Consumer Expenditure Surveys, CE LABSTAT series + # CXU670320LB0101M, aggregate expenditure (AG) in 2024. + # Item: "Babysitting, childcare, daycare, preschool"; + # AG is reported in millions of dollars. 
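+    # 63_092 million dollars = $63.092bn, written below as 63_092e6.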
+ "childcare_expenses": 63_092e6, +} + +TRANSFER_BALANCE_TARGETS = { + "nation/accounting/alimony_paid_minus_received": ( + "alimony_expense", + "alimony_income", + ), + "nation/accounting/child_support_paid_minus_received": ( + "child_support_expense", + "child_support_received", + ), +} + +ABSOLUTE_ERROR_SCALE_TARGETS = { + # These are accounting identities, not gross flow targets. Use a + # target-specific scale so zero-dollar targets do not get dropped + # by sparse ECPS or dominate the dense reweighting objective. + target: 1e9 + for target in TRANSFER_BALANCE_TARGETS +} + ACA_SPENDING_TARGETS = { 2024: 98e9, } @@ -511,6 +529,166 @@ def _add_ctc_targets(loss_matrix, targets_list, sim, time_period): return targets_list, loss_matrix +def _add_real_estate_tax_targets(loss_matrix, targets_list, sim, time_period): + """Add IRS SOI real-estate-tax amount and count targets. + + These targets correspond to itemizing filers with positive Schedule A + real-estate-tax amounts from the IRS geography file, not total + owner-occupied property-tax payments. + """ + target = get_national_geography_soi_target("real_estate_taxes", time_period) + + real_estate_taxes_person = sim.calculate( + "real_estate_taxes", + period=time_period, + ).values.astype(np.float32) + real_estate_taxes_tax_unit = sim.map_result( + real_estate_taxes_person, + "person", + "tax_unit", + ).astype(np.float32) + is_filer = sim.calculate("tax_unit_is_filer", period=time_period).values > 0 + itemizes = sim.calculate("tax_unit_itemizes", period=time_period).values > 0 + domain_mask = is_filer & itemizes & (real_estate_taxes_tax_unit > 0) + + household_amount = sim.map_result( + real_estate_taxes_tax_unit * domain_mask.astype(np.float32), + "tax_unit", + "household", + ).astype(np.float32) + household_count = sim.map_result( + domain_mask.astype(np.float32), + "tax_unit", + "household", + ).astype(np.float32) + + label = "nation/irs/real_estate_taxes" + loss_matrix[label] = household_amount + if any(pd.isna(loss_matrix[label])): + raise ValueError(f"Missing values for {label}") + targets_list.append(target["amount"]) + + label = "nation/irs/real_estate_taxes_count" + loss_matrix[label] = household_count + if any(pd.isna(loss_matrix[label])): + raise ValueError(f"Missing values for {label}") + targets_list.append(target["count"]) + + state_code = sim.calculate( + "state_code", + map_to="household", + period=time_period, + ).values + for state_target in get_state_geography_soi_targets( + "real_estate_taxes", + time_period, + ): + in_state = (state_code == state_target["state_code"]).astype(np.float32) + + label = f"state/irs/real_estate_taxes/{state_target['state_code']}" + loss_matrix[label] = household_amount * in_state + if any(pd.isna(loss_matrix[label])): + raise ValueError(f"Missing values for {label}") + targets_list.append(state_target["amount"]) + + label = f"state/irs/real_estate_taxes_count/{state_target['state_code']}" + loss_matrix[label] = household_count * in_state + if any(pd.isna(loss_matrix[label])): + raise ValueError(f"Missing values for {label}") + targets_list.append(state_target["count"]) + + return targets_list, loss_matrix + + +def _add_acs_housing_cost_targets(loss_matrix, targets_list, sim, time_period): + """Add ACS component targets for rent and all-owner property taxes.""" + targets, _ = _load_yeared_target_csv("acs_housing_costs", time_period) + state_code = sim.calculate( + "state_code", + map_to="household", + period=time_period, + ).values + + target_columns = { + "rent": "annual_contract_rent", + 
"real_estate_taxes": "real_estate_taxes", + } + for variable, target_column in target_columns.items(): + values = sim.calculate( + variable, + map_to="household", + period=time_period, + ).values + + label = f"nation/census/acs/{variable}" + loss_matrix[label] = values + if any(pd.isna(loss_matrix[label])): + raise ValueError(f"Missing values for {label}") + targets_list.append(float(targets[target_column].sum())) + + for row in targets.itertuples(index=False): + in_state = (state_code == row.state_code).astype(np.float32) + label = f"state/census/acs/{variable}/{row.state_code}" + loss_matrix[label] = values * in_state + if any(pd.isna(loss_matrix[label])): + raise ValueError(f"Missing values for {label}") + targets_list.append(float(getattr(row, target_column))) + + return targets_list, loss_matrix + + +def _add_bls_ce_targets(loss_matrix, targets_list, sim, time_period): + """Add BLS Consumer Expenditure component-spending targets.""" + for variable, target in BLS_CE_TOTALS.items(): + label = f"nation/bls/ce/{variable}" + loss_matrix[label] = sim.calculate( + variable, + map_to="household", + period=time_period, + ).values + if any(pd.isna(loss_matrix[label])): + raise ValueError(f"Missing values for {label}") + targets_list.append(target) + + return targets_list, loss_matrix + + +def _add_transfer_balance_targets(loss_matrix, targets_list, sim, time_period): + """Add paid-minus-received accounting targets for private transfers.""" + for label, (paid_variable, received_variable) in TRANSFER_BALANCE_TARGETS.items(): + paid = sim.calculate( + paid_variable, + map_to="household", + period=time_period, + ).values + received = sim.calculate( + received_variable, + map_to="household", + period=time_period, + ).values + loss_matrix[label] = paid - received + if any(pd.isna(loss_matrix[label])): + raise ValueError(f"Missing values for {label}") + targets_list.append(0.0) + + return targets_list, loss_matrix + + +def get_target_error_normalisation(target_names, targets_array): + """Return numerator shifts and denominators for target loss scaling.""" + target_names = np.asarray(target_names) + targets_array = np.asarray(targets_array, dtype=np.float64) + numerator_shift = np.ones_like(targets_array, dtype=np.float64) + denominator = targets_array + 1 + + for label, scale in ABSOLUTE_ERROR_SCALE_TARGETS.items(): + mask = target_names == label + numerator_shift[mask] = 0.0 + denominator[mask] = scale + + return numerator_shift, denominator + + def build_loss_matrix(dataset: type, time_period): loss_matrix = pd.DataFrame() df = pe_to_soi(dataset, time_period) @@ -778,6 +956,13 @@ def build_loss_matrix(dataset: type, time_period): time_period, ) + targets_array, loss_matrix = _add_real_estate_tax_targets( + loss_matrix, + targets_array, + sim, + time_period, + ) + # Tax filer counts by AGI band (SOI Table 1.1). 
Calibrates total # filers (not just taxable returns), with granular bands sourced # from the latest SOI year <= calibration year to avoid hardcoding @@ -820,6 +1005,27 @@ def build_loss_matrix(dataset: type, time_period): raise ValueError(f"Missing values for {label}") targets_array.append(target) + targets_array, loss_matrix = _add_acs_housing_cost_targets( + loss_matrix, + targets_array, + sim, + time_period, + ) + + targets_array, loss_matrix = _add_bls_ce_targets( + loss_matrix, + targets_array, + sim, + time_period, + ) + + targets_array, loss_matrix = _add_transfer_balance_targets( + loss_matrix, + targets_array, + sim, + time_period, + ) + # Negative household market income total rough estimate from the IRS SOI PUF market_income = sim.calculate("household_market_income").values @@ -838,6 +1044,8 @@ def build_loss_matrix(dataset: type, time_period): # The top row is treated as unbounded (age >= lower_bound) so the # 90+ population is constrained by an age-specific target rather than # only by the national total. See issue #768. + # Keep only Medicare Part B: the other household medical-expense + # aggregates are survey-based and should not drive national calibration. healthcare = pd.read_csv(CALIBRATION_FOLDER / "healthcare_spending.csv") top_age_lower_bound = int(healthcare["age_10_year_lower_bound"].max()) @@ -851,12 +1059,7 @@ def build_loss_matrix(dataset: type, time_period): else: in_age_range = (age >= age_lower_bound) * (age < age_lower_bound + 10) label_suffix = f"age_{age_lower_bound}_to_{age_lower_bound + 9}" - for expense_type in [ - "health_insurance_premiums_without_medicare_part_b", - "over_the_counter_health_expenses", - "other_medical_expenses", - "medicare_part_b_premiums", - ]: + for expense_type in AGE_BUCKETED_HEALTH_TARGETS: label = f"nation/census/{expense_type}/{label_suffix}" value = sim.calculate(expense_type).values loss_matrix[label] = sim.map_result( @@ -864,26 +1067,6 @@ def build_loss_matrix(dataset: type, time_period): ) targets_array.append(row[expense_type]) - # AGI by SPM threshold totals - - spm_threshold_agi = pd.read_csv(CALIBRATION_FOLDER / "spm_threshold_agi.csv") - - for _, row in spm_threshold_agi.iterrows(): - spm_unit_agi = sim.calculate("adjusted_gross_income", map_to="spm_unit").values - spm_threshold = sim.calculate("spm_unit_spm_threshold").values - in_threshold_range = (spm_threshold >= row["lower_spm_threshold"]) * ( - spm_threshold < row["upper_spm_threshold"] - ) - label = f"nation/census/agi_in_spm_threshold_decile_{int(row['decile'])}" - loss_matrix[label] = sim.map_result( - in_threshold_range * spm_unit_agi, "spm_unit", "household" - ) - targets_array.append(row["adjusted_gross_income"]) - - label = f"nation/census/count_in_spm_threshold_decile_{int(row['decile'])}" - loss_matrix[label] = sim.map_result(in_threshold_range, "spm_unit", "household") - targets_array.append(row["count"]) - # Population by state and population under 5 by state state_population = pd.read_csv(CALIBRATION_FOLDER / "population_by_state.csv") @@ -1080,10 +1263,6 @@ def build_loss_matrix(dataset: type, time_period): targets_array.extend(agi_state_targets) loss_matrix = _add_agi_metric_columns(loss_matrix, sim) - targets_array, loss_matrix = _add_state_real_estate_taxes( - loss_matrix, targets_array, sim - ) - snap_state_target_names, snap_state_targets = _add_snap_state_targets(sim) targets_array.extend(snap_state_targets) loss_matrix = _add_snap_metric_columns(loss_matrix, sim) @@ -1219,41 +1398,6 @@ def _add_agi_metric_columns( return loss_matrix -def 
_add_state_real_estate_taxes(loss_matrix, targets_list, sim): - """ - Add state real estate taxes to the loss matrix and targets list. - """ - # Read the real estate taxes data - real_estate_taxes_targets = pd.read_csv( - CALIBRATION_FOLDER / "real_estate_taxes_by_state_acs.csv" - ) - national_total = HARD_CODED_TOTALS["real_estate_taxes"] - state_sum = real_estate_taxes_targets["real_estate_taxes_bn"].sum() * 1e9 - national_to_state_diff = national_total / state_sum - real_estate_taxes_targets["real_estate_taxes_bn"] *= national_to_state_diff - real_estate_taxes_targets["real_estate_taxes_bn"] = ( - real_estate_taxes_targets["real_estate_taxes_bn"] * 1e9 - ) - - assert np.isclose( - real_estate_taxes_targets["real_estate_taxes_bn"].sum(), - national_total, - rtol=1e-8, - ), "Real estate tax totals do not sum to national target" - - targets_list.extend(real_estate_taxes_targets["real_estate_taxes_bn"].tolist()) - - real_estate_taxes = sim.calculate("real_estate_taxes", map_to="household").values - state = sim.calculate("state_code", map_to="household").values - - for _, r in real_estate_taxes_targets.iterrows(): - in_state = (state == r["state_code"]).astype(float) - label = f"state/real_estate_taxes/{r['state_code']}" - loss_matrix[label] = real_estate_taxes * in_state - - return targets_list, loss_matrix - - def _add_snap_state_targets(sim): """ Add snap targets at the state level, adjusted in aggregate to the sim @@ -1317,7 +1461,9 @@ def _add_snap_metric_columns( return loss_matrix -def print_reweighting_diagnostics(optimised_weights, loss_matrix, targets_array, label): +def print_reweighting_diagnostics( + optimised_weights, loss_matrix, targets_array, label, target_names=None +): # Convert all inputs to NumPy arrays right at the start optimised_weights_np = ( optimised_weights.numpy() @@ -1334,6 +1480,10 @@ def print_reweighting_diagnostics(optimised_weights, loss_matrix, targets_array, if hasattr(targets_array, "numpy") else np.asarray(targets_array) ) + if target_names is None and hasattr(loss_matrix, "columns"): + target_names = np.asarray(loss_matrix.columns) + elif target_names is not None: + target_names = np.asarray(target_names) logging.info(f"\n\n---{label}: reweighting quick diagnostics----\n") logging.info( @@ -1344,10 +1494,20 @@ def print_reweighting_diagnostics(optimised_weights, loss_matrix, targets_array, # All subsequent calculations use the guaranteed NumPy versions estimate = optimised_weights_np @ loss_matrix_np - rel_error = (((estimate - targets_array_np) + 1) / (targets_array_np + 1)) ** 2 - within_10_percent_mask = np.abs(estimate - targets_array_np) <= ( - 0.10 * np.abs(targets_array_np) - ) + if target_names is None: + numerator_shift = np.ones_like(targets_array_np, dtype=np.float64) + denominator = targets_array_np + 1 + else: + numerator_shift, denominator = get_target_error_normalisation( + target_names, targets_array_np + ) + rel_error = ((estimate - targets_array_np + numerator_shift) / denominator) ** 2 + tolerance = 0.10 * np.abs(targets_array_np) + if target_names is not None: + for target_name, scale in ABSOLUTE_ERROR_SCALE_TARGETS.items(): + mask = target_names == target_name + tolerance[mask] = 0.10 * scale + within_10_percent_mask = np.abs(estimate - targets_array_np) <= tolerance percent_within_10 = np.mean(within_10_percent_mask) * 100 logging.info( f"rel_error: min: {np.min(rel_error):.2f}\n" diff --git a/policyengine_us_data/utils/national_target_parity.py b/policyengine_us_data/utils/national_target_parity.py new file mode 100644 index 
000000000..063cb8ba1
--- /dev/null
+++ b/policyengine_us_data/utils/national_target_parity.py
@@ -0,0 +1,829 @@
+"""Map legacy national ``build_loss_matrix`` labels to target DB rows."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import re
+import sqlite3
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Iterable, Mapping, Sequence
+
+from policyengine_us_data.storage import STORAGE_FOLDER
+
+SCHEMA_VERSION = 1
+
+EITC_AGI_CHILD_DOMAIN = "adjusted_gross_income,eitc,eitc_child_count"
+
+_EITC_AGI_CHILD_LABEL = re.compile(
+    r"^nation/irs/eitc/(?P<metric>returns|amount)/"
+    r"c(?P<count_children>\d+)_(?P<agi_lower>[^_]+)_(?P<agi_upper>[^/]+)$"
+)
+_CTC_LABEL = re.compile(
+    r"^nation/irs/(?P<variable>refundable_ctc|non_refundable_ctc)(?P<count>_count)?$"
+)
+_REAL_ESTATE_TAX_LABEL = re.compile(
+    r"^nation/irs/real_estate_taxes(?P<count>_count)?$"
+)
+_SOI_TAXABLE_DETAIL_LABEL = re.compile(
+    r"^nation/irs/.+/(?:count|total)/AGI in .+/taxable/.+$"
+)
+_AGI_RANGE_LABEL = re.compile(
+    r"^(?P<lower>-inf|inf|[0-9.]+(?:bn|m|k)?)-"
+    r"(?P<upper>-inf|inf|[0-9.]+(?:bn|m|k)?)$"
+)
+_EITC_STATE_LABEL = re.compile(r"^nation/irs/eitc/(?:returns|amount)/state_[0-9]+$")
+_NATIONAL_AGE_LABEL = re.compile(r"^nation/census/population_by_age/[0-9]+$")
+_MEDICARE_PART_B_AGE_LABEL = re.compile(
+    r"^nation/census/medicare_part_b_premiums/age_.+$"
+)
+_SPM_THRESHOLD_LABEL = re.compile(
+    r"^nation/census/(?:agi|count)_in_spm_threshold_decile_[0-9]+$"
+)
+_SOI_FILER_AGI_LABEL = re.compile(r"^nation/soi/filer_count/agi_.+$")
+_DEPRECATED_SPM_SURVEY_LABEL = re.compile(
+    r"^nation/census/(?:spm_unit_|(?:agi|count)_in_spm_threshold_decile_).+$"
+)
+
+_DIRECT_NATIONAL_CENSUS_TARGET_VARIABLES = {
+    "medicaid",
+    "medicare_part_b_premiums",
+    "rent",
+    "real_estate_taxes",
+    "tip_income",
+    "social_security_retirement",
+    "social_security_disability",
+    "social_security_survivors",
+    "social_security_dependents",
+    "traditional_ira_contributions",
+    "traditional_401k_contributions",
+    "roth_401k_contributions",
+    "self_employed_pension_contribution_ald",
+    "roth_ira_contributions",
+}
+
+_SOI_TAXABLE_DETAIL_TARGET_VARIABLES = {
+    ("adjusted gross income", "total"): "adjusted_gross_income",
+    ("count", "count"): "tax_unit_count",
+}
+_SOI_FILING_STATUS_CONSTRAINTS = {
+    "Single": ("==", "SINGLE"),
+    "Head of Household": ("==", "HEAD_OF_HOUSEHOLD"),
+    "Married Filing Separately": ("==", "SEPARATE"),
+    "Married Filing Jointly/Surviving Spouse": (
+        "in",
+        "JOINT|SURVIVING_SPOUSE",
+    ),
+}
+
+_DB_CONSTRAINTS_QUERY = """
+    SELECT
+        v.target_id,
+        v.stratum_id,
+        v.variable,
+        v.reform_id,
+        v.value,
+        v.period,
+        v.active,
+        v.geo_level,
+        v.geographic_id,
+        v.domain_variable,
+        t.source,
+        t.notes,
+        sc.constraint_variable,
+        sc.operation,
+        sc.value AS constraint_value
+    FROM target_overview v
+    JOIN targets t
+        ON t.target_id = v.target_id
+    LEFT JOIN stratum_constraints sc
+        ON sc.stratum_id = v.stratum_id
+    WHERE
+        v.active = 1
+        AND v.geo_level = 'national'
+    ORDER BY v.target_id, sc.constraint_variable, sc.operation, sc.value
+"""
+
+
+@dataclass(frozen=True)
+class Constraint:
+    variable: str
+    operation: str
+    value: str
+
+    def normalized(self) -> tuple[str, str, str]:
+        return (self.variable, self.operation, _normalize_constraint_value(self.value))
+
+
+@dataclass(frozen=True)
+class TargetRecord:
+    target_id: int
+    stratum_id: int
+    variable: str
+    reform_id: int
+    value: float | None
+    period: int
+    source: str | None
+    notes: str | None
+    geo_level: str
+    geographic_id: str
+    domain_variable: str | None
+    constraints:
tuple[Constraint, ...] + + @property + def constraints_set(self) -> frozenset[tuple[str, str, str]]: + return frozenset(constraint.normalized() for constraint in self.constraints) + + def to_manifest_match(self) -> dict[str, Any]: + return { + "target_id": self.target_id, + "stratum_id": self.stratum_id, + "variable": self.variable, + "reform_id": self.reform_id, + "period": self.period, + "value": self.value, + "source": self.source, + "domain_variable": self.domain_variable, + "constraints": [ + { + "variable": constraint.variable, + "operation": constraint.operation, + "value": constraint.value, + } + for constraint in self.constraints + ], + } + + +def _normalize_constraint_value(value: str) -> str: + try: + number = float(value) + except (TypeError, ValueError): + return str(value) + if number.is_integer(): + return str(int(number)) + return repr(number) + + +def _parse_numeric_token(token: str) -> float: + if token == "-inf": + return float("-inf") + if token == "inf": + return float("inf") + multipliers = { + "bn": 1_000_000_000.0, + "m": 1_000_000.0, + "k": 1_000.0, + } + for suffix, multiplier in multipliers.items(): + if token.endswith(suffix): + return float(token[: -len(suffix)]) * multiplier + return float(token) + + +def _constraint(variable: str, operation: str, value: Any) -> Constraint: + return Constraint(variable=variable, operation=operation, value=str(value)) + + +def load_national_target_records(db_path: str | Path) -> list[TargetRecord]: + """Load active national target DB rows with their constraints.""" + + path = Path(db_path).expanduser() + if not path.exists(): + raise FileNotFoundError(path) + + grouped: dict[int, dict[str, Any]] = {} + with sqlite3.connect(path) as conn: + conn.row_factory = sqlite3.Row + for row in conn.execute(_DB_CONSTRAINTS_QUERY): + target_id = int(row["target_id"]) + target = grouped.setdefault( + target_id, + { + "target_id": target_id, + "stratum_id": int(row["stratum_id"]), + "variable": row["variable"], + "reform_id": int(row["reform_id"]), + "value": row["value"], + "period": int(row["period"]), + "source": row["source"], + "notes": row["notes"], + "geo_level": row["geo_level"], + "geographic_id": row["geographic_id"], + "domain_variable": row["domain_variable"], + "constraints": [], + }, + ) + if row["constraint_variable"] is not None: + target["constraints"].append( + Constraint( + variable=row["constraint_variable"], + operation=row["operation"], + value=row["constraint_value"], + ) + ) + + return [ + TargetRecord( + target_id=row["target_id"], + stratum_id=row["stratum_id"], + variable=row["variable"], + reform_id=row["reform_id"], + value=row["value"], + period=row["period"], + source=row["source"], + notes=row["notes"], + geo_level=row["geo_level"], + geographic_id=row["geographic_id"], + domain_variable=row["domain_variable"], + constraints=tuple(row["constraints"]), + ) + for row in grouped.values() + ] + + +class NationalTargetIndex: + def __init__(self, records: Sequence[TargetRecord]): + self.records = list(records) + + def match( + self, + *, + variable: str, + period: int, + domain_variable: str | None = None, + reform_id: int = 0, + constraints: Sequence[Constraint] = (), + ) -> list[TargetRecord]: + required_constraints = {constraint.normalized() for constraint in constraints} + candidates = [ + record + for record in self.records + if record.variable == variable + and record.reform_id == reform_id + and record.domain_variable == domain_variable + and record.period <= period + and required_constraints <= 
record.constraints_set + ] + if not candidates: + return [] + latest_period = max(record.period for record in candidates) + return [record for record in candidates if record.period == latest_period] + + +def _match_result( + target_name: str, + matches: Sequence[TargetRecord], + *, + reason: str, +) -> dict[str, Any]: + if len(matches) == 1: + match = matches[0] + return { + "target_name": target_name, + "scope": "national", + "status": "matched", + "reason": reason, + "target_id": match.target_id, + "target": match.to_manifest_match(), + } + if len(matches) > 1: + return { + "target_name": target_name, + "scope": "national", + "status": "ambiguous", + "reason": f"{reason}_ambiguous", + "matches": [match.to_manifest_match() for match in matches], + } + return { + "target_name": target_name, + "scope": "national", + "status": "db_missing", + "reason": f"{reason}_missing_from_target_db", + } + + +def classify_national_target( + target_name: str, + index: NationalTargetIndex, + *, + period: int, + target_value: float | None = None, +) -> dict[str, Any]: + """Classify a legacy national loss label against the structured target DB.""" + + if not target_name.startswith("nation/"): + return { + "target_name": target_name, + "scope": "non_national", + "status": "out_of_scope", + "reason": "non_national_loss_target", + } + + match = _EITC_AGI_CHILD_LABEL.match(target_name) + if match: + count_children = int(match.group("count_children")) + child_constraint = ( + _constraint("eitc_child_count", "==", count_children) + if count_children < 3 + else _constraint("eitc_child_count", ">", 2) + ) + variable = "tax_unit_count" if match.group("metric") == "returns" else "eitc" + matches = index.match( + variable=variable, + domain_variable=EITC_AGI_CHILD_DOMAIN, + period=period, + constraints=[ + _constraint("tax_unit_is_filer", "==", 1), + _constraint("eitc", ">", 0), + child_constraint, + _constraint( + "adjusted_gross_income", + ">=", + _parse_numeric_token(match.group("agi_lower")), + ), + _constraint( + "adjusted_gross_income", + "<", + _parse_numeric_token(match.group("agi_upper")), + ), + ], + ) + if not matches and target_value is not None and math.isclose(target_value, 0.0): + return { + "target_name": target_name, + "scope": "national", + "status": "legacy_only", + "reason": "zero_eitc_agi_child_target_omitted_from_target_db", + } + return _match_result( + target_name, + matches, + reason="structured_eitc_agi_child_target", + ) + + match = _CTC_LABEL.match(target_name) + if match: + variable = match.group("variable") + matches = index.match( + variable="tax_unit_count" if match.group("count") else variable, + domain_variable=variable, + period=period, + constraints=[ + _constraint("tax_unit_is_filer", "==", 1), + _constraint(variable, ">", 0), + ], + ) + return _match_result(target_name, matches, reason="structured_ctc_target") + + match = _REAL_ESTATE_TAX_LABEL.match(target_name) + if match: + matches = index.match( + variable="tax_unit_count" if match.group("count") else "real_estate_taxes", + domain_variable="real_estate_taxes,tax_unit_itemizes", + period=period, + constraints=[ + _constraint("tax_unit_is_filer", "==", 1), + _constraint("tax_unit_itemizes", "==", 1), + _constraint("real_estate_taxes", ">", 0), + ], + ) + return _match_result( + target_name, + matches, + reason="structured_real_estate_tax_itemizer_target", + ) + + if target_name.startswith("nation/cbo/"): + variable = target_name.removeprefix("nation/cbo/") + matches = index.match(variable=variable, period=period) + return 
_match_result(target_name, matches, reason="structured_cbo_target") + + direct_census_variable = _direct_census_variable(target_name) + if direct_census_variable is not None: + matches = index.match(variable=direct_census_variable, period=period) + return _match_result( + target_name, + matches, + reason="structured_direct_national_target", + ) + + if target_name == "nation/hhs/medicaid_spending": + return _match_result( + target_name, + index.match(variable="medicaid", period=period), + reason="structured_medicaid_spending_target", + ) + if target_name == "nation/hhs/medicaid_enrollment": + return _match_result( + target_name, + index.match( + variable="person_count", + domain_variable="medicaid", + period=period, + constraints=[_constraint("medicaid", ">", 0)], + ), + reason="structured_medicaid_enrollment_target", + ) + if target_name == "nation/gov/aca_enrollment": + return _match_result( + target_name, + index.match( + variable="person_count", + domain_variable="aca_ptc", + period=period, + constraints=[_constraint("aca_ptc", ">", 0)], + ), + reason="structured_aca_enrollment_target", + ) + if target_name == "nation/census/tanf": + return _match_result( + target_name, + index.match( + variable="tanf", + domain_variable="tanf", + period=period, + constraints=[_constraint("tanf", ">", 0)], + ), + reason="structured_tanf_cash_assistance_target", + ) + if target_name.startswith("nation/census/acs/"): + variable = target_name.removeprefix("nation/census/acs/") + return _match_result( + target_name, + index.match(variable=variable, period=period), + reason="structured_acs_national_target", + ) + if target_name == "nation/bls/ce/childcare_expenses": + return _match_result( + target_name, + index.match(variable="childcare_expenses", period=period), + reason="structured_bls_ce_target", + ) + if target_name == "nation/net_worth/total": + return _match_result( + target_name, + index.match(variable="net_worth", period=period), + reason="structured_net_worth_target", + ) + if target_name == "nation/ssa/ssn_card_type_none_count": + return _match_result( + target_name, + index.match( + variable="person_count", + domain_variable="ssn_card_type", + period=period, + constraints=[_constraint("ssn_card_type", "==", "NONE")], + ), + reason="structured_ssn_card_type_target", + ) + if target_name == "nation/db/liheap/household_count": + return _match_result( + target_name, + index.match( + variable="household_count", + domain_variable="spm_unit_energy_subsidy_reported", + period=period, + constraints=[_constraint("spm_unit_energy_subsidy_reported", ">", 0)], + ), + reason="structured_liheap_target", + ) + + tax_expenditure = _tax_expenditure_variable(target_name) + if tax_expenditure is not None: + variable, reform_id = tax_expenditure + return _match_result( + target_name, + index.match(variable=variable, reform_id=reform_id, period=period), + reason="structured_tax_expenditure_target", + ) + + soi_taxable_detail = _parse_soi_taxable_detail_target(target_name) + if soi_taxable_detail is not None: + variable, domain_variable, constraints = soi_taxable_detail + matches = index.match( + variable=variable, + domain_variable=domain_variable, + period=period, + constraints=constraints, + ) + if not matches and _soi_taxable_detail_label_has_lossy_agi_range(target_name): + return { + "target_name": target_name, + "scope": "national", + "status": "legacy_only", + "reason": "legacy_soi_taxable_agi_label_has_lossy_bucket_encoding", + } + return _match_result( + target_name, + matches, + 
reason="structured_soi_taxable_agi_filing_status_target", + ) + + return { + "target_name": target_name, + "scope": "national", + "status": "legacy_only", + "reason": _legacy_reason(target_name), + } + + +def _direct_census_variable(target_name: str) -> str | None: + prefix = "nation/census/" + if not target_name.startswith(prefix): + return None + variable = target_name.removeprefix(prefix) + if "/" in variable: + return None + if variable not in _DIRECT_NATIONAL_CENSUS_TARGET_VARIABLES: + return None + return variable + + +def _tax_expenditure_variable(target_name: str) -> tuple[str, int] | None: + mapping = { + "nation/jct/salt_deduction_expenditure": ("salt_deduction", 1), + "nation/jct/medical_expense_deduction_expenditure": ( + "medical_expense_deduction", + 2, + ), + "nation/jct/charitable_deduction_expenditure": ( + "charitable_deduction", + 3, + ), + "nation/jct/interest_deduction_expenditure": ( + "deductible_mortgage_interest", + 4, + ), + "nation/jct/qualified_business_income_deduction_expenditure": ( + "qualified_business_income_deduction", + 5, + ), + } + return mapping.get(target_name) + + +def _parse_soi_taxable_detail_target( + target_name: str, +) -> tuple[str, str, list[Constraint]] | None: + if not _SOI_TAXABLE_DETAIL_LABEL.match(target_name): + return None + + body = target_name.removeprefix("nation/irs/") + try: + variable_and_metric, rest = body.split("/AGI in ", 1) + variable_label, metric = variable_and_metric.rsplit("/", 1) + agi_range, taxable_label, filing_status = rest.split("/", 2) + except ValueError: + return None + if taxable_label != "taxable": + return None + + variable = _SOI_TAXABLE_DETAIL_TARGET_VARIABLES.get((variable_label, metric)) + range_match = _AGI_RANGE_LABEL.match(agi_range) + if variable is None or range_match is None: + return None + + constraints = [ + _constraint("tax_unit_is_filer", "==", 1), + _constraint("income_tax_before_credits", ">", 0), + _constraint( + "adjusted_gross_income", + ">=", + _parse_numeric_token(range_match.group("lower")), + ), + _constraint( + "adjusted_gross_income", + "<", + _parse_numeric_token(range_match.group("upper")), + ), + ] + domain_variables = ["adjusted_gross_income", "income_tax_before_credits"] + filing_constraint = _SOI_FILING_STATUS_CONSTRAINTS.get(filing_status) + if filing_constraint is not None: + operation, value = filing_constraint + constraints.append(_constraint("filing_status", operation, value)) + domain_variables.append("filing_status") + elif filing_status != "All": + return None + + return variable, ",".join(sorted(domain_variables)), constraints + + +def _soi_taxable_detail_label_has_lossy_agi_range(target_name: str) -> bool: + try: + agi_range = target_name.split("/AGI in ", 1)[1].split("/", 1)[0] + except IndexError: + return False + + return agi_range in { + "1m-2m", + "2m-2m", + "676k-3m", + "3m-16m", + "16m-79m", + "79m-inf", + } + + +def _legacy_reason(target_name: str) -> str: + if _SOI_TAXABLE_DETAIL_LABEL.match(target_name): + return "legacy_soi_taxable_agi_filing_status_detail_not_in_target_db" + if _EITC_STATE_LABEL.match(target_name): + return "legacy_eitc_state_targets_not_structured_in_target_db" + if _NATIONAL_AGE_LABEL.match(target_name): + return "legacy_single_year_age_targets_replaced_by_db_age_bins" + if _MEDICARE_PART_B_AGE_LABEL.match(target_name): + return "legacy_age_bucketed_medicare_part_b_premiums_not_in_target_db" + if _SPM_THRESHOLD_LABEL.match(target_name): + return "deprecated_survey_spm_threshold_target_removed_from_national_pipeline" + if 
_DEPRECATED_SPM_SURVEY_LABEL.match(target_name): + return "deprecated_survey_spm_target_removed_from_national_pipeline" + if target_name in { + "nation/census/alimony_expense", + "nation/census/alimony_income", + "nation/census/child_support_expense", + "nation/census/child_support_received", + }: + return "deprecated_survey_transfer_flow_target_removed_from_national_pipeline" + if target_name in { + "nation/census/health_insurance_premiums_without_medicare_part_b", + "nation/census/other_medical_expenses", + "nation/census/over_the_counter_health_expenses", + }: + return "deprecated_survey_health_expense_target_removed_from_national_pipeline" + if _SOI_FILER_AGI_LABEL.match(target_name): + return "legacy_soi_filer_agi_label_has_lossy_bucket_encoding" + if target_name == "nation/gov/aca_spending": + return "legacy_cms_aca_spending_target_not_in_target_db" + if target_name.startswith("nation/accounting/"): + return "legacy_accounting_balance_target_not_in_target_db" + if target_name.startswith("nation/irs/negative_household_market_income_"): + return "legacy_negative_market_income_target_not_in_target_db" + if target_name == "nation/census/infants": + return "legacy_single_year_infant_target_not_in_target_db" + return "legacy_national_target_not_structured_in_target_db" + + +def build_national_target_parity_manifest( + target_names: Iterable[str | Mapping[str, Any]], + *, + db_path: str | Path, + period: int, +) -> dict[str, Any]: + records = load_national_target_records(db_path) + index = NationalTargetIndex(records) + national_targets = [] + for target in target_names: + normalized = _normalize_target_input(target) + if normalized["target_name"].startswith("nation/"): + national_targets.append(normalized) + entries = [] + for target in national_targets: + entry = classify_national_target( + target["target_name"], + index, + period=period, + target_value=target.get("target_value"), + ) + if target.get("target_value") is not None: + entry["target_value"] = target["target_value"] + entries.append(entry) + summary: dict[str, Any] = { + "total": len(entries), + "statuses": _count_by(entries, "status"), + "reasons": _count_by(entries, "reason"), + } + matched = summary["statuses"].get("matched", 0) + summary["match_rate"] = matched / len(entries) if entries else None + return { + "schema_version": SCHEMA_VERSION, + "period": period, + "target_db_path": str(Path(db_path).expanduser()), + "summary": summary, + "targets": entries, + } + + +def _count_by(rows: Sequence[dict[str, Any]], key: str) -> dict[str, int]: + counts: dict[str, int] = {} + for row in rows: + value = str(row.get(key)) + counts[value] = counts.get(value, 0) + 1 + return dict(sorted(counts.items())) + + +def _normalize_target_input(target: str | Mapping[str, Any]) -> dict[str, Any]: + if isinstance(target, Mapping): + target_name = str(target["target_name"]) + target_value = target.get("target_value") + return { + "target_name": target_name, + "target_value": (float(target_value) if target_value is not None else None), + } + return {"target_name": str(target), "target_value": None} + + +def extract_target_names_from_json(path: str | Path) -> list[str]: + return [target["target_name"] for target in extract_targets_from_json(path)] + + +def extract_targets_from_json(path: str | Path) -> list[dict[str, Any]]: + payload = json.loads(Path(path).read_text()) + if isinstance(payload, list): + return [_normalize_target_input(value) for value in payload] + if isinstance(payload, dict): + if isinstance(payload.get("target_names"), list): 
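+ # A bare {"target_names": [...]} payload carries names only, so every entry is normalised with target_value=None.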
+ return [ + {"target_name": str(value), "target_value": None} + for value in payload["target_names"] + ] + if isinstance(payload.get("targets"), list): + targets = [] + for row in payload["targets"]: + if isinstance(row, dict) and "target_name" in row: + targets.append(_normalize_target_input(row)) + else: + targets.append({"target_name": str(row), "target_value": None}) + return targets + raise ValueError( + "Expected JSON list, {'target_names': [...]}, or {'targets': [{'target_name': ...}]}" + ) + + +def extract_target_names_from_dataset( + dataset_path: str | Path, + *, + period: int, +) -> list[str]: + return [ + target["target_name"] + for target in extract_targets_from_dataset(dataset_path, period=period) + ] + + +def extract_targets_from_dataset( + dataset_path: str | Path, + *, + period: int, +) -> list[dict[str, Any]]: + from policyengine_core.data import Dataset + from policyengine_us_data.utils.loss import build_loss_matrix + + class LocalDataset(Dataset): + name = "national_target_parity_dataset" + label = name + file_path = str(dataset_path) + data_format = Dataset.TIME_PERIOD_ARRAYS + time_period = period + + loss_matrix, targets_array = build_loss_matrix(LocalDataset, period) + return [ + {"target_name": str(column), "target_value": float(target_value)} + for column, target_value in zip(loss_matrix.columns, targets_array) + ] + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser( + description=( + "Build a parity manifest from legacy national build_loss_matrix " + "labels to structured policy_data.db targets." + ) + ) + parser.add_argument("--period", type=int, default=2024) + parser.add_argument( + "--target-db", + default=str(STORAGE_FOLDER / "calibration" / "policy_data.db"), + ) + source = parser.add_mutually_exclusive_group(required=True) + source.add_argument("--target-names-json") + source.add_argument("--dataset-path") + parser.add_argument("--output") + args = parser.parse_args(argv) + + if args.target_names_json: + target_names = extract_targets_from_json(args.target_names_json) + else: + target_names = extract_targets_from_dataset( + args.dataset_path, + period=args.period, + ) + + manifest = build_national_target_parity_manifest( + target_names, + db_path=args.target_db, + period=args.period, + ) + text = json.dumps(manifest, indent=2, sort_keys=True) + "\n" + if args.output: + output_path = Path(args.output) + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(text) + else: + print(text, end="") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/integration/test_database_build.py b/tests/integration/test_database_build.py index ff270a337..5604a2064 100644 --- a/tests/integration/test_database_build.py +++ b/tests/integration/test_database_build.py @@ -121,6 +121,43 @@ def test_national_targets_loaded(built_db): ) +def test_national_taxable_agi_filing_status_targets_loaded(built_db): + """SOI taxable-return AGI/status targets should survive the full DB build.""" + conn = sqlite3.connect(str(built_db)) + rows_by_variable = dict( + conn.execute( + """ + SELECT variable, COUNT(*) + FROM target_overview + WHERE geo_level = 'national' + AND domain_variable + = 'adjusted_gross_income,filing_status,income_tax_before_credits' + AND variable IN ('adjusted_gross_income', 'tax_unit_count') + GROUP BY variable + """ + ).fetchall() + ) + joint_status_rows = conn.execute( + """ + SELECT COUNT(DISTINCT t.target_id) + FROM targets t + JOIN target_overview tv ON tv.target_id = 
t.target_id + JOIN stratum_constraints sc ON sc.stratum_id = t.stratum_id + WHERE tv.geo_level = 'national' + AND tv.domain_variable + = 'adjusted_gross_income,filing_status,income_tax_before_credits' + AND sc.constraint_variable = 'filing_status' + AND sc.operation = 'in' + AND sc.value = 'JOINT|SURVIVING_SPOUSE' + """ + ).fetchone()[0] + conn.close() + + assert rows_by_variable.get("adjusted_gross_income", 0) > 0 + assert rows_by_variable.get("tax_unit_count", 0) > 0 + assert joint_status_rows > 0 + + def test_jct_mortgage_tax_expenditure_uses_mortgage_specific_variable(built_db): """The mortgage JCT target should point at a mortgage-specific variable.""" conn = sqlite3.connect(str(built_db)) diff --git a/tests/integration/test_sparse_enhanced_cps.py b/tests/integration/test_sparse_enhanced_cps.py index 91728ed57..e0b851cd3 100644 --- a/tests/integration/test_sparse_enhanced_cps.py +++ b/tests/integration/test_sparse_enhanced_cps.py @@ -11,6 +11,7 @@ from policyengine_core.reforms import Reform from policyengine_us import Microsimulation from policyengine_us_data.utils import ( + ABSOLUTE_ERROR_SCALE_TARGETS, build_loss_matrix, print_reweighting_diagnostics, ) @@ -106,7 +107,10 @@ def test_sparse_ecps(sim): ] loss_matrix, targets_array = build_loss_matrix(sim.dataset, 2024) - zero_mask = np.isclose(targets_array, 0.0, atol=0.1) + scaled_zero_target_mask = loss_matrix.columns.isin( + ABSOLUTE_ERROR_SCALE_TARGETS.keys() + ) + zero_mask = np.isclose(targets_array, 0.0, atol=0.1) & (~scaled_zero_target_mask) bad_mask = loss_matrix.columns.isin(bad_targets) keep_mask_bool = ~(zero_mask | bad_mask) keep_idx = np.where(keep_mask_bool)[0] diff --git a/tests/unit/calibration/test_loss_targets.py b/tests/unit/calibration/test_loss_targets.py index a6d030ca8..ff870ee23 100644 --- a/tests/unit/calibration/test_loss_targets.py +++ b/tests/unit/calibration/test_loss_targets.py @@ -1,14 +1,26 @@ +import inspect + import numpy as np import pandas as pd import pytest from policyengine_us_data.utils.loss import ( + ABSOLUTE_ERROR_SCALE_TARGETS, + AGE_BUCKETED_HEALTH_TARGETS, + BLS_CE_TOTALS, + TRANSFER_BALANCE_TARGETS, _get_aca_national_targets, + _add_acs_housing_cost_targets, + _add_bls_ce_targets, _add_ctc_targets, + _add_real_estate_tax_targets, + _add_transfer_balance_targets, + get_target_error_normalisation, _get_medicaid_national_targets, _load_aca_spending_and_enrollment_targets, _load_medicaid_enrollment_targets, HARD_CODED_TOTALS, + build_loss_matrix, ) @@ -61,7 +73,7 @@ def test_medicaid_national_targets_use_2025_values(): class _FakeArrayResult: def __init__(self, values): - self.values = np.asarray(values, dtype=np.float32) + self.values = np.asarray(values) class _FakeSimulation: @@ -126,5 +138,230 @@ def test_add_ctc_targets(monkeypatch): ) +class _FakeRealEstateTaxSimulation: + def calculate(self, variable, map_to=None, period=None): + values = { + ("real_estate_taxes", None): [100.0, 0.0, 50.0, 0.0], + ("tax_unit_is_filer", None): [1.0, 1.0], + ("tax_unit_itemizes", None): [1.0, 0.0], + ("state_code", "household"): ["CA", "NY"], + } + key = (variable, map_to) + if key not in values: + raise AssertionError(f"Unexpected calculate call {key!r}") + return _FakeArrayResult(values[key]) + + def map_result(self, values, source_entity, target_entity, how=None): + arr = np.asarray(values, dtype=np.float32) + if (source_entity, target_entity) == ("person", "tax_unit"): + return np.array([arr[:2].sum(), arr[2:].sum()], dtype=np.float32) + if (source_entity, target_entity) == ("tax_unit", 
"household"): + return arr.astype(np.float32) + raise AssertionError( + f"Unexpected map_result call {(source_entity, target_entity, how)!r}" + ) + + +def test_add_real_estate_tax_targets(monkeypatch): + monkeypatch.setattr( + "policyengine_us_data.utils.loss.get_national_geography_soi_target", + lambda variable, year: {"amount": 123_000.0, "count": 17.0}, + ) + monkeypatch.setattr( + "policyengine_us_data.utils.loss.get_state_geography_soi_targets", + lambda variable, year: [ + {"state_code": "CA", "amount": 100_000.0, "count": 10.0}, + {"state_code": "NY", "amount": 50_000.0, "count": 5.0}, + ], + ) + + targets, loss_matrix = _add_real_estate_tax_targets( + pd.DataFrame(), + [], + _FakeRealEstateTaxSimulation(), + 2024, + ) + + assert targets == [123_000.0, 17.0, 100_000.0, 10.0, 50_000.0, 5.0] + np.testing.assert_array_equal( + loss_matrix["nation/irs/real_estate_taxes"], + np.array([100.0, 0.0], dtype=np.float32), + ) + np.testing.assert_array_equal( + loss_matrix["nation/irs/real_estate_taxes_count"], + np.array([1.0, 0.0], dtype=np.float32), + ) + np.testing.assert_array_equal( + loss_matrix["state/irs/real_estate_taxes/CA"], + np.array([100.0, 0.0], dtype=np.float32), + ) + np.testing.assert_array_equal( + loss_matrix["state/irs/real_estate_taxes/NY"], + np.array([0.0, 0.0], dtype=np.float32), + ) + np.testing.assert_array_equal( + loss_matrix["state/irs/real_estate_taxes_count/CA"], + np.array([1.0, 0.0], dtype=np.float32), + ) + np.testing.assert_array_equal( + loss_matrix["state/irs/real_estate_taxes_count/NY"], + np.array([0.0, 0.0], dtype=np.float32), + ) + + +class _FakeAcsHousingCostSimulation: + def calculate(self, variable, map_to=None, period=None): + values = { + ("state_code", "household"): ["CA", "NY", "CA"], + ("rent", "household"): [10.0, 20.0, 30.0], + ("real_estate_taxes", "household"): [1.0, 2.0, 3.0], + ("childcare_expenses", "household"): [4.0, 0.0, 6.0], + } + key = (variable, map_to) + if key not in values: + raise AssertionError(f"Unexpected calculate call {key!r}") + return _FakeArrayResult(values[key]) + + +def test_add_acs_housing_cost_targets(monkeypatch): + monkeypatch.setattr( + "policyengine_us_data.utils.loss._load_yeared_target_csv", + lambda prefix, year: ( + pd.DataFrame( + { + "state_code": ["CA", "NY"], + "annual_contract_rent": [100.0, 200.0], + "real_estate_taxes": [30.0, 40.0], + } + ), + 2024, + ), + ) + + targets, loss_matrix = _add_acs_housing_cost_targets( + pd.DataFrame(), + [], + _FakeAcsHousingCostSimulation(), + 2024, + ) + + assert targets == [300.0, 100.0, 200.0, 70.0, 30.0, 40.0] + np.testing.assert_array_equal( + loss_matrix["nation/census/acs/rent"], + np.array([10.0, 20.0, 30.0]), + ) + np.testing.assert_array_equal( + loss_matrix["state/census/acs/rent/CA"], + np.array([10.0, 0.0, 30.0]), + ) + np.testing.assert_array_equal( + loss_matrix["state/census/acs/real_estate_taxes/NY"], + np.array([0.0, 2.0, 0.0]), + ) + + +def test_bls_ce_childcare_target(): + assert BLS_CE_TOTALS["childcare_expenses"] == pytest.approx(63_092e6) + + targets, loss_matrix = _add_bls_ce_targets( + pd.DataFrame(), + [], + _FakeAcsHousingCostSimulation(), + 2024, + ) + + assert targets == [63_092e6] + np.testing.assert_array_equal( + loss_matrix["nation/bls/ce/childcare_expenses"], + np.array([4.0, 0.0, 6.0]), + ) + + +class _FakeTransferBalanceSimulation: + def calculate(self, variable, map_to=None, period=None): + values = { + "alimony_expense": [100.0, 0.0, 20.0], + "alimony_income": [30.0, 40.0, 0.0], + "child_support_expense": [0.0, 50.0, 10.0], + 
"child_support_received": [20.0, 10.0, 40.0], + } + if variable not in values: + raise AssertionError(f"Unexpected variable {variable!r}") + assert map_to == "household" + assert period == 2024 + return _FakeArrayResult(values[variable]) + + +def test_transfer_balance_targets_are_net_zero_accounting_constraints(): + targets, loss_matrix = _add_transfer_balance_targets( + pd.DataFrame(), + [], + _FakeTransferBalanceSimulation(), + 2024, + ) + + assert targets == [0.0, 0.0] + assert set(TRANSFER_BALANCE_TARGETS) == { + "nation/accounting/alimony_paid_minus_received", + "nation/accounting/child_support_paid_minus_received", + } + np.testing.assert_array_equal( + loss_matrix["nation/accounting/alimony_paid_minus_received"], + np.array([70.0, -40.0, 20.0]), + ) + np.testing.assert_array_equal( + loss_matrix["nation/accounting/child_support_paid_minus_received"], + np.array([-20.0, 40.0, -30.0]), + ) + + +def test_transfer_balance_targets_use_absolute_error_scale(): + target_names = np.array( + [ + "nation/accounting/alimony_paid_minus_received", + "nation/census/snap", + ] + ) + numerator_shift, denominator = get_target_error_normalisation( + target_names, + np.array([0.0, 10.0]), + ) + + assert ABSOLUTE_ERROR_SCALE_TARGETS[ + "nation/accounting/alimony_paid_minus_received" + ] == pytest.approx(1e9) + np.testing.assert_array_equal(numerator_shift, np.array([0.0, 1.0])) + np.testing.assert_array_equal(denominator, np.array([1e9, 11.0])) + + def test_tanf_hardcoded_target_uses_fy2024_basic_assistance_total(): assert HARD_CODED_TOTALS["tanf"] == pytest.approx(7_788_317_474.55) + + +def test_hardcoded_totals_drop_survey_spm_targets(): + removed_targets = { + "alimony_income", + "alimony_expense", + "child_support_expense", + "child_support_received", + "health_insurance_premiums_without_medicare_part_b", + "other_medical_expenses", + "over_the_counter_health_expenses", + "spm_unit_spm_threshold", + "spm_unit_capped_housing_subsidy", + "spm_unit_capped_work_childcare_expenses", + } + + assert removed_targets.isdisjoint(HARD_CODED_TOTALS) + + +def test_age_bucketed_health_targets_keep_only_medicare_part_b(): + assert AGE_BUCKETED_HEALTH_TARGETS == ("medicare_part_b_premiums",) + + +def test_national_loss_excludes_survey_spm_threshold_decile_targets(): + source = inspect.getsource(build_loss_matrix) + + assert "spm_threshold_agi.csv" not in source + assert "agi_in_spm_threshold_decile" not in source + assert "count_in_spm_threshold_decile" not in source diff --git a/tests/unit/calibration/test_target_config.py b/tests/unit/calibration/test_target_config.py index c698e83db..8d88d23df 100644 --- a/tests/unit/calibration/test_target_config.py +++ b/tests/unit/calibration/test_target_config.py @@ -245,6 +245,21 @@ def test_training_config_includes_tanf_state_and_national_count_targets(self): "domain_variable": "tanf", } in include_rules + def test_training_config_includes_national_childcare_expenses_target(self): + config = load_target_config( + str( + Path(__file__).resolve().parents[3] + / "policyengine_us_data" + / "calibration" + / "target_config.yaml" + ) + ) + + assert { + "variable": "childcare_expenses", + "geo_level": "national", + } in config["include"] + class TestCalibrationPackageRoundTrip: def test_round_trip(self, sample_targets, tmp_path): diff --git a/tests/unit/calibration/test_unified_matrix_builder_merge.py b/tests/unit/calibration/test_unified_matrix_builder_merge.py index 61ee9d321..3c404f5e7 100644 --- a/tests/unit/calibration/test_unified_matrix_builder_merge.py +++ 
b/tests/unit/calibration/test_unified_matrix_builder_merge.py @@ -65,6 +65,18 @@ def test_apply_op_matches_fixed_width_byte_string_constraints(): ) +def test_apply_op_matches_pipe_delimited_string_membership_constraints(): + values = np.array( + [b"SINGLE", b"JOINT", b"SURVIVING_SPOUSE"], + dtype="S24", + ) + + np.testing.assert_array_equal( + apply_op(values, "in", "JOINT|SURVIVING_SPOUSE"), + np.array([False, True, True]), + ) + + def test_builder_assemble_clone_values_preserves_string_constraints(): builder = UnifiedMatrixBuilder.__new__(UnifiedMatrixBuilder) diff --git a/tests/unit/test_constraint_validation.py b/tests/unit/test_constraint_validation.py index e494f5c92..48b26a032 100644 --- a/tests/unit/test_constraint_validation.py +++ b/tests/unit/test_constraint_validation.py @@ -110,6 +110,17 @@ def test_not_equal_alone_is_valid(self): ] ensure_consistent_constraint_set(constraints) # No exception + def test_in_alone_is_valid(self): + """filing_status in a pipe-delimited set should pass.""" + constraints = [ + Constraint( + variable="filing_status", + operation="in", + value="JOINT|SURVIVING_SPOUSE", + ), + ] + ensure_consistent_constraint_set(constraints) # No exception + def test_equality_with_range_fails(self): """state_fips == '06' AND state_fips > '05' should fail.""" constraints = [ @@ -119,6 +130,19 @@ def test_equality_with_range_fails(self): with pytest.raises(ConstraintValidationError, match="cannot combine"): ensure_consistent_constraint_set(constraints) + def test_in_with_not_equal_fails(self): + """filing_status in 'A|B' AND filing_status != 'C' should fail.""" + constraints = [ + Constraint( + variable="filing_status", + operation="in", + value="JOINT|SURVIVING_SPOUSE", + ), + Constraint(variable="filing_status", operation="!=", value="SEPARATE"), + ] + with pytest.raises(ConstraintValidationError, match="cannot combine"): + ensure_consistent_constraint_set(constraints) + def test_not_equal_with_range_fails(self): """state_fips != '06' AND state_fips < '10' should fail.""" constraints = [ diff --git a/tests/unit/test_etl_irs_soi_overlay.py b/tests/unit/test_etl_irs_soi_overlay.py index 230438aa1..d8066485f 100644 --- a/tests/unit/test_etl_irs_soi_overlay.py +++ b/tests/unit/test_etl_irs_soi_overlay.py @@ -20,8 +20,10 @@ _skip_coarse_state_agi_person_count_target, _get_or_create_national_domain_stratum, _upsert_target, + load_national_eitc_agi_child_targets, load_national_geography_ctc_agi_targets, load_national_geography_ctc_targets, + load_national_taxable_agi_filing_status_targets, load_national_workbook_soi_targets, ) @@ -433,3 +435,171 @@ def test_load_national_geography_ctc_agi_targets_creates_agi_domain_strata( assert overview_rows assert all(row.geographic_id == "US" for row in overview_rows) + + +def test_load_national_eitc_agi_child_targets_creates_structured_db_rows( + monkeypatch, tmp_path +): + db_uri, engine = _create_test_engine(tmp_path) + calibration_dir = tmp_path / "calibration_targets" + calibration_dir.mkdir() + (calibration_dir / "eitc_by_agi_and_children.csv").write_text( + "# IRS SOI Table 2.5 sample\n" + "count_children,agi_lower,agi_upper,returns,amount\n" + "0,1,1000,10,1000\n" + "3,1,1000,20,2000\n" + "3,1000,2000,0,0\n" + ) + monkeypatch.setattr( + "policyengine_us_data.db.etl_irs_soi.CALIBRATION_FOLDER", + calibration_dir, + ) + + with Session(engine) as session: + national_filer_stratum = _create_national_filer_stratum(session) + load_national_eitc_agi_child_targets( + session, + national_filer_stratum.stratum_id, + source_year=2022, + ) 
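+ # Commit so the UnifiedMatrixBuilder below, which opens its own connection via db_uri, can see the freshly loaded rows.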
+ session.commit() + + builder = UnifiedMatrixBuilder(db_uri=db_uri, time_period=2024) + rows = builder._query_targets( + { + "variables": ["tax_unit_count", "eitc"], + "domain_variables": ["adjusted_gross_income,eitc,eitc_child_count"], + } + ) + + assert len(rows) == 4 + assert set(rows["period"].astype(int)) == {2022} + assert set(rows["variable"]) == {"tax_unit_count", "eitc"} + assert set(rows["value"].astype(float)) == {10.0, 20.0, 1000.0, 2000.0} + + with engine.connect() as conn: + constraints = conn.execute( + text( + """ + SELECT tv.value, sc.constraint_variable, sc.operation, sc.value + FROM target_overview tv + JOIN stratum_constraints sc ON tv.stratum_id = sc.stratum_id + WHERE tv.variable = 'eitc' + AND tv.domain_variable = 'adjusted_gross_income,eitc,eitc_child_count' + ORDER BY tv.value, sc.constraint_variable, sc.operation + """ + ) + ).fetchall() + + constraints_by_target = {} + for target_value, variable, operation, constraint_value in constraints: + constraints_by_target.setdefault(float(target_value), set()).add( + (variable, operation, constraint_value) + ) + + assert ("eitc_child_count", "==", "0") in constraints_by_target[1000.0] + assert ("eitc_child_count", ">", "2") in constraints_by_target[2000.0] + assert ("adjusted_gross_income", ">=", "1.0") in constraints_by_target[2000.0] + assert ("adjusted_gross_income", "<", "1000.0") in constraints_by_target[2000.0] + + +def test_load_national_taxable_agi_filing_status_targets_creates_structured_rows( + monkeypatch, tmp_path +): + db_uri, engine = _create_test_engine(tmp_path) + soi_rows = pd.DataFrame( + [ + { + "Year": 2023, + "SOI table": "Table 1.1", + "XLSX column": "I", + "XLSX row": 20, + "Variable": "adjusted_gross_income", + "Filing status": "All", + "AGI lower bound": 50_000.0, + "AGI upper bound": 75_000.0, + "Count": False, + "Taxable only": True, + "Full population": False, + "Value": 1_000_000.0, + }, + { + "Year": 2023, + "SOI table": "Table 1.2", + "XLSX column": "AP", + "XLSX row": 14, + "Variable": "count", + "Filing status": "Married Filing Jointly/Surviving Spouse", + "AGI lower bound": 20_000.0, + "AGI upper bound": 25_000.0, + "Count": True, + "Taxable only": True, + "Full population": False, + "Value": 2_000.0, + }, + { + "Year": 2023, + "SOI table": "Table 1.2", + "XLSX column": "AP", + "XLSX row": 12, + "Variable": "count", + "Filing status": "Single", + "AGI lower bound": 1.0, + "AGI upper bound": 10_000.0, + "Count": True, + "Taxable only": True, + "Full population": False, + "Value": 999.0, + }, + ] + ) + monkeypatch.setattr( + "policyengine_us_data.db.etl_irs_soi.load_tracked_soi_targets", + lambda: soi_rows, + ) + + with Session(engine) as session: + national_filer_stratum = _create_national_filer_stratum(session) + load_national_taxable_agi_filing_status_targets( + session, + national_filer_stratum.stratum_id, + target_year=2024, + ) + session.commit() + + builder = UnifiedMatrixBuilder(db_uri=db_uri, time_period=2024) + rows = builder._query_targets( + { + "variables": ["adjusted_gross_income", "tax_unit_count"], + "domain_variables": [ + "adjusted_gross_income,income_tax_before_credits", + "adjusted_gross_income,filing_status,income_tax_before_credits", + ], + } + ) + + assert set(rows["variable"]) == {"adjusted_gross_income", "tax_unit_count"} + assert set(rows["value"].astype(float)) == {1_000_000.0, 2_000.0} + assert 999.0 not in set(rows["value"].astype(float)) + + with engine.connect() as conn: + constraints = conn.execute( + text( + """ + SELECT tv.value, sc.constraint_variable, 
sc.operation, sc.value + FROM target_overview tv + JOIN stratum_constraints sc ON tv.stratum_id = sc.stratum_id + WHERE tv.variable = 'tax_unit_count' + ORDER BY sc.constraint_variable + """ + ) + ).fetchall() + + count_constraints = { + (variable, operation, constraint_value) + for _, variable, operation, constraint_value in constraints + } + assert ("filing_status", "in", "JOINT|SURVIVING_SPOUSE") in count_constraints + assert ("income_tax_before_credits", ">", "0") in count_constraints + assert ("adjusted_gross_income", ">=", "20000.0") in count_constraints + assert ("adjusted_gross_income", "<", "25000.0") in count_constraints diff --git a/tests/unit/test_etl_national_targets.py b/tests/unit/test_etl_national_targets.py index 84d8c748b..534a6dbbf 100644 --- a/tests/unit/test_etl_national_targets.py +++ b/tests/unit/test_etl_national_targets.py @@ -8,6 +8,7 @@ create_database, ) from policyengine_us_data.db.etl_national_targets import ( + extract_national_targets, load_national_targets, ) @@ -199,3 +200,31 @@ def test_load_national_targets_supports_liheap_household_counts(tmp_path, monkey ).first() assert liheap_target is not None assert liheap_target.value == 5_876_646 + + +def test_extract_national_targets_drops_survey_spm_targets(): + targets = extract_national_targets(year=2024) + direct_sum_variables = { + target["variable"] for target in targets["direct_sum_targets"] + } + removed_targets = { + "alimony_income", + "alimony_expense", + "child_support_expense", + "child_support_received", + "health_insurance_premiums_without_medicare_part_b", + "other_medical_expenses", + "over_the_counter_health_expenses", + "spm_unit_capped_housing_subsidy", + "spm_unit_capped_work_childcare_expenses", + } + + assert removed_targets.isdisjoint(direct_sum_variables) + assert {"rent", "real_estate_taxes", "childcare_expenses"} <= direct_sum_variables + + direct_sum_targets = { + target["variable"]: target for target in targets["direct_sum_targets"] + } + assert direct_sum_targets["rent"]["value"] == 764_925_694_800 + assert direct_sum_targets["real_estate_taxes"]["value"] == 370_014_207_400 + assert direct_sum_targets["childcare_expenses"]["value"] == 63_092e6 diff --git a/tests/unit/test_national_target_parity.py b/tests/unit/test_national_target_parity.py new file mode 100644 index 000000000..d8262f704 --- /dev/null +++ b/tests/unit/test_national_target_parity.py @@ -0,0 +1,278 @@ +import json +import sqlite3 + +from policyengine_us_data.utils.national_target_parity import ( + Constraint, + NationalTargetIndex, + TargetRecord, + build_national_target_parity_manifest, + classify_national_target, + extract_target_names_from_json, + load_national_target_records, +) + + +def _record( + target_id, + *, + variable, + period=2024, + domain_variable=None, + reform_id=0, + constraints=(), +): + return TargetRecord( + target_id=target_id, + stratum_id=target_id + 100, + variable=variable, + reform_id=reform_id, + value=target_id * 100.0, + period=period, + source="test", + notes=None, + geo_level="national", + geographic_id="US", + domain_variable=domain_variable, + constraints=tuple(constraints), + ) + + +def test_classify_eitc_agi_child_target_matches_structured_db_row(): + index = NationalTargetIndex( + [ + _record( + 9763, + variable="eitc", + period=2022, + domain_variable="adjusted_gross_income,eitc,eitc_child_count", + constraints=[ + Constraint("tax_unit_is_filer", "==", "1"), + Constraint("eitc", ">", "0"), + Constraint("eitc_child_count", ">", "2"), + Constraint("adjusted_gross_income", ">=", 
"1.0"), + Constraint("adjusted_gross_income", "<", "1000.0"), + ], + ) + ] + ) + + row = classify_national_target( + "nation/irs/eitc/amount/c3_1_1k", + index, + period=2024, + ) + + assert row["status"] == "matched" + assert row["target_id"] == 9763 + assert row["reason"] == "structured_eitc_agi_child_target" + + +def test_classify_known_legacy_target_gets_named_reason(): + row = classify_national_target( + ( + "nation/irs/business net profits/total/AGI in " + "20k-25k/taxable/Married Filing Jointly/Surviving Spouse" + ), + NationalTargetIndex([]), + period=2024, + ) + + assert row == { + "target_name": ( + "nation/irs/business net profits/total/AGI in " + "20k-25k/taxable/Married Filing Jointly/Surviving Spouse" + ), + "scope": "national", + "status": "legacy_only", + "reason": "legacy_soi_taxable_agi_filing_status_detail_not_in_target_db", + } + + +def test_classify_soi_taxable_agi_filing_status_target_matches_structured_row(): + index = NationalTargetIndex( + [ + _record( + 1201, + variable="adjusted_gross_income", + period=2023, + domain_variable=( + "adjusted_gross_income,filing_status,income_tax_before_credits" + ), + constraints=[ + Constraint("tax_unit_is_filer", "==", "1"), + Constraint("income_tax_before_credits", ">", "0"), + Constraint("adjusted_gross_income", ">=", "20000.0"), + Constraint("adjusted_gross_income", "<", "25000.0"), + Constraint("filing_status", "in", "JOINT|SURVIVING_SPOUSE"), + ], + ) + ] + ) + + row = classify_national_target( + ( + "nation/irs/adjusted gross income/total/AGI in " + "20k-25k/taxable/Married Filing Jointly/Surviving Spouse" + ), + index, + period=2024, + ) + + assert row["status"] == "matched" + assert row["target_id"] == 1201 + assert row["reason"] == "structured_soi_taxable_agi_filing_status_target" + + +def test_classify_soi_taxable_count_target_matches_all_filers_row(): + index = NationalTargetIndex( + [ + _record( + 1202, + variable="tax_unit_count", + period=2023, + domain_variable="adjusted_gross_income,income_tax_before_credits", + constraints=[ + Constraint("tax_unit_is_filer", "==", "1"), + Constraint("income_tax_before_credits", ">", "0"), + Constraint("adjusted_gross_income", ">=", "50000.0"), + Constraint("adjusted_gross_income", "<", "75000.0"), + ], + ) + ] + ) + + row = classify_national_target( + "nation/irs/count/count/AGI in 50k-75k/taxable/All", + index, + period=2024, + ) + + assert row["status"] == "matched" + assert row["target_id"] == 1202 + assert row["reason"] == "structured_soi_taxable_agi_filing_status_target" + + +def test_lossy_soi_taxable_agi_label_gets_explicit_legacy_reason(): + row = classify_national_target( + "nation/irs/count/count/AGI in 2m-2m/taxable/All", + NationalTargetIndex([]), + period=2024, + ) + + assert row == { + "target_name": "nation/irs/count/count/AGI in 2m-2m/taxable/All", + "scope": "national", + "status": "legacy_only", + "reason": "legacy_soi_taxable_agi_label_has_lossy_bucket_encoding", + } + + +def test_zero_eitc_agi_child_target_is_classified_as_intentionally_omitted(): + row = classify_national_target( + "nation/irs/eitc/returns/c0_50k_inf", + NationalTargetIndex([]), + period=2024, + target_value=0.0, + ) + + assert row == { + "target_name": "nation/irs/eitc/returns/c0_50k_inf", + "scope": "national", + "status": "legacy_only", + "reason": "zero_eitc_agi_child_target_omitted_from_target_db", + } + + +def test_manifest_summarizes_matches_and_explicit_legacy_reasons(tmp_path): + db_path = tmp_path / "policy_data.db" + _create_minimal_target_db(db_path) + + manifest = 
build_national_target_parity_manifest( + [ + "nation/cbo/snap", + "nation/jct/interest_deduction_expenditure", + "nation/census/population_by_age/80", + "state/census/age/CA/0-4", + ], + db_path=db_path, + period=2024, + ) + + assert manifest["summary"]["total"] == 3 + assert manifest["summary"]["statuses"] == { + "legacy_only": 1, + "matched": 2, + } + assert ( + manifest["summary"]["reasons"][ + "legacy_single_year_age_targets_replaced_by_db_age_bins" + ] + == 1 + ) + assert manifest["targets"][0]["target_id"] == 1 + assert manifest["targets"][1]["target_id"] == 2 + + +def test_extract_target_names_from_diagnostic_json(tmp_path): + path = tmp_path / "diagnostic.json" + path.write_text( + json.dumps( + { + "targets": [ + {"target_name": "nation/cbo/snap"}, + {"target_name": "state/census/age/CA/0-4"}, + ] + } + ) + ) + + assert extract_target_names_from_json(path) == [ + "nation/cbo/snap", + "state/census/age/CA/0-4", + ] + + +def _create_minimal_target_db(path): + with sqlite3.connect(path) as conn: + conn.executescript( + """ + CREATE TABLE targets ( + target_id INTEGER PRIMARY KEY, + stratum_id INTEGER, + variable TEXT, + reform_id INTEGER, + value REAL, + period INTEGER, + active INTEGER, + source TEXT, + notes TEXT + ); + CREATE TABLE target_overview ( + target_id INTEGER, + stratum_id INTEGER, + variable TEXT, + reform_id INTEGER, + value REAL, + period INTEGER, + active INTEGER, + geo_level TEXT, + geographic_id TEXT, + domain_variable TEXT + ); + CREATE TABLE stratum_constraints ( + stratum_id INTEGER, + constraint_variable TEXT, + operation TEXT, + value TEXT + ); + INSERT INTO targets VALUES + (1, 101, 'snap', 0, 1.0, 2024, 1, 'test', NULL), + (2, 102, 'deductible_mortgage_interest', 4, 2.0, 2024, 1, 'test', NULL); + INSERT INTO target_overview VALUES + (1, 101, 'snap', 0, 1.0, 2024, 1, 'national', 'US', NULL), + (2, 102, 'deductible_mortgage_interest', 4, 2.0, 2024, 1, 'national', 'US', NULL); + """ + ) + + records = load_national_target_records(path) + assert len(records) == 2
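
A minimal sketch of driving the new parity utility from Python, using only entry points added in this diff; the `db_path` shown is the repository's default location, and the two labels are examples lifted from the tests above:

```python
from policyengine_us_data.utils.national_target_parity import (
    build_national_target_parity_manifest,
)

# Classify two legacy national loss labels against the structured target DB
# and report the share that matched structured rows.
manifest = build_national_target_parity_manifest(
    [
        "nation/cbo/snap",
        "nation/irs/count/count/AGI in 50k-75k/taxable/All",
    ],
    db_path="policyengine_us_data/storage/calibration/policy_data.db",
    period=2024,
)
print(manifest["summary"]["match_rate"])
```

The same manifest is available from the command line, e.g. `python -m policyengine_us_data.utils.national_target_parity --period 2024 --dataset-path enhanced_cps_2024.h5 --output parity_manifest.json` (the dataset and output file names here are hypothetical; the flags match the `argparse` setup in `main()`).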