From 1c12fce9f9ee8bfb121fa1906e87250f199e4270 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 25 Apr 2026 23:24:58 -0400 Subject: [PATCH] Remove survey-based national calibration targets --- .gitignore | 1 + .../datasets/cps/enhanced_cps.py | 25 +- policyengine_us_data/db/etl_irs_soi.py | 43 +++ .../db/etl_national_targets.py | 78 +---- .../acs_housing_costs_2024.csv | 52 +++ .../pull_hardcoded_targets.py | 17 +- .../refresh_acs_housing_cost_targets.py | 76 ++++ policyengine_us_data/utils/__init__.py | 2 + policyengine_us_data/utils/loss.py | 330 +++++++++++++----- tests/integration/test_sparse_enhanced_cps.py | 8 +- tests/unit/calibration/test_loss_targets.py | 239 ++++++++++++- tests/unit/test_etl_national_targets.py | 29 ++ 12 files changed, 728 insertions(+), 172 deletions(-) create mode 100644 policyengine_us_data/storage/calibration_targets/acs_housing_costs_2024.csv create mode 100644 policyengine_us_data/storage/calibration_targets/refresh_acs_housing_cost_targets.py diff --git a/.gitignore b/.gitignore index 0a174f2fa..9607a08ef 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,7 @@ node_modules !population_by_state.csv !aca_spending_and_enrollment_2024.csv !aca_spending_and_enrollment_2025.csv +!policyengine_us_data/storage/calibration_targets/acs_housing_costs_2024.csv !real_estate_taxes_by_state_acs.csv !snap_state.csv !age_state.csv diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 57455617a..9d053d4c3 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -1,7 +1,9 @@ from policyengine_core.data import Dataset import pandas as pd from policyengine_us_data.utils import ( + ABSOLUTE_ERROR_SCALE_TARGETS, build_loss_matrix, + get_target_error_normalisation, HardConcrete, print_reweighting_diagnostics, set_seeds, @@ -113,6 +115,10 @@ def reweight( ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") + numerator_shift_np, error_denominator_np = get_target_error_normalisation( + target_names, + targets_array, + ) loss_matrix = torch.tensor(loss_matrix.values, dtype=torch.float32) nation_normalisation_factor = is_national * (1 / is_national.sum()) state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) @@ -121,6 +127,8 @@ def reweight( ) normalisation_factor = torch.tensor(normalisation_factor, dtype=torch.float32) targets_array = torch.tensor(targets_array, dtype=torch.float32) + numerator_shift = torch.tensor(numerator_shift_np, dtype=torch.float32) + error_denominator = torch.tensor(error_denominator_np, dtype=torch.float32) inv_mean_normalisation = 1 / np.mean(normalisation_factor.numpy()) @@ -132,7 +140,9 @@ def loss(weights): estimate = weights @ loss_matrix if torch.isnan(estimate).any(): raise ValueError("Estimate contains NaNs") - rel_error = (((estimate - targets_array) + 1) / (targets_array + 1)) ** 2 + rel_error = ( + (estimate - targets_array + numerator_shift) / error_denominator + ) ** 2 rel_error_normalized = inv_mean_normalisation * rel_error * normalisation_factor if torch.isnan(rel_error_normalized).any(): raise ValueError("Relative error contains NaNs") @@ -176,7 +186,10 @@ def loss(weights): ) df["epoch"] = i df["error"] = df.estimate - df.target - df["rel_error"] = df.error / df.target + df["error_denominator"] = error_denominator.detach().numpy() + df["rel_error"] = ( + df.error + numerator_shift.detach().numpy() + ) / df.error_denominator df["abs_error"] = df.error.abs() df["rel_abs_error"] = df.rel_error.abs() df["loss"] = df.rel_abs_error**2 @@ -203,6 +216,7 @@ def loss(weights): loss_matrix, targets_array, "L0 Sparse Solution", + target_names=target_names, ) return final_weights_sparse @@ -248,7 +262,12 @@ def generate(self): # Run the optimization procedure to get (close to) minimum loss weights for year in range(self.start_year, self.end_year + 1): loss_matrix, targets_array = build_loss_matrix(self.input_dataset, year) - zero_mask = np.isclose(targets_array, 0.0, atol=0.1) + scaled_zero_target_mask = loss_matrix.columns.isin( + ABSOLUTE_ERROR_SCALE_TARGETS.keys() + ) + zero_mask = np.isclose(targets_array, 0.0, atol=0.1) & ( + ~scaled_zero_target_mask + ) bad_mask = loss_matrix.columns.isin(bad_targets) keep_mask_bool = ~(zero_mask | bad_mask) keep_idx = np.where(keep_mask_bool)[0] diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index 2c386420b..d3ec6a824 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -450,6 +450,49 @@ def get_national_geography_soi_target( return _get_national_geography_soi_target_from_year(variable, geography_year) +def _get_state_geography_soi_targets_from_year( + variable: str, + geography_year: int, +) -> list[dict]: + spec = _get_geography_file_aggregate_target_spec(variable) + code = spec["code"] + + raw_df = extract_soi_data(geography_year) + state_rows = raw_df[(raw_df["STATE"] != "US") & (raw_df["agi_stub"] == 0)] + if "CONG_DISTRICT" in state_rows.columns: + state_rows = state_rows[state_rows["CONG_DISTRICT"] == 0] + if state_rows.empty: + raise ValueError( + f"IRS geography SOI file for {geography_year} is missing state rows " + f"for {variable}" + ) + + targets = [] + for row in state_rows.itertuples(index=False): + targets.append( + { + "variable": variable, + "source_year": geography_year, + "state_code": row.STATE, + "count": float(getattr(row, f"N{code}")), + "amount": float(getattr(row, f"A{code}")) * 1_000, + } + ) + + return sorted(targets, key=lambda target: target["state_code"]) + + +def get_state_geography_soi_targets( + variable: str, + dataset_year: int, + *, + lag: int = IRS_SOI_LAG_YEARS, +) -> list[dict]: + """Return state count and amount targets from the IRS geography file.""" + geography_year = get_geography_soi_year(dataset_year, lag=lag) + return _get_state_geography_soi_targets_from_year(variable, geography_year) + + def get_national_geography_soi_agi_targets( variable: str, dataset_year: int, diff --git a/policyengine_us_data/db/etl_national_targets.py b/policyengine_us_data/db/etl_national_targets.py index a671daedb..a86f8fd4a 100644 --- a/policyengine_us_data/db/etl_national_targets.py +++ b/policyengine_us_data/db/etl_national_targets.py @@ -112,20 +112,6 @@ def extract_national_targets(year: int = DEFAULT_YEAR): tax_expenditure_targets = [{**target} for target in raw_tax_expenditure_targets] direct_sum_targets = [ - { - "variable": "alimony_income", - "value": 13e9, - "source": "Survey-reported (post-TCJA grandfathered)", - "notes": "Alimony received - survey reported, not tax-filer restricted", - "year": 2024, - }, - { - "variable": "alimony_expense", - "value": 13e9, - "source": "Survey-reported (post-TCJA grandfathered)", - "notes": "Alimony paid - survey reported, not tax-filer restricted", - "year": 2024, - }, { "variable": "medicaid", "value": 871.7e9, @@ -140,20 +126,6 @@ def extract_national_targets(year: int = DEFAULT_YEAR): "notes": "Total household net worth", "year": 2024, }, - { - "variable": "health_insurance_premiums_without_medicare_part_b", - "value": 385e9, - "source": "MEPS/NHEA", - "notes": "Health insurance premiums excluding Medicare Part B", - "year": 2024, - }, - { - "variable": "other_medical_expenses", - "value": 278e9, - "source": "MEPS/NHEA", - "notes": "Out-of-pocket medical expenses", - "year": 2024, - }, { "variable": "medicare_part_b_premiums", "value": get_beneficiary_paid_medicare_part_b_premiums_target(2024), @@ -162,52 +134,24 @@ def extract_national_targets(year: int = DEFAULT_YEAR): "year": 2024, }, { - "variable": "over_the_counter_health_expenses", - "value": 72e9, - "source": "Consumer Expenditure Survey", - "notes": "OTC health products and supplies", - "year": 2024, - }, - { - "variable": "child_support_expense", - "value": 33e9, - "source": "Census Bureau", - "notes": "Child support payments", - "year": 2024, - }, - { - "variable": "child_support_received", - "value": 33e9, - "source": "Census Bureau", - "notes": "Child support received", - "year": 2024, - }, - { - "variable": "spm_unit_capped_work_childcare_expenses", - "value": 348e9, - "source": "Census Bureau SPM", - "notes": "Work and childcare expenses for SPM", - "year": 2024, - }, - { - "variable": "spm_unit_capped_housing_subsidy", - "value": 35e9, - "source": "HUD/Census", - "notes": "Housing subsidies", + "variable": "rent", + "value": 764_925_694_800, + "source": "Census ACS 2024 1-year table B25060", + "notes": "Sum of state aggregate contract rent, annualized from monthly ACS aggregate contract rent", "year": 2024, }, { "variable": "real_estate_taxes", - "value": 500e9, - "source": "Census Bureau", - "notes": "Property taxes paid", + "value": 370_014_207_400, + "source": "Census ACS 2024 1-year table B25090", + "notes": "Sum of state aggregate real estate taxes paid by owner-occupied housing units", "year": 2024, }, { - "variable": "rent", - "value": 735e9, - "source": "Census Bureau/BLS", - "notes": "Rental payments", + "variable": "childcare_expenses", + "value": 63_092e6, + "source": "BLS Consumer Expenditure Surveys CE LABSTAT", + "notes": "Series CXU670320LB0101M aggregate expenditure: babysitting, childcare, daycare, preschool", "year": 2024, }, { diff --git a/policyengine_us_data/storage/calibration_targets/acs_housing_costs_2024.csv b/policyengine_us_data/storage/calibration_targets/acs_housing_costs_2024.csv new file mode 100644 index 000000000..4954e0f14 --- /dev/null +++ b/policyengine_us_data/storage/calibration_targets/acs_housing_costs_2024.csv @@ -0,0 +1,52 @@ +state_code,state_fips,annual_contract_rent,real_estate_taxes +AK,02,1350681600,664772900 +AL,01,5761773600,1537253700 +AR,05,3760575600,1167041400 +AZ,04,16849603200,4320807000 +CA,06,143291068800,52872735400 +CO,08,17072544000,5750527500 +CT,09,8116260000,7275184600 +DC,11,4602276000,778233300 +DE,10,1652836800,656213100 +FL,12,57303682800,24312484700 +GA,13,21304225200,8707748600 +HI,15,4073208000,981165300 +IA,19,4069554000,3234507400 +ID,16,3091480800,1222009800 +IL,17,24729199200,21262263300 +IN,18,9115561200,4242347000 +KS,20,4246785600,2863525400 +KY,21,5821017600,2434868700 +LA,22,5928199200,1822794700 +MA,25,21342618000,12097297000 +MD,24,14212159200,7520628800 +ME,23,2153030400,1668939000 +MI,26,13242972000,10402220500 +MN,27,9724164000,6501643100 +MO,29,8718777600,4428280300 +MS,28,3018102000,1026895200 +MT,30,1873186800,1018759800 +NC,37,20318032800,7550042500 +ND,38,1474936800,608757100 +NE,31,3199722000,2283083400 +NH,33,2585438400,2900421200 +NJ,34,25845276000,22119447000 +NM,35,2917616400,1218092800 +NV,32,8914724400,2031449700 +NY,36,71916831600,32203085100 +OH,39,17617650000,12129649100 +OK,40,5521292400,2206132700 +OR,41,10933761600,4917685900 +PA,42,22028415600,14303332700 +RI,44,2401389600,1519517700 +SC,45,7908846000,2768317200 +SD,46,1274104800,825527300 +TN,47,12780411600,3724735100 +TX,48,67268908800,34936256600 +UT,49,6183264000,2346772700 +VA,51,20114900400,8760836100 +VT,50,1119537600,1171089500 +WA,53,23878054800,10671295800 +WI,55,10165308000,6958356700 +WV,54,1337834400,584045200 +WY,56,793893600,505130800 diff --git a/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py b/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py index 16e92ea01..2c310b89e 100644 --- a/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py +++ b/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py @@ -3,26 +3,13 @@ from policyengine_us_data.storage import CALIBRATION_FOLDER """ -Hardcoded targets for the year 2024 from CPS-derived statistics and other sources. Include medical expenses, sum of SPM thresholds, and child support expenses. +Hardcoded targets for the year 2024 from administrative and +authoritative aggregate sources. """ HARD_CODED_TOTALS = { - "health_insurance_premiums_without_medicare_part_b": 385e9, - "other_medical_expenses": 278e9, "medicare_part_b_premiums": 112e9, - "over_the_counter_health_expenses": 72e9, - "spm_unit_spm_threshold": 3_945e9, - "child_support_expense": 33e9, - "child_support_received": 33e9, - "spm_unit_capped_work_childcare_expenses": 348e9, - "spm_unit_capped_housing_subsidy": 35e9, "tanf": 7_788_317_474.55, - # Alimony could be targeted via SOI - "alimony_income": 13e9, - "alimony_expense": 13e9, - # Rough estimate, not CPS derived - "real_estate_taxes": 500e9, # Rough estimate between 350bn and 600bn total property tax collections - "rent": 735e9, # ACS total uprated by CPI # Table 5A from https://www.irs.gov/statistics/soi-tax-stats-individual-information-return-form-w2-statistics # shows $38,316,190,000 in Box 7: Social security tips (2018) # Wages and salaries grew 32% from 2018 to 2023: https://fred.stlouisfed.org/graph/?g=1J0CC diff --git a/policyengine_us_data/storage/calibration_targets/refresh_acs_housing_cost_targets.py b/policyengine_us_data/storage/calibration_targets/refresh_acs_housing_cost_targets.py new file mode 100644 index 000000000..5718468cc --- /dev/null +++ b/policyengine_us_data/storage/calibration_targets/refresh_acs_housing_cost_targets.py @@ -0,0 +1,76 @@ +import csv +import json +from urllib.request import urlopen + +from policyengine_us_data.storage import CALIBRATION_FOLDER +from policyengine_us_data.storage.calibration_targets.pull_soi_targets import ( + STATE_ABBR_TO_FIPS, +) + + +YEAR = 2024 +ACS_DATASET = "acs/acs1" +STATE_FIPS_TO_ABBR = { + fips: state_code for state_code, fips in STATE_ABBR_TO_FIPS.items() +} + + +def fetch_acs_housing_cost_targets(year: int = YEAR) -> list[dict]: + """Fetch ACS state rent and property-tax aggregates. + + B25060 is aggregate monthly contract rent for renter-occupied units + paying cash rent. We annualize it to match the yearly `rent` variable. + B25090 is aggregate real estate taxes paid by owner-occupied units. + """ + variables = "NAME,B25060_001E,B25090_001E" + url = ( + f"https://api.census.gov/data/{year}/{ACS_DATASET}" + f"?get={variables}&for=state:*" + ) + with urlopen(url) as response: + rows = json.load(response) + + header = rows[0] + column_index = {column: index for index, column in enumerate(header)} + + targets = [] + for row in rows[1:]: + state_fips = row[column_index["state"]] + state_code = STATE_FIPS_TO_ABBR.get(state_fips) + if state_code is None: + continue + + monthly_contract_rent = float(row[column_index["B25060_001E"]]) + real_estate_taxes = float(row[column_index["B25090_001E"]]) + targets.append( + { + "state_code": state_code, + "state_fips": state_fips, + "annual_contract_rent": int(monthly_contract_rent * 12), + "real_estate_taxes": int(real_estate_taxes), + } + ) + + return sorted(targets, key=lambda target: target["state_code"]) + + +def main() -> None: + targets = fetch_acs_housing_cost_targets() + output_path = CALIBRATION_FOLDER / f"acs_housing_costs_{YEAR}.csv" + with output_path.open("w", newline="") as output: + writer = csv.DictWriter( + output, + fieldnames=[ + "state_code", + "state_fips", + "annual_contract_rent", + "real_estate_taxes", + ], + lineterminator="\n", + ) + writer.writeheader() + writer.writerows(targets) + + +if __name__ == "__main__": + main() diff --git a/policyengine_us_data/utils/__init__.py b/policyengine_us_data/utils/__init__.py index 70db28d75..b2a964d6b 100644 --- a/policyengine_us_data/utils/__init__.py +++ b/policyengine_us_data/utils/__init__.py @@ -9,8 +9,10 @@ ) __all__ = [ + "ABSOLUTE_ERROR_SCALE_TARGETS", "HardConcrete", "build_loss_matrix", + "get_target_error_normalisation", "print_reweighting_diagnostics", "set_seeds", ] diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index ce71696cc..41a64ac2d 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -15,7 +15,10 @@ from policyengine_us_data.utils.cms_medicare import ( get_beneficiary_paid_medicare_part_b_premiums_target, ) -from policyengine_us_data.db.etl_irs_soi import get_national_geography_soi_target +from policyengine_us_data.db.etl_irs_soi import ( + get_national_geography_soi_target, + get_state_geography_soi_targets, +) from policyengine_core.reforms import Reform from policyengine_us_data.utils.soi import pe_to_soi, get_soi @@ -27,24 +30,10 @@ # database so this dict can be deleted. See PR #488. HARD_CODED_TOTALS = { - "health_insurance_premiums_without_medicare_part_b": 385e9, - "other_medical_expenses": 278e9, "medicare_part_b_premiums": get_beneficiary_paid_medicare_part_b_premiums_target( 2024 ), - "over_the_counter_health_expenses": 72e9, - "spm_unit_spm_threshold": 3_945e9, - "child_support_expense": 33e9, - "child_support_received": 33e9, - "spm_unit_capped_work_childcare_expenses": 348e9, - "spm_unit_capped_housing_subsidy": 35e9, "tanf": 7_788_317_474.55, - # Alimony could be targeted via SOI - "alimony_income": 13e9, - "alimony_expense": 13e9, - # Rough estimate, not CPS derived - "real_estate_taxes": 500e9, # Rough estimate between 350bn and 600bn total property tax collections - "rent": 735e9, # ACS total uprated by CPI # Table 5A from https://www.irs.gov/statistics/soi-tax-stats-individual-information-return-form-w2-statistics # shows $38,316,190,000 in Box 7: Social security tips (2018) # Wages and salaries grew 32% from 2018 to 2023: https://fred.stlouisfed.org/graph/?g=1J0CC @@ -110,6 +99,35 @@ ], } +AGE_BUCKETED_HEALTH_TARGETS = ("medicare_part_b_premiums",) + +BLS_CE_TOTALS = { + # BLS Consumer Expenditure Surveys, CE LABSTAT series + # CXU670320LB0101M, aggregate expenditure (AG) in 2024. + # Item: "Babysitting, childcare, daycare, preschool"; + # AG is reported in millions of dollars. + "childcare_expenses": 63_092e6, +} + +TRANSFER_BALANCE_TARGETS = { + "nation/accounting/alimony_paid_minus_received": ( + "alimony_expense", + "alimony_income", + ), + "nation/accounting/child_support_paid_minus_received": ( + "child_support_expense", + "child_support_received", + ), +} + +ABSOLUTE_ERROR_SCALE_TARGETS = { + # These are accounting identities, not gross flow targets. Use a + # target-specific scale so zero-dollar targets do not get dropped + # by sparse ECPS or dominate the dense reweighting objective. + target: 1e9 + for target in TRANSFER_BALANCE_TARGETS +} + ACA_SPENDING_TARGETS = { 2024: 98e9, } @@ -511,6 +529,166 @@ def _add_ctc_targets(loss_matrix, targets_list, sim, time_period): return targets_list, loss_matrix +def _add_real_estate_tax_targets(loss_matrix, targets_list, sim, time_period): + """Add IRS SOI real-estate-tax amount and count targets. + + These targets correspond to itemizing filers with positive Schedule A + real-estate-tax amounts from the IRS geography file, not total + owner-occupied property-tax payments. + """ + target = get_national_geography_soi_target("real_estate_taxes", time_period) + + real_estate_taxes_person = sim.calculate( + "real_estate_taxes", + period=time_period, + ).values.astype(np.float32) + real_estate_taxes_tax_unit = sim.map_result( + real_estate_taxes_person, + "person", + "tax_unit", + ).astype(np.float32) + is_filer = sim.calculate("tax_unit_is_filer", period=time_period).values > 0 + itemizes = sim.calculate("tax_unit_itemizes", period=time_period).values > 0 + domain_mask = is_filer & itemizes & (real_estate_taxes_tax_unit > 0) + + household_amount = sim.map_result( + real_estate_taxes_tax_unit * domain_mask.astype(np.float32), + "tax_unit", + "household", + ).astype(np.float32) + household_count = sim.map_result( + domain_mask.astype(np.float32), + "tax_unit", + "household", + ).astype(np.float32) + + label = "nation/irs/real_estate_taxes" + loss_matrix[label] = household_amount + if any(pd.isna(loss_matrix[label])): + raise ValueError(f"Missing values for {label}") + targets_list.append(target["amount"]) + + label = "nation/irs/real_estate_taxes_count" + loss_matrix[label] = household_count + if any(pd.isna(loss_matrix[label])): + raise ValueError(f"Missing values for {label}") + targets_list.append(target["count"]) + + state_code = sim.calculate( + "state_code", + map_to="household", + period=time_period, + ).values + for state_target in get_state_geography_soi_targets( + "real_estate_taxes", + time_period, + ): + in_state = (state_code == state_target["state_code"]).astype(np.float32) + + label = f"state/irs/real_estate_taxes/{state_target['state_code']}" + loss_matrix[label] = household_amount * in_state + if any(pd.isna(loss_matrix[label])): + raise ValueError(f"Missing values for {label}") + targets_list.append(state_target["amount"]) + + label = f"state/irs/real_estate_taxes_count/{state_target['state_code']}" + loss_matrix[label] = household_count * in_state + if any(pd.isna(loss_matrix[label])): + raise ValueError(f"Missing values for {label}") + targets_list.append(state_target["count"]) + + return targets_list, loss_matrix + + +def _add_acs_housing_cost_targets(loss_matrix, targets_list, sim, time_period): + """Add ACS component targets for rent and all-owner property taxes.""" + targets, _ = _load_yeared_target_csv("acs_housing_costs", time_period) + state_code = sim.calculate( + "state_code", + map_to="household", + period=time_period, + ).values + + target_columns = { + "rent": "annual_contract_rent", + "real_estate_taxes": "real_estate_taxes", + } + for variable, target_column in target_columns.items(): + values = sim.calculate( + variable, + map_to="household", + period=time_period, + ).values + + label = f"nation/census/acs/{variable}" + loss_matrix[label] = values + if any(pd.isna(loss_matrix[label])): + raise ValueError(f"Missing values for {label}") + targets_list.append(float(targets[target_column].sum())) + + for row in targets.itertuples(index=False): + in_state = (state_code == row.state_code).astype(np.float32) + label = f"state/census/acs/{variable}/{row.state_code}" + loss_matrix[label] = values * in_state + if any(pd.isna(loss_matrix[label])): + raise ValueError(f"Missing values for {label}") + targets_list.append(float(getattr(row, target_column))) + + return targets_list, loss_matrix + + +def _add_bls_ce_targets(loss_matrix, targets_list, sim, time_period): + """Add BLS Consumer Expenditure component-spending targets.""" + for variable, target in BLS_CE_TOTALS.items(): + label = f"nation/bls/ce/{variable}" + loss_matrix[label] = sim.calculate( + variable, + map_to="household", + period=time_period, + ).values + if any(pd.isna(loss_matrix[label])): + raise ValueError(f"Missing values for {label}") + targets_list.append(target) + + return targets_list, loss_matrix + + +def _add_transfer_balance_targets(loss_matrix, targets_list, sim, time_period): + """Add paid-minus-received accounting targets for private transfers.""" + for label, (paid_variable, received_variable) in TRANSFER_BALANCE_TARGETS.items(): + paid = sim.calculate( + paid_variable, + map_to="household", + period=time_period, + ).values + received = sim.calculate( + received_variable, + map_to="household", + period=time_period, + ).values + loss_matrix[label] = paid - received + if any(pd.isna(loss_matrix[label])): + raise ValueError(f"Missing values for {label}") + targets_list.append(0.0) + + return targets_list, loss_matrix + + +def get_target_error_normalisation(target_names, targets_array): + """Return numerator shifts and denominators for target loss scaling.""" + target_names = np.asarray(target_names) + targets_array = np.asarray(targets_array, dtype=np.float64) + numerator_shift = np.ones_like(targets_array, dtype=np.float64) + denominator = targets_array + 1 + + for label, scale in ABSOLUTE_ERROR_SCALE_TARGETS.items(): + mask = target_names == label + numerator_shift[mask] = 0.0 + denominator[mask] = scale + + return numerator_shift, denominator + + def build_loss_matrix(dataset: type, time_period): loss_matrix = pd.DataFrame() df = pe_to_soi(dataset, time_period) @@ -778,6 +956,13 @@ def build_loss_matrix(dataset: type, time_period): time_period, ) + targets_array, loss_matrix = _add_real_estate_tax_targets( + loss_matrix, + targets_array, + sim, + time_period, + ) + # Tax filer counts by AGI band (SOI Table 1.1). Calibrates total # filers (not just taxable returns), with granular bands sourced # from the latest SOI year <= calibration year to avoid hardcoding @@ -820,6 +1005,27 @@ def build_loss_matrix(dataset: type, time_period): raise ValueError(f"Missing values for {label}") targets_array.append(target) + targets_array, loss_matrix = _add_acs_housing_cost_targets( + loss_matrix, + targets_array, + sim, + time_period, + ) + + targets_array, loss_matrix = _add_bls_ce_targets( + loss_matrix, + targets_array, + sim, + time_period, + ) + + targets_array, loss_matrix = _add_transfer_balance_targets( + loss_matrix, + targets_array, + sim, + time_period, + ) + # Negative household market income total rough estimate from the IRS SOI PUF market_income = sim.calculate("household_market_income").values @@ -838,6 +1044,8 @@ def build_loss_matrix(dataset: type, time_period): # The top row is treated as unbounded (age >= lower_bound) so the # 90+ population is constrained by an age-specific target rather than # only by the national total. See issue #768. + # Keep only Medicare Part B: the other household medical-expense + # aggregates are survey-based and should not drive national calibration. healthcare = pd.read_csv(CALIBRATION_FOLDER / "healthcare_spending.csv") top_age_lower_bound = int(healthcare["age_10_year_lower_bound"].max()) @@ -851,12 +1059,7 @@ def build_loss_matrix(dataset: type, time_period): else: in_age_range = (age >= age_lower_bound) * (age < age_lower_bound + 10) label_suffix = f"age_{age_lower_bound}_to_{age_lower_bound + 9}" - for expense_type in [ - "health_insurance_premiums_without_medicare_part_b", - "over_the_counter_health_expenses", - "other_medical_expenses", - "medicare_part_b_premiums", - ]: + for expense_type in AGE_BUCKETED_HEALTH_TARGETS: label = f"nation/census/{expense_type}/{label_suffix}" value = sim.calculate(expense_type).values loss_matrix[label] = sim.map_result( @@ -864,26 +1067,6 @@ def build_loss_matrix(dataset: type, time_period): ) targets_array.append(row[expense_type]) - # AGI by SPM threshold totals - - spm_threshold_agi = pd.read_csv(CALIBRATION_FOLDER / "spm_threshold_agi.csv") - - for _, row in spm_threshold_agi.iterrows(): - spm_unit_agi = sim.calculate("adjusted_gross_income", map_to="spm_unit").values - spm_threshold = sim.calculate("spm_unit_spm_threshold").values - in_threshold_range = (spm_threshold >= row["lower_spm_threshold"]) * ( - spm_threshold < row["upper_spm_threshold"] - ) - label = f"nation/census/agi_in_spm_threshold_decile_{int(row['decile'])}" - loss_matrix[label] = sim.map_result( - in_threshold_range * spm_unit_agi, "spm_unit", "household" - ) - targets_array.append(row["adjusted_gross_income"]) - - label = f"nation/census/count_in_spm_threshold_decile_{int(row['decile'])}" - loss_matrix[label] = sim.map_result(in_threshold_range, "spm_unit", "household") - targets_array.append(row["count"]) - # Population by state and population under 5 by state state_population = pd.read_csv(CALIBRATION_FOLDER / "population_by_state.csv") @@ -1080,10 +1263,6 @@ def build_loss_matrix(dataset: type, time_period): targets_array.extend(agi_state_targets) loss_matrix = _add_agi_metric_columns(loss_matrix, sim) - targets_array, loss_matrix = _add_state_real_estate_taxes( - loss_matrix, targets_array, sim - ) - snap_state_target_names, snap_state_targets = _add_snap_state_targets(sim) targets_array.extend(snap_state_targets) loss_matrix = _add_snap_metric_columns(loss_matrix, sim) @@ -1219,41 +1398,6 @@ def _add_agi_metric_columns( return loss_matrix -def _add_state_real_estate_taxes(loss_matrix, targets_list, sim): - """ - Add state real estate taxes to the loss matrix and targets list. - """ - # Read the real estate taxes data - real_estate_taxes_targets = pd.read_csv( - CALIBRATION_FOLDER / "real_estate_taxes_by_state_acs.csv" - ) - national_total = HARD_CODED_TOTALS["real_estate_taxes"] - state_sum = real_estate_taxes_targets["real_estate_taxes_bn"].sum() * 1e9 - national_to_state_diff = national_total / state_sum - real_estate_taxes_targets["real_estate_taxes_bn"] *= national_to_state_diff - real_estate_taxes_targets["real_estate_taxes_bn"] = ( - real_estate_taxes_targets["real_estate_taxes_bn"] * 1e9 - ) - - assert np.isclose( - real_estate_taxes_targets["real_estate_taxes_bn"].sum(), - national_total, - rtol=1e-8, - ), "Real estate tax totals do not sum to national target" - - targets_list.extend(real_estate_taxes_targets["real_estate_taxes_bn"].tolist()) - - real_estate_taxes = sim.calculate("real_estate_taxes", map_to="household").values - state = sim.calculate("state_code", map_to="household").values - - for _, r in real_estate_taxes_targets.iterrows(): - in_state = (state == r["state_code"]).astype(float) - label = f"state/real_estate_taxes/{r['state_code']}" - loss_matrix[label] = real_estate_taxes * in_state - - return targets_list, loss_matrix - - def _add_snap_state_targets(sim): """ Add snap targets at the state level, adjusted in aggregate to the sim @@ -1317,7 +1461,9 @@ def _add_snap_metric_columns( return loss_matrix -def print_reweighting_diagnostics(optimised_weights, loss_matrix, targets_array, label): +def print_reweighting_diagnostics( + optimised_weights, loss_matrix, targets_array, label, target_names=None +): # Convert all inputs to NumPy arrays right at the start optimised_weights_np = ( optimised_weights.numpy() @@ -1334,6 +1480,10 @@ def print_reweighting_diagnostics(optimised_weights, loss_matrix, targets_array, if hasattr(targets_array, "numpy") else np.asarray(targets_array) ) + if target_names is None and hasattr(loss_matrix, "columns"): + target_names = np.asarray(loss_matrix.columns) + elif target_names is not None: + target_names = np.asarray(target_names) logging.info(f"\n\n---{label}: reweighting quick diagnostics----\n") logging.info( @@ -1344,10 +1494,20 @@ def print_reweighting_diagnostics(optimised_weights, loss_matrix, targets_array, # All subsequent calculations use the guaranteed NumPy versions estimate = optimised_weights_np @ loss_matrix_np - rel_error = (((estimate - targets_array_np) + 1) / (targets_array_np + 1)) ** 2 - within_10_percent_mask = np.abs(estimate - targets_array_np) <= ( - 0.10 * np.abs(targets_array_np) - ) + if target_names is None: + numerator_shift = np.ones_like(targets_array_np, dtype=np.float64) + denominator = targets_array_np + 1 + else: + numerator_shift, denominator = get_target_error_normalisation( + target_names, targets_array_np + ) + rel_error = ((estimate - targets_array_np + numerator_shift) / denominator) ** 2 + tolerance = 0.10 * np.abs(targets_array_np) + if target_names is not None: + for target_name, scale in ABSOLUTE_ERROR_SCALE_TARGETS.items(): + mask = target_names == target_name + tolerance[mask] = 0.10 * scale + within_10_percent_mask = np.abs(estimate - targets_array_np) <= tolerance percent_within_10 = np.mean(within_10_percent_mask) * 100 logging.info( f"rel_error: min: {np.min(rel_error):.2f}\n" diff --git a/tests/integration/test_sparse_enhanced_cps.py b/tests/integration/test_sparse_enhanced_cps.py index 91728ed57..ab03699e9 100644 --- a/tests/integration/test_sparse_enhanced_cps.py +++ b/tests/integration/test_sparse_enhanced_cps.py @@ -11,6 +11,7 @@ from policyengine_core.reforms import Reform from policyengine_us import Microsimulation from policyengine_us_data.utils import ( + ABSOLUTE_ERROR_SCALE_TARGETS, build_loss_matrix, print_reweighting_diagnostics, ) @@ -106,7 +107,12 @@ def test_sparse_ecps(sim): ] loss_matrix, targets_array = build_loss_matrix(sim.dataset, 2024) - zero_mask = np.isclose(targets_array, 0.0, atol=0.1) + scaled_zero_target_mask = loss_matrix.columns.isin( + ABSOLUTE_ERROR_SCALE_TARGETS.keys() + ) + zero_mask = np.isclose(targets_array, 0.0, atol=0.1) & ( + ~scaled_zero_target_mask + ) bad_mask = loss_matrix.columns.isin(bad_targets) keep_mask_bool = ~(zero_mask | bad_mask) keep_idx = np.where(keep_mask_bool)[0] diff --git a/tests/unit/calibration/test_loss_targets.py b/tests/unit/calibration/test_loss_targets.py index a6d030ca8..ff870ee23 100644 --- a/tests/unit/calibration/test_loss_targets.py +++ b/tests/unit/calibration/test_loss_targets.py @@ -1,14 +1,26 @@ +import inspect + import numpy as np import pandas as pd import pytest from policyengine_us_data.utils.loss import ( + ABSOLUTE_ERROR_SCALE_TARGETS, + AGE_BUCKETED_HEALTH_TARGETS, + BLS_CE_TOTALS, + TRANSFER_BALANCE_TARGETS, _get_aca_national_targets, + _add_acs_housing_cost_targets, + _add_bls_ce_targets, _add_ctc_targets, + _add_real_estate_tax_targets, + _add_transfer_balance_targets, + get_target_error_normalisation, _get_medicaid_national_targets, _load_aca_spending_and_enrollment_targets, _load_medicaid_enrollment_targets, HARD_CODED_TOTALS, + build_loss_matrix, ) @@ -61,7 +73,7 @@ def test_medicaid_national_targets_use_2025_values(): class _FakeArrayResult: def __init__(self, values): - self.values = np.asarray(values, dtype=np.float32) + self.values = np.asarray(values) class _FakeSimulation: @@ -126,5 +138,230 @@ def test_add_ctc_targets(monkeypatch): ) +class _FakeRealEstateTaxSimulation: + def calculate(self, variable, map_to=None, period=None): + values = { + ("real_estate_taxes", None): [100.0, 0.0, 50.0, 0.0], + ("tax_unit_is_filer", None): [1.0, 1.0], + ("tax_unit_itemizes", None): [1.0, 0.0], + ("state_code", "household"): ["CA", "NY"], + } + key = (variable, map_to) + if key not in values: + raise AssertionError(f"Unexpected calculate call {key!r}") + return _FakeArrayResult(values[key]) + + def map_result(self, values, source_entity, target_entity, how=None): + arr = np.asarray(values, dtype=np.float32) + if (source_entity, target_entity) == ("person", "tax_unit"): + return np.array([arr[:2].sum(), arr[2:].sum()], dtype=np.float32) + if (source_entity, target_entity) == ("tax_unit", "household"): + return arr.astype(np.float32) + raise AssertionError( + f"Unexpected map_result call {(source_entity, target_entity, how)!r}" + ) + + +def test_add_real_estate_tax_targets(monkeypatch): + monkeypatch.setattr( + "policyengine_us_data.utils.loss.get_national_geography_soi_target", + lambda variable, year: {"amount": 123_000.0, "count": 17.0}, + ) + monkeypatch.setattr( + "policyengine_us_data.utils.loss.get_state_geography_soi_targets", + lambda variable, year: [ + {"state_code": "CA", "amount": 100_000.0, "count": 10.0}, + {"state_code": "NY", "amount": 50_000.0, "count": 5.0}, + ], + ) + + targets, loss_matrix = _add_real_estate_tax_targets( + pd.DataFrame(), + [], + _FakeRealEstateTaxSimulation(), + 2024, + ) + + assert targets == [123_000.0, 17.0, 100_000.0, 10.0, 50_000.0, 5.0] + np.testing.assert_array_equal( + loss_matrix["nation/irs/real_estate_taxes"], + np.array([100.0, 0.0], dtype=np.float32), + ) + np.testing.assert_array_equal( + loss_matrix["nation/irs/real_estate_taxes_count"], + np.array([1.0, 0.0], dtype=np.float32), + ) + np.testing.assert_array_equal( + loss_matrix["state/irs/real_estate_taxes/CA"], + np.array([100.0, 0.0], dtype=np.float32), + ) + np.testing.assert_array_equal( + loss_matrix["state/irs/real_estate_taxes/NY"], + np.array([0.0, 0.0], dtype=np.float32), + ) + np.testing.assert_array_equal( + loss_matrix["state/irs/real_estate_taxes_count/CA"], + np.array([1.0, 0.0], dtype=np.float32), + ) + np.testing.assert_array_equal( + loss_matrix["state/irs/real_estate_taxes_count/NY"], + np.array([0.0, 0.0], dtype=np.float32), + ) + + +class _FakeAcsHousingCostSimulation: + def calculate(self, variable, map_to=None, period=None): + values = { + ("state_code", "household"): ["CA", "NY", "CA"], + ("rent", "household"): [10.0, 20.0, 30.0], + ("real_estate_taxes", "household"): [1.0, 2.0, 3.0], + ("childcare_expenses", "household"): [4.0, 0.0, 6.0], + } + key = (variable, map_to) + if key not in values: + raise AssertionError(f"Unexpected calculate call {key!r}") + return _FakeArrayResult(values[key]) + + +def test_add_acs_housing_cost_targets(monkeypatch): + monkeypatch.setattr( + "policyengine_us_data.utils.loss._load_yeared_target_csv", + lambda prefix, year: ( + pd.DataFrame( + { + "state_code": ["CA", "NY"], + "annual_contract_rent": [100.0, 200.0], + "real_estate_taxes": [30.0, 40.0], + } + ), + 2024, + ), + ) + + targets, loss_matrix = _add_acs_housing_cost_targets( + pd.DataFrame(), + [], + _FakeAcsHousingCostSimulation(), + 2024, + ) + + assert targets == [300.0, 100.0, 200.0, 70.0, 30.0, 40.0] + np.testing.assert_array_equal( + loss_matrix["nation/census/acs/rent"], + np.array([10.0, 20.0, 30.0]), + ) + np.testing.assert_array_equal( + loss_matrix["state/census/acs/rent/CA"], + np.array([10.0, 0.0, 30.0]), + ) + np.testing.assert_array_equal( + loss_matrix["state/census/acs/real_estate_taxes/NY"], + np.array([0.0, 2.0, 0.0]), + ) + + +def test_bls_ce_childcare_target(): + assert BLS_CE_TOTALS["childcare_expenses"] == pytest.approx(63_092e6) + + targets, loss_matrix = _add_bls_ce_targets( + pd.DataFrame(), + [], + _FakeAcsHousingCostSimulation(), + 2024, + ) + + assert targets == [63_092e6] + np.testing.assert_array_equal( + loss_matrix["nation/bls/ce/childcare_expenses"], + np.array([4.0, 0.0, 6.0]), + ) + + +class _FakeTransferBalanceSimulation: + def calculate(self, variable, map_to=None, period=None): + values = { + "alimony_expense": [100.0, 0.0, 20.0], + "alimony_income": [30.0, 40.0, 0.0], + "child_support_expense": [0.0, 50.0, 10.0], + "child_support_received": [20.0, 10.0, 40.0], + } + if variable not in values: + raise AssertionError(f"Unexpected variable {variable!r}") + assert map_to == "household" + assert period == 2024 + return _FakeArrayResult(values[variable]) + + +def test_transfer_balance_targets_are_net_zero_accounting_constraints(): + targets, loss_matrix = _add_transfer_balance_targets( + pd.DataFrame(), + [], + _FakeTransferBalanceSimulation(), + 2024, + ) + + assert targets == [0.0, 0.0] + assert set(TRANSFER_BALANCE_TARGETS) == { + "nation/accounting/alimony_paid_minus_received", + "nation/accounting/child_support_paid_minus_received", + } + np.testing.assert_array_equal( + loss_matrix["nation/accounting/alimony_paid_minus_received"], + np.array([70.0, -40.0, 20.0]), + ) + np.testing.assert_array_equal( + loss_matrix["nation/accounting/child_support_paid_minus_received"], + np.array([-20.0, 40.0, -30.0]), + ) + + +def test_transfer_balance_targets_use_absolute_error_scale(): + target_names = np.array( + [ + "nation/accounting/alimony_paid_minus_received", + "nation/census/snap", + ] + ) + numerator_shift, denominator = get_target_error_normalisation( + target_names, + np.array([0.0, 10.0]), + ) + + assert ABSOLUTE_ERROR_SCALE_TARGETS[ + "nation/accounting/alimony_paid_minus_received" + ] == pytest.approx(1e9) + np.testing.assert_array_equal(numerator_shift, np.array([0.0, 1.0])) + np.testing.assert_array_equal(denominator, np.array([1e9, 11.0])) + + def test_tanf_hardcoded_target_uses_fy2024_basic_assistance_total(): assert HARD_CODED_TOTALS["tanf"] == pytest.approx(7_788_317_474.55) + + +def test_hardcoded_totals_drop_survey_spm_targets(): + removed_targets = { + "alimony_income", + "alimony_expense", + "child_support_expense", + "child_support_received", + "health_insurance_premiums_without_medicare_part_b", + "other_medical_expenses", + "over_the_counter_health_expenses", + "spm_unit_spm_threshold", + "spm_unit_capped_housing_subsidy", + "spm_unit_capped_work_childcare_expenses", + } + + assert removed_targets.isdisjoint(HARD_CODED_TOTALS) + + +def test_age_bucketed_health_targets_keep_only_medicare_part_b(): + assert AGE_BUCKETED_HEALTH_TARGETS == ("medicare_part_b_premiums",) + + +def test_national_loss_excludes_survey_spm_threshold_decile_targets(): + source = inspect.getsource(build_loss_matrix) + + assert "spm_threshold_agi.csv" not in source + assert "agi_in_spm_threshold_decile" not in source + assert "count_in_spm_threshold_decile" not in source diff --git a/tests/unit/test_etl_national_targets.py b/tests/unit/test_etl_national_targets.py index 84d8c748b..534a6dbbf 100644 --- a/tests/unit/test_etl_national_targets.py +++ b/tests/unit/test_etl_national_targets.py @@ -8,6 +8,7 @@ create_database, ) from policyengine_us_data.db.etl_national_targets import ( + extract_national_targets, load_national_targets, ) @@ -199,3 +200,31 @@ def test_load_national_targets_supports_liheap_household_counts(tmp_path, monkey ).first() assert liheap_target is not None assert liheap_target.value == 5_876_646 + + +def test_extract_national_targets_drops_survey_spm_targets(): + targets = extract_national_targets(year=2024) + direct_sum_variables = { + target["variable"] for target in targets["direct_sum_targets"] + } + removed_targets = { + "alimony_income", + "alimony_expense", + "child_support_expense", + "child_support_received", + "health_insurance_premiums_without_medicare_part_b", + "other_medical_expenses", + "over_the_counter_health_expenses", + "spm_unit_capped_housing_subsidy", + "spm_unit_capped_work_childcare_expenses", + } + + assert removed_targets.isdisjoint(direct_sum_variables) + assert {"rent", "real_estate_taxes", "childcare_expenses"} <= direct_sum_variables + + direct_sum_targets = { + target["variable"]: target for target in targets["direct_sum_targets"] + } + assert direct_sum_targets["rent"]["value"] == 764_925_694_800 + assert direct_sum_targets["real_estate_taxes"]["value"] == 370_014_207_400 + assert direct_sum_targets["childcare_expenses"]["value"] == 63_092e6