Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ node_modules
!population_by_state.csv
!aca_spending_and_enrollment_2024.csv
!aca_spending_and_enrollment_2025.csv
!policyengine_us_data/storage/calibration_targets/acs_housing_costs_2024.csv
!real_estate_taxes_by_state_acs.csv
!snap_state.csv
!age_state.csv
Expand Down
25 changes: 22 additions & 3 deletions policyengine_us_data/datasets/cps/enhanced_cps.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from policyengine_core.data import Dataset
import pandas as pd
from policyengine_us_data.utils import (
ABSOLUTE_ERROR_SCALE_TARGETS,
build_loss_matrix,
get_target_error_normalisation,
HardConcrete,
print_reweighting_diagnostics,
set_seeds,
Expand Down Expand Up @@ -113,6 +115,10 @@ def reweight(
):
target_names = np.array(loss_matrix.columns)
is_national = loss_matrix.columns.str.startswith("nation/")
numerator_shift_np, error_denominator_np = get_target_error_normalisation(
target_names,
targets_array,
)
loss_matrix = torch.tensor(loss_matrix.values, dtype=torch.float32)
nation_normalisation_factor = is_national * (1 / is_national.sum())
state_normalisation_factor = ~is_national * (1 / (~is_national).sum())
Expand All @@ -121,6 +127,8 @@ def reweight(
)
normalisation_factor = torch.tensor(normalisation_factor, dtype=torch.float32)
targets_array = torch.tensor(targets_array, dtype=torch.float32)
numerator_shift = torch.tensor(numerator_shift_np, dtype=torch.float32)
error_denominator = torch.tensor(error_denominator_np, dtype=torch.float32)

inv_mean_normalisation = 1 / np.mean(normalisation_factor.numpy())

Expand All @@ -132,7 +140,9 @@ def loss(weights):
estimate = weights @ loss_matrix
if torch.isnan(estimate).any():
raise ValueError("Estimate contains NaNs")
rel_error = (((estimate - targets_array) + 1) / (targets_array + 1)) ** 2
rel_error = (
(estimate - targets_array + numerator_shift) / error_denominator
) ** 2
rel_error_normalized = inv_mean_normalisation * rel_error * normalisation_factor
if torch.isnan(rel_error_normalized).any():
raise ValueError("Relative error contains NaNs")
Expand Down Expand Up @@ -176,7 +186,10 @@ def loss(weights):
)
df["epoch"] = i
df["error"] = df.estimate - df.target
df["rel_error"] = df.error / df.target
df["error_denominator"] = error_denominator.detach().numpy()
df["rel_error"] = (
df.error + numerator_shift.detach().numpy()
) / df.error_denominator
df["abs_error"] = df.error.abs()
df["rel_abs_error"] = df.rel_error.abs()
df["loss"] = df.rel_abs_error**2
Expand All @@ -203,6 +216,7 @@ def loss(weights):
loss_matrix,
targets_array,
"L0 Sparse Solution",
target_names=target_names,
)

return final_weights_sparse
Expand Down Expand Up @@ -248,7 +262,12 @@ def generate(self):
# Run the optimization procedure to get (close to) minimum loss weights
for year in range(self.start_year, self.end_year + 1):
loss_matrix, targets_array = build_loss_matrix(self.input_dataset, year)
zero_mask = np.isclose(targets_array, 0.0, atol=0.1)
scaled_zero_target_mask = loss_matrix.columns.isin(
ABSOLUTE_ERROR_SCALE_TARGETS.keys()
)
zero_mask = np.isclose(targets_array, 0.0, atol=0.1) & (
~scaled_zero_target_mask
)
bad_mask = loss_matrix.columns.isin(bad_targets)
keep_mask_bool = ~(zero_mask | bad_mask)
keep_idx = np.where(keep_mask_bool)[0]
Expand Down
43 changes: 43 additions & 0 deletions policyengine_us_data/db/etl_irs_soi.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,49 @@ def get_national_geography_soi_target(
return _get_national_geography_soi_target_from_year(variable, geography_year)


def _get_state_geography_soi_targets_from_year(
    variable: str,
    geography_year: int,
) -> list[dict]:
    """Build per-state count and amount targets for one geography-file year.

    Args:
        variable: Name of the variable whose geography-file column code is
            looked up via ``_get_geography_file_aggregate_target_spec``.
        geography_year: Year of the IRS geography SOI file to read.

    Returns:
        One dict per retained state row with the keys ``variable``,
        ``source_year``, ``state_code``, ``count`` and ``amount``, ordered
        by ``state_code``.

    Raises:
        ValueError: If no state rows remain after filtering.
    """
    code = _get_geography_file_aggregate_target_spec(variable)["code"]
    count_column = f"N{code}"
    amount_column = f"A{code}"

    soi_df = extract_soi_data(geography_year)
    # Keep non-"US" rows with agi_stub == 0.
    # NOTE(review): presumably stub 0 is the all-AGI total row -- confirm.
    is_state_total = (soi_df["STATE"] != "US") & (soi_df["agi_stub"] == 0)
    state_rows = soi_df[is_state_total]
    if "CONG_DISTRICT" in state_rows.columns:
        # NOTE(review): district 0 looks like the state-level row in files
        # that also carry congressional districts -- confirm.
        state_rows = state_rows[state_rows["CONG_DISTRICT"] == 0]
    if state_rows.empty:
        raise ValueError(
            f"IRS geography SOI file for {geography_year} is missing state rows "
            f"for {variable}"
        )

    targets = [
        {
            "variable": variable,
            "source_year": geography_year,
            "state_code": record.STATE,
            "count": float(getattr(record, count_column)),
            # NOTE(review): the 1_000 factor suggests amounts are stored in
            # thousands of dollars -- confirm against the file layout.
            "amount": float(getattr(record, amount_column)) * 1_000,
        }
        for record in state_rows.itertuples(index=False)
    ]
    targets.sort(key=lambda target: target["state_code"])
    return targets


def get_state_geography_soi_targets(
    variable: str,
    dataset_year: int,
    *,
    lag: int = IRS_SOI_LAG_YEARS,
) -> list[dict]:
    """Return state-level count and amount targets for a dataset year.

    The geography-file year is resolved from ``dataset_year`` and ``lag`` by
    ``get_geography_soi_year``; the targets are then read from that year's
    IRS geography file.

    Args:
        variable: Name of the variable to build targets for.
        dataset_year: Year of the dataset being calibrated.
        lag: Passed to ``get_geography_soi_year`` as ``lag``.

    Returns:
        The list of per-state target dicts produced by
        ``_get_state_geography_soi_targets_from_year``.
    """
    return _get_state_geography_soi_targets_from_year(
        variable,
        get_geography_soi_year(dataset_year, lag=lag),
    )


def get_national_geography_soi_agi_targets(
variable: str,
dataset_year: int,
Expand Down
78 changes: 11 additions & 67 deletions policyengine_us_data/db/etl_national_targets.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,20 +112,6 @@ def extract_national_targets(year: int = DEFAULT_YEAR):
tax_expenditure_targets = [{**target} for target in raw_tax_expenditure_targets]

direct_sum_targets = [
{
"variable": "alimony_income",
"value": 13e9,
"source": "Survey-reported (post-TCJA grandfathered)",
"notes": "Alimony received - survey reported, not tax-filer restricted",
"year": 2024,
},
{
"variable": "alimony_expense",
"value": 13e9,
"source": "Survey-reported (post-TCJA grandfathered)",
"notes": "Alimony paid - survey reported, not tax-filer restricted",
"year": 2024,
},
{
"variable": "medicaid",
"value": 871.7e9,
Expand All @@ -140,20 +126,6 @@ def extract_national_targets(year: int = DEFAULT_YEAR):
"notes": "Total household net worth",
"year": 2024,
},
{
"variable": "health_insurance_premiums_without_medicare_part_b",
"value": 385e9,
"source": "MEPS/NHEA",
"notes": "Health insurance premiums excluding Medicare Part B",
"year": 2024,
},
{
"variable": "other_medical_expenses",
"value": 278e9,
"source": "MEPS/NHEA",
"notes": "Out-of-pocket medical expenses",
"year": 2024,
},
{
"variable": "medicare_part_b_premiums",
"value": get_beneficiary_paid_medicare_part_b_premiums_target(2024),
Expand All @@ -162,52 +134,24 @@ def extract_national_targets(year: int = DEFAULT_YEAR):
"year": 2024,
},
{
"variable": "over_the_counter_health_expenses",
"value": 72e9,
"source": "Consumer Expenditure Survey",
"notes": "OTC health products and supplies",
"year": 2024,
},
{
"variable": "child_support_expense",
"value": 33e9,
"source": "Census Bureau",
"notes": "Child support payments",
"year": 2024,
},
{
"variable": "child_support_received",
"value": 33e9,
"source": "Census Bureau",
"notes": "Child support received",
"year": 2024,
},
{
"variable": "spm_unit_capped_work_childcare_expenses",
"value": 348e9,
"source": "Census Bureau SPM",
"notes": "Work and childcare expenses for SPM",
"year": 2024,
},
{
"variable": "spm_unit_capped_housing_subsidy",
"value": 35e9,
"source": "HUD/Census",
"notes": "Housing subsidies",
"variable": "rent",
"value": 764_925_694_800,
"source": "Census ACS 2024 1-year table B25060",
"notes": "Sum of state aggregate contract rent, annualized from monthly ACS aggregate contract rent",
"year": 2024,
},
{
"variable": "real_estate_taxes",
"value": 500e9,
"source": "Census Bureau",
"notes": "Property taxes paid",
"value": 370_014_207_400,
"source": "Census ACS 2024 1-year table B25090",
"notes": "Sum of state aggregate real estate taxes paid by owner-occupied housing units",
"year": 2024,
},
{
"variable": "rent",
"value": 735e9,
"source": "Census Bureau/BLS",
"notes": "Rental payments",
"variable": "childcare_expenses",
"value": 63_092e6,
"source": "BLS Consumer Expenditure Surveys CE LABSTAT",
"notes": "Series CXU670320LB0101M aggregate expenditure: babysitting, childcare, daycare, preschool",
"year": 2024,
},
{
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
state_code,state_fips,annual_contract_rent,real_estate_taxes
AK,02,1350681600,664772900
AL,01,5761773600,1537253700
AR,05,3760575600,1167041400
AZ,04,16849603200,4320807000
CA,06,143291068800,52872735400
CO,08,17072544000,5750527500
CT,09,8116260000,7275184600
DC,11,4602276000,778233300
DE,10,1652836800,656213100
FL,12,57303682800,24312484700
GA,13,21304225200,8707748600
HI,15,4073208000,981165300
IA,19,4069554000,3234507400
ID,16,3091480800,1222009800
IL,17,24729199200,21262263300
IN,18,9115561200,4242347000
KS,20,4246785600,2863525400
KY,21,5821017600,2434868700
LA,22,5928199200,1822794700
MA,25,21342618000,12097297000
MD,24,14212159200,7520628800
ME,23,2153030400,1668939000
MI,26,13242972000,10402220500
MN,27,9724164000,6501643100
MO,29,8718777600,4428280300
MS,28,3018102000,1026895200
MT,30,1873186800,1018759800
NC,37,20318032800,7550042500
ND,38,1474936800,608757100
NE,31,3199722000,2283083400
NH,33,2585438400,2900421200
NJ,34,25845276000,22119447000
NM,35,2917616400,1218092800
NV,32,8914724400,2031449700
NY,36,71916831600,32203085100
OH,39,17617650000,12129649100
OK,40,5521292400,2206132700
OR,41,10933761600,4917685900
PA,42,22028415600,14303332700
RI,44,2401389600,1519517700
SC,45,7908846000,2768317200
SD,46,1274104800,825527300
TN,47,12780411600,3724735100
TX,48,67268908800,34936256600
UT,49,6183264000,2346772700
VA,51,20114900400,8760836100
VT,50,1119537600,1171089500
WA,53,23878054800,10671295800
WI,55,10165308000,6958356700
WV,54,1337834400,584045200
WY,56,793893600,505130800
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,13 @@
from policyengine_us_data.storage import CALIBRATION_FOLDER

"""
Hardcoded targets for the year 2024 from CPS-derived statistics and other sources. Include medical expenses, sum of SPM thresholds, and child support expenses.
Hardcoded targets for the year 2024 from administrative and
authoritative aggregate sources.
"""

HARD_CODED_TOTALS = {
"health_insurance_premiums_without_medicare_part_b": 385e9,
"other_medical_expenses": 278e9,
"medicare_part_b_premiums": 112e9,
"over_the_counter_health_expenses": 72e9,
"spm_unit_spm_threshold": 3_945e9,
"child_support_expense": 33e9,
"child_support_received": 33e9,
"spm_unit_capped_work_childcare_expenses": 348e9,
"spm_unit_capped_housing_subsidy": 35e9,
"tanf": 7_788_317_474.55,
# Alimony could be targeted via SOI
"alimony_income": 13e9,
"alimony_expense": 13e9,
# Rough estimate, not CPS derived
"real_estate_taxes": 500e9, # Rough estimate between 350bn and 600bn total property tax collections
"rent": 735e9, # ACS total uprated by CPI
# Table 5A from https://www.irs.gov/statistics/soi-tax-stats-individual-information-return-form-w2-statistics
# shows $38,316,190,000 in Box 7: Social security tips (2018)
# Wages and salaries grew 32% from 2018 to 2023: https://fred.stlouisfed.org/graph/?g=1J0CC
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import csv
import json
from urllib.request import urlopen

from policyengine_us_data.storage import CALIBRATION_FOLDER
from policyengine_us_data.storage.calibration_targets.pull_soi_targets import (
STATE_ABBR_TO_FIPS,
)


# Default ACS year to fetch; also embedded in the output CSV file name.
YEAR = 2024
# Census API dataset path segment placed after the year in the request URL.
ACS_DATASET = "acs/acs1"
# Inverse of STATE_ABBR_TO_FIPS, so API rows keyed by FIPS code can be mapped
# back to state abbreviations.
STATE_FIPS_TO_ABBR = {
    fips_code: abbreviation
    for abbreviation, fips_code in STATE_ABBR_TO_FIPS.items()
}


def fetch_acs_housing_cost_targets(
    year: int = YEAR,
    *,
    timeout: float = 60.0,
) -> list[dict]:
    """Fetch ACS state rent and property-tax aggregates.

    B25060 is aggregate monthly contract rent for renter-occupied units
    paying cash rent. We annualize it to match the yearly `rent` variable.
    B25090 is aggregate real estate taxes paid by owner-occupied units.

    Args:
        year: ACS year to request from the Census API.
        timeout: Seconds to wait on the HTTP request before giving up.
            Without a timeout ``urlopen`` can block indefinitely on a
            stalled connection.

    Returns:
        One dict per state found in ``STATE_FIPS_TO_ABBR`` with the keys
        ``state_code``, ``state_fips``, ``annual_contract_rent`` and
        ``real_estate_taxes``, sorted by ``state_code``. Rows whose FIPS
        code is not in ``STATE_FIPS_TO_ABBR`` are skipped.

    Raises:
        Whatever ``urlopen`` / ``json.load`` raise on network failure,
        timeout, or a malformed response.
    """
    variables = "NAME,B25060_001E,B25090_001E"
    url = (
        f"https://api.census.gov/data/{year}/{ACS_DATASET}"
        f"?get={variables}&for=state:*"
    )
    # Bound the request so an unresponsive API cannot hang the script.
    with urlopen(url, timeout=timeout) as response:
        rows = json.load(response)

    # The first row of the response is the header naming each column.
    header = rows[0]
    column_index = {column: index for index, column in enumerate(header)}
    # Resolve column positions once instead of on every row.
    state_idx = column_index["state"]
    rent_idx = column_index["B25060_001E"]
    taxes_idx = column_index["B25090_001E"]

    targets = []
    for row in rows[1:]:
        state_fips = row[state_idx]
        state_code = STATE_FIPS_TO_ABBR.get(state_fips)
        if state_code is None:
            # Geography not present in the abbreviation map; skip it.
            continue

        monthly_contract_rent = float(row[rent_idx])
        real_estate_taxes = float(row[taxes_idx])
        targets.append(
            {
                "state_code": state_code,
                "state_fips": state_fips,
                # Monthly aggregate -> annual aggregate.
                "annual_contract_rent": int(monthly_contract_rent * 12),
                "real_estate_taxes": int(real_estate_taxes),
            }
        )

    return sorted(targets, key=lambda target: target["state_code"])


def main() -> None:
    """Fetch the ACS housing-cost targets and write them to a CSV file.

    The file is written to ``CALIBRATION_FOLDER`` as
    ``acs_housing_costs_{YEAR}.csv`` with a header row and "\\n" line endings.
    """
    rows = fetch_acs_housing_cost_targets()
    columns = [
        "state_code",
        "state_fips",
        "annual_contract_rent",
        "real_estate_taxes",
    ]
    csv_path = CALIBRATION_FOLDER / f"acs_housing_costs_{YEAR}.csv"
    # newline="" lets the csv writer control line endings itself.
    with csv_path.open("w", newline="") as handle:
        writer = csv.DictWriter(
            handle,
            fieldnames=columns,
            lineterminator="\n",
        )
        writer.writeheader()
        writer.writerows(rows)


if __name__ == "__main__":
main()
Loading
Loading