Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/issue-813-elective-filer.changed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Replace the flat voluntary filing rate with a demographic elective-filer table based on children, wage income, and head age, and add sanity-check metrics for non-EITC filer alignment.
143 changes: 143 additions & 0 deletions policyengine_us_data/calibration/sanity_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
"""

import logging
import re
from typing import List
from pathlib import Path

import h5py
import numpy as np
Expand All @@ -33,9 +35,114 @@
"takes_up_head_start_if_eligible",
"takes_up_early_head_start_if_eligible",
"takes_up_dc_ptc",
"would_file_taxes_voluntarily",
]


def _eitc_target_source_year(eitc_targets_path: Path) -> int:
    """Return the tax year named on the first line of the EITC targets file.

    The first line of the file is searched for the text ``Tax Year YYYY`` and
    the four-digit year is returned.

    Args:
        eitc_targets_path: Path to the EITC targets file whose first line
            carries the ``Tax Year YYYY`` marker.

    Returns:
        The four-digit year that follows ``Tax Year`` on the first line.

    Raises:
        ValueError: If the file is empty or its first line does not contain
            ``Tax Year`` followed by four digits.
    """
    lines = eitc_targets_path.read_text().splitlines()
    # An empty file has no first line; report it with the same ValueError as
    # any other unrecognised header instead of leaking an IndexError.
    first_line = lines[0] if lines else ""
    match = re.search(r"Tax Year (\d{4})", first_line)
    if match is None:
        raise ValueError(
            f"Could not determine EITC target source year from {eitc_targets_path}"
        )
    return int(match.group(1))


def _non_eitc_filer_alignment_metrics(
h5_path: str,
period: int,
) -> dict | None:
try:
import pandas as pd
from policyengine_core.data import Dataset
from policyengine_us import Microsimulation
from policyengine_us_data.storage import STORAGE_FOLDER
from policyengine_us_data.utils.soi import get_soi
from policyengine_us_data.utils.uprating import (
create_policyengine_uprating_factors_table,
)
except ImportError:
return None

class _SanityChecksDataset(Dataset):
name = "sanity_checks_dataset"
label = "Sanity checks dataset"
data_format = Dataset.TIME_PERIOD_ARRAYS
file_path = Path(h5_path)
time_period = period

sim = Microsimulation(dataset=_SanityChecksDataset)
tax_unit_weight = sim.calculate("tax_unit_weight").values.astype(float)
tax_unit_is_filer = sim.calculate("tax_unit_is_filer").values.astype(bool)
eitc = sim.calculate("eitc").values.astype(float)
agi = sim.calculate("adjusted_gross_income").values.astype(float)
claimed_eitc = eitc > 0

soi = get_soi(period)
soi_filer_counts = soi[
(soi["Variable"] == "count")
& soi["Count"]
& (soi["Filing status"] == "All")
& ~soi["Taxable only"]
& ~soi["Full population"]
][["AGI lower bound", "AGI upper bound", "Value"]].sort_values(
["AGI lower bound", "AGI upper bound"]
)

eitc_targets_path = (
STORAGE_FOLDER / "calibration_targets" / "eitc_by_agi_and_children.csv"
)
eitc_source_year = _eitc_target_source_year(eitc_targets_path)
eitc_targets = pd.read_csv(eitc_targets_path, comment="#")
uprating = create_policyengine_uprating_factors_table()
earliest_uprating_year = int(uprating.columns.astype(int).min())
latest_uprating_year = int(uprating.columns.astype(int).max())
source_year = min(
max(eitc_source_year, earliest_uprating_year), latest_uprating_year
)
target_year = min(max(period, earliest_uprating_year), latest_uprating_year)
population_growth = float(
uprating.loc["population", target_year]
/ uprating.loc["population", source_year]
)
eitc_targets["returns_target_year"] = eitc_targets["returns"] * population_growth

actual_total = float((tax_unit_weight * (tax_unit_is_filer & ~claimed_eitc)).sum())
target_total = 0.0
total_abs_error = 0.0
low_agi_abs_error = 0.0

for lower, upper, total_filers in soi_filer_counts.itertuples(index=False):
target_eitc_returns = float(
eitc_targets.loc[
(eitc_targets["agi_lower"].astype(float) >= float(lower))
& (eitc_targets["agi_upper"].astype(float) <= float(upper)),
"returns_target_year",
].sum()
)
target_non_eitc_filers = float(total_filers - target_eitc_returns)
actual_non_eitc_filers = float(
(
tax_unit_weight
* (tax_unit_is_filer & ~claimed_eitc & (agi >= lower) & (agi < upper))
).sum()
)
abs_error = abs(actual_non_eitc_filers - target_non_eitc_filers)
target_total += target_non_eitc_filers
total_abs_error += abs_error
if float(upper) <= 40_000:
low_agi_abs_error += abs_error

return {
"actual_total": actual_total,
"target_total": target_total,
"total_gap": actual_total - target_total,
"total_abs_error": total_abs_error,
"low_agi_abs_error": low_agi_abs_error,
}


def run_sanity_checks(
h5_path: str,
period: int = 2024,
Expand Down Expand Up @@ -176,6 +283,42 @@ def _get(f, path):
}
)

alignment = _non_eitc_filer_alignment_metrics(h5_path, period)
if alignment is None:
results.append(
{
"check": "non_eitc_filer_alignment",
"status": "SKIP",
"detail": "policyengine_us not available",
}
)
else:
results.append(
{
"check": "non_eitc_filer_total_gap",
"status": "PASS",
"detail": (
f"actual={alignment['actual_total']:,.0f}, "
f"target={alignment['target_total']:,.0f}, "
f"gap={alignment['total_gap']:,.0f}"
),
}
)
results.append(
{
"check": "non_eitc_filer_total_agi_abs_error",
"status": "PASS",
"detail": f"{alignment['total_abs_error']:,.0f}",
}
)
results.append(
{
"check": "non_eitc_filer_low_agi_abs_error",
"status": "PASS",
"detail": f"{alignment['low_agi_abs_error']:,.0f}",
}
)

# 5. Boolean takeup variables
for var in TAKEUP_VARS:
vals = _get(f, f"{var}/{period}")
Expand Down
93 changes: 86 additions & 7 deletions policyengine_us_data/datasets/cps/cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,71 @@ def _open_dataset_read_only(dataset_source):
yield store


def _sum_person_values_to_tax_units(
    person_values: np.ndarray,
    person_tax_unit_ids: np.ndarray,
    tax_unit_ids: np.ndarray,
) -> np.ndarray:
    """Sum a person-level array up to the tax-unit level.

    Args:
        person_values: One value per person; cast to ``float32``.
        person_tax_unit_ids: For each person, the id of their tax unit
            (same length as ``person_values``).
        tax_unit_ids: The tax unit ids, in the order the result should follow.

    Returns:
        A ``float32`` array with one entry per element of ``tax_unit_ids``
        holding the sum of ``person_values`` over that unit's members. Units
        with no members get ``0``.

    Raises:
        KeyError: If a person references an id absent from ``tax_unit_ids``;
            the key is the first such id in person order.
    """
    unit_ids = np.asarray(tax_unit_ids).astype(np.int64)
    member_unit_ids = np.asarray(person_tax_unit_ids).astype(np.int64)

    # Map every person to the position of their tax unit with a sorted lookup
    # instead of a per-person Python dict access. A stable sort together with
    # ``side="right"`` resolves duplicate ids to their last occurrence, which
    # is what a dict built by enumeration would do.
    order = np.argsort(unit_ids, kind="stable")
    sorted_unit_ids = unit_ids[order]
    slots = np.searchsorted(sorted_unit_ids, member_unit_ids, side="right") - 1

    # A slot of -1 means the id sorts before every unit; otherwise the slot
    # is only a match when the id found there is equal to the one requested.
    found = slots >= 0
    found[found] = sorted_unit_ids[slots[found]] == member_unit_ids[found]
    if not found.all():
        raise KeyError(int(member_unit_ids[~found][0]))

    tax_unit_values = np.zeros(len(unit_ids), dtype=np.float32)
    # ``np.add.at`` accumulates correctly when several people share a unit
    # (plain fancy-index assignment would keep only one contribution).
    np.add.at(
        tax_unit_values,
        order[slots],
        np.asarray(person_values, dtype=np.float32),
    )
    return tax_unit_values


def _voluntary_filing_children_bin(
    tax_unit_child_dependents: np.ndarray,
) -> np.ndarray:
    """Label each tax unit by whether it has any child dependents.

    Returns an array of strings: ``"with_children"`` where the dependent
    count is greater than zero, ``"no_children"`` everywhere else.
    """
    child_counts = np.asarray(tax_unit_child_dependents)
    has_children = child_counts > 0
    return np.where(has_children, "with_children", "no_children")


def _voluntary_filing_wage_income_bin(
    tax_unit_wage_income: np.ndarray,
) -> np.ndarray:
    """Label each tax unit's wage income with its voluntary-filing bin.

    The input is cast to ``float32`` and the first matching rule wins:
    ``"zero"`` for income at or below 0, ``"low"`` below 15,000, ``"medium"``
    below 30,000, and ``"high"`` for everything else.
    """
    wages = np.asarray(tax_unit_wage_income, dtype=np.float32)
    is_zero = wages <= 0
    is_low = wages < 15_000
    is_medium = wages < 30_000
    # Nested from the last rule outward so earlier rules take precedence.
    medium_or_high = np.where(is_medium, "medium", "high")
    low_or_above = np.where(is_low, "low", medium_or_high)
    return np.where(is_zero, "zero", low_or_above)


def _voluntary_filing_age_bin(age_head: np.ndarray) -> np.ndarray:
    """Label each head age as ``"age_65_plus"`` (65 or more) or ``"under_65"``."""
    head_ages = np.asarray(age_head)
    is_65_or_older = head_ages >= 65
    return np.where(is_65_or_older, "age_65_plus", "under_65")


def _voluntary_filing_rate_by_tax_unit(
    voluntary_filing_rates: dict,
    children_bin: np.ndarray,
    wage_income_bin: np.ndarray,
    age_bin: np.ndarray,
) -> np.ndarray:
    """Look up the voluntary filing rate for every tax unit.

    ``voluntary_filing_rates`` is indexed first by children bin, then by wage
    income bin, then by age bin. The three bin arrays are walked in lockstep
    and the matching rate for each position is returned as ``float32``.
    """
    rates = []
    for children_key, wage_key, age_key in zip(
        children_bin, wage_income_bin, age_bin
    ):
        rates_for_children = voluntary_filing_rates[children_key]
        rates_for_wage = rates_for_children[wage_key]
        rates.append(rates_for_wage[age_key])
    return np.array(rates, dtype=np.float32)


class CPS(Dataset):
name = "cps"
label = "CPS"
Expand Down Expand Up @@ -311,9 +376,11 @@ def add_takeup(self):
head_start_rate = load_take_up_rate("head_start", self.time_period)
early_head_start_rate = load_take_up_rate("early_head_start", self.time_period)
ssi_rate = load_take_up_rate("ssi", self.time_period)
voluntary_filing_rates = load_take_up_rate("voluntary_filing", self.time_period)

# EITC: varies by number of children
eitc_child_count = baseline.calculate("eitc_child_count").values
potential_eitc = baseline.calculate("eitc").values
eitc_takeup_rate = np.array(
[eitc_rates_by_children.get(min(int(c), 3), 0.85) for c in eitc_child_count]
)
Expand Down Expand Up @@ -426,14 +493,26 @@ def add_takeup(self):
rng.random(n_persons) < pregnancy_rate_by_person
)

# Voluntary tax filing: some people file even when not required and not
# seeking a refund. EITC take-up already captures refund-seeking behavior
# (if you take up EITC, you file). This variable captures people who file
# for other reasons: state requirements, documentation, habit.
# ~5% of tax units who don't take up EITC still file voluntarily.
voluntary_filing_rate = 0.05
# Voluntary tax filing: some tax units file even when not required and not
# claiming EITC. Assign rates by a simple demographic table that
# concentrates elective filing among low-wage parents and sharply reduces
# it among older childless households.
claims_eitc = data["takes_up_eitc"] & (potential_eitc > 0)
tax_unit_child_dependents = baseline.calculate("tax_unit_child_dependents").values
tax_unit_wage_income = _sum_person_values_to_tax_units(
data["employment_income"],
data["person_tax_unit_id"],
data["tax_unit_id"],
)
age_head = baseline.calculate("age_head").values
voluntary_filing_rate = _voluntary_filing_rate_by_tax_unit(
voluntary_filing_rates,
_voluntary_filing_children_bin(tax_unit_child_dependents),
_voluntary_filing_wage_income_bin(tax_unit_wage_income),
_voluntary_filing_age_bin(age_head),
)
rng = seeded_rng("would_file_taxes_voluntarily")
data["would_file_taxes_voluntarily"] = ~data["takes_up_eitc"] & (
data["would_file_taxes_voluntarily"] = ~claims_eitc & (
rng.random(n_tax_units) < voluntary_filing_rate
)

Expand Down
8 changes: 6 additions & 2 deletions policyengine_us_data/datasets/org/org.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from io import BytesIO
from pathlib import Path
import fcntl
import time

from microimpute.models.qrf import QRF
import numpy as np
Expand Down Expand Up @@ -217,13 +218,14 @@ def _load_cps_basic_org_month(
year: int,
month: str,
*,
max_attempts: int = 3,
max_attempts: int = 6,
retry_delay_seconds: float = 1.0,
) -> pd.DataFrame:
"""Load one CPS basic-month file with light retry around transient fetch/parser issues."""
url = _cps_basic_org_month_url(year, month)
last_error: Exception | None = None

for _ in range(max_attempts):
for attempt in range(1, max_attempts + 1):
try:
response = requests.get(url, timeout=60)
response.raise_for_status()
Expand All @@ -238,6 +240,8 @@ def _load_cps_basic_org_month(
return _select_cps_basic_org_columns(month_df)
except Exception as error:
last_error = error
if attempt < max_attempts and retry_delay_seconds > 0:
time.sleep(retry_delay_seconds * attempt)

raise ValueError(
f"Failed to load CPS basic ORG month {month} {year} after "
Expand Down
8 changes: 6 additions & 2 deletions policyengine_us_data/parameters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ def load_take_up_rate(variable_name: str, year: int = 2018):
year: Year for which to get the rate

Returns:
float, dict (EITC rates_by_children), or dict (Medicaid
rates_by_state)
float, dict (EITC rates_by_children), dict (Medicaid
rates_by_state), or nested dict (cell-based rates)
"""
yaml_path = PARAMETERS_DIR / "take_up" / f"{variable_name}.yaml"

Expand Down Expand Up @@ -49,6 +49,10 @@ def load_take_up_rate(variable_name: str, year: int = 2018):
result[category] = applicable_value
return result

# Cell-based tables (for example, voluntary filing by demographics)
if "rates" in data:
return data["rates"]

# Standard time-series values
values = data["values"]
applicable_value = None
Expand Down
45 changes: 41 additions & 4 deletions policyengine_us_data/parameters/take_up/voluntary_filing.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,43 @@
description: Percentage of tax units (not taking up EITC) who file taxes voluntarily.
description: Probability that a non-EITC-claiming tax unit files taxes voluntarily, by children, wage income, and head age.
metadata:
label: Voluntary filing rate
label: Voluntary filing probability
unit: /1
values:
2018-01-01: 0.05
breakdown:
num_children:
no_children: tax_unit_child_dependents == 0
with_children: tax_unit_child_dependents >= 1
wage_income:
zero: wage income == $0
low: $0 < wage income < $15,000
medium: $15,000 <= wage income < $30,000
high: wage income >= $30,000
age_head:
under_65: age_head < 65
age_65_plus: age_head >= 65
rates:
no_children:
zero:
under_65: 0.2
age_65_plus: 0.05
low:
under_65: 0.24
age_65_plus: 0.04
medium:
under_65: 0.0
age_65_plus: 0.0
high:
under_65: 0.0
age_65_plus: 0.005
with_children:
zero:
under_65: 0.5
age_65_plus: 0.075
low:
under_65: 0.6
age_65_plus: 0.06
medium:
under_65: 0.0
age_65_plus: 0.0
high:
under_65: 0.025
age_65_plus: 0.0037
Loading
Loading