diff --git a/changelog.d/issue-813-elective-filer.changed.md b/changelog.d/issue-813-elective-filer.changed.md new file mode 100644 index 000000000..b507aa084 --- /dev/null +++ b/changelog.d/issue-813-elective-filer.changed.md @@ -0,0 +1 @@ +Replace the flat voluntary filing rate with a demographic elective-filer table based on children, wage income, and head age, and add sanity-check metrics for non-EITC filer alignment. diff --git a/policyengine_us_data/calibration/sanity_checks.py b/policyengine_us_data/calibration/sanity_checks.py index 025d3de8e..32be33f1d 100644 --- a/policyengine_us_data/calibration/sanity_checks.py +++ b/policyengine_us_data/calibration/sanity_checks.py @@ -7,7 +7,9 @@ """ import logging +import re from typing import List +from pathlib import Path import h5py import numpy as np @@ -33,9 +35,114 @@ "takes_up_head_start_if_eligible", "takes_up_early_head_start_if_eligible", "takes_up_dc_ptc", + "would_file_taxes_voluntarily", ] +def _eitc_target_source_year(eitc_targets_path: Path) -> int: + first_line = eitc_targets_path.read_text().splitlines()[0] + matches = re.search(r"Tax Year (\d{4})", first_line) + if matches is not None: + return int(matches.group(1)) + raise ValueError( + f"Could not determine EITC target source year from {eitc_targets_path}" + ) + + +def _non_eitc_filer_alignment_metrics( + h5_path: str, + period: int, +) -> dict | None: + try: + import pandas as pd + from policyengine_core.data import Dataset + from policyengine_us import Microsimulation + from policyengine_us_data.storage import STORAGE_FOLDER + from policyengine_us_data.utils.soi import get_soi + from policyengine_us_data.utils.uprating import ( + create_policyengine_uprating_factors_table, + ) + except ImportError: + return None + + class _SanityChecksDataset(Dataset): + name = "sanity_checks_dataset" + label = "Sanity checks dataset" + data_format = Dataset.TIME_PERIOD_ARRAYS + file_path = Path(h5_path) + time_period = period + + sim = Microsimulation(dataset=_SanityChecksDataset) + tax_unit_weight = sim.calculate("tax_unit_weight").values.astype(float) + tax_unit_is_filer = sim.calculate("tax_unit_is_filer").values.astype(bool) + eitc = sim.calculate("eitc").values.astype(float) + agi = sim.calculate("adjusted_gross_income").values.astype(float) + claimed_eitc = eitc > 0 + + soi = get_soi(period) + soi_filer_counts = soi[ + (soi["Variable"] == "count") + & soi["Count"] + & (soi["Filing status"] == "All") + & ~soi["Taxable only"] + & ~soi["Full population"] + ][["AGI lower bound", "AGI upper bound", "Value"]].sort_values( + ["AGI lower bound", "AGI upper bound"] + ) + + eitc_targets_path = ( + STORAGE_FOLDER / "calibration_targets" / "eitc_by_agi_and_children.csv" + ) + eitc_source_year = _eitc_target_source_year(eitc_targets_path) + eitc_targets = pd.read_csv(eitc_targets_path, comment="#") + uprating = create_policyengine_uprating_factors_table() + earliest_uprating_year = int(uprating.columns.astype(int).min()) + latest_uprating_year = int(uprating.columns.astype(int).max()) + source_year = min( + max(eitc_source_year, earliest_uprating_year), latest_uprating_year + ) + target_year = min(max(period, earliest_uprating_year), latest_uprating_year) + population_growth = float( + uprating.loc["population", target_year] + / uprating.loc["population", source_year] + ) + eitc_targets["returns_target_year"] = eitc_targets["returns"] * population_growth + + actual_total = float((tax_unit_weight * (tax_unit_is_filer & ~claimed_eitc)).sum()) + target_total = 0.0 + total_abs_error = 0.0 + low_agi_abs_error = 0.0 + + for lower, upper, total_filers in soi_filer_counts.itertuples(index=False): + target_eitc_returns = float( + eitc_targets.loc[ + (eitc_targets["agi_lower"].astype(float) >= float(lower)) + & (eitc_targets["agi_upper"].astype(float) <= float(upper)), + "returns_target_year", + ].sum() + ) + target_non_eitc_filers = float(total_filers - target_eitc_returns) + actual_non_eitc_filers = float( + ( + tax_unit_weight + * (tax_unit_is_filer & ~claimed_eitc & (agi >= lower) & (agi < upper)) + ).sum() + ) + abs_error = abs(actual_non_eitc_filers - target_non_eitc_filers) + target_total += target_non_eitc_filers + total_abs_error += abs_error + if float(upper) <= 40_000: + low_agi_abs_error += abs_error + + return { + "actual_total": actual_total, + "target_total": target_total, + "total_gap": actual_total - target_total, + "total_abs_error": total_abs_error, + "low_agi_abs_error": low_agi_abs_error, + } + + def run_sanity_checks( h5_path: str, period: int = 2024, @@ -176,6 +283,42 @@ def _get(f, path): } ) + alignment = _non_eitc_filer_alignment_metrics(h5_path, period) + if alignment is None: + results.append( + { + "check": "non_eitc_filer_alignment", + "status": "SKIP", + "detail": "policyengine_us not available", + } + ) + else: + results.append( + { + "check": "non_eitc_filer_total_gap", + "status": "PASS", + "detail": ( + f"actual={alignment['actual_total']:,.0f}, " + f"target={alignment['target_total']:,.0f}, " + f"gap={alignment['total_gap']:,.0f}" + ), + } + ) + results.append( + { + "check": "non_eitc_filer_total_agi_abs_error", + "status": "PASS", + "detail": f"{alignment['total_abs_error']:,.0f}", + } + ) + results.append( + { + "check": "non_eitc_filer_low_agi_abs_error", + "status": "PASS", + "detail": f"{alignment['low_agi_abs_error']:,.0f}", + } + ) + # 5. Boolean takeup variables for var in TAKEUP_VARS: vals = _get(f, f"{var}/{period}") diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 751581267..4d4e74ea0 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -109,6 +109,71 @@ def _open_dataset_read_only(dataset_source): yield store +def _sum_person_values_to_tax_units( + person_values: np.ndarray, + person_tax_unit_ids: np.ndarray, + tax_unit_ids: np.ndarray, +) -> np.ndarray: + tax_unit_index = { + int(tax_unit_id): index for index, tax_unit_id in enumerate(tax_unit_ids) + } + person_tax_unit_index = np.array( + [tax_unit_index[int(tax_unit_id)] for tax_unit_id in person_tax_unit_ids], + dtype=np.int64, + ) + tax_unit_values = np.zeros(len(tax_unit_ids), dtype=np.float32) + np.add.at( + tax_unit_values, + person_tax_unit_index, + np.asarray(person_values, dtype=np.float32), + ) + return tax_unit_values + + +def _voluntary_filing_children_bin( + tax_unit_child_dependents: np.ndarray, +) -> np.ndarray: + return np.where( + np.asarray(tax_unit_child_dependents) > 0, + "with_children", + "no_children", + ) + + +def _voluntary_filing_wage_income_bin( + tax_unit_wage_income: np.ndarray, +) -> np.ndarray: + wage_income = np.asarray(tax_unit_wage_income, dtype=np.float32) + return np.select( + [ + wage_income <= 0, + wage_income < 15_000, + wage_income < 30_000, + ], + ["zero", "low", "medium"], + default="high", + ) + + +def _voluntary_filing_age_bin(age_head: np.ndarray) -> np.ndarray: + return np.where(np.asarray(age_head) >= 65, "age_65_plus", "under_65") + + +def _voluntary_filing_rate_by_tax_unit( + voluntary_filing_rates: dict, + children_bin: np.ndarray, + wage_income_bin: np.ndarray, + age_bin: np.ndarray, +) -> np.ndarray: + return np.array( + [ + voluntary_filing_rates[children][wage][age] + for children, wage, age in zip(children_bin, wage_income_bin, age_bin) + ], + dtype=np.float32, + ) + + class CPS(Dataset): name = "cps" label = "CPS" @@ -311,9 +376,11 @@ def add_takeup(self): head_start_rate = load_take_up_rate("head_start", self.time_period) early_head_start_rate = load_take_up_rate("early_head_start", self.time_period) ssi_rate = load_take_up_rate("ssi", self.time_period) + voluntary_filing_rates = load_take_up_rate("voluntary_filing", self.time_period) # EITC: varies by number of children eitc_child_count = baseline.calculate("eitc_child_count").values + potential_eitc = baseline.calculate("eitc").values eitc_takeup_rate = np.array( [eitc_rates_by_children.get(min(int(c), 3), 0.85) for c in eitc_child_count] ) @@ -426,14 +493,26 @@ def add_takeup(self): rng.random(n_persons) < pregnancy_rate_by_person ) - # Voluntary tax filing: some people file even when not required and not - # seeking a refund. EITC take-up already captures refund-seeking behavior - # (if you take up EITC, you file). This variable captures people who file - # for other reasons: state requirements, documentation, habit. - # ~5% of tax units who don't take up EITC still file voluntarily. - voluntary_filing_rate = 0.05 + # Voluntary tax filing: some tax units file even when not required and not + # claiming EITC. Assign rates by a simple demographic table that + # concentrates elective filing among low-wage parents and sharply reduces + # it among older childless households. + claims_eitc = data["takes_up_eitc"] & (potential_eitc > 0) + tax_unit_child_dependents = baseline.calculate("tax_unit_child_dependents").values + tax_unit_wage_income = _sum_person_values_to_tax_units( + data["employment_income"], + data["person_tax_unit_id"], + data["tax_unit_id"], + ) + age_head = baseline.calculate("age_head").values + voluntary_filing_rate = _voluntary_filing_rate_by_tax_unit( + voluntary_filing_rates, + _voluntary_filing_children_bin(tax_unit_child_dependents), + _voluntary_filing_wage_income_bin(tax_unit_wage_income), + _voluntary_filing_age_bin(age_head), + ) rng = seeded_rng("would_file_taxes_voluntarily") - data["would_file_taxes_voluntarily"] = ~data["takes_up_eitc"] & ( + data["would_file_taxes_voluntarily"] = ~claims_eitc & ( rng.random(n_tax_units) < voluntary_filing_rate ) diff --git a/policyengine_us_data/datasets/org/org.py b/policyengine_us_data/datasets/org/org.py index 8f942f9a5..6eb25d96e 100644 --- a/policyengine_us_data/datasets/org/org.py +++ b/policyengine_us_data/datasets/org/org.py @@ -11,6 +11,7 @@ from io import BytesIO from pathlib import Path import fcntl +import time from microimpute.models.qrf import QRF import numpy as np @@ -217,13 +218,14 @@ def _load_cps_basic_org_month( year: int, month: str, *, - max_attempts: int = 3, + max_attempts: int = 6, + retry_delay_seconds: float = 1.0, ) -> pd.DataFrame: """Load one CPS basic-month file with light retry around transient fetch/parser issues.""" url = _cps_basic_org_month_url(year, month) last_error: Exception | None = None - for _ in range(max_attempts): + for attempt in range(1, max_attempts + 1): try: response = requests.get(url, timeout=60) response.raise_for_status() @@ -238,6 +240,8 @@ def _load_cps_basic_org_month( return _select_cps_basic_org_columns(month_df) except Exception as error: last_error = error + if attempt < max_attempts and retry_delay_seconds > 0: + time.sleep(retry_delay_seconds * attempt) raise ValueError( f"Failed to load CPS basic ORG month {month} {year} after " diff --git a/policyengine_us_data/parameters/__init__.py b/policyengine_us_data/parameters/__init__.py index dc385f8e0..84585e733 100644 --- a/policyengine_us_data/parameters/__init__.py +++ b/policyengine_us_data/parameters/__init__.py @@ -19,8 +19,8 @@ def load_take_up_rate(variable_name: str, year: int = 2018): year: Year for which to get the rate Returns: - float, dict (EITC rates_by_children), or dict (Medicaid - rates_by_state) + float, dict (EITC rates_by_children), dict (Medicaid + rates_by_state), or nested dict (cell-based rates) """ yaml_path = PARAMETERS_DIR / "take_up" / f"{variable_name}.yaml" @@ -49,6 +49,10 @@ def load_take_up_rate(variable_name: str, year: int = 2018): result[category] = applicable_value return result + # Cell-based tables (for example, voluntary filing by demographics) + if "rates" in data: + return data["rates"] + # Standard time-series values values = data["values"] applicable_value = None diff --git a/policyengine_us_data/parameters/take_up/voluntary_filing.yaml b/policyengine_us_data/parameters/take_up/voluntary_filing.yaml index 46d23e504..e281d0611 100644 --- a/policyengine_us_data/parameters/take_up/voluntary_filing.yaml +++ b/policyengine_us_data/parameters/take_up/voluntary_filing.yaml @@ -1,6 +1,43 @@ -description: Percentage of tax units (not taking up EITC) who file taxes voluntarily. +description: Probability that a non-EITC-claiming tax unit files taxes voluntarily, by children, wage income, and head age. metadata: - label: Voluntary filing rate + label: Voluntary filing probability unit: /1 -values: - 2018-01-01: 0.05 + breakdown: + num_children: + no_children: tax_unit_child_dependents == 0 + with_children: tax_unit_child_dependents >= 1 + wage_income: + zero: wage income == $0 + low: $0 < wage income < $15,000 + medium: $15,000 <= wage income < $30,000 + high: wage income >= $30,000 + age_head: + under_65: age_head < 65 + age_65_plus: age_head >= 65 +rates: + no_children: + zero: + under_65: 0.2 + age_65_plus: 0.05 + low: + under_65: 0.24 + age_65_plus: 0.04 + medium: + under_65: 0.0 + age_65_plus: 0.0 + high: + under_65: 0.0 + age_65_plus: 0.005 + with_children: + zero: + under_65: 0.5 + age_65_plus: 0.075 + low: + under_65: 0.6 + age_65_plus: 0.06 + medium: + under_65: 0.0 + age_65_plus: 0.0 + high: + under_65: 0.025 + age_65_plus: 0.0037 diff --git a/tests/unit/datasets/test_cps_voluntary_filing.py b/tests/unit/datasets/test_cps_voluntary_filing.py new file mode 100644 index 000000000..7f869d174 --- /dev/null +++ b/tests/unit/datasets/test_cps_voluntary_filing.py @@ -0,0 +1,62 @@ +import numpy as np + +from policyengine_us_data.datasets.cps.cps import ( + _sum_person_values_to_tax_units, + _voluntary_filing_age_bin, + _voluntary_filing_children_bin, + _voluntary_filing_rate_by_tax_unit, + _voluntary_filing_wage_income_bin, +) + + +def test_sum_person_values_to_tax_units_aggregates_wages(): + result = _sum_person_values_to_tax_units( + person_values=np.array([10_000, 5_000, 2_500, 7_500], dtype=np.float32), + person_tax_unit_ids=np.array([101, 101, 102, 103]), + tax_unit_ids=np.array([101, 102, 103]), + ) + + np.testing.assert_allclose(result, np.array([15_000, 2_500, 7_500])) + + +def test_voluntary_filing_bins_map_expected_categories(): + np.testing.assert_array_equal( + _voluntary_filing_children_bin(np.array([0, 1, 3])), + np.array(["no_children", "with_children", "with_children"]), + ) + np.testing.assert_array_equal( + _voluntary_filing_wage_income_bin( + np.array([0, 1, 14_999, 15_000, 29_999, 30_000], dtype=np.float32) + ), + np.array(["zero", "low", "low", "medium", "medium", "high"]), + ) + np.testing.assert_array_equal( + _voluntary_filing_age_bin(np.array([24, 64, 65, 80])), + np.array(["under_65", "under_65", "age_65_plus", "age_65_plus"]), + ) + + +def test_voluntary_filing_rate_lookup_uses_all_three_dimensions(): + rates = { + "no_children": { + "zero": {"under_65": 0.2, "age_65_plus": 0.05}, + "low": {"under_65": 0.24, "age_65_plus": 0.04}, + "medium": {"under_65": 0.0, "age_65_plus": 0.0}, + "high": {"under_65": 0.0, "age_65_plus": 0.005}, + }, + "with_children": { + "zero": {"under_65": 0.5, "age_65_plus": 0.075}, + "low": {"under_65": 0.6, "age_65_plus": 0.06}, + "medium": {"under_65": 0.0, "age_65_plus": 0.0}, + "high": {"under_65": 0.025, "age_65_plus": 0.0037}, + }, + } + + result = _voluntary_filing_rate_by_tax_unit( + rates, + children_bin=np.array(["no_children", "with_children", "with_children"]), + wage_income_bin=np.array(["zero", "low", "high"]), + age_bin=np.array(["under_65", "under_65", "age_65_plus"]), + ) + + np.testing.assert_allclose(result, np.array([0.2, 0.6, 0.0037])) diff --git a/tests/unit/datasets/test_org.py b/tests/unit/datasets/test_org.py index 40b90d388..58f452fb1 100644 --- a/tests/unit/datasets/test_org.py +++ b/tests/unit/datasets/test_org.py @@ -183,7 +183,12 @@ def fake_get(*args, **kwargs): monkeypatch.setattr("policyengine_us_data.datasets.org.org.requests.get", fake_get) - loaded = _load_cps_basic_org_month(2024, "may", max_attempts=2) + loaded = _load_cps_basic_org_month( + 2024, + "may", + max_attempts=2, + retry_delay_seconds=0, + ) assert len(calls) == 2 assert loaded.columns.tolist() == CPS_BASIC_MONTHLY_ORG_COLUMNS @@ -208,7 +213,12 @@ def raise_for_status(self): lambda *args, **kwargs: FakeResponse(csv_text), ) - loaded = _load_cps_basic_org_month(2024, "may", max_attempts=1) + loaded = _load_cps_basic_org_month( + 2024, + "may", + max_attempts=1, + retry_delay_seconds=0, + ) assert loaded.columns.tolist() == CPS_BASIC_MONTHLY_ORG_COLUMNS assert loaded.iloc[0].to_dict() == { diff --git a/tests/unit/test_stochastic_variables.py b/tests/unit/test_stochastic_variables.py index 6fc23bdbb..5a698323f 100644 --- a/tests/unit/test_stochastic_variables.py +++ b/tests/unit/test_stochastic_variables.py @@ -53,6 +53,16 @@ def test_ssi_takeup_rate_loads(self): rate = load_take_up_rate("ssi", 2022) assert rate == 0.50 + def test_voluntary_filing_table_loads(self): + rates = load_take_up_rate("voluntary_filing", 2024) + assert isinstance(rates, dict) + assert rates["no_children"]["zero"]["under_65"] == 0.2 + assert rates["with_children"]["low"]["under_65"] == 0.6 + for children_rates in rates.values(): + for wage_rates in children_rates.values(): + for rate in wage_rates.values(): + assert 0 <= rate <= 1 + class TestStableStringHash: def test_deterministic(self):