PolicyEngine · MaxGhenis · Apr 22, 2026 · Apr 22, 2026 · Apr 22, 2026
diff --git a/changelog.d/issue-813-elective-filer.changed.md b/changelog.d/issue-813-elective-filer.changed.md
@@ -0,0 +1 @@
+Replace the flat voluntary filing rate with a demographic elective-filer table based on children, wage income, and head age, and add sanity-check metrics for non-EITC filer alignment.
diff --git a/policyengine_us_data/calibration/sanity_checks.py b/policyengine_us_data/calibration/sanity_checks.py
@@ -7,7 +7,9 @@
 """
 
 import logging
+import re
 from typing import List
+from pathlib import Path
 
 import h5py
 import numpy as np
@@ -33,9 +35,114 @@
     "takes_up_head_start_if_eligible",
     "takes_up_early_head_start_if_eligible",
     "takes_up_dc_ptc",
+    "would_file_taxes_voluntarily",
 ]
 
 
+def _eitc_target_source_year(eitc_targets_path: Path) -> int:
+    """Return the tax year named on the first line of the EITC targets file.
+
+    Only the first line of the file is inspected; it is searched for the
+    text ``Tax Year`` followed by a four-digit year.
+
+    Args:
+        eitc_targets_path: Path to the EITC calibration targets file.
+
+    Returns:
+        The four-digit year captured from the first line, as an int.
+
+    Raises:
+        ValueError: If the file is empty or its first line does not contain
+            ``Tax Year`` followed by four digits.
+    """
+    # An empty file has no first line. Default to "" so the lookup falls
+    # through to the ValueError below instead of leaking an IndexError.
+    first_line = next(iter(eitc_targets_path.read_text().splitlines()), "")
+    match = re.search(r"Tax Year (\d{4})", first_line)
+    if match is None:
+        raise ValueError(
+            f"Could not determine EITC target source year from {eitc_targets_path}"
+        )
+    return int(match.group(1))
+
+
+def _non_eitc_filer_alignment_metrics(
+    h5_path: str,
+    period: int,
+) -> dict | None:
+    """Measure how closely simulated non-EITC filers match external targets.
+
+    Runs a microsimulation on the dataset at ``h5_path`` and compares the
+    weighted number of tax units that file without a positive EITC against a
+    target built per AGI bin as (SOI filer count) minus (EITC returns scaled
+    by population growth from the EITC source year to ``period``).
+
+    Args:
+        h5_path: Path to the HDF5 dataset to simulate.
+        period: Year used for the dataset's time period, the SOI lookup and
+            the uprating target year.
+
+    Returns:
+        ``None`` when any of the required imports fails. Otherwise a dict
+        with ``actual_total``, ``target_total``, ``total_gap``
+        (actual minus target), ``total_abs_error`` (sum of per-bin absolute
+        errors) and ``low_agi_abs_error`` (the same sum restricted to bins
+        whose upper bound is at most 40,000).
+
+    Raises:
+        ValueError: Propagated from ``_eitc_target_source_year`` when the
+            EITC targets file does not name its tax year.
+    """
+    # Dependencies are imported lazily; any ImportError turns the whole check
+    # into a skip (None) rather than a failure.
+    # NOTE(review): this also hides import errors raised inside the
+    # policyengine_us_data helpers themselves - confirm that is intended.
+    try:
+        import pandas as pd
+        from policyengine_core.data import Dataset
+        from policyengine_us import Microsimulation
+        from policyengine_us_data.storage import STORAGE_FOLDER
+        from policyengine_us_data.utils.soi import get_soi
+        from policyengine_us_data.utils.uprating import (
+            create_policyengine_uprating_factors_table,
+        )
+    except ImportError:
+        return None
+
+    # Throwaway Dataset subclass that points the simulation at h5_path.
+    class _SanityChecksDataset(Dataset):
+        name = "sanity_checks_dataset"
+        label = "Sanity checks dataset"
+        data_format = Dataset.TIME_PERIOD_ARRAYS
+        file_path = Path(h5_path)
+        time_period = period
+
+    sim = Microsimulation(dataset=_SanityChecksDataset)
+    tax_unit_weight = sim.calculate("tax_unit_weight").values.astype(float)
+    tax_unit_is_filer = sim.calculate("tax_unit_is_filer").values.astype(bool)
+    eitc = sim.calculate("eitc").values.astype(float)
+    agi = sim.calculate("adjusted_gross_income").values.astype(float)
+    # A tax unit counts as an EITC claimant when its simulated EITC is positive.
+    claimed_eitc = eitc > 0
+
+    # Keep the SOI "count" rows for all filing statuses, excluding the
+    # taxable-only and full-population rows, ordered by AGI bounds.
+    # presumably "Count", "Taxable only" and "Full population" are boolean
+    # flag columns - verify against get_soi.
+    soi = get_soi(period)
+    soi_filer_counts = soi[
+        (soi["Variable"] == "count")
+        & soi["Count"]
+        & (soi["Filing status"] == "All")
+        & ~soi["Taxable only"]
+        & ~soi["Full population"]
+    ][["AGI lower bound", "AGI upper bound", "Value"]].sort_values(
+        ["AGI lower bound", "AGI upper bound"]
+    )
+
+    eitc_targets_path = (
+        STORAGE_FOLDER / "calibration_targets" / "eitc_by_agi_and_children.csv"
+    )
+    eitc_source_year = _eitc_target_source_year(eitc_targets_path)
+    # Lines starting with "#" (including the year header) are skipped.
+    eitc_targets = pd.read_csv(eitc_targets_path, comment="#")
+    uprating = create_policyengine_uprating_factors_table()
+    # Clamp both years into the range of years the uprating table covers so
+    # the .loc lookups below cannot miss a column.
+    earliest_uprating_year = int(uprating.columns.astype(int).min())
+    latest_uprating_year = int(uprating.columns.astype(int).max())
+    source_year = min(
+        max(eitc_source_year, earliest_uprating_year), latest_uprating_year
+    )
+    target_year = min(max(period, earliest_uprating_year), latest_uprating_year)
+    # Scale EITC return counts by the population factor ratio between the
+    # (clamped) source year and the (clamped) target year.
+    population_growth = float(
+        uprating.loc["population", target_year]
+        / uprating.loc["population", source_year]
+    )
+    eitc_targets["returns_target_year"] = eitc_targets["returns"] * population_growth
+
+    # The actual total covers every tax unit, regardless of AGI bin; the
+    # target total is accumulated bin by bin in the loop below.
+    actual_total = float((tax_unit_weight * (tax_unit_is_filer & ~claimed_eitc)).sum())
+    target_total = 0.0
+    total_abs_error = 0.0
+    low_agi_abs_error = 0.0
+
+    for lower, upper, total_filers in soi_filer_counts.itertuples(index=False):
+        # EITC target rows whose AGI range lies entirely inside this SOI bin.
+        target_eitc_returns = float(
+            eitc_targets.loc[
+                (eitc_targets["agi_lower"].astype(float) >= float(lower))
+                & (eitc_targets["agi_upper"].astype(float) <= float(upper)),
+                "returns_target_year",
+            ].sum()
+        )
+        target_non_eitc_filers = float(total_filers - target_eitc_returns)
+        # Simulated side: weighted filers without EITC with lower <= AGI < upper.
+        actual_non_eitc_filers = float(
+            (
+                tax_unit_weight
+                * (tax_unit_is_filer & ~claimed_eitc & (agi >= lower) & (agi < upper))
+            ).sum()
+        )
+        abs_error = abs(actual_non_eitc_filers - target_non_eitc_filers)
+        target_total += target_non_eitc_filers
+        total_abs_error += abs_error
+        # "Low AGI" means bins that end at or below 40,000.
+        if float(upper) <= 40_000:
+            low_agi_abs_error += abs_error
+
+    return {
+        "actual_total": actual_total,
+        "target_total": target_total,
+        "total_gap": actual_total - target_total,
+        "total_abs_error": total_abs_error,
+        "low_agi_abs_error": low_agi_abs_error,
+    }
+
+
 def run_sanity_checks(
     h5_path: str,
     period: int = 2024,
@@ -176,6 +283,42 @@ def _get(f, path):
                     }
                 )
 
+        alignment = _non_eitc_filer_alignment_metrics(h5_path, period)
+        if alignment is None:
+            results.append(
+                {
+                    "check": "non_eitc_filer_alignment",
+                    "status": "SKIP",
+                    "detail": "policyengine_us not available",
+                }
+            )
+        else:
+            results.append(
+                {
+                    "check": "non_eitc_filer_total_gap",
+                    "status": "PASS",
+                    "detail": (
+                        f"actual={alignment['actual_total']:,.0f}, "
+                        f"target={alignment['target_total']:,.0f}, "
+                        f"gap={alignment['total_gap']:,.0f}"
+                    ),
+                }
+            )
+            results.append(
+                {
+                    "check": "non_eitc_filer_total_agi_abs_error",
+                    "status": "PASS",
+                    "detail": f"{alignment['total_abs_error']:,.0f}",
+                }
+            )
+            results.append(
+                {
+                    "check": "non_eitc_filer_low_agi_abs_error",
+                    "status": "PASS",
+                    "detail": f"{alignment['low_agi_abs_error']:,.0f}",
+                }
+            )
+
         # 5. Boolean takeup variables
         for var in TAKEUP_VARS:
             vals = _get(f, f"{var}/{period}")

diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
@@ -109,6 +109,71 @@ def _open_dataset_read_only(dataset_source):
         yield store
 
 
+def _sum_person_values_to_tax_units(
+    person_values: np.ndarray,
+    person_tax_unit_ids: np.ndarray,
+    tax_unit_ids: np.ndarray,
+) -> np.ndarray:
+    """Add up a person-level array into one float32 total per tax unit.
+
+    Args:
+        person_values: One value per person.
+        person_tax_unit_ids: Tax unit ID of each person, aligned with
+            ``person_values``.
+        tax_unit_ids: Tax unit IDs in output order.
+
+    Returns:
+        A float32 array with one entry per element of ``tax_unit_ids``;
+        tax units with no persons get 0.
+
+    Raises:
+        KeyError: If a person references an ID missing from ``tax_unit_ids``.
+    """
+    # Output row of every tax unit, keyed by its integer ID.
+    row_of_unit = {}
+    for row, unit_id in enumerate(tax_unit_ids):
+        row_of_unit[int(unit_id)] = row
+
+    # Translate each person's tax unit ID into that output row.
+    person_rows = np.array(
+        [row_of_unit[int(unit_id)] for unit_id in person_tax_unit_ids],
+        dtype=np.int64,
+    )
+
+    totals = np.zeros(len(tax_unit_ids), dtype=np.float32)
+    amounts = np.asarray(person_values, dtype=np.float32)
+    # np.add.at accumulates over repeated rows, which plain fancy-index
+    # assignment would not do.
+    np.add.at(totals, person_rows, amounts)
+    return totals
+
+
+def _voluntary_filing_children_bin(
+    tax_unit_child_dependents: np.ndarray,
+) -> np.ndarray:
+    """Label each tax unit by whether it has any child dependents.
+
+    Returns an array of ``"with_children"`` where the count is positive and
+    ``"no_children"`` otherwise.
+    """
+    has_children = np.asarray(tax_unit_child_dependents) > 0
+    return np.where(has_children, "with_children", "no_children")
+
+
+def _voluntary_filing_wage_income_bin(
+    tax_unit_wage_income: np.ndarray,
+) -> np.ndarray:
+    """Label each tax unit's wage income as zero, low, medium or high.
+
+    Values at or below 0 are ``"zero"``, below 15,000 ``"low"``, below
+    30,000 ``"medium"``, and everything else ``"high"``.
+    """
+    wages = np.asarray(tax_unit_wage_income, dtype=np.float32)
+    at_or_below_zero = wages <= 0
+    below_low_cap = wages < 15_000
+    below_medium_cap = wages < 30_000
+    # np.select uses the first condition that holds, so each later test only
+    # needs its upper bound.
+    return np.select(
+        [at_or_below_zero, below_low_cap, below_medium_cap],
+        ["zero", "low", "medium"],
+        default="high",
+    )
+
+
+def _voluntary_filing_age_bin(age_head: np.ndarray) -> np.ndarray:
+    """Label each head age as ``"age_65_plus"`` (65 or older) or ``"under_65"``."""
+    is_65_or_older = np.asarray(age_head) >= 65
+    return np.where(is_65_or_older, "age_65_plus", "under_65")
+
+
+def _voluntary_filing_rate_by_tax_unit(
+    voluntary_filing_rates: dict,
+    children_bin: np.ndarray,
+    wage_income_bin: np.ndarray,
+    age_bin: np.ndarray,
+) -> np.ndarray:
+    """Look up one voluntary filing rate per tax unit from a nested table.
+
+    Args:
+        voluntary_filing_rates: Nested mapping indexed as
+            ``rates[children][wage][age]``.
+        children_bin: Children label of each tax unit.
+        wage_income_bin: Wage income label of each tax unit.
+        age_bin: Head age label of each tax unit.
+
+    Returns:
+        A float32 array of rates, one per tax unit.
+
+    Raises:
+        KeyError: If a label combination is missing from the table.
+    """
+    looked_up = []
+    for children_key, wage_key, age_key in zip(
+        children_bin, wage_income_bin, age_bin
+    ):
+        rates_for_children = voluntary_filing_rates[children_key]
+        looked_up.append(rates_for_children[wage_key][age_key])
+    return np.array(looked_up, dtype=np.float32)
+
+
 class CPS(Dataset):
     name = "cps"
     label = "CPS"
@@ -311,9 +376,11 @@ def add_takeup(self):
     head_start_rate = load_take_up_rate("head_start", self.time_period)
     early_head_start_rate = load_take_up_rate("early_head_start", self.time_period)
     ssi_rate = load_take_up_rate("ssi", self.time_period)
+    voluntary_filing_rates = load_take_up_rate("voluntary_filing", self.time_period)
 
     # EITC: varies by number of children
     eitc_child_count = baseline.calculate("eitc_child_count").values
+    potential_eitc = baseline.calculate("eitc").values
     eitc_takeup_rate = np.array(
         [eitc_rates_by_children.get(min(int(c), 3), 0.85) for c in eitc_child_count]
     )
@@ -426,14 +493,26 @@ def add_takeup(self):
         rng.random(n_persons) < pregnancy_rate_by_person
     )
 
-    # Voluntary tax filing: some people file even when not required and not
-    # seeking a refund. EITC take-up already captures refund-seeking behavior
-    # (if you take up EITC, you file). This variable captures people who file
-    # for other reasons: state requirements, documentation, habit.
-    # ~5% of tax units who don't take up EITC still file voluntarily.
-    voluntary_filing_rate = 0.05
+    # Voluntary tax filing: some tax units file even when not required and not
+    # claiming EITC. Assign rates by a simple demographic table that
+    # concentrates elective filing among low-wage parents and sharply reduces
+    # it among older childless households.
+    claims_eitc = data["takes_up_eitc"] & (potential_eitc > 0)
+    tax_unit_child_dependents = baseline.calculate("tax_unit_child_dependents").values
+    tax_unit_wage_income = _sum_person_values_to_tax_units(
+        data["employment_income"],
+        data["person_tax_unit_id"],
+        data["tax_unit_id"],
+    )
+    age_head = baseline.calculate("age_head").values
+    voluntary_filing_rate = _voluntary_filing_rate_by_tax_unit(
+        voluntary_filing_rates,
+        _voluntary_filing_children_bin(tax_unit_child_dependents),
+        _voluntary_filing_wage_income_bin(tax_unit_wage_income),
+        _voluntary_filing_age_bin(age_head),
+    )
     rng = seeded_rng("would_file_taxes_voluntarily")
-    data["would_file_taxes_voluntarily"] = ~data["takes_up_eitc"] & (
+    data["would_file_taxes_voluntarily"] = ~claims_eitc & (
         rng.random(n_tax_units) < voluntary_filing_rate
     )
 

diff --git a/policyengine_us_data/datasets/org/org.py b/policyengine_us_data/datasets/org/org.py
@@ -11,6 +11,7 @@
 from io import BytesIO
 from pathlib import Path
 import fcntl
+import time
 
 from microimpute.models.qrf import QRF
 import numpy as np
@@ -217,13 +218,14 @@ def _load_cps_basic_org_month(
     year: int,
     month: str,
     *,
-    max_attempts: int = 3,
+    max_attempts: int = 6,
+    retry_delay_seconds: float = 1.0,
 ) -> pd.DataFrame:
     """Load one CPS basic-month file with light retry around transient fetch/parser issues."""
     url = _cps_basic_org_month_url(year, month)
     last_error: Exception | None = None
 
-    for _ in range(max_attempts):
+    for attempt in range(1, max_attempts + 1):
         try:
             response = requests.get(url, timeout=60)
             response.raise_for_status()
@@ -238,6 +240,8 @@ def _load_cps_basic_org_month(
             return _select_cps_basic_org_columns(month_df)
         except Exception as error:
             last_error = error
+            if attempt < max_attempts and retry_delay_seconds > 0:
+                time.sleep(retry_delay_seconds * attempt)
 
     raise ValueError(
         f"Failed to load CPS basic ORG month {month} {year} after "

diff --git a/policyengine_us_data/parameters/__init__.py b/policyengine_us_data/parameters/__init__.py
@@ -19,8 +19,8 @@ def load_take_up_rate(variable_name: str, year: int = 2018):
         year: Year for which to get the rate
 
     Returns:
-        float, dict (EITC rates_by_children), or dict (Medicaid
-        rates_by_state)
+        float, dict (EITC rates_by_children), dict (Medicaid
+        rates_by_state), or nested dict (cell-based rates)
     """
     yaml_path = PARAMETERS_DIR / "take_up" / f"{variable_name}.yaml"
 
@@ -49,6 +49,10 @@ def load_take_up_rate(variable_name: str, year: int = 2018):
                 result[category] = applicable_value
         return result
 
+    # Cell-based tables (for example, voluntary filing by demographics)
+    if "rates" in data:
+        return data["rates"]
+
     # Standard time-series values
     values = data["values"]
     applicable_value = None

diff --git a/policyengine_us_data/parameters/take_up/voluntary_filing.yaml b/policyengine_us_data/parameters/take_up/voluntary_filing.yaml
@@ -1,6 +1,43 @@
-description: Percentage of tax units (not taking up EITC) who file taxes voluntarily.
+description: Probability that a non-EITC-claiming tax unit files taxes voluntarily, by children, wage income, and head age.
 metadata:
-  label: Voluntary filing rate
+  label: Voluntary filing probability
   unit: /1
-values:
-  2018-01-01: 0.05
+  breakdown:
+    num_children:
+      no_children: tax_unit_child_dependents == 0
+      with_children: tax_unit_child_dependents >= 1
+    wage_income:
+      zero: wage income <= $0
+      low: $0 < wage income < $15,000
+      medium: $15,000 <= wage income < $30,000
+      high: wage income >= $30,000
+    age_head:
+      under_65: age_head < 65
+      age_65_plus: age_head >= 65
+rates:
+  no_children:
+    zero:
+      under_65: 0.2
+      age_65_plus: 0.05
+    low:
+      under_65: 0.24
+      age_65_plus: 0.04
+    medium:
+      under_65: 0.0
+      age_65_plus: 0.0
+    high:
+      under_65: 0.0
+      age_65_plus: 0.005
+  with_children:
+    zero:
+      under_65: 0.5
+      age_65_plus: 0.075
+    low:
+      under_65: 0.6
+      age_65_plus: 0.06
+    medium:
+      under_65: 0.0
+      age_65_plus: 0.0
+    high:
+      under_65: 0.025
+      age_65_plus: 0.0037
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Replace the flat voluntary filing rate with a demographic elective-filer table based on children, wage income, and head age, and add sanity-check metrics for non-EITC filer alignment.