From 75e647c5d530249f33e1286e42610e4d96b76cc6 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Thu, 5 Mar 2026 14:06:06 -0500 Subject: [PATCH 1/4] Fix employment_income zeroed out in published H5 datasets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs caused employment_income (and self_employment_income) to be zero in published enhanced CPS datasets: 1. _drop_formula_variables() drops variables with `adds`/`subtracts`, including employment_income. But CPS raw data stores income under employment_income directly — employment_income_before_lsr was never in the H5. The formula engine then can't recompute the aggregate. Fix: rename CPS aggregate variables to their input-variable equivalents before the drop loop. 2. create_sparse_ecps() had the empty-dict cleanup indented inside the inner time_period loop instead of the outer variable loop, causing empty variable groups to be written to the H5. Fix: dedent to match create_small_ecps(). Closes #573, relates to #571, #444. Co-Authored-By: Claude Opus 4.6 --- policyengine_us_data/datasets/cps/extended_cps.py | 14 ++++++++++++++ .../datasets/cps/small_enhanced_cps.py | 4 ++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index ac8b6e45..49a1dfc6 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -57,6 +57,15 @@ def generate(self): # needed by the dataset loader before formulas can run). _KEEP_FORMULA_VARS = {"person_id"} + # CPS stores aggregate variables (e.g. employment_income) but + # policyengine-us computes them via ``adds`` from input variables + # (e.g. employment_income_before_lsr). Rename before dropping so + # the raw data is preserved under the correct input-variable name. + _RENAME_BEFORE_DROP = { + "employment_income": "employment_income_before_lsr", + "self_employment_income": ("self_employment_income_before_lsr"), + } + @classmethod def _drop_formula_variables(cls, data): """Remove variables that are computed by policyengine-us. @@ -67,6 +76,11 @@ def _drop_formula_variables(cls, data): """ from policyengine_us import CountryTaxBenefitSystem + for src, dst in cls._RENAME_BEFORE_DROP.items(): + if src in data and dst not in data: + logger.info("Renaming %s -> %s before drop", src, dst) + data[dst] = data.pop(src) + tbs = CountryTaxBenefitSystem() formula_vars = { name diff --git a/policyengine_us_data/datasets/cps/small_enhanced_cps.py b/policyengine_us_data/datasets/cps/small_enhanced_cps.py index 483593e2..aecbc086 100644 --- a/policyengine_us_data/datasets/cps/small_enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/small_enhanced_cps.py @@ -125,8 +125,8 @@ def create_sparse_ecps(): if values is not None: data[variable][time_period] = values - if len(data[variable]) == 0: - del data[variable] + if len(data[variable]) == 0: + del data[variable] # Validate critical variables exist before writing critical_vars = [ From a11b0c557f731408c176f478937e0563645d1852 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Thu, 5 Mar 2026 15:37:11 -0500 Subject: [PATCH 2/4] Discover behavioral-response renames programmatically, fix sparse validation Replace hard-coded _RENAME_BEFORE_DROP dict with dynamic discovery from the tax-benefit system, and update sparse eCPS validation to check for employment_income_before_lsr (the input variable) instead of the computed aggregate. Co-Authored-By: Claude Opus 4.6 --- .../datasets/cps/extended_cps.py | 32 +++++++++++-------- .../datasets/cps/small_enhanced_cps.py | 2 +- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index 49a1dfc6..bfda3d0f 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -57,15 +57,6 @@ def generate(self): # needed by the dataset loader before formulas can run). _KEEP_FORMULA_VARS = {"person_id"} - # CPS stores aggregate variables (e.g. employment_income) but - # policyengine-us computes them via ``adds`` from input variables - # (e.g. employment_income_before_lsr). Rename before dropping so - # the raw data is preserved under the correct input-variable name. - _RENAME_BEFORE_DROP = { - "employment_income": "employment_income_before_lsr", - "self_employment_income": ("self_employment_income_before_lsr"), - } - @classmethod def _drop_formula_variables(cls, data): """Remove variables that are computed by policyengine-us. @@ -73,15 +64,28 @@ def _drop_formula_variables(cls, data): Variables with formulas, ``adds``, or ``subtracts`` are recomputed by the simulation engine, so storing them wastes space and can mislead validation. + + Aggregate variables whose ``adds`` include a behavioral- + response input (e.g. ``employment_income_before_lsr``) are + renamed to that input before dropping so the raw data is + preserved under the correct input-variable name. """ from policyengine_us import CountryTaxBenefitSystem - for src, dst in cls._RENAME_BEFORE_DROP.items(): - if src in data and dst not in data: - logger.info("Renaming %s -> %s before drop", src, dst) - data[dst] = data.pop(src) - tbs = CountryTaxBenefitSystem() + + _SUFFIXES = ("_before_lsr", "_before_response") + for name, var in tbs.variables.items(): + for add_var in getattr(var, "adds", None) or []: + if any(add_var.endswith(s) for s in _SUFFIXES): + if name in data and add_var not in data: + logger.info( + "Renaming %s -> %s before drop", + name, + add_var, + ) + data[add_var] = data.pop(name) + formula_vars = { name for name, var in tbs.variables.items() diff --git a/policyengine_us_data/datasets/cps/small_enhanced_cps.py b/policyengine_us_data/datasets/cps/small_enhanced_cps.py index aecbc086..c84181ea 100644 --- a/policyengine_us_data/datasets/cps/small_enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/small_enhanced_cps.py @@ -131,7 +131,7 @@ def create_sparse_ecps(): # Validate critical variables exist before writing critical_vars = [ "household_weight", - "employment_income", + "employment_income_before_lsr", "household_id", "person_id", ] From d78c12e8dc076b5daad32869fd63d63ccfb9bf61 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Thu, 5 Mar 2026 15:42:02 -0500 Subject: [PATCH 3/4] Replace suffix-based matching with structural input variable detection Instead of matching hard-coded suffixes like _before_lsr, detect input variables structurally: an adds component with no formula, no adds, and no subtracts is a pure input variable. Co-Authored-By: Claude Opus 4.6 --- .../datasets/cps/extended_cps.py | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index bfda3d0f..9960f853 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -74,17 +74,26 @@ def _drop_formula_variables(cls, data): tbs = CountryTaxBenefitSystem() - _SUFFIXES = ("_before_lsr", "_before_response") for name, var in tbs.variables.items(): + if name not in data: + continue for add_var in getattr(var, "adds", None) or []: - if any(add_var.endswith(s) for s in _SUFFIXES): - if name in data and add_var not in data: - logger.info( - "Renaming %s -> %s before drop", - name, - add_var, - ) - data[add_var] = data.pop(name) + av = tbs.variables.get(add_var) + if av is None: + continue + is_input = ( + not (hasattr(av, "formulas") and av.formulas) + and not getattr(av, "adds", None) + and not getattr(av, "subtracts", None) + ) + if is_input and add_var not in data: + logger.info( + "Renaming %s -> %s before drop", + name, + add_var, + ) + data[add_var] = data.pop(name) + break formula_vars = { name From e8553429d471251420d867ec61cce95a10f0bf5b Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Thu, 5 Mar 2026 15:44:04 -0500 Subject: [PATCH 4/4] Revert to suffix-based detection for behavioral response variables The structural approach (any pure-input adds component) matches ~90 variables and causes false positives. The _before_lsr/_before_response suffixes are a naming convention in policyengine-us for behavioral response variables and precisely target the right ones. Co-Authored-By: Claude Opus 4.6 --- .../datasets/cps/extended_cps.py | 24 +++++++------------ 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index 9960f853..b60fbf42 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -74,25 +74,19 @@ def _drop_formula_variables(cls, data): tbs = CountryTaxBenefitSystem() + _RESPONSE_SUFFIXES = ("_before_lsr", "_before_response") for name, var in tbs.variables.items(): if name not in data: continue for add_var in getattr(var, "adds", None) or []: - av = tbs.variables.get(add_var) - if av is None: - continue - is_input = ( - not (hasattr(av, "formulas") and av.formulas) - and not getattr(av, "adds", None) - and not getattr(av, "subtracts", None) - ) - if is_input and add_var not in data: - logger.info( - "Renaming %s -> %s before drop", - name, - add_var, - ) - data[add_var] = data.pop(name) + if any(add_var.endswith(s) for s in _RESPONSE_SUFFIXES): + if add_var not in data: + logger.info( + "Renaming %s -> %s before drop", + name, + add_var, + ) + data[add_var] = data.pop(name) break formula_vars = {