From 75e647c5d530249f33e1286e42610e4d96b76cc6 Mon Sep 17 00:00:00 2001
From: "baogorek@gmail.com" <baogorek@gmail.com>
Date: Thu, 5 Mar 2026 14:06:06 -0500
Subject: [PATCH 1/4] Fix employment_income zeroed out in published H5 datasets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two bugs caused employment_income (and self_employment_income) to be
zero in published enhanced CPS datasets:

1. _drop_formula_variables() drops variables with `adds`/`subtracts`,
   including employment_income. But CPS raw data stores income under
   employment_income directly — employment_income_before_lsr was never
   in the H5. After the drop neither variable is present, so the
   formula engine has no input from which to recompute the aggregate
   and it comes out as zero.
   Fix: rename CPS aggregate variables to their input-variable
   equivalents before the drop loop.

2. create_sparse_ecps() had the empty-dict cleanup indented inside the
   inner time_period loop instead of the outer variable loop, causing
   empty variable groups to be written to the H5.
   Fix: dedent to match create_small_ecps().

Closes #573, relates to #571, #444.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 policyengine_us_data/datasets/cps/extended_cps.py  | 14 ++++++++++++++
 .../datasets/cps/small_enhanced_cps.py             |  4 ++--
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py
index ac8b6e45..49a1dfc6 100644
--- a/policyengine_us_data/datasets/cps/extended_cps.py
+++ b/policyengine_us_data/datasets/cps/extended_cps.py
@@ -57,6 +57,15 @@ def generate(self):
     # needed by the dataset loader before formulas can run).
     _KEEP_FORMULA_VARS = {"person_id"}
 
+    # CPS stores aggregate variables (e.g. employment_income) but
+    # policyengine-us computes them via ``adds`` from input variables
+    # (e.g. employment_income_before_lsr).  Rename before dropping so
+    # the raw data is preserved under the correct input-variable name.
+    _RENAME_BEFORE_DROP = {
+        "employment_income": "employment_income_before_lsr",
+        "self_employment_income": ("self_employment_income_before_lsr"),
+    }
+
     @classmethod
     def _drop_formula_variables(cls, data):
         """Remove variables that are computed by policyengine-us.
@@ -67,6 +76,11 @@ def _drop_formula_variables(cls, data):
         """
         from policyengine_us import CountryTaxBenefitSystem
 
+        for src, dst in cls._RENAME_BEFORE_DROP.items():
+            if src in data and dst not in data:
+                logger.info("Renaming %s -> %s before drop", src, dst)
+                data[dst] = data.pop(src)
+
         tbs = CountryTaxBenefitSystem()
         formula_vars = {
             name
diff --git a/policyengine_us_data/datasets/cps/small_enhanced_cps.py b/policyengine_us_data/datasets/cps/small_enhanced_cps.py
index 483593e2..aecbc086 100644
--- a/policyengine_us_data/datasets/cps/small_enhanced_cps.py
+++ b/policyengine_us_data/datasets/cps/small_enhanced_cps.py
@@ -125,8 +125,8 @@ def create_sparse_ecps():
             if values is not None:
                 data[variable][time_period] = values
 
-            if len(data[variable]) == 0:
-                del data[variable]
+        if len(data[variable]) == 0:
+            del data[variable]
 
     # Validate critical variables exist before writing
     critical_vars = [

From a11b0c557f731408c176f478937e0563645d1852 Mon Sep 17 00:00:00 2001
From: "baogorek@gmail.com" <baogorek@gmail.com>
Date: Thu, 5 Mar 2026 15:37:11 -0500
Subject: [PATCH 2/4] Discover behavioral-response renames programmatically,
 fix sparse validation

Replace hard-coded _RENAME_BEFORE_DROP dict with dynamic discovery from
the tax-benefit system, and update sparse eCPS validation to check for
employment_income_before_lsr (the input variable) instead of the computed
aggregate.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../datasets/cps/extended_cps.py              | 32 +++++++++++--------
 .../datasets/cps/small_enhanced_cps.py        |  2 +-
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py
index 49a1dfc6..bfda3d0f 100644
--- a/policyengine_us_data/datasets/cps/extended_cps.py
+++ b/policyengine_us_data/datasets/cps/extended_cps.py
@@ -57,15 +57,6 @@ def generate(self):
     # needed by the dataset loader before formulas can run).
     _KEEP_FORMULA_VARS = {"person_id"}
 
-    # CPS stores aggregate variables (e.g. employment_income) but
-    # policyengine-us computes them via ``adds`` from input variables
-    # (e.g. employment_income_before_lsr).  Rename before dropping so
-    # the raw data is preserved under the correct input-variable name.
-    _RENAME_BEFORE_DROP = {
-        "employment_income": "employment_income_before_lsr",
-        "self_employment_income": ("self_employment_income_before_lsr"),
-    }
-
     @classmethod
     def _drop_formula_variables(cls, data):
         """Remove variables that are computed by policyengine-us.
@@ -73,15 +64,28 @@ def _drop_formula_variables(cls, data):
         Variables with formulas, ``adds``, or ``subtracts`` are
         recomputed by the simulation engine, so storing them wastes
         space and can mislead validation.
+
+        Aggregate variables whose ``adds`` include a behavioral-
+        response input (e.g. ``employment_income_before_lsr``) are
+        renamed to that input before dropping so the raw data is
+        preserved under the correct input-variable name.
         """
         from policyengine_us import CountryTaxBenefitSystem
 
-        for src, dst in cls._RENAME_BEFORE_DROP.items():
-            if src in data and dst not in data:
-                logger.info("Renaming %s -> %s before drop", src, dst)
-                data[dst] = data.pop(src)
-
         tbs = CountryTaxBenefitSystem()
+
+        _SUFFIXES = ("_before_lsr", "_before_response")
+        for name, var in tbs.variables.items():
+            for add_var in getattr(var, "adds", None) or []:
+                if any(add_var.endswith(s) for s in _SUFFIXES):
+                    if name in data and add_var not in data:
+                        logger.info(
+                            "Renaming %s -> %s before drop",
+                            name,
+                            add_var,
+                        )
+                        data[add_var] = data.pop(name)
+
         formula_vars = {
             name
             for name, var in tbs.variables.items()
diff --git a/policyengine_us_data/datasets/cps/small_enhanced_cps.py b/policyengine_us_data/datasets/cps/small_enhanced_cps.py
index aecbc086..c84181ea 100644
--- a/policyengine_us_data/datasets/cps/small_enhanced_cps.py
+++ b/policyengine_us_data/datasets/cps/small_enhanced_cps.py
@@ -131,7 +131,7 @@ def create_sparse_ecps():
     # Validate critical variables exist before writing
     critical_vars = [
         "household_weight",
-        "employment_income",
+        "employment_income_before_lsr",
         "household_id",
         "person_id",
     ]

From d78c12e8dc076b5daad32869fd63d63ccfb9bf61 Mon Sep 17 00:00:00 2001
From: "baogorek@gmail.com" <baogorek@gmail.com>
Date: Thu, 5 Mar 2026 15:42:02 -0500
Subject: [PATCH 3/4] Replace suffix-based matching with structural input
 variable detection

Instead of matching hard-coded suffixes like _before_lsr, detect input
variables structurally: an adds component with no formula, no adds, and
no subtracts is a pure input variable.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../datasets/cps/extended_cps.py              | 27 ++++++++++++-------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py
index bfda3d0f..9960f853 100644
--- a/policyengine_us_data/datasets/cps/extended_cps.py
+++ b/policyengine_us_data/datasets/cps/extended_cps.py
@@ -74,17 +74,26 @@ def _drop_formula_variables(cls, data):
 
         tbs = CountryTaxBenefitSystem()
 
-        _SUFFIXES = ("_before_lsr", "_before_response")
         for name, var in tbs.variables.items():
+            if name not in data:
+                continue
             for add_var in getattr(var, "adds", None) or []:
-                if any(add_var.endswith(s) for s in _SUFFIXES):
-                    if name in data and add_var not in data:
-                        logger.info(
-                            "Renaming %s -> %s before drop",
-                            name,
-                            add_var,
-                        )
-                        data[add_var] = data.pop(name)
+                av = tbs.variables.get(add_var)
+                if av is None:
+                    continue
+                is_input = (
+                    not (hasattr(av, "formulas") and av.formulas)
+                    and not getattr(av, "adds", None)
+                    and not getattr(av, "subtracts", None)
+                )
+                if is_input and add_var not in data:
+                    logger.info(
+                        "Renaming %s -> %s before drop",
+                        name,
+                        add_var,
+                    )
+                    data[add_var] = data.pop(name)
+                    break
 
         formula_vars = {
             name

From e8553429d471251420d867ec61cce95a10f0bf5b Mon Sep 17 00:00:00 2001
From: "baogorek@gmail.com" <baogorek@gmail.com>
Date: Thu, 5 Mar 2026 15:44:04 -0500
Subject: [PATCH 4/4] Revert to suffix-based detection for behavioral response
 variables

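The structural approach (any pure-input adds component) matches ~90
variables and causes false positives. The _before_lsr/_before_response
suffixes are a naming convention in policyengine-us for behavioral
response variables and precisely target the right ones.

This is not a pure revert: the early `name not in data` skip and the
`break` introduced with the structural version are kept, with the
`break` now taken on the first suffix-matching component.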

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../datasets/cps/extended_cps.py              | 24 +++++++------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py
index 9960f853..b60fbf42 100644
--- a/policyengine_us_data/datasets/cps/extended_cps.py
+++ b/policyengine_us_data/datasets/cps/extended_cps.py
@@ -74,25 +74,19 @@ def _drop_formula_variables(cls, data):
 
         tbs = CountryTaxBenefitSystem()
 
+        _RESPONSE_SUFFIXES = ("_before_lsr", "_before_response")
         for name, var in tbs.variables.items():
             if name not in data:
                 continue
             for add_var in getattr(var, "adds", None) or []:
-                av = tbs.variables.get(add_var)
-                if av is None:
-                    continue
-                is_input = (
-                    not (hasattr(av, "formulas") and av.formulas)
-                    and not getattr(av, "adds", None)
-                    and not getattr(av, "subtracts", None)
-                )
-                if is_input and add_var not in data:
-                    logger.info(
-                        "Renaming %s -> %s before drop",
-                        name,
-                        add_var,
-                    )
-                    data[add_var] = data.pop(name)
+                if any(add_var.endswith(s) for s in _RESPONSE_SUFFIXES):
+                    if add_var not in data:
+                        logger.info(
+                            "Renaming %s -> %s before drop",
+                            name,
+                            add_var,
+                        )
+                        data[add_var] = data.pop(name)
                     break
 
         formula_vars = {