diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index ac8b6e45..b60fbf42 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -64,10 +64,31 @@ def _drop_formula_variables(cls, data): Variables with formulas, ``adds``, or ``subtracts`` are recomputed by the simulation engine, so storing them wastes space and can mislead validation. + + Aggregate variables whose ``adds`` include a behavioral- + response input (e.g. ``employment_income_before_lsr``) are + renamed to that input before dropping so the raw data is + preserved under the correct input-variable name. """ from policyengine_us import CountryTaxBenefitSystem tbs = CountryTaxBenefitSystem() + + _RESPONSE_SUFFIXES = ("_before_lsr", "_before_response") + for name, var in tbs.variables.items(): + if name not in data: + continue + for add_var in getattr(var, "adds", None) or []: + if any(add_var.endswith(s) for s in _RESPONSE_SUFFIXES): + if add_var not in data: + logger.info( + "Renaming %s -> %s before drop", + name, + add_var, + ) + data[add_var] = data.pop(name) + break + formula_vars = { name for name, var in tbs.variables.items() diff --git a/policyengine_us_data/datasets/cps/small_enhanced_cps.py b/policyengine_us_data/datasets/cps/small_enhanced_cps.py index 483593e2..c84181ea 100644 --- a/policyengine_us_data/datasets/cps/small_enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/small_enhanced_cps.py @@ -125,13 +125,13 @@ def create_sparse_ecps(): if values is not None: data[variable][time_period] = values - if len(data[variable]) == 0: - del data[variable] + if len(data[variable]) == 0: + del data[variable] # Validate critical variables exist before writing critical_vars = [ "household_weight", - "employment_income", + "employment_income_before_lsr", "household_id", "person_id", ]