From eabe520a7a399f78f1f413712928039efc07965c Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Mon, 27 Apr 2026 15:38:03 -0400 Subject: [PATCH 1/4] Blend SIPP and SCF financial assets --- docs/data.md | 18 +- docs/methodology.md | 25 +- .../calibration/source_impute.py | 34 ++- policyengine_us_data/datasets/cps/cps.py | 34 ++- .../utils/asset_imputation.py | 223 ++++++++++++++++++ tests/unit/test_asset_imputation.py | 133 +++++++++++ 6 files changed, 448 insertions(+), 19 deletions(-) diff --git a/docs/data.md b/docs/data.md index bbce2c4a5..f7b703798 100644 --- a/docs/data.md +++ b/docs/data.md @@ -68,16 +68,24 @@ missing from the CPS: ### Survey of Income and Program Participation (SIPP) The SIPP provides income and program participation data. We use SIPP primarily to impute tip income -through a Quantile Regression Forest model trained on SIPP data, using employment income, age, and -household composition as predictors. +and policy-relevant asset inputs through Quantile Regression Forest models trained on SIPP data. +The asset imputations currently cover bank accounts, stocks, bonds, household vehicle counts, and +household vehicle values. Bank accounts, stocks, and bonds are then combined with comparable SCF +predictions through a stable household-level 50/50 source-model draw. These fields are not a +complete household balance sheet; they are exposed so policy models can select the resources that +matter for a specific program. ### Survey of Consumer Finances (SCF) The SCF provides wealth and debt information that we use to impute several financial variables missing from the CPS. We match auto loan balances based on household demographics and income, then -calculate interest on auto loans from these imputed balances. Additionally, we impute various net -worth components and other wealth measures not available in CPS. The SCF imputation uses their -reference person definition to ensure proper matching. +calculate interest on auto loans from these imputed balances. We also impute `net_worth` as an SCF +aggregate. This aggregate is not reconstructible from the currently exposed asset and liability +fields because those fields combine SIPP liquid-asset and vehicle imputations with selected SCF debt +inputs and omit major balance-sheet categories such as home equity, mortgage debt, retirement +assets, business equity, other real estate, and other debts. Use the specific asset variables for +resource-tested policy rules; use `net_worth` only as a broad wealth aggregate. The SCF imputation +uses their reference person definition to ensure proper matching. ### American Community Survey (ACS) diff --git a/docs/methodology.md b/docs/methodology.md index 730196ff7..db9f34aea 100644 --- a/docs/methodology.md +++ b/docs/methodology.md @@ -237,12 +237,25 @@ as a predictor, which allows the imputed values to reflect geographic variation rates and rent levels. **SIPP (Survey of Income and Program Participation)**: Tip income, bank account assets, stock -assets, bond assets. The SIPP lacks state identifiers, so these imputations are state-blind at the -microdata level — geographic variation in tip income and assets enters only through calibration -weights, not through the imputed values themselves. - -**SCF (Survey of Consumer Finances)**: Net worth, auto loan balances, auto loan interest. The SCF -also lacks state identifiers, so these imputations are likewise state-blind. +assets, bond assets, household vehicle counts, and household vehicle values. The SIPP lacks state +identifiers, so these imputations are state-blind at the microdata level - geographic variation in +tip income and assets enters only through calibration weights, not through the imputed values +themselves. + +**SCF (Survey of Consumer Finances)**: Aggregate net worth, auto loan balances, and auto loan +interest. The SCF also lacks state identifiers, so these imputations are likewise state-blind. + +The current asset fields are a mixed-source partial balance sheet. `net_worth` is independently +imputed from the SCF aggregate and includes components that are not currently exposed in the public +CPS file, such as primary residence equity, mortgage debt, retirement assets, business equity, +other real estate, other financial assets, and other debts. The SIPP liquid-asset and vehicle fields +are policy-relevant inputs in their own right. For overlapping bank-account, stock, and bond asset +variables, we use a stable household-level 50/50 source-model draw between the SIPP QRF prediction +and the comparable SCF QRF prediction, with a single draw shared across the financial-asset block. +We do not rescale these policy leaves to force them to add up to SCF `net_worth`. Therefore, +row-level reconciliation between `net_worth` and the exposed component fields is not expected. A +net-worth component diagnostic should only be enabled when the component set is explicitly intended +to be complete and household-aligned. The output of this stage is the source-imputed stratified CPS (`source_imputed_stratified_extended_cps_2024.h5`), which serves as the input to the diff --git a/policyengine_us_data/calibration/source_impute.py b/policyengine_us_data/calibration/source_impute.py index a543eff4a..464337866 100644 --- a/policyengine_us_data/calibration/source_impute.py +++ b/policyengine_us_data/calibration/source_impute.py @@ -12,7 +12,8 @@ household_vehicles_value (no state predictor) ORG -> hourly_wage, is_paid_hourly, is_union_member_or_covered - SCF -> net_worth, auto_loan_balance, auto_loan_interest + SCF -> net_worth, auto_loan_balance, auto_loan_interest, and + 50/50 source-model averaging for overlapping financial assets (no state predictor) Usage in unified calibration pipeline: @@ -45,7 +46,10 @@ predict_org_features, ) from policyengine_us_data.utils.asset_imputation import ( + SCF_FINANCIAL_ASSET_POLICY_VARIABLES, + add_scf_financial_asset_targets, build_household_vehicle_receiver, + combine_sipp_and_scf_financial_assets, ) logger = logging.getLogger(__name__) @@ -765,15 +769,19 @@ def _impute_scf( if "networth" in scf_df.columns and "net_worth" not in scf_df.columns: scf_df["net_worth"] = scf_df["networth"] + scf_financial_asset_targets = add_scf_financial_asset_targets(scf_df) available_vars = [v for v in SCF_IMPUTED_VARIABLES if v in scf_df.columns] + qrf_vars = available_vars + [ + v for v in scf_financial_asset_targets if v in scf_df.columns + ] if not available_vars: - logger.warning("No SCF imputed variables available. Skipping.") + logger.warning("No SCF aggregate imputed variables available. Skipping.") return data weights = scf_df.get("wgt") - donor = scf_df[scf_predictors + available_vars].copy() + donor = scf_df[scf_predictors + qrf_vars].copy() if weights is not None: donor["wgt"] = weights donor = donor.dropna(subset=scf_predictors) @@ -834,12 +842,12 @@ def _impute_scf( "SCF QRF: %d train, %d test, vars=%s", len(donor), len(cps_df), - available_vars, + qrf_vars, ) fitted = qrf.fit( X_train=donor, predictors=scf_predictors, - imputed_variables=available_vars, + imputed_variables=qrf_vars, weight_col="wgt" if weights is not None else None, tune_hyperparameters=False, ) @@ -870,6 +878,22 @@ def _impute_scf( else: data[var] = {time_period: person_vals} + person_hh_ids = data.get("person_household_id", {}).get(time_period) + if person_hh_ids is not None: + first_person_mask = ~pd.Series(person_hh_ids).duplicated().values + for scf_var, policy_var in SCF_FINANCIAL_ASSET_POLICY_VARIABLES.items(): + if scf_var not in preds or policy_var not in data: + continue + data[policy_var] = { + time_period: combine_sipp_and_scf_financial_assets( + sipp_values=data[policy_var][time_period], + scf_household_values=preds.loc[first_person_mask, scf_var].values, + person_household_ids=person_hh_ids, + reference_person_mask=first_person_mask, + time_period=time_period, + ) + } + del fitted, preds gc.collect() diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index d12ba7eef..40d6d7717 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -48,7 +48,10 @@ reported_subsidized_marketplace_by_tax_unit, ) from policyengine_us_data.utils.asset_imputation import ( + SCF_FINANCIAL_ASSET_POLICY_VARIABLES, + add_scf_financial_asset_targets, build_household_vehicle_receiver, + combine_sipp_and_scf_financial_assets, ) from policyengine_us_data.utils.policyengine import ( supports_medicare_enrollment_input, @@ -2156,7 +2159,9 @@ def add_tips(self, cps: h5py.File): mean_quantile=0.5, ).tip_income.values - # Impute liquid assets from SIPP (bank accounts, stocks, bonds) + # Impute SIPP liquid assets used directly by resource-tested policy rules. + # The SCF step below applies a stable 50/50 source-model draw for the + # overlapping bank, stock, and bond leaves. from policyengine_us_data.datasets.sipp import get_asset_model @@ -2473,6 +2478,7 @@ def determine_reference_person(group): mask = create_scf_reference_person_mask(cps_data, person_data) mask_len = mask.shape[0] + original_person_household_ids = np.asarray(cps_data["person_household_id"]) cps_data = { var: data[mask] if data.shape[0] == mask_len else data @@ -2543,7 +2549,10 @@ def determine_reference_person(group): reference_persons = person_data[mask] receiver_data["is_married"] = reference_persons.A_MARITL.isin([1, 2]).values - # Impute auto loan balance from the SCF + # Impute SCF net_worth as an aggregate, selected auto-loan fields, and + # SCF equivalents for overlapping financial asset leaves. + # Current public asset components are partial and mixed-source, so row-level + # net_worth identity checks are not expected to pass. from policyengine_us_data.datasets.scf.scf import SCF_2022 scf_dataset = SCF_2022() @@ -2560,7 +2569,12 @@ def determine_reference_person(group): "interest_dividend_income", "social_security_pension_income", ] - IMPUTED_VARIABLES = ["networth", "auto_loan_balance", "auto_loan_interest"] + scf_financial_asset_targets = add_scf_financial_asset_targets(scf_data) + IMPUTED_VARIABLES = [ + "networth", + "auto_loan_balance", + "auto_loan_interest", + ] + list(scf_financial_asset_targets) weights = ["wgt"] donor_data = scf_data[PREDICTORS + IMPUTED_VARIABLES + weights].copy() @@ -2589,6 +2603,20 @@ def determine_reference_person(group): for var in IMPUTED_VARIABLES: cps[var] = imputations[var] + for scf_var, policy_var in SCF_FINANCIAL_ASSET_POLICY_VARIABLES.items(): + if scf_var not in imputations: + continue + if policy_var in cps: + cps[policy_var] = combine_sipp_and_scf_financial_assets( + sipp_values=cps[policy_var], + scf_household_values=imputations[scf_var].values, + person_household_ids=original_person_household_ids, + reference_person_mask=mask, + time_period=self.time_period, + ) + if scf_var in cps: + del cps[scf_var] + cps["net_worth"] = cps["networth"] del cps["networth"] diff --git a/policyengine_us_data/utils/asset_imputation.py b/policyengine_us_data/utils/asset_imputation.py index 60c1572e0..a2523cc8c 100644 --- a/policyengine_us_data/utils/asset_imputation.py +++ b/policyengine_us_data/utils/asset_imputation.py @@ -1,7 +1,230 @@ +from dataclasses import dataclass +import hashlib +from typing import Mapping, Sequence + import numpy as np import pandas as pd +SIPP_LIQUID_ASSET_VARIABLES = ( + "bank_account_assets", + "stock_assets", + "bond_assets", +) +SIPP_VEHICLE_ASSET_VARIABLES = ("household_vehicles_value",) +SCF_NET_WORTH_VARIABLE = "net_worth" +SCF_BALANCE_SHEET_DEBT_VARIABLES = ("auto_loan_balance",) +SCF_FINANCIAL_ASSET_TARGETS = { + "scf_bank_account_assets": ("liq",), + "scf_stock_assets": ("stocks", "nmmf"), + "scf_bond_assets": ("bond",), +} +SCF_FINANCIAL_ASSET_POLICY_VARIABLES = { + "scf_bank_account_assets": "bank_account_assets", + "scf_stock_assets": "stock_assets", + "scf_bond_assets": "bond_assets", +} + +EXPOSED_NET_WORTH_COMPONENT_VARIABLES = ( + SIPP_LIQUID_ASSET_VARIABLES + + SIPP_VEHICLE_ASSET_VARIABLES + + SCF_BALANCE_SHEET_DEBT_VARIABLES +) +NET_WORTH_COMPONENT_SIGNS = { + "auto_loan_balance": -1.0, +} +UNOBSERVED_NET_WORTH_COMPONENT_GROUPS = ( + "primary_residence_value", + "mortgage_debt", + "retirement_assets", + "business_equity", + "other_real_estate", + "other_financial_assets", + "other_debts", +) +NET_WORTH_COMPONENTS_ARE_COMPLETE = False +FINANCIAL_ASSET_SOURCE_SCF_PROBABILITY = 0.5 + + +@dataclass(frozen=True) +class NetWorthReconciliationReport: + """Summary of a household-level net worth reconciliation check.""" + + components_are_complete: bool + available_component_variables: tuple[str, ...] + missing_component_variables: tuple[str, ...] + unobserved_component_groups: tuple[str, ...] + max_abs_difference: float | None + is_reconciled: bool | None + message: str + + +def check_household_net_worth_reconciliation( + data: Mapping[str, Sequence[float]], + *, + component_variables: Sequence[str] = EXPOSED_NET_WORTH_COMPONENT_VARIABLES, + net_worth_variable: str = SCF_NET_WORTH_VARIABLE, + component_signs: Mapping[str, float] = NET_WORTH_COMPONENT_SIGNS, + components_are_complete: bool = NET_WORTH_COMPONENTS_ARE_COMPLETE, + rtol: float = 1e-6, + atol: float = 1.0, +) -> NetWorthReconciliationReport: + """Check whether household net worth equals signed balance-sheet components. + + The current CPS asset fields are intentionally not a complete balance sheet: + liquid assets and vehicles are imputed from SIPP, while net worth and auto + loan balances are imputed from SCF. Leave ``components_are_complete`` false + for current public datasets. Set it to true only for a household-aligned data + frame whose component variables are intended to exhaust net worth. + """ + component_variables = tuple(component_variables) + available_components = tuple( + variable for variable in component_variables if variable in data + ) + missing_components = tuple( + variable for variable in component_variables if variable not in data + ) + + if not components_are_complete: + return NetWorthReconciliationReport( + components_are_complete=False, + available_component_variables=available_components, + missing_component_variables=missing_components, + unobserved_component_groups=UNOBSERVED_NET_WORTH_COMPONENT_GROUPS, + max_abs_difference=None, + is_reconciled=None, + message=( + "Net worth is an independently imputed SCF aggregate. The " + "available SIPP/SCF asset fields are partial and should not be " + "expected to reconstruct it." + ), + ) + + if net_worth_variable not in data: + raise KeyError(f"Missing net worth variable: {net_worth_variable}") + if missing_components: + raise KeyError( + "Cannot reconcile net worth with a complete component set because " + f"these component variables are missing: {', '.join(missing_components)}" + ) + + net_worth = np.asarray(data[net_worth_variable], dtype=float) + component_total = np.zeros_like(net_worth, dtype=float) + + for variable in component_variables: + values = np.asarray(data[variable], dtype=float) + if values.shape != net_worth.shape: + raise ValueError( + f"{variable} has shape {values.shape}, but {net_worth_variable} " + f"has shape {net_worth.shape}. Reconciliation data must already " + "be aligned to household rows." + ) + component_total += component_signs.get(variable, 1.0) * values + + difference = net_worth - component_total + max_abs_difference = ( + float(np.nanmax(np.abs(difference))) if difference.size else 0.0 + ) + is_reconciled = bool( + np.allclose(net_worth, component_total, rtol=rtol, atol=atol, equal_nan=True) + ) + + return NetWorthReconciliationReport( + components_are_complete=True, + available_component_variables=available_components, + missing_component_variables=(), + unobserved_component_groups=(), + max_abs_difference=max_abs_difference, + is_reconciled=is_reconciled, + message=( + "Net worth reconciles to the signed component variables." + if is_reconciled + else "Net worth does not reconcile to the signed component variables." + ), + ) + + +def add_scf_financial_asset_targets(scf: pd.DataFrame) -> tuple[str, ...]: + """Add SCF financial asset targets comparable to SIPP policy leaves.""" + added_targets = [] + for target, source_columns in SCF_FINANCIAL_ASSET_TARGETS.items(): + if all(column in scf.columns for column in source_columns): + scf[target] = sum(scf[column].fillna(0) for column in source_columns) + added_targets.append(target) + return tuple(added_targets) + + +def _stable_unit_interval(key: str) -> float: + digest = hashlib.blake2b(key.encode("utf-8"), digest_size=8).digest() + return int.from_bytes(digest, "big") / 2**64 + + +def financial_asset_source_is_scf( + household_ids: Sequence, + *, + time_period: int, + probability: float = FINANCIAL_ASSET_SOURCE_SCF_PROBABILITY, +) -> np.ndarray: + """Return a stable 50/50 source-model draw for financial assets. + + The draw is at the household asset-block level, so bank accounts, stocks, + and bonds all come from the same source for a household. + """ + if not 0 <= probability <= 1: + raise ValueError("probability must be between 0 and 1") + + household_ids = np.asarray(household_ids) + draws_by_household = { + household_id: ( + _stable_unit_interval( + f"financial_asset_source:{time_period}:{household_id}" + ) + < probability + ) + for household_id in pd.unique(household_ids) + } + return np.array( + [draws_by_household[household_id] for household_id in household_ids], + dtype=bool, + ) + + +def combine_sipp_and_scf_financial_assets( + *, + sipp_values: Sequence[float], + scf_household_values: Sequence[float], + person_household_ids: Sequence, + reference_person_mask: Sequence[bool], + time_period: int, +) -> np.ndarray: + """Apply a stable 50/50 SIPP/SCF source draw to a person-level asset leaf.""" + sipp_values = np.asarray(sipp_values, dtype=np.float32) + scf_household_values = np.asarray(scf_household_values, dtype=np.float32) + person_household_ids = np.asarray(person_household_ids) + reference_person_mask = np.asarray(reference_person_mask, dtype=bool) + + if sipp_values.shape != person_household_ids.shape: + raise ValueError( + "sipp_values and person_household_ids must have the same shape" + ) + if reference_person_mask.shape != person_household_ids.shape: + raise ValueError( + "reference_person_mask and person_household_ids must have the same shape" + ) + if scf_household_values.shape[0] != reference_person_mask.sum(): + raise ValueError( + "scf_household_values must contain one value per reference person" + ) + + scf_person_values = np.zeros_like(sipp_values, dtype=np.float32) + scf_person_values[reference_person_mask] = scf_household_values + use_scf = financial_asset_source_is_scf( + person_household_ids, + time_period=time_period, + ) + return np.where(use_scf, scf_person_values, sipp_values).astype(np.float32) + + def build_household_vehicle_receiver( person_df: pd.DataFrame, tenure_type: np.ndarray | None = None, diff --git a/tests/unit/test_asset_imputation.py b/tests/unit/test_asset_imputation.py index 6178450c2..2629d8831 100644 --- a/tests/unit/test_asset_imputation.py +++ b/tests/unit/test_asset_imputation.py @@ -2,7 +2,13 @@ import pandas as pd from policyengine_us_data.utils.asset_imputation import ( + NET_WORTH_COMPONENTS_ARE_COMPLETE, + UNOBSERVED_NET_WORTH_COMPONENT_GROUPS, + add_scf_financial_asset_targets, build_household_vehicle_receiver, + check_household_net_worth_reconciliation, + combine_sipp_and_scf_financial_assets, + financial_asset_source_is_scf, ) @@ -37,3 +43,130 @@ def test_build_household_vehicle_receiver_aggregates_person_inputs(): assert receiver["reference_is_female"].tolist() == [1.0, 0.0] assert receiver["reference_is_married"].tolist() == [1.0, 0.0] assert receiver["is_homeowner"].tolist() == [1.0, 0.0] + + +def test_current_net_worth_components_are_marked_incomplete(): + data = { + "net_worth": np.array([500_000.0]), + "bank_account_assets": np.array([10_000.0]), + "stock_assets": np.array([5_000.0]), + "bond_assets": np.array([1_000.0]), + "household_vehicles_value": np.array([15_000.0]), + "auto_loan_balance": np.array([2_000.0]), + } + + report = check_household_net_worth_reconciliation(data) + + assert NET_WORTH_COMPONENTS_ARE_COMPLETE is False + assert report.components_are_complete is False + assert report.is_reconciled is None + assert report.max_abs_difference is None + assert "retirement_assets" in UNOBSERVED_NET_WORTH_COMPONENT_GROUPS + assert "independently imputed SCF aggregate" in report.message + + +def test_net_worth_reconciliation_checks_complete_household_components(): + data = { + "net_worth": np.array([125.0, -10.0]), + "bank_account_assets": np.array([100.0, 10.0]), + "stock_assets": np.array([50.0, 0.0]), + "auto_loan_balance": np.array([25.0, 20.0]), + } + + report = check_household_net_worth_reconciliation( + data, + component_variables=( + "bank_account_assets", + "stock_assets", + "auto_loan_balance", + ), + components_are_complete=True, + atol=0.0, + ) + + assert report.components_are_complete is True + assert report.is_reconciled is True + assert report.max_abs_difference == 0.0 + + +def test_net_worth_reconciliation_reports_complete_component_mismatch(): + data = { + "net_worth": np.array([126.0]), + "bank_account_assets": np.array([100.0]), + "stock_assets": np.array([50.0]), + "auto_loan_balance": np.array([25.0]), + } + + report = check_household_net_worth_reconciliation( + data, + component_variables=( + "bank_account_assets", + "stock_assets", + "auto_loan_balance", + ), + components_are_complete=True, + atol=0.0, + ) + + assert report.is_reconciled is False + assert report.max_abs_difference == 1.0 + + +def test_add_scf_financial_asset_targets_builds_sipp_comparable_columns(): + scf = pd.DataFrame( + { + "liq": [100.0, 200.0], + "stocks": [10.0, 20.0], + "nmmf": [1.0, 2.0], + "bond": [5.0, 6.0], + } + ) + + targets = add_scf_financial_asset_targets(scf) + + assert targets == ( + "scf_bank_account_assets", + "scf_stock_assets", + "scf_bond_assets", + ) + assert scf["scf_bank_account_assets"].tolist() == [100.0, 200.0] + assert scf["scf_stock_assets"].tolist() == [11.0, 22.0] + assert scf["scf_bond_assets"].tolist() == [5.0, 6.0] + + +def test_financial_asset_source_draw_is_household_stable(): + household_ids = np.array([10, 10, 20, 30]) + + first = financial_asset_source_is_scf(household_ids, time_period=2024) + second = financial_asset_source_is_scf(household_ids, time_period=2024) + + assert first.tolist() == second.tolist() + assert first[0] == first[1] + + +def test_combine_sipp_and_scf_financial_assets_preserves_household_scf_total(): + person_household_ids = np.array([10, 10, 20, 20]) + reference_person_mask = np.array([True, False, True, False]) + use_scf = financial_asset_source_is_scf( + person_household_ids, + time_period=2024, + ) + + combined = combine_sipp_and_scf_financial_assets( + sipp_values=np.array([1.0, 2.0, 3.0, 4.0]), + scf_household_values=np.array([100.0, 200.0]), + person_household_ids=person_household_ids, + reference_person_mask=reference_person_mask, + time_period=2024, + ) + + for household_id, scf_total in [(10, 100.0), (20, 200.0)]: + household_mask = person_household_ids == household_id + if use_scf[household_mask][0]: + assert combined[household_mask].sum() == scf_total + assert combined[household_mask & ~reference_person_mask].sum() == 0.0 + else: + np.testing.assert_array_equal( + combined[household_mask], + np.array([1.0, 2.0, 3.0, 4.0])[household_mask], + ) From b2eda4258a9a60ce62363209778b107c9fbe05b8 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Mon, 27 Apr 2026 15:51:54 -0400 Subject: [PATCH 2/4] Add SCF net worth formula components --- docs/data.md | 15 +-- docs/methodology.md | 23 ++-- .../calibration/source_impute.py | 60 +++++++++- policyengine_us_data/datasets/cps/cps.py | 42 ++++++- .../utils/asset_imputation.py | 103 ++++++++++++++++-- tests/unit/calibration/test_source_impute.py | 3 + tests/unit/test_asset_imputation.py | 73 ++++++++++++- 7 files changed, 283 insertions(+), 36 deletions(-) diff --git a/docs/data.md b/docs/data.md index f7b703798..54b0a2c49 100644 --- a/docs/data.md +++ b/docs/data.md @@ -79,13 +79,14 @@ matter for a specific program. The SCF provides wealth and debt information that we use to impute several financial variables missing from the CPS. We match auto loan balances based on household demographics and income, then -calculate interest on auto loans from these imputed balances. We also impute `net_worth` as an SCF -aggregate. This aggregate is not reconstructible from the currently exposed asset and liability -fields because those fields combine SIPP liquid-asset and vehicle imputations with selected SCF debt -inputs and omit major balance-sheet categories such as home equity, mortgage debt, retirement -assets, business equity, other real estate, and other debts. Use the specific asset variables for -resource-tested policy rules; use `net_worth` only as a broad wealth aggregate. The SCF imputation -uses their reference person definition to ensure proper matching. +calculate interest on auto loans from these imputed balances. We also impute the SCF balance-sheet +components needed to express `net_worth` as a formula: certificates of deposit, retirement assets, +cash-value life insurance, managed assets, other financial assets, home value, other real estate, +business equity, other nonfinancial assets, mortgages, other residential debt, lines of credit, +credit card debt, student debt, other installment debt, buy-now-pay-later debt, other debt, and a +`net_worth_residual`. The residual captures remaining source and definition differences after the +SIPP/SCF asset blend, so resource-tested policy leaves are not rescaled to force the SCF aggregate. +The SCF imputation uses their reference person definition to ensure proper matching. ### American Community Survey (ACS) diff --git a/docs/methodology.md b/docs/methodology.md index db9f34aea..52376f4d2 100644 --- a/docs/methodology.md +++ b/docs/methodology.md @@ -242,20 +242,19 @@ identifiers, so these imputations are state-blind at the microdata level - geogr tip income and assets enters only through calibration weights, not through the imputed values themselves. -**SCF (Survey of Consumer Finances)**: Aggregate net worth, auto loan balances, and auto loan -interest. The SCF also lacks state identifiers, so these imputations are likewise state-blind. - -The current asset fields are a mixed-source partial balance sheet. `net_worth` is independently -imputed from the SCF aggregate and includes components that are not currently exposed in the public -CPS file, such as primary residence equity, mortgage debt, retirement assets, business equity, -other real estate, other financial assets, and other debts. The SIPP liquid-asset and vehicle fields -are policy-relevant inputs in their own right. For overlapping bank-account, stock, and bond asset +**SCF (Survey of Consumer Finances)**: Aggregate net worth, auto loan balances, auto loan interest, +and balance-sheet components needed to express net worth as a formula. The SCF also lacks state +identifiers, so these imputations are likewise state-blind. + +The asset fields are a mixed-source balance sheet. The SIPP liquid-asset and vehicle fields are +policy-relevant inputs in their own right. For overlapping bank-account, stock, and bond asset variables, we use a stable household-level 50/50 source-model draw between the SIPP QRF prediction and the comparable SCF QRF prediction, with a single draw shared across the financial-asset block. -We do not rescale these policy leaves to force them to add up to SCF `net_worth`. Therefore, -row-level reconciliation between `net_worth` and the exposed component fields is not expected. A -net-worth component diagnostic should only be enabled when the component set is explicitly intended -to be complete and household-aligned. +We then impute the non-overlapping SCF balance-sheet components - home value, mortgage debt, +retirement assets, business equity, other real estate, other financial assets, other debts, and +related categories - and compute `net_worth_residual` so that a downstream `net_worth` formula can +reconcile exactly without rescaling resource-tested policy leaves. The residual captures remaining +source and definition differences after the SIPP/SCF blend. The output of this stage is the source-imputed stratified CPS (`source_imputed_stratified_extended_cps_2024.h5`), which serves as the input to the diff --git a/policyengine_us_data/calibration/source_impute.py b/policyengine_us_data/calibration/source_impute.py index 464337866..eb2038e13 100644 --- a/policyengine_us_data/calibration/source_impute.py +++ b/policyengine_us_data/calibration/source_impute.py @@ -12,7 +12,8 @@ household_vehicles_value (no state predictor) ORG -> hourly_wage, is_paid_hourly, is_union_member_or_covered - SCF -> net_worth, auto_loan_balance, auto_loan_interest, and + SCF -> net_worth, auto_loan_balance, auto_loan_interest, + net_worth_residual, SCF-only balance-sheet components, and 50/50 source-model averaging for overlapping financial assets (no state predictor) @@ -46,10 +47,16 @@ predict_org_features, ) from policyengine_us_data.utils.asset_imputation import ( + NET_WORTH_RESIDUAL_VARIABLE, SCF_FINANCIAL_ASSET_POLICY_VARIABLES, + SCF_NET_WORTH_COMPONENT_VARIABLES, add_scf_financial_asset_targets, + add_scf_net_worth_component_targets, + aggregate_person_values_to_reference_households, + align_household_values_to_reference_households, build_household_vehicle_receiver, combine_sipp_and_scf_financial_assets, + compute_net_worth_residual, ) logger = logging.getLogger(__name__) @@ -68,12 +75,18 @@ "household_vehicles_value", ] -SCF_IMPUTED_VARIABLES = [ +SCF_AGGREGATE_IMPUTED_VARIABLES = [ "net_worth", "auto_loan_balance", "auto_loan_interest", ] +SCF_IMPUTED_VARIABLES = [ + *SCF_AGGREGATE_IMPUTED_VARIABLES, + *SCF_NET_WORTH_COMPONENT_VARIABLES, + NET_WORTH_RESIDUAL_VARIABLE, +] + ALL_SOURCE_VARIABLES = ( ACS_IMPUTED_VARIABLES + SIPP_IMPUTED_VARIABLES @@ -770,10 +783,15 @@ def _impute_scf( if "networth" in scf_df.columns and "net_worth" not in scf_df.columns: scf_df["net_worth"] = scf_df["networth"] scf_financial_asset_targets = add_scf_financial_asset_targets(scf_df) + scf_component_targets = add_scf_net_worth_component_targets(scf_df) - available_vars = [v for v in SCF_IMPUTED_VARIABLES if v in scf_df.columns] + available_vars = [ + v for v in SCF_AGGREGATE_IMPUTED_VARIABLES if v in scf_df.columns + ] qrf_vars = available_vars + [ v for v in scf_financial_asset_targets if v in scf_df.columns + ] + [ + v for v in scf_component_targets if v in scf_df.columns ] if not available_vars: logger.warning("No SCF aggregate imputed variables available. Skipping.") @@ -881,6 +899,14 @@ def _impute_scf( person_hh_ids = data.get("person_household_id", {}).get(time_period) if person_hh_ids is not None: first_person_mask = ~pd.Series(person_hh_ids).duplicated().values + reference_household_ids = person_hh_ids[first_person_mask] + for var in SCF_NET_WORTH_COMPONENT_VARIABLES: + if var in preds: + data[var] = { + time_period: preds.loc[first_person_mask, var].values.astype( + np.float32 + ) + } for scf_var, policy_var in SCF_FINANCIAL_ASSET_POLICY_VARIABLES.items(): if scf_var not in preds or policy_var not in data: continue @@ -893,6 +919,34 @@ def _impute_scf( time_period=time_period, ) } + if "net_worth" in data: + net_worth_components = {} + for var in ("bank_account_assets", "stock_assets", "bond_assets"): + if var in data: + net_worth_components[var] = ( + aggregate_person_values_to_reference_households( + data[var][time_period], + person_hh_ids, + first_person_mask, + ) + ) + if "household_vehicles_value" in data: + net_worth_components["household_vehicles_value"] = ( + align_household_values_to_reference_households( + data["household_vehicles_value"][time_period], + hh_ids, + reference_household_ids, + ) + ) + for var in SCF_NET_WORTH_COMPONENT_VARIABLES + ("auto_loan_balance",): + if var in data: + net_worth_components[var] = data[var][time_period] + data[NET_WORTH_RESIDUAL_VARIABLE] = { + time_period: compute_net_worth_residual( + net_worth=data["net_worth"][time_period], + components=net_worth_components, + ) + } del fitted, preds gc.collect() diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 40d6d7717..e1c49e2f6 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -48,10 +48,16 @@ reported_subsidized_marketplace_by_tax_unit, ) from policyengine_us_data.utils.asset_imputation import ( + NET_WORTH_RESIDUAL_VARIABLE, SCF_FINANCIAL_ASSET_POLICY_VARIABLES, + SCF_NET_WORTH_COMPONENT_VARIABLES, add_scf_financial_asset_targets, + add_scf_net_worth_component_targets, + aggregate_person_values_to_reference_households, + align_household_values_to_reference_households, build_household_vehicle_receiver, combine_sipp_and_scf_financial_assets, + compute_net_worth_residual, ) from policyengine_us_data.utils.policyengine import ( supports_medicare_enrollment_input, @@ -2549,10 +2555,9 @@ def determine_reference_person(group): reference_persons = person_data[mask] receiver_data["is_married"] = reference_persons.A_MARITL.isin([1, 2]).values - # Impute SCF net_worth as an aggregate, selected auto-loan fields, and - # SCF equivalents for overlapping financial asset leaves. - # Current public asset components are partial and mixed-source, so row-level - # net_worth identity checks are not expected to pass. + # Impute SCF net_worth as an aggregate, selected auto-loan fields, SCF + # equivalents for overlapping financial asset leaves, and SCF-only + # balance-sheet leaves needed to make net_worth a formula with a residual. from policyengine_us_data.datasets.scf.scf import SCF_2022 scf_dataset = SCF_2022() @@ -2570,11 +2575,12 @@ def determine_reference_person(group): "social_security_pension_income", ] scf_financial_asset_targets = add_scf_financial_asset_targets(scf_data) + scf_component_targets = add_scf_net_worth_component_targets(scf_data) IMPUTED_VARIABLES = [ "networth", "auto_loan_balance", "auto_loan_interest", - ] + list(scf_financial_asset_targets) + ] + list(scf_financial_asset_targets) + list(scf_component_targets) weights = ["wgt"] donor_data = scf_data[PREDICTORS + IMPUTED_VARIABLES + weights].copy() @@ -2619,6 +2625,32 @@ def determine_reference_person(group): cps["net_worth"] = cps["networth"] del cps["networth"] + reference_household_ids = original_person_household_ids[mask] + net_worth_components = {} + for variable in ("bank_account_assets", "stock_assets", "bond_assets"): + if variable in cps: + net_worth_components[variable] = ( + aggregate_person_values_to_reference_households( + cps[variable], + original_person_household_ids, + mask, + ) + ) + if "household_vehicles_value" in cps_data and "household_id" in cps_data: + net_worth_components["household_vehicles_value"] = ( + align_household_values_to_reference_households( + cps_data["household_vehicles_value"], + cps_data["household_id"], + reference_household_ids, + ) + ) + for variable in SCF_NET_WORTH_COMPONENT_VARIABLES + ("auto_loan_balance",): + if variable in cps: + net_worth_components[variable] = cps[variable] + cps[NET_WORTH_RESIDUAL_VARIABLE] = compute_net_worth_residual( + net_worth=cps["net_worth"], + components=net_worth_components, + ) self.save_dataset(cps) diff --git a/policyengine_us_data/utils/asset_imputation.py b/policyengine_us_data/utils/asset_imputation.py index a2523cc8c..a215d679b 100644 --- a/policyengine_us_data/utils/asset_imputation.py +++ b/policyengine_us_data/utils/asset_imputation.py @@ -13,6 +13,7 @@ ) SIPP_VEHICLE_ASSET_VARIABLES = ("household_vehicles_value",) SCF_NET_WORTH_VARIABLE = "net_worth" +NET_WORTH_RESIDUAL_VARIABLE = "net_worth_residual" SCF_BALANCE_SHEET_DEBT_VARIABLES = ("auto_loan_balance",) SCF_FINANCIAL_ASSET_TARGETS = { "scf_bank_account_assets": ("liq",), @@ -24,23 +25,48 @@ "scf_stock_assets": "stock_assets", "scf_bond_assets": "bond_assets", } +SCF_NET_WORTH_COMPONENT_TARGETS = { + "scf_certificates_of_deposit": ("cds",), + "scf_retirement_assets": ("retqliq",), + "scf_cash_value_life_insurance": ("cashli",), + "scf_other_managed_assets": ("othma",), + "scf_other_financial_assets": ("othfin",), + "scf_primary_residence_value": ("houses",), + "scf_other_residential_real_estate": ("oresre",), + "scf_nonresidential_real_estate_equity": ("nnresre",), + "scf_business_equity": ("bus",), + "scf_other_nonfinancial_assets": ("othnfin",), + "scf_mortgage_debt": ("mrthel",), + "scf_other_residential_debt": ("resdbt",), + "scf_other_lines_of_credit": ("othloc",), + "scf_credit_card_debt": ("ccbal",), + "scf_student_loan_debt": ("edn_inst",), + "scf_other_installment_debt": ("oth_inst",), + "scf_buy_now_pay_later_debt": ("bnpl",), + "scf_other_debt": ("odebt",), +} +SCF_NET_WORTH_COMPONENT_VARIABLES = tuple(SCF_NET_WORTH_COMPONENT_TARGETS) EXPOSED_NET_WORTH_COMPONENT_VARIABLES = ( SIPP_LIQUID_ASSET_VARIABLES + SIPP_VEHICLE_ASSET_VARIABLES + + SCF_NET_WORTH_COMPONENT_VARIABLES + SCF_BALANCE_SHEET_DEBT_VARIABLES + + (NET_WORTH_RESIDUAL_VARIABLE,) ) NET_WORTH_COMPONENT_SIGNS = { "auto_loan_balance": -1.0, + "scf_mortgage_debt": -1.0, + "scf_other_residential_debt": -1.0, + "scf_other_lines_of_credit": -1.0, + "scf_credit_card_debt": -1.0, + "scf_student_loan_debt": -1.0, + "scf_other_installment_debt": -1.0, + "scf_buy_now_pay_later_debt": -1.0, + "scf_other_debt": -1.0, } UNOBSERVED_NET_WORTH_COMPONENT_GROUPS = ( - "primary_residence_value", - "mortgage_debt", - "retirement_assets", - "business_equity", - "other_real_estate", - "other_financial_assets", - "other_debts", + "SCF/SIPP source and definition differences captured in net_worth_residual", ) NET_WORTH_COMPONENTS_ARE_COMPLETE = False FINANCIAL_ASSET_SOURCE_SCF_PROBABILITY = 0.5 @@ -146,8 +172,20 @@ def check_household_net_worth_reconciliation( def add_scf_financial_asset_targets(scf: pd.DataFrame) -> tuple[str, ...]: """Add SCF financial asset targets comparable to SIPP policy leaves.""" + return _add_scf_targets(scf, SCF_FINANCIAL_ASSET_TARGETS) + + +def add_scf_net_worth_component_targets(scf: pd.DataFrame) -> tuple[str, ...]: + """Add SCF-only balance-sheet targets needed for a net worth formula.""" + return _add_scf_targets(scf, SCF_NET_WORTH_COMPONENT_TARGETS) + + +def _add_scf_targets( + scf: pd.DataFrame, + target_map: Mapping[str, tuple[str, ...]], +) -> tuple[str, ...]: added_targets = [] - for target, source_columns in SCF_FINANCIAL_ASSET_TARGETS.items(): + for target, source_columns in target_map.items(): if all(column in scf.columns for column in source_columns): scf[target] = sum(scf[column].fillna(0) for column in source_columns) added_targets.append(target) @@ -225,6 +263,55 @@ def combine_sipp_and_scf_financial_assets( return np.where(use_scf, scf_person_values, sipp_values).astype(np.float32) +def aggregate_person_values_to_reference_households( + person_values: Sequence[float], + person_household_ids: Sequence, + reference_person_mask: Sequence[bool], +) -> np.ndarray: + """Aggregate person values to households in reference-person order.""" + person_values = np.asarray(person_values, dtype=np.float32) + person_household_ids = np.asarray(person_household_ids) + reference_person_mask = np.asarray(reference_person_mask, dtype=bool) + reference_household_ids = person_household_ids[reference_person_mask] + totals = pd.Series(person_values).groupby(person_household_ids).sum() + return totals.reindex(reference_household_ids).fillna(0).to_numpy(dtype=np.float32) + + +def align_household_values_to_reference_households( + household_values: Sequence[float], + household_ids: Sequence, + reference_household_ids: Sequence, +) -> np.ndarray: + """Align household values from household-id order to reference-person order.""" + household_values = np.asarray(household_values, dtype=np.float32) + household_ids = np.asarray(household_ids) + reference_household_ids = np.asarray(reference_household_ids) + values = pd.Series(household_values, index=household_ids) + return values.reindex(reference_household_ids).fillna(0).to_numpy(dtype=np.float32) + + +def compute_net_worth_residual( + *, + net_worth: Sequence[float], + components: Mapping[str, Sequence[float]], + component_signs: Mapping[str, float] = NET_WORTH_COMPONENT_SIGNS, +) -> np.ndarray: + """Compute the residual that makes net worth reconcile exactly.""" + net_worth = np.asarray(net_worth, dtype=np.float32) + component_total = np.zeros_like(net_worth, dtype=np.float32) + + for variable, values in components.items(): + values = np.asarray(values, dtype=np.float32) + if values.shape != net_worth.shape: + raise ValueError( + f"{variable} has shape {values.shape}, but net_worth has " + f"shape {net_worth.shape}." + ) + component_total += component_signs.get(variable, 1.0) * values + + return (net_worth - component_total).astype(np.float32) + + def build_household_vehicle_receiver( person_df: pd.DataFrame, tenure_type: np.ndarray | None = None, diff --git a/tests/unit/calibration/test_source_impute.py b/tests/unit/calibration/test_source_impute.py index 7324351e6..893dd5704 100644 --- a/tests/unit/calibration/test_source_impute.py +++ b/tests/unit/calibration/test_source_impute.py @@ -87,6 +87,9 @@ def test_scf_variables_defined(self): assert "net_worth" in SCF_IMPUTED_VARIABLES assert "auto_loan_balance" in SCF_IMPUTED_VARIABLES assert "auto_loan_interest" in SCF_IMPUTED_VARIABLES + assert "scf_retirement_assets" in SCF_IMPUTED_VARIABLES + assert "scf_mortgage_debt" in SCF_IMPUTED_VARIABLES + assert "net_worth_residual" in SCF_IMPUTED_VARIABLES def test_org_variables_defined(self): assert "hourly_wage" in ORG_IMPUTED_VARIABLES diff --git a/tests/unit/test_asset_imputation.py b/tests/unit/test_asset_imputation.py index 2629d8831..add359ad3 100644 --- a/tests/unit/test_asset_imputation.py +++ b/tests/unit/test_asset_imputation.py @@ -3,11 +3,16 @@ from policyengine_us_data.utils.asset_imputation import ( NET_WORTH_COMPONENTS_ARE_COMPLETE, + NET_WORTH_RESIDUAL_VARIABLE, UNOBSERVED_NET_WORTH_COMPONENT_GROUPS, add_scf_financial_asset_targets, + add_scf_net_worth_component_targets, + aggregate_person_values_to_reference_households, + align_household_values_to_reference_households, build_household_vehicle_receiver, check_household_net_worth_reconciliation, combine_sipp_and_scf_financial_assets, + compute_net_worth_residual, financial_asset_source_is_scf, ) @@ -61,7 +66,7 @@ def test_current_net_worth_components_are_marked_incomplete(): assert report.components_are_complete is False assert report.is_reconciled is None assert report.max_abs_difference is None - assert "retirement_assets" in UNOBSERVED_NET_WORTH_COMPONENT_GROUPS + assert "net_worth_residual" in UNOBSERVED_NET_WORTH_COMPONENT_GROUPS[0] assert "independently imputed SCF aggregate" in report.message @@ -134,6 +139,38 @@ def test_add_scf_financial_asset_targets_builds_sipp_comparable_columns(): assert scf["scf_bond_assets"].tolist() == [5.0, 6.0] +def test_add_scf_net_worth_component_targets_builds_formula_columns(): + scf = pd.DataFrame( + { + "cds": [1.0], + "retqliq": [2.0], + "cashli": [3.0], + "othma": [4.0], + "othfin": [5.0], + "houses": [100.0], + "oresre": [20.0], + "nnresre": [30.0], + "bus": [40.0], + "othnfin": [6.0], + "mrthel": [50.0], + "resdbt": [7.0], + "othloc": [8.0], + "ccbal": [9.0], + "edn_inst": [10.0], + "oth_inst": [11.0], + "bnpl": [12.0], + "odebt": [13.0], + } + ) + + targets = add_scf_net_worth_component_targets(scf) + + assert "scf_retirement_assets" in targets + assert "scf_mortgage_debt" in targets + assert scf["scf_retirement_assets"].tolist() == [2.0] + assert scf["scf_mortgage_debt"].tolist() == [50.0] + + def test_financial_asset_source_draw_is_household_stable(): household_ids = np.array([10, 10, 20, 30]) @@ -170,3 +207,37 @@ def test_combine_sipp_and_scf_financial_assets_preserves_household_scf_total(): combined[household_mask], np.array([1.0, 2.0, 3.0, 4.0])[household_mask], ) + + +def test_aggregate_and_align_household_components(): + person_household_ids = np.array([20, 10, 20, 10]) + reference_person_mask = np.array([True, True, False, False]) + + aggregated = aggregate_person_values_to_reference_households( + [1.0, 2.0, 3.0, 4.0], + person_household_ids, + reference_person_mask, + ) + aligned = align_household_values_to_reference_households( + household_values=[100.0, 200.0], + household_ids=np.array([10, 20]), + reference_household_ids=person_household_ids[reference_person_mask], + ) + + assert aggregated.tolist() == [4.0, 6.0] + assert aligned.tolist() == [200.0, 100.0] + + +def test_compute_net_worth_residual_makes_formula_exact(): + residual = compute_net_worth_residual( + net_worth=np.array([1_000.0]), + components={ + "bank_account_assets": np.array([100.0]), + "scf_retirement_assets": np.array([300.0]), + "auto_loan_balance": np.array([50.0]), + "scf_mortgage_debt": np.array([200.0]), + }, + ) + + assert NET_WORTH_RESIDUAL_VARIABLE == "net_worth_residual" + assert residual.tolist() == [850.0] From 9d527b20213b17a00754127727a7d0ad4f5ac4d6 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Mon, 27 Apr 2026 15:56:38 -0400 Subject: [PATCH 3/4] Align SCF components to net worth definition --- docs/data.md | 8 ++++---- policyengine_us_data/utils/asset_imputation.py | 3 +-- tests/unit/test_asset_imputation.py | 5 ++++- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/docs/data.md b/docs/data.md index 54b0a2c49..8dbc85146 100644 --- a/docs/data.md +++ b/docs/data.md @@ -80,10 +80,10 @@ matter for a specific program. The SCF provides wealth and debt information that we use to impute several financial variables missing from the CPS. We match auto loan balances based on household demographics and income, then calculate interest on auto loans from these imputed balances. We also impute the SCF balance-sheet -components needed to express `net_worth` as a formula: certificates of deposit, retirement assets, -cash-value life insurance, managed assets, other financial assets, home value, other real estate, -business equity, other nonfinancial assets, mortgages, other residential debt, lines of credit, -credit card debt, student debt, other installment debt, buy-now-pay-later debt, other debt, and a +components needed to express `net_worth` as a formula: certificates of deposit, savings bonds, +retirement assets, cash-value life insurance, managed assets, other financial assets, home value, +other real estate, business equity, other nonfinancial assets, mortgages, other residential debt, +lines of credit, credit card debt, student debt, other installment debt, other debt, and a `net_worth_residual`. The residual captures remaining source and definition differences after the SIPP/SCF asset blend, so resource-tested policy leaves are not rescaled to force the SCF aggregate. The SCF imputation uses their reference person definition to ensure proper matching. diff --git a/policyengine_us_data/utils/asset_imputation.py b/policyengine_us_data/utils/asset_imputation.py index a215d679b..3172884c8 100644 --- a/policyengine_us_data/utils/asset_imputation.py +++ b/policyengine_us_data/utils/asset_imputation.py @@ -27,6 +27,7 @@ } SCF_NET_WORTH_COMPONENT_TARGETS = { "scf_certificates_of_deposit": ("cds",), + "scf_savings_bonds": ("savbnd",), "scf_retirement_assets": ("retqliq",), "scf_cash_value_life_insurance": ("cashli",), "scf_other_managed_assets": ("othma",), @@ -42,7 +43,6 @@ "scf_credit_card_debt": ("ccbal",), "scf_student_loan_debt": ("edn_inst",), "scf_other_installment_debt": ("oth_inst",), - "scf_buy_now_pay_later_debt": ("bnpl",), "scf_other_debt": ("odebt",), } SCF_NET_WORTH_COMPONENT_VARIABLES = tuple(SCF_NET_WORTH_COMPONENT_TARGETS) @@ -62,7 +62,6 @@ "scf_credit_card_debt": -1.0, "scf_student_loan_debt": -1.0, "scf_other_installment_debt": -1.0, - "scf_buy_now_pay_later_debt": -1.0, "scf_other_debt": -1.0, } UNOBSERVED_NET_WORTH_COMPONENT_GROUPS = ( diff --git a/tests/unit/test_asset_imputation.py b/tests/unit/test_asset_imputation.py index add359ad3..f0fab9cd5 100644 --- a/tests/unit/test_asset_imputation.py +++ b/tests/unit/test_asset_imputation.py @@ -143,6 +143,7 @@ def test_add_scf_net_worth_component_targets_builds_formula_columns(): scf = pd.DataFrame( { "cds": [1.0], + "savbnd": [1.5], "retqliq": [2.0], "cashli": [3.0], "othma": [4.0], @@ -158,15 +159,17 @@ def test_add_scf_net_worth_component_targets_builds_formula_columns(): "ccbal": [9.0], "edn_inst": [10.0], "oth_inst": [11.0], - "bnpl": [12.0], "odebt": [13.0], } ) targets = add_scf_net_worth_component_targets(scf) + assert "scf_savings_bonds" in targets assert "scf_retirement_assets" in targets assert "scf_mortgage_debt" in targets + assert "scf_buy_now_pay_later_debt" not in targets + assert scf["scf_savings_bonds"].tolist() == [1.5] assert scf["scf_retirement_assets"].tolist() == [2.0] assert scf["scf_mortgage_debt"].tolist() == [50.0] From 559087253eafb6bd1b7f0ff5faaf8ba639f059d3 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Mon, 27 Apr 2026 16:14:06 -0400 Subject: [PATCH 4/4] Compute net worth from formula components --- docs/data.md | 8 +- docs/methodology.md | 7 +- .../calibration/source_impute.py | 75 +++++++++--------- policyengine_us_data/datasets/cps/cps.py | 24 +++--- .../utils/asset_imputation.py | 76 ++++++++++++------- tests/unit/calibration/test_source_impute.py | 2 +- tests/unit/datasets/test_cps_file_handles.py | 31 +++++++- tests/unit/test_asset_imputation.py | 27 +++---- 8 files changed, 148 insertions(+), 102 deletions(-) diff --git a/docs/data.md b/docs/data.md index 8dbc85146..5fd11c4ae 100644 --- a/docs/data.md +++ b/docs/data.md @@ -83,10 +83,10 @@ calculate interest on auto loans from these imputed balances. We also impute the components needed to express `net_worth` as a formula: certificates of deposit, savings bonds, retirement assets, cash-value life insurance, managed assets, other financial assets, home value, other real estate, business equity, other nonfinancial assets, mortgages, other residential debt, -lines of credit, credit card debt, student debt, other installment debt, other debt, and a -`net_worth_residual`. The residual captures remaining source and definition differences after the -SIPP/SCF asset blend, so resource-tested policy leaves are not rescaled to force the SCF aggregate. -The SCF imputation uses their reference person definition to ensure proper matching. +lines of credit, credit card debt, vehicle installment debt, student debt, other installment debt, +and other debt. We compute `net_worth` from these components and the final SIPP/SCF-blended policy +leaves rather than rescaling resource-tested policy leaves to force an independently imputed SCF +aggregate. The SCF imputation uses their reference person definition to ensure proper matching. ### American Community Survey (ACS) diff --git a/docs/methodology.md b/docs/methodology.md index 52376f4d2..c0b9b7d93 100644 --- a/docs/methodology.md +++ b/docs/methodology.md @@ -252,9 +252,10 @@ variables, we use a stable household-level 50/50 source-model draw between the S and the comparable SCF QRF prediction, with a single draw shared across the financial-asset block. We then impute the non-overlapping SCF balance-sheet components - home value, mortgage debt, retirement assets, business equity, other real estate, other financial assets, other debts, and -related categories - and compute `net_worth_residual` so that a downstream `net_worth` formula can -reconcile exactly without rescaling resource-tested policy leaves. The residual captures remaining -source and definition differences after the SIPP/SCF blend. +related categories including vehicle, student, and other installment debt - and compute `net_worth` +from those components and the final SIPP/SCF-blended policy leaves. This gives downstream code a +direct component formula without an accounting residual or rescaling of resource-tested policy +leaves. The output of this stage is the source-imputed stratified CPS (`source_imputed_stratified_extended_cps_2024.h5`), which serves as the input to the diff --git a/policyengine_us_data/calibration/source_impute.py b/policyengine_us_data/calibration/source_impute.py index eb2038e13..bcf9ce84f 100644 --- a/policyengine_us_data/calibration/source_impute.py +++ b/policyengine_us_data/calibration/source_impute.py @@ -13,7 +13,7 @@ ORG -> hourly_wage, is_paid_hourly, is_union_member_or_covered SCF -> net_worth, auto_loan_balance, auto_loan_interest, - net_worth_residual, SCF-only balance-sheet components, and + SCF-only balance-sheet components, and 50/50 source-model averaging for overlapping financial assets (no state predictor) @@ -47,7 +47,6 @@ predict_org_features, ) from policyengine_us_data.utils.asset_imputation import ( - NET_WORTH_RESIDUAL_VARIABLE, SCF_FINANCIAL_ASSET_POLICY_VARIABLES, SCF_NET_WORTH_COMPONENT_VARIABLES, add_scf_financial_asset_targets, @@ -56,7 +55,8 @@ align_household_values_to_reference_households, build_household_vehicle_receiver, combine_sipp_and_scf_financial_assets, - compute_net_worth_residual, + compute_net_worth_from_components, + require_scf_net_worth_formula_targets, ) logger = logging.getLogger(__name__) @@ -75,16 +75,15 @@ "household_vehicles_value", ] -SCF_AGGREGATE_IMPUTED_VARIABLES = [ - "net_worth", +SCF_CORE_IMPUTED_VARIABLES = [ "auto_loan_balance", "auto_loan_interest", ] SCF_IMPUTED_VARIABLES = [ - *SCF_AGGREGATE_IMPUTED_VARIABLES, + "net_worth", + *SCF_CORE_IMPUTED_VARIABLES, *SCF_NET_WORTH_COMPONENT_VARIABLES, - NET_WORTH_RESIDUAL_VARIABLE, ] ALL_SOURCE_VARIABLES = ( @@ -780,21 +779,23 @@ def _impute_scf( logger.warning("SCF missing predictors: %s", missing_preds) scf_predictors = available_preds - if "networth" in scf_df.columns and "net_worth" not in scf_df.columns: - scf_df["net_worth"] = scf_df["networth"] scf_financial_asset_targets = add_scf_financial_asset_targets(scf_df) scf_component_targets = add_scf_net_worth_component_targets(scf_df) + require_scf_net_worth_formula_targets( + scf_financial_asset_targets=scf_financial_asset_targets, + scf_component_targets=scf_component_targets, + ) available_vars = [ - v for v in SCF_AGGREGATE_IMPUTED_VARIABLES if v in scf_df.columns + v for v in SCF_CORE_IMPUTED_VARIABLES if v in scf_df.columns ] qrf_vars = available_vars + [ v for v in scf_financial_asset_targets if v in scf_df.columns ] + [ v for v in scf_component_targets if v in scf_df.columns ] - if not available_vars: - logger.warning("No SCF aggregate imputed variables available. Skipping.") + if not qrf_vars: + logger.warning("No SCF imputed variables available. Skipping.") return data weights = scf_df.get("wgt") @@ -919,39 +920,35 @@ def _impute_scf( time_period=time_period, ) } - if "net_worth" in data: - net_worth_components = {} - for var in ("bank_account_assets", "stock_assets", "bond_assets"): - if var in data: - net_worth_components[var] = ( - aggregate_person_values_to_reference_households( - data[var][time_period], - person_hh_ids, - first_person_mask, - ) - ) - if "household_vehicles_value" in data: - net_worth_components["household_vehicles_value"] = ( - align_household_values_to_reference_households( - data["household_vehicles_value"][time_period], - hh_ids, - reference_household_ids, - ) + net_worth_components = {} + for var in ("bank_account_assets", "stock_assets", "bond_assets"): + if var in data: + net_worth_components[var] = aggregate_person_values_to_reference_households( + data[var][time_period], + person_hh_ids, + first_person_mask, ) - for var in SCF_NET_WORTH_COMPONENT_VARIABLES + ("auto_loan_balance",): - if var in data: - net_worth_components[var] = data[var][time_period] - data[NET_WORTH_RESIDUAL_VARIABLE] = { - time_period: compute_net_worth_residual( - net_worth=data["net_worth"][time_period], - components=net_worth_components, + if "household_vehicles_value" in data: + net_worth_components["household_vehicles_value"] = ( + align_household_values_to_reference_households( + data["household_vehicles_value"][time_period], + hh_ids, + reference_household_ids, ) - } + ) + for var in SCF_NET_WORTH_COMPONENT_VARIABLES: + if var in data: + net_worth_components[var] = data[var][time_period] + data["net_worth"] = { + time_period: compute_net_worth_from_components( + components=net_worth_components, + ) + } del fitted, preds gc.collect() - logger.info("SCF imputation complete: %s", available_vars) + logger.info("SCF imputation complete: %s", SCF_IMPUTED_VARIABLES) return data diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index e1c49e2f6..aa263a3de 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -48,7 +48,6 @@ reported_subsidized_marketplace_by_tax_unit, ) from policyengine_us_data.utils.asset_imputation import ( - NET_WORTH_RESIDUAL_VARIABLE, SCF_FINANCIAL_ASSET_POLICY_VARIABLES, SCF_NET_WORTH_COMPONENT_VARIABLES, add_scf_financial_asset_targets, @@ -57,7 +56,8 @@ align_household_values_to_reference_households, build_household_vehicle_receiver, combine_sipp_and_scf_financial_assets, - compute_net_worth_residual, + compute_net_worth_from_components, + require_scf_net_worth_formula_targets, ) from policyengine_us_data.utils.policyengine import ( supports_medicare_enrollment_input, @@ -2555,9 +2555,9 @@ def determine_reference_person(group): reference_persons = person_data[mask] receiver_data["is_married"] = reference_persons.A_MARITL.isin([1, 2]).values - # Impute SCF net_worth as an aggregate, selected auto-loan fields, SCF - # equivalents for overlapping financial asset leaves, and SCF-only - # balance-sheet leaves needed to make net_worth a formula with a residual. + # Impute selected auto-loan fields, SCF equivalents for overlapping + # financial asset leaves, and SCF-only balance-sheet leaves. We compute + # net_worth from those components rather than storing an SCF aggregate. from policyengine_us_data.datasets.scf.scf import SCF_2022 scf_dataset = SCF_2022() @@ -2576,8 +2576,11 @@ def determine_reference_person(group): ] scf_financial_asset_targets = add_scf_financial_asset_targets(scf_data) scf_component_targets = add_scf_net_worth_component_targets(scf_data) + require_scf_net_worth_formula_targets( + scf_financial_asset_targets=scf_financial_asset_targets, + scf_component_targets=scf_component_targets, + ) IMPUTED_VARIABLES = [ - "networth", "auto_loan_balance", "auto_loan_interest", ] + list(scf_financial_asset_targets) + list(scf_component_targets) @@ -2623,8 +2626,6 @@ def determine_reference_person(group): if scf_var in cps: del cps[scf_var] - cps["net_worth"] = cps["networth"] - del cps["networth"] reference_household_ids = original_person_household_ids[mask] net_worth_components = {} for variable in ("bank_account_assets", "stock_assets", "bond_assets"): @@ -2644,12 +2645,11 @@ def determine_reference_person(group): reference_household_ids, ) ) - for variable in SCF_NET_WORTH_COMPONENT_VARIABLES + ("auto_loan_balance",): + for variable in SCF_NET_WORTH_COMPONENT_VARIABLES: if variable in cps: net_worth_components[variable] = cps[variable] - cps[NET_WORTH_RESIDUAL_VARIABLE] = compute_net_worth_residual( - net_worth=cps["net_worth"], - components=net_worth_components, + cps["net_worth"] = compute_net_worth_from_components( + components=net_worth_components ) self.save_dataset(cps) diff --git a/policyengine_us_data/utils/asset_imputation.py b/policyengine_us_data/utils/asset_imputation.py index 3172884c8..117fc0b26 100644 --- a/policyengine_us_data/utils/asset_imputation.py +++ b/policyengine_us_data/utils/asset_imputation.py @@ -13,8 +13,6 @@ ) SIPP_VEHICLE_ASSET_VARIABLES = ("household_vehicles_value",) SCF_NET_WORTH_VARIABLE = "net_worth" -NET_WORTH_RESIDUAL_VARIABLE = "net_worth_residual" -SCF_BALANCE_SHEET_DEBT_VARIABLES = ("auto_loan_balance",) SCF_FINANCIAL_ASSET_TARGETS = { "scf_bank_account_assets": ("liq",), "scf_stock_assets": ("stocks", "nmmf"), @@ -41,6 +39,7 @@ "scf_other_residential_debt": ("resdbt",), "scf_other_lines_of_credit": ("othloc",), "scf_credit_card_debt": ("ccbal",), + "scf_vehicle_installment_debt": ("veh_inst",), "scf_student_loan_debt": ("edn_inst",), "scf_other_installment_debt": ("oth_inst",), "scf_other_debt": ("odebt",), @@ -51,8 +50,6 @@ SIPP_LIQUID_ASSET_VARIABLES + SIPP_VEHICLE_ASSET_VARIABLES + SCF_NET_WORTH_COMPONENT_VARIABLES - + SCF_BALANCE_SHEET_DEBT_VARIABLES - + (NET_WORTH_RESIDUAL_VARIABLE,) ) NET_WORTH_COMPONENT_SIGNS = { "auto_loan_balance": -1.0, @@ -60,14 +57,13 @@ "scf_other_residential_debt": -1.0, "scf_other_lines_of_credit": -1.0, "scf_credit_card_debt": -1.0, + "scf_vehicle_installment_debt": -1.0, "scf_student_loan_debt": -1.0, "scf_other_installment_debt": -1.0, "scf_other_debt": -1.0, } -UNOBSERVED_NET_WORTH_COMPONENT_GROUPS = ( - "SCF/SIPP source and definition differences captured in net_worth_residual", -) -NET_WORTH_COMPONENTS_ARE_COMPLETE = False +UNOBSERVED_NET_WORTH_COMPONENT_GROUPS = () +NET_WORTH_COMPONENTS_ARE_COMPLETE = True FINANCIAL_ASSET_SOURCE_SCF_PROBABILITY = 0.5 @@ -96,11 +92,9 @@ def check_household_net_worth_reconciliation( ) -> NetWorthReconciliationReport: """Check whether household net worth equals signed balance-sheet components. - The current CPS asset fields are intentionally not a complete balance sheet: - liquid assets and vehicles are imputed from SIPP, while net worth and auto - loan balances are imputed from SCF. Leave ``components_are_complete`` false - for current public datasets. Set it to true only for a household-aligned data - frame whose component variables are intended to exhaust net worth. + The current CPS asset fields use blended SIPP/SCF liquid assets plus SCF-only + balance-sheet components. They are intended to reconstruct net worth without + an accounting residual when aligned to household rows. """ component_variables = tuple(component_variables) available_components = tuple( @@ -119,9 +113,8 @@ def check_household_net_worth_reconciliation( max_abs_difference=None, is_reconciled=None, message=( - "Net worth is an independently imputed SCF aggregate. The " - "available SIPP/SCF asset fields are partial and should not be " - "expected to reconstruct it." + "Net worth component reconciliation was skipped because the " + "component set was marked incomplete." ), ) @@ -179,6 +172,28 @@ def add_scf_net_worth_component_targets(scf: pd.DataFrame) -> tuple[str, ...]: return _add_scf_targets(scf, SCF_NET_WORTH_COMPONENT_TARGETS) +def require_scf_net_worth_formula_targets( + *, + scf_financial_asset_targets: Sequence[str], + scf_component_targets: Sequence[str], +) -> None: + """Fail loudly if the SCF source cannot supply the formula leaves.""" + available_targets = set(scf_financial_asset_targets) | set(scf_component_targets) + missing_targets = [ + target + for target in ( + *SCF_FINANCIAL_ASSET_TARGETS, + *SCF_NET_WORTH_COMPONENT_TARGETS, + ) + if target not in available_targets + ] + if missing_targets: + raise KeyError( + "SCF data is missing source columns needed to build these net " + f"worth formula targets: {', '.join(missing_targets)}" + ) + + def _add_scf_targets( scf: pd.DataFrame, target_map: Mapping[str, tuple[str, ...]], @@ -289,26 +304,33 @@ def align_household_values_to_reference_households( return values.reindex(reference_household_ids).fillna(0).to_numpy(dtype=np.float32) -def compute_net_worth_residual( +def compute_net_worth_from_components( *, - net_worth: Sequence[float], components: Mapping[str, Sequence[float]], component_signs: Mapping[str, float] = NET_WORTH_COMPONENT_SIGNS, ) -> np.ndarray: - """Compute the residual that makes net worth reconcile exactly.""" - net_worth = np.asarray(net_worth, dtype=np.float32) - component_total = np.zeros_like(net_worth, dtype=np.float32) - - for variable, values in components.items(): + """Compute household net worth from signed balance-sheet components.""" + iterator = iter(components.items()) + try: + first_variable, first_values = next(iterator) + except StopIteration: + return np.array([], dtype=np.float32) + + first_values = np.asarray(first_values, dtype=np.float32) + component_total = ( + component_signs.get(first_variable, 1.0) * first_values + ).astype(np.float32) + + for variable, values in iterator: values = np.asarray(values, dtype=np.float32) - if values.shape != net_worth.shape: + if values.shape != component_total.shape: raise ValueError( - f"{variable} has shape {values.shape}, but net_worth has " - f"shape {net_worth.shape}." + f"{variable} has shape {values.shape}, but expected " + f"{component_total.shape}." ) component_total += component_signs.get(variable, 1.0) * values - return (net_worth - component_total).astype(np.float32) + return component_total.astype(np.float32) def build_household_vehicle_receiver( diff --git a/tests/unit/calibration/test_source_impute.py b/tests/unit/calibration/test_source_impute.py index 893dd5704..188141753 100644 --- a/tests/unit/calibration/test_source_impute.py +++ b/tests/unit/calibration/test_source_impute.py @@ -88,8 +88,8 @@ def test_scf_variables_defined(self): assert "auto_loan_balance" in SCF_IMPUTED_VARIABLES assert "auto_loan_interest" in SCF_IMPUTED_VARIABLES assert "scf_retirement_assets" in SCF_IMPUTED_VARIABLES + assert "scf_vehicle_installment_debt" in SCF_IMPUTED_VARIABLES assert "scf_mortgage_debt" in SCF_IMPUTED_VARIABLES - assert "net_worth_residual" in SCF_IMPUTED_VARIABLES def test_org_variables_defined(self): assert "hourly_wage" in ORG_IMPUTED_VARIABLES diff --git a/tests/unit/datasets/test_cps_file_handles.py b/tests/unit/datasets/test_cps_file_handles.py index db5add17f..d5e223e38 100644 --- a/tests/unit/datasets/test_cps_file_handles.py +++ b/tests/unit/datasets/test_cps_file_handles.py @@ -189,7 +189,29 @@ def load_dataset(self): "employment_income": np.array([35_000.0, 20_000.0]), "interest_dividend_income": np.array([100.0, 50.0]), "social_security_pension_income": np.array([0.0, 0.0]), - "networth": np.array([10_000.0, 5_000.0]), + "liq": np.array([0.0, 0.0]), + "stocks": np.array([0.0, 0.0]), + "nmmf": np.array([0.0, 0.0]), + "bond": np.array([0.0, 0.0]), + "cds": np.array([12_000.0, 6_000.0]), + "savbnd": np.array([0.0, 0.0]), + "retqliq": np.array([0.0, 0.0]), + "cashli": np.array([0.0, 0.0]), + "othma": np.array([0.0, 0.0]), + "othfin": np.array([0.0, 0.0]), + "houses": np.array([0.0, 0.0]), + "oresre": np.array([0.0, 0.0]), + "nnresre": np.array([0.0, 0.0]), + "bus": np.array([0.0, 0.0]), + "othnfin": np.array([0.0, 0.0]), + "mrthel": np.array([0.0, 0.0]), + "resdbt": np.array([0.0, 0.0]), + "othloc": np.array([0.0, 0.0]), + "ccbal": np.array([0.0, 0.0]), + "veh_inst": np.array([2_000.0, 1_000.0]), + "edn_inst": np.array([0.0, 0.0]), + "oth_inst": np.array([0.0, 0.0]), + "odebt": np.array([0.0, 0.0]), "auto_loan_balance": np.array([2_000.0, 1_000.0]), "auto_loan_interest": np.array([200.0, 100.0]), "wgt": np.array([1.0, 1.0]), @@ -211,13 +233,16 @@ def fit( def predict(self, X_test): assert X_test["is_married"].tolist() == [True, False] - return pd.DataFrame( + values = {var: [0.0, 0.0] for var in self.imputed_variables} + values.update( { - "networth": [10_000.0, 5_000.0], + "scf_certificates_of_deposit": [12_000.0, 6_000.0], + "scf_vehicle_installment_debt": [2_000.0, 1_000.0], "auto_loan_balance": [2_000.0, 1_000.0], "auto_loan_interest": [200.0, 100.0], } ) + return pd.DataFrame(values) import policyengine_us_data.datasets.scf.scf as scf_module import microimpute.models.qrf as qrf_module diff --git a/tests/unit/test_asset_imputation.py b/tests/unit/test_asset_imputation.py index f0fab9cd5..ce9af2e36 100644 --- a/tests/unit/test_asset_imputation.py +++ b/tests/unit/test_asset_imputation.py @@ -3,8 +3,6 @@ from policyengine_us_data.utils.asset_imputation import ( NET_WORTH_COMPONENTS_ARE_COMPLETE, - NET_WORTH_RESIDUAL_VARIABLE, - UNOBSERVED_NET_WORTH_COMPONENT_GROUPS, add_scf_financial_asset_targets, add_scf_net_worth_component_targets, aggregate_person_values_to_reference_households, @@ -12,7 +10,7 @@ build_household_vehicle_receiver, check_household_net_worth_reconciliation, combine_sipp_and_scf_financial_assets, - compute_net_worth_residual, + compute_net_worth_from_components, financial_asset_source_is_scf, ) @@ -50,7 +48,7 @@ def test_build_household_vehicle_receiver_aggregates_person_inputs(): assert receiver["is_homeowner"].tolist() == [1.0, 0.0] -def test_current_net_worth_components_are_marked_incomplete(): +def test_net_worth_reconciliation_can_be_explicitly_skipped(): data = { "net_worth": np.array([500_000.0]), "bank_account_assets": np.array([10_000.0]), @@ -60,14 +58,16 @@ def test_current_net_worth_components_are_marked_incomplete(): "auto_loan_balance": np.array([2_000.0]), } - report = check_household_net_worth_reconciliation(data) + report = check_household_net_worth_reconciliation( + data, + components_are_complete=False, + ) - assert NET_WORTH_COMPONENTS_ARE_COMPLETE is False + assert NET_WORTH_COMPONENTS_ARE_COMPLETE is True assert report.components_are_complete is False assert report.is_reconciled is None assert report.max_abs_difference is None - assert "net_worth_residual" in UNOBSERVED_NET_WORTH_COMPONENT_GROUPS[0] - assert "independently imputed SCF aggregate" in report.message + assert "marked incomplete" in report.message def test_net_worth_reconciliation_checks_complete_household_components(): @@ -157,6 +157,7 @@ def test_add_scf_net_worth_component_targets_builds_formula_columns(): "resdbt": [7.0], "othloc": [8.0], "ccbal": [9.0], + "veh_inst": [9.5], "edn_inst": [10.0], "oth_inst": [11.0], "odebt": [13.0], @@ -167,10 +168,12 @@ def test_add_scf_net_worth_component_targets_builds_formula_columns(): assert "scf_savings_bonds" in targets assert "scf_retirement_assets" in targets + assert "scf_vehicle_installment_debt" in targets assert "scf_mortgage_debt" in targets assert "scf_buy_now_pay_later_debt" not in targets assert scf["scf_savings_bonds"].tolist() == [1.5] assert scf["scf_retirement_assets"].tolist() == [2.0] + assert scf["scf_vehicle_installment_debt"].tolist() == [9.5] assert scf["scf_mortgage_debt"].tolist() == [50.0] @@ -231,9 +234,8 @@ def test_aggregate_and_align_household_components(): assert aligned.tolist() == [200.0, 100.0] -def test_compute_net_worth_residual_makes_formula_exact(): - residual = compute_net_worth_residual( - net_worth=np.array([1_000.0]), +def test_compute_net_worth_from_components_applies_signs(): + net_worth = compute_net_worth_from_components( components={ "bank_account_assets": np.array([100.0]), "scf_retirement_assets": np.array([300.0]), @@ -242,5 +244,4 @@ def test_compute_net_worth_residual_makes_formula_exact(): }, ) - assert NET_WORTH_RESIDUAL_VARIABLE == "net_worth_residual" - assert residual.tolist() == [850.0] + assert net_worth.tolist() == [150.0]