From 1a49c7e36a8593de431ecccefacb7d5bb63fa26d Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Fri, 6 Mar 2026 07:52:53 -0500 Subject: [PATCH] Switch from black to ruff format Replace black with ruff as the code formatter across pyproject.toml, Makefile, and CI workflows. Reformat all files with ruff defaults. Co-Authored-By: Claude Opus 4.6 --- .github/bump_version.py | 4 +- .github/workflows/reusable_lint.yaml | 6 +- Makefile | 2 +- changelog.d/switch-to-ruff.changed.md | 1 + docs/calibration_matrix.ipynb | 101 +++++-- docs/hierarchical_uprating.ipynb | 51 +--- docs/local_area_calibration_setup.ipynb | 56 +--- modal_app/data_build.py | 21 +- modal_app/local_area.py | 36 +-- modal_app/remote_calibration_runner.py | 4 +- paper/scripts/build_from_content.py | 36 +-- .../calculate_distributional_metrics.py | 2 +- paper/scripts/calculate_target_performance.py | 3 +- paper/scripts/generate_all_tables.py | 8 +- paper/scripts/generate_validation_metrics.py | 4 +- paper/scripts/markdown_to_latex.py | 16 +- .../calibration/clone_and_assign.py | 3 +- .../calibration/puf_impute.py | 54 +--- .../calibration/source_impute.py | 47 ++-- .../calibration/unified_calibration.py | 34 +-- .../calibration/unified_matrix_builder.py | 64 ++--- policyengine_us_data/datasets/acs/acs.py | 12 +- .../datasets/acs/census_acs.py | 22 +- .../datasets/cps/census_cps.py | 32 +-- policyengine_us_data/datasets/cps/cps.py | 247 +++++++----------- .../datasets/cps/enhanced_cps.py | 32 +-- .../block_assignment.py | 27 +- .../calibration_utils.py | 29 +- .../county_assignment.py | 4 +- .../create_stratified_cps.py | 23 +- .../publish_local_area.py | 54 ++-- .../stacked_dataset_builder.py | 121 +++------ .../check_calibrated_estimates_interactive.py | 68 +++-- .../cps/long_term/extract_ssa_costs.py | 4 +- .../cps/long_term/projection_utils.py | 16 +- .../cps/long_term/run_household_projection.py | 122 ++++----- .../datasets/cps/small_enhanced_cps.py | 28 +- policyengine_us_data/datasets/puf/irs_puf.py | 4 +- policyengine_us_data/datasets/puf/puf.py | 43 ++- policyengine_us_data/datasets/scf/fed_scf.py | 16 +- policyengine_us_data/datasets/scf/scf.py | 36 +-- policyengine_us_data/datasets/sipp/sipp.py | 3 +- .../db/create_database_tables.py | 39 +-- .../db/create_initial_strata.py | 16 +- policyengine_us_data/db/etl_age.py | 8 +- policyengine_us_data/db/etl_irs_soi.py | 79 ++---- policyengine_us_data/db/etl_medicaid.py | 12 +- .../db/etl_national_targets.py | 52 +--- policyengine_us_data/db/etl_pregnancy.py | 24 +- policyengine_us_data/db/etl_snap.py | 8 +- .../db/etl_state_income_tax.py | 10 +- policyengine_us_data/db/validate_database.py | 4 +- policyengine_us_data/db/validate_hierarchy.py | 58 ++-- policyengine_us_data/geography/__init__.py | 4 +- policyengine_us_data/geography/county_fips.py | 8 +- .../geography/create_zip_code_dataset.py | 4 +- policyengine_us_data/parameters/__init__.py | 4 +- .../calibration_targets/audit_county_enum.py | 4 +- .../make_block_cd_distributions.py | 8 +- .../make_block_crosswalk.py | 16 +- .../make_county_cd_distributions.py | 16 +- .../make_district_mapping.py | 8 +- .../pull_hardcoded_targets.py | 8 +- .../calibration_targets/pull_snap_targets.py | 8 +- .../calibration_targets/pull_soi_targets.py | 87 ++---- .../storage/upload_completed_datasets.py | 7 +- .../test_build_matrix_masking.py | 20 +- .../test_calibration/test_clone_and_assign.py | 13 +- .../tests/test_calibration/test_puf_impute.py | 8 +- .../test_retirement_imputation.py | 127 +++------ .../test_calibration/test_source_impute.py | 4 +- .../test_unified_matrix_builder.py | 23 +- .../tests/test_constraint_validation.py | 12 +- policyengine_us_data/tests/test_database.py | 2 +- .../tests/test_database_build.py | 21 +- .../tests/test_datasets/test_county_fips.py | 9 +- .../tests/test_datasets/test_cps.py | 17 +- .../test_datasets/test_dataset_sanity.py | 46 ++-- .../tests/test_datasets/test_enhanced_cps.py | 64 ++--- .../tests/test_datasets/test_sipp_assets.py | 24 +- .../test_datasets/test_small_enhanced_cps.py | 10 +- .../test_datasets/test_sparse_enhanced_cps.py | 40 ++- .../create_test_fixture.py | 32 +-- .../test_county_assignment.py | 8 +- .../test_stacked_dataset_builder.py | 52 ++-- policyengine_us_data/tests/test_puf_impute.py | 4 +- .../tests/test_schema_views_and_lookups.py | 16 +- .../tests/test_stochastic_variables.py | 4 - policyengine_us_data/utils/census.py | 4 +- .../utils/constraint_validation.py | 22 +- policyengine_us_data/utils/data_upload.py | 16 +- policyengine_us_data/utils/db.py | 25 +- policyengine_us_data/utils/huggingface.py | 4 +- policyengine_us_data/utils/l0.py | 4 +- policyengine_us_data/utils/loss.py | 155 ++++------- policyengine_us_data/utils/randomness.py | 4 +- policyengine_us_data/utils/soi.py | 27 +- policyengine_us_data/utils/spm.py | 4 +- policyengine_us_data/utils/uprating.py | 4 +- pyproject.toml | 20 +- scripts/generate_test_data.py | 42 +-- scripts/migrate_versioned_to_production.py | 4 +- tests/test_h6_reform.py | 18 +- tests/test_no_formula_variables_stored.py | 26 +- tests/test_reproducibility.py | 6 +- tests/test_weeks_unemployed.py | 6 +- uv.lock | 85 ++---- validation/benefit_validation.py | 20 +- validation/generate_qrf_statistics.py | 38 +-- validation/qrf_diagnostics.py | 42 +-- validation/run_qrf_diagnostics.py | 6 +- validation/tax_policy_validation.py | 8 +- validation/validate_retirement_imputation.py | 24 +- 113 files changed, 1024 insertions(+), 2134 deletions(-) create mode 100644 changelog.d/switch-to-ruff.changed.md diff --git a/.github/bump_version.py b/.github/bump_version.py index bb0fd6dd3..779a82e38 100644 --- a/.github/bump_version.py +++ b/.github/bump_version.py @@ -19,9 +19,7 @@ def get_current_version(pyproject_path: Path) -> str: def infer_bump(changelog_dir: Path) -> str: fragments = [ - f - for f in changelog_dir.iterdir() - if f.is_file() and f.name != ".gitkeep" + f for f in changelog_dir.iterdir() if f.is_file() and f.name != ".gitkeep" ] if not fragments: print("No changelog fragments found", file=sys.stderr) diff --git a/.github/workflows/reusable_lint.yaml b/.github/workflows/reusable_lint.yaml index f5fa02cf7..862e90a8a 100644 --- a/.github/workflows/reusable_lint.yaml +++ b/.github/workflows/reusable_lint.yaml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + - name: Install ruff + run: pip install ruff>=0.9.0 - name: Check formatting - uses: "lgeiger/black-action@master" - with: - args: ". -l 79 --check" \ No newline at end of file + run: ruff format --check . diff --git a/Makefile b/Makefile index b34b8eb60..ce38e165e 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ HF_CLONE_DIR ?= $(HOME)/huggingface/policyengine-us-data all: data test format: - black . -l 79 + ruff format . test: pytest diff --git a/changelog.d/switch-to-ruff.changed.md b/changelog.d/switch-to-ruff.changed.md new file mode 100644 index 000000000..a514e08ff --- /dev/null +++ b/changelog.d/switch-to-ruff.changed.md @@ -0,0 +1 @@ +Switch from black to ruff format. diff --git a/docs/calibration_matrix.ipynb b/docs/calibration_matrix.ipynb index 41497b1e8..44b5246b0 100644 --- a/docs/calibration_matrix.ipynb +++ b/docs/calibration_matrix.ipynb @@ -27,7 +27,28 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "import numpy as np\nimport pandas as pd\nfrom policyengine_us import Microsimulation\nfrom policyengine_us_data.storage import STORAGE_FOLDER\nfrom policyengine_us_data.calibration.unified_matrix_builder import (\n UnifiedMatrixBuilder,\n)\nfrom policyengine_us_data.calibration.clone_and_assign import (\n assign_random_geography,\n)\nfrom policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (\n create_target_groups,\n drop_target_groups,\n get_geo_level,\n STATE_CODES,\n)\n\ndb_path = STORAGE_FOLDER / \"calibration\" / \"policy_data.db\"\ndb_uri = f\"sqlite:///{db_path}\"\ndataset_path = STORAGE_FOLDER / \"stratified_extended_cps_2024.h5\"" + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from policyengine_us import Microsimulation\n", + "from policyengine_us_data.storage import STORAGE_FOLDER\n", + "from policyengine_us_data.calibration.unified_matrix_builder import (\n", + " UnifiedMatrixBuilder,\n", + ")\n", + "from policyengine_us_data.calibration.clone_and_assign import (\n", + " assign_random_geography,\n", + ")\n", + "from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (\n", + " create_target_groups,\n", + " drop_target_groups,\n", + " get_geo_level,\n", + " STATE_CODES,\n", + ")\n", + "\n", + "db_path = STORAGE_FOLDER / \"calibration\" / \"policy_data.db\"\n", + "db_uri = f\"sqlite:///{db_path}\"\n", + "dataset_path = STORAGE_FOLDER / \"stratified_extended_cps_2024.h5\"" + ] }, { "cell_type": "code", @@ -82,7 +103,19 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "print(f\"Targets: {X_sparse.shape[0]}\")\nprint(f\"Columns: {X_sparse.shape[1]:,} ({N_CLONES} clones x {n_records:,} records)\")\nprint(f\"Non-zeros: {X_sparse.nnz:,}\")\nprint(f\"Density: {X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]):.6f}\")\n\ngeo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\nlevel_names = {0: \"National\", 1: \"State\", 2: \"District\"}\nfor level in [0, 1, 2]:\n n = (geo_levels == level).sum()\n if n > 0:\n print(f\" {level_names[level]}: {n} targets\")" + "source": [ + "print(f\"Targets: {X_sparse.shape[0]}\")\n", + "print(f\"Columns: {X_sparse.shape[1]:,} ({N_CLONES} clones x {n_records:,} records)\")\n", + "print(f\"Non-zeros: {X_sparse.nnz:,}\")\n", + "print(f\"Density: {X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]):.6f}\")\n", + "\n", + "geo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\n", + "level_names = {0: \"National\", 1: \"State\", 2: \"District\"}\n", + "for level in [0, 1, 2]:\n", + " n = (geo_levels == level).sum()\n", + " if n > 0:\n", + " print(f\" {level_names[level]}: {n} targets\")" + ] }, { "cell_type": "markdown", @@ -294,14 +327,16 @@ "for gid, info in enumerate(group_info):\n", " mask = target_groups == gid\n", " vals = targets_df.loc[mask, \"value\"]\n", - " records.append({\n", - " \"group_id\": gid,\n", - " \"description\": info,\n", - " \"n_targets\": mask.sum(),\n", - " \"min_value\": vals.min(),\n", - " \"median_value\": vals.median(),\n", - " \"max_value\": vals.max(),\n", - " })\n", + " records.append(\n", + " {\n", + " \"group_id\": gid,\n", + " \"description\": info,\n", + " \"n_targets\": mask.sum(),\n", + " \"min_value\": vals.min(),\n", + " \"median_value\": vals.median(),\n", + " \"max_value\": vals.max(),\n", + " }\n", + " )\n", "\n", "group_df = pd.DataFrame(records)\n", "print(group_df.to_string(index=False))" @@ -431,8 +466,7 @@ " for r in nz_rows[:5]:\n", " row = targets_df.iloc[r]\n", " print(\n", - " f\" {row['variable']} (geo={row['geographic_id']}): \"\n", - " f\"{X_sparse[r, col]:.2f}\"\n", + " f\" {row['variable']} (geo={row['geographic_id']}): {X_sparse[r, col]:.2f}\"\n", " )\n", " if len(nz_rows) > 5:\n", " print(f\" ... and {len(nz_rows) - 5} more\")" @@ -475,7 +509,28 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "nnz_per_row = np.diff(X_sparse.indptr)\nprint(f\"Non-zeros per row:\")\nprint(f\" min: {nnz_per_row.min():,}\")\nprint(f\" median: {int(np.median(nnz_per_row)):,}\")\nprint(f\" mean: {nnz_per_row.mean():,.0f}\")\nprint(f\" max: {nnz_per_row.max():,}\")\n\ngeo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\nlevel_names = {0: \"National\", 1: \"State\", 2: \"District\"}\nprint(\"\\nBy geographic level:\")\nfor level in [0, 1, 2]:\n mask = (geo_levels == level).values\n if mask.any():\n vals = nnz_per_row[mask]\n print(\n f\" {level_names[level]:10s}: \"\n f\"n={mask.sum():>4d}, \"\n f\"median nnz={int(np.median(vals)):>7,}, \"\n f\"range=[{vals.min():,}, {vals.max():,}]\"\n )" + "source": [ + "nnz_per_row = np.diff(X_sparse.indptr)\n", + "print(f\"Non-zeros per row:\")\n", + "print(f\" min: {nnz_per_row.min():,}\")\n", + "print(f\" median: {int(np.median(nnz_per_row)):,}\")\n", + "print(f\" mean: {nnz_per_row.mean():,.0f}\")\n", + "print(f\" max: {nnz_per_row.max():,}\")\n", + "\n", + "geo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\n", + "level_names = {0: \"National\", 1: \"State\", 2: \"District\"}\n", + "print(\"\\nBy geographic level:\")\n", + "for level in [0, 1, 2]:\n", + " mask = (geo_levels == level).values\n", + " if mask.any():\n", + " vals = nnz_per_row[mask]\n", + " print(\n", + " f\" {level_names[level]:10s}: \"\n", + " f\"n={mask.sum():>4d}, \"\n", + " f\"median nnz={int(np.median(vals)):>7,}, \"\n", + " f\"range=[{vals.min():,}, {vals.max():,}]\"\n", + " )" + ] }, { "cell_type": "code", @@ -498,12 +553,16 @@ "clone_nnz = []\n", "for ci in range(N_CLONES):\n", " block = X_sparse[:, ci * n_records : (ci + 1) * n_records]\n", - " n_states = len(np.unique(geography.state_fips[ci * n_records : (ci + 1) * n_records]))\n", - " clone_nnz.append({\n", - " \"clone\": ci,\n", - " \"nnz\": block.nnz,\n", - " \"unique_states\": n_states,\n", - " })\n", + " n_states = len(\n", + " np.unique(geography.state_fips[ci * n_records : (ci + 1) * n_records])\n", + " )\n", + " clone_nnz.append(\n", + " {\n", + " \"clone\": ci,\n", + " \"nnz\": block.nnz,\n", + " \"unique_states\": n_states,\n", + " }\n", + " )\n", "\n", "clone_df = pd.DataFrame(clone_nnz)\n", "print(\"Non-zeros per clone block:\")\n", @@ -666,7 +725,9 @@ } ], "source": [ - "ratios = row_sums[achievable_mask] / targets_filtered.loc[achievable_mask, \"value\"].values\n", + "ratios = (\n", + " row_sums[achievable_mask] / targets_filtered.loc[achievable_mask, \"value\"].values\n", + ")\n", "ratio_df = targets_filtered[achievable_mask].copy()\n", "ratio_df[\"row_sum\"] = row_sums[achievable_mask]\n", "ratio_df[\"ratio\"] = ratios\n", diff --git a/docs/hierarchical_uprating.ipynb b/docs/hierarchical_uprating.ipynb index 4da30d82c..be0b43a84 100644 --- a/docs/hierarchical_uprating.ipynb +++ b/docs/hierarchical_uprating.ipynb @@ -264,8 +264,7 @@ ], "source": [ "snap_hh = raw[\n", - " (raw[\"domain_variable\"] == \"snap\")\n", - " & (raw[\"variable\"] == \"household_count\")\n", + " (raw[\"domain_variable\"] == \"snap\") & (raw[\"variable\"] == \"household_count\")\n", "]\n", "for level in [\"state\", \"district\"]:\n", " total = snap_hh[snap_hh[\"geo_level\"] == level][\"value\"].sum()\n", @@ -333,9 +332,9 @@ "source": [ "raw[\"original_value\"] = raw[\"value\"].copy()\n", "raw[\"uprating_factor\"] = raw.apply(\n", - " lambda r: builder._get_uprating_info(\n", - " r[\"variable\"], r[\"period\"], uprating_factors\n", - " )[0],\n", + " lambda r: builder._get_uprating_info(r[\"variable\"], r[\"period\"], uprating_factors)[\n", + " 0\n", + " ],\n", " axis=1,\n", ")\n", "raw[\"value\"] = raw[\"original_value\"] * raw[\"uprating_factor\"]" @@ -376,10 +375,7 @@ "sample_states = {6: \"CA\", 48: \"TX\", 36: \"NY\"}\n", "\n", "for fips, abbr in sample_states.items():\n", - " rows = raw[\n", - " (raw[\"geo_level\"] == \"state\")\n", - " & (raw[\"geographic_id\"] == str(fips))\n", - " ]\n", + " rows = raw[(raw[\"geo_level\"] == \"state\") & (raw[\"geographic_id\"] == str(fips))]\n", " for _, r in rows.iterrows():\n", " print(\n", " f\" {abbr} [{r['domain_variable']:8s}] \"\n", @@ -412,9 +408,7 @@ "metadata": {}, "outputs": [], "source": [ - "result = builder._apply_hierarchical_uprating(\n", - " raw, DOMAINS, uprating_factors\n", - ")" + "result = builder._apply_hierarchical_uprating(raw, DOMAINS, uprating_factors)" ] }, { @@ -454,11 +448,7 @@ " for fips, abbr in sample_states.items():\n", " cd_state = cd_domain[\n", " cd_domain[\"geographic_id\"].apply(\n", - " lambda g, s=fips: (\n", - " int(g) // 100 == s\n", - " if g not in (\"US\",)\n", - " else False\n", - " )\n", + " lambda g, s=fips: int(g) // 100 == s if g not in (\"US\",) else False\n", " )\n", " ]\n", " if cd_state.empty:\n", @@ -474,11 +464,7 @@ " & (raw[\"variable\"] == var)\n", " & (raw[\"domain_variable\"] == domain)\n", " ]\n", - " uprated_state = (\n", - " st_row[\"value\"].iloc[0]\n", - " if len(st_row)\n", - " else np.nan\n", - " )\n", + " uprated_state = st_row[\"value\"].iloc[0] if len(st_row) else np.nan\n", " print(\n", " f\" {abbr} {var:20s} \"\n", " f\"hif={hif:.6f} \"\n", @@ -487,6 +473,7 @@ " f\"uprated_state={uprated_state:>14,.0f}\"\n", " )\n", "\n", + "\n", "show_reconciliation(result, raw, \"aca_ptc\", sample_states)" ] }, @@ -527,9 +514,7 @@ "]\n", "\n", "state_ufs = (\n", - " aca_cds.assign(state_fips=aca_cds[\"geographic_id\"].apply(\n", - " lambda g: int(g) // 100\n", - " ))\n", + " aca_cds.assign(state_fips=aca_cds[\"geographic_id\"].apply(lambda g: int(g) // 100))\n", " .groupby(\"state_fips\")[\"state_uprating_factor\"]\n", " .first()\n", " .sort_values()\n", @@ -537,7 +522,7 @@ "\n", "print(\"ACA PTC uprating factors (aca_ptc = vol_mult * val_mult):\")\n", "print(f\" {'State FIPS':>12s} {'Factor':>8s}\")\n", - "print(f\" {'─'*12} {'─'*8}\")\n", + "print(f\" {'─' * 12} {'─' * 8}\")\n", "for fips in list(state_ufs.index[:5]) + [\"...\"] + list(state_ufs.index[-5:]):\n", " if fips == \"...\":\n", " print(f\" {'...':>12s}\")\n", @@ -676,9 +661,7 @@ ], "source": [ "level_counts = (\n", - " result.groupby([\"domain_variable\", \"geo_level\"])\n", - " .size()\n", - " .reset_index(name=\"count\")\n", + " result.groupby([\"domain_variable\", \"geo_level\"]).size().reset_index(name=\"count\")\n", ")\n", "level_counts" ] @@ -749,20 +732,14 @@ "checks = 0\n", "for domain in DOMAINS:\n", " domain_result = result[result[\"domain_variable\"] == domain]\n", - " cd_result = domain_result[\n", - " domain_result[\"geo_level\"] == \"district\"\n", - " ]\n", + " cd_result = domain_result[domain_result[\"geo_level\"] == \"district\"]\n", " if cd_result.empty:\n", " continue\n", "\n", " for fips, abbr in sorted(STATE_CODES.items()):\n", " cd_rows = cd_result[\n", " cd_result[\"geographic_id\"].apply(\n", - " lambda g, s=fips: (\n", - " int(g) // 100 == s\n", - " if g not in (\"US\",)\n", - " else False\n", - " )\n", + " lambda g, s=fips: int(g) // 100 == s if g not in (\"US\",) else False\n", " )\n", " ]\n", " if cd_rows.empty:\n", diff --git a/docs/local_area_calibration_setup.ipynb b/docs/local_area_calibration_setup.ipynb index 2e8614aa9..21e38750e 100644 --- a/docs/local_area_calibration_setup.ipynb +++ b/docs/local_area_calibration_setup.ipynb @@ -241,9 +241,7 @@ } ], "source": [ - "print(\n", - " f\"Example household (record_idx={record_idx}) across {N_CLONES} clones:\\n\"\n", - ")\n", + "print(f\"Example household (record_idx={record_idx}) across {N_CLONES} clones:\\n\")\n", "rows = []\n", "for c in range(N_CLONES):\n", " col = c * n_records + record_idx\n", @@ -351,14 +349,8 @@ "new_state = clone_states[record_idx]\n", "\n", "print(f\"Example household (record_idx={record_idx}):\")\n", - "print(\n", - " f\" Original state: {STATE_CODES.get(int(orig_state), '??')} \"\n", - " f\"({int(orig_state)})\"\n", - ")\n", - "print(\n", - " f\" Clone 0 state: {STATE_CODES.get(int(new_state), '??')} \"\n", - " f\"({int(new_state)})\"\n", - ")\n", + "print(f\" Original state: {STATE_CODES.get(int(orig_state), '??')} ({int(orig_state)})\")\n", + "print(f\" Clone 0 state: {STATE_CODES.get(int(new_state), '??')} ({int(new_state)})\")\n", "print(f\" Original SNAP: ${snap_values[record_idx]:,.2f}\")\n", "print(f\" Clone 0 SNAP: ${new_snap[record_idx]:,.2f}\")" ] @@ -451,9 +443,7 @@ " s.set_input(\n", " \"state_fips\",\n", " 2024,\n", - " geography.state_fips[c * n_records : (c + 1) * n_records].astype(\n", - " np.int32\n", - " ),\n", + " geography.state_fips[c * n_records : (c + 1) * n_records].astype(np.int32),\n", " )\n", " for var in get_calculated_variables(s):\n", " s.delete_arrays(var)\n", @@ -576,9 +566,7 @@ " f\"{col in cd_to_cols.get(cd, [])}\"\n", " )\n", " # Check an unrelated state\n", - " print(\n", - " f\" Visible to NC (37) targets: \" f\"{col in state_to_cols.get(37, [])}\"\n", - " )\n", + " print(f\" Visible to NC (37) targets: {col in state_to_cols.get(37, [])}\")\n", " print()" ] }, @@ -634,14 +622,9 @@ " else:\n", " rate = load_take_up_rate(rate_key, 2024)\n", " rate_str = (\n", - " f\"{rate:.2%}\"\n", - " if isinstance(rate, float)\n", - " else f\"dict ({len(rate)} entries)\"\n", + " f\"{rate:.2%}\" if isinstance(rate, float) else f\"dict ({len(rate)} entries)\"\n", " )\n", - " print(\n", - " f\" {spec['variable']:40s} \"\n", - " f\"entity={spec['entity']:10s} rate={rate_str}\"\n", - " )" + " print(f\" {spec['variable']:40s} entity={spec['entity']:10s} rate={rate_str}\")" ] }, { @@ -965,14 +948,9 @@ "os.makedirs(output_dir, exist_ok=True)\n", "output_path = os.path.join(output_dir, \"results.h5\")\n", "\n", - "print(\n", - " f\"Weight vector: {len(w):,} entries \"\n", - " f\"({n_demo_cds} CDs x {n_records:,} HH)\"\n", - ")\n", + "print(f\"Weight vector: {len(w):,} entries ({n_demo_cds} CDs x {n_records:,} HH)\")\n", "print(f\"Non-zero weights: {(w > 0).sum()}\")\n", - "print(\n", - " f\"Example HH weight in CD 3701: {w[cd_idx_3701 * n_records + record_idx]}\"\n", - ")\n", + "print(f\"Example HH weight in CD 3701: {w[cd_idx_3701 * n_records + record_idx]}\")\n", "print(f\"Example HH weight in CD 201: {w[cd_idx_201 * n_records + record_idx]}\")" ] }, @@ -1118,22 +1096,14 @@ ")\n", "print(f\"Stacked dataset: {len(hh_after_df)} households\\n\")\n", "\n", - "mapping_df = pd.read_csv(\n", - " f\"{output_dir}/mappings/results_household_mapping.csv\"\n", - ")\n", - "example_mapping = mapping_df.loc[\n", - " mapping_df.original_household_id == example_hh_id\n", - "]\n", - "print(f\"Example household (original_id={example_hh_id}) \" f\"in mapping:\\n\")\n", + "mapping_df = pd.read_csv(f\"{output_dir}/mappings/results_household_mapping.csv\")\n", + "example_mapping = mapping_df.loc[mapping_df.original_household_id == example_hh_id]\n", + "print(f\"Example household (original_id={example_hh_id}) in mapping:\\n\")\n", "print(example_mapping.to_string(index=False))\n", "\n", "new_ids = example_mapping.new_household_id\n", "print(f\"\\nIn stacked dataset:\\n\")\n", - "print(\n", - " hh_after_df.loc[hh_after_df.household_id.isin(new_ids)].to_string(\n", - " index=False\n", - " )\n", - ")" + "print(hh_after_df.loc[hh_after_df.household_id.isin(new_ids)].to_string(index=False))" ] }, { diff --git a/modal_app/data_build.py b/modal_app/data_build.py index 131e7f0bf..f3c2191bb 100644 --- a/modal_app/data_build.py +++ b/modal_app/data_build.py @@ -20,9 +20,7 @@ ) image = ( - modal.Image.debian_slim(python_version="3.13") - .apt_install("git") - .pip_install("uv") + modal.Image.debian_slim(python_version="3.13").apt_install("git").pip_install("uv") ) REPO_URL = "https://github.com/PolicyEngine/policyengine-us-data.git" @@ -380,9 +378,7 @@ def build_datasets( print("=== Phase 3: Building extended CPS ===") run_script_with_checkpoint( "policyengine_us_data/datasets/cps/extended_cps.py", - SCRIPT_OUTPUTS[ - "policyengine_us_data/datasets/cps/extended_cps.py" - ], + SCRIPT_OUTPUTS["policyengine_us_data/datasets/cps/extended_cps.py"], branch, checkpoint_volume, env=env, @@ -390,18 +386,13 @@ def build_datasets( # GROUP 3: After extended_cps - run in parallel # enhanced_cps and stratified_cps both depend on extended_cps - print( - "=== Phase 4: Building enhanced and stratified CPS (parallel)" - " ===" - ) + print("=== Phase 4: Building enhanced and stratified CPS (parallel) ===") with ThreadPoolExecutor(max_workers=2) as executor: futures = [ executor.submit( run_script_with_checkpoint, "policyengine_us_data/datasets/cps/enhanced_cps.py", - SCRIPT_OUTPUTS[ - "policyengine_us_data/datasets/cps/enhanced_cps.py" - ], + SCRIPT_OUTPUTS["policyengine_us_data/datasets/cps/enhanced_cps.py"], branch, checkpoint_volume, env=env, @@ -426,9 +417,7 @@ def build_datasets( print("=== Phase 5: Building small enhanced CPS ===") run_script_with_checkpoint( "policyengine_us_data/datasets/cps/small_enhanced_cps.py", - SCRIPT_OUTPUTS[ - "policyengine_us_data/datasets/cps/small_enhanced_cps.py" - ], + SCRIPT_OUTPUTS["policyengine_us_data/datasets/cps/small_enhanced_cps.py"], branch, checkpoint_volume, env=env, diff --git a/modal_app/local_area.py b/modal_app/local_area.py index 92e068335..76ba00537 100644 --- a/modal_app/local_area.py +++ b/modal_app/local_area.py @@ -245,9 +245,7 @@ def validate_staging(branch: str, version: str) -> Dict: print(f" States: {manifest['totals']['states']}") print(f" Districts: {manifest['totals']['districts']}") print(f" Cities: {manifest['totals']['cities']}") - print( - f" Total size: {manifest['totals']['total_size_bytes'] / 1e9:.2f} GB" - ) + print(f" Total size: {manifest['totals']['total_size_bytes'] / 1e9:.2f} GB") return manifest @@ -362,8 +360,7 @@ def promote_publish(branch: str = "main", version: str = "") -> str: manifest_path = staging_dir / version / "manifest.json" if not manifest_path.exists(): raise RuntimeError( - f"No manifest found at {manifest_path}. " - f"Run build+stage workflow first." + f"No manifest found at {manifest_path}. Run build+stage workflow first." ) with open(manifest_path) as f: @@ -405,7 +402,9 @@ def promote_publish(branch: str = "main", version: str = "") -> str: if result.returncode != 0: raise RuntimeError(f"Promote failed: {result.stderr}") - return f"Successfully promoted version {version} with {len(manifest['files'])} files" + return ( + f"Successfully promoted version {version} with {len(manifest['files'])} files" + ) @app.function( @@ -436,12 +435,8 @@ def coordinate_publish( calibration_dir.mkdir(parents=True, exist_ok=True) # hf_hub_download preserves directory structure, so files are in calibration/ subdir - weights_path = ( - calibration_dir / "calibration" / "w_district_calibration.npy" - ) - dataset_path = ( - calibration_dir / "calibration" / "stratified_extended_cps.h5" - ) + weights_path = calibration_dir / "calibration" / "w_district_calibration.npy" + dataset_path = calibration_dir / "calibration" / "stratified_extended_cps.h5" db_path = calibration_dir / "calibration" / "policy_data.db" if not all(p.exists() for p in [weights_path, dataset_path, db_path]): @@ -514,15 +509,10 @@ def coordinate_publish( completed = get_completed_from_volume(version_dir) print(f"Found {len(completed)} already-completed items on volume") - work_chunks = partition_work( - states, districts, cities, num_workers, completed - ) + work_chunks = partition_work(states, districts, cities, num_workers, completed) total_remaining = sum(len(c) for c in work_chunks) - print( - f"Remaining work: {total_remaining} items " - f"across {len(work_chunks)} workers" - ) + print(f"Remaining work: {total_remaining} items across {len(work_chunks)} workers") if total_remaining == 0: print("All items already built!") @@ -594,14 +584,10 @@ def coordinate_publish( ) if actual_total < expected_total: - print( - f"WARNING: Expected {expected_total} files, found {actual_total}" - ) + print(f"WARNING: Expected {expected_total} files, found {actual_total}") print("\nStarting upload to staging...") - result = upload_to_staging.remote( - branch=branch, version=version, manifest=manifest - ) + result = upload_to_staging.remote(branch=branch, version=version, manifest=manifest) print(result) print("\n" + "=" * 60) diff --git a/modal_app/remote_calibration_runner.py b/modal_app/remote_calibration_runner.py index 689d245dd..f3afb509d 100644 --- a/modal_app/remote_calibration_runner.py +++ b/modal_app/remote_calibration_runner.py @@ -7,9 +7,7 @@ hf_secret = modal.Secret.from_name("huggingface-token") image = ( - modal.Image.debian_slim(python_version="3.11") - .apt_install("git") - .pip_install("uv") + modal.Image.debian_slim(python_version="3.11").apt_install("git").pip_install("uv") ) REPO_URL = "https://github.com/PolicyEngine/policyengine-us-data.git" diff --git a/paper/scripts/build_from_content.py b/paper/scripts/build_from_content.py index 21068f0db..52f88389d 100644 --- a/paper/scripts/build_from_content.py +++ b/paper/scripts/build_from_content.py @@ -47,12 +47,8 @@ def md_to_latex(self, content, section_type="section"): latex = re.sub(r"^# Abstract\n\n", "", latex) else: # Convert markdown headers to LaTeX sections - latex = re.sub( - r"^# (.+)$", r"\\section{\1}", latex, flags=re.MULTILINE - ) - latex = re.sub( - r"^## (.+)$", r"\\subsection{\1}", latex, flags=re.MULTILINE - ) + latex = re.sub(r"^# (.+)$", r"\\section{\1}", latex, flags=re.MULTILINE) + latex = re.sub(r"^## (.+)$", r"\\subsection{\1}", latex, flags=re.MULTILINE) latex = re.sub( r"^### (.+)$", r"\\subsubsection{\1}", @@ -173,15 +169,11 @@ def convert_citation(match): if len(author_list) == 1: # Handle "Author1 and Author2" format if " and " in authors: - first_author = ( - authors.split(" and ")[0].strip().split()[-1] - ) + first_author = authors.split(" and ")[0].strip().split()[-1] cite_key = f"{first_author.lower()}{year}" else: # Single author - author = ( - author_list[0].strip().split()[-1] - ) # Last name + author = author_list[0].strip().split()[-1] # Last name cite_key = f"{author.lower()}{year}" else: # Multiple authors - use first author @@ -191,9 +183,7 @@ def convert_citation(match): return f"\\citep{{{cite_key}}}" return match.group(0) # Return original if no year found - latex = re.sub( - r"\(([^)]+(?:19|20)\d{2}[a-z]?)\)", convert_citation, latex - ) + latex = re.sub(r"\(([^)]+(?:19|20)\d{2}[a-z]?)\)", convert_citation, latex) # Also handle inline citations like "Author (Year)" or "Author et al. (Year)" def convert_inline_citation(match): @@ -276,15 +266,11 @@ def convert_myst_citation(match): if len(author_list) == 1: # Handle "Author1 and Author2" format if " and " in authors: - first_author = ( - authors.split(" and ")[0].strip().split()[-1] - ) + first_author = authors.split(" and ")[0].strip().split()[-1] cite_key = f"{first_author.lower()}{year}" else: # Single author - author = ( - author_list[0].strip().split()[-1] - ) # Last name + author = author_list[0].strip().split()[-1] # Last name cite_key = f"{author.lower()}{year}" else: # Multiple authors - use first author @@ -294,9 +280,7 @@ def convert_myst_citation(match): return f"{{cite}}`{cite_key}`" return match.group(0) - myst = re.sub( - r"\(([^)]+(?:19|20)\d{2}[a-z]?)\)", convert_myst_citation, myst - ) + myst = re.sub(r"\(([^)]+(?:19|20)\d{2}[a-z]?)\)", convert_myst_citation, myst) # Handle inline citations like "Author (Year)" - convert to {cite:t}`author_year` def convert_inline_myst(match): @@ -343,9 +327,7 @@ def process_content_file(self, content_file): # LaTeX conversion if stem == "abstract": latex_content = self.md_to_latex(content, section_type="abstract") - latex_content = ( - f"\\begin{{abstract}}\n{latex_content}\n\\end{{abstract}}" - ) + latex_content = f"\\begin{{abstract}}\n{latex_content}\n\\end{{abstract}}" latex_path = self.paper_dir / "abstract.tex" elif stem == "introduction": latex_content = self.md_to_latex(content) diff --git a/paper/scripts/calculate_distributional_metrics.py b/paper/scripts/calculate_distributional_metrics.py index 4afdc67d9..61de771b9 100644 --- a/paper/scripts/calculate_distributional_metrics.py +++ b/paper/scripts/calculate_distributional_metrics.py @@ -82,7 +82,7 @@ def calculate_top_shares(values, weights, percentiles=[90, 99]): threshold = weighted_percentile(values, weights, p) mask = values >= threshold top_income = np.sum(values[mask] * weights[mask]) - shares[f"top_{100-p}%"] = top_income / total_income + shares[f"top_{100 - p}%"] = top_income / total_income return shares diff --git a/paper/scripts/calculate_target_performance.py b/paper/scripts/calculate_target_performance.py index 1a50ab3c4..8f5a65f1d 100644 --- a/paper/scripts/calculate_target_performance.py +++ b/paper/scripts/calculate_target_performance.py @@ -79,8 +79,7 @@ def compare_dataset_performance( # Calculate average improvement by target category categories = { - "IRS Income": lambda x: "employment_income" in x - or "capital_gains" in x, + "IRS Income": lambda x: "employment_income" in x or "capital_gains" in x, "Demographics": lambda x: "age_" in x or "population" in x, "Programs": lambda x: "snap" in x or "social_security" in x, "Tax Expenditures": lambda x: "salt" in x or "charitable" in x, diff --git a/paper/scripts/generate_all_tables.py b/paper/scripts/generate_all_tables.py index 8f4762036..690b528d4 100644 --- a/paper/scripts/generate_all_tables.py +++ b/paper/scripts/generate_all_tables.py @@ -33,9 +33,7 @@ def create_latex_table(df, caption, label, float_format=None): # Format the dataframe as LaTeX if float_format: - table_body = df.to_latex( - index=False, escape=False, float_format=float_format - ) + table_body = df.to_latex(index=False, escape=False, float_format=float_format) else: table_body = df.to_latex(index=False, escape=False) @@ -44,9 +42,7 @@ def create_latex_table(df, caption, label, float_format=None): tabular_start = next( i for i, line in enumerate(lines) if "\\begin{tabular}" in line ) - tabular_end = next( - i for i, line in enumerate(lines) if "\\end{tabular}" in line - ) + tabular_end = next(i for i, line in enumerate(lines) if "\\end{tabular}" in line) # Indent the tabular content for i in range(tabular_start, tabular_end + 1): diff --git a/paper/scripts/generate_validation_metrics.py b/paper/scripts/generate_validation_metrics.py index db586959d..90b3624d8 100644 --- a/paper/scripts/generate_validation_metrics.py +++ b/paper/scripts/generate_validation_metrics.py @@ -235,9 +235,7 @@ def main(): print(f"\nResults saved to {results_dir}/") print("\nNOTE: All metrics marked as [TO BE CALCULATED] require full") - print( - "dataset generation and microsimulation runs to compute actual values." - ) + print("dataset generation and microsimulation runs to compute actual values.") if __name__ == "__main__": diff --git a/paper/scripts/markdown_to_latex.py b/paper/scripts/markdown_to_latex.py index 5c3b0e3bb..7cc80b049 100644 --- a/paper/scripts/markdown_to_latex.py +++ b/paper/scripts/markdown_to_latex.py @@ -24,12 +24,8 @@ def convert_markdown_to_latex(markdown_content: str) -> str: # Convert headers latex = re.sub(r"^# (.+)$", r"\\section{\1}", latex, flags=re.MULTILINE) - latex = re.sub( - r"^## (.+)$", r"\\subsection{\1}", latex, flags=re.MULTILINE - ) - latex = re.sub( - r"^### (.+)$", r"\\subsubsection{\1}", latex, flags=re.MULTILINE - ) + latex = re.sub(r"^## (.+)$", r"\\subsection{\1}", latex, flags=re.MULTILINE) + latex = re.sub(r"^### (.+)$", r"\\subsubsection{\1}", latex, flags=re.MULTILINE) # Convert bold and italic latex = re.sub(r"\*\*(.+?)\*\*", r"\\textbf{\1}", latex) @@ -67,9 +63,7 @@ def convert_markdown_to_latex(markdown_content: str) -> str: # Manage list stack while len(list_stack) > indent_level + 1: - new_lines.append( - " " * (len(list_stack) - 1) + "\\end{itemize}" - ) + new_lines.append(" " * (len(list_stack) - 1) + "\\end{itemize}") list_stack.pop() if len(list_stack) <= indent_level: @@ -81,9 +75,7 @@ def convert_markdown_to_latex(markdown_content: str) -> str: else: # Close any open lists while list_stack: - new_lines.append( - " " * (len(list_stack) - 1) + "\\end{itemize}" - ) + new_lines.append(" " * (len(list_stack) - 1) + "\\end{itemize}") list_stack.pop() new_lines.append(line) in_list = False diff --git a/policyengine_us_data/calibration/clone_and_assign.py b/policyengine_us_data/calibration/clone_and_assign.py index 9aa64cbbc..3e9642a19 100644 --- a/policyengine_us_data/calibration/clone_and_assign.py +++ b/policyengine_us_data/calibration/clone_and_assign.py @@ -45,8 +45,7 @@ def load_global_block_distribution(): csv_path = STORAGE_FOLDER / "block_cd_distributions.csv.gz" if not csv_path.exists(): raise FileNotFoundError( - f"{csv_path} not found. " - "Run make_block_cd_distributions.py to generate." + f"{csv_path} not found. Run make_block_cd_distributions.py to generate." ) df = pd.read_csv(csv_path, dtype={"block_geoid": str}) diff --git a/policyengine_us_data/calibration/puf_impute.py b/policyengine_us_data/calibration/puf_impute.py index 4e2224895..bf835583c 100644 --- a/policyengine_us_data/calibration/puf_impute.py +++ b/policyengine_us_data/calibration/puf_impute.py @@ -194,9 +194,7 @@ "social_security", ] -RETIREMENT_PREDICTORS = ( - RETIREMENT_DEMOGRAPHIC_PREDICTORS + RETIREMENT_INCOME_PREDICTORS -) +RETIREMENT_PREDICTORS = RETIREMENT_DEMOGRAPHIC_PREDICTORS + RETIREMENT_INCOME_PREDICTORS def _get_retirement_limits(year: int) -> dict: @@ -411,9 +409,7 @@ def reconcile_ss_subcomponents( if puf_has_ss.any(): shares = _qrf_ss_shares(data, n_cps, time_period, puf_has_ss) if shares is None: - shares = _age_heuristic_ss_shares( - data, n_cps, time_period, puf_has_ss - ) + shares = _age_heuristic_ss_shares(data, n_cps, time_period, puf_has_ss) for sub in SS_SUBCOMPONENTS: if sub not in data: @@ -492,17 +488,13 @@ def _map_to_entity(pred_values, variable_name): return pred_values entity = var_meta.entity.key if entity != "person": - return cps_sim.populations[entity].value_from_first_person( - pred_values - ) + return cps_sim.populations[entity].value_from_first_person(pred_values) return pred_values # Impute weeks_unemployed for PUF half puf_weeks = None if y_full is not None and dataset_path is not None: - puf_weeks = _impute_weeks_unemployed( - data, y_full, time_period, dataset_path - ) + puf_weeks = _impute_weeks_unemployed(data, y_full, time_period, dataset_path) # Impute retirement contributions for PUF half puf_retirement = None @@ -526,24 +518,14 @@ def _map_to_entity(pred_values, variable_name): time_period: np.concatenate([values, values + values.max()]) } elif "_weight" in variable: - new_data[variable] = { - time_period: np.concatenate([values, values * 0]) - } + new_data[variable] = {time_period: np.concatenate([values, values * 0])} elif variable == "weeks_unemployed" and puf_weeks is not None: - new_data[variable] = { - time_period: np.concatenate([values, puf_weeks]) - } - elif ( - variable in CPS_RETIREMENT_VARIABLES and puf_retirement is not None - ): + new_data[variable] = {time_period: np.concatenate([values, puf_weeks])} + elif variable in CPS_RETIREMENT_VARIABLES and puf_retirement is not None: puf_vals = puf_retirement[variable] - new_data[variable] = { - time_period: np.concatenate([values, puf_vals]) - } + new_data[variable] = {time_period: np.concatenate([values, puf_vals])} else: - new_data[variable] = { - time_period: np.concatenate([values, values]) - } + new_data[variable] = {time_period: np.concatenate([values, values])} new_data["state_fips"] = { time_period: np.concatenate([state_fips, state_fips]).astype(np.int32) @@ -662,11 +644,7 @@ def _impute_weeks_unemployed( logger.info( "Imputed weeks_unemployed for PUF: %d with weeks > 0, mean = %.1f", (imputed_weeks > 0).sum(), - ( - imputed_weeks[imputed_weeks > 0].mean() - if (imputed_weeks > 0).any() - else 0 - ), + (imputed_weeks[imputed_weeks > 0].mean() if (imputed_weeks > 0).any() else 0), ) del fitted, predictions @@ -836,9 +814,7 @@ def _run_qrf_imputation( puf_sim = Microsimulation(dataset=puf_dataset) - puf_agi = puf_sim.calculate( - "adjusted_gross_income", map_to="person" - ).values + puf_agi = puf_sim.calculate("adjusted_gross_income", map_to="person").values X_train_full = puf_sim.calculate_dataframe( DEMOGRAPHIC_PREDICTORS + IMPUTED_VARIABLES @@ -873,9 +849,7 @@ def _run_qrf_imputation( X_test[pred] = data[pred][time_period].astype(np.float32) logger.info("Imputing %d PUF variables (full)", len(IMPUTED_VARIABLES)) - y_full = _batch_qrf( - X_train_full, X_test, DEMOGRAPHIC_PREDICTORS, IMPUTED_VARIABLES - ) + y_full = _batch_qrf(X_train_full, X_test, DEMOGRAPHIC_PREDICTORS, IMPUTED_VARIABLES) logger.info( "Imputing %d PUF variables (override)", @@ -915,9 +889,7 @@ def _stratified_subsample_index( if remaining_quota >= len(bottom_idx): selected_bottom = bottom_idx else: - selected_bottom = rng.choice( - bottom_idx, size=remaining_quota, replace=False - ) + selected_bottom = rng.choice(bottom_idx, size=remaining_quota, replace=False) selected = np.concatenate([top_idx, selected_bottom]) selected.sort() diff --git a/policyengine_us_data/calibration/source_impute.py b/policyengine_us_data/calibration/source_impute.py index 339e038ed..25c7975ad 100644 --- a/policyengine_us_data/calibration/source_impute.py +++ b/policyengine_us_data/calibration/source_impute.py @@ -225,9 +225,7 @@ def _person_state_fips( if hh_ids_person is not None: hh_ids = data["household_id"][time_period] hh_to_idx = {int(hh_id): i for i, hh_id in enumerate(hh_ids)} - return np.array( - [state_fips[hh_to_idx[int(hh_id)]] for hh_id in hh_ids_person] - ) + return np.array([state_fips[hh_to_idx[int(hh_id)]] for hh_id in hh_ids_person]) # Fallback: distribute persons across households as evenly # as possible (first households get any remainder). n_hh = len(data["household_id"][time_period]) @@ -264,9 +262,9 @@ def _impute_acs( predictors = ACS_PREDICTORS + ["state_fips"] acs_df = acs.calculate_dataframe(ACS_PREDICTORS + ACS_IMPUTED_VARIABLES) - acs_df["state_fips"] = acs.calculate( - "state_fips", map_to="person" - ).values.astype(np.float32) + acs_df["state_fips"] = acs.calculate("state_fips", map_to="person").values.astype( + np.float32 + ) train_df = acs_df[acs_df.is_household_head].sample(10_000, random_state=42) train_df = _encode_tenure_type(train_df) @@ -368,16 +366,10 @@ def _impute_sipp( sipp_df["is_under_18"] = sipp_df.TAGE < 18 sipp_df["is_under_6"] = sipp_df.TAGE < 6 sipp_df["count_under_18"] = ( - sipp_df.groupby("SSUID")["is_under_18"] - .sum() - .loc[sipp_df.SSUID.values] - .values + sipp_df.groupby("SSUID")["is_under_18"].sum().loc[sipp_df.SSUID.values].values ) sipp_df["count_under_6"] = ( - sipp_df.groupby("SSUID")["is_under_6"] - .sum() - .loc[sipp_df.SSUID.values] - .values + sipp_df.groupby("SSUID")["is_under_6"].sum().loc[sipp_df.SSUID.values].values ) tip_cols = [ @@ -408,9 +400,9 @@ def _impute_sipp( age_df = pd.DataFrame({"hh": hh_ids_person, "age": person_ages}) under_18 = age_df.groupby("hh")["age"].apply(lambda x: (x < 18).sum()) under_6 = age_df.groupby("hh")["age"].apply(lambda x: (x < 6).sum()) - cps_tip_df["count_under_18"] = under_18.loc[ - hh_ids_person - ].values.astype(np.float32) + cps_tip_df["count_under_18"] = under_18.loc[hh_ids_person].values.astype( + np.float32 + ) cps_tip_df["count_under_6"] = under_6.loc[hh_ids_person].values.astype( np.float32 ) @@ -499,10 +491,7 @@ def _impute_sipp( asset_train.index, size=min(20_000, len(asset_train)), replace=True, - p=( - asset_train.household_weight - / asset_train.household_weight.sum() - ), + p=(asset_train.household_weight / asset_train.household_weight.sum()), ) ] @@ -513,15 +502,15 @@ def _impute_sipp( ["employment_income", "age", "is_male"], ) if "is_male" in cps_asset_df.columns: - cps_asset_df["is_female"] = ( - ~cps_asset_df["is_male"].astype(bool) - ).astype(np.float32) + cps_asset_df["is_female"] = (~cps_asset_df["is_male"].astype(bool)).astype( + np.float32 + ) else: cps_asset_df["is_female"] = 0.0 if "is_married" in data: - cps_asset_df["is_married"] = data["is_married"][ - time_period - ].astype(np.float32) + cps_asset_df["is_married"] = data["is_married"][time_period].astype( + np.float32 + ) else: cps_asset_df["is_married"] = 0.0 cps_asset_df["count_under_18"] = ( @@ -623,9 +612,7 @@ def _impute_scf( cps_df = _build_cps_receiver(data, time_period, dataset_path, pe_vars) if "is_male" in cps_df.columns: - cps_df["is_female"] = (~cps_df["is_male"].astype(bool)).astype( - np.float32 - ) + cps_df["is_female"] = (~cps_df["is_male"].astype(bool)).astype(np.float32) else: cps_df["is_female"] = 0.0 diff --git a/policyengine_us_data/calibration/unified_calibration.py b/policyengine_us_data/calibration/unified_calibration.py index 1fb7a6b34..6a7a8bd1b 100644 --- a/policyengine_us_data/calibration/unified_calibration.py +++ b/policyengine_us_data/calibration/unified_calibration.py @@ -144,20 +144,14 @@ def rerandomize_takeup( is_state_specific = isinstance(rate_or_dict, dict) - entity_ids = sim.calculate( - f"{entity_level}_id", map_to=entity_level - ).values - entity_hh_ids = sim.calculate( - "household_id", map_to=entity_level - ).values + entity_ids = sim.calculate(f"{entity_level}_id", map_to=entity_level).values + entity_hh_ids = sim.calculate("household_id", map_to=entity_level).values n_entities = len(entity_ids) draws = np.zeros(n_entities, dtype=np.float64) rates = np.zeros(n_entities, dtype=np.float64) - entity_blocks = np.array( - [hh_to_block.get(hid, "0") for hid in entity_hh_ids] - ) + entity_blocks = np.array([hh_to_block.get(hid, "0") for hid in entity_hh_ids]) unique_blocks = np.unique(entity_blocks) for block in unique_blocks: @@ -185,9 +179,7 @@ def rerandomize_takeup( def parse_args(argv=None): - parser = argparse.ArgumentParser( - description="Unified L0 calibration pipeline" - ) + parser = argparse.ArgumentParser(description="Unified L0 calibration pipeline") parser.add_argument( "--dataset", default=None, @@ -308,8 +300,7 @@ def fit_l0_weights( initial_weights = np.ones(n_total) * 100 logger.info( - "L0 calibration: %d targets, %d features, " - "lambda_l0=%.1e, epochs=%d", + "L0 calibration: %d targets, %d features, lambda_l0=%.1e, epochs=%d", X_sparse.shape[0], n_total, lambda_l0, @@ -609,8 +600,7 @@ def run_calibration( ) source_path = str( - Path(dataset_path).parent - / f"source_imputed_{Path(dataset_path).stem}.h5" + Path(dataset_path).parent / f"source_imputed_{Path(dataset_path).stem}.h5" ) with h5py.File(source_path, "w") as f: for var, time_dict in data_dict.items(): @@ -716,9 +706,7 @@ def main(argv=None): dataset_path = args.dataset or str( STORAGE_FOLDER / "stratified_extended_cps_2024.h5" ) - db_path = args.db_path or str( - STORAGE_FOLDER / "calibration" / "policy_data.db" - ) + db_path = args.db_path or str(STORAGE_FOLDER / "calibration" / "policy_data.db") output_path = args.output or str( STORAGE_FOLDER / "calibration" / "unified_weights.npy" ) @@ -732,15 +720,11 @@ def main(argv=None): domain_variables = None if args.domain_variables: - domain_variables = [ - x.strip() for x in args.domain_variables.split(",") - ] + domain_variables = [x.strip() for x in args.domain_variables.split(",")] hierarchical_domains = None if args.hierarchical_domains: - hierarchical_domains = [ - x.strip() for x in args.hierarchical_domains.split(",") - ] + hierarchical_domains = [x.strip() for x in args.hierarchical_domains.split(",")] t_start = time.time() diff --git a/policyengine_us_data/calibration/unified_matrix_builder.py b/policyengine_us_data/calibration/unified_matrix_builder.py index ac31c34e1..c0b442f35 100644 --- a/policyengine_us_data/calibration/unified_matrix_builder.py +++ b/policyengine_us_data/calibration/unified_matrix_builder.py @@ -71,18 +71,10 @@ def _build_entity_relationship(self, sim) -> pd.DataFrame: self._entity_rel_cache = pd.DataFrame( { - "person_id": sim.calculate( - "person_id", map_to="person" - ).values, - "household_id": sim.calculate( - "household_id", map_to="person" - ).values, - "tax_unit_id": sim.calculate( - "tax_unit_id", map_to="person" - ).values, - "spm_unit_id": sim.calculate( - "spm_unit_id", map_to="person" - ).values, + "person_id": sim.calculate("person_id", map_to="person").values, + "household_id": sim.calculate("household_id", map_to="person").values, + "tax_unit_id": sim.calculate("tax_unit_id", map_to="person").values, + "spm_unit_id": sim.calculate("spm_unit_id", map_to="person").values, } ) return self._entity_rel_cache @@ -126,9 +118,7 @@ def _evaluate_constraints_entity_aware( df["satisfies"] = person_mask hh_mask = df.groupby("household_id")["satisfies"].any() - household_ids = sim.calculate( - "household_id", map_to="household" - ).values + household_ids = sim.calculate("household_id", map_to="household").values return np.array([hh_mask.get(hid, False) for hid in household_ids]) # --------------------------------------------------------------- @@ -240,9 +230,7 @@ def _calculate_uprating_factors(self, params) -> dict: factors[(from_year, "cpi")] = 1.0 try: - pop_from = params.calibration.gov.census.populations.total( - from_year - ) + pop_from = params.calibration.gov.census.populations.total(from_year) pop_to = params.calibration.gov.census.populations.total( self.time_period ) @@ -326,9 +314,7 @@ def _get_state_uprating_factors( var_factors[var] = 1.0 continue period = row.iloc[0]["period"] - factor, _ = self._get_uprating_info( - var, period, national_factors - ) + factor, _ = self._get_uprating_info(var, period, national_factors) var_factors[var] = factor result[state_int] = var_factors @@ -430,14 +416,12 @@ def print_uprating_summary(self, targets_df: pd.DataFrame) -> None: print("\n" + "=" * 60) print("UPRATING SUMMARY") print("=" * 60) - print(f"Uprated {len(uprated)} of " f"{len(targets_df)} targets") + print(f"Uprated {len(uprated)} of {len(targets_df)} targets") period_counts = uprated["period"].value_counts().sort_index() for period, count in period_counts.items(): print(f" Period {period}: {count} targets") factors = eff[eff != 1.0] - print( - f" Factor range: [{factors.min():.4f}, " f"{factors.max():.4f}]" - ) + print(f" Factor range: [{factors.min():.4f}, {factors.max():.4f}]") # --------------------------------------------------------------- # Target naming @@ -465,9 +449,7 @@ def _make_target_name( non_geo = [c for c in constraints if c["variable"] not in _GEO_VARS] if non_geo: - strs = [ - f"{c['variable']}{c['operation']}{c['value']}" for c in non_geo - ] + strs = [f"{c['variable']}{c['operation']}{c['value']}" for c in non_geo] parts.append("[" + ",".join(strs) + "]") return "/".join(parts) @@ -510,12 +492,8 @@ def _calculate_target_values( return np.zeros(n_households, dtype=np.float32) person_mask &= apply_op(cv, c["operation"], c["value"]) - target_entity = sim.tax_benefit_system.variables[ - target_variable - ].entity.key - household_ids = sim.calculate( - "household_id", map_to="household" - ).values + target_entity = sim.tax_benefit_system.variables[target_variable].entity.key + household_ids = sim.calculate("household_id", map_to="household").values if target_entity == "household": if non_geo_constraints: @@ -674,15 +652,9 @@ def build_matrix( n_targets = len(targets_df) # 2. Sort targets by geographic level - targets_df["_geo_level"] = targets_df["geographic_id"].apply( - get_geo_level - ) - targets_df = targets_df.sort_values( - ["_geo_level", "variable", "geographic_id"] - ) - targets_df = targets_df.drop(columns=["_geo_level"]).reset_index( - drop=True - ) + targets_df["_geo_level"] = targets_df["geographic_id"].apply(get_geo_level) + targets_df = targets_df.sort_values(["_geo_level", "variable", "geographic_id"]) + targets_df = targets_df.drop(columns=["_geo_level"]).reset_index(drop=True) # 3. Build column index structures from geography state_col_lists: Dict[int, list] = defaultdict(list) @@ -709,9 +681,7 @@ def build_matrix( geo_id = row["geographic_id"] target_geo_info.append((geo_level, geo_id)) - non_geo = [ - c for c in constraints if c["variable"] not in _GEO_VARS - ] + non_geo = [c for c in constraints if c["variable"] not in _GEO_VARS] non_geo_constraints_list.append(non_geo) target_names.append( @@ -745,7 +715,7 @@ def build_matrix( clone_states = geography.state_fips[col_start:col_end] logger.info( - "Processing clone %d/%d " "(cols %d-%d, %d unique states)...", + "Processing clone %d/%d (cols %d-%d, %d unique states)...", clone_idx + 1, n_clones, col_start, diff --git a/policyengine_us_data/datasets/acs/acs.py b/policyengine_us_data/datasets/acs/acs.py index 0ecd3ee7c..11d1ef738 100644 --- a/policyengine_us_data/datasets/acs/acs.py +++ b/policyengine_us_data/datasets/acs/acs.py @@ -18,9 +18,7 @@ def generate(self) -> None: raw_data = self.census_acs(require=True).load() acs = h5py.File(self.file_path, mode="w") - person, household = [ - raw_data[entity] for entity in ("person", "household") - ] + person, household = [raw_data[entity] for entity in ("person", "household")] self.add_id_variables(acs, person, household) self.add_person_variables(acs, person, household) @@ -39,9 +37,7 @@ def add_id_variables( h_id_to_number = pd.Series( np.arange(len(household)), index=household["SERIALNO"] ) - household["household_id"] = h_id_to_number[ - household["SERIALNO"] - ].values + household["household_id"] = h_id_to_number[household["SERIALNO"]].values person["household_id"] = h_id_to_number[person["SERIALNO"]].values person["person_id"] = person.index + 1 @@ -100,9 +96,7 @@ def add_spm_variables(acs: h5py.File, spm_unit: DataFrame) -> None: @staticmethod def add_household_variables(acs: h5py.File, household: DataFrame) -> None: acs["household_vehicles_owned"] = household.VEH - acs["state_fips"] = acs["household_state_fips"] = household.ST.astype( - int - ) + acs["state_fips"] = acs["household_state_fips"] = household.ST.astype(int) class ACS_2022(ACS): diff --git a/policyengine_us_data/datasets/acs/census_acs.py b/policyengine_us_data/datasets/acs/census_acs.py index 842af6279..7bd28bd61 100644 --- a/policyengine_us_data/datasets/acs/census_acs.py +++ b/policyengine_us_data/datasets/acs/census_acs.py @@ -66,9 +66,7 @@ def generate(self) -> None: household = self.process_household_data( household_url, "psam_hus", HOUSEHOLD_COLUMNS ) - person = self.process_person_data( - person_url, "psam_pus", PERSON_COLUMNS - ) + person = self.process_person_data(person_url, "psam_pus", PERSON_COLUMNS) person = person[person.SERIALNO.isin(household.SERIALNO)] household = household[household.SERIALNO.isin(person.SERIALNO)] storage["household"] = household @@ -106,9 +104,7 @@ def process_household_data( return res @staticmethod - def process_person_data( - url: str, prefix: str, columns: List[str] - ) -> pd.DataFrame: + def process_person_data(url: str, prefix: str, columns: List[str]) -> pd.DataFrame: req = requests.get(url, stream=True) with BytesIO() as f: pbar = tqdm() @@ -137,9 +133,7 @@ def process_person_data( return res @staticmethod - def create_spm_unit_table( - storage: pd.HDFStore, person: pd.DataFrame - ) -> None: + def create_spm_unit_table(storage: pd.HDFStore, person: pd.DataFrame) -> None: SPM_UNIT_COLUMNS = [ "CAPHOUSESUB", "CAPWKCCXPNS", @@ -181,12 +175,10 @@ def create_spm_unit_table( # Ensure SERIALNO is treated as string JOIN_COLUMNS = ["SERIALNO", "SPORDER"] - original_person_table["SERIALNO"] = original_person_table[ - "SERIALNO" - ].astype(str) - original_person_table["SPORDER"] = original_person_table[ - "SPORDER" - ].astype(int) + original_person_table["SERIALNO"] = original_person_table["SERIALNO"].astype( + str + ) + original_person_table["SPORDER"] = original_person_table["SPORDER"].astype(int) person["SERIALNO"] = person["SERIALNO"].astype(str) person["SPORDER"] = person["SPORDER"].astype(int) diff --git a/policyengine_us_data/datasets/cps/census_cps.py b/policyengine_us_data/datasets/cps/census_cps.py index 00ca020ef..042fefe56 100644 --- a/policyengine_us_data/datasets/cps/census_cps.py +++ b/policyengine_us_data/datasets/cps/census_cps.py @@ -15,9 +15,7 @@ class CensusCPS(Dataset): def generate(self): if self._cps_download_url is None: - raise ValueError( - f"No raw CPS data URL known for year {self.time_period}." - ) + raise ValueError(f"No raw CPS data URL known for year {self.time_period}.") url = self._cps_download_url @@ -28,9 +26,7 @@ def generate(self): ] response = requests.get(url, stream=True) - total_size_in_bytes = int( - response.headers.get("content-length", 200e6) - ) + total_size_in_bytes = int(response.headers.get("content-length", 200e6)) progress_bar = tqdm( total=total_size_in_bytes, unit="iB", @@ -38,9 +34,7 @@ def generate(self): desc="Downloading ASEC", ) if response.status_code == 404: - raise FileNotFoundError( - "Received a 404 response when fetching the data." - ) + raise FileNotFoundError("Received a 404 response when fetching the data.") with BytesIO() as file: content_length_actual = 0 for data in response.iter_content(int(1e6)): @@ -65,33 +59,23 @@ def generate(self): file_prefix = "cpspb/asec/prod/data/2019/" else: file_prefix = "" - with zipfile.open( - f"{file_prefix}pppub{file_year_code}.csv" - ) as f: + with zipfile.open(f"{file_prefix}pppub{file_year_code}.csv") as f: storage["person"] = pd.read_csv( f, - usecols=PERSON_COLUMNS - + spm_unit_columns - + TAX_UNIT_COLUMNS, + usecols=PERSON_COLUMNS + spm_unit_columns + TAX_UNIT_COLUMNS, ).fillna(0) person = storage["person"] - with zipfile.open( - f"{file_prefix}ffpub{file_year_code}.csv" - ) as f: + with zipfile.open(f"{file_prefix}ffpub{file_year_code}.csv") as f: person_family_id = person.PH_SEQ * 10 + person.PF_SEQ family = pd.read_csv(f).fillna(0) family_id = family.FH_SEQ * 10 + family.FFPOS family = family[family_id.isin(person_family_id)] storage["family"] = family - with zipfile.open( - f"{file_prefix}hhpub{file_year_code}.csv" - ) as f: + with zipfile.open(f"{file_prefix}hhpub{file_year_code}.csv") as f: person_household_id = person.PH_SEQ household = pd.read_csv(f).fillna(0) household_id = household.H_SEQ - household = household[ - household_id.isin(person_household_id) - ] + household = household[household_id.isin(person_household_id)] storage["household"] = household storage["tax_unit"] = self._create_tax_unit_table(person) storage["spm_unit"] = self._create_spm_unit_table( diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index bbc7f4fba..ccbe48850 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -93,9 +93,7 @@ def downsample(self, frac: float): # Store original dtypes before modifying original_data: dict = self.load_dataset() - original_dtypes = { - key: original_data[key].dtype for key in original_data - } + original_dtypes = {key: original_data[key].dtype for key in original_data} sim = Microsimulation(dataset=self) sim.subsample(frac=frac) @@ -208,18 +206,13 @@ def add_takeup(self): aca_rate = load_take_up_rate("aca", self.time_period) medicaid_rates_by_state = load_take_up_rate("medicaid", self.time_period) head_start_rate = load_take_up_rate("head_start", self.time_period) - early_head_start_rate = load_take_up_rate( - "early_head_start", self.time_period - ) + early_head_start_rate = load_take_up_rate("early_head_start", self.time_period) ssi_rate = load_take_up_rate("ssi", self.time_period) # EITC: varies by number of children eitc_child_count = baseline.calculate("eitc_child_count").values eitc_takeup_rate = np.array( - [ - eitc_rates_by_children.get(min(int(c), 3), 0.85) - for c in eitc_child_count - ] + [eitc_rates_by_children.get(min(int(c), 3), 0.85) for c in eitc_child_count] ) rng = seeded_rng("takes_up_eitc") data["takes_up_eitc"] = rng.random(n_tax_units) < eitc_takeup_rate @@ -238,9 +231,7 @@ def add_takeup(self): target_snap_takeup_count = int(snap_rate * n_spm_units) remaining_snap_needed = max(0, target_snap_takeup_count - n_snap_reporters) snap_non_reporter_rate = ( - remaining_snap_needed / n_snap_non_reporters - if n_snap_non_reporters > 0 - else 0 + remaining_snap_needed / n_snap_non_reporters if n_snap_non_reporters > 0 else 0 ) # Assign: all reporters + adjusted rate for non-reporters @@ -257,9 +248,7 @@ def add_takeup(self): hh_ids = data["household_id"] person_hh_ids = data["person_household_id"] hh_to_state = dict(zip(hh_ids, state_codes)) - person_states = np.array( - [hh_to_state.get(hh_id, "CA") for hh_id in person_hh_ids] - ) + person_states = np.array([hh_to_state.get(hh_id, "CA") for hh_id in person_hh_ids]) medicaid_rate_by_person = np.array( [medicaid_rates_by_state.get(s, 0.93) for s in person_states] ) @@ -270,9 +259,7 @@ def add_takeup(self): # Head Start rng = seeded_rng("takes_up_head_start_if_eligible") - data["takes_up_head_start_if_eligible"] = ( - rng.random(n_persons) < head_start_rate - ) + data["takes_up_head_start_if_eligible"] = rng.random(n_persons) < head_start_rate # Early Head Start rng = seeded_rng("takes_up_early_head_start_if_eligible") @@ -290,9 +277,7 @@ def add_takeup(self): target_ssi_takeup_count = int(ssi_rate * n_persons) remaining_ssi_needed = max(0, target_ssi_takeup_count - n_ssi_reporters) ssi_non_reporter_rate = ( - remaining_ssi_needed / n_ssi_non_reporters - if n_ssi_non_reporters > 0 - else 0 + remaining_ssi_needed / n_ssi_non_reporters if n_ssi_non_reporters > 0 else 0 ) # Assign: all reporters + adjusted rate for non-reporters @@ -315,9 +300,7 @@ def add_takeup(self): data["would_claim_wic"] = rng.random(n_persons) < wic_takeup_rate_by_person # WIC nutritional risk — fully resolved - wic_risk_rates = load_take_up_rate( - "wic_nutritional_risk", self.time_period - ) + wic_risk_rates = load_take_up_rate("wic_nutritional_risk", self.time_period) wic_risk_rate_by_person = np.array( [wic_risk_rates.get(c, 0) for c in wic_categories] ) @@ -364,12 +347,8 @@ def uprate_cps_data(data, from_period, to_period): uprating = create_policyengine_uprating_factors_table() for variable in uprating.index.unique(): if variable in data: - current_index = uprating[uprating.index == variable][ - to_period - ].values[0] - start_index = uprating[uprating.index == variable][ - from_period - ].values[0] + current_index = uprating[uprating.index == variable][to_period].values[0] + start_index = uprating[uprating.index == variable][from_period].values[0] growth = current_index / start_index data[variable] = data[variable] * growth @@ -411,9 +390,7 @@ def add_id_variables( # Marital units - marital_unit_id = person.PH_SEQ * 1e6 + np.maximum( - person.A_LINENO, person.A_SPOUSE - ) + marital_unit_id = person.PH_SEQ * 1e6 + np.maximum(person.A_LINENO, person.A_SPOUSE) # marital_unit_id is not the household ID, zero padded and followed # by the index within household (of each person, or their spouse if @@ -453,9 +430,7 @@ def add_personal_variables(cps: h5py.File, person: DataFrame) -> None: # "Is...blind or does...have serious difficulty seeing even when Wearing # glasses?" 1 -> Yes cps["is_blind"] = person.PEDISEYE == 1 - DISABILITY_FLAGS = [ - "PEDIS" + i for i in ["DRS", "EAR", "EYE", "OUT", "PHY", "REM"] - ] + DISABILITY_FLAGS = ["PEDIS" + i for i in ["DRS", "EAR", "EYE", "OUT", "PHY", "REM"]] cps["is_disabled"] = (person[DISABILITY_FLAGS] == 1).any(axis=1) def children_per_parent(col: str) -> pd.DataFrame: @@ -477,9 +452,7 @@ def children_per_parent(col: str) -> pd.DataFrame: # Aggregate to parent. res = ( - pd.concat( - [children_per_parent("PEPAR1"), children_per_parent("PEPAR2")] - ) + pd.concat([children_per_parent("PEPAR1"), children_per_parent("PEPAR2")]) .groupby(["PH_SEQ", "A_LINENO"]) .children.sum() .reset_index() @@ -505,9 +478,7 @@ def children_per_parent(col: str) -> pd.DataFrame: add_overtime_occupation(cps, person) -def add_personal_income_variables( - cps: h5py.File, person: DataFrame, year: int -): +def add_personal_income_variables(cps: h5py.File, person: DataFrame, year: int): """Add income variables. Args: @@ -533,16 +504,14 @@ def add_personal_income_variables( cps["weekly_hours_worked"] = person.HRSWK * person.WKSWORK / 52 cps["hours_worked_last_week"] = person.A_HRS1 * person.WKSWORK / 52 - cps["taxable_interest_income"] = person.INT_VAL * ( - p["taxable_interest_fraction"] - ) + cps["taxable_interest_income"] = person.INT_VAL * (p["taxable_interest_fraction"]) cps["tax_exempt_interest_income"] = person.INT_VAL * ( 1 - p["taxable_interest_fraction"] ) cps["self_employment_income"] = person.SEMP_VAL cps["farm_income"] = person.FRSE_VAL - cps["qualified_dividend_income"] = person.DIV_VAL * ( - p["qualified_dividend_fraction"] + cps["qualified_dividend_income"] = ( + person.DIV_VAL * (p["qualified_dividend_fraction"]) ) cps["non_qualified_dividend_income"] = person.DIV_VAL * ( 1 - p["qualified_dividend_fraction"] @@ -561,18 +530,14 @@ def add_personal_income_variables( # 8 = Other is_retirement = (person.RESNSS1 == 1) | (person.RESNSS2 == 1) is_disability = (person.RESNSS1 == 2) | (person.RESNSS2 == 2) - is_survivor = np.isin(person.RESNSS1, [3, 5]) | np.isin( - person.RESNSS2, [3, 5] - ) + is_survivor = np.isin(person.RESNSS1, [3, 5]) | np.isin(person.RESNSS2, [3, 5]) is_dependent = np.isin(person.RESNSS1, [4, 6, 7]) | np.isin( person.RESNSS2, [4, 6, 7] ) # Primary classification: assign full SS_VAL to the highest- # priority category when someone has multiple source codes. - cps["social_security_retirement"] = np.where( - is_retirement, person.SS_VAL, 0 - ) + cps["social_security_retirement"] = np.where(is_retirement, person.SS_VAL, 0) cps["social_security_disability"] = np.where( is_disability & ~is_retirement, person.SS_VAL, 0 ) @@ -615,9 +580,7 @@ def add_personal_income_variables( # Add pensions and annuities. cps_pensions = person.PNSN_VAL + person.ANN_VAL # Assume a constant fraction of pension income is taxable. - cps["taxable_private_pension_income"] = ( - cps_pensions * p["taxable_pension_fraction"] - ) + cps["taxable_private_pension_income"] = cps_pensions * p["taxable_pension_fraction"] cps["tax_exempt_private_pension_income"] = cps_pensions * ( 1 - p["taxable_pension_fraction"] ) @@ -641,18 +604,11 @@ def add_personal_income_variables( for source_with_taxable_fraction in ["401k", "403b", "sep"]: cps[f"taxable_{source_with_taxable_fraction}_distributions"] = ( cps[f"{source_with_taxable_fraction}_distributions"] - * p[ - f"taxable_{source_with_taxable_fraction}_distribution_fraction" - ] + * p[f"taxable_{source_with_taxable_fraction}_distribution_fraction"] ) cps[f"tax_exempt_{source_with_taxable_fraction}_distributions"] = cps[ f"{source_with_taxable_fraction}_distributions" - ] * ( - 1 - - p[ - f"taxable_{source_with_taxable_fraction}_distribution_fraction" - ] - ) + ] * (1 - p[f"taxable_{source_with_taxable_fraction}_distribution_fraction"]) del cps[f"{source_with_taxable_fraction}_distributions"] # Assume all regular IRA distributions are taxable, @@ -740,9 +696,7 @@ def add_personal_income_variables( cps["traditional_ira_contributions"] = ira_capped * trad_ira_share cps["roth_ira_contributions"] = ira_capped * (1 - trad_ira_share) # Allocate capital gains into long-term and short-term based on aggregate split. - cps["long_term_capital_gains"] = person.CAP_VAL * ( - p["long_term_capgain_fraction"] - ) + cps["long_term_capital_gains"] = person.CAP_VAL * (p["long_term_capgain_fraction"]) cps["short_term_capital_gains"] = person.CAP_VAL * ( 1 - p["long_term_capgain_fraction"] ) @@ -770,10 +724,7 @@ def add_personal_income_variables( # Get QBI simulation parameters --- yamlfilename = ( - files("policyengine_us_data") - / "datasets" - / "puf" - / "qbi_assumptions.yaml" + files("policyengine_us_data") / "datasets" / "puf" / "qbi_assumptions.yaml" ) with open(yamlfilename, "r", encoding="utf-8") as yamlfile: p = yaml.safe_load(yamlfile) @@ -827,14 +778,10 @@ def add_spm_variables(self, cps: h5py.File, spm_unit: DataFrame) -> None: 3: "RENTER", } cps["spm_unit_tenure_type"] = ( - spm_unit.SPM_TENMORTSTATUS.map(tenure_map) - .fillna("RENTER") - .astype("S") + spm_unit.SPM_TENMORTSTATUS.map(tenure_map).fillna("RENTER").astype("S") ) - cps["reduced_price_school_meals_reported"] = ( - cps["free_school_meals_reported"] * 0 - ) + cps["reduced_price_school_meals_reported"] = cps["free_school_meals_reported"] * 0 def add_household_variables(cps: h5py.File, household: DataFrame) -> None: @@ -968,9 +915,7 @@ def select_random_subset_to_target( share_to_move = min(share_to_move, 1.0) # Cap at 100% else: # Calculate how much to move to reach target (for EAD case) - needed_weighted = ( - current_weighted - target_weighted - ) # Will be negative + needed_weighted = current_weighted - target_weighted # Will be negative total_weight = np.sum(person_weights[eligible_ids]) share_to_move = abs(needed_weighted) / total_weight share_to_move = min(share_to_move, 1.0) # Cap at 100% @@ -1214,9 +1159,7 @@ def select_random_subset_to_target( ) # CONDITION 10: Government Employees - is_government_worker = np.isin( - person.PEIO1COW, [1, 2, 3] - ) # Fed/state/local gov + is_government_worker = np.isin(person.PEIO1COW, [1, 2, 3]) # Fed/state/local gov is_military_occupation = person.A_MJOCC == 11 # Military occupation is_government_employee = is_government_worker | is_military_occupation condition_10_mask = potentially_undocumented & is_government_employee @@ -1330,12 +1273,8 @@ def select_random_subset_to_target( undocumented_students_mask = ( (ssn_card_type == 0) & noncitizens & (person.A_HSCOL == 2) ) - undocumented_workers_count = np.sum( - person_weights[undocumented_workers_mask] - ) - undocumented_students_count = np.sum( - person_weights[undocumented_students_mask] - ) + undocumented_workers_count = np.sum(person_weights[undocumented_workers_mask]) + undocumented_students_count = np.sum(person_weights[undocumented_students_mask]) after_conditions_code_0 = np.sum(person_weights[ssn_card_type == 0]) print(f"After conditions - Code 0 people: {after_conditions_code_0:,.0f}") @@ -1530,15 +1469,11 @@ def select_random_subset_to_target( f"Selected {len(selected_indices)} people from {len(mixed_household_candidates)} candidates in mixed households" ) else: - print( - "No additional family members selected (target already reached)" - ) + print("No additional family members selected (target already reached)") else: print("No mixed-status households found for family correlation") else: - print( - "No additional undocumented people needed - target already reached" - ) + print("No additional undocumented people needed - target already reached") # Calculate the weighted impact code_0_after = np.sum(person_weights[ssn_card_type == 0]) @@ -1613,9 +1548,7 @@ def get_arrival_year_midpoint(peinusyr): age_at_entry = np.maximum(0, person.A_AGE - years_in_us) # start every non-citizen as LPR so no UNSET survives - immigration_status = np.full( - len(person), "LEGAL_PERMANENT_RESIDENT", dtype="U32" - ) + immigration_status = np.full(len(person), "LEGAL_PERMANENT_RESIDENT", dtype="U32") # Set citizens (SSN card type 1) to CITIZEN status immigration_status[ssn_card_type == 1] = "CITIZEN" @@ -1663,9 +1596,7 @@ def get_arrival_year_midpoint(peinusyr): immigration_status[recent_refugee_mask] = "REFUGEE" # 6. Temp non-qualified (Code 2 not caught by DACA rule) - mask = (ssn_card_type == 2) & ( - immigration_status == "LEGAL_PERMANENT_RESIDENT" - ) + mask = (ssn_card_type == 2) & (immigration_status == "LEGAL_PERMANENT_RESIDENT") immigration_status[mask] = "TPS" # Final write (all values now in ImmigrationStatus Enum) @@ -1681,9 +1612,7 @@ def get_arrival_year_midpoint(peinusyr): 2: "NON_CITIZEN_VALID_EAD", # Non-citizens with work/study authorization 3: "OTHER_NON_CITIZEN", # Non-citizens with indicators of legal status } - ssn_card_type_str = ( - pd.Series(ssn_card_type).map(code_to_str).astype("S").values - ) + ssn_card_type_str = pd.Series(ssn_card_type).map(code_to_str).astype("S").values cps["ssn_card_type"] = ssn_card_type_str # Final population summary @@ -1749,25 +1678,63 @@ def _update_documentation_with_numbers(log_df, docs_dir): # Define replacements based on our logging structure replacements = { - "- **Step 0 - Initial**: Code 0 people = *[Run cps.py to populate]*": lambda: f"- **Step 0 - Initial**: Code 0 people = {data_map.get(('Step 0 - Initial', 'Code 0 people'), 0):,.0f}", - "- **Step 1 - Citizens**: Moved to Code 1 = *[Run cps.py to populate]*": lambda: f"- **Step 1 - Citizens**: Moved to Code 1 = {data_map.get(('Step 1 - Citizens', 'Moved to Code 1'), 0):,.0f}", - "- **ASEC Conditions**: Current Code 0 people = *[Run cps.py to populate]*": lambda: f"- **ASEC Conditions**: Current Code 0 people = {data_map.get(('ASEC Conditions', 'Current Code 0 people'), 0):,.0f}", - "- **After conditions**: Code 0 people = *[Run cps.py to populate]*": lambda: f"- **After conditions**: Code 0 people = {data_map.get(('After conditions', 'Code 0 people'), 0):,.0f}", - "- **Before adjustment**: Undocumented workers = *[Run cps.py to populate]*": lambda: f"- **Before adjustment**: Undocumented workers = {data_map.get(('Before adjustment', 'Undocumented workers'), 0):,.0f}", - "- **Target**: Undocumented workers target = *[Run cps.py to populate]*": lambda: f"- **Target**: Undocumented workers target = {data_map.get(('Target', 'Undocumented workers target'), 0):,.0f}", - "- **Before adjustment**: Undocumented students = *[Run cps.py to populate]*": lambda: f"- **Before adjustment**: Undocumented students = {data_map.get(('Before adjustment', 'Undocumented students'), 0):,.0f}", - "- **Target**: Undocumented students target = *[Run cps.py to populate]*": lambda: f"- **Target**: Undocumented students target = {data_map.get(('Target', 'Undocumented students target'), 0):,.0f}", - "- **Step 3 - EAD workers**: Moved from Code 0 to Code 2 = *[Run cps.py to populate]*": lambda: f"- **Step 3 - EAD workers**: Moved from Code 0 to Code 2 = {data_map.get(('Step 3 - EAD workers', 'Moved from Code 0 to Code 2'), 0):,.0f}", - "- **Step 4 - EAD students**: Moved from Code 0 to Code 2 = *[Run cps.py to populate]*": lambda: f"- **Step 4 - EAD students**: Moved from Code 0 to Code 2 = {data_map.get(('Step 4 - EAD students', 'Moved from Code 0 to Code 2'), 0):,.0f}", - "- **After EAD assignment**: Code 0 people = *[Run cps.py to populate]*": lambda: f"- **After EAD assignment**: Code 0 people = {data_map.get(('After EAD assignment', 'Code 0 people'), 0):,.0f}", - "- **Step 5 - Family correlation**: Changed from Code 3 to Code 0 = *[Run cps.py to populate]*": lambda: f"- **Step 5 - Family correlation**: Changed from Code 3 to Code 0 = {data_map.get(('Step 5 - Family correlation', 'Changed from Code 3 to Code 0'), 0):,.0f}", - "- **After family correlation**: Code 0 people = *[Run cps.py to populate]*": lambda: f"- **After family correlation**: Code 0 people = {data_map.get(('After family correlation', 'Code 0 people'), 0):,.0f}", - "- **Final**: Code 0 (NONE) = *[Run cps.py to populate]*": lambda: f"- **Final**: Code 0 (NONE) = {data_map.get(('Final', 'Code 0 (NONE)'), 0):,.0f}", - "- **Final**: Code 1 (CITIZEN) = *[Run cps.py to populate]*": lambda: f"- **Final**: Code 1 (CITIZEN) = {data_map.get(('Final', 'Code 1 (CITIZEN)'), 0):,.0f}", - "- **Final**: Code 2 (NON_CITIZEN_VALID_EAD) = *[Run cps.py to populate]*": lambda: f"- **Final**: Code 2 (NON_CITIZEN_VALID_EAD) = {data_map.get(('Final', 'Code 2 (NON_CITIZEN_VALID_EAD)'), 0):,.0f}", - "- **Final**: Code 3 (OTHER_NON_CITIZEN) = *[Run cps.py to populate]*": lambda: f"- **Final**: Code 3 (OTHER_NON_CITIZEN) = {data_map.get(('Final', 'Code 3 (OTHER_NON_CITIZEN)'), 0):,.0f}", - "- **Final**: Total undocumented (Code 0) = *[Run cps.py to populate]*": lambda: f"- **Final**: Total undocumented (Code 0) = {data_map.get(('Final', 'Total undocumented (Code 0)'), 0):,.0f}", - "- **Final**: Undocumented target = *[Run cps.py to populate]*": lambda: f"- **Final**: Undocumented target = {data_map.get(('Final', 'Undocumented target'), 0):,.0f}", + "- **Step 0 - Initial**: Code 0 people = *[Run cps.py to populate]*": lambda: ( + f"- **Step 0 - Initial**: Code 0 people = {data_map.get(('Step 0 - Initial', 'Code 0 people'), 0):,.0f}" + ), + "- **Step 1 - Citizens**: Moved to Code 1 = *[Run cps.py to populate]*": lambda: ( + f"- **Step 1 - Citizens**: Moved to Code 1 = {data_map.get(('Step 1 - Citizens', 'Moved to Code 1'), 0):,.0f}" + ), + "- **ASEC Conditions**: Current Code 0 people = *[Run cps.py to populate]*": lambda: ( + f"- **ASEC Conditions**: Current Code 0 people = {data_map.get(('ASEC Conditions', 'Current Code 0 people'), 0):,.0f}" + ), + "- **After conditions**: Code 0 people = *[Run cps.py to populate]*": lambda: ( + f"- **After conditions**: Code 0 people = {data_map.get(('After conditions', 'Code 0 people'), 0):,.0f}" + ), + "- **Before adjustment**: Undocumented workers = *[Run cps.py to populate]*": lambda: ( + f"- **Before adjustment**: Undocumented workers = {data_map.get(('Before adjustment', 'Undocumented workers'), 0):,.0f}" + ), + "- **Target**: Undocumented workers target = *[Run cps.py to populate]*": lambda: ( + f"- **Target**: Undocumented workers target = {data_map.get(('Target', 'Undocumented workers target'), 0):,.0f}" + ), + "- **Before adjustment**: Undocumented students = *[Run cps.py to populate]*": lambda: ( + f"- **Before adjustment**: Undocumented students = {data_map.get(('Before adjustment', 'Undocumented students'), 0):,.0f}" + ), + "- **Target**: Undocumented students target = *[Run cps.py to populate]*": lambda: ( + f"- **Target**: Undocumented students target = {data_map.get(('Target', 'Undocumented students target'), 0):,.0f}" + ), + "- **Step 3 - EAD workers**: Moved from Code 0 to Code 2 = *[Run cps.py to populate]*": lambda: ( + f"- **Step 3 - EAD workers**: Moved from Code 0 to Code 2 = {data_map.get(('Step 3 - EAD workers', 'Moved from Code 0 to Code 2'), 0):,.0f}" + ), + "- **Step 4 - EAD students**: Moved from Code 0 to Code 2 = *[Run cps.py to populate]*": lambda: ( + f"- **Step 4 - EAD students**: Moved from Code 0 to Code 2 = {data_map.get(('Step 4 - EAD students', 'Moved from Code 0 to Code 2'), 0):,.0f}" + ), + "- **After EAD assignment**: Code 0 people = *[Run cps.py to populate]*": lambda: ( + f"- **After EAD assignment**: Code 0 people = {data_map.get(('After EAD assignment', 'Code 0 people'), 0):,.0f}" + ), + "- **Step 5 - Family correlation**: Changed from Code 3 to Code 0 = *[Run cps.py to populate]*": lambda: ( + f"- **Step 5 - Family correlation**: Changed from Code 3 to Code 0 = {data_map.get(('Step 5 - Family correlation', 'Changed from Code 3 to Code 0'), 0):,.0f}" + ), + "- **After family correlation**: Code 0 people = *[Run cps.py to populate]*": lambda: ( + f"- **After family correlation**: Code 0 people = {data_map.get(('After family correlation', 'Code 0 people'), 0):,.0f}" + ), + "- **Final**: Code 0 (NONE) = *[Run cps.py to populate]*": lambda: ( + f"- **Final**: Code 0 (NONE) = {data_map.get(('Final', 'Code 0 (NONE)'), 0):,.0f}" + ), + "- **Final**: Code 1 (CITIZEN) = *[Run cps.py to populate]*": lambda: ( + f"- **Final**: Code 1 (CITIZEN) = {data_map.get(('Final', 'Code 1 (CITIZEN)'), 0):,.0f}" + ), + "- **Final**: Code 2 (NON_CITIZEN_VALID_EAD) = *[Run cps.py to populate]*": lambda: ( + f"- **Final**: Code 2 (NON_CITIZEN_VALID_EAD) = {data_map.get(('Final', 'Code 2 (NON_CITIZEN_VALID_EAD)'), 0):,.0f}" + ), + "- **Final**: Code 3 (OTHER_NON_CITIZEN) = *[Run cps.py to populate]*": lambda: ( + f"- **Final**: Code 3 (OTHER_NON_CITIZEN) = {data_map.get(('Final', 'Code 3 (OTHER_NON_CITIZEN)'), 0):,.0f}" + ), + "- **Final**: Total undocumented (Code 0) = *[Run cps.py to populate]*": lambda: ( + f"- **Final**: Total undocumented (Code 0) = {data_map.get(('Final', 'Total undocumented (Code 0)'), 0):,.0f}" + ), + "- **Final**: Undocumented target = *[Run cps.py to populate]*": lambda: ( + f"- **Final**: Undocumented target = {data_map.get(('Final', 'Undocumented target'), 0):,.0f}" + ), } # Apply replacements @@ -1852,9 +1819,7 @@ def add_tips(self, cps: h5py.File): # Drop temporary columns used only for imputation # is_married is person-level here but policyengine-us defines it at Family # level, so we must not save it - cps = cps.drop( - columns=["is_married", "is_under_18", "is_under_6"], errors="ignore" - ) + cps = cps.drop(columns=["is_married", "is_under_18", "is_under_6"], errors="ignore") self.save_dataset(cps) @@ -1974,9 +1939,7 @@ def create_scf_reference_person_mask(cps_data, raw_person_data): all_persons_data["is_female"] = (raw_person_data.A_SEX == 2).values # Add marital status (A_MARITL codes: 1,2 = married with spouse present/absent) - all_persons_data["is_married"] = raw_person_data.A_MARITL.isin( - [1, 2] - ).values + all_persons_data["is_married"] = raw_person_data.A_MARITL.isin([1, 2]).values # Define adults as age 18+ all_persons_data["is_adult"] = all_persons_data["age"] >= 18 @@ -1995,8 +1958,7 @@ def create_scf_reference_person_mask(cps_data, raw_person_data): # Identify couple households (households with exactly 2 married adults) married_adults_per_household = ( all_persons_data[ - (all_persons_data["is_adult"]) - & (all_persons_data["is_married"]) + (all_persons_data["is_adult"]) & (all_persons_data["is_married"]) ] .groupby("person_household_id") .size() @@ -2004,12 +1966,7 @@ def create_scf_reference_person_mask(cps_data, raw_person_data): couple_households = married_adults_per_household[ (married_adults_per_household == 2) - & ( - all_persons_data.groupby("person_household_id")[ - "n_adults" - ].first() - == 2 - ) + & (all_persons_data.groupby("person_household_id")["n_adults"].first() == 2) ].index all_persons_data["is_couple_household"] = all_persons_data[ @@ -2109,9 +2066,7 @@ def determine_reference_person(group): } # Apply the mapping to recode the race values - cps_data["cps_race"] = np.vectorize(CPS_RACE_MAPPING.get)( - cps_data["cps_race"] - ) + cps_data["cps_race"] = np.vectorize(CPS_RACE_MAPPING.get)(cps_data["cps_race"]) lengths = {k: len(v) for k, v in cps_data.items()} var_len = cps_data["person_household_id"].shape[0] @@ -2143,9 +2098,7 @@ def determine_reference_person(group): # Add is_married variable for household heads based on raw person data reference_persons = person_data[mask] - receiver_data["is_married"] = reference_persons.A_MARITL.isin( - [1, 2] - ).values + receiver_data["is_married"] = reference_persons.A_MARITL.isin([1, 2]).values # Impute auto loan balance from the SCF from policyengine_us_data.datasets.scf.scf import SCF_2022 @@ -2180,9 +2133,7 @@ def determine_reference_person(group): logging.getLogger("microimpute").setLevel(getattr(logging, log_level)) qrf_model = QRF() - donor_data = donor_data.sample(frac=0.5, random_state=42).reset_index( - drop=True - ) + donor_data = donor_data.sample(frac=0.5, random_state=42).reset_index(drop=True) fitted_model = qrf_model.fit( X_train=donor_data, predictors=PREDICTORS, diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 3bf5515b3..578756203 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -44,9 +44,7 @@ def reweight( normalisation_factor = np.where( is_national, nation_normalisation_factor, state_normalisation_factor ) - normalisation_factor = torch.tensor( - normalisation_factor, dtype=torch.float32 - ) + normalisation_factor = torch.tensor(normalisation_factor, dtype=torch.float32) targets_array = torch.tensor(targets_array, dtype=torch.float32) inv_mean_normalisation = 1 / np.mean(normalisation_factor.numpy()) @@ -59,12 +57,8 @@ def loss(weights): estimate = weights @ loss_matrix if torch.isnan(estimate).any(): raise ValueError("Estimate contains NaNs") - rel_error = ( - ((estimate - targets_array) + 1) / (targets_array + 1) - ) ** 2 - rel_error_normalized = ( - inv_mean_normalisation * rel_error * normalisation_factor - ) + rel_error = (((estimate - targets_array) + 1) / (targets_array + 1)) ** 2 + rel_error_normalized = inv_mean_normalisation * rel_error * normalisation_factor if torch.isnan(rel_error_normalized).any(): raise ValueError("Relative error contains NaNs") return rel_error_normalized.mean() @@ -119,9 +113,7 @@ def loss(weights): start_loss = l.item() loss_rel_change = (l.item() - start_loss) / start_loss l.backward() - iterator.set_postfix( - {"loss": l.item(), "loss_rel_change": loss_rel_change} - ) + iterator.set_postfix({"loss": l.item(), "loss_rel_change": loss_rel_change}) optimizer.step() if log_path is not None: performance.to_csv(log_path, index=False) @@ -180,9 +172,7 @@ def generate(self): # Run the optimization procedure to get (close to) minimum loss weights for year in range(self.start_year, self.end_year + 1): - loss_matrix, targets_array = build_loss_matrix( - self.input_dataset, year - ) + loss_matrix, targets_array = build_loss_matrix(self.input_dataset, year) zero_mask = np.isclose(targets_array, 0.0, atol=0.1) bad_mask = loss_matrix.columns.isin(bad_targets) keep_mask_bool = ~(zero_mask | bad_mask) @@ -204,9 +194,7 @@ def generate(self): # Validate dense weights w = optimised_weights if np.any(np.isnan(w)): - raise ValueError( - f"Year {year}: household_weight contains NaN values" - ) + raise ValueError(f"Year {year}: household_weight contains NaN values") if np.any(w < 0): raise ValueError( f"Year {year}: household_weight contains negative values" @@ -247,12 +235,8 @@ def generate(self): 1, 0.1, len(original_weights) ) for year in [2024]: - loss_matrix, targets_array = build_loss_matrix( - self.input_dataset, year - ) - optimised_weights = reweight( - original_weights, loss_matrix, targets_array - ) + loss_matrix, targets_array = build_loss_matrix(self.input_dataset, year) + optimised_weights = reweight(original_weights, loss_matrix, targets_array) data["household_weight"] = optimised_weights self.save_dataset(data) diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/block_assignment.py b/policyengine_us_data/datasets/cps/local_area_calibration/block_assignment.py index 73b435f69..f479435b5 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/block_assignment.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/block_assignment.py @@ -127,7 +127,9 @@ def _load_cbsa_crosswalk() -> Dict[str, str]: Returns: Dict mapping 5-digit county FIPS to CBSA code (or None if not in CBSA) """ - url = "https://data.nber.org/cbsa-csa-fips-county-crosswalk/2023/cbsa2fipsxw_2023.csv" + url = ( + "https://data.nber.org/cbsa-csa-fips-county-crosswalk/2023/cbsa2fipsxw_2023.csv" + ) try: df = pd.read_csv(url, dtype=str) # Build 5-digit county FIPS from state + county codes @@ -171,8 +173,7 @@ def _load_block_crosswalk() -> pd.DataFrame: if not csv_path.exists(): print( - f"Warning: {csv_path} not found. " - "Run make_block_crosswalk.py to generate." + f"Warning: {csv_path} not found. Run make_block_crosswalk.py to generate." ) return pd.DataFrame() @@ -260,14 +261,10 @@ def get_all_geography_from_block(block_geoid: str) -> Dict[str, Optional[str]]: result = { "sldu": row["sldu"] if pd.notna(row["sldu"]) else None, "sldl": row["sldl"] if pd.notna(row["sldl"]) else None, - "place_fips": ( - row["place_fips"] if pd.notna(row["place_fips"]) else None - ), + "place_fips": (row["place_fips"] if pd.notna(row["place_fips"]) else None), "vtd": row["vtd"] if pd.notna(row["vtd"]) else None, "puma": row["puma"] if pd.notna(row["puma"]) else None, - "zcta": ( - row["zcta"] if has_zcta and pd.notna(row["zcta"]) else None - ), + "zcta": (row["zcta"] if has_zcta and pd.notna(row["zcta"]) else None), } return result return { @@ -436,17 +433,11 @@ def assign_geography_for_cd( - county_index: int32 indices into County enum (for backwards compat) """ # Assign blocks first - block_geoids = assign_blocks_for_cd( - cd_geoid, n_households, seed, distributions - ) + block_geoids = assign_blocks_for_cd(cd_geoid, n_households, seed, distributions) # Derive geography directly from block GEOID structure - county_fips = np.array( - [get_county_fips_from_block(b) for b in block_geoids] - ) - tract_geoids = np.array( - [get_tract_geoid_from_block(b) for b in block_geoids] - ) + county_fips = np.array([get_county_fips_from_block(b) for b in block_geoids]) + tract_geoids = np.array([get_tract_geoid_from_block(b) for b in block_geoids]) state_fips = np.array([get_state_fips_from_block(b) for b in block_geoids]) # CBSA lookup via county (may be None for rural areas) diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py b/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py index 97c82360d..3db2477bd 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py @@ -351,9 +351,7 @@ def create_target_groups( for domain_var, var_name in pairs: var_mask = ( - (targets_df["variable"] == var_name) - & level_mask - & ~processed_mask + (targets_df["variable"] == var_name) & level_mask & ~processed_mask ) if has_domain and domain_var is not None: var_mask &= targets_df["domain_variable"] == domain_var @@ -379,15 +377,11 @@ def create_target_groups( # Format output based on level and count if n_targets == 1: value = matching["value"].iloc[0] - info_str = ( - f"{level_name} {label} (1 target, value={value:,.0f})" - ) + info_str = f"{level_name} {label} (1 target, value={value:,.0f})" print_str = f" Group {group_id}: {label} = {value:,.0f}" else: info_str = f"{level_name} {label} ({n_targets} targets)" - print_str = ( - f" Group {group_id}: {label} ({n_targets} targets)" - ) + print_str = f" Group {group_id}: {label} ({n_targets} targets)" group_info.append(f"Group {group_id}: {info_str}") print(print_str) @@ -440,9 +434,7 @@ def drop_target_groups( drop_ids.add(gid) matched = True if not matched: - print( - f" WARNING: no match for " f"({label_substr!r}, {geo_name!r})" - ) + print(f" WARNING: no match for ({label_substr!r}, {geo_name!r})") keep_mask = ~np.isin(target_groups, list(drop_ids)) @@ -600,9 +592,7 @@ def calculate_spm_thresholds_for_cd( .reset_index() ) - tenure_types = sim.calculate( - "spm_unit_tenure_type", map_to="spm_unit" - ).values + tenure_types = sim.calculate("spm_unit_tenure_type", map_to="spm_unit").values spm_unit_ids_unit = sim.calculate("spm_unit_id", map_to="spm_unit").values tenure_df = pd.DataFrame( @@ -614,10 +604,7 @@ def calculate_spm_thresholds_for_cd( merged = agg.merge(tenure_df, on="spm_unit_id", how="left") merged["tenure_code"] = ( - merged["tenure_type"] - .map(SPM_TENURE_STRING_TO_CODE) - .fillna(3) - .astype(int) + merged["tenure_type"].map(SPM_TENURE_STRING_TO_CODE).fillna(3).astype(int) ) calc = SPMCalculator(year=year) @@ -627,9 +614,7 @@ def calculate_spm_thresholds_for_cd( thresholds = np.zeros(n, dtype=np.float32) for i in range(n): - tenure_str = TENURE_CODE_MAP.get( - int(merged.iloc[i]["tenure_code"]), "renter" - ) + tenure_str = TENURE_CODE_MAP.get(int(merged.iloc[i]["tenure_code"]), "renter") base = base_thresholds[tenure_str] equiv_scale = spm_equivalence_scale( int(merged.iloc[i]["num_adults"]), diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/county_assignment.py b/policyengine_us_data/datasets/cps/local_area_calibration/county_assignment.py index 780bc4c77..54aaaf07f 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/county_assignment.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/county_assignment.py @@ -150,9 +150,7 @@ def get_county_filter_probability( else: dist = _generate_uniform_distribution(cd_key) - return sum( - prob for county, prob in dist.items() if county in county_filter - ) + return sum(prob for county, prob in dist.items() if county in county_filter) def get_filtered_county_distribution( diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py b/policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py index 54d9a959f..2aa15a9f3 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py @@ -57,7 +57,7 @@ def create_stratified_cps_dataset( print(f"Original dataset: {n_households_orig:,} households") print(f"Target dataset: {target_households:,} households") - print(f"Reduction ratio: {target_households/n_households_orig:.1%}") + print(f"Reduction ratio: {target_households / n_households_orig:.1%}") # Show income distribution print("\nAGI Percentiles (original):") @@ -79,16 +79,14 @@ def create_stratified_cps_dataset( f" Top {100 - high_income_percentile}% (AGI >= ${high_income_threshold:,.0f}): {n_top:,}" ) print(f" Middle 25-{high_income_percentile}%: {n_middle:,}") - print( - f" Bottom 25% (AGI < ${bottom_25_pct_threshold:,.0f}): {n_bottom_25:,}" - ) + print(f" Bottom 25% (AGI < ${bottom_25_pct_threshold:,.0f}): {n_bottom_25:,}") # Calculate sampling rates # Keep ALL top earners, distribute remaining quota between middle and bottom remaining_quota = target_households - n_top if remaining_quota <= 0: raise ValueError( - f"Target ({target_households:,}) is less than top {100-high_income_percentile}% " + f"Target ({target_households:,}) is less than top {100 - high_income_percentile}% " f"count ({n_top:,}). Increase target_households." ) @@ -132,9 +130,7 @@ def create_stratified_cps_dataset( # Top earners - keep all top_mask = agi >= high_income_threshold selected_mask[top_mask] = True - print( - f" Top {100 - high_income_percentile}%: selected {np.sum(top_mask):,}" - ) + print(f" Top {100 - high_income_percentile}%: selected {np.sum(top_mask):,}") # Bottom 25% bottom_mask = agi < bottom_25_pct_threshold @@ -176,7 +172,7 @@ def create_stratified_cps_dataset( n_selected = np.sum(selected_mask) print( - f"\nTotal selected: {n_selected:,} households ({n_selected/n_households_orig:.1%} of original)" + f"\nTotal selected: {n_selected:,} households ({n_selected / n_households_orig:.1%} of original)" ) # Verify high earners are preserved @@ -271,10 +267,7 @@ def create_stratified_cps_dataset( if "person_id" in f and str(time_period) in f["person_id"]: person_ids = f["person_id"][str(time_period)][:] print(f" Final persons: {len(person_ids):,}") - if ( - "household_weight" in f - and str(time_period) in f["household_weight"] - ): + if "household_weight" in f and str(time_period) in f["household_weight"]: weights = f["household_weight"][str(time_period)][:] print(f" Final household weights sum: {np.sum(weights):,.0f}") @@ -342,7 +335,5 @@ def create_stratified_cps_dataset( ) print("\nExamples:") print(" python create_stratified_cps.py 30000") - print( - " python create_stratified_cps.py 50000 --top=99.5 --oversample-poor" - ) + print(" python create_stratified_cps.py 50000 --top=99.5 --oversample-poor") print(" python create_stratified_cps.py 30000 --seed=123 # reproducible") diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/publish_local_area.py b/policyengine_us_data/datasets/cps/local_area_calibration/publish_local_area.py index 4963f3979..e473e3653 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/publish_local_area.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/publish_local_area.py @@ -113,9 +113,9 @@ def build_state_h5( states_dir.mkdir(parents=True, exist_ok=True) output_path = states_dir / f"{state_code}.h5" - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print(f"Building {state_code} ({len(cd_subset)} CDs)") - print(f"{'='*60}") + print(f"{'=' * 60}") create_sparse_cd_stacked_dataset( weights, @@ -158,9 +158,9 @@ def build_district_h5( districts_dir.mkdir(parents=True, exist_ok=True) output_path = districts_dir / f"{friendly_name}.h5" - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print(f"Building {friendly_name}") - print(f"{'='*60}") + print(f"{'=' * 60}") create_sparse_cd_stacked_dataset( weights, @@ -208,9 +208,9 @@ def build_city_h5( cities_dir.mkdir(parents=True, exist_ok=True) output_path = cities_dir / "NYC.h5" - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print(f"Building NYC ({len(cd_subset)} CDs)") - print(f"{'='*60}") + print(f"{'=' * 60}") create_sparse_cd_stacked_dataset( weights, @@ -256,17 +256,15 @@ def build_and_upload_states( print(f"Skipping {state_code} (already completed)") continue - cd_subset = [ - cd for cd in cds_to_calibrate if int(cd) // 100 == state_fips - ] + cd_subset = [cd for cd in cds_to_calibrate if int(cd) // 100 == state_fips] if not cd_subset: print(f"No CDs found for {state_code}, skipping") continue output_path = states_dir / f"{state_code}.h5" - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print(f"Building {state_code} ({len(cd_subset)} CDs)") - print(f"{'='*60}") + print(f"{'=' * 60}") try: create_sparse_cd_stacked_dataset( @@ -288,9 +286,7 @@ def build_and_upload_states( # Flush HF queue every batch_size files if len(hf_queue) >= hf_batch_size: - print( - f"\nUploading batch of {len(hf_queue)} files to HuggingFace..." - ) + print(f"\nUploading batch of {len(hf_queue)} files to HuggingFace...") upload_local_area_batch_to_hf(hf_queue) hf_queue = [] @@ -300,9 +296,7 @@ def build_and_upload_states( # Flush remaining files to HuggingFace if hf_queue: - print( - f"\nUploading final batch of {len(hf_queue)} files to HuggingFace..." - ) + print(f"\nUploading final batch of {len(hf_queue)} files to HuggingFace...") upload_local_area_batch_to_hf(hf_queue) @@ -336,9 +330,9 @@ def build_and_upload_districts( continue output_path = districts_dir / f"{friendly_name}.h5" - print(f"\n{'='*60}") - print(f"[{i+1}/{len(cds_to_calibrate)}] Building {friendly_name}") - print(f"{'='*60}") + print(f"\n{'=' * 60}") + print(f"[{i + 1}/{len(cds_to_calibrate)}] Building {friendly_name}") + print(f"{'=' * 60}") try: create_sparse_cd_stacked_dataset( @@ -360,9 +354,7 @@ def build_and_upload_districts( # Flush HF queue every batch_size files if len(hf_queue) >= hf_batch_size: - print( - f"\nUploading batch of {len(hf_queue)} files to HuggingFace..." - ) + print(f"\nUploading batch of {len(hf_queue)} files to HuggingFace...") upload_local_area_batch_to_hf(hf_queue) hf_queue = [] @@ -372,9 +364,7 @@ def build_and_upload_districts( # Flush remaining files to HuggingFace if hf_queue: - print( - f"\nUploading final batch of {len(hf_queue)} files to HuggingFace..." - ) + print(f"\nUploading final batch of {len(hf_queue)} files to HuggingFace...") upload_local_area_batch_to_hf(hf_queue) @@ -405,9 +395,9 @@ def build_and_upload_cities( print("No NYC-related CDs found, skipping") else: output_path = cities_dir / "NYC.h5" - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print(f"Building NYC ({len(cd_subset)} CDs)") - print(f"{'='*60}") + print(f"{'=' * 60}") try: create_sparse_cd_stacked_dataset( @@ -420,9 +410,7 @@ def build_and_upload_cities( ) print("Uploading NYC.h5 to GCP...") - upload_local_area_file( - str(output_path), "cities", skip_hf=True - ) + upload_local_area_file(str(output_path), "cities", skip_hf=True) # Queue for batched HuggingFace upload hf_queue.append((str(output_path), "cities")) @@ -436,9 +424,7 @@ def build_and_upload_cities( # Flush remaining files to HuggingFace if hf_queue: - print( - f"\nUploading batch of {len(hf_queue)} city files to HuggingFace..." - ) + print(f"\nUploading batch of {len(hf_queue)} city files to HuggingFace...") upload_local_area_batch_to_hf(hf_queue) diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py b/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py index 010e151f3..6991b8d98 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py @@ -107,9 +107,7 @@ def create_sparse_cd_stacked_dataset( # Process all CDs cd_indices = list(range(len(cds_to_calibrate))) cds_to_process = cds_to_calibrate - print( - f"Processing all {len(cds_to_calibrate)} congressional districts" - ) + print(f"Processing all {len(cds_to_calibrate)} congressional districts") # Generate output path if not provided if output_path is None: @@ -125,9 +123,7 @@ def create_sparse_cd_stacked_dataset( # Load the original simulation base_sim = Microsimulation(dataset=dataset_path) - household_ids = base_sim.calculate( - "household_id", map_to="household" - ).values + household_ids = base_sim.calculate("household_id", map_to="household").values n_households_orig = len(household_ids) # From the base sim, create mapping from household ID to index for proper filtering @@ -155,9 +151,7 @@ def create_sparse_cd_stacked_dataset( # Extract only the CDs we want to process if cd_subset is not None: W = W_full[cd_indices, :] - print( - f"Extracted weights for {len(cd_indices)} CDs from full weight matrix" - ) + print(f"Extracted weights for {len(cd_indices)} CDs from full weight matrix") else: W = W_full @@ -177,9 +171,7 @@ def create_sparse_cd_stacked_dataset( for idx, cd_geoid in enumerate(cds_to_process): # Progress every 10 CDs and at the end ---- if (idx + 1) % 10 == 0 or (idx + 1) == len(cds_to_process): - print( - f"Processing CD {cd_geoid} ({idx + 1}/{len(cds_to_process)})..." - ) + print(f"Processing CD {cd_geoid} ({idx + 1}/{len(cds_to_process)})...") # Get the correct index in the weight matrix cd_idx = idx # Index in our filtered W matrix @@ -231,21 +223,13 @@ def create_sparse_cd_stacked_dataset( entity_rel = pd.DataFrame( { - "person_id": cd_sim.calculate( - "person_id", map_to="person" - ).values, + "person_id": cd_sim.calculate("person_id", map_to="person").values, "household_id": cd_sim.calculate( "household_id", map_to="person" ).values, - "tax_unit_id": cd_sim.calculate( - "tax_unit_id", map_to="person" - ).values, - "spm_unit_id": cd_sim.calculate( - "spm_unit_id", map_to="person" - ).values, - "family_id": cd_sim.calculate( - "family_id", map_to="person" - ).values, + "tax_unit_id": cd_sim.calculate("tax_unit_id", map_to="person").values, + "spm_unit_id": cd_sim.calculate("spm_unit_id", map_to="person").values, + "family_id": cd_sim.calculate("family_id", map_to="person").values, "marital_unit_id": cd_sim.calculate( "marital_unit_id", map_to="person" ).values, @@ -264,9 +248,7 @@ def create_sparse_cd_stacked_dataset( .reset_index(name="persons_per_hh") ) hh_df = hh_df.merge(counts) - hh_df["per_person_hh_weight"] = ( - hh_df.household_weight / hh_df.persons_per_hh - ) + hh_df["per_person_hh_weight"] = hh_df.household_weight / hh_df.persons_per_hh # SET WEIGHTS IN SIMULATION BEFORE EXTRACTING DATAFRAME # This is the key - set_input updates the simulation's internal state @@ -300,12 +282,8 @@ def create_sparse_cd_stacked_dataset( ) new_weights_per_id[col] = hh_info2.id_weight - cd_sim.set_input( - "household_weight", time_period, hh_df.household_weight.values - ) - cd_sim.set_input( - "person_weight", time_period, new_weights_per_id["person_id"] - ) + cd_sim.set_input("household_weight", time_period, hh_df.household_weight.values) + cd_sim.set_input("person_weight", time_period, new_weights_per_id["person_id"]) cd_sim.set_input( "tax_unit_weight", time_period, new_weights_per_id["tax_unit_id"] ) @@ -317,9 +295,7 @@ def create_sparse_cd_stacked_dataset( time_period, new_weights_per_id["marital_unit_id"], ) - cd_sim.set_input( - "family_weight", time_period, new_weights_per_id["family_id"] - ) + cd_sim.set_input("family_weight", time_period, new_weights_per_id["family_id"]) # Extract state from CD GEOID and update simulation BEFORE calling to_input_dataframe() # This ensures calculated variables (SNAP, Medicaid) use the correct state @@ -340,9 +316,7 @@ def create_sparse_cd_stacked_dataset( # Assign all geography using census block assignment # For city datasets: use only blocks in target counties if county_filter is not None: - filtered_dist = get_filtered_block_distribution( - cd_geoid, county_filter - ) + filtered_dist = get_filtered_block_distribution(cd_geoid, county_filter) if not filtered_dist: # Should not happen if we already checked p_target > 0 continue @@ -380,9 +354,7 @@ def create_sparse_cd_stacked_dataset( new_spm_thresholds = calculate_spm_thresholds_for_cd( cd_sim, time_period, geoadj, year=time_period ) - cd_sim.set_input( - "spm_unit_spm_threshold", time_period, new_spm_thresholds - ) + cd_sim.set_input("spm_unit_spm_threshold", time_period, new_spm_thresholds) # Delete cached calculated variables to ensure they're recalculated # with new state and county. Exclude 'county' itself since we just set it. @@ -460,9 +432,7 @@ def create_sparse_cd_stacked_dataset( # Group by household ID AND congressional district to create unique household-CD pairs hh_groups = ( - combined_df.groupby([hh_id_col, cd_geoid_col])["_row_idx"] - .apply(list) - .to_dict() + combined_df.groupby([hh_id_col, cd_geoid_col])["_row_idx"].apply(list).to_dict() ) # Assign new household IDs using 25k ranges per CD @@ -484,9 +454,7 @@ def create_sparse_cd_stacked_dataset( # Check we haven't exceeded the range if new_hh_id > end_id: - raise ValueError( - f"CD {cd_str} exceeded its 25k household allocation" - ) + raise ValueError(f"CD {cd_str} exceeded its 25k household allocation") # All rows in the same household-CD pair get the SAME new ID for row_idx in row_indices: @@ -546,9 +514,7 @@ def create_sparse_cd_stacked_dataset( ) # Create sequential IDs for this CD - new_person_ids = np.arange( - start_id, start_id + n_persons_in_cd, dtype=np.int32 - ) + new_person_ids = np.arange(start_id, start_id + n_persons_in_cd, dtype=np.int32) # Assign all at once using loc combined_df.loc[cd_mask, person_id_col] = new_person_ids @@ -566,9 +532,7 @@ def create_sparse_cd_stacked_dataset( for entity_name, person_col, entity_col in entity_configs: print(f" Reindexing {entity_name}...") # Group by (household_id, original_entity_id) and assign unique group numbers - new_ids = combined_df.groupby( - [hh_id_col, person_col], sort=False - ).ngroup() + new_ids = combined_df.groupby([hh_id_col, person_col], sort=False).ngroup() combined_df[person_col] = new_ids if entity_col in combined_df.columns: combined_df[entity_col] = new_ids @@ -581,17 +545,13 @@ def create_sparse_cd_stacked_dataset( print(f" Final households: {total_households:,}") print(f" Final tax units: {combined_df[person_tax_unit_col].nunique():,}") print(f" Final SPM units: {combined_df[person_spm_unit_col].nunique():,}") - print( - f" Final marital units: {combined_df[person_marital_unit_col].nunique():,}" - ) + print(f" Final marital units: {combined_df[person_marital_unit_col].nunique():,}") print(f" Final families: {combined_df[person_family_col].nunique():,}") # Check weights in combined_df AFTER reindexing print(f"\nWeights in combined_df AFTER reindexing:") - print(f" HH weight sum: {combined_df[hh_weight_col].sum()/1e6:.2f}M") - print( - f" Person weight sum: {combined_df[person_weight_col].sum()/1e6:.2f}M" - ) + print(f" HH weight sum: {combined_df[hh_weight_col].sum() / 1e6:.2f}M") + print(f" Person weight sum: {combined_df[person_weight_col].sum() / 1e6:.2f}M") print( f" Ratio: {combined_df[person_weight_col].sum() / combined_df[hh_weight_col].sum():.2f}" ) @@ -662,9 +622,7 @@ def create_sparse_cd_stacked_dataset( # Handle different value types if ( - sparse_sim.tax_benefit_system.variables.get( - variable - ).value_type + sparse_sim.tax_benefit_system.variables.get(variable).value_type in (Enum, str) and variable != "county_fips" ): @@ -701,9 +659,7 @@ def create_sparse_cd_stacked_dataset( # Save household mapping to CSV in a mappings subdirectory mapping_df = pd.DataFrame(household_mapping) output_dir = os.path.dirname(output_path) - mappings_dir = ( - os.path.join(output_dir, "mappings") if output_dir else "mappings" - ) + mappings_dir = os.path.join(output_dir, "mappings") if output_dir else "mappings" os.makedirs(mappings_dir, exist_ok=True) csv_filename = os.path.basename(output_path).replace( ".h5", "_household_mapping.csv" @@ -721,10 +677,7 @@ def create_sparse_cd_stacked_dataset( if "person_id" in f and str(time_period) in f["person_id"]: person_ids = f["person_id"][str(time_period)][:] print(f" Final persons: {len(person_ids):,}") - if ( - "household_weight" in f - and str(time_period) in f["household_weight"] - ): + if "household_weight" in f and str(time_period) in f["household_weight"]: weights = f["household_weight"][str(time_period)][:] print( f" Total population (from household weights): {np.sum(weights):,.0f}" @@ -744,20 +697,14 @@ def create_sparse_cd_stacked_dataset( if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser( - description="Create sparse CD-stacked datasets" - ) - parser.add_argument( - "--weights-path", required=True, help="Path to w_cd.npy file" - ) + parser = argparse.ArgumentParser(description="Create sparse CD-stacked datasets") + parser.add_argument("--weights-path", required=True, help="Path to w_cd.npy file") parser.add_argument( "--dataset-path", required=True, help="Path to stratified dataset .h5 file", ) - parser.add_argument( - "--db-path", required=True, help="Path to policy_data.db" - ) + parser.add_argument("--db-path", required=True, help="Path to policy_data.db") parser.add_argument( "--output-dir", default="./temp", @@ -826,9 +773,7 @@ def create_sparse_cd_stacked_dataset( elif mode == "states": for state_fips, state_code in STATE_CODES.items(): - cd_subset = [ - cd for cd in cds_to_calibrate if int(cd) // 100 == state_fips - ] + cd_subset = [cd for cd in cds_to_calibrate if int(cd) // 100 == state_fips] if not cd_subset: continue output_path = f"{output_dir}/{state_code}.h5" @@ -852,7 +797,7 @@ def create_sparse_cd_stacked_dataset( output_path = f"{output_dir}/{friendly_name}.h5" print( - f"\n[{i+1}/{len(cds_to_calibrate)}] Creating {friendly_name}.h5 (GEOID {cd_geoid})" + f"\n[{i + 1}/{len(cds_to_calibrate)}] Creating {friendly_name}.h5 (GEOID {cd_geoid})" ) create_sparse_cd_stacked_dataset( w, @@ -890,9 +835,7 @@ def create_sparse_cd_stacked_dataset( if state_fips is None: raise ValueError(f"Unknown state code: {args.state}") - cd_subset = [ - cd for cd in cds_to_calibrate if int(cd) // 100 == state_fips - ] + cd_subset = [cd for cd in cds_to_calibrate if int(cd) // 100 == state_fips] if not cd_subset: raise ValueError(f"No CDs found for state {state_code_upper}") @@ -914,9 +857,7 @@ def create_sparse_cd_stacked_dataset( raise ValueError("No NYC-related CDs found in calibrated CDs list") output_path = f"{output_dir}/NYC.h5" - print( - f"\nCreating NYC dataset with {len(cd_subset)} CDs: {output_path}" - ) + print(f"\nCreating NYC dataset with {len(cd_subset)} CDs: {output_path}") print(f" CDs: {', '.join(cd_subset)}") print(" Filtering to NYC counties only") diff --git a/policyengine_us_data/datasets/cps/long_term/check_calibrated_estimates_interactive.py b/policyengine_us_data/datasets/cps/long_term/check_calibrated_estimates_interactive.py index af0414841..5fe3e599e 100644 --- a/policyengine_us_data/datasets/cps/long_term/check_calibrated_estimates_interactive.py +++ b/policyengine_us_data/datasets/cps/long_term/check_calibrated_estimates_interactive.py @@ -24,8 +24,7 @@ ## Taxable Payroll for Social Security taxible_estimate_b = ( sim.calculate("taxable_earnings_for_social_security").sum() / 1e9 - + sim.calculate("social_security_taxable_self_employment_income").sum() - / 1e9 + + sim.calculate("social_security_taxable_self_employment_income").sum() / 1e9 ) ### Trustees SingleYearTRTables_TR2025.xlsx, Tab VI.G6 (nominal dollars in billions) @@ -66,8 +65,7 @@ ## Taxable Payroll for Social Security taxible_estimate_b = ( sim.calculate("taxable_earnings_for_social_security").sum() / 1e9 - + sim.calculate("social_security_taxable_self_employment_income").sum() - / 1e9 + + sim.calculate("social_security_taxable_self_employment_income").sum() / 1e9 ) ### Trustees SingleYearTRTables_TR2025.xlsx, Tab VI.G6 (nominal dollars in billions) @@ -175,9 +173,9 @@ def create_h6_reform(): # The swapped rate error is 14x smaller and aligns with tax-cutting intent. # Tier 1 (Base): HI ONLY (35%) - reform_payload[ - "gov.irs.social_security.taxability.rate.base.benefit_cap" - ][period] = 0.35 + reform_payload["gov.irs.social_security.taxability.rate.base.benefit_cap"][ + period + ] = 0.35 reform_payload["gov.irs.social_security.taxability.rate.base.excess"][ period ] = 0.35 @@ -186,25 +184,25 @@ def create_h6_reform(): reform_payload[ "gov.irs.social_security.taxability.rate.additional.benefit_cap" ][period] = 0.85 - reform_payload[ - "gov.irs.social_security.taxability.rate.additional.excess" - ][period] = 0.85 + reform_payload["gov.irs.social_security.taxability.rate.additional.excess"][ + period + ] = 0.85 # --- SET THRESHOLDS (MIN/MAX SWAP) --- # Always put the smaller number in 'base' and larger in 'adjusted_base' # Single - reform_payload[ - "gov.irs.social_security.taxability.threshold.base.main.SINGLE" - ][period] = min(oasdi_target_single, HI_SINGLE) + reform_payload["gov.irs.social_security.taxability.threshold.base.main.SINGLE"][ + period + ] = min(oasdi_target_single, HI_SINGLE) reform_payload[ "gov.irs.social_security.taxability.threshold.adjusted_base.main.SINGLE" ][period] = max(oasdi_target_single, HI_SINGLE) # Joint - reform_payload[ - "gov.irs.social_security.taxability.threshold.base.main.JOINT" - ][period] = min(oasdi_target_joint, HI_JOINT) + reform_payload["gov.irs.social_security.taxability.threshold.base.main.JOINT"][ + period + ] = min(oasdi_target_joint, HI_JOINT) reform_payload[ "gov.irs.social_security.taxability.threshold.adjusted_base.main.JOINT" ][period] = max(oasdi_target_joint, HI_JOINT) @@ -228,12 +226,12 @@ def create_h6_reform(): # 1. Set Thresholds to "HI Only" mode # Base = $34k / $44k - reform_payload[ - "gov.irs.social_security.taxability.threshold.base.main.SINGLE" - ][elim_period] = HI_SINGLE - reform_payload[ - "gov.irs.social_security.taxability.threshold.base.main.JOINT" - ][elim_period] = HI_JOINT + reform_payload["gov.irs.social_security.taxability.threshold.base.main.SINGLE"][ + elim_period + ] = HI_SINGLE + reform_payload["gov.irs.social_security.taxability.threshold.base.main.JOINT"][ + elim_period + ] = HI_JOINT # Adjusted = Infinity (Disable the second tier effectively) reform_payload[ @@ -262,12 +260,12 @@ def create_h6_reform(): ] = 0.35 # Tier 2 (Disabled via threshold, but zero out for safety) - reform_payload[ - "gov.irs.social_security.taxability.rate.additional.benefit_cap" - ][elim_period] = 0.35 - reform_payload[ - "gov.irs.social_security.taxability.rate.additional.excess" - ][elim_period] = 0.35 + reform_payload["gov.irs.social_security.taxability.rate.additional.benefit_cap"][ + elim_period + ] = 0.35 + reform_payload["gov.irs.social_security.taxability.rate.additional.excess"][ + elim_period + ] = 0.35 return reform_payload @@ -295,26 +293,20 @@ def create_h6_reform(): # Calculate impact revenue_impact = reform_revenue - baseline_revenue -print(f"revenue_impact (B): {revenue_impact / 1E9:.2f}") +print(f"revenue_impact (B): {revenue_impact / 1e9:.2f}") # Calculate taxable payroll -taxable_ss_earnings = baseline.calculate( - "taxable_earnings_for_social_security" -) +taxable_ss_earnings = baseline.calculate("taxable_earnings_for_social_security") taxable_self_employment = baseline.calculate( "social_security_taxable_self_employment_income" ) -total_taxable_payroll = ( - taxable_ss_earnings.sum() + taxable_self_employment.sum() -) +total_taxable_payroll = taxable_ss_earnings.sum() + taxable_self_employment.sum() # Calculate SS benefits ss_benefits = baseline.calculate("social_security") total_ss_benefits = ss_benefits.sum() -est_rev_as_pct_of_taxable_payroll = ( - 100 * revenue_impact / total_taxable_payroll -) +est_rev_as_pct_of_taxable_payroll = 100 * revenue_impact / total_taxable_payroll # From https://www.ssa.gov/oact/solvency/provisions/tables/table_run133.html: target_rev_as_pct_of_taxable_payroll = -1.12 diff --git a/policyengine_us_data/datasets/cps/long_term/extract_ssa_costs.py b/policyengine_us_data/datasets/cps/long_term/extract_ssa_costs.py index 5ada2db9a..492a9d69f 100644 --- a/policyengine_us_data/datasets/cps/long_term/extract_ssa_costs.py +++ b/policyengine_us_data/datasets/cps/long_term/extract_ssa_costs.py @@ -2,9 +2,7 @@ import numpy as np # Read the file -df = pd.read_excel( - "SingleYearTRTables_TR2025.xlsx", sheet_name="VI.G9", header=None -) +df = pd.read_excel("SingleYearTRTables_TR2025.xlsx", sheet_name="VI.G9", header=None) print("DataFrame shape:", df.shape) print("\nChecking data types around row 66-70:") diff --git a/policyengine_us_data/datasets/cps/long_term/projection_utils.py b/policyengine_us_data/datasets/cps/long_term/projection_utils.py index d0af8533e..8aee4f3b7 100644 --- a/policyengine_us_data/datasets/cps/long_term/projection_utils.py +++ b/policyengine_us_data/datasets/cps/long_term/projection_utils.py @@ -27,9 +27,7 @@ def build_household_age_matrix(sim, n_ages=86): n_households = len(household_ids_unique) X = np.zeros((n_households, n_ages)) - hh_id_to_idx = { - hh_id: idx for idx, hh_id in enumerate(household_ids_unique) - } + hh_id_to_idx = {hh_id: idx for idx, hh_id in enumerate(household_ids_unique)} for person_idx in range(len(age_person)): age = int(age_person.values[person_idx]) @@ -67,9 +65,7 @@ def get_pseudo_input_variables(sim): return pseudo_inputs -def create_household_year_h5( - year, household_weights, base_dataset_path, output_dir -): +def create_household_year_h5(year, household_weights, base_dataset_path, output_dir): """ Create a year-specific .h5 file with calibrated household weights. @@ -193,9 +189,7 @@ def calculate_year_statistics( Returns: Dictionary with year statistics and calibrated weights """ - income_tax_hh = sim.calculate( - "income_tax", period=year, map_to="household" - ) + income_tax_hh = sim.calculate("income_tax", period=year, map_to="household") income_tax_baseline_total = income_tax_hh.sum() income_tax_values = income_tax_hh.values @@ -206,9 +200,7 @@ def calculate_year_statistics( ss_values = None ss_target = None if use_ss: - ss_hh = sim.calculate( - "social_security", period=year, map_to="household" - ) + ss_hh = sim.calculate("social_security", period=year, map_to="household") ss_baseline_total = ss_hh.sum() ss_values = ss_hh.values diff --git a/policyengine_us_data/datasets/cps/long_term/run_household_projection.py b/policyengine_us_data/datasets/cps/long_term/run_household_projection.py index 651f7b504..1413efe4b 100644 --- a/policyengine_us_data/datasets/cps/long_term/run_household_projection.py +++ b/policyengine_us_data/datasets/cps/long_term/run_household_projection.py @@ -105,9 +105,9 @@ def create_h6_reform(): # The swapped rate error is 14x smaller and aligns with tax-cutting intent. # Tier 1 (Base): HI ONLY (35%) - reform_payload[ - "gov.irs.social_security.taxability.rate.base.benefit_cap" - ][period] = 0.35 + reform_payload["gov.irs.social_security.taxability.rate.base.benefit_cap"][ + period + ] = 0.35 reform_payload["gov.irs.social_security.taxability.rate.base.excess"][ period ] = 0.35 @@ -116,25 +116,25 @@ def create_h6_reform(): reform_payload[ "gov.irs.social_security.taxability.rate.additional.benefit_cap" ][period] = 0.85 - reform_payload[ - "gov.irs.social_security.taxability.rate.additional.excess" - ][period] = 0.85 + reform_payload["gov.irs.social_security.taxability.rate.additional.excess"][ + period + ] = 0.85 # --- SET THRESHOLDS (MIN/MAX SWAP) --- # Always put the smaller number in 'base' and larger in 'adjusted_base' # Single - reform_payload[ - "gov.irs.social_security.taxability.threshold.base.main.SINGLE" - ][period] = min(oasdi_target_single, HI_SINGLE) + reform_payload["gov.irs.social_security.taxability.threshold.base.main.SINGLE"][ + period + ] = min(oasdi_target_single, HI_SINGLE) reform_payload[ "gov.irs.social_security.taxability.threshold.adjusted_base.main.SINGLE" ][period] = max(oasdi_target_single, HI_SINGLE) # Joint - reform_payload[ - "gov.irs.social_security.taxability.threshold.base.main.JOINT" - ][period] = min(oasdi_target_joint, HI_JOINT) + reform_payload["gov.irs.social_security.taxability.threshold.base.main.JOINT"][ + period + ] = min(oasdi_target_joint, HI_JOINT) reform_payload[ "gov.irs.social_security.taxability.threshold.adjusted_base.main.JOINT" ][period] = max(oasdi_target_joint, HI_JOINT) @@ -158,12 +158,12 @@ def create_h6_reform(): # 1. Set Thresholds to "HI Only" mode # Base = $34k / $44k - reform_payload[ - "gov.irs.social_security.taxability.threshold.base.main.SINGLE" - ][elim_period] = HI_SINGLE - reform_payload[ - "gov.irs.social_security.taxability.threshold.base.main.JOINT" - ][elim_period] = HI_JOINT + reform_payload["gov.irs.social_security.taxability.threshold.base.main.SINGLE"][ + elim_period + ] = HI_SINGLE + reform_payload["gov.irs.social_security.taxability.threshold.base.main.JOINT"][ + elim_period + ] = HI_JOINT # Adjusted = Infinity (Disable the second tier effectively) reform_payload[ @@ -192,12 +192,12 @@ def create_h6_reform(): ] = 0.35 # Tier 2 (Disabled via threshold, but zero out for safety) - reform_payload[ - "gov.irs.social_security.taxability.rate.additional.benefit_cap" - ][elim_period] = 0.35 - reform_payload[ - "gov.irs.social_security.taxability.rate.additional.excess" - ][elim_period] = 0.35 + reform_payload["gov.irs.social_security.taxability.rate.additional.benefit_cap"][ + elim_period + ] = 0.35 + reform_payload["gov.irs.social_security.taxability.rate.additional.excess"][ + elim_period + ] = 0.35 # Create the Reform Object from policyengine_core.reforms import Reform @@ -242,18 +242,14 @@ def create_h6_reform(): if USE_PAYROLL: sys.argv.remove("--use-payroll") if not USE_GREG: - print( - "Warning: --use-payroll requires --greg, enabling GREG automatically" - ) + print("Warning: --use-payroll requires --greg, enabling GREG automatically") USE_GREG = True USE_H6_REFORM = "--use-h6-reform" in sys.argv if USE_H6_REFORM: sys.argv.remove("--use-h6-reform") if not USE_GREG: - print( - "Warning: --use-h6-reform requires --greg, enabling GREG automatically" - ) + print("Warning: --use-h6-reform requires --greg, enabling GREG automatically") USE_GREG = True from ssa_data import load_h6_income_rate_change @@ -261,9 +257,7 @@ def create_h6_reform(): if USE_TOB: sys.argv.remove("--use-tob") if not USE_GREG: - print( - "Warning: --use-tob requires --greg, enabling GREG automatically" - ) + print("Warning: --use-tob requires --greg, enabling GREG automatically") USE_GREG = True from ssa_data import load_oasdi_tob_projections, load_hi_tob_projections @@ -320,9 +314,7 @@ def create_h6_reform(): print("STEP 1: DEMOGRAPHIC PROJECTIONS") print("=" * 70) -target_matrix = load_ssa_age_projections( - start_year=START_YEAR, end_year=END_YEAR -) +target_matrix = load_ssa_age_projections(start_year=START_YEAR, end_year=END_YEAR) n_years = target_matrix.shape[1] n_ages = target_matrix.shape[0] @@ -341,7 +333,7 @@ def create_h6_reform(): idx = y - START_YEAR if idx < n_years: pop = target_matrix[:, idx].sum() - print(f" {y}: {pop/1e6:6.1f}M") + print(f" {y}: {pop / 1e6:6.1f}M") # ========================================================================= # STEP 2: BUILD HOUSEHOLD AGE MATRIX @@ -390,9 +382,7 @@ def create_h6_reform(): sim = Microsimulation(dataset=BASE_DATASET_PATH) - income_tax_hh = sim.calculate( - "income_tax", period=year, map_to="household" - ) + income_tax_hh = sim.calculate("income_tax", period=year, map_to="household") income_tax_baseline_total = income_tax_hh.sum() income_tax_values = income_tax_hh.values @@ -405,15 +395,13 @@ def create_h6_reform(): ss_values = None ss_target = None if USE_SS: - ss_hh = sim.calculate( - "social_security", period=year, map_to="household" - ) + ss_hh = sim.calculate("social_security", period=year, map_to="household") ss_values = ss_hh.values ss_target = load_ssa_benefit_projections(year) if year in display_years: ss_baseline = np.sum(ss_values * baseline_weights) print( - f" [DEBUG {year}] SS baseline: ${ss_baseline/1e9:.1f}B, target: ${ss_target/1e9:.1f}B" + f" [DEBUG {year}] SS baseline: ${ss_baseline / 1e9:.1f}B, target: ${ss_target / 1e9:.1f}B" ) payroll_values = None @@ -435,7 +423,7 @@ def create_h6_reform(): if year in display_years: payroll_baseline = np.sum(payroll_values * baseline_weights) print( - f" [DEBUG {year}] Payroll baseline: ${payroll_baseline/1e9:.1f}B, target: ${payroll_target/1e9:.1f}B" + f" [DEBUG {year}] Payroll baseline: ${payroll_baseline / 1e9:.1f}B, target: ${payroll_target / 1e9:.1f}B" ) h6_income_values = None @@ -452,9 +440,7 @@ def create_h6_reform(): else: # Create and apply H6 reform h6_reform = create_h6_reform() - reform_sim = Microsimulation( - dataset=BASE_DATASET_PATH, reform=h6_reform - ) + reform_sim = Microsimulation(dataset=BASE_DATASET_PATH, reform=h6_reform) # Calculate reform income tax income_tax_reform_hh = reform_sim.calculate( @@ -472,14 +458,12 @@ def create_h6_reform(): # Debug output for key years if year in display_years: - h6_impact_baseline = np.sum( - h6_income_values * baseline_weights - ) + h6_impact_baseline = np.sum(h6_income_values * baseline_weights) print( - f" [DEBUG {year}] H6 baseline revenue: ${h6_impact_baseline/1e9:.3f}B, target: ${h6_revenue_target/1e9:.3f}B" + f" [DEBUG {year}] H6 baseline revenue: ${h6_impact_baseline / 1e9:.3f}B, target: ${h6_revenue_target / 1e9:.3f}B" ) print( - f" [DEBUG {year}] H6 target ratio: {h6_target_ratio:.4f} × payroll ${payroll_target_year/1e9:.1f}B" + f" [DEBUG {year}] H6 target ratio: {h6_target_ratio:.4f} × payroll ${payroll_target_year / 1e9:.1f}B" ) del reform_sim @@ -506,10 +490,10 @@ def create_h6_reform(): oasdi_baseline = np.sum(oasdi_tob_values * baseline_weights) hi_baseline = np.sum(hi_tob_values * baseline_weights) print( - f" [DEBUG {year}] OASDI TOB baseline: ${oasdi_baseline/1e9:.1f}B, target: ${oasdi_tob_target/1e9:.1f}B" + f" [DEBUG {year}] OASDI TOB baseline: ${oasdi_baseline / 1e9:.1f}B, target: ${oasdi_tob_target / 1e9:.1f}B" ) print( - f" [DEBUG {year}] HI TOB baseline: ${hi_baseline/1e9:.1f}B, target: ${hi_tob_target/1e9:.1f}B" + f" [DEBUG {year}] HI TOB baseline: ${hi_baseline / 1e9:.1f}B, target: ${hi_tob_target / 1e9:.1f}B" ) y_target = target_matrix[:, year_idx] @@ -547,43 +531,37 @@ def create_h6_reform(): f"largest: {max_neg:,.0f}" ) else: - print( - f" [DEBUG {year}] Negative weights: 0 (all weights non-negative)" - ) + print(f" [DEBUG {year}] Negative weights: 0 (all weights non-negative)") - if year in display_years and ( - USE_SS or USE_PAYROLL or USE_H6_REFORM or USE_TOB - ): + if year in display_years and (USE_SS or USE_PAYROLL or USE_H6_REFORM or USE_TOB): if USE_SS: ss_achieved = np.sum(ss_values * w_new) print( - f" [DEBUG {year}] SS achieved: ${ss_achieved/1e9:.1f}B (error: ${abs(ss_achieved - ss_target)/1e6:.1f}M, {(ss_achieved - ss_target)/ss_target*100:.3f}%)" + f" [DEBUG {year}] SS achieved: ${ss_achieved / 1e9:.1f}B (error: ${abs(ss_achieved - ss_target) / 1e6:.1f}M, {(ss_achieved - ss_target) / ss_target * 100:.3f}%)" ) if USE_PAYROLL: payroll_achieved = np.sum(payroll_values * w_new) print( - f" [DEBUG {year}] Payroll achieved: ${payroll_achieved/1e9:.1f}B (error: ${abs(payroll_achieved - payroll_target)/1e6:.1f}M, {(payroll_achieved - payroll_target)/payroll_target*100:.3f}%)" + f" [DEBUG {year}] Payroll achieved: ${payroll_achieved / 1e9:.1f}B (error: ${abs(payroll_achieved - payroll_target) / 1e6:.1f}M, {(payroll_achieved - payroll_target) / payroll_target * 100:.3f}%)" ) if USE_H6_REFORM and h6_revenue_target is not None: h6_revenue_achieved = np.sum(h6_income_values * w_new) error_pct = ( - (h6_revenue_achieved - h6_revenue_target) - / abs(h6_revenue_target) - * 100 + (h6_revenue_achieved - h6_revenue_target) / abs(h6_revenue_target) * 100 if h6_revenue_target != 0 else 0 ) print( - f" [DEBUG {year}] H6 achieved revenue: ${h6_revenue_achieved/1e9:.3f}B (error: ${abs(h6_revenue_achieved - h6_revenue_target)/1e6:.1f}M, {error_pct:.3f}%)" + f" [DEBUG {year}] H6 achieved revenue: ${h6_revenue_achieved / 1e9:.3f}B (error: ${abs(h6_revenue_achieved - h6_revenue_target) / 1e6:.1f}M, {error_pct:.3f}%)" ) if USE_TOB: oasdi_achieved = np.sum(oasdi_tob_values * w_new) hi_achieved = np.sum(hi_tob_values * w_new) print( - f" [DEBUG {year}] OASDI TOB achieved: ${oasdi_achieved/1e9:.1f}B (error: ${abs(oasdi_achieved - oasdi_tob_target)/1e6:.1f}M, {(oasdi_achieved - oasdi_tob_target)/oasdi_tob_target*100:.3f}%)" + f" [DEBUG {year}] OASDI TOB achieved: ${oasdi_achieved / 1e9:.1f}B (error: ${abs(oasdi_achieved - oasdi_tob_target) / 1e6:.1f}M, {(oasdi_achieved - oasdi_tob_target) / oasdi_tob_target * 100:.3f}%)" ) print( - f" [DEBUG {year}] HI TOB achieved: ${hi_achieved/1e9:.1f}B (error: ${abs(hi_achieved - hi_tob_target)/1e6:.1f}M, {(hi_achieved - hi_tob_target)/hi_tob_target*100:.3f}%)" + f" [DEBUG {year}] HI TOB achieved: ${hi_achieved / 1e9:.1f}B (error: ${abs(hi_achieved - hi_tob_target) / 1e6:.1f}M, {(hi_achieved - hi_tob_target) / hi_tob_target * 100:.3f}%)" ) weights_matrix[:, year_idx] = w_new @@ -593,9 +571,7 @@ def create_h6_reform(): total_population[year_idx] = np.sum(y_target) if SAVE_H5: - h5_path = create_household_year_h5( - year, w_new, BASE_DATASET_PATH, OUTPUT_DIR - ) + h5_path = create_household_year_h5(year, w_new, BASE_DATASET_PATH, OUTPUT_DIR) if year in display_years: print(f" Saved {year}.h5") @@ -613,5 +589,5 @@ def create_h6_reform(): ) elif year_idx % 5 == 0: print( - f"{year} Processing... ({year_idx+1}/{n_years}) {mem_gb:.2f}GB" + f"{year} Processing... ({year_idx + 1}/{n_years}) {mem_gb:.2f}GB" ) diff --git a/policyengine_us_data/datasets/cps/small_enhanced_cps.py b/policyengine_us_data/datasets/cps/small_enhanced_cps.py index c84181eae..a15080321 100644 --- a/policyengine_us_data/datasets/cps/small_enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/small_enhanced_cps.py @@ -22,8 +22,7 @@ def create_small_ecps(): weights = simulation.calculate("household_weight").values if np.all(weights == 0): raise ValueError( - "create_small_ecps: all household weights are zero " - "after subsample" + "create_small_ecps: all household weights are zero after subsample" ) logging.info( f"create_small_ecps: subsample has " @@ -36,9 +35,10 @@ def create_small_ecps(): data[variable] = {} for time_period in simulation.get_holder(variable).get_known_periods(): values = simulation.get_holder(variable).get_array(time_period) - if simulation.tax_benefit_system.variables.get( - variable - ).value_type in (Enum, str): + if simulation.tax_benefit_system.variables.get(variable).value_type in ( + Enum, + str, + ): if hasattr(values, "decode_to_str"): values = values.decode_to_str().astype("S") else: @@ -95,8 +95,7 @@ def create_sparse_ecps(): f"non-zero weight (expected > 1000)" ) logging.info( - f"create_sparse_ecps: {len(h_ids)} households after " - f"zero-weight filtering" + f"create_sparse_ecps: {len(h_ids)} households after zero-weight filtering" ) subset_df = df[df[df_household_id_column].isin(h_ids)].copy() @@ -113,8 +112,7 @@ def create_sparse_ecps(): for time_period in sim.get_holder(variable).get_known_periods(): values = sim.get_holder(variable).get_array(time_period) if ( - sim.tax_benefit_system.variables.get(variable).value_type - in (Enum, str) + sim.tax_benefit_system.variables.get(variable).value_type in (Enum, str) and variable != "county_fips" ): values = values.decode_to_str().astype("S") @@ -137,9 +135,7 @@ def create_sparse_ecps(): ] missing = [v for v in critical_vars if v not in data] if missing: - raise ValueError( - f"create_sparse_ecps: missing critical variables: {missing}" - ) + raise ValueError(f"create_sparse_ecps: missing critical variables: {missing}") logging.info(f"create_sparse_ecps: data dict has {len(data)} variables") output_path = STORAGE_FOLDER / "sparse_enhanced_cps_2024.h5" @@ -152,13 +148,9 @@ def create_sparse_ecps(): file_size = os.path.getsize(output_path) if file_size < 1_000_000: raise ValueError( - f"create_sparse_ecps: output file only {file_size:,} bytes " - f"(expected > 1MB)" + f"create_sparse_ecps: output file only {file_size:,} bytes (expected > 1MB)" ) - logging.info( - f"create_sparse_ecps: wrote {file_size / 1e6:.1f}MB to " - f"{output_path}" - ) + logging.info(f"create_sparse_ecps: wrote {file_size / 1e6:.1f}MB to {output_path}") if __name__ == "__main__": diff --git a/policyengine_us_data/datasets/puf/irs_puf.py b/policyengine_us_data/datasets/puf/irs_puf.py index dd77890a3..c357cd56c 100644 --- a/policyengine_us_data/datasets/puf/irs_puf.py +++ b/policyengine_us_data/datasets/puf/irs_puf.py @@ -30,9 +30,7 @@ def generate(self): with pd.HDFStore(self.file_path, mode="w") as storage: storage.put("puf", pd.read_csv(puf_file_path)) - storage.put( - "puf_demographics", pd.read_csv(puf_demographics_file_path) - ) + storage.put("puf_demographics", pd.read_csv(puf_demographics_file_path)) class IRS_PUF_2015(IRS_PUF): diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py index f52153e38..040098c16 100644 --- a/policyengine_us_data/datasets/puf/puf.py +++ b/policyengine_us_data/datasets/puf/puf.py @@ -109,14 +109,10 @@ def simulate_w2_and_ubia_from_puf(puf, *, seed=None, diagnostics=True): ) revenues = np.maximum(qbi, 0) / margins - logit = ( - logit_params["intercept"] + logit_params["slope_per_dollar"] * revenues - ) + logit = logit_params["intercept"] + logit_params["slope_per_dollar"] * revenues # Set p = 0 when simulated receipts == 0 (no revenue means no payroll) - pr_has_employees = np.where( - revenues == 0.0, 0.0, 1.0 / (1.0 + np.exp(-logit)) - ) + pr_has_employees = np.where(revenues == 0.0, 0.0, 1.0 / (1.0 + np.exp(-logit))) has_employees = rng.binomial(1, pr_has_employees) # Labor share simulation @@ -125,8 +121,7 @@ def simulate_w2_and_ubia_from_puf(puf, *, seed=None, diagnostics=True): labor_ratios = np.where( is_rental, rng.beta(rental_beta_a, rental_beta_b, qbi.size) * rental_scale, - rng.beta(non_rental_beta_a, non_rental_beta_b, qbi.size) - * non_rental_scale, + rng.beta(non_rental_beta_a, non_rental_beta_b, qbi.size) * non_rental_scale, ) w2_wages = revenues * labor_ratios * has_employees @@ -155,9 +150,9 @@ def simulate_w2_and_ubia_from_puf(puf, *, seed=None, diagnostics=True): print(f"Share with QBI > 0: {share_qbi_pos:6.2%}") print(f"Among those, share with W-2 wages: {share_wages:6.2%}") if np.any(w2_wages > 0): - print(f"Mean W-2 (if >0): ${np.mean(w2_wages[w2_wages>0]):,.0f}") + print(f"Mean W-2 (if >0): ${np.mean(w2_wages[w2_wages > 0]):,.0f}") if np.any(ubia > 0): - print(f"Median UBIA (if >0): ${np.median(ubia[ubia>0]):,.0f}") + print(f"Median UBIA (if >0): ${np.median(ubia[ubia > 0]):,.0f}") return w2_wages, ubia @@ -209,9 +204,7 @@ def impute_missing_demographics( .fillna(0) ) - puf_with_demographics = puf_with_demographics.sample( - n=10_000, random_state=0 - ) + puf_with_demographics = puf_with_demographics.sample(n=10_000, random_state=0) DEMOGRAPHIC_VARIABLES = [ "AGEDP1", @@ -411,9 +404,7 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame: - puf["E25920"].fillna(0) - puf["E25960"].fillna(0) ) != 0 - partnership_se = np.where( - has_partnership, gross_se - schedule_c_f_income, 0 - ) + partnership_se = np.where(has_partnership, gross_se - schedule_c_f_income, 0) puf["partnership_se_income"] = partnership_se # --- Qualified Business Income Deduction (QBID) simulation --- @@ -424,9 +415,9 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame: puf_qbi_sources_for_sstb = puf[QBI_PARAMS["sstb_prob_map_by_name"].keys()] largest_qbi_source_name = puf_qbi_sources_for_sstb.idxmax(axis=1) - pr_sstb = largest_qbi_source_name.map( - QBI_PARAMS["sstb_prob_map_by_name"] - ).fillna(0.0) + pr_sstb = largest_qbi_source_name.map(QBI_PARAMS["sstb_prob_map_by_name"]).fillna( + 0.0 + ) puf["business_is_sstb"] = np.random.binomial(n=1, p=pr_sstb) reit_params = QBI_PARAMS["reit_ptp_income_distribution"] @@ -553,9 +544,9 @@ def generate(self): current_index = uprating[uprating.Variable == variable][ self.time_period ].values[0] - start_index = uprating[uprating.Variable == variable][ - 2021 - ].values[0] + start_index = uprating[uprating.Variable == variable][2021].values[ + 0 + ] growth = current_index / start_index arrays[variable] = arrays[variable] * growth self.save_dataset(arrays) @@ -635,9 +626,7 @@ def generate(self): for group in groups_assumed_to_be_tax_unit_like: self.holder[f"{group}_id"] = self.holder["tax_unit_id"] - self.holder[f"person_{group}_id"] = self.holder[ - "person_tax_unit_id" - ] + self.holder[f"person_{group}_id"] = self.holder["person_tax_unit_id"] for key in self.holder: if key == "filing_status": @@ -689,9 +678,7 @@ def add_filer(self, row, tax_unit_id): # Assume all of the interest deduction is the filer's deductible mortgage interest - self.holder["deductible_mortgage_interest"].append( - row["interest_deduction"] - ) + self.holder["deductible_mortgage_interest"].append(row["interest_deduction"]) for key in self.available_financial_vars: if key == "deductible_mortgage_interest": diff --git a/policyengine_us_data/datasets/scf/fed_scf.py b/policyengine_us_data/datasets/scf/fed_scf.py index f67a2c076..8c0d8e8cc 100644 --- a/policyengine_us_data/datasets/scf/fed_scf.py +++ b/policyengine_us_data/datasets/scf/fed_scf.py @@ -32,16 +32,12 @@ def load(self): def generate(self): if self._scf_download_url is None: - raise ValueError( - f"No raw SCF data URL known for year {self.time_period}." - ) + raise ValueError(f"No raw SCF data URL known for year {self.time_period}.") url = self._scf_download_url response = requests.get(url, stream=True) - total_size_in_bytes = int( - response.headers.get("content-length", 200e6) - ) + total_size_in_bytes = int(response.headers.get("content-length", 200e6)) progress_bar = tqdm( total=total_size_in_bytes, unit="iB", @@ -49,9 +45,7 @@ def generate(self): desc="Downloading SCF", ) if response.status_code == 404: - raise FileNotFoundError( - "Received a 404 response when fetching the data." - ) + raise FileNotFoundError("Received a 404 response when fetching the data.") with BytesIO() as file: content_length_actual = 0 for data in response.iter_content(int(1e6)): @@ -65,9 +59,7 @@ def generate(self): zipfile = ZipFile(file) with pd.HDFStore(self.file_path, mode="w") as storage: # Find the Stata file, which should be the only .dta file in the zip - dta_files = [ - f for f in zipfile.namelist() if f.endswith(".dta") - ] + dta_files = [f for f in zipfile.namelist() if f.endswith(".dta")] if not dta_files: raise FileNotFoundError( "No .dta file found in the SCF zip archive." diff --git a/policyengine_us_data/datasets/scf/scf.py b/policyengine_us_data/datasets/scf/scf.py index 1567fbbb6..3f2f11a74 100644 --- a/policyengine_us_data/datasets/scf/scf.py +++ b/policyengine_us_data/datasets/scf/scf.py @@ -55,9 +55,7 @@ def generate(self): try: scf[key] = np.array(scf[key]) except Exception as e: - print( - f"Warning: Could not convert {key} to numpy array: {e}" - ) + print(f"Warning: Could not convert {key} to numpy array: {e}") self.save_dataset(scf) @@ -110,9 +108,7 @@ def downsample(self, frac: float): # Store original dtypes before modifying original_data: dict = self.load_dataset() - original_dtypes = { - key: original_data[key].dtype for key in original_data - } + original_dtypes = {key: original_data[key].dtype for key in original_data} sim = Microsimulation(dataset=self) sim.subsample(frac=frac) @@ -189,17 +185,13 @@ def rename_columns_to_match_cps(scf: dict, raw_data: pd.DataFrame) -> None: 4: 4, # Asian 5: 7, # Other } - scf["cps_race"] = ( - raw_data["racecl5"].map(race_map).fillna(6).astype(int).values - ) + scf["cps_race"] = raw_data["racecl5"].map(race_map).fillna(6).astype(int).values # Hispanic indicator scf["is_hispanic"] = (raw_data["racecl5"] == 3).values # Children in household if "kids" in raw_data.columns: - scf["own_children_in_household"] = ( - raw_data["kids"].fillna(0).astype(int).values - ) + scf["own_children_in_household"] = raw_data["kids"].fillna(0).astype(int).values # Rent if "rent" in raw_data.columns: @@ -207,9 +199,7 @@ def rename_columns_to_match_cps(scf: dict, raw_data: pd.DataFrame) -> None: # Vehicle loan (auto loan) if "veh_inst" in raw_data.columns: - scf["total_vehicle_installments"] = ( - raw_data["veh_inst"].fillna(0).values - ) + scf["total_vehicle_installments"] = raw_data["veh_inst"].fillna(0).values # Marital status if "married" in raw_data.columns: @@ -269,9 +259,7 @@ def add_auto_loan_interest(scf: dict, year: int) -> None: logger.error( f"Network error downloading SCF data for year {year}: {str(e)}" ) - raise RuntimeError( - f"Failed to download SCF data for year {year}" - ) from e + raise RuntimeError(f"Failed to download SCF data for year {year}") from e # Process zip file try: @@ -282,9 +270,7 @@ def add_auto_loan_interest(scf: dict, year: int) -> None: dta_files = [f for f in z.namelist() if f.endswith(".dta")] if not dta_files: logger.error(f"No Stata files found in zip for year {year}") - raise ValueError( - f"No Stata files found in zip for year {year}" - ) + raise ValueError(f"No Stata files found in zip for year {year}") logger.info(f"Found Stata files: {dta_files}") @@ -298,18 +284,14 @@ def add_auto_loan_interest(scf: dict, year: int) -> None: ) logger.info(f"Read DataFrame with shape {df.shape}") except Exception as e: - logger.error( - f"Error reading Stata file for year {year}: {str(e)}" - ) + logger.error(f"Error reading Stata file for year {year}: {str(e)}") raise RuntimeError( f"Failed to process Stata file for year {year}" ) from e except zipfile.BadZipFile as e: logger.error(f"Bad zip file for year {year}: {str(e)}") - raise RuntimeError( - f"Downloaded zip file is corrupt for year {year}" - ) from e + raise RuntimeError(f"Downloaded zip file is corrupt for year {year}") from e # Process the interest data and add to final SCF dictionary auto_df = df[IDENTIFYER_COLUMNS + AUTO_LOAN_COLUMNS].copy() diff --git a/policyengine_us_data/datasets/sipp/sipp.py b/policyengine_us_data/datasets/sipp/sipp.py index bf8b75ddc..d77082665 100644 --- a/policyengine_us_data/datasets/sipp/sipp.py +++ b/policyengine_us_data/datasets/sipp/sipp.py @@ -68,8 +68,7 @@ def train_tip_model(): ) # Sum tip columns (AJB*_TXAMT + TJB*_TXAMT) across all jobs. df["tip_income"] = ( - df[df.columns[df.columns.str.contains("TXAMT")]].fillna(0).sum(axis=1) - * 12 + df[df.columns[df.columns.str.contains("TXAMT")]].fillna(0).sum(axis=1) * 12 ) df["employment_income"] = df.TPTOTINC * 12 df["is_under_18"] = (df.TAGE < 18) & (df.MONTHCODE == 12) diff --git a/policyengine_us_data/db/create_database_tables.py b/policyengine_us_data/db/create_database_tables.py index 8590f79e8..d89bad317 100644 --- a/policyengine_us_data/db/create_database_tables.py +++ b/policyengine_us_data/db/create_database_tables.py @@ -39,9 +39,7 @@ class Stratum(SQLModel, table=True): description="Unique identifier for the stratum.", ) definition_hash: str = Field( - sa_column_kwargs={ - "comment": "SHA-256 hash of the stratum's constraints." - }, + sa_column_kwargs={"comment": "SHA-256 hash of the stratum's constraints."}, max_length=64, ) parent_stratum_id: Optional[int] = Field( @@ -89,9 +87,7 @@ class StratumConstraint(SQLModel, table=True): primary_key=True, description="The comparison operator (==, !=, >, >=, <, <=).", ) - value: str = Field( - description="The value for the constraint rule (e.g., '25')." - ) + value: str = Field(description="The value for the constraint rule (e.g., '25').") notes: Optional[str] = Field( default=None, description="Optional notes about the constraint." ) @@ -117,9 +113,7 @@ class Target(SQLModel, table=True): variable: str = Field( description="A variable defined in policyengine-us (e.g., 'income_tax')." ) - period: int = Field( - description="The time period for the data, typically a year." - ) + period: int = Field(description="The time period for the data, typically a year.") stratum_id: int = Field(foreign_key="strata.stratum_id", index=True) reform_id: int = Field( default=0, @@ -156,19 +150,13 @@ def calculate_definition_hash(mapper, connection, target: Stratum): Calculate and set the definition_hash before saving a Stratum instance. """ constraints_history = get_history(target, "constraints_rel") - if not ( - constraints_history.has_changes() or target.definition_hash is None - ): + if not (constraints_history.has_changes() or target.definition_hash is None): return if not target.constraints_rel: # Handle cases with no constraints # Include parent_stratum_id to make hash unique per parent - parent_str = ( - str(target.parent_stratum_id) if target.parent_stratum_id else "" - ) - target.definition_hash = hashlib.sha256( - parent_str.encode("utf-8") - ).hexdigest() + parent_str = str(target.parent_stratum_id) if target.parent_stratum_id else "" + target.definition_hash = hashlib.sha256(parent_str.encode("utf-8")).hexdigest() return constraint_strings = [ @@ -178,9 +166,7 @@ def calculate_definition_hash(mapper, connection, target: Stratum): constraint_strings.sort() # Include parent_stratum_id in the hash to ensure uniqueness per parent - parent_str = ( - str(target.parent_stratum_id) if target.parent_stratum_id else "" - ) + parent_str = str(target.parent_stratum_id) if target.parent_stratum_id else "" fingerprint_text = parent_str + "\n" + "\n".join(constraint_strings) h = hashlib.sha256(fingerprint_text.encode("utf-8")) target.definition_hash = h.hexdigest() @@ -241,10 +227,7 @@ def _validate_geographic_consistency(parent_rows, child_constraints): ) # CD must belong to the parent state. - if ( - "state_fips" in parent_dict - and "congressional_district_geoid" in child_dict - ): + if "state_fips" in parent_dict and "congressional_district_geoid" in child_dict: parent_state = int(parent_dict["state_fips"]) child_cd = int(child_dict["congressional_district_geoid"]) cd_state = child_cd // 100 @@ -288,8 +271,7 @@ def validate_parent_child_constraints(mapper, connection, target: Stratum): return child_set = { - (c.constraint_variable, c.operation, c.value) - for c in target.constraints_rel + (c.constraint_variable, c.operation, c.value) for c in target.constraints_rel } for var, op, val in parent_rows: @@ -306,8 +288,7 @@ def validate_parent_child_constraints(mapper, connection, target: Stratum): if any(int(cv) == int(val) for cv in child_vals): continue raise ValueError( - f"Child stratum must include parent constraint " - f"({var} {op} {val})" + f"Child stratum must include parent constraint ({var} {op} {val})" ) diff --git a/policyengine_us_data/db/create_initial_strata.py b/policyengine_us_data/db/create_initial_strata.py index 0b9ae8a6d..2af5df7f8 100644 --- a/policyengine_us_data/db/create_initial_strata.py +++ b/policyengine_us_data/db/create_initial_strata.py @@ -44,16 +44,12 @@ def fetch_congressional_districts(year): ) # Filter out statewide summary records for multi-district states - df["n_districts"] = df.groupby("state_fips")["state_fips"].transform( - "count" - ) + df["n_districts"] = df.groupby("state_fips")["state_fips"].transform("count") df = df[(df["n_districts"] == 1) | (df["district_number"] > 0)].copy() df = df.drop(columns=["n_districts"]) df.loc[df["district_number"] == 0, "district_number"] = 1 - df["congressional_district_geoid"] = ( - df["state_fips"] * 100 + df["district_number"] - ) + df["congressional_district_geoid"] = df["state_fips"] * 100 + df["district_number"] df = df[ [ @@ -129,9 +125,7 @@ def main(): # Fetch congressional district data cd_df = fetch_congressional_districts(year) - DATABASE_URL = ( - f"sqlite:///{STORAGE_FOLDER / 'calibration' / 'policy_data.db'}" - ) + DATABASE_URL = f"sqlite:///{STORAGE_FOLDER / 'calibration' / 'policy_data.db'}" engine = create_engine(DATABASE_URL) with Session(engine) as session: @@ -156,9 +150,7 @@ def main(): # Create state-level strata unique_states = cd_df["state_fips"].unique() for state_fips in sorted(unique_states): - state_name = STATE_NAMES.get( - state_fips, f"State FIPS {state_fips}" - ) + state_name = STATE_NAMES.get(state_fips, f"State FIPS {state_fips}") state_stratum = Stratum( parent_stratum_id=us_stratum_id, notes=state_name, diff --git a/policyengine_us_data/db/etl_age.py b/policyengine_us_data/db/etl_age.py index 1a12f372f..db5e54da0 100644 --- a/policyengine_us_data/db/etl_age.py +++ b/policyengine_us_data/db/etl_age.py @@ -66,9 +66,7 @@ def transform_age_data(age_data, docs): # Filter out Puerto Rico's district and state records # 5001800US7298 = 118th Congress, 5001900US7298 = 119th Congress df_geos = df_data[ - ~df_data["ucgid_str"].isin( - ["5001800US7298", "5001900US7298", "0400000US72"] - ) + ~df_data["ucgid_str"].isin(["5001800US7298", "5001900US7298", "0400000US72"]) ].copy() df = df_geos[["ucgid_str"] + AGE_COLS] @@ -106,9 +104,7 @@ def load_age_data(df_long, geo, year): raise ValueError('geo must be one of "National", "State", "District"') # Prepare to load data ----------- - DATABASE_URL = ( - f"sqlite:///{STORAGE_FOLDER / 'calibration' / 'policy_data.db'}" - ) + DATABASE_URL = f"sqlite:///{STORAGE_FOLDER / 'calibration' / 'policy_data.db'}" engine = create_engine(DATABASE_URL) with Session(engine) as session: diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index aa8122a59..f2b177957 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -104,9 +104,7 @@ def make_records( f"WARNING: A59664 values appear to be in thousands (max={max_value:,.0f})" ) print("The IRS may have fixed their data inconsistency.") - print( - "Please verify and remove the special case handling if confirmed." - ) + print("Please verify and remove the special case handling if confirmed.") # Don't apply the fix - data appears to already be in thousands else: # Convert from dollars to thousands to match other columns @@ -162,9 +160,7 @@ def convert_district_data( """Transforms data from pre- to post- 2020 census districts""" df = input_df.copy() old_districts_df = df[df["ucgid_str"].str.startswith("5001800US")].copy() - old_districts_df = old_districts_df.sort_values("ucgid_str").reset_index( - drop=True - ) + old_districts_df = old_districts_df.sort_values("ucgid_str").reset_index(drop=True) old_values = old_districts_df["target_value"].to_numpy() new_values = mapping_matrix.T @ old_values @@ -289,19 +285,15 @@ def transform_soi_data(raw_df): # State ------------------- # You've got agi_stub == 0 in here, which you want to use any time you don't want to # divide data by AGI classes (i.e., agi_stub) - state_df = raw_df.copy().loc[ - (raw_df.STATE != "US") & (raw_df.CONG_DISTRICT == 0) - ] - state_df["ucgid_str"] = "0400000US" + state_df["STATEFIPS"].astype( - str - ).str.zfill(2) + state_df = raw_df.copy().loc[(raw_df.STATE != "US") & (raw_df.CONG_DISTRICT == 0)] + state_df["ucgid_str"] = "0400000US" + state_df["STATEFIPS"].astype(str).str.zfill(2) # District ------------------ district_df = raw_df.copy().loc[(raw_df.CONG_DISTRICT > 0)] - max_cong_district_by_state = raw_df.groupby("STATE")[ - "CONG_DISTRICT" - ].transform("max") + max_cong_district_by_state = raw_df.groupby("STATE")["CONG_DISTRICT"].transform( + "max" + ) district_df = raw_df.copy().loc[ (raw_df["CONG_DISTRICT"] > 0) | (max_cong_district_by_state == 0) ] @@ -370,9 +362,7 @@ def transform_soi_data(raw_df): # Pre- to Post- 2020 Census redisticting mapping = get_district_mapping() converted = [ - convert_district_data( - r, mapping["mapping_matrix"], mapping["new_codes"] - ) + convert_district_data(r, mapping["mapping_matrix"], mapping["new_codes"]) for r in records ] @@ -382,9 +372,7 @@ def transform_soi_data(raw_df): def load_soi_data(long_dfs, year): """Load a list of databases into the db, critically dependent on order""" - DATABASE_URL = ( - f"sqlite:///{STORAGE_FOLDER / 'calibration' / 'policy_data.db'}" - ) + DATABASE_URL = f"sqlite:///{STORAGE_FOLDER / 'calibration' / 'policy_data.db'}" engine = create_engine(DATABASE_URL) session = Session(engine) @@ -458,9 +446,7 @@ def load_soi_data(long_dfs, year): filer_strata["state"][state_fips] = state_filer_stratum.stratum_id # District filer strata - for district_geoid, district_geo_stratum_id in geo_strata[ - "district" - ].items(): + for district_geoid, district_geo_stratum_id in geo_strata["district"].items(): # Check if district filer stratum exists district_filer_stratum = ( session.query(Stratum) @@ -492,9 +478,7 @@ def load_soi_data(long_dfs, year): session.add(district_filer_stratum) session.flush() - filer_strata["district"][ - district_geoid - ] = district_filer_stratum.stratum_id + filer_strata["district"][district_geoid] = district_filer_stratum.stratum_id session.commit() @@ -525,9 +509,7 @@ def load_soi_data(long_dfs, year): ) ] elif geo_info["type"] == "state": - parent_stratum_id = filer_strata["state"][ - geo_info["state_fips"] - ] + parent_stratum_id = filer_strata["state"][geo_info["state_fips"]] note = f"State FIPS {geo_info['state_fips']} EITC received with {n_children} children (filers)" constraints = [ StratumConstraint( @@ -636,9 +618,7 @@ def load_soi_data(long_dfs, year): # Store lookup for later use if geo_info["type"] == "national": - eitc_stratum_lookup["national"][ - n_children - ] = new_stratum.stratum_id + eitc_stratum_lookup["national"][n_children] = new_stratum.stratum_id elif geo_info["type"] == "state": key = (geo_info["state_fips"], n_children) eitc_stratum_lookup["state"][key] = new_stratum.stratum_id @@ -652,8 +632,7 @@ def load_soi_data(long_dfs, year): first_agi_index = [ i for i in range(len(long_dfs)) - if long_dfs[i][["target_variable"]].values[0] - == "adjusted_gross_income" + if long_dfs[i][["target_variable"]].values[0] == "adjusted_gross_income" and long_dfs[i][["breakdown_variable"]].values[0] == "one" ][0] for j in range(8, first_agi_index, 2): @@ -676,17 +655,13 @@ def load_soi_data(long_dfs, year): parent_stratum_id = filer_strata["national"] geo_description = "National" elif geo_info["type"] == "state": - parent_stratum_id = filer_strata["state"][ - geo_info["state_fips"] - ] + parent_stratum_id = filer_strata["state"][geo_info["state_fips"]] geo_description = f"State {geo_info['state_fips']}" elif geo_info["type"] == "district": parent_stratum_id = filer_strata["district"][ geo_info["congressional_district_geoid"] ] - geo_description = ( - f"CD {geo_info['congressional_district_geoid']}" - ) + geo_description = f"CD {geo_info['congressional_district_geoid']}" # Create child stratum with constraint for this IRS variable # Note: This stratum will have the constraint that amount_variable > 0 @@ -741,9 +716,7 @@ def load_soi_data(long_dfs, year): StratumConstraint( constraint_variable="congressional_district_geoid", operation="==", - value=str( - geo_info["congressional_district_geoid"] - ), + value=str(geo_info["congressional_district_geoid"]), ) ) @@ -805,9 +778,7 @@ def load_soi_data(long_dfs, year): elif geo_info["type"] == "district": stratum = session.get( Stratum, - filer_strata["district"][ - geo_info["congressional_district_geoid"] - ], + filer_strata["district"][geo_info["congressional_district_geoid"]], ) # Check if target already exists @@ -822,9 +793,7 @@ def load_soi_data(long_dfs, year): ) if existing_target: - existing_target.value = agi_values.iloc[i][ - ["target_value"] - ].values[0] + existing_target.value = agi_values.iloc[i][["target_value"]].values[0] else: stratum.targets_rel.append( Target( @@ -901,9 +870,7 @@ def load_soi_data(long_dfs, year): person_count = agi_df.iloc[i][["target_value"]].values[0] if geo_info["type"] == "state": - parent_stratum_id = filer_strata["state"][ - geo_info["state_fips"] - ] + parent_stratum_id = filer_strata["state"][geo_info["state_fips"]] note = f"State FIPS {geo_info['state_fips']} filers, AGI >= {agi_income_lower}, AGI < {agi_income_upper}" constraints = [ StratumConstraint( @@ -1000,9 +967,9 @@ def load_soi_data(long_dfs, year): session.flush() if geo_info["type"] == "state": - agi_stratum_lookup["state"][ - geo_info["state_fips"] - ] = new_stratum.stratum_id + agi_stratum_lookup["state"][geo_info["state_fips"]] = ( + new_stratum.stratum_id + ) elif geo_info["type"] == "district": agi_stratum_lookup["district"][ geo_info["congressional_district_geoid"] diff --git a/policyengine_us_data/db/etl_medicaid.py b/policyengine_us_data/db/etl_medicaid.py index dfc19cdcc..2c4677996 100644 --- a/policyengine_us_data/db/etl_medicaid.py +++ b/policyengine_us_data/db/etl_medicaid.py @@ -116,9 +116,7 @@ def transform_administrative_medicaid_data(state_admin_df, year): ].sort_values("Reporting Period", ascending=False) if not state_history.empty: - fallback_value = state_history.iloc[0][ - "Total Medicaid Enrollment" - ] + fallback_value = state_history.iloc[0]["Total Medicaid Enrollment"] fallback_period = state_history.iloc[0]["Reporting Period"] print( f" {state_abbrev}: Using {fallback_value:,.0f} from period {fallback_period}" @@ -153,9 +151,7 @@ def transform_survey_medicaid_data(cd_survey_df): def load_medicaid_data(long_state, long_cd, year): - DATABASE_URL = ( - f"sqlite:///{STORAGE_FOLDER / 'calibration' / 'policy_data.db'}" - ) + DATABASE_URL = f"sqlite:///{STORAGE_FOLDER / 'calibration' / 'policy_data.db'}" engine = create_engine(DATABASE_URL) with Session(engine) as session: @@ -222,9 +218,7 @@ def load_medicaid_data(long_state, long_cd, year): ) session.add(new_stratum) session.flush() - medicaid_stratum_lookup["state"][ - state_fips - ] = new_stratum.stratum_id + medicaid_stratum_lookup["state"][state_fips] = new_stratum.stratum_id # District ------------------- if long_cd is None: diff --git a/policyengine_us_data/db/etl_national_targets.py b/policyengine_us_data/db/etl_national_targets.py index 2b78b6d6e..0e87aa84a 100644 --- a/policyengine_us_data/db/etl_national_targets.py +++ b/policyengine_us_data/db/etl_national_targets.py @@ -423,14 +423,10 @@ def transform_national_targets(raw_targets): # Note: income_tax_positive from CBO and eitc from Treasury need # filer constraint cbo_non_tax = [ - t - for t in raw_targets["cbo_targets"] - if t["variable"] != "income_tax_positive" + t for t in raw_targets["cbo_targets"] if t["variable"] != "income_tax_positive" ] cbo_tax = [ - t - for t in raw_targets["cbo_targets"] - if t["variable"] == "income_tax_positive" + t for t in raw_targets["cbo_targets"] if t["variable"] == "income_tax_positive" ] all_direct_targets = raw_targets["direct_sum_targets"] + cbo_non_tax @@ -443,14 +439,10 @@ def transform_national_targets(raw_targets): ) direct_df = ( - pd.DataFrame(all_direct_targets) - if all_direct_targets - else pd.DataFrame() + pd.DataFrame(all_direct_targets) if all_direct_targets else pd.DataFrame() ) tax_filer_df = ( - pd.DataFrame(all_tax_filer_targets) - if all_tax_filer_targets - else pd.DataFrame() + pd.DataFrame(all_tax_filer_targets) if all_tax_filer_targets else pd.DataFrame() ) # Conditional targets stay as list for special processing @@ -459,9 +451,7 @@ def transform_national_targets(raw_targets): return direct_df, tax_filer_df, conditional_targets -def load_national_targets( - direct_targets_df, tax_filer_df, conditional_targets -): +def load_national_targets(direct_targets_df, tax_filer_df, conditional_targets): """ Load national targets into the database. @@ -475,17 +465,13 @@ def load_national_targets( List of conditional count targets requiring strata """ - DATABASE_URL = ( - f"sqlite:///{STORAGE_FOLDER / 'calibration' / 'policy_data.db'}" - ) + DATABASE_URL = f"sqlite:///{STORAGE_FOLDER / 'calibration' / 'policy_data.db'}" engine = create_engine(DATABASE_URL) with Session(engine) as session: # Get the national stratum us_stratum = ( - session.query(Stratum) - .filter(Stratum.parent_stratum_id == None) - .first() + session.query(Stratum).filter(Stratum.parent_stratum_id == None).first() ) if not us_stratum: @@ -511,9 +497,7 @@ def load_national_targets( notes_parts = [] if pd.notna(target_data.get("notes")): notes_parts.append(target_data["notes"]) - notes_parts.append( - f"Source: {target_data.get('source', 'Unknown')}" - ) + notes_parts.append(f"Source: {target_data.get('source', 'Unknown')}") combined_notes = " | ".join(notes_parts) if existing_target: @@ -583,9 +567,7 @@ def load_national_targets( notes_parts = [] if pd.notna(target_data.get("notes")): notes_parts.append(target_data["notes"]) - notes_parts.append( - f"Source: {target_data.get('source', 'Unknown')}" - ) + notes_parts.append(f"Source: {target_data.get('source', 'Unknown')}") combined_notes = " | ".join(notes_parts) if existing_target: @@ -699,23 +681,17 @@ def load_national_targets( ] session.add(new_stratum) - print( - f"Created stratum and target for {constraint_var} enrollment" - ) + print(f"Created stratum and target for {constraint_var} enrollment") session.commit() total_targets = ( - len(direct_targets_df) - + len(tax_filer_df) - + len(conditional_targets) + len(direct_targets_df) + len(tax_filer_df) + len(conditional_targets) ) print(f"\nSuccessfully loaded {total_targets} national targets") print(f" - {len(direct_targets_df)} direct sum targets") print(f" - {len(tax_filer_df)} tax filer targets") - print( - f" - {len(conditional_targets)} enrollment count targets (as strata)" - ) + print(f" - {len(conditional_targets)} enrollment count targets (as strata)") def main(): @@ -730,8 +706,8 @@ def main(): # Transform print("Transforming targets...") - direct_targets_df, tax_filer_df, conditional_targets = ( - transform_national_targets(raw_targets) + direct_targets_df, tax_filer_df, conditional_targets = transform_national_targets( + raw_targets ) # Load diff --git a/policyengine_us_data/db/etl_pregnancy.py b/policyengine_us_data/db/etl_pregnancy.py index de3fec9dc..e8756cfb5 100644 --- a/policyengine_us_data/db/etl_pregnancy.py +++ b/policyengine_us_data/db/etl_pregnancy.py @@ -182,10 +182,7 @@ def extract_female_population(year: int) -> pd.DataFrame: data = load_json(cache_file) else: var_ids = ",".join([f"B01001_{i:03d}E" for i in range(30, 39)]) - url = ( - f"https://api.census.gov/data/{year}/acs/acs1" - f"?get={var_ids}&for=state:*" - ) + url = f"https://api.census.gov/data/{year}/acs/acs1?get={var_ids}&for=state:*" logger.info(f"Fetching ACS B01001 female 15-44 for {year}") resp = requests.get(url, timeout=30) resp.raise_for_status() @@ -222,9 +219,7 @@ def transform_pregnancy_data( df = births_df.merge(pop_df, on="state_abbrev") df["state_fips"] = df["state_abbrev"].map(STATE_ABBREV_TO_FIPS) # Point-in-time pregnancy count. - df["pregnancy_target"] = ( - df["births"] * PREGNANCY_DURATION_FRACTION - ).round() + df["pregnancy_target"] = (df["births"] * PREGNANCY_DURATION_FRACTION).round() # Rate for stochastic assignment in the CPS build. df["pregnancy_rate"] = ( df["births"] / df["female_15_44"] @@ -246,9 +241,7 @@ def load_pregnancy_data( df: From transform_pregnancy_data. year: Target year for the calibration targets. """ - db_url = ( - f"sqlite:///" f"{STORAGE_FOLDER / 'calibration' / 'policy_data.db'}" - ) + db_url = f"sqlite:///{STORAGE_FOLDER / 'calibration' / 'policy_data.db'}" engine = create_engine(db_url) with Session(engine) as session: @@ -273,10 +266,7 @@ def load_pregnancy_data( for _, row in df.iterrows(): state_fips = int(row["state_fips"]) if state_fips not in geo_strata["state"]: - logger.warning( - f"No geographic stratum for FIPS " - f"{state_fips}, skipping" - ) + logger.warning(f"No geographic stratum for FIPS {state_fips}, skipping") continue parent_id = geo_strata["state"][state_fips] @@ -368,16 +358,14 @@ def main(): except Exception as e: logger.warning(f"ACS {acs_year} not available: {e}") if pop_df is None: - raise RuntimeError( - f"No ACS population data for " f"{year - 1} or {year - 2}" - ) + raise RuntimeError(f"No ACS population data for {year - 1} or {year - 2}") df = transform_pregnancy_data(births_df, pop_df) total_births = df["births"].sum() total_target = df["pregnancy_target"].sum() print(f"Total births: {total_births:,.0f}") - print(f"Pregnancy target (point-in-time): " f"{total_target:,.0f}") + print(f"Pregnancy target (point-in-time): {total_target:,.0f}") load_pregnancy_data(df, year) print("Pregnancy calibration targets loaded.") diff --git a/policyengine_us_data/db/etl_snap.py b/policyengine_us_data/db/etl_snap.py index 48cb7e773..dc5975a4f 100644 --- a/policyengine_us_data/db/etl_snap.py +++ b/policyengine_us_data/db/etl_snap.py @@ -154,9 +154,7 @@ def transform_survey_snap_data(raw_df): def load_administrative_snap_data(df_states, year): - DATABASE_URL = ( - f"sqlite:///{STORAGE_FOLDER / 'calibration' / 'policy_data.db'}" - ) + DATABASE_URL = f"sqlite:///{STORAGE_FOLDER / 'calibration' / 'policy_data.db'}" engine = create_engine(DATABASE_URL) with Session(engine) as session: @@ -244,9 +242,7 @@ def load_survey_snap_data(survey_df, year, snap_stratum_lookup): load_administrative_snap_data, so we don't recreate them. """ - DATABASE_URL = ( - f"sqlite:///{STORAGE_FOLDER / 'calibration' / 'policy_data.db'}" - ) + DATABASE_URL = f"sqlite:///{STORAGE_FOLDER / 'calibration' / 'policy_data.db'}" engine = create_engine(DATABASE_URL) with Session(engine) as session: diff --git a/policyengine_us_data/db/etl_state_income_tax.py b/policyengine_us_data/db/etl_state_income_tax.py index a9ffa35c1..95fbc285c 100644 --- a/policyengine_us_data/db/etl_state_income_tax.py +++ b/policyengine_us_data/db/etl_state_income_tax.py @@ -320,11 +320,7 @@ def main(): # Print summary total_collections = transformed_df["income_tax_collections"].sum() states_with_tax = len( - [ - s - for s in transformed_df["state_abbrev"] - if s not in NO_INCOME_TAX_STATES - ] + [s for s in transformed_df["state_abbrev"] if s not in NO_INCOME_TAX_STATES] ) logger.info( @@ -337,9 +333,7 @@ def main(): # Print Ohio specifically (for the issue reference) ohio_row = transformed_df[transformed_df["state_abbrev"] == "OH"].iloc[0] - logger.info( - f" Ohio (OH): ${ohio_row['income_tax_collections'] / 1e9:.2f}B" - ) + logger.info(f" Ohio (OH): ${ohio_row['income_tax_collections'] / 1e9:.2f}B") if __name__ == "__main__": diff --git a/policyengine_us_data/db/validate_database.py b/policyengine_us_data/db/validate_database.py index 2fa819f29..b57a83c32 100644 --- a/policyengine_us_data/db/validate_database.py +++ b/policyengine_us_data/db/validate_database.py @@ -9,9 +9,7 @@ import pandas as pd from policyengine_us.system import system -conn = sqlite3.connect( - "policyengine_us_data/storage/calibration/policy_data.db" -) +conn = sqlite3.connect("policyengine_us_data/storage/calibration/policy_data.db") stratum_constraints_df = pd.read_sql("SELECT * FROM stratum_constraints", conn) targets_df = pd.read_sql("SELECT * FROM targets", conn) diff --git a/policyengine_us_data/db/validate_hierarchy.py b/policyengine_us_data/db/validate_hierarchy.py index 69a176f2e..1c555703f 100644 --- a/policyengine_us_data/db/validate_hierarchy.py +++ b/policyengine_us_data/db/validate_hierarchy.py @@ -31,9 +31,7 @@ def validate_geographic_hierarchy(session): "ERROR: No US-level stratum found (should have parent_stratum_id = None)" ) else: - print( - f"✓ US stratum found: {us_stratum.notes} (ID: {us_stratum.stratum_id})" - ) + print(f"✓ US stratum found: {us_stratum.notes} (ID: {us_stratum.stratum_id})") # Check it has no constraints us_constraints = session.exec( @@ -89,14 +87,10 @@ def validate_geographic_hierarchy(session): c for c in constraints if c.constraint_variable == "state_fips" ] if not state_fips_constraint: - errors.append( - f"ERROR: State '{state.notes}' has no state_fips constraint" - ) + errors.append(f"ERROR: State '{state.notes}' has no state_fips constraint") else: state_ids[state.stratum_id] = state.notes - print( - f" - {state.notes}: state_fips = {state_fips_constraint[0].value}" - ) + print(f" - {state.notes}: state_fips = {state_fips_constraint[0].value}") # Check congressional districts print("\nChecking Congressional Districts...") @@ -112,11 +106,10 @@ def validate_geographic_hierarchy(session): ) ).all() constraint_vars = {c.constraint_variable for c in constraints} - if ( - "congressional_district_geoid" in constraint_vars - and constraint_vars - <= {"state_fips", "congressional_district_geoid"} - ): + if "congressional_district_geoid" in constraint_vars and constraint_vars <= { + "state_fips", + "congressional_district_geoid", + }: all_cds.append(s) print(f"✓ Found {len(all_cds)} congressional/delegate districts") @@ -158,9 +151,7 @@ def validate_geographic_hierarchy(session): wyoming_cds.append(child) if len(wyoming_cds) != 1: - errors.append( - f"ERROR: Wyoming should have 1 CD, found {len(wyoming_cds)}" - ) + errors.append(f"ERROR: Wyoming should have 1 CD, found {len(wyoming_cds)}") else: print(f"✓ Wyoming has correct number of CDs: 1") @@ -184,9 +175,7 @@ def validate_geographic_hierarchy(session): for cd in wrong_parent_cds[:5]: errors.append(f" - {cd.notes}") else: - print( - "✓ No congressional districts incorrectly parented to Wyoming" - ) + print("✓ No congressional districts incorrectly parented to Wyoming") return errors @@ -237,13 +226,10 @@ def validate_demographic_strata(session): if actual == expected_total: print(f"✓ {domain}: {actual} strata") elif actual == 0: - errors.append( - f"ERROR: {domain} has no strata, " f"expected {expected_total}" - ) + errors.append(f"ERROR: {domain} has no strata, expected {expected_total}") else: errors.append( - f"WARNING: {domain} has {actual} strata, " - f"expected {expected_total}" + f"WARNING: {domain} has {actual} strata, expected {expected_total}" ) # Identify geographic strata (those with only geographic @@ -291,11 +277,9 @@ def validate_demographic_strata(session): ) else: no_parents += 1 - errors.append( - f"ERROR: Stratum {stratum.stratum_id} " f"has no parent" - ) + errors.append(f"ERROR: Stratum {stratum.stratum_id} has no parent") - print(f" Sample of {len(sample_strata)} " f"demographic strata:") + print(f" Sample of {len(sample_strata)} demographic strata:") print(f" - With geographic parent: {correct_parents}") print(f" - With wrong parent: {wrong_parents}") print(f" - With no parent: {no_parents}") @@ -322,18 +306,12 @@ def validate_constraint_uniqueness(session): else: hash_counts[stratum.definition_hash] = [stratum] - duplicates = { - h: strata for h, strata in hash_counts.items() if len(strata) > 1 - } + duplicates = {h: strata for h, strata in hash_counts.items() if len(strata) > 1} if duplicates: - errors.append( - f"ERROR: Found {len(duplicates)} duplicate definition_hashes" - ) + errors.append(f"ERROR: Found {len(duplicates)} duplicate definition_hashes") for hash_val, strata in list(duplicates.items())[:3]: # Show first 3 - errors.append( - f" Hash {hash_val[:10]}... appears {len(strata)} times:" - ) + errors.append(f" Hash {hash_val[:10]}... appears {len(strata)} times:") for s in strata[:3]: errors.append(f" - ID {s.stratum_id}: {s.notes[:50]}") else: @@ -345,9 +323,7 @@ def validate_constraint_uniqueness(session): def main(): """Run all validation checks""" - DATABASE_URL = ( - f"sqlite:///{STORAGE_FOLDER / 'calibration' / 'policy_data.db'}" - ) + DATABASE_URL = f"sqlite:///{STORAGE_FOLDER / 'calibration' / 'policy_data.db'}" engine = create_engine(DATABASE_URL) all_errors = [] diff --git a/policyengine_us_data/geography/__init__.py b/policyengine_us_data/geography/__init__.py index 0bcc73f0b..f20068192 100644 --- a/policyengine_us_data/geography/__init__.py +++ b/policyengine_us_data/geography/__init__.py @@ -2,9 +2,7 @@ import pandas as pd import os -ZIP_CODE_DATASET_PATH = ( - Path(__file__).parent.parent / "geography" / "zip_codes.csv.gz" -) +ZIP_CODE_DATASET_PATH = Path(__file__).parent.parent / "geography" / "zip_codes.csv.gz" # Avoid circular import error when -us-data is initialized if os.path.exists(ZIP_CODE_DATASET_PATH): diff --git a/policyengine_us_data/geography/county_fips.py b/policyengine_us_data/geography/county_fips.py index 3e5ac5183..6bb2b9e92 100644 --- a/policyengine_us_data/geography/county_fips.py +++ b/policyengine_us_data/geography/county_fips.py @@ -21,7 +21,9 @@ def generate_county_fips_2020_dataset(): # COUNTYFP - Three-digit county portion of FIPS (001 for Autauga County, AL, if STATEFP is 01) # COUNTYNAME - County name - COUNTY_FIPS_2020_URL = "https://www2.census.gov/geo/docs/reference/codes2020/national_county2020.txt" + COUNTY_FIPS_2020_URL = ( + "https://www2.census.gov/geo/docs/reference/codes2020/national_county2020.txt" + ) # Download the base tab-delimited data file response = requests.get(COUNTY_FIPS_2020_URL) @@ -68,9 +70,7 @@ def generate_county_fips_2020_dataset(): csv_buffer = BytesIO() # Save CSV into buffer object and reset pointer - county_fips.to_csv( - csv_buffer, index=False, compression="gzip", encoding="utf-8" - ) + county_fips.to_csv(csv_buffer, index=False, compression="gzip", encoding="utf-8") csv_buffer.seek(0) # Upload to Hugging Face diff --git a/policyengine_us_data/geography/create_zip_code_dataset.py b/policyengine_us_data/geography/create_zip_code_dataset.py index eb154cf70..981b5de5f 100644 --- a/policyengine_us_data/geography/create_zip_code_dataset.py +++ b/policyengine_us_data/geography/create_zip_code_dataset.py @@ -51,7 +51,5 @@ zcta.set_index("zcta").population[zip_code.zcta].values / zip_code.groupby("zcta").zip_code.count()[zip_code.zcta].values ) -zip_code["county"] = ( - zcta_to_county.set_index("zcta").county[zip_code.zcta].values -) +zip_code["county"] = zcta_to_county.set_index("zcta").county[zip_code.zcta].values zip_code.to_csv("zip_codes.csv", compression="gzip") diff --git a/policyengine_us_data/parameters/__init__.py b/policyengine_us_data/parameters/__init__.py index 2fcddb5af..dc385f8e0 100644 --- a/policyengine_us_data/parameters/__init__.py +++ b/policyengine_us_data/parameters/__init__.py @@ -65,8 +65,6 @@ def load_take_up_rate(variable_name: str, year: int = 2018): break if applicable_value is None: - raise ValueError( - f"No take-up rate found for {variable_name} in {year}" - ) + raise ValueError(f"No take-up rate found for {variable_name} in {year}") return applicable_value diff --git a/policyengine_us_data/storage/calibration_targets/audit_county_enum.py b/policyengine_us_data/storage/calibration_targets/audit_county_enum.py index 4849a10ef..fcaf443ff 100644 --- a/policyengine_us_data/storage/calibration_targets/audit_county_enum.py +++ b/policyengine_us_data/storage/calibration_targets/audit_county_enum.py @@ -109,9 +109,7 @@ def print_categorized_report(invalid_entries, county_to_states): print("\n" + "=" * 60) print("WRONG STATE ASSIGNMENTS") print("=" * 60) - for name, wrong_state, correct_states in sorted( - invalid_entries["wrong_state"] - ): + for name, wrong_state, correct_states in sorted(invalid_entries["wrong_state"]): print(f" {name}") print(f" Listed as: {wrong_state}") print(f" Actually exists in: {', '.join(sorted(correct_states))}") diff --git a/policyengine_us_data/storage/calibration_targets/make_block_cd_distributions.py b/policyengine_us_data/storage/calibration_targets/make_block_cd_distributions.py index 6f55e3f7c..f2b634e00 100644 --- a/policyengine_us_data/storage/calibration_targets/make_block_cd_distributions.py +++ b/policyengine_us_data/storage/calibration_targets/make_block_cd_distributions.py @@ -78,9 +78,7 @@ def build_block_cd_distributions(): # Create CD geoid in our format: state_fips * 100 + district # Examples: AL-1 = 101, NY-10 = 3610, DC = 1198 - df["cd_geoid"] = df["state_fips"].astype(int) * 100 + df["CD119"].astype( - int - ) + df["cd_geoid"] = df["state_fips"].astype(int) * 100 + df["CD119"].astype(int) # Step 4: Calculate P(block|CD) print("\nCalculating block probabilities...") @@ -97,9 +95,7 @@ def build_block_cd_distributions(): output = df[["cd_geoid", "GEOID", "probability"]].rename( columns={"GEOID": "block_geoid"} ) - output = output.sort_values( - ["cd_geoid", "probability"], ascending=[True, False] - ) + output = output.sort_values(["cd_geoid", "probability"], ascending=[True, False]) # Step 6: Save as gzipped CSV (parquet requires pyarrow) output_path = STORAGE_FOLDER / "block_cd_distributions.csv.gz" diff --git a/policyengine_us_data/storage/calibration_targets/make_block_crosswalk.py b/policyengine_us_data/storage/calibration_targets/make_block_crosswalk.py index 418e725f1..ed0d8cc1a 100644 --- a/policyengine_us_data/storage/calibration_targets/make_block_crosswalk.py +++ b/policyengine_us_data/storage/calibration_targets/make_block_crosswalk.py @@ -60,9 +60,7 @@ def download_state_baf(state_fips: str, state_abbr: str) -> dict: ) # Place (City/CDP) - place_file = ( - f"BlockAssign_ST{state_fips}_{state_abbr}_INCPLACE_CDP.txt" - ) + place_file = f"BlockAssign_ST{state_fips}_{state_abbr}_INCPLACE_CDP.txt" if place_file in z.namelist(): df = pd.read_csv(z.open(place_file), sep="|", dtype=str) results["place"] = df.rename( @@ -168,23 +166,17 @@ def build_block_crosswalk(): # Merge other geographies if "sldl" in bafs: - df = df.merge( - bafs["sldl"], on="block_geoid", how="left" - ) + df = df.merge(bafs["sldl"], on="block_geoid", how="left") else: df["sldl"] = None if "place" in bafs: - df = df.merge( - bafs["place"], on="block_geoid", how="left" - ) + df = df.merge(bafs["place"], on="block_geoid", how="left") else: df["place_fips"] = None if "vtd" in bafs: - df = df.merge( - bafs["vtd"], on="block_geoid", how="left" - ) + df = df.merge(bafs["vtd"], on="block_geoid", how="left") else: df["vtd"] = None diff --git a/policyengine_us_data/storage/calibration_targets/make_county_cd_distributions.py b/policyengine_us_data/storage/calibration_targets/make_county_cd_distributions.py index ba68a5566..2c91f1ca0 100644 --- a/policyengine_us_data/storage/calibration_targets/make_county_cd_distributions.py +++ b/policyengine_us_data/storage/calibration_targets/make_county_cd_distributions.py @@ -126,15 +126,11 @@ def build_county_cd_distributions(): # Create CD geoid in our format: state_fips * 100 + district # Examples: AL-1 = 101, NY-10 = 3610, DC = 1198 - df["cd_geoid"] = df["state_fips"].astype(int) * 100 + df["CD119"].astype( - int - ) + df["cd_geoid"] = df["state_fips"].astype(int) * 100 + df["CD119"].astype(int) # Step 4: Aggregate by (CD, county) print("\nAggregating population by CD and county...") - cd_county_pop = ( - df.groupby(["cd_geoid", "county_fips"])["POP20"].sum().reset_index() - ) + cd_county_pop = df.groupby(["cd_geoid", "county_fips"])["POP20"].sum().reset_index() print(f" Unique CD-county pairs: {len(cd_county_pop):,}") # Step 5: Calculate P(county|CD) @@ -151,9 +147,7 @@ def build_county_cd_distributions(): # Step 6: Map county FIPS to enum names print("\nMapping county FIPS to enum names...") fips_to_enum = build_county_fips_to_enum_mapping() - cd_county_pop["county_name"] = cd_county_pop["county_fips"].map( - fips_to_enum - ) + cd_county_pop["county_name"] = cd_county_pop["county_fips"].map(fips_to_enum) # Check for unmapped counties unmapped = cd_county_pop[cd_county_pop["county_name"].isna()] @@ -177,9 +171,7 @@ def build_county_cd_distributions(): # Step 8: Save CSV output = cd_county_pop[["cd_geoid", "county_name", "probability"]] - output = output.sort_values( - ["cd_geoid", "probability"], ascending=[True, False] - ) + output = output.sort_values(["cd_geoid", "probability"], ascending=[True, False]) output_path = STORAGE_FOLDER / "county_cd_distributions.csv" output.to_csv(output_path, index=False) diff --git a/policyengine_us_data/storage/calibration_targets/make_district_mapping.py b/policyengine_us_data/storage/calibration_targets/make_district_mapping.py index 2b930a2da..bfb4936e8 100644 --- a/policyengine_us_data/storage/calibration_targets/make_district_mapping.py +++ b/policyengine_us_data/storage/calibration_targets/make_district_mapping.py @@ -91,9 +91,7 @@ def fetch_block_to_district_map(congress: int) -> pd.DataFrame: return bef[["GEOID", f"CD{congress}"]] else: - raise ValueError( - f"Congress {congress} is not supported by this function." - ) + raise ValueError(f"Congress {congress} is not supported by this function.") def fetch_block_population(state) -> pd.DataFrame: @@ -145,9 +143,7 @@ def fetch_block_population(state) -> pd.DataFrame: geo_df = pd.DataFrame(geo_records, columns=["LOGRECNO", "GEOID"]) # ---------------- P-file: pull total-population cell ---------------------- - p1_records = [ - (p[4], int(p[5])) for p in map(lambda x: x.split("|"), p1_lines) - ] + p1_records = [(p[4], int(p[5])) for p in map(lambda x: x.split("|"), p1_lines)] p1_df = pd.DataFrame(p1_records, columns=["LOGRECNO", "P0010001"]) # ---------------- Merge & finish ----------------------------------------- diff --git a/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py b/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py index da8b54121..3199a56a2 100644 --- a/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py +++ b/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py @@ -42,13 +42,9 @@ def pull_hardcoded_targets(): "VARIABLE": list(HARD_CODED_TOTALS.keys()), "VALUE": list(HARD_CODED_TOTALS.values()), "IS_COUNT": [0.0] - * len( - HARD_CODED_TOTALS - ), # All values are monetary amounts, not counts + * len(HARD_CODED_TOTALS), # All values are monetary amounts, not counts "BREAKDOWN_VARIABLE": [np.nan] - * len( - HARD_CODED_TOTALS - ), # No breakdown variable for hardcoded targets + * len(HARD_CODED_TOTALS), # No breakdown variable for hardcoded targets "LOWER_BOUND": [np.nan] * len(HARD_CODED_TOTALS), "UPPER_BOUND": [np.nan] * len(HARD_CODED_TOTALS), } diff --git a/policyengine_us_data/storage/calibration_targets/pull_snap_targets.py b/policyengine_us_data/storage/calibration_targets/pull_snap_targets.py index 1830bdb3a..202286e70 100644 --- a/policyengine_us_data/storage/calibration_targets/pull_snap_targets.py +++ b/policyengine_us_data/storage/calibration_targets/pull_snap_targets.py @@ -84,7 +84,9 @@ def extract_usda_snap_data(year=2023): session.headers.update(headers) # Try to visit the main page first to get any necessary cookies - main_page = "https://www.fns.usda.gov/pd/supplemental-nutrition-assistance-program-snap" + main_page = ( + "https://www.fns.usda.gov/pd/supplemental-nutrition-assistance-program-snap" + ) try: session.get(main_page, timeout=30) except: @@ -167,9 +169,7 @@ def extract_usda_snap_data(year=2023): .reset_index(drop=True) ) df_states["GEO_ID"] = "0400000US" + df_states["STATE_FIPS"] - df_states["GEO_NAME"] = "state_" + df_states["State"].map( - STATE_NAME_TO_ABBREV - ) + df_states["GEO_NAME"] = "state_" + df_states["State"].map(STATE_NAME_TO_ABBREV) count_df = df_states[["GEO_ID", "GEO_NAME"]].copy() count_df["VALUE"] = df_states["Households"] diff --git a/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py b/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py index 59050a1b3..ce6d9f887 100644 --- a/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py +++ b/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py @@ -129,26 +129,17 @@ def pull_national_soi_variable( national_df: Optional[pd.DataFrame] = None, ) -> pd.DataFrame: """Download and save national AGI totals.""" - df = pd.read_excel( - "https://www.irs.gov/pub/irs-soi/22in54us.xlsx", skiprows=7 - ) + df = pd.read_excel("https://www.irs.gov/pub/irs-soi/22in54us.xlsx", skiprows=7) assert ( - np.abs( - df.iloc[soi_variable_ident, 1] - - df.iloc[soi_variable_ident, 2:12].sum() - ) + np.abs(df.iloc[soi_variable_ident, 1] - df.iloc[soi_variable_ident, 2:12].sum()) < 100 ), "Row 0 doesn't add up — check the file." agi_values = df.iloc[soi_variable_ident, 2:12].astype(int).to_numpy() - agi_values = np.concatenate( - [agi_values[:8], [agi_values[8] + agi_values[9]]] - ) + agi_values = np.concatenate([agi_values[:8], [agi_values[8] + agi_values[9]]]) - agi_brackets = [ - AGI_STUB_TO_BAND[i] for i in range(1, len(SOI_COLUMNS) + 1) - ] + agi_brackets = [AGI_STUB_TO_BAND[i] for i in range(1, len(SOI_COLUMNS) + 1)] result = pd.DataFrame( { @@ -161,9 +152,7 @@ def pull_national_soi_variable( ) # final column order - result = result[ - ["GEO_ID", "GEO_NAME", "LOWER_BOUND", "UPPER_BOUND", "VALUE"] - ] + result = result[["GEO_ID", "GEO_NAME", "LOWER_BOUND", "UPPER_BOUND", "VALUE"]] result["IS_COUNT"] = int(is_count) result["VARIABLE"] = variable_name @@ -186,9 +175,7 @@ def pull_state_soi_variable( state_df: Optional[pd.DataFrame] = None, ) -> pd.DataFrame: """Download and save state AGI totals.""" - df = pd.read_csv( - "https://www.irs.gov/pub/irs-soi/22in55cmcsv.csv", thousands="," - ) + df = pd.read_csv("https://www.irs.gov/pub/irs-soi/22in55cmcsv.csv", thousands=",") merged = ( df[df["AGI_STUB"].isin([9, 10])] @@ -211,17 +198,11 @@ def pull_state_soi_variable( ["GEO_ID", "GEO_NAME", "agi_bracket", soi_variable_ident], ].rename(columns={soi_variable_ident: "VALUE"}) - result["LOWER_BOUND"] = result["agi_bracket"].map( - lambda b: AGI_BOUNDS[b][0] - ) - result["UPPER_BOUND"] = result["agi_bracket"].map( - lambda b: AGI_BOUNDS[b][1] - ) + result["LOWER_BOUND"] = result["agi_bracket"].map(lambda b: AGI_BOUNDS[b][0]) + result["UPPER_BOUND"] = result["agi_bracket"].map(lambda b: AGI_BOUNDS[b][1]) # final column order - result = result[ - ["GEO_ID", "GEO_NAME", "LOWER_BOUND", "UPPER_BOUND", "VALUE"] - ] + result = result[["GEO_ID", "GEO_NAME", "LOWER_BOUND", "UPPER_BOUND", "VALUE"]] result["IS_COUNT"] = int(is_count) result["VARIABLE"] = variable_name @@ -249,9 +230,7 @@ def pull_district_soi_variable( df = df[df["agi_stub"] != 0] df["STATEFIPS"] = df["STATEFIPS"].astype(int).astype(str).str.zfill(2) - df["CONG_DISTRICT"] = ( - df["CONG_DISTRICT"].astype(int).astype(str).str.zfill(2) - ) + df["CONG_DISTRICT"] = df["CONG_DISTRICT"].astype(int).astype(str).str.zfill(2) if SOI_DISTRICT_TAX_YEAR >= 2024: raise RuntimeError( f"SOI tax year {SOI_DISTRICT_TAX_YEAR} may need " @@ -288,12 +267,8 @@ def pull_district_soi_variable( ] ].rename(columns={soi_variable_ident: "VALUE"}) - result["LOWER_BOUND"] = result["agi_bracket"].map( - lambda b: AGI_BOUNDS[b][0] - ) - result["UPPER_BOUND"] = result["agi_bracket"].map( - lambda b: AGI_BOUNDS[b][1] - ) + result["LOWER_BOUND"] = result["agi_bracket"].map(lambda b: AGI_BOUNDS[b][0]) + result["UPPER_BOUND"] = result["agi_bracket"].map(lambda b: AGI_BOUNDS[b][1]) # if redistrict: # result = apply_redistricting(result, variable_name) @@ -308,25 +283,23 @@ def pull_district_soi_variable( # Check that all GEO_IDs are valid produced_codes = set(result["GEO_ID"]) invalid_codes = produced_codes - valid_district_codes - assert ( - not invalid_codes - ), f"Invalid district codes after redistricting: {invalid_codes}" + assert not invalid_codes, ( + f"Invalid district codes after redistricting: {invalid_codes}" + ) # Check we have exactly 436 districts - assert ( - len(produced_codes) == 436 - ), f"Expected 436 districts after redistricting, got {len(produced_codes)}" + assert len(produced_codes) == 436, ( + f"Expected 436 districts after redistricting, got {len(produced_codes)}" + ) # Check that all GEO_IDs successfully mapped to names missing_names = result[result["GEO_NAME"].isna()]["GEO_ID"].unique() - assert ( - len(missing_names) == 0 - ), f"GEO_IDs without names in ID_TO_NAME mapping: {missing_names}" + assert len(missing_names) == 0, ( + f"GEO_IDs without names in ID_TO_NAME mapping: {missing_names}" + ) # final column order - result = result[ - ["GEO_ID", "GEO_NAME", "LOWER_BOUND", "UPPER_BOUND", "VALUE"] - ] + result = result[["GEO_ID", "GEO_NAME", "LOWER_BOUND", "UPPER_BOUND", "VALUE"]] result["IS_COUNT"] = int(is_count) result["VARIABLE"] = variable_name @@ -457,15 +430,11 @@ def combine_geography_levels(districts: Optional[bool] = False) -> None: ) # Get state totals indexed by STATEFIPS - state_totals = state.loc[state_mask].set_index("STATEFIPS")[ - "VALUE" - ] + state_totals = state.loc[state_mask].set_index("STATEFIPS")["VALUE"] # Get district totals grouped by STATEFIPS district_totals = ( - district.loc[district_mask] - .groupby("STATEFIPS")["VALUE"] - .sum() + district.loc[district_mask].groupby("STATEFIPS")["VALUE"].sum() ) # Check and rescale districts for each state @@ -480,12 +449,8 @@ def combine_geography_levels(districts: Optional[bool] = False) -> None: f"Districts' sum does not match {fips} state total for {variable}/{count_type} " f"in bracket [{lower}, {upper}]. Rescaling district targets." ) - rescale_mask = district_mask & ( - district["STATEFIPS"] == fips - ) - district.loc[rescale_mask, "VALUE"] *= ( - s_total / d_total - ) + rescale_mask = district_mask & (district["STATEFIPS"] == fips) + district.loc[rescale_mask, "VALUE"] *= s_total / d_total # Combine all data combined = pd.concat( diff --git a/policyengine_us_data/storage/upload_completed_datasets.py b/policyengine_us_data/storage/upload_completed_datasets.py index 8f00b3753..7fcf59581 100644 --- a/policyengine_us_data/storage/upload_completed_datasets.py +++ b/policyengine_us_data/storage/upload_completed_datasets.py @@ -94,14 +94,11 @@ def _check_group_has_data(f, name): for group_name in REQUIRED_GROUPS: if not _check_group_has_data(f, group_name): errors.append( - f"Required group '{group_name}' missing " - f"or empty in H5 file." + f"Required group '{group_name}' missing or empty in H5 file." ) # At least one income group must have data - has_income = any( - _check_group_has_data(f, g) for g in INCOME_GROUPS - ) + has_income = any(_check_group_has_data(f, g) for g in INCOME_GROUPS) if not has_income: errors.append( f"No income data found. Need at least one of " diff --git a/policyengine_us_data/tests/test_calibration/test_build_matrix_masking.py b/policyengine_us_data/tests/test_calibration/test_build_matrix_masking.py index 8db56ddcb..853c6d04b 100644 --- a/policyengine_us_data/tests/test_calibration/test_build_matrix_masking.py +++ b/policyengine_us_data/tests/test_calibration/test_build_matrix_masking.py @@ -43,9 +43,7 @@ def matrix_result(): sim = Microsimulation(dataset=DATASET_PATH) n_records = sim.calculate("household_id").values.shape[0] - geography = assign_random_geography( - n_records, n_clones=N_CLONES, seed=SEED - ) + geography = assign_random_geography(n_records, n_clones=N_CLONES, seed=SEED) builder = UnifiedMatrixBuilder( db_uri=DB_URI, time_period=2024, @@ -124,8 +122,7 @@ def test_clone_visible_only_to_own_state(self, matrix_result): if state_0 == state_1: pytest.skip( - "Both clones landed in the same state — " - "cannot test cross-state masking" + "Both clones landed in the same state — cannot test cross-state masking" ) state_targets = targets_df[targets_df["geo_level"] == "state"] @@ -164,11 +161,7 @@ def test_clone_visible_only_to_own_cd(self, matrix_result): vals_0 = X_csc[:, col_0].toarray().ravel() same_state_other_cd = district_targets[ - ( - district_targets["geographic_id"].apply( - lambda g: g.startswith(state_0) - ) - ) + (district_targets["geographic_id"].apply(lambda g: g.startswith(state_0))) & (district_targets["geographic_id"] != cd_0) ] @@ -198,10 +191,7 @@ def test_clone_nonzero_for_own_cd(self, matrix_result): X_csc = X.tocsc() vals_0 = X_csc[:, col_0].toarray().ravel() - any_nonzero = any( - vals_0[row.name] != 0 for _, row in own_cd_targets.iterrows() - ) + any_nonzero = any(vals_0[row.name] != 0 for _, row in own_cd_targets.iterrows()) assert any_nonzero, ( - f"Clone 0 should have at least one non-zero entry " - f"for its own CD {cd_0}" + f"Clone 0 should have at least one non-zero entry for its own CD {cd_0}" ) diff --git a/policyengine_us_data/tests/test_calibration/test_clone_and_assign.py b/policyengine_us_data/tests/test_calibration/test_clone_and_assign.py index 0ba330549..c13c6a89b 100644 --- a/policyengine_us_data/tests/test_calibration/test_clone_and_assign.py +++ b/policyengine_us_data/tests/test_calibration/test_clone_and_assign.py @@ -66,13 +66,10 @@ def test_loads_and_normalizes(self, tmp_path): csv_path = tmp_path / "block_cd_distributions.csv.gz" MOCK_BLOCKS.to_csv(csv_path, index=False, compression="gzip") with patch( - "policyengine_us_data.calibration" - ".clone_and_assign.STORAGE_FOLDER", + "policyengine_us_data.calibration.clone_and_assign.STORAGE_FOLDER", tmp_path, ): - blocks, cds, states, probs = ( - load_global_block_distribution.__wrapped__() - ) + blocks, cds, states, probs = load_global_block_distribution.__wrapped__() assert len(blocks) == 9 np.testing.assert_almost_equal(probs.sum(), 1.0) @@ -80,8 +77,7 @@ def test_state_fips_extracted(self, tmp_path): csv_path = tmp_path / "block_cd_distributions.csv.gz" MOCK_BLOCKS.to_csv(csv_path, index=False, compression="gzip") with patch( - "policyengine_us_data.calibration" - ".clone_and_assign.STORAGE_FOLDER", + "policyengine_us_data.calibration.clone_and_assign.STORAGE_FOLDER", tmp_path, ): _, _, states, _ = load_global_block_distribution.__wrapped__() @@ -137,8 +133,7 @@ def test_missing_file_raises(self, tmp_path): fake = tmp_path / "nonexistent" fake.mkdir() with patch( - "policyengine_us_data.calibration" - ".clone_and_assign.STORAGE_FOLDER", + "policyengine_us_data.calibration.clone_and_assign.STORAGE_FOLDER", fake, ): with pytest.raises(FileNotFoundError): diff --git a/policyengine_us_data/tests/test_calibration/test_puf_impute.py b/policyengine_us_data/tests/test_calibration/test_puf_impute.py index 1bce3cf70..d803486ee 100644 --- a/policyengine_us_data/tests/test_calibration/test_puf_impute.py +++ b/policyengine_us_data/tests/test_calibration/test_puf_impute.py @@ -150,9 +150,7 @@ def test_reduces_to_target(self): rng.uniform(500_000, 5_000_000, size=250), ] ) - idx = _stratified_subsample_index( - income, target_n=10_000, top_pct=99.5 - ) + idx = _stratified_subsample_index(income, target_n=10_000, top_pct=99.5) assert len(idx) == 10_000 def test_preserves_top_earners(self): @@ -166,9 +164,7 @@ def test_preserves_top_earners(self): threshold = np.percentile(income, 99.5) n_top = (income >= threshold).sum() - idx = _stratified_subsample_index( - income, target_n=10_000, top_pct=99.5 - ) + idx = _stratified_subsample_index(income, target_n=10_000, top_pct=99.5) selected_income = income[idx] n_top_selected = (selected_income >= threshold).sum() assert n_top_selected == n_top diff --git a/policyengine_us_data/tests/test_calibration/test_retirement_imputation.py b/policyengine_us_data/tests/test_calibration/test_retirement_imputation.py index ce261a02b..cd4b45245 100644 --- a/policyengine_us_data/tests/test_calibration/test_retirement_imputation.py +++ b/policyengine_us_data/tests/test_calibration/test_retirement_imputation.py @@ -54,14 +54,8 @@ def _make_mock_data(n_persons=20, n_households=5, time_period=2024): "person_household_id": {time_period: hh_ids_person}, "person_tax_unit_id": {time_period: hh_ids_person.copy()}, "person_spm_unit_id": {time_period: hh_ids_person.copy()}, - "age": { - time_period: rng.integers(18, 80, size=n_persons).astype( - np.float32 - ) - }, - "is_male": { - time_period: rng.integers(0, 2, size=n_persons).astype(np.float32) - }, + "age": {time_period: rng.integers(18, 80, size=n_persons).astype(np.float32)}, + "is_male": {time_period: rng.integers(0, 2, size=n_persons).astype(np.float32)}, "household_weight": {time_period: np.ones(n_households) * 1000}, "employment_income": { time_period: rng.uniform(0, 100_000, n_persons).astype(np.float32) @@ -71,9 +65,7 @@ def _make_mock_data(n_persons=20, n_households=5, time_period=2024): }, } for var in CPS_RETIREMENT_VARIABLES: - data[var] = { - time_period: rng.uniform(0, 5000, n_persons).astype(np.float32) - } + data[var] = {time_period: rng.uniform(0, 5000, n_persons).astype(np.float32)} return data @@ -139,9 +131,9 @@ class TestConstants: def test_retirement_vars_not_in_imputed(self): """Retirement vars must NOT be in IMPUTED_VARIABLES.""" for var in CPS_RETIREMENT_VARIABLES: - assert ( - var not in IMPUTED_VARIABLES - ), f"{var} should not be in IMPUTED_VARIABLES" + assert var not in IMPUTED_VARIABLES, ( + f"{var} should not be in IMPUTED_VARIABLES" + ) def test_retirement_vars_not_in_overridden(self): for var in CPS_RETIREMENT_VARIABLES: @@ -171,14 +163,12 @@ def test_retirement_predictors_include_demographics(self): def test_income_predictors_in_imputed_variables(self): """All income predictors must be available from PUF QRF.""" for var in RETIREMENT_INCOME_PREDICTORS: - assert ( - var in IMPUTED_VARIABLES - ), f"{var} not in IMPUTED_VARIABLES — won't be in puf_imputations" + assert var in IMPUTED_VARIABLES, ( + f"{var} not in IMPUTED_VARIABLES — won't be in puf_imputations" + ) def test_predictors_are_combined_lists(self): - expected = ( - RETIREMENT_DEMOGRAPHIC_PREDICTORS + RETIREMENT_INCOME_PREDICTORS - ) + expected = RETIREMENT_DEMOGRAPHIC_PREDICTORS + RETIREMENT_INCOME_PREDICTORS assert RETIREMENT_PREDICTORS == expected @@ -270,18 +260,12 @@ def _setup(self): self.puf_imputations = { "employment_income": emp, "self_employment_income": se, - "taxable_interest_income": rng.uniform(0, 5_000, self.n).astype( - np.float32 - ), + "taxable_interest_income": rng.uniform(0, 5_000, self.n).astype(np.float32), "qualified_dividend_income": rng.uniform(0, 3_000, self.n).astype( np.float32 ), - "taxable_pension_income": rng.uniform(0, 20_000, self.n).astype( - np.float32 - ), - "social_security": rng.uniform(0, 15_000, self.n).astype( - np.float32 - ), + "taxable_pension_income": rng.uniform(0, 20_000, self.n).astype(np.float32), + "social_security": rng.uniform(0, 15_000, self.n).astype(np.float32), } self.cps_df = _make_cps_df(self.n, rng) @@ -319,10 +303,7 @@ def _uniform_preds(self, value): def _random_preds(self, low, high, seed=99): rng = np.random.default_rng(seed) return pd.DataFrame( - { - var: rng.uniform(low, high, self.n) - for var in CPS_RETIREMENT_VARIABLES - } + {var: rng.uniform(low, high, self.n) for var in CPS_RETIREMENT_VARIABLES} ) def test_returns_all_retirement_vars(self): @@ -367,27 +348,23 @@ def test_401k_zero_when_no_wages(self): "traditional_401k_contributions", "roth_401k_contributions", ): - assert np.all( - result[var][zero_wage] == 0 - ), f"{var} should be 0 when employment_income is 0" + assert np.all(result[var][zero_wage] == 0), ( + f"{var} should be 0 when employment_income is 0" + ) def test_se_pension_zero_when_no_se_income(self): result = self._call_with_mocks(self._uniform_preds(5_000.0)) zero_se = self.puf_imputations["self_employment_income"] == 0 assert zero_se.sum() == 20 - assert np.all( - result["self_employed_pension_contributions"][zero_se] == 0 - ) + assert np.all(result["self_employed_pension_contributions"][zero_se] == 0) def test_catch_up_age_threshold(self): """Records age >= 50 get higher caps than younger.""" - self.cps_df["age"] = np.concatenate( - [np.full(25, 30.0), np.full(25, 55.0)] - ) + self.cps_df["age"] = np.concatenate([np.full(25, 30.0), np.full(25, 55.0)]) # All have positive income - self.puf_imputations["employment_income"] = np.full( - self.n, 100_000.0 - ).astype(np.float32) + self.puf_imputations["employment_income"] = np.full(self.n, 100_000.0).astype( + np.float32 + ) lim = _get_retirement_limits(self.time_period) val = float(lim["401k"]) + 1000 # 24000 @@ -404,9 +381,7 @@ def test_catch_up_age_threshold(self): def test_ira_catch_up_threshold(self): """IRA catch-up also works for age >= 50.""" - self.cps_df["age"] = np.concatenate( - [np.full(25, 30.0), np.full(25, 55.0)] - ) + self.cps_df["age"] = np.concatenate([np.full(25, 30.0), np.full(25, 55.0)]) lim = _get_retirement_limits(self.time_period) val = float(lim["ira"]) + 500 # 7500 @@ -432,9 +407,7 @@ def test_401k_nonzero_for_positive_wages(self): def test_se_pension_nonzero_for_positive_se(self): result = self._call_with_mocks(self._uniform_preds(5_000.0)) pos_se = self.puf_imputations["self_employment_income"] > 0 - assert np.all( - result["self_employed_pension_contributions"][pos_se] > 0 - ) + assert np.all(result["self_employed_pension_contributions"][pos_se] > 0) def test_se_pension_capped_at_rate_times_income(self): """SE pension should not exceed 25% of SE income.""" @@ -460,9 +433,7 @@ def test_qrf_failure_returns_zeros(self): # Make a QRF that crashes on fit mock_qrf_cls = MagicMock() - mock_qrf_cls.return_value.fit.side_effect = RuntimeError( - "QRF exploded" - ) + mock_qrf_cls.return_value.fit.side_effect = RuntimeError("QRF exploded") qrf_mod = sys.modules["microimpute.models.qrf"] old_qrf = getattr(qrf_mod, "QRF", None) @@ -488,9 +459,7 @@ def test_training_data_failure_returns_zeros(self): import sys mock_sim = MagicMock() - mock_sim.calculate_dataframe.side_effect = ValueError( - "missing variable" - ) + mock_sim.calculate_dataframe.side_effect = ValueError("missing variable") qrf_mod = sys.modules["microimpute.models.qrf"] old_qrf = getattr(qrf_mod, "QRF", None) @@ -540,9 +509,7 @@ def test_retirement_vars_use_imputed_when_available(self): state_fips = np.array([1, 2, 36, 6, 48]) n = 20 - fake_retirement = { - var: np.full(n, 999.0) for var in CPS_RETIREMENT_VARIABLES - } + fake_retirement = {var: np.full(n, 999.0) for var in CPS_RETIREMENT_VARIABLES} with ( patch( @@ -551,16 +518,14 @@ def test_retirement_vars_use_imputed_when_available(self): return_value=fake_retirement, ), patch( - "policyengine_us_data.calibration.puf_impute" - "._run_qrf_imputation", + "policyengine_us_data.calibration.puf_impute._run_qrf_imputation", return_value=( {v: np.zeros(n) for v in IMPUTED_VARIABLES}, {}, ), ), patch( - "policyengine_us_data.calibration.puf_impute" - "._impute_weeks_unemployed", + "policyengine_us_data.calibration.puf_impute._impute_weeks_unemployed", return_value=np.zeros(n), ), patch(_MSIM_PATCH), @@ -585,12 +550,8 @@ def test_cps_half_unchanged_with_imputation(self): state_fips = np.array([1, 2, 36, 6, 48]) n = 20 - originals = { - var: data[var][2024].copy() for var in CPS_RETIREMENT_VARIABLES - } - fake_retirement = { - var: np.zeros(n) for var in CPS_RETIREMENT_VARIABLES - } + originals = {var: data[var][2024].copy() for var in CPS_RETIREMENT_VARIABLES} + fake_retirement = {var: np.zeros(n) for var in CPS_RETIREMENT_VARIABLES} with ( patch( @@ -599,16 +560,14 @@ def test_cps_half_unchanged_with_imputation(self): return_value=fake_retirement, ), patch( - "policyengine_us_data.calibration.puf_impute" - "._run_qrf_imputation", + "policyengine_us_data.calibration.puf_impute._run_qrf_imputation", return_value=( {v: np.zeros(n) for v in IMPUTED_VARIABLES}, {}, ), ), patch( - "policyengine_us_data.calibration.puf_impute" - "._impute_weeks_unemployed", + "policyengine_us_data.calibration.puf_impute._impute_weeks_unemployed", return_value=np.zeros(n), ), patch(_MSIM_PATCH), @@ -623,9 +582,7 @@ def test_cps_half_unchanged_with_imputation(self): ) for var in CPS_RETIREMENT_VARIABLES: - np.testing.assert_array_equal( - result[var][2024][:n], originals[var] - ) + np.testing.assert_array_equal(result[var][2024][:n], originals[var]) def test_puf_half_gets_zero_retirement_for_zero_imputed(self): """When imputation returns zeros, PUF half should be zero.""" @@ -633,9 +590,7 @@ def test_puf_half_gets_zero_retirement_for_zero_imputed(self): state_fips = np.array([1, 2, 36, 6, 48]) n = 20 - fake_retirement = { - var: np.zeros(n) for var in CPS_RETIREMENT_VARIABLES - } + fake_retirement = {var: np.zeros(n) for var in CPS_RETIREMENT_VARIABLES} with ( patch( @@ -644,16 +599,14 @@ def test_puf_half_gets_zero_retirement_for_zero_imputed(self): return_value=fake_retirement, ), patch( - "policyengine_us_data.calibration.puf_impute" - "._run_qrf_imputation", + "policyengine_us_data.calibration.puf_impute._run_qrf_imputation", return_value=( {v: np.zeros(n) for v in IMPUTED_VARIABLES}, {}, ), ), patch( - "policyengine_us_data.calibration.puf_impute" - "._impute_weeks_unemployed", + "policyengine_us_data.calibration.puf_impute._impute_weeks_unemployed", return_value=np.zeros(n), ), patch(_MSIM_PATCH), @@ -707,6 +660,6 @@ def test_401k_ira_from_policyengine_us(self): ours = _get_retirement_limits(year) pe = pe_limits(year) for key in ["401k", "401k_catch_up", "ira", "ira_catch_up"]: - assert ( - ours[key] == pe[key] - ), f"Year {year} key {key}: {ours[key]} != {pe[key]}" + assert ours[key] == pe[key], ( + f"Year {year} key {key}: {ours[key]} != {pe[key]}" + ) diff --git a/policyengine_us_data/tests/test_calibration/test_source_impute.py b/policyengine_us_data/tests/test_calibration/test_source_impute.py index c69ec653a..517a559ef 100644 --- a/policyengine_us_data/tests/test_calibration/test_source_impute.py +++ b/policyengine_us_data/tests/test_calibration/test_source_impute.py @@ -71,9 +71,7 @@ def test_scf_variables_defined(self): def test_all_source_variables_defined(self): expected = ( - ACS_IMPUTED_VARIABLES - + SIPP_IMPUTED_VARIABLES - + SCF_IMPUTED_VARIABLES + ACS_IMPUTED_VARIABLES + SIPP_IMPUTED_VARIABLES + SCF_IMPUTED_VARIABLES ) assert ALL_SOURCE_VARIABLES == expected diff --git a/policyengine_us_data/tests/test_calibration/test_unified_matrix_builder.py b/policyengine_us_data/tests/test_calibration/test_unified_matrix_builder.py index ea2d49c5c..938f8a92f 100644 --- a/policyengine_us_data/tests/test_calibration/test_unified_matrix_builder.py +++ b/policyengine_us_data/tests/test_calibration/test_unified_matrix_builder.py @@ -130,11 +130,7 @@ def _insert_aca_ptc_data(engine): ] for tid, sid, var, val, period in targets: conn.execute( - text( - "INSERT INTO targets " - "VALUES (:tid, :sid, :var, :val, " - ":period, 1)" - ), + text("INSERT INTO targets VALUES (:tid, :sid, :var, :val, :period, 1)"), { "tid": tid, "sid": sid, @@ -193,9 +189,7 @@ def test_geographic_id_populated(self): df = b._query_targets({"domain_variables": ["aca_ptc"]}) national = df[df["geo_level"] == "national"] self.assertTrue((national["geographic_id"] == "US").all()) - state_ca = df[ - (df["geo_level"] == "state") & (df["geographic_id"] == "6") - ] + state_ca = df[(df["geo_level"] == "state") & (df["geographic_id"] == "6")] self.assertGreater(len(state_ca), 0) @@ -227,9 +221,9 @@ def _get_targets_with_uprating(self, cpi_factor=1.1, pop_factor=1.02): } df["original_value"] = df["value"].copy() df["uprating_factor"] = df.apply( - lambda row: b._get_uprating_info( - row["variable"], row["period"], factors - )[0], + lambda row: b._get_uprating_info(row["variable"], row["period"], factors)[ + 0 + ], axis=1, ) df["value"] = df["original_value"] * df["uprating_factor"] @@ -254,9 +248,7 @@ def test_cd_sums_match_uprated_state(self): & (result["geo_level"] == "district") & ( result["geographic_id"].apply( - lambda g, s=sf: ( - int(g) // 100 == s if g.isdigit() else False - ) + lambda g, s=sf: int(g) // 100 == s if g.isdigit() else False ) ) ] @@ -290,8 +282,7 @@ def test_hif_is_one_when_cds_sum_to_state(self): b, df, factors = self._get_targets_with_uprating(cpi_factor=1.15) result = b._apply_hierarchical_uprating(df, ["aca_ptc"], factors) cd_aca = result[ - (result["variable"] == "aca_ptc") - & (result["geo_level"] == "district") + (result["variable"] == "aca_ptc") & (result["geo_level"] == "district") ] for _, row in cd_aca.iterrows(): self.assertAlmostEqual(row["hif"], 1.0, places=6) diff --git a/policyengine_us_data/tests/test_constraint_validation.py b/policyengine_us_data/tests/test_constraint_validation.py index 29920475f..e494f5c92 100644 --- a/policyengine_us_data/tests/test_constraint_validation.py +++ b/policyengine_us_data/tests/test_constraint_validation.py @@ -138,9 +138,7 @@ def test_conflicting_lower_bounds(self): Constraint(variable="age", operation=">", value="20"), Constraint(variable="age", operation=">=", value="25"), ] - with pytest.raises( - ConstraintValidationError, match="conflicting lower bounds" - ): + with pytest.raises(ConstraintValidationError, match="conflicting lower bounds"): ensure_consistent_constraint_set(constraints) def test_conflicting_upper_bounds(self): @@ -149,9 +147,7 @@ def test_conflicting_upper_bounds(self): Constraint(variable="age", operation="<", value="50"), Constraint(variable="age", operation="<=", value="45"), ] - with pytest.raises( - ConstraintValidationError, match="conflicting upper bounds" - ): + with pytest.raises(ConstraintValidationError, match="conflicting upper bounds"): ensure_consistent_constraint_set(constraints) @@ -193,9 +189,7 @@ class TestNonNumericValues: def test_string_equality_valid(self): """medicaid_enrolled == 'True' should pass.""" constraints = [ - Constraint( - variable="medicaid_enrolled", operation="==", value="True" - ), + Constraint(variable="medicaid_enrolled", operation="==", value="True"), ] ensure_consistent_constraint_set(constraints) # No exception diff --git a/policyengine_us_data/tests/test_database.py b/policyengine_us_data/tests/test_database.py index c9cf14c7c..e0e329e53 100644 --- a/policyengine_us_data/tests/test_database.py +++ b/policyengine_us_data/tests/test_database.py @@ -14,7 +14,7 @@ @pytest.fixture def engine(tmp_path): - db_uri = f"sqlite:///{tmp_path/'test.db'}" + db_uri = f"sqlite:///{tmp_path / 'test.db'}" return create_database(db_uri) diff --git a/policyengine_us_data/tests/test_database_build.py b/policyengine_us_data/tests/test_database_build.py index 3c0e4fb3f..87a6ce082 100644 --- a/policyengine_us_data/tests/test_database_build.py +++ b/policyengine_us_data/tests/test_database_build.py @@ -23,8 +23,7 @@ # HuggingFace URL for the stratified CPS dataset. # ETL scripts use this only to derive the time period (2024). HF_DATASET = ( - "hf://policyengine/policyengine-us-data" - "/calibration/stratified_extended_cps.h5" + "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" ) # Scripts run in the same order as `make database` in the Makefile. @@ -80,9 +79,7 @@ def built_db(): ) if errors: - pytest.fail( - f"{len(errors)} ETL script(s) failed:\n" + "\n\n".join(errors) - ) + pytest.fail(f"{len(errors)} ETL script(s) failed:\n" + "\n\n".join(errors)) assert DB_PATH.exists(), "policy_data.db was not created" return DB_PATH @@ -99,9 +96,7 @@ def test_expected_tables_exist(built_db): conn = sqlite3.connect(str(built_db)) tables = { row[0] - for row in conn.execute( - "SELECT name FROM sqlite_master WHERE type='table'" - ) + for row in conn.execute("SELECT name FROM sqlite_master WHERE type='table'") } conn.close() @@ -126,8 +121,7 @@ def test_national_targets_loaded(built_db): variables = {r[0] for r in rows} for expected in ["snap", "social_security", "ssi"]: assert expected in variables, ( - f"National target '{expected}' missing. " - f"Found: {sorted(variables)}" + f"National target '{expected}' missing. Found: {sorted(variables)}" ) @@ -153,8 +147,7 @@ def test_state_income_tax_targets(built_db): ca_val = state_totals.get("06") or state_totals.get("6") assert ca_val is not None, "California (FIPS 06) target missing" assert ca_val > 100e9, ( - f"California income tax should be > $100B, " - f"got ${ca_val / 1e9:.1f}B" + f"California income tax should be > $100B, got ${ca_val / 1e9:.1f}B" ) @@ -176,9 +169,7 @@ def test_all_target_variables_exist_in_policyengine(built_db): from policyengine_us.system import system conn = sqlite3.connect(str(built_db)) - variables = { - r[0] for r in conn.execute("SELECT DISTINCT variable FROM targets") - } + variables = {r[0] for r in conn.execute("SELECT DISTINCT variable FROM targets")} conn.close() missing = [v for v in variables if v not in system.variables] diff --git a/policyengine_us_data/tests/test_datasets/test_county_fips.py b/policyengine_us_data/tests/test_datasets/test_county_fips.py index d692cf559..ac2eb9faf 100644 --- a/policyengine_us_data/tests/test_datasets/test_county_fips.py +++ b/policyengine_us_data/tests/test_datasets/test_county_fips.py @@ -48,9 +48,7 @@ def mock_upload_to_hf(): def mock_local_folder(): """Mock the LOCAL_FOLDER""" mock_path = MagicMock() - with patch( - "policyengine_us_data.geography.county_fips.LOCAL_FOLDER", mock_path - ): + with patch("policyengine_us_data.geography.county_fips.LOCAL_FOLDER", mock_path): yield mock_path @@ -104,7 +102,6 @@ def test_download_failure(): patch("requests.get", return_value=failed_response), pytest.raises(ValueError) as excinfo, ): - # Run the function, expect ValueError generate_county_fips_2020_dataset() @@ -180,6 +177,4 @@ def test_huggingface_upload(mock_upload_to_hf, mock_to_csv, mock_requests_get): assert call_kwargs["repo_file_path"] == "county_fips_2020.csv.gz" # Verify that the first parameter is a BytesIO instance - assert isinstance( - mock_upload_to_hf.call_args[1]["local_file_path"], BytesIO - ) + assert isinstance(mock_upload_to_hf.call_args[1]["local_file_path"], BytesIO) diff --git a/policyengine_us_data/tests/test_datasets/test_cps.py b/policyengine_us_data/tests/test_datasets/test_cps.py index bbfba73bd..f03469393 100644 --- a/policyengine_us_data/tests/test_datasets/test_cps.py +++ b/policyengine_us_data/tests/test_datasets/test_cps.py @@ -13,18 +13,11 @@ def test_cps_has_auto_loan_interest(): RELATIVE_TOLERANCE = 0.4 assert ( - abs( - sim.calculate("auto_loan_interest").sum() - / AUTO_LOAN_INTEREST_TARGET - - 1 - ) + abs(sim.calculate("auto_loan_interest").sum() / AUTO_LOAN_INTEREST_TARGET - 1) < RELATIVE_TOLERANCE ) assert ( - abs( - sim.calculate("auto_loan_balance").sum() / AUTO_LOAN_BALANCE_TARGET - - 1 - ) + abs(sim.calculate("auto_loan_balance").sum() / AUTO_LOAN_BALANCE_TARGET - 1) < RELATIVE_TOLERANCE ) @@ -38,11 +31,7 @@ def test_cps_has_fsla_overtime_premium(): OVERTIME_PREMIUM_TARGET = 70e9 RELATIVE_TOLERANCE = 0.2 assert ( - abs( - sim.calculate("fsla_overtime_premium").sum() - / OVERTIME_PREMIUM_TARGET - - 1 - ) + abs(sim.calculate("fsla_overtime_premium").sum() / OVERTIME_PREMIUM_TARGET - 1) < RELATIVE_TOLERANCE ) diff --git a/policyengine_us_data/tests/test_datasets/test_dataset_sanity.py b/policyengine_us_data/tests/test_datasets/test_dataset_sanity.py index 4aeb13e6f..4a2d17f58 100644 --- a/policyengine_us_data/tests/test_datasets/test_dataset_sanity.py +++ b/policyengine_us_data/tests/test_datasets/test_dataset_sanity.py @@ -41,27 +41,23 @@ def test_ecps_employment_income_positive(ecps_sim): def test_ecps_self_employment_income_positive(ecps_sim): total = ecps_sim.calculate("self_employment_income").sum() - assert ( - total > 50e9 - ), f"self_employment_income sum is {total:.2e}, expected > 50B." + assert total > 50e9, f"self_employment_income sum is {total:.2e}, expected > 50B." def test_ecps_household_count(ecps_sim): """Household count should be roughly 130-160M.""" total_hh = ecps_sim.calculate("household_weight").values.sum() - assert ( - 100e6 < total_hh < 200e6 - ), f"Total households = {total_hh:.2e}, expected 100M-200M." + assert 100e6 < total_hh < 200e6, ( + f"Total households = {total_hh:.2e}, expected 100M-200M." + ) def test_ecps_person_count(ecps_sim): """Weighted person count should be roughly 330M.""" - total_people = ecps_sim.calculate( - "household_weight", map_to="person" - ).values.sum() - assert ( - 250e6 < total_people < 400e6 - ), f"Total people = {total_people:.2e}, expected 250M-400M." + total_people = ecps_sim.calculate("household_weight", map_to="person").values.sum() + assert 250e6 < total_people < 400e6, ( + f"Total people = {total_people:.2e}, expected 250M-400M." + ) def test_ecps_poverty_rate_reasonable(ecps_sim): @@ -85,7 +81,7 @@ def test_ecps_mean_employment_income_reasonable(ecps_sim): income = ecps_sim.calculate("employment_income", map_to="person") mean = income.mean() assert 15_000 < mean < 80_000, ( - f"Mean employment income = ${mean:,.0f}, " "expected $15k-$80k." + f"Mean employment income = ${mean:,.0f}, expected $15k-$80k." ) @@ -94,9 +90,7 @@ def test_ecps_mean_employment_income_reasonable(ecps_sim): def test_cps_employment_income_positive(cps_sim): total = cps_sim.calculate("employment_income").sum() - assert total > 5e12, ( - f"CPS employment_income sum is {total:.2e}, " "expected > 5T." - ) + assert total > 5e12, f"CPS employment_income sum is {total:.2e}, expected > 5T." def test_cps_household_count(cps_sim): @@ -122,24 +116,20 @@ def sparse_sim(): def test_sparse_employment_income_positive(sparse_sim): """Sparse dataset employment income must be in the trillions.""" total = sparse_sim.calculate("employment_income").sum() - assert ( - total > 5e12 - ), f"Sparse employment_income sum is {total:.2e}, expected > 5T." + assert total > 5e12, f"Sparse employment_income sum is {total:.2e}, expected > 5T." def test_sparse_household_count(sparse_sim): total_hh = sparse_sim.calculate("household_weight").values.sum() - assert ( - 100e6 < total_hh < 200e6 - ), f"Sparse total households = {total_hh:.2e}, expected 100M-200M." + assert 100e6 < total_hh < 200e6, ( + f"Sparse total households = {total_hh:.2e}, expected 100M-200M." + ) def test_sparse_poverty_rate_reasonable(sparse_sim): in_poverty = sparse_sim.calculate("person_in_poverty", map_to="person") rate = in_poverty.mean() - assert ( - 0.05 < rate < 0.25 - ), f"Sparse poverty rate = {rate:.1%}, expected 5-25%." + assert 0.05 < rate < 0.25, f"Sparse poverty rate = {rate:.1%}, expected 5-25%." # ── File size checks ─────────────────────────────────────────── @@ -153,6 +143,6 @@ def test_ecps_file_size(): if not path.exists(): pytest.skip("enhanced_cps_2024.h5 not found") size_mb = path.stat().st_size / (1024 * 1024) - assert ( - size_mb > 100 - ), f"enhanced_cps_2024.h5 is only {size_mb:.1f}MB, expected >100MB" + assert size_mb > 100, ( + f"enhanced_cps_2024.h5 is only {size_mb:.1f}MB, expected >100MB" + ) diff --git a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py index b3edbc9e3..298de5a4a 100644 --- a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py +++ b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py @@ -50,10 +50,10 @@ def test_ecps_replicates_jct_tax_expenditures(): & (calibration_log["epoch"] == calibration_log["epoch"].max()) ] - assert ( - jct_rows.rel_abs_error.max() < 0.5 - ), "JCT tax expenditure targets not met (see the calibration log for details). Max relative error: {:.2%}".format( - jct_rows.rel_abs_error.max() + assert jct_rows.rel_abs_error.max() < 0.5, ( + "JCT tax expenditure targets not met (see the calibration log for details). Max relative error: {:.2%}".format( + jct_rows.rel_abs_error.max() + ) ) @@ -71,9 +71,7 @@ def deprecated_test_ecps_replicates_jct_tax_expenditures_full(): } baseline = Microsimulation(dataset=EnhancedCPS_2024) - income_tax_b = baseline.calculate( - "income_tax", period=2024, map_to="household" - ) + income_tax_b = baseline.calculate("income_tax", period=2024, map_to="household") for deduction, target in EXPENDITURE_TARGETS.items(): # Create reform that neutralizes the deduction @@ -82,12 +80,8 @@ def apply(self): self.neutralize_variable(deduction) # Run reform simulation - reformed = Microsimulation( - reform=RepealDeduction, dataset=EnhancedCPS_2024 - ) - income_tax_r = reformed.calculate( - "income_tax", period=2024, map_to="household" - ) + reformed = Microsimulation(reform=RepealDeduction, dataset=EnhancedCPS_2024) + income_tax_r = reformed.calculate("income_tax", period=2024, map_to="household") # Calculate tax expenditure tax_expenditure = (income_tax_r - income_tax_b).sum() @@ -95,7 +89,7 @@ def apply(self): TOLERANCE = 0.4 print( - f"{deduction} tax expenditure {tax_expenditure/1e9:.1f}bn differs from target {target/1e9:.1f}bn by {pct_error:.2%}" + f"{deduction} tax expenditure {tax_expenditure / 1e9:.1f}bn differs from target {target / 1e9:.1f}bn by {pct_error:.2%}" ) assert pct_error < TOLERANCE, deduction @@ -137,9 +131,9 @@ def test_undocumented_matches_ssn_none(): # 1. Per-person equivalence mismatches = np.where(ssn_type_none_mask != undocumented_mask)[0] - assert ( - mismatches.size == 0 - ), f"{mismatches.size} mismatches between 'NONE' SSN and 'UNDOCUMENTED' status" + assert mismatches.size == 0, ( + f"{mismatches.size} mismatches between 'NONE' SSN and 'UNDOCUMENTED' status" + ) # 2. Optional aggregate sanity-check count = undocumented_mask.sum() @@ -164,9 +158,7 @@ def test_aca_calibration(): # Monthly to yearly targets["spending"] = targets["spending"] * 12 # Adjust to match national target - targets["spending"] = targets["spending"] * ( - 98e9 / targets["spending"].sum() - ) + targets["spending"] = targets["spending"] * (98e9 / targets["spending"].sum()) sim = Microsimulation(dataset=EnhancedCPS_2024) state_code_hh = sim.calculate("state_code", map_to="household").values @@ -181,17 +173,15 @@ def test_aca_calibration(): pct_error = abs(simulated - target_spending) / target_spending print( - f"{state}: simulated ${simulated/1e9:.2f} bn " - f"target ${target_spending/1e9:.2f} bn " + f"{state}: simulated ${simulated / 1e9:.2f} bn " + f"target ${target_spending / 1e9:.2f} bn " f"error {pct_error:.2%}" ) if pct_error > TOLERANCE: failed = True - assert ( - not failed - ), f"One or more states exceeded tolerance of {TOLERANCE:.0%}." + assert not failed, f"One or more states exceeded tolerance of {TOLERANCE:.0%}." def test_immigration_status_diversity(): @@ -227,20 +217,18 @@ def test_immigration_status_diversity(): ) # Also check that we have a reasonable percentage of citizens (should be 85-90%) - assert ( - 80 < citizen_pct < 95 - ), f"Citizen percentage ({citizen_pct:.1f}%) outside expected range (80-95%)" + assert 80 < citizen_pct < 95, ( + f"Citizen percentage ({citizen_pct:.1f}%) outside expected range (80-95%)" + ) # Check that we have some non-citizens non_citizen_pct = 100 - citizen_pct - assert ( - non_citizen_pct > 5 - ), f"Too few non-citizens ({non_citizen_pct:.1f}%) - expected at least 5%" - - print( - f"Immigration status diversity test passed: {citizen_pct:.1f}% citizens" + assert non_citizen_pct > 5, ( + f"Too few non-citizens ({non_citizen_pct:.1f}%) - expected at least 5%" ) + print(f"Immigration status diversity test passed: {citizen_pct:.1f}% citizens") + def test_medicaid_calibration(): @@ -269,14 +257,12 @@ def test_medicaid_calibration(): pct_error = abs(simulated - target_enrollment) / target_enrollment print( - f"{state}: simulated ${simulated/1e9:.2f} bn " - f"target ${target_enrollment/1e9:.2f} bn " + f"{state}: simulated ${simulated / 1e9:.2f} bn " + f"target ${target_enrollment / 1e9:.2f} bn " f"error {pct_error:.2%}" ) if pct_error > TOLERANCE: failed = True - assert ( - not failed - ), f"One or more states exceeded tolerance of {TOLERANCE:.0%}." + assert not failed, f"One or more states exceeded tolerance of {TOLERANCE:.0%}." diff --git a/policyengine_us_data/tests/test_datasets/test_sipp_assets.py b/policyengine_us_data/tests/test_datasets/test_sipp_assets.py index c8780d847..a79b4bce6 100644 --- a/policyengine_us_data/tests/test_datasets/test_sipp_assets.py +++ b/policyengine_us_data/tests/test_datasets/test_sipp_assets.py @@ -59,12 +59,12 @@ def test_ecps_has_liquid_assets(): MAXIMUM_TOTAL = 30e12 # $30 trillion ceiling assert total > MINIMUM_TOTAL, ( - f"Total liquid assets ${total/1e12:.1f}T below " - f"minimum ${MINIMUM_TOTAL/1e12:.0f}T" + f"Total liquid assets ${total / 1e12:.1f}T below " + f"minimum ${MINIMUM_TOTAL / 1e12:.0f}T" ) assert total < MAXIMUM_TOTAL, ( - f"Total liquid assets ${total/1e12:.1f}T above " - f"maximum ${MAXIMUM_TOTAL/1e12:.0f}T" + f"Total liquid assets ${total / 1e12:.1f}T above " + f"maximum ${MAXIMUM_TOTAL / 1e12:.0f}T" ) @@ -102,12 +102,10 @@ def test_liquid_assets_distribution(): MEDIAN_MAX = 20_000 assert weighted_median > MEDIAN_MIN, ( - f"Median liquid assets ${weighted_median:,.0f} below " - f"minimum ${MEDIAN_MIN:,}" + f"Median liquid assets ${weighted_median:,.0f} below minimum ${MEDIAN_MIN:,}" ) assert weighted_median < MEDIAN_MAX, ( - f"Median liquid assets ${weighted_median:,.0f} above " - f"maximum ${MEDIAN_MAX:,}" + f"Median liquid assets ${weighted_median:,.0f} above maximum ${MEDIAN_MAX:,}" ) @@ -129,9 +127,7 @@ def test_asset_categories_exist(): assert bonds >= 0, "Bond assets should be non-negative" # Bank accounts typically largest category of liquid assets - assert ( - bank > stocks * 0.3 - ), "Bank accounts should be substantial relative to stocks" + assert bank > stocks * 0.3, "Bank accounts should be substantial relative to stocks" def test_low_asset_households(): @@ -158,10 +154,8 @@ def test_low_asset_households(): MAX_PCT = 0.70 assert below_2k > MIN_PCT, ( - f"Only {below_2k:.1%} have <$2k liquid assets, " - f"expected at least {MIN_PCT:.0%}" + f"Only {below_2k:.1%} have <$2k liquid assets, expected at least {MIN_PCT:.0%}" ) assert below_2k < MAX_PCT, ( - f"{below_2k:.1%} have <$2k liquid assets, " - f"expected at most {MAX_PCT:.0%}" + f"{below_2k:.1%} have <$2k liquid assets, expected at most {MAX_PCT:.0%}" ) diff --git a/policyengine_us_data/tests/test_datasets/test_small_enhanced_cps.py b/policyengine_us_data/tests/test_datasets/test_small_enhanced_cps.py index 23b7b2dcb..9316d3909 100644 --- a/policyengine_us_data/tests/test_datasets/test_small_enhanced_cps.py +++ b/policyengine_us_data/tests/test_datasets/test_small_enhanced_cps.py @@ -19,12 +19,10 @@ def test_small_ecps_loads(year: int): # Employment income should be positive (not zero from missing vars) emp_income = sim.calculate("employment_income", 2025).sum() - assert ( - emp_income > 0 - ), f"Small ECPS employment_income sum is {emp_income}, expected > 0." + assert emp_income > 0, ( + f"Small ECPS employment_income sum is {emp_income}, expected > 0." + ) # Should have a reasonable number of households hh_count = len(sim.calculate("household_net_income", 2025)) - assert ( - hh_count > 100 - ), f"Small ECPS has only {hh_count} households, expected > 100." + assert hh_count > 100, f"Small ECPS has only {hh_count} households, expected > 100." diff --git a/policyengine_us_data/tests/test_datasets/test_sparse_enhanced_cps.py b/policyengine_us_data/tests/test_datasets/test_sparse_enhanced_cps.py index 6a690f0cc..a7ee941bb 100644 --- a/policyengine_us_data/tests/test_datasets/test_sparse_enhanced_cps.py +++ b/policyengine_us_data/tests/test_datasets/test_sparse_enhanced_cps.py @@ -115,10 +115,10 @@ def test_sparse_ecps_replicates_jct_tax_expenditures(): & (calibration_log["epoch"] == calibration_log["epoch"].max()) ] - assert ( - jct_rows.rel_abs_error.max() < 0.5 - ), "JCT tax expenditure targets not met (see the calibration log for details). Max relative error: {:.2%}".format( - jct_rows.rel_abs_error.max() + assert jct_rows.rel_abs_error.max() < 0.5, ( + "JCT tax expenditure targets not met (see the calibration log for details). Max relative error: {:.2%}".format( + jct_rows.rel_abs_error.max() + ) ) @@ -133,9 +133,7 @@ def deprecated_test_sparse_ecps_replicates_jct_tax_expenditures_full(sim): } baseline = sim - income_tax_b = baseline.calculate( - "income_tax", period=2024, map_to="household" - ) + income_tax_b = baseline.calculate("income_tax", period=2024, map_to="household") for deduction, target in EXPENDITURE_TARGETS.items(): # Create reform that neutralizes the deduction @@ -145,9 +143,7 @@ def apply(self): # Run reform simulation reformed = Microsimulation(reform=RepealDeduction, dataset=sim.dataset) - income_tax_r = reformed.calculate( - "income_tax", period=2024, map_to="household" - ) + income_tax_r = reformed.calculate("income_tax", period=2024, map_to="household") # Calculate tax expenditure tax_expenditure = (income_tax_r - income_tax_b).sum() @@ -155,8 +151,8 @@ def apply(self): TOLERANCE = 0.4 logging.info( - f"{deduction} tax expenditure {tax_expenditure/1e9:.1f}bn " - f"differs from target {target/1e9:.1f}bn by {pct_error:.2%}" + f"{deduction} tax expenditure {tax_expenditure / 1e9:.1f}bn " + f"differs from target {target / 1e9:.1f}bn by {pct_error:.2%}" ) assert pct_error < TOLERANCE, deduction @@ -188,9 +184,7 @@ def test_sparse_aca_calibration(sim): # Monthly to yearly targets["spending"] = targets["spending"] * 12 # Adjust to match national target - targets["spending"] = targets["spending"] * ( - 98e9 / targets["spending"].sum() - ) + targets["spending"] = targets["spending"] * (98e9 / targets["spending"].sum()) state_code_hh = sim.calculate("state_code", map_to="household").values aca_ptc = sim.calculate("aca_ptc", map_to="household", period=2025) @@ -204,17 +198,15 @@ def test_sparse_aca_calibration(sim): pct_error = abs(simulated - target_spending) / target_spending logging.info( - f"{state}: simulated ${simulated/1e9:.2f} bn " - f"target ${target_spending/1e9:.2f} bn " + f"{state}: simulated ${simulated / 1e9:.2f} bn " + f"target ${target_spending / 1e9:.2f} bn " f"error {pct_error:.2%}" ) if pct_error > TOLERANCE: failed = True - assert ( - not failed - ), f"One or more states exceeded tolerance of {TOLERANCE:.0%}." + assert not failed, f"One or more states exceeded tolerance of {TOLERANCE:.0%}." def test_sparse_medicaid_calibration(sim): @@ -238,14 +230,12 @@ def test_sparse_medicaid_calibration(sim): pct_error = abs(simulated - target_enrollment) / target_enrollment logging.info( - f"{state}: simulated ${simulated/1e9:.2f} bn " - f"target ${target_enrollment/1e9:.2f} bn " + f"{state}: simulated ${simulated / 1e9:.2f} bn " + f"target ${target_enrollment / 1e9:.2f} bn " f"error {pct_error:.2%}" ) if pct_error > TOLERANCE: failed = True - assert ( - not failed - ), f"One or more states exceeded tolerance of {TOLERANCE:.0%}." + assert not failed, f"One or more states exceeded tolerance of {TOLERANCE:.0%}." diff --git a/policyengine_us_data/tests/test_local_area_calibration/create_test_fixture.py b/policyengine_us_data/tests/test_local_area_calibration/create_test_fixture.py index 00334734d..2fadeeeb9 100644 --- a/policyengine_us_data/tests/test_local_area_calibration/create_test_fixture.py +++ b/policyengine_us_data/tests/test_local_area_calibration/create_test_fixture.py @@ -30,9 +30,7 @@ def create_test_fixture(): # Household-level arrays household_ids = np.arange(N_HOUSEHOLDS, dtype=np.int32) - household_weights = np.random.uniform(500, 3000, N_HOUSEHOLDS).astype( - np.float32 - ) + household_weights = np.random.uniform(500, 3000, N_HOUSEHOLDS).astype(np.float32) # Assign households to states (use NC=37 and AK=2 for testing) # 40 households in NC, 10 in AK @@ -102,18 +100,14 @@ def create_test_fixture(): f["household_id"].create_dataset(TIME_PERIOD, data=household_ids) f.create_group("household_weight") - f["household_weight"].create_dataset( - TIME_PERIOD, data=household_weights - ) + f["household_weight"].create_dataset(TIME_PERIOD, data=household_weights) # Person variables f.create_group("person_id") f["person_id"].create_dataset(TIME_PERIOD, data=person_ids) f.create_group("person_household_id") - f["person_household_id"].create_dataset( - TIME_PERIOD, data=person_household_ids - ) + f["person_household_id"].create_dataset(TIME_PERIOD, data=person_household_ids) f.create_group("person_weight") f["person_weight"].create_dataset(TIME_PERIOD, data=person_weights) @@ -122,18 +116,14 @@ def create_test_fixture(): f["age"].create_dataset(TIME_PERIOD, data=ages) f.create_group("employment_income") - f["employment_income"].create_dataset( - TIME_PERIOD, data=employment_income - ) + f["employment_income"].create_dataset(TIME_PERIOD, data=employment_income) # Tax unit f.create_group("tax_unit_id") f["tax_unit_id"].create_dataset(TIME_PERIOD, data=tax_unit_ids) f.create_group("person_tax_unit_id") - f["person_tax_unit_id"].create_dataset( - TIME_PERIOD, data=person_tax_unit_ids - ) + f["person_tax_unit_id"].create_dataset(TIME_PERIOD, data=person_tax_unit_ids) f.create_group("tax_unit_weight") f["tax_unit_weight"].create_dataset(TIME_PERIOD, data=tax_unit_weights) @@ -143,9 +133,7 @@ def create_test_fixture(): f["spm_unit_id"].create_dataset(TIME_PERIOD, data=spm_unit_ids) f.create_group("person_spm_unit_id") - f["person_spm_unit_id"].create_dataset( - TIME_PERIOD, data=person_spm_unit_ids - ) + f["person_spm_unit_id"].create_dataset(TIME_PERIOD, data=person_spm_unit_ids) f.create_group("spm_unit_weight") f["spm_unit_weight"].create_dataset(TIME_PERIOD, data=spm_unit_weights) @@ -155,9 +143,7 @@ def create_test_fixture(): f["family_id"].create_dataset(TIME_PERIOD, data=family_ids) f.create_group("person_family_id") - f["person_family_id"].create_dataset( - TIME_PERIOD, data=person_family_ids - ) + f["person_family_id"].create_dataset(TIME_PERIOD, data=person_family_ids) f.create_group("family_weight") f["family_weight"].create_dataset(TIME_PERIOD, data=family_weights) @@ -172,9 +158,7 @@ def create_test_fixture(): ) f.create_group("marital_unit_weight") - f["marital_unit_weight"].create_dataset( - TIME_PERIOD, data=marital_unit_weights - ) + f["marital_unit_weight"].create_dataset(TIME_PERIOD, data=marital_unit_weights) # Geography (household level) f.create_group("state_fips") diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_county_assignment.py b/policyengine_us_data/tests/test_local_area_calibration/test_county_assignment.py index 158e0ca68..e20c1797a 100644 --- a/policyengine_us_data/tests/test_local_area_calibration/test_county_assignment.py +++ b/policyengine_us_data/tests/test_local_area_calibration/test_county_assignment.py @@ -47,9 +47,7 @@ def test_ny_cd_gets_ny_counties(self): for idx in result: county_name = County._member_names_[idx] # Should end with _NY - assert county_name.endswith( - "_NY" - ), f"Got non-NY county: {county_name}" + assert county_name.endswith("_NY"), f"Got non-NY county: {county_name}" def test_ca_cd_gets_ca_counties(self): """Verify CA CDs get CA counties.""" @@ -58,9 +56,7 @@ def test_ca_cd_gets_ca_counties(self): for idx in result: county_name = County._member_names_[idx] - assert county_name.endswith( - "_CA" - ), f"Got non-CA county: {county_name}" + assert county_name.endswith("_CA"), f"Got non-CA county: {county_name}" class TestCountyIndex: diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_stacked_dataset_builder.py b/policyengine_us_data/tests/test_local_area_calibration/test_stacked_dataset_builder.py index 2900eec19..4d2d7c74e 100644 --- a/policyengine_us_data/tests/test_local_area_calibration/test_stacked_dataset_builder.py +++ b/policyengine_us_data/tests/test_local_area_calibration/test_stacked_dataset_builder.py @@ -141,17 +141,15 @@ def test_counties_match_state(self, stacked_result): state_fips = row["state_fips"] if state_fips == 37: - assert county.endswith( - "_NC" - ), f"NC county should end with _NC: {county}" + assert county.endswith("_NC"), ( + f"NC county should end with _NC: {county}" + ) elif state_fips == 2: - assert county.endswith( - "_AK" - ), f"AK county should end with _AK: {county}" + assert county.endswith("_AK"), ( + f"AK county should end with _AK: {county}" + ) - def test_household_count_matches_weights( - self, stacked_result, test_weights - ): + def test_household_count_matches_weights(self, stacked_result, test_weights): """Number of output households should match non-zero weights.""" hh_df = stacked_result["hh_df"] expected_households = (test_weights > 0).sum() @@ -205,40 +203,30 @@ class TestEntityReindexing: def test_family_ids_are_unique(self, stacked_sim): """Family IDs should be globally unique across all CDs.""" family_ids = stacked_sim.calculate("family_id", map_to="family").values - assert len(family_ids) == len( - set(family_ids) - ), "Family IDs should be unique" + assert len(family_ids) == len(set(family_ids)), "Family IDs should be unique" def test_tax_unit_ids_are_unique(self, stacked_sim): """Tax unit IDs should be globally unique.""" - tax_unit_ids = stacked_sim.calculate( - "tax_unit_id", map_to="tax_unit" - ).values - assert len(tax_unit_ids) == len( - set(tax_unit_ids) - ), "Tax unit IDs should be unique" + tax_unit_ids = stacked_sim.calculate("tax_unit_id", map_to="tax_unit").values + assert len(tax_unit_ids) == len(set(tax_unit_ids)), ( + "Tax unit IDs should be unique" + ) def test_spm_unit_ids_are_unique(self, stacked_sim): """SPM unit IDs should be globally unique.""" - spm_unit_ids = stacked_sim.calculate( - "spm_unit_id", map_to="spm_unit" - ).values - assert len(spm_unit_ids) == len( - set(spm_unit_ids) - ), "SPM unit IDs should be unique" + spm_unit_ids = stacked_sim.calculate("spm_unit_id", map_to="spm_unit").values + assert len(spm_unit_ids) == len(set(spm_unit_ids)), ( + "SPM unit IDs should be unique" + ) def test_person_family_id_matches_family_id(self, stacked_sim): """person_family_id should reference valid family_ids.""" person_family_ids = stacked_sim.calculate( "person_family_id", map_to="person" ).values - family_ids = set( - stacked_sim.calculate("family_id", map_to="family").values - ) + family_ids = set(stacked_sim.calculate("family_id", map_to="family").values) for pf_id in person_family_ids: - assert ( - pf_id in family_ids - ), f"person_family_id {pf_id} not in family_ids" + assert pf_id in family_ids, f"person_family_id {pf_id} not in family_ids" def test_family_ids_unique_across_cds(self, stacked_sim_with_overlap): """Same household in different CDs should have different family_ids.""" @@ -247,9 +235,7 @@ def test_family_ids_unique_across_cds(self, stacked_sim_with_overlap): n_cds = len(TEST_CDS) family_ids = sim.calculate("family_id", map_to="family").values - household_ids = sim.calculate( - "household_id", map_to="household" - ).values + household_ids = sim.calculate("household_id", map_to="household").values # Should have n_overlap * n_cds unique families (one per HH-CD pair) expected_families = n_overlap * n_cds diff --git a/policyengine_us_data/tests/test_puf_impute.py b/policyengine_us_data/tests/test_puf_impute.py index fcdcf763f..d968fb16d 100644 --- a/policyengine_us_data/tests/test_puf_impute.py +++ b/policyengine_us_data/tests/test_puf_impute.py @@ -57,9 +57,7 @@ def _make_data( if age is not None: data["age"] = {tp: np.concatenate([age, age]).astype(np.float32)} if is_male is not None: - data["is_male"] = { - tp: np.concatenate([is_male, is_male]).astype(np.float32) - } + data["is_male"] = {tp: np.concatenate([is_male, is_male]).astype(np.float32)} return data, n, tp diff --git a/policyengine_us_data/tests/test_schema_views_and_lookups.py b/policyengine_us_data/tests/test_schema_views_and_lookups.py index 14521a214..8d99615cf 100644 --- a/policyengine_us_data/tests/test_schema_views_and_lookups.py +++ b/policyengine_us_data/tests/test_schema_views_and_lookups.py @@ -227,9 +227,7 @@ def _query_stratum_domain(self): from sqlalchemy import text with self.engine.connect() as conn: - rows = conn.execute( - text("SELECT * FROM stratum_domain") - ).fetchall() + rows = conn.execute(text("SELECT * FROM stratum_domain")).fetchall() return rows def test_geographic_stratum_excluded(self): @@ -246,7 +244,7 @@ def test_geographic_stratum_excluded(self): domain_stratum_ids = {r[0] for r in rows} self.assertTrue( domain_stratum_ids.isdisjoint(geo_ids), - "Geographic strata should not appear in " "stratum_domain", + "Geographic strata should not appear in stratum_domain", ) def test_single_domain_variable(self): @@ -280,7 +278,7 @@ def test_geographic_constraints_filtered(self): } self.assertTrue( all_domain_vars.isdisjoint(excluded), - f"Found excluded vars: " f"{all_domain_vars & excluded}", + f"Found excluded vars: {all_domain_vars & excluded}", ) # ---------------------------------------------------------------- @@ -291,18 +289,14 @@ def _query_target_overview(self): from sqlalchemy import text with self.engine.connect() as conn: - rows = conn.execute( - text("SELECT * FROM target_overview") - ).fetchall() + rows = conn.execute(text("SELECT * FROM target_overview")).fetchall() return rows def _overview_columns(self): from sqlalchemy import text with self.engine.connect() as conn: - cursor = conn.execute( - text("SELECT * FROM target_overview LIMIT 0") - ) + cursor = conn.execute(text("SELECT * FROM target_overview LIMIT 0")) return [desc[0] for desc in cursor.cursor.description] def test_national_geo_level(self): diff --git a/policyengine_us_data/tests/test_stochastic_variables.py b/policyengine_us_data/tests/test_stochastic_variables.py index 172260784..b9ab13466 100644 --- a/policyengine_us_data/tests/test_stochastic_variables.py +++ b/policyengine_us_data/tests/test_stochastic_variables.py @@ -10,7 +10,6 @@ class TestTakeUpRateParameters: - def test_eitc_rate_loads(self): rates = load_take_up_rate("eitc", 2022) assert isinstance(rates, dict) @@ -52,7 +51,6 @@ def test_ssi_takeup_rate_loads(self): class TestStableStringHash: - def test_deterministic(self): h1 = _stable_string_hash("takes_up_snap_if_eligible") h2 = _stable_string_hash("takes_up_snap_if_eligible") @@ -69,7 +67,6 @@ def test_returns_uint64(self): class TestSeededRng: - def test_same_name_same_results(self): rng1 = seeded_rng("takes_up_snap_if_eligible") result1 = rng1.random(1000) @@ -103,7 +100,6 @@ def test_order_independence(self): class TestTakeUpProportions: - def test_take_up_produces_expected_proportion(self): rate = 0.7 n = 10_000 diff --git a/policyengine_us_data/utils/census.py b/policyengine_us_data/utils/census.py index c61cc166d..422d750c3 100644 --- a/policyengine_us_data/utils/census.py +++ b/policyengine_us_data/utils/census.py @@ -139,9 +139,7 @@ def get_census_docs(year): - docs_url = ( - f"https://api.census.gov/data/{year}/acs/acs1/subject/variables.json" - ) + docs_url = f"https://api.census.gov/data/{year}/acs/acs1/subject/variables.json" cache_file = f"census_docs_{year}.json" if is_cached(cache_file): logger.info(f"Using cached {cache_file}") diff --git a/policyengine_us_data/utils/constraint_validation.py b/policyengine_us_data/utils/constraint_validation.py index c3e512c79..f533739cb 100644 --- a/policyengine_us_data/utils/constraint_validation.py +++ b/policyengine_us_data/utils/constraint_validation.py @@ -101,21 +101,17 @@ def _check_operation_compatibility(var_name: str, operations: set) -> None: # Cannot have both > and >= (conflicting lower bounds) if ">" in operations and ">=" in operations: raise ConstraintValidationError( - f"{var_name}: cannot have both '>' and '>=' " - "(conflicting lower bounds)" + f"{var_name}: cannot have both '>' and '>=' (conflicting lower bounds)" ) # Cannot have both < and <= (conflicting upper bounds) if "<" in operations and "<=" in operations: raise ConstraintValidationError( - f"{var_name}: cannot have both '<' and '<=' " - "(conflicting upper bounds)" + f"{var_name}: cannot have both '<' and '<=' (conflicting upper bounds)" ) -def _check_range_validity( - var_name: str, constraints: List[Constraint] -) -> None: +def _check_range_validity(var_name: str, constraints: List[Constraint]) -> None: """Check that range constraints don't create an empty range.""" lower_bound = float("-inf") upper_bound = float("inf") @@ -130,9 +126,7 @@ def _check_range_validity( continue if c.operation == ">": - if val > lower_bound or ( - val == lower_bound and not lower_inclusive - ): + if val > lower_bound or (val == lower_bound and not lower_inclusive): lower_bound = val lower_inclusive = False elif c.operation == ">=": @@ -140,9 +134,7 @@ def _check_range_validity( lower_bound = val lower_inclusive = True elif c.operation == "<": - if val < upper_bound or ( - val == upper_bound and not upper_inclusive - ): + if val < upper_bound or (val == upper_bound and not upper_inclusive): upper_bound = val upper_inclusive = False elif c.operation == "<=": @@ -156,9 +148,7 @@ def _check_range_validity( f"{var_name}: empty range - lower bound {lower_bound} > " f"upper bound {upper_bound}" ) - if lower_bound == upper_bound and not ( - lower_inclusive and upper_inclusive - ): + if lower_bound == upper_bound and not (lower_inclusive and upper_inclusive): raise ConstraintValidationError( f"{var_name}: empty range - bounds equal at {lower_bound} " "but not both inclusive" diff --git a/policyengine_us_data/utils/data_upload.py b/policyengine_us_data/utils/data_upload.py index 42cd8feee..7b7481b3e 100644 --- a/policyengine_us_data/utils/data_upload.py +++ b/policyengine_us_data/utils/data_upload.py @@ -116,18 +116,14 @@ def upload_files_to_gcs( Upload files to Google Cloud Storage and set metadata with the version. """ credentials, project_id = google.auth.default() - storage_client = storage.Client( - credentials=credentials, project=project_id - ) + storage_client = storage.Client(credentials=credentials, project=project_id) bucket = storage_client.bucket(gcs_bucket_name) for file_path in files: file_path = Path(file_path) blob = bucket.blob(file_path.name) blob.upload_from_filename(file_path) - logging.info( - f"Uploaded {file_path.name} to GCS bucket {gcs_bucket_name}." - ) + logging.info(f"Uploaded {file_path.name} to GCS bucket {gcs_bucket_name}.") # Set metadata blob.metadata = {"version": version} @@ -164,9 +160,7 @@ def upload_local_area_file( # Upload to GCS with subdirectory credentials, project_id = google.auth.default() - storage_client = storage.Client( - credentials=credentials, project=project_id - ) + storage_client = storage.Client(credentials=credentials, project=project_id) bucket = storage_client.bucket(gcs_bucket_name) blob_name = f"{subdirectory}/{file_path.name}" @@ -336,9 +330,7 @@ def upload_to_staging_hf( f"Uploaded batch {i // batch_size + 1}: {len(operations)} files to staging/" ) - logging.info( - f"Total: uploaded {total_uploaded} files to staging/ in HuggingFace" - ) + logging.info(f"Total: uploaded {total_uploaded} files to staging/ in HuggingFace") return total_uploaded diff --git a/policyengine_us_data/utils/db.py b/policyengine_us_data/utils/db.py index 2d8f134bf..5dc78603c 100644 --- a/policyengine_us_data/utils/db.py +++ b/policyengine_us_data/utils/db.py @@ -44,10 +44,7 @@ def etl_argparser( args = parser.parse_args() - if ( - not args.dataset.startswith("hf://") - and not Path(args.dataset).exists() - ): + if not args.dataset.startswith("hf://") and not Path(args.dataset).exists(): raise FileNotFoundError( f"Dataset not found: {args.dataset}\n" f"Either build it locally (`make data`) or pass a " @@ -69,18 +66,14 @@ def get_stratum_by_id(session: Session, stratum_id: int) -> Optional[Stratum]: return session.get(Stratum, stratum_id) -def get_simple_stratum_by_ucgid( - session: Session, ucgid: str -) -> Optional[Stratum]: +def get_simple_stratum_by_ucgid(session: Session, ucgid: str) -> Optional[Stratum]: """ Finds a stratum defined *only* by a single ucgid_str constraint. """ constraint_count_subquery = ( select( StratumConstraint.stratum_id, - sa.func.count(StratumConstraint.stratum_id).label( - "constraint_count" - ), + sa.func.count(StratumConstraint.stratum_id).label("constraint_count"), ) .group_by(StratumConstraint.stratum_id) .subquery() @@ -137,16 +130,12 @@ def parse_ucgid(ucgid_str: str) -> Dict: elif ucgid_str.startswith("0400000US"): state_fips = int(ucgid_str[9:]) return {"type": "state", "state_fips": state_fips} - elif ucgid_str.startswith("5001800US") or ucgid_str.startswith( - "5001900US" - ): + elif ucgid_str.startswith("5001800US") or ucgid_str.startswith("5001900US"): # 5001800US = 118th Congress, 5001900US = 119th Congress state_and_district = ucgid_str[9:] state_fips = int(state_and_district[:2]) district_number = int(state_and_district[2:]) - if district_number == 0 or ( - state_fips == 11 and district_number == 98 - ): + if district_number == 0 or (state_fips == 11 and district_number == 98): district_number = 1 cd_geoid = state_fips * 100 + district_number return { @@ -201,9 +190,7 @@ def get_geographic_strata(session: Session) -> Dict: if not constraints: strata_map["national"] = stratum.stratum_id else: - constraint_vars = { - c.constraint_variable: c.value for c in constraints - } + constraint_vars = {c.constraint_variable: c.value for c in constraints} if "congressional_district_geoid" in constraint_vars: cd_geoid = int(constraint_vars["congressional_district_geoid"]) diff --git a/policyengine_us_data/utils/huggingface.py b/policyengine_us_data/utils/huggingface.py index a312b5240..c6d54af17 100644 --- a/policyengine_us_data/utils/huggingface.py +++ b/policyengine_us_data/utils/huggingface.py @@ -10,9 +10,7 @@ ) -def download( - repo: str, repo_filename: str, local_folder: str, version: str = None -): +def download(repo: str, repo_filename: str, local_folder: str, version: str = None): hf_hub_download( repo_id=repo, diff --git a/policyengine_us_data/utils/l0.py b/policyengine_us_data/utils/l0.py index 3dd9e0145..a1d1a5a0d 100644 --- a/policyengine_us_data/utils/l0.py +++ b/policyengine_us_data/utils/l0.py @@ -191,11 +191,11 @@ def train_with_l0(model, train_loader, epochs=10, l0_lambda=1e-3): if epoch % 1 == 0: sparsity_stats = model.get_sparsity_stats() logging.info( - f"Epoch {epoch}: Loss={total_loss/len(train_loader):.4f}, L0={total_l0/len(train_loader):.4f}" + f"Epoch {epoch}: Loss={total_loss / len(train_loader):.4f}, L0={total_l0 / len(train_loader):.4f}" ) for layer, stats in sparsity_stats.items(): logging.info( - f" {layer}: {stats['sparsity']*100:.1f}% sparse, {stats['active_params']:.1f} active params" + f" {layer}: {stats['sparsity'] * 100:.1f}% sparse, {stats['active_params']:.1f} active params" ) diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index d7410d2eb..5cbb879f8 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -101,10 +101,10 @@ def fmt(x): if x < 1e3: return f"{x:.0f}" if x < 1e6: - return f"{x/1e3:.0f}k" + return f"{x / 1e3:.0f}k" if x < 1e9: - return f"{x/1e6:.0f}m" - return f"{x/1e9:.1f}bn" + return f"{x / 1e6:.0f}m" + return f"{x / 1e9:.1f}bn" def build_loss_matrix(dataset: type, time_period): @@ -164,9 +164,7 @@ def build_loss_matrix(dataset: type, time_period): continue mask = ( - (agi >= row["AGI lower bound"]) - * (agi < row["AGI upper bound"]) - * filer + (agi >= row["AGI lower bound"]) * (agi < row["AGI upper bound"]) * filer ) > 0 if row["Filing status"] == "Single": @@ -186,12 +184,8 @@ def build_loss_matrix(dataset: type, time_period): if row["Count"]: values = (values > 0).astype(float) - agi_range_label = ( - f"{fmt(row['AGI lower bound'])}-{fmt(row['AGI upper bound'])}" - ) - taxable_label = ( - "taxable" if row["Taxable only"] else "all" + " returns" - ) + agi_range_label = f"{fmt(row['AGI lower bound'])}-{fmt(row['AGI upper bound'])}" + taxable_label = "taxable" if row["Taxable only"] else "all" + " returns" filing_status_label = row["Filing status"] variable_label = row["Variable"].replace("_", " ") @@ -270,9 +264,7 @@ def build_loss_matrix(dataset: type, time_period): for variable_name in CBO_PROGRAMS: label = f"nation/cbo/{variable_name}" - loss_matrix[label] = sim.calculate( - variable_name, map_to="household" - ).values + loss_matrix[label] = sim.calculate(variable_name, map_to="household").values if any(loss_matrix[label].isna()): raise ValueError(f"Missing values for {label}") param_name = CBO_PARAM_NAME_MAP.get(variable_name, variable_name) @@ -312,9 +304,9 @@ def build_loss_matrix(dataset: type, time_period): # National ACA Enrollment (people receiving a PTC) label = "nation/gov/aca_enrollment" - on_ptc = ( - sim.calculate("aca_ptc", map_to="person", period=2025).values > 0 - ).astype(int) + on_ptc = (sim.calculate("aca_ptc", map_to="person", period=2025).values > 0).astype( + int + ) loss_matrix[label] = sim.map_result(on_ptc, "person", "household") ACA_PTC_ENROLLMENT_2024 = 19_743_689 # people enrolled @@ -346,13 +338,9 @@ def build_loss_matrix(dataset: type, time_period): eitc_eligible_children = sim.calculate("eitc_child_count").values eitc = sim.calculate("eitc").values if row["count_children"] < 2: - meets_child_criteria = ( - eitc_eligible_children == row["count_children"] - ) + meets_child_criteria = eitc_eligible_children == row["count_children"] else: - meets_child_criteria = ( - eitc_eligible_children >= row["count_children"] - ) + meets_child_criteria = eitc_eligible_children >= row["count_children"] loss_matrix[returns_label] = sim.map_result( (eitc > 0) * meets_child_criteria, "tax_unit", @@ -406,9 +394,7 @@ def build_loss_matrix(dataset: type, time_period): # Hard-coded totals for variable_name, target in HARD_CODED_TOTALS.items(): label = f"nation/census/{variable_name}" - loss_matrix[label] = sim.calculate( - variable_name, map_to="household" - ).values + loss_matrix[label] = sim.calculate(variable_name, map_to="household").values if any(loss_matrix[label].isna()): raise ValueError(f"Missing values for {label}") targets_array.append(target) @@ -416,8 +402,8 @@ def build_loss_matrix(dataset: type, time_period): # Negative household market income total rough estimate from the IRS SOI PUF market_income = sim.calculate("household_market_income").values - loss_matrix["nation/irs/negative_household_market_income_total"] = ( - market_income * (market_income < 0) + loss_matrix["nation/irs/negative_household_market_income_total"] = market_income * ( + market_income < 0 ) targets_array.append(-138e9) @@ -439,7 +425,7 @@ def build_loss_matrix(dataset: type, time_period): "other_medical_expenses", "medicare_part_b_premiums", ]: - label = f"nation/census/{expense_type}/age_{age_lower_bound}_to_{age_lower_bound+9}" + label = f"nation/census/{expense_type}/age_{age_lower_bound}_to_{age_lower_bound + 9}" value = sim.calculate(expense_type).values loss_matrix[label] = sim.map_result( in_age_range * value, "person", "household" @@ -448,39 +434,27 @@ def build_loss_matrix(dataset: type, time_period): # AGI by SPM threshold totals - spm_threshold_agi = pd.read_csv( - CALIBRATION_FOLDER / "spm_threshold_agi.csv" - ) + spm_threshold_agi = pd.read_csv(CALIBRATION_FOLDER / "spm_threshold_agi.csv") for _, row in spm_threshold_agi.iterrows(): - spm_unit_agi = sim.calculate( - "adjusted_gross_income", map_to="spm_unit" - ).values + spm_unit_agi = sim.calculate("adjusted_gross_income", map_to="spm_unit").values spm_threshold = sim.calculate("spm_unit_spm_threshold").values in_threshold_range = (spm_threshold >= row["lower_spm_threshold"]) * ( spm_threshold < row["upper_spm_threshold"] ) - label = ( - f"nation/census/agi_in_spm_threshold_decile_{int(row['decile'])}" - ) + label = f"nation/census/agi_in_spm_threshold_decile_{int(row['decile'])}" loss_matrix[label] = sim.map_result( in_threshold_range * spm_unit_agi, "spm_unit", "household" ) targets_array.append(row["adjusted_gross_income"]) - label = ( - f"nation/census/count_in_spm_threshold_decile_{int(row['decile'])}" - ) - loss_matrix[label] = sim.map_result( - in_threshold_range, "spm_unit", "household" - ) + label = f"nation/census/count_in_spm_threshold_decile_{int(row['decile'])}" + loss_matrix[label] = sim.map_result(in_threshold_range, "spm_unit", "household") targets_array.append(row["count"]) # Population by state and population under 5 by state - state_population = pd.read_csv( - CALIBRATION_FOLDER / "population_by_state.csv" - ) + state_population = pd.read_csv(CALIBRATION_FOLDER / "population_by_state.csv") for _, row in state_population.iterrows(): in_state = sim.calculate("state_code", map_to="person") == row["state"] @@ -491,9 +465,7 @@ def build_loss_matrix(dataset: type, time_period): under_5 = sim.calculate("age").values < 5 in_state_under_5 = in_state * under_5 label = f"state/census/population_under_5_by_state/{row['state']}" - loss_matrix[label] = sim.map_result( - in_state_under_5, "person", "household" - ) + loss_matrix[label] = sim.map_result(in_state_under_5, "person", "household") targets_array.append(row["population_under_5"]) age = sim.calculate("age").values @@ -517,9 +489,7 @@ def build_loss_matrix(dataset: type, time_period): # SALT tax expenditure targeting - _add_tax_expenditure_targets( - dataset, time_period, sim, loss_matrix, targets_array - ) + _add_tax_expenditure_targets(dataset, time_period, sim, loss_matrix, targets_array) if any(loss_matrix.isna().sum() > 0): raise ValueError("Some targets are missing from the loss matrix") @@ -533,9 +503,7 @@ def build_loss_matrix(dataset: type, time_period): # Overall count by SSN card type label = f"nation/ssa/ssn_card_type_{card_type_str.lower()}_count" - loss_matrix[label] = sim.map_result( - ssn_type_mask, "person", "household" - ) + loss_matrix[label] = sim.map_result(ssn_type_mask, "person", "household") # Target undocumented population by year based on various sources if card_type_str == "NONE": @@ -571,14 +539,11 @@ def build_loss_matrix(dataset: type, time_period): for _, row in spending_by_state.iterrows(): # Households located in this state in_state = ( - sim.calculate("state_code", map_to="household").values - == row["state"] + sim.calculate("state_code", map_to="household").values == row["state"] ) # ACA PTC amounts for every household (2025) - aca_value = sim.calculate( - "aca_ptc", map_to="household", period=2025 - ).values + aca_value = sim.calculate("aca_ptc", map_to="household", period=2025).values # Add a loss-matrix entry and matching target label = f"nation/irs/aca_spending/{row['state'].lower()}" @@ -611,9 +576,7 @@ def build_loss_matrix(dataset: type, time_period): in_state_enrolled = in_state & is_enrolled label = f"state/irs/aca_enrollment/{row['state'].lower()}" - loss_matrix[label] = sim.map_result( - in_state_enrolled, "person", "household" - ) + loss_matrix[label] = sim.map_result(in_state_enrolled, "person", "household") if any(loss_matrix[label].isna()): raise ValueError(f"Missing values for {label}") @@ -630,9 +593,7 @@ def build_loss_matrix(dataset: type, time_period): state_person = sim.calculate("state_code", map_to="person").values # Flag people in households that actually receive medicaid - has_medicaid = sim.calculate( - "medicaid_enrolled", map_to="person", period=2025 - ) + has_medicaid = sim.calculate("medicaid_enrolled", map_to="person", period=2025) is_medicaid_eligible = sim.calculate( "is_medicaid_eligible", map_to="person", period=2025 ).values @@ -644,9 +605,7 @@ def build_loss_matrix(dataset: type, time_period): in_state_enrolled = in_state & is_enrolled label = f"irs/medicaid_enrollment/{row['state'].lower()}" - loss_matrix[label] = sim.map_result( - in_state_enrolled, "person", "household" - ) + loss_matrix[label] = sim.map_result(in_state_enrolled, "person", "household") if any(loss_matrix[label].isna()): raise ValueError(f"Missing values for {label}") @@ -670,9 +629,7 @@ def build_loss_matrix(dataset: type, time_period): age_lower_bound = int(age_range.replace("+", "")) age_upper_bound = np.inf else: - age_lower_bound, age_upper_bound = map( - int, age_range.split("-") - ) + age_lower_bound, age_upper_bound = map(int, age_range.split("-")) age_mask = (age >= age_lower_bound) & (age <= age_upper_bound) label = f"state/census/age/{state}/{age_range}" @@ -740,9 +697,7 @@ def apply(self): simulation.default_calculation_period = time_period # Calculate the baseline and reform income tax values. - income_tax_r = simulation.calculate( - "income_tax", map_to="household" - ).values + income_tax_r = simulation.calculate("income_tax", map_to="household").values # Compute the tax expenditure (TE) values. te_values = income_tax_r - income_tax_b @@ -776,9 +731,7 @@ def _add_agi_state_targets(): + soi_targets["VARIABLE"] + "/" + soi_targets.apply( - lambda r: get_agi_band_label( - r["AGI_LOWER_BOUND"], r["AGI_UPPER_BOUND"] - ), + lambda r: get_agi_band_label(r["AGI_LOWER_BOUND"], r["AGI_UPPER_BOUND"]), axis=1, ) ) @@ -799,9 +752,7 @@ def _add_agi_metric_columns( agi = sim.calculate("adjusted_gross_income").values state = sim.calculate("state_code", map_to="person").values - state = sim.map_result( - state, "person", "tax_unit", how="value_from_first_person" - ) + state = sim.map_result(state, "person", "tax_unit", how="value_from_first_person") for _, r in soi_targets.iterrows(): lower, upper = r.AGI_LOWER_BOUND, r.AGI_UPPER_BOUND @@ -845,13 +796,9 @@ def _add_state_real_estate_taxes(loss_matrix, targets_list, sim): rtol=1e-8, ), "Real estate tax totals do not sum to national target" - targets_list.extend( - real_estate_taxes_targets["real_estate_taxes_bn"].tolist() - ) + targets_list.extend(real_estate_taxes_targets["real_estate_taxes_bn"].tolist()) - real_estate_taxes = sim.calculate( - "real_estate_taxes", map_to="household" - ).values + real_estate_taxes = sim.calculate("real_estate_taxes", map_to="household").values state = sim.calculate("state_code", map_to="household").values for _, r in real_estate_taxes_targets.iterrows(): @@ -874,22 +821,16 @@ def _add_snap_state_targets(sim): ).calibration.gov.cbo._children["snap"] ratio = snap_targets[["Cost"]].sum().values[0] / national_cost_target snap_targets[["CostAdj"]] = snap_targets[["Cost"]] / ratio - assert ( - np.round(snap_targets[["CostAdj"]].sum().values[0]) - == national_cost_target - ) + assert np.round(snap_targets[["CostAdj"]].sum().values[0]) == national_cost_target cost_targets = snap_targets.copy()[["GEO_ID", "CostAdj"]] - cost_targets["target_name"] = ( - cost_targets["GEO_ID"].str[-4:] + "/snap-cost" - ) + cost_targets["target_name"] = cost_targets["GEO_ID"].str[-4:] + "/snap-cost" hh_targets = snap_targets.copy()[["GEO_ID", "Households"]] hh_targets["target_name"] = snap_targets["GEO_ID"].str[-4:] + "/snap-hhs" target_names = ( - cost_targets["target_name"].tolist() - + hh_targets["target_name"].tolist() + cost_targets["target_name"].tolist() + hh_targets["target_name"].tolist() ) target_values = ( cost_targets["CostAdj"].astype(float).tolist() @@ -908,14 +849,12 @@ def _add_snap_metric_columns( snap_targets = pd.read_csv(CALIBRATION_FOLDER / "snap_state.csv") snap_cost = sim.calculate("snap_reported", map_to="household").values - snap_hhs = ( - sim.calculate("snap_reported", map_to="household").values > 0 - ).astype(int) + snap_hhs = (sim.calculate("snap_reported", map_to="household").values > 0).astype( + int + ) state = sim.calculate("state_code", map_to="person").values - state = sim.map_result( - state, "person", "household", how="value_from_first_person" - ) + state = sim.map_result(state, "person", "household", how="value_from_first_person") STATE_ABBR_TO_FIPS["DC"] = 11 state_fips = pd.Series(state).apply(lambda s: STATE_ABBR_TO_FIPS[s]) @@ -934,9 +873,7 @@ def _add_snap_metric_columns( return loss_matrix -def print_reweighting_diagnostics( - optimised_weights, loss_matrix, targets_array, label -): +def print_reweighting_diagnostics(optimised_weights, loss_matrix, targets_array, label): # Convert all inputs to NumPy arrays right at the start optimised_weights_np = ( optimised_weights.numpy() @@ -963,9 +900,7 @@ def print_reweighting_diagnostics( # All subsequent calculations use the guaranteed NumPy versions estimate = optimised_weights_np @ loss_matrix_np - rel_error = ( - ((estimate - targets_array_np) + 1) / (targets_array_np + 1) - ) ** 2 + rel_error = (((estimate - targets_array_np) + 1) / (targets_array_np + 1)) ** 2 within_10_percent_mask = np.abs(estimate - targets_array_np) <= ( 0.10 * np.abs(targets_array_np) ) diff --git a/policyengine_us_data/utils/randomness.py b/policyengine_us_data/utils/randomness.py index eac015227..001dbf2f8 100644 --- a/policyengine_us_data/utils/randomness.py +++ b/policyengine_us_data/utils/randomness.py @@ -11,9 +11,7 @@ def _stable_string_hash(s: str) -> np.uint64: Ported from policyengine_core.commons.formulas._stable_string_hash. """ with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", "overflow encountered", RuntimeWarning - ) + warnings.filterwarnings("ignore", "overflow encountered", RuntimeWarning) h = np.uint64(0) for byte in s.encode("utf-8"): h = h * np.uint64(31) + np.uint64(byte) diff --git a/policyengine_us_data/utils/soi.py b/policyengine_us_data/utils/soi.py index d9538addb..b9755c30f 100644 --- a/policyengine_us_data/utils/soi.py +++ b/policyengine_us_data/utils/soi.py @@ -11,9 +11,7 @@ def pe_to_soi(pe_dataset, year): pe_sim.default_calculation_period = year df = pd.DataFrame() - pe = lambda variable: np.array( - pe_sim.calculate(variable, map_to="tax_unit") - ) + pe = lambda variable: np.array(pe_sim.calculate(variable, map_to="tax_unit")) df["adjusted_gross_income"] = pe("adjusted_gross_income") df["exemption"] = pe("exemptions") @@ -51,12 +49,8 @@ def pe_to_soi(pe_dataset, year): df["total_pension_income"] = pe("pension_income") df["taxable_pension_income"] = pe("taxable_pension_income") df["qualified_dividends"] = pe("qualified_dividend_income") - df["rent_and_royalty_net_income"] = pe("rental_income") * ( - pe("rental_income") > 0 - ) - df["rent_and_royalty_net_losses"] = -pe("rental_income") * ( - pe("rental_income") < 0 - ) + df["rent_and_royalty_net_income"] = pe("rental_income") * (pe("rental_income") > 0) + df["rent_and_royalty_net_losses"] = -pe("rental_income") * (pe("rental_income") < 0) df["total_social_security"] = pe("social_security") df["taxable_social_security"] = pe("taxable_social_security") df["income_tax_before_credits"] = pe("income_tax_before_credits") @@ -176,8 +170,7 @@ def get_soi(year: int) -> pd.DataFrame: pe_name = uprating_map.get(variable) if pe_name in uprating.index: uprating_factors[variable] = ( - uprating.loc[pe_name, year] - / uprating.loc[pe_name, soi.Year.max()] + uprating.loc[pe_name, year] / uprating.loc[pe_name, soi.Year.max()] ) else: uprating_factors[variable] = ( @@ -218,9 +211,7 @@ def compare_soi_replication_to_soi(df, soi): elif fs == "Head of Household": subset = subset[subset.filing_status == "HEAD_OF_HOUSEHOLD"] elif fs == "Married Filing Jointly/Surviving Spouse": - subset = subset[ - subset.filing_status.isin(["JOINT", "SURVIVING_SPOUSE"]) - ] + subset = subset[subset.filing_status.isin(["JOINT", "SURVIVING_SPOUSE"])] elif fs == "Married Filing Separately": subset = subset[subset.filing_status == "SEPARATE"] @@ -258,17 +249,13 @@ def compare_soi_replication_to_soi(df, soi): } ) - soi_replication["Error"] = ( - soi_replication["Value"] - soi_replication["SOI Value"] - ) + soi_replication["Error"] = soi_replication["Value"] - soi_replication["SOI Value"] soi_replication["Absolute error"] = soi_replication["Error"].abs() soi_replication["Relative error"] = ( (soi_replication["Error"] / soi_replication["SOI Value"]) .replace([np.inf, -np.inf], np.nan) .fillna(0) ) - soi_replication["Absolute relative error"] = soi_replication[ - "Relative error" - ].abs() + soi_replication["Absolute relative error"] = soi_replication["Relative error"].abs() return soi_replication diff --git a/policyengine_us_data/utils/spm.py b/policyengine_us_data/utils/spm.py index b2e4538b5..ad3c9e9fb 100644 --- a/policyengine_us_data/utils/spm.py +++ b/policyengine_us_data/utils/spm.py @@ -44,9 +44,7 @@ def calculate_spm_thresholds_with_geoadj( for i in range(n): tenure_str = TENURE_CODE_MAP.get(int(tenure_codes[i]), "renter") base = base_thresholds[tenure_str] - equiv_scale = spm_equivalence_scale( - int(num_adults[i]), int(num_children[i]) - ) + equiv_scale = spm_equivalence_scale(int(num_adults[i]), int(num_children[i])) thresholds[i] = base * equiv_scale * geoadj[i] return thresholds diff --git a/policyengine_us_data/utils/uprating.py b/policyengine_us_data/utils/uprating.py index 6dd2f89ca..41d223b0b 100644 --- a/policyengine_us_data/utils/uprating.py +++ b/policyengine_us_data/utils/uprating.py @@ -23,9 +23,7 @@ def create_policyengine_uprating_factors_table(): parameter = system.parameters.get_child(variable.uprating) start_value = parameter(START_YEAR) for year in range(START_YEAR, END_YEAR + 1): - population_growth = population_size(year) / population_size( - START_YEAR - ) + population_growth = population_size(year) / population_size(START_YEAR) variable_names.append(variable.name) years.append(year) growth = parameter(year) / start_value diff --git a/pyproject.toml b/pyproject.toml index 95ada2a35..b9e309eb2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,7 @@ l0 = [ [dependency-groups] dev = [ - "black", + "ruff>=0.9.0", "pytest", "quantile-forest", "tabulate", @@ -82,24 +82,6 @@ testpaths = [ "policyengine_us_data/tests", ] -[tool.black] -line-length = 79 -target-version = ['py311', 'py312', 'py313'] -include = '\.pyi?$' -extend-exclude = ''' -/( - # directories - \.eggs - | \.git - | \.hg - | \.mypy_cache - | \.tox - | \.venv - | build - | dist -)/ -''' - [tool.towncrier] package = "policyengine_us_data" directory = "changelog.d" diff --git a/scripts/generate_test_data.py b/scripts/generate_test_data.py index 75025bca6..2f0de3080 100644 --- a/scripts/generate_test_data.py +++ b/scripts/generate_test_data.py @@ -46,9 +46,7 @@ def generate_synthetic_cps(n_households=1000, seed=42): "age": age, "sex": np.random.choice([1, 2]), # 1=male, 2=female "person_weight": np.random.uniform(1000, 3000), - "employment_income": ( - np.random.lognormal(10, 1.5) if age >= 18 else 0 - ), + "employment_income": (np.random.lognormal(10, 1.5) if age >= 18 else 0), "is_disabled": np.random.random() < 0.15, "role": role, } @@ -82,52 +80,32 @@ def generate_synthetic_puf(n_returns=10000, seed=43): for i in range(n_returns): # Income components (log-normal distributions) wages = np.random.lognormal(10.5, 1.2) - interest = ( - np.random.exponential(500) if np.random.random() < 0.3 else 0 - ) - dividends = ( - np.random.exponential(1000) if np.random.random() < 0.2 else 0 - ) + interest = np.random.exponential(500) if np.random.random() < 0.3 else 0 + dividends = np.random.exponential(1000) if np.random.random() < 0.2 else 0 business = np.random.lognormal(9, 2) if np.random.random() < 0.1 else 0 - cap_gains = ( - np.random.exponential(5000) if np.random.random() < 0.15 else 0 - ) + cap_gains = np.random.exponential(5000) if np.random.random() < 0.15 else 0 # Deductions - mortgage_int = ( - np.random.exponential(8000) if np.random.random() < 0.25 else 0 - ) - charity = ( - np.random.exponential(3000) if np.random.random() < 0.3 else 0 - ) + mortgage_int = np.random.exponential(8000) if np.random.random() < 0.25 else 0 + charity = np.random.exponential(3000) if np.random.random() < 0.3 else 0 salt = min(10000, wages * 0.05 + np.random.normal(0, 1000)) # Demographics (limited in PUF) - filing_status = np.random.choice( - [1, 2, 3, 4], p=[0.45, 0.40, 0.10, 0.05] - ) - num_deps = np.random.choice( - [0, 1, 2, 3, 4], p=[0.6, 0.15, 0.15, 0.08, 0.02] - ) + filing_status = np.random.choice([1, 2, 3, 4], p=[0.45, 0.40, 0.10, 0.05]) + num_deps = np.random.choice([0, 1, 2, 3, 4], p=[0.6, 0.15, 0.15, 0.08, 0.02]) return_data = { "return_id": i, "filing_status": filing_status, "num_dependents": num_deps, "age_primary": np.random.randint(18, 85), - "age_secondary": ( - np.random.randint(18, 85) if filing_status == 2 else 0 - ), + "age_secondary": (np.random.randint(18, 85) if filing_status == 2 else 0), "wages": wages, "interest": interest, "dividends": dividends, "business_income": business, "capital_gains": cap_gains, - "total_income": wages - + interest - + dividends - + business - + cap_gains, + "total_income": wages + interest + dividends + business + cap_gains, "mortgage_interest": mortgage_int, "charitable_deduction": charity, "salt_deduction": salt, diff --git a/scripts/migrate_versioned_to_production.py b/scripts/migrate_versioned_to_production.py index 5f99f74e3..1f2d7f447 100644 --- a/scripts/migrate_versioned_to_production.py +++ b/scripts/migrate_versioned_to_production.py @@ -93,9 +93,7 @@ def main(): parser.add_argument( "--execute", action="store_true", help="Actually perform the migration" ) - parser.add_argument( - "--gcs-only", action="store_true", help="Only migrate GCS" - ) + parser.add_argument("--gcs-only", action="store_true", help="Only migrate GCS") parser.add_argument( "--hf-only", action="store_true", help="Only migrate HuggingFace" ) diff --git a/tests/test_h6_reform.py b/tests/test_h6_reform.py index e68ed8db3..2acdd8ccf 100644 --- a/tests/test_h6_reform.py +++ b/tests/test_h6_reform.py @@ -27,17 +27,13 @@ def calculate_oasdi_thresholds(year: int) -> tuple[int, int]: return oasdi_single, oasdi_joint -def get_swapped_thresholds( - oasdi_threshold: int, hi_threshold: int -) -> tuple[int, int]: +def get_swapped_thresholds(oasdi_threshold: int, hi_threshold: int) -> tuple[int, int]: """ Apply min/max swap to handle threshold crossover. Returns (base_threshold, adjusted_threshold) where base <= adjusted. """ - return min(oasdi_threshold, hi_threshold), max( - oasdi_threshold, hi_threshold - ) + return min(oasdi_threshold, hi_threshold), max(oasdi_threshold, hi_threshold) def needs_crossover_swap(oasdi_threshold: int, hi_threshold: int) -> bool: @@ -145,9 +141,7 @@ def test_single_crossover_starts_2046(self): # 2046+: crossover for year in range(2046, 2054): oasdi_single, _ = calculate_oasdi_thresholds(year) - assert needs_crossover_swap( - oasdi_single, HI_SINGLE - ), f"Year {year}" + assert needs_crossover_swap(oasdi_single, HI_SINGLE), f"Year {year}" class TestH6ThresholdSwapping: @@ -211,9 +205,9 @@ def test_2045_error_analysis(self): assert single_error_swapped == pytest.approx(225) assert joint_error_default == pytest.approx(3_150) - assert joint_error_default / single_error_swapped == pytest.approx( - 14.0 - ), "Swapped rates should have 14x less error" + assert joint_error_default / single_error_swapped == pytest.approx(14.0), ( + "Swapped rates should have 14x less error" + ) def test_swapped_rates_align_with_tax_cut_intent(self): """Swapped rates undertax (not overtax), aligning with reform intent.""" diff --git a/tests/test_no_formula_variables_stored.py b/tests/test_no_formula_variables_stored.py index 9334a5c78..7c7cb0de5 100644 --- a/tests/test_no_formula_variables_stored.py +++ b/tests/test_no_formula_variables_stored.py @@ -109,11 +109,7 @@ def test_stored_values_match_computed( computed_total = np.sum(computed.astype(float)) if abs(stored_total) > 0: - pct_diff = ( - abs(stored_total - computed_total) - / abs(stored_total) - * 100 - ) + pct_diff = abs(stored_total - computed_total) / abs(stored_total) * 100 else: pct_diff = 0 @@ -141,23 +137,13 @@ def test_ss_subcomponents_sum_to_computed_total(sim, dataset_path): stored in the dataset sum to the simulation's computed total. """ with h5py.File(dataset_path, "r") as f: - ss_retirement = f["social_security_retirement"]["2024"][...].astype( - float - ) - ss_disability = f["social_security_disability"]["2024"][...].astype( - float - ) - ss_survivors = f["social_security_survivors"]["2024"][...].astype( - float - ) - ss_dependents = f["social_security_dependents"]["2024"][...].astype( - float - ) + ss_retirement = f["social_security_retirement"]["2024"][...].astype(float) + ss_disability = f["social_security_disability"]["2024"][...].astype(float) + ss_survivors = f["social_security_survivors"]["2024"][...].astype(float) + ss_dependents = f["social_security_dependents"]["2024"][...].astype(float) sub_sum = ss_retirement + ss_disability + ss_survivors + ss_dependents - computed_total = np.array(sim.calculate("social_security", 2024)).astype( - float - ) + computed_total = np.array(sim.calculate("social_security", 2024)).astype(float) # Only check records that have any SS income has_ss = computed_total > 0 diff --git a/tests/test_reproducibility.py b/tests/test_reproducibility.py index 1ec097a7b..25755f0a6 100644 --- a/tests/test_reproducibility.py +++ b/tests/test_reproducibility.py @@ -144,9 +144,9 @@ def test_output_checksums(self): if file_path.exists() and filename != "checksums.txt": with open(file_path, "rb") as f: actual_checksum = hashlib.sha256(f.read()).hexdigest() - assert ( - actual_checksum == expected_checksum - ), f"Checksum mismatch for {filename}" + assert actual_checksum == expected_checksum, ( + f"Checksum mismatch for {filename}" + ) def test_memory_usage(self): """Test that memory usage stays within bounds.""" diff --git a/tests/test_weeks_unemployed.py b/tests/test_weeks_unemployed.py index 18aa47629..d64d8b64c 100644 --- a/tests/test_weeks_unemployed.py +++ b/tests/test_weeks_unemployed.py @@ -21,9 +21,9 @@ def test_lkweeks_in_person_columns(self): # Check for correct variable assert '"LKWEEKS"' in content, "LKWEEKS should be in PERSON_COLUMNS" - assert ( - '"WKSUNEM"' not in content - ), "WKSUNEM should not be in PERSON_COLUMNS (Census uses LKWEEKS)" + assert '"WKSUNEM"' not in content, ( + "WKSUNEM should not be in PERSON_COLUMNS (Census uses LKWEEKS)" + ) def test_cps_uses_lkweeks(self): """Test that cps.py uses LKWEEKS, not WKSUNEM.""" diff --git a/uv.lock b/uv.lock index 11179f708..044161b89 100644 --- a/uv.lock +++ b/uv.lock @@ -167,33 +167,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1a/39/47f9197bdd44df24d67ac8893641e16f386c984a0619ef2ee4c51fbbc019/beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb", size = 107721, upload-time = "2025-11-30T15:08:24.087Z" }, ] -[[package]] -name = "black" -version = "25.12.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "click" }, - { name = "mypy-extensions" }, - { name = "packaging" }, - { name = "pathspec" }, - { name = "platformdirs" }, - { name = "pytokens" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/c4/d9/07b458a3f1c525ac392b5edc6b191ff140b596f9d77092429417a54e249d/black-25.12.0.tar.gz", hash = "sha256:8d3dd9cea14bff7ddc0eb243c811cdb1a011ebb4800a5f0335a01a68654796a7", size = 659264, upload-time = "2025-12-08T01:40:52.501Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/bd/26083f805115db17fda9877b3c7321d08c647df39d0df4c4ca8f8450593e/black-25.12.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:31f96b7c98c1ddaeb07dc0f56c652e25bdedaac76d5b68a059d998b57c55594a", size = 1924178, upload-time = "2025-12-08T01:49:51.048Z" }, - { url = "https://files.pythonhosted.org/packages/89/6b/ea00d6651561e2bdd9231c4177f4f2ae19cc13a0b0574f47602a7519b6ca/black-25.12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:05dd459a19e218078a1f98178c13f861fe6a9a5f88fc969ca4d9b49eb1809783", size = 1742643, upload-time = "2025-12-08T01:49:59.09Z" }, - { url = "https://files.pythonhosted.org/packages/6d/f3/360fa4182e36e9875fabcf3a9717db9d27a8d11870f21cff97725c54f35b/black-25.12.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c1f68c5eff61f226934be6b5b80296cf6939e5d2f0c2f7d543ea08b204bfaf59", size = 1800158, upload-time = "2025-12-08T01:44:27.301Z" }, - { url = "https://files.pythonhosted.org/packages/f8/08/2c64830cb6616278067e040acca21d4f79727b23077633953081c9445d61/black-25.12.0-cp312-cp312-win_amd64.whl", hash = "sha256:274f940c147ddab4442d316b27f9e332ca586d39c85ecf59ebdea82cc9ee8892", size = 1426197, upload-time = "2025-12-08T01:45:51.198Z" }, - { url = "https://files.pythonhosted.org/packages/d4/60/a93f55fd9b9816b7432cf6842f0e3000fdd5b7869492a04b9011a133ee37/black-25.12.0-cp312-cp312-win_arm64.whl", hash = "sha256:169506ba91ef21e2e0591563deda7f00030cb466e747c4b09cb0a9dae5db2f43", size = 1237266, upload-time = "2025-12-08T01:45:10.556Z" }, - { url = "https://files.pythonhosted.org/packages/c8/52/c551e36bc95495d2aa1a37d50566267aa47608c81a53f91daa809e03293f/black-25.12.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a05ddeb656534c3e27a05a29196c962877c83fa5503db89e68857d1161ad08a5", size = 1923809, upload-time = "2025-12-08T01:46:55.126Z" }, - { url = "https://files.pythonhosted.org/packages/a0/f7/aac9b014140ee56d247e707af8db0aae2e9efc28d4a8aba92d0abd7ae9d1/black-25.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9ec77439ef3e34896995503865a85732c94396edcc739f302c5673a2315e1e7f", size = 1742384, upload-time = "2025-12-08T01:49:37.022Z" }, - { url = "https://files.pythonhosted.org/packages/74/98/38aaa018b2ab06a863974c12b14a6266badc192b20603a81b738c47e902e/black-25.12.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0e509c858adf63aa61d908061b52e580c40eae0dfa72415fa47ac01b12e29baf", size = 1798761, upload-time = "2025-12-08T01:46:05.386Z" }, - { url = "https://files.pythonhosted.org/packages/16/3a/a8ac542125f61574a3f015b521ca83b47321ed19bb63fe6d7560f348bfe1/black-25.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:252678f07f5bac4ff0d0e9b261fbb029fa530cfa206d0a636a34ab445ef8ca9d", size = 1429180, upload-time = "2025-12-08T01:45:34.903Z" }, - { url = "https://files.pythonhosted.org/packages/e6/2d/bdc466a3db9145e946762d52cd55b1385509d9f9004fec1c97bdc8debbfb/black-25.12.0-cp313-cp313-win_arm64.whl", hash = "sha256:bc5b1c09fe3c931ddd20ee548511c64ebf964ada7e6f0763d443947fd1c603ce", size = 1239350, upload-time = "2025-12-08T01:46:09.458Z" }, - { url = "https://files.pythonhosted.org/packages/68/11/21331aed19145a952ad28fca2756a1433ee9308079bd03bd898e903a2e53/black-25.12.0-py3-none-any.whl", hash = "sha256:48ceb36c16dbc84062740049eef990bb2ce07598272e673c17d1a7720c71c828", size = 206191, upload-time = "2025-12-08T01:40:50.963Z" }, -] - [[package]] name = "bleach" version = "6.3.0" @@ -637,6 +610,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f8/0a/a3871375c7b9727edaeeea994bfff7c63ff7804c9829c19309ba2e058807/greenlet-3.3.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:b01548f6e0b9e9784a2c99c5651e5dc89ffcbe870bc5fb2e5ef864e9cc6b5dcb", size = 276379, upload-time = "2025-12-04T14:23:30.498Z" }, { url = "https://files.pythonhosted.org/packages/43/ab/7ebfe34dce8b87be0d11dae91acbf76f7b8246bf9d6b319c741f99fa59c6/greenlet-3.3.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:349345b770dc88f81506c6861d22a6ccd422207829d2c854ae2af8025af303e3", size = 597294, upload-time = "2025-12-04T14:50:06.847Z" }, { url = "https://files.pythonhosted.org/packages/a4/39/f1c8da50024feecd0793dbd5e08f526809b8ab5609224a2da40aad3a7641/greenlet-3.3.0-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e8e18ed6995e9e2c0b4ed264d2cf89260ab3ac7e13555b8032b25a74c6d18655", size = 607742, upload-time = "2025-12-04T14:57:42.349Z" }, + { url = "https://files.pythonhosted.org/packages/77/cb/43692bcd5f7a0da6ec0ec6d58ee7cddb606d055ce94a62ac9b1aa481e969/greenlet-3.3.0-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c024b1e5696626890038e34f76140ed1daf858e37496d33f2af57f06189e70d7", size = 622297, upload-time = "2025-12-04T15:07:13.552Z" }, { url = "https://files.pythonhosted.org/packages/75/b0/6bde0b1011a60782108c01de5913c588cf51a839174538d266de15e4bf4d/greenlet-3.3.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:047ab3df20ede6a57c35c14bf5200fcf04039d50f908270d3f9a7a82064f543b", size = 609885, upload-time = "2025-12-04T14:26:02.368Z" }, { url = "https://files.pythonhosted.org/packages/49/0e/49b46ac39f931f59f987b7cd9f34bfec8ef81d2a1e6e00682f55be5de9f4/greenlet-3.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2d9ad37fc657b1102ec880e637cccf20191581f75c64087a549e66c57e1ceb53", size = 1567424, upload-time = "2025-12-04T15:04:23.757Z" }, { url = "https://files.pythonhosted.org/packages/05/f5/49a9ac2dff7f10091935def9165c90236d8f175afb27cbed38fb1d61ab6b/greenlet-3.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83cd0e36932e0e7f36a64b732a6f60c2fc2df28c351bae79fbaf4f8092fe7614", size = 1636017, upload-time = "2025-12-04T14:27:29.688Z" }, @@ -644,6 +618,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/02/2f/28592176381b9ab2cafa12829ba7b472d177f3acc35d8fbcf3673d966fff/greenlet-3.3.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:a1e41a81c7e2825822f4e068c48cb2196002362619e2d70b148f20a831c00739", size = 275140, upload-time = "2025-12-04T14:23:01.282Z" }, { url = "https://files.pythonhosted.org/packages/2c/80/fbe937bf81e9fca98c981fe499e59a3f45df2a04da0baa5c2be0dca0d329/greenlet-3.3.0-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9f515a47d02da4d30caaa85b69474cec77b7929b2e936ff7fb853d42f4bf8808", size = 599219, upload-time = "2025-12-04T14:50:08.309Z" }, { url = "https://files.pythonhosted.org/packages/c2/ff/7c985128f0514271b8268476af89aee6866df5eec04ac17dcfbc676213df/greenlet-3.3.0-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7d2d9fd66bfadf230b385fdc90426fcd6eb64db54b40c495b72ac0feb5766c54", size = 610211, upload-time = "2025-12-04T14:57:43.968Z" }, + { url = "https://files.pythonhosted.org/packages/79/07/c47a82d881319ec18a4510bb30463ed6891f2ad2c1901ed5ec23d3de351f/greenlet-3.3.0-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:30a6e28487a790417d036088b3bcb3f3ac7d8babaa7d0139edbaddebf3af9492", size = 624311, upload-time = "2025-12-04T15:07:14.697Z" }, { url = "https://files.pythonhosted.org/packages/fd/8e/424b8c6e78bd9837d14ff7df01a9829fc883ba2ab4ea787d4f848435f23f/greenlet-3.3.0-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:087ea5e004437321508a8d6f20efc4cfec5e3c30118e1417ea96ed1d93950527", size = 612833, upload-time = "2025-12-04T14:26:03.669Z" }, { url = "https://files.pythonhosted.org/packages/b5/ba/56699ff9b7c76ca12f1cdc27a886d0f81f2189c3455ff9f65246780f713d/greenlet-3.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ab97cf74045343f6c60a39913fa59710e4bd26a536ce7ab2397adf8b27e67c39", size = 1567256, upload-time = "2025-12-04T15:04:25.276Z" }, { url = "https://files.pythonhosted.org/packages/1e/37/f31136132967982d698c71a281a8901daf1a8fbab935dce7c0cf15f942cc/greenlet-3.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5375d2e23184629112ca1ea89a53389dddbffcf417dad40125713d88eb5f96e8", size = 1636483, upload-time = "2025-12-04T14:27:30.804Z" }, @@ -1252,15 +1227,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e5/db/0314e4e2db56ebcf450f277904ffd84a7988b9e5da8d0d61ab2d057df2b6/msgpack-1.1.2-cp313-cp313-win_arm64.whl", hash = "sha256:e69b39f8c0aa5ec24b57737ebee40be647035158f14ed4b40e6f150077e21a84", size = 64118, upload-time = "2025-10-08T09:15:23.402Z" }, ] -[[package]] -name = "mypy-extensions" -version = "1.1.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a2/6e/371856a3fb9d31ca8dac321cda606860fa4548858c0cc45d9d1d4ca2628b/mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558", size = 6343, upload-time = "2025-04-22T14:54:24.164Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", size = 4963, upload-time = "2025-04-22T14:54:22.983Z" }, -] - [[package]] name = "mystmd" version = "1.7.1" @@ -1697,15 +1663,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/78/f9/690a8600b93c332de3ab4a344a4ac34f00c8f104917061f779db6a918ed6/pathlib-1.0.1-py3-none-any.whl", hash = "sha256:f35f95ab8b0f59e6d354090350b44a80a80635d22efdedfa84c7ad1cf0a74147", size = 14363, upload-time = "2022-05-04T13:37:20.585Z" }, ] -[[package]] -name = "pathspec" -version = "1.0.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/28/2e/83722ece0f6ee24387d6cb830dd562ddbcd6ce0b9d76072c6849670c31b4/pathspec-1.0.1.tar.gz", hash = "sha256:e2769b508d0dd47b09af6ee2c75b2744a2cb1f474ae4b1494fd6a1b7a841613c", size = 129791, upload-time = "2026-01-06T13:02:55.15Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d2/fe/2257c71721aeab6a6e8aa1f00d01f2a20f58547d249a6c8fef5791f559fc/pathspec-1.0.1-py3-none-any.whl", hash = "sha256:8870061f22c58e6d83463cfce9a7dd6eca0512c772c1001fb09ac64091816721", size = 54584, upload-time = "2026-01-06T13:02:53.601Z" }, -] - [[package]] name = "patsy" version = "1.0.2" @@ -1894,7 +1851,6 @@ l0 = [ [package.dev-dependencies] dev = [ - { name = "black" }, { name = "build" }, { name = "furo" }, { name = "itables" }, @@ -1902,6 +1858,7 @@ dev = [ { name = "mystmd" }, { name = "pytest" }, { name = "quantile-forest" }, + { name = "ruff" }, { name = "tabulate" }, { name = "tomli" }, { name = "towncrier" }, @@ -1939,7 +1896,6 @@ provides-extras = ["calibration", "l0"] [package.metadata.requires-dev] dev = [ - { name = "black" }, { name = "build" }, { name = "furo" }, { name = "itables" }, @@ -1947,6 +1903,7 @@ dev = [ { name = "mystmd", specifier = ">=1.7.0" }, { name = "pytest" }, { name = "quantile-forest" }, + { name = "ruff", specifier = ">=0.9.0" }, { name = "tabulate" }, { name = "tomli" }, { name = "towncrier", specifier = ">=24.8.0" }, @@ -2215,15 +2172,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/51/e5/fecf13f06e5e5f67e8837d777d1bc43fac0ed2b77a676804df5c34744727/python_json_logger-4.0.0-py3-none-any.whl", hash = "sha256:af09c9daf6a813aa4cc7180395f50f2a9e5fa056034c9953aec92e381c5ba1e2", size = 15548, upload-time = "2025-10-06T04:15:17.553Z" }, ] -[[package]] -name = "pytokens" -version = "0.3.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/4e/8d/a762be14dae1c3bf280202ba3172020b2b0b4c537f94427435f19c413b72/pytokens-0.3.0.tar.gz", hash = "sha256:2f932b14ed08de5fcf0b391ace2642f858f1394c0857202959000b68ed7a458a", size = 17644, upload-time = "2025-11-05T13:36:35.34Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/84/25/d9db8be44e205a124f6c98bc0324b2bb149b7431c53877fc6d1038dddaf5/pytokens-0.3.0-py3-none-any.whl", hash = "sha256:95b2b5eaf832e469d141a378872480ede3f251a5a5041b8ec6e581d3ac71bbf3", size = 12195, upload-time = "2025-11-05T13:36:33.183Z" }, -] - [[package]] name = "pytz" version = "2025.2" @@ -2477,6 +2425,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696, upload-time = "2025-04-16T09:51:17.142Z" }, ] +[[package]] +name = "ruff" +version = "0.15.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/77/9b/840e0039e65fcf12758adf684d2289024d6140cde9268cc59887dc55189c/ruff-0.15.5.tar.gz", hash = "sha256:7c3601d3b6d76dce18c5c824fc8d06f4eef33d6df0c21ec7799510cde0f159a2", size = 4574214, upload-time = "2026-03-05T20:06:34.946Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/47/20/5369c3ce21588c708bcbe517a8fbe1a8dfdb5dfd5137e14790b1da71612c/ruff-0.15.5-py3-none-linux_armv6l.whl", hash = "sha256:4ae44c42281f42e3b06b988e442d344a5b9b72450ff3c892e30d11b29a96a57c", size = 10478185, upload-time = "2026-03-05T20:06:29.093Z" }, + { url = "https://files.pythonhosted.org/packages/44/ed/e81dd668547da281e5dce710cf0bc60193f8d3d43833e8241d006720e42b/ruff-0.15.5-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:6edd3792d408ebcf61adabc01822da687579a1a023f297618ac27a5b51ef0080", size = 10859201, upload-time = "2026-03-05T20:06:32.632Z" }, + { url = "https://files.pythonhosted.org/packages/c4/8f/533075f00aaf19b07c5cd6aa6e5d89424b06b3b3f4583bfa9c640a079059/ruff-0.15.5-py3-none-macosx_11_0_arm64.whl", hash = "sha256:89f463f7c8205a9f8dea9d658d59eff49db05f88f89cc3047fb1a02d9f344010", size = 10184752, upload-time = "2026-03-05T20:06:40.312Z" }, + { url = "https://files.pythonhosted.org/packages/66/0e/ba49e2c3fa0395b3152bad634c7432f7edfc509c133b8f4529053ff024fb/ruff-0.15.5-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ba786a8295c6574c1116704cf0b9e6563de3432ac888d8f83685654fe528fd65", size = 10534857, upload-time = "2026-03-05T20:06:19.581Z" }, + { url = "https://files.pythonhosted.org/packages/59/71/39234440f27a226475a0659561adb0d784b4d247dfe7f43ffc12dd02e288/ruff-0.15.5-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fd4b801e57955fe9f02b31d20375ab3a5c4415f2e5105b79fb94cf2642c91440", size = 10309120, upload-time = "2026-03-05T20:06:00.435Z" }, + { url = "https://files.pythonhosted.org/packages/f5/87/4140aa86a93df032156982b726f4952aaec4a883bb98cb6ef73c347da253/ruff-0.15.5-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:391f7c73388f3d8c11b794dbbc2959a5b5afe66642c142a6effa90b45f6f5204", size = 11047428, upload-time = "2026-03-05T20:05:51.867Z" }, + { url = "https://files.pythonhosted.org/packages/5a/f7/4953e7e3287676f78fbe85e3a0ca414c5ca81237b7575bdadc00229ac240/ruff-0.15.5-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8dc18f30302e379fe1e998548b0f5e9f4dff907f52f73ad6da419ea9c19d66c8", size = 11914251, upload-time = "2026-03-05T20:06:22.887Z" }, + { url = "https://files.pythonhosted.org/packages/77/46/0f7c865c10cf896ccf5a939c3e84e1cfaeed608ff5249584799a74d33835/ruff-0.15.5-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1cc6e7f90087e2d27f98dc34ed1b3ab7c8f0d273cc5431415454e22c0bd2a681", size = 11333801, upload-time = "2026-03-05T20:05:57.168Z" }, + { url = "https://files.pythonhosted.org/packages/d3/01/a10fe54b653061585e655f5286c2662ebddb68831ed3eaebfb0eb08c0a16/ruff-0.15.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1cb7169f53c1ddb06e71a9aebd7e98fc0fea936b39afb36d8e86d36ecc2636a", size = 11206821, upload-time = "2026-03-05T20:06:03.441Z" }, + { url = "https://files.pythonhosted.org/packages/7a/0d/2132ceaf20c5e8699aa83da2706ecb5c5dcdf78b453f77edca7fb70f8a93/ruff-0.15.5-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:9b037924500a31ee17389b5c8c4d88874cc6ea8e42f12e9c61a3d754ff72f1ca", size = 11133326, upload-time = "2026-03-05T20:06:25.655Z" }, + { url = "https://files.pythonhosted.org/packages/72/cb/2e5259a7eb2a0f87c08c0fe5bf5825a1e4b90883a52685524596bfc93072/ruff-0.15.5-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:65bb414e5b4eadd95a8c1e4804f6772bbe8995889f203a01f77ddf2d790929dd", size = 10510820, upload-time = "2026-03-05T20:06:37.79Z" }, + { url = "https://files.pythonhosted.org/packages/ff/20/b67ce78f9e6c59ffbdb5b4503d0090e749b5f2d31b599b554698a80d861c/ruff-0.15.5-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:d20aa469ae3b57033519c559e9bc9cd9e782842e39be05b50e852c7c981fa01d", size = 10302395, upload-time = "2026-03-05T20:05:54.504Z" }, + { url = "https://files.pythonhosted.org/packages/5f/e5/719f1acccd31b720d477751558ed74e9c88134adcc377e5e886af89d3072/ruff-0.15.5-py3-none-musllinux_1_2_i686.whl", hash = "sha256:15388dd28c9161cdb8eda68993533acc870aa4e646a0a277aa166de9ad5a8752", size = 10754069, upload-time = "2026-03-05T20:06:06.422Z" }, + { url = "https://files.pythonhosted.org/packages/c3/9c/d1db14469e32d98f3ca27079dbd30b7b44dbb5317d06ab36718dee3baf03/ruff-0.15.5-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:b30da330cbd03bed0c21420b6b953158f60c74c54c5f4c1dabbdf3a57bf355d2", size = 11304315, upload-time = "2026-03-05T20:06:10.867Z" }, + { url = "https://files.pythonhosted.org/packages/28/3a/950367aee7c69027f4f422059227b290ed780366b6aecee5de5039d50fa8/ruff-0.15.5-py3-none-win32.whl", hash = "sha256:732e5ee1f98ba5b3679029989a06ca39a950cced52143a0ea82a2102cb592b74", size = 10551676, upload-time = "2026-03-05T20:06:13.705Z" }, + { url = "https://files.pythonhosted.org/packages/b8/00/bf077a505b4e649bdd3c47ff8ec967735ce2544c8e4a43aba42ee9bf935d/ruff-0.15.5-py3-none-win_amd64.whl", hash = "sha256:821d41c5fa9e19117616c35eaa3f4b75046ec76c65e7ae20a333e9a8696bc7fe", size = 11678972, upload-time = "2026-03-05T20:06:45.379Z" }, + { url = "https://files.pythonhosted.org/packages/fe/4e/cd76eca6db6115604b7626668e891c9dd03330384082e33662fb0f113614/ruff-0.15.5-py3-none-win_arm64.whl", hash = "sha256:b498d1c60d2fe5c10c45ec3f698901065772730b411f164ae270bb6bfcc4740b", size = 10965572, upload-time = "2026-03-05T20:06:16.984Z" }, +] + [[package]] name = "samplics" version = "0.4.55" diff --git a/validation/benefit_validation.py b/validation/benefit_validation.py index d614ae032..cf4689720 100644 --- a/validation/benefit_validation.py +++ b/validation/benefit_validation.py @@ -50,9 +50,7 @@ def analyze_benefit_underreporting(): # Participation participants = (benefit > 0).sum() - weighted_participants = ( - (benefit > 0) * weight - ).sum() / 1e6 # millions + weighted_participants = ((benefit > 0) * weight).sum() / 1e6 # millions # Underreporting factor underreporting = info["admin_total"] / total if total > 0 else np.inf @@ -168,9 +166,7 @@ def earnings_reform(parameters): earnings_change = earnings * pct_increase / 100 net_change = reformed_net - original_net - emtr = np.where( - earnings_change > 0, 1 - (net_change / earnings_change), 0 - ) + emtr = np.where(earnings_change > 0, 1 - (net_change / earnings_change), 0) # Focus on sample sample_emtr = emtr[sample] @@ -254,9 +250,7 @@ def analyze_aca_subsidies(): total_ptc = (ptc[mask] * weight[mask]).sum() / 1e9 recipients = ((ptc > 0) & mask).sum() weighted_recipients = (((ptc > 0) & mask) * weight).sum() / 1e6 - mean_ptc = ( - ptc[(ptc > 0) & mask].mean() if ((ptc > 0) & mask).any() else 0 - ) + mean_ptc = ptc[(ptc > 0) & mask].mean() if ((ptc > 0) & mask).any() else 0 results.append( { @@ -307,9 +301,7 @@ def generate_benefit_validation_report(): print("\n\n4. Top 10 States by SNAP Benefits") print("-" * 40) state_df = validate_state_benefits() - top_states = state_df.nlargest(10, "snap_billions")[ - ["state_code", "snap_billions"] - ] + top_states = state_df.nlargest(10, "snap_billions")[["state_code", "snap_billions"]] print(top_states.to_string(index=False)) # ACA analysis @@ -319,9 +311,7 @@ def generate_benefit_validation_report(): print(aca_df.to_string(index=False)) # Save results - underreporting_df.to_csv( - "validation/benefit_underreporting.csv", index=False - ) + underreporting_df.to_csv("validation/benefit_underreporting.csv", index=False) interactions_df.to_csv("validation/program_interactions.csv", index=False) emtr_df.to_csv("validation/effective_marginal_tax_rates.csv", index=False) state_df.to_csv("validation/state_benefit_totals.csv", index=False) diff --git a/validation/generate_qrf_statistics.py b/validation/generate_qrf_statistics.py index 87d43a54a..4015fe1ed 100644 --- a/validation/generate_qrf_statistics.py +++ b/validation/generate_qrf_statistics.py @@ -222,18 +222,14 @@ print(support_df.round(3).to_string()) print("\nSummary:") -print( - f"- Average overlap coefficient: {support_df['overlap_coefficient'].mean():.3f}" -) +print(f"- Average overlap coefficient: {support_df['overlap_coefficient'].mean():.3f}") print( f"- All overlap coefficients > 0.85: {(support_df['overlap_coefficient'] > 0.85).all()}" ) print( f"- Variables with SMD > 0.25: {(support_df['standardized_mean_diff'] > 0.25).sum()}" ) -print( - f"- All SMDs < 0.25: {(support_df['standardized_mean_diff'] < 0.25).all()}" -) +print(f"- All SMDs < 0.25: {(support_df['standardized_mean_diff'] < 0.25).all()}") print( f"- Variables with significant KS test (p<0.05): {(support_df['ks_pvalue'] < 0.05).sum()}" ) @@ -243,7 +239,7 @@ print("\n\n2. VARIANCE EXPLAINED BY PREDICTORS") print("-" * 40) for var, r2 in variance_explained.items(): - print(f"- {var.replace('_', ' ').title()}: {r2*100:.0f}%") + print(f"- {var.replace('_', ' ').title()}: {r2 * 100:.0f}%") # 3. Out-of-Sample Accuracy print("\n\n3. OUT-OF-SAMPLE PREDICTION ACCURACY") @@ -279,9 +275,7 @@ print( f"- All correlation differences < 0.05: {(joint_df['correlation_diff'] < 0.05).all()}" ) -print( - f"- Average correlation difference: {joint_df['correlation_diff'].mean():.3f}" -) +print(f"- Average correlation difference: {joint_df['correlation_diff'].mean():.3f}") # Save all results print("\n\nSAVING RESULTS...") @@ -294,9 +288,7 @@ ) accuracy_df.to_csv("validation/outputs/qrf_accuracy_metrics.csv") -print( - "✓ Saved accuracy metrics to validation/outputs/qrf_accuracy_metrics.csv" -) +print("✓ Saved accuracy metrics to validation/outputs/qrf_accuracy_metrics.csv") joint_df.to_csv("validation/outputs/joint_distribution_tests.csv", index=False) print( @@ -308,10 +300,8 @@ f.write("Variance Explained by Predictors (R-squared)\n") f.write("=" * 40 + "\n\n") for var, r2 in variance_explained.items(): - f.write(f"{var.replace('_', ' ').title()}: {r2*100:.0f}%\n") -print( - "✓ Saved variance explained to validation/outputs/variance_explained.txt" -) + f.write(f"{var.replace('_', ' ').title()}: {r2 * 100:.0f}%\n") +print("✓ Saved variance explained to validation/outputs/variance_explained.txt") # Create summary report with open("validation/outputs/qrf_diagnostics_summary.txt", "w") as f: @@ -327,17 +317,13 @@ f.write( f"All overlap coefficients > 0.85: {(support_df['overlap_coefficient'] > 0.85).all()}\n" ) - f.write( - f"All SMDs < 0.25: {(support_df['standardized_mean_diff'] < 0.25).all()}\n" - ) - f.write( - f"All KS tests p > 0.05: {(support_df['ks_pvalue'] > 0.05).all()}\n\n" - ) + f.write(f"All SMDs < 0.25: {(support_df['standardized_mean_diff'] < 0.25).all()}\n") + f.write(f"All KS tests p > 0.05: {(support_df['ks_pvalue'] > 0.05).all()}\n\n") f.write("2. VARIANCE EXPLAINED\n") f.write("-" * 40 + "\n") for var, r2 in variance_explained.items(): - f.write(f"{var.replace('_', ' ').title()}: {r2*100:.0f}%\n") + f.write(f"{var.replace('_', ' ').title()}: {r2 * 100:.0f}%\n") f.write("\n3. OUT-OF-SAMPLE ACCURACY\n") f.write("-" * 40 + "\n") @@ -361,9 +347,7 @@ ) f.write("\n" + "=" * 60 + "\n") - f.write( - "These statistics demonstrate that the QRF methodology successfully:\n" - ) + f.write("These statistics demonstrate that the QRF methodology successfully:\n") f.write("- Maintains strong common support between datasets\n") f.write("- Achieves high predictive accuracy for imputation\n") f.write("- Preserves joint distributions of variables\n") diff --git a/validation/qrf_diagnostics.py b/validation/qrf_diagnostics.py index dcd23b5ac..d22f883c1 100644 --- a/validation/qrf_diagnostics.py +++ b/validation/qrf_diagnostics.py @@ -28,9 +28,7 @@ def analyze_common_support(cps_data, puf_data, predictors): # Overlap coefficient (Weitzman 1970) # OVL = sum(min(f(x), g(x))) where f,g are densities - bins = np.histogram_bin_edges( - np.concatenate([cps_dist, puf_dist]), bins=50 - ) + bins = np.histogram_bin_edges(np.concatenate([cps_dist, puf_dist]), bins=50) cps_hist, _ = np.histogram(cps_dist, bins=bins, density=True) puf_hist, _ = np.histogram(puf_dist, bins=bins, density=True) @@ -81,9 +79,7 @@ def validate_qrf_accuracy(puf_data, predictors, target_vars, n_estimators=100): ) # Fit QRF - qrf = RandomForestQuantileRegressor( - n_estimators=n_estimators, random_state=42 - ) + qrf = RandomForestQuantileRegressor(n_estimators=n_estimators, random_state=42) qrf.fit(X_train, y_train) # Predictions at multiple quantiles @@ -92,7 +88,7 @@ def validate_qrf_accuracy(puf_data, predictors, target_vars, n_estimators=100): for q in quantiles: pred = qrf.predict(X_test, quantiles=[q]) - predictions[f"q{int(q*100)}"] = pred.flatten() + predictions[f"q{int(q * 100)}"] = pred.flatten() # Calculate metrics median_pred = predictions["q50"] @@ -124,9 +120,7 @@ def validate_qrf_accuracy(puf_data, predictors, target_vars, n_estimators=100): "qrf_rmse": rmse, "hotdeck_mae": hotdeck_mae, "linear_mae": lr_mae, - "qrf_improvement_vs_hotdeck": (hotdeck_mae - mae) - / hotdeck_mae - * 100, + "qrf_improvement_vs_hotdeck": (hotdeck_mae - mae) / hotdeck_mae * 100, "qrf_improvement_vs_linear": (lr_mae - mae) / lr_mae * 100, "coverage_90pct": coverage_90, "coverage_50pct": coverage_50, @@ -135,9 +129,7 @@ def validate_qrf_accuracy(puf_data, predictors, target_vars, n_estimators=100): return pd.DataFrame(results).T -def test_joint_distribution_preservation( - original_data, imputed_data, var_pairs -): +def test_joint_distribution_preservation(original_data, imputed_data, var_pairs): """Test whether joint distributions are preserved in imputation.""" results = [] @@ -159,12 +151,12 @@ def test_joint_distribution_preservation( # Joint distribution test (2D KS test approximation) # Using average of marginal KS statistics - ks1 = stats.ks_2samp( - original_data[var1].dropna(), imputed_data[var1].dropna() - )[0] - ks2 = stats.ks_2samp( - original_data[var2].dropna(), imputed_data[var2].dropna() - )[0] + ks1 = stats.ks_2samp(original_data[var1].dropna(), imputed_data[var1].dropna())[ + 0 + ] + ks2 = stats.ks_2samp(original_data[var2].dropna(), imputed_data[var2].dropna())[ + 0 + ] joint_ks = (ks1 + ks2) / 2 results.append( @@ -281,9 +273,7 @@ def generate_qrf_diagnostic_report(cps_data, puf_data, imputed_data): print( f"- Average QRF improvement vs linear: {accuracy_df['qrf_improvement_vs_linear'].mean():.1f}%" ) - print( - f"- Average 90% coverage: {accuracy_df['coverage_90pct'].mean():.3f}" - ) + print(f"- Average 90% coverage: {accuracy_df['coverage_90pct'].mean():.3f}") # Joint distribution preservation print("\n\n3. Joint Distribution Preservation") @@ -295,16 +285,12 @@ def generate_qrf_diagnostic_report(cps_data, puf_data, imputed_data): ("pension_income", "social_security"), ] - joint_df = test_joint_distribution_preservation( - puf_data, imputed_data, var_pairs - ) + joint_df = test_joint_distribution_preservation(puf_data, imputed_data, var_pairs) print(joint_df.to_string(index=False)) # Create diagnostic plots create_diagnostic_plots(cps_data, puf_data, predictors) - print( - "\n\nDiagnostic plots saved to validation/common_support_diagnostics.png" - ) + print("\n\nDiagnostic plots saved to validation/common_support_diagnostics.png") # Save results support_df.to_csv("validation/common_support_analysis.csv") diff --git a/validation/run_qrf_diagnostics.py b/validation/run_qrf_diagnostics.py index dae400597..b39b16f5b 100644 --- a/validation/run_qrf_diagnostics.py +++ b/validation/run_qrf_diagnostics.py @@ -225,7 +225,7 @@ def main(): for display_name, actual_name in target_map.items(): if actual_name in variance_results: print( - f"- {display_name.capitalize()}: {variance_results[actual_name]*100:.0f}%" + f"- {display_name.capitalize()}: {variance_results[actual_name] * 100:.0f}%" ) # 3. Joint distribution preservation @@ -281,7 +281,7 @@ def main(): for display_name, actual_name in target_map.items(): if actual_name in variance_results: f.write( - f"{display_name.capitalize()}: {variance_results[actual_name]*100:.0f}%\n" + f"{display_name.capitalize()}: {variance_results[actual_name] * 100:.0f}%\n" ) print( "✓ Saved variance explained results to validation/outputs/variance_explained.txt" @@ -321,7 +321,7 @@ def main(): for display_name, actual_name in target_map.items(): if actual_name in variance_results: f.write( - f"{display_name.capitalize()}: {variance_results[actual_name]*100:.0f}%\n" + f"{display_name.capitalize()}: {variance_results[actual_name] * 100:.0f}%\n" ) if valid_pairs: diff --git a/validation/tax_policy_validation.py b/validation/tax_policy_validation.py index c7c4f6007..9e04982f1 100644 --- a/validation/tax_policy_validation.py +++ b/validation/tax_policy_validation.py @@ -101,9 +101,7 @@ def analyze_high_income_taxpayers(): for threshold in thresholds: count = (weights[agi >= threshold]).sum() pct_returns = count / weights.sum() * 100 - total_agi = ( - agi[agi >= threshold] * weights[agi >= threshold] - ).sum() / 1e9 + total_agi = (agi[agi >= threshold] * weights[agi >= threshold]).sum() / 1e9 results.append( { @@ -135,9 +133,7 @@ def validate_state_revenues(): results.append({"state_code": state, "revenue_billions": total}) - return pd.DataFrame(results).sort_values( - "revenue_billions", ascending=False - ) + return pd.DataFrame(results).sort_values("revenue_billions", ascending=False) def generate_validation_report(): diff --git a/validation/validate_retirement_imputation.py b/validation/validate_retirement_imputation.py index f57441751..065a82944 100644 --- a/validation/validate_retirement_imputation.py +++ b/validation/validate_retirement_imputation.py @@ -54,12 +54,8 @@ def validate_constraints(sim) -> list: issues = [] year = 2024 - emp_income = sim.calculate( - "employment_income", year, map_to="person" - ).values - se_income = sim.calculate( - "self_employment_income", year, map_to="person" - ).values + emp_income = sim.calculate("employment_income", year, map_to="person").values + se_income = sim.calculate("self_employment_income", year, map_to="person").values age = sim.calculate("age", year, map_to="person").values catch_up = age >= 50 @@ -79,9 +75,7 @@ def validate_constraints(sim) -> list: n_over_cap = (vals > max_401k + 1).sum() if n_over_cap > 0: - issues.append( - f"FAIL: {var} has {n_over_cap} values exceeding " f"401k cap" - ) + issues.append(f"FAIL: {var} has {n_over_cap} values exceeding 401k cap") zero_wage = emp_income == 0 n_nonzero_no_wage = (vals[zero_wage] > 0).sum() @@ -110,9 +104,7 @@ def validate_constraints(sim) -> list: n_over_cap = (vals > max_ira + 1).sum() if n_over_cap > 0: - issues.append( - f"FAIL: {var} has {n_over_cap} values exceeding " f"IRA cap" - ) + issues.append(f"FAIL: {var} has {n_over_cap} values exceeding IRA cap") # SE pension constraint var = "self_employed_pension_contributions" @@ -141,9 +133,7 @@ def validate_aggregates(sim) -> list: weight = sim.calculate("person_weight", year).values - logger.info( - "\n%-45s %15s %15s %10s", "Variable", "Weighted Sum", "Target", "Ratio" - ) + logger.info("\n%-45s %15s %15s %10s", "Variable", "Weighted Sum", "Target", "Ratio") logger.info("-" * 90) for var, target in TARGETS.items(): @@ -168,8 +158,8 @@ def validate_aggregates(sim) -> list: if ratio < 0.1 or ratio > 5.0: issues.append( f"WARNING: {var} weighted sum " - f"${weighted_sum/1e9:.1f}B is far from " - f"target ${target/1e9:.1f}B " + f"${weighted_sum / 1e9:.1f}B is far from " + f"target ${target / 1e9:.1f}B " f"(ratio={ratio:.2f})" )