Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/reusable_lint.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Install uv
uses: astral-sh/setup-uv@v5
- name: Check formatting
uses: "lgeiger/black-action@master"
with:
args: ". -l 79 --check"
run: uvx ruff format --check .
6 changes: 3 additions & 3 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,16 @@
- `make test` - Also runs all tests

## Formatting
- `make format` - Format all code using Black with 79 char line length
- `black . -l 79 --check` - Check formatting without changing files
- `make format` - Format all code using ruff with 79 char line length
- `ruff format --check .` - Check formatting without changing files

## Code Style Guidelines
- **Imports**: Standard libraries first, then third-party, then internal
- **Type Hints**: Use for all function parameters and return values
- **Naming**: Classes: PascalCase, Functions/Variables: snake_case, Constants: UPPER_SNAKE_CASE
- **Documentation**: Google-style docstrings with Args and Returns sections
- **Error Handling**: Use validation checks with specific error messages
- **Line Length**: 79 characters max (Black configured in pyproject.toml)
- **Line Length**: 79 characters max (ruff configured in pyproject.toml)
- **Python Version**: Targeting Python 3.11

## Git and PR Guidelines
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ HF_CLONE_DIR ?= $(HOME)/huggingface/policyengine-us-data
all: data test

format:
black . -l 79
ruff format .

test:
pytest
Expand Down
1 change: 1 addition & 0 deletions changelog.d/changed/switch-to-ruff.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Switched code formatter from Black to Ruff.
113 changes: 92 additions & 21 deletions docs/calibration_matrix.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,28 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "import numpy as np\nimport pandas as pd\nfrom policyengine_us import Microsimulation\nfrom policyengine_us_data.storage import STORAGE_FOLDER\nfrom policyengine_us_data.calibration.unified_matrix_builder import (\n UnifiedMatrixBuilder,\n)\nfrom policyengine_us_data.calibration.clone_and_assign import (\n assign_random_geography,\n)\nfrom policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (\n create_target_groups,\n drop_target_groups,\n get_geo_level,\n STATE_CODES,\n)\n\ndb_path = STORAGE_FOLDER / \"calibration\" / \"policy_data.db\"\ndb_uri = f\"sqlite:///{db_path}\"\ndataset_path = STORAGE_FOLDER / \"stratified_extended_cps_2024.h5\""
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from policyengine_us import Microsimulation\n",
"from policyengine_us_data.storage import STORAGE_FOLDER\n",
"from policyengine_us_data.calibration.unified_matrix_builder import (\n",
" UnifiedMatrixBuilder,\n",
")\n",
"from policyengine_us_data.calibration.clone_and_assign import (\n",
" assign_random_geography,\n",
")\n",
"from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (\n",
" create_target_groups,\n",
" drop_target_groups,\n",
" get_geo_level,\n",
" STATE_CODES,\n",
")\n",
"\n",
"db_path = STORAGE_FOLDER / \"calibration\" / \"policy_data.db\"\n",
"db_uri = f\"sqlite:///{db_path}\"\n",
"dataset_path = STORAGE_FOLDER / \"stratified_extended_cps_2024.h5\""
]
},
{
"cell_type": "code",
Expand Down Expand Up @@ -65,7 +86,9 @@
")\n",
"\n",
"n_total = n_records * N_CLONES\n",
"print(f\"Records: {n_records:,}, Clones: {N_CLONES}, Total columns: {n_total:,}\")\n",
"print(\n",
" f\"Records: {n_records:,}, Clones: {N_CLONES}, Total columns: {n_total:,}\"\n",
")\n",
"print(f\"Matrix shape: {X_sparse.shape}\")\n",
"print(f\"Non-zero entries: {X_sparse.nnz:,}\")"
]
Expand All @@ -82,7 +105,21 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "print(f\"Targets: {X_sparse.shape[0]}\")\nprint(f\"Columns: {X_sparse.shape[1]:,} ({N_CLONES} clones x {n_records:,} records)\")\nprint(f\"Non-zeros: {X_sparse.nnz:,}\")\nprint(f\"Density: {X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]):.6f}\")\n\ngeo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\nlevel_names = {0: \"National\", 1: \"State\", 2: \"District\"}\nfor level in [0, 1, 2]:\n n = (geo_levels == level).sum()\n if n > 0:\n print(f\" {level_names[level]}: {n} targets\")"
"source": [
"print(f\"Targets: {X_sparse.shape[0]}\")\n",
"print(\n",
" f\"Columns: {X_sparse.shape[1]:,} ({N_CLONES} clones x {n_records:,} records)\"\n",
")\n",
"print(f\"Non-zeros: {X_sparse.nnz:,}\")\n",
"print(f\"Density: {X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]):.6f}\")\n",
"\n",
"geo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\n",
"level_names = {0: \"National\", 1: \"State\", 2: \"District\"}\n",
"for level in [0, 1, 2]:\n",
" n = (geo_levels == level).sum()\n",
" if n > 0:\n",
" print(f\" {level_names[level]}: {n} targets\")"
]
},
{
"cell_type": "markdown",
Expand Down Expand Up @@ -294,14 +331,16 @@
"for gid, info in enumerate(group_info):\n",
" mask = target_groups == gid\n",
" vals = targets_df.loc[mask, \"value\"]\n",
" records.append({\n",
" \"group_id\": gid,\n",
" \"description\": info,\n",
" \"n_targets\": mask.sum(),\n",
" \"min_value\": vals.min(),\n",
" \"median_value\": vals.median(),\n",
" \"max_value\": vals.max(),\n",
" })\n",
" records.append(\n",
" {\n",
" \"group_id\": gid,\n",
" \"description\": info,\n",
" \"n_targets\": mask.sum(),\n",
" \"min_value\": vals.min(),\n",
" \"median_value\": vals.median(),\n",
" \"max_value\": vals.max(),\n",
" }\n",
" )\n",
"\n",
"group_df = pd.DataFrame(records)\n",
"print(group_df.to_string(index=False))"
Expand Down Expand Up @@ -400,7 +439,9 @@
" col_vec = X_sparse[:, col]\n",
" nnz = col_vec.nnz\n",
" abbr = STATE_CODES.get(state, \"??\")\n",
" print(f\" col {col}: {abbr} (state={state}, CD={cd}) — {nnz} non-zero rows\")"
" print(\n",
" f\" col {col}: {abbr} (state={state}, CD={cd}) — {nnz} non-zero rows\"\n",
" )"
]
},
{
Expand Down Expand Up @@ -475,7 +516,28 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "nnz_per_row = np.diff(X_sparse.indptr)\nprint(f\"Non-zeros per row:\")\nprint(f\" min: {nnz_per_row.min():,}\")\nprint(f\" median: {int(np.median(nnz_per_row)):,}\")\nprint(f\" mean: {nnz_per_row.mean():,.0f}\")\nprint(f\" max: {nnz_per_row.max():,}\")\n\ngeo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\nlevel_names = {0: \"National\", 1: \"State\", 2: \"District\"}\nprint(\"\\nBy geographic level:\")\nfor level in [0, 1, 2]:\n mask = (geo_levels == level).values\n if mask.any():\n vals = nnz_per_row[mask]\n print(\n f\" {level_names[level]:10s}: \"\n f\"n={mask.sum():>4d}, \"\n f\"median nnz={int(np.median(vals)):>7,}, \"\n f\"range=[{vals.min():,}, {vals.max():,}]\"\n )"
"source": [
"nnz_per_row = np.diff(X_sparse.indptr)\n",
"print(f\"Non-zeros per row:\")\n",
"print(f\" min: {nnz_per_row.min():,}\")\n",
"print(f\" median: {int(np.median(nnz_per_row)):,}\")\n",
"print(f\" mean: {nnz_per_row.mean():,.0f}\")\n",
"print(f\" max: {nnz_per_row.max():,}\")\n",
"\n",
"geo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\n",
"level_names = {0: \"National\", 1: \"State\", 2: \"District\"}\n",
"print(\"\\nBy geographic level:\")\n",
"for level in [0, 1, 2]:\n",
" mask = (geo_levels == level).values\n",
" if mask.any():\n",
" vals = nnz_per_row[mask]\n",
" print(\n",
" f\" {level_names[level]:10s}: \"\n",
" f\"n={mask.sum():>4d}, \"\n",
" f\"median nnz={int(np.median(vals)):>7,}, \"\n",
" f\"range=[{vals.min():,}, {vals.max():,}]\"\n",
" )"
]
},
{
"cell_type": "code",
Expand All @@ -498,12 +560,16 @@
"clone_nnz = []\n",
"for ci in range(N_CLONES):\n",
" block = X_sparse[:, ci * n_records : (ci + 1) * n_records]\n",
" n_states = len(np.unique(geography.state_fips[ci * n_records : (ci + 1) * n_records]))\n",
" clone_nnz.append({\n",
" \"clone\": ci,\n",
" \"nnz\": block.nnz,\n",
" \"unique_states\": n_states,\n",
" })\n",
" n_states = len(\n",
" np.unique(geography.state_fips[ci * n_records : (ci + 1) * n_records])\n",
" )\n",
" clone_nnz.append(\n",
" {\n",
" \"clone\": ci,\n",
" \"nnz\": block.nnz,\n",
" \"unique_states\": n_states,\n",
" }\n",
" )\n",
"\n",
"clone_df = pd.DataFrame(clone_nnz)\n",
"print(\"Non-zeros per clone block:\")\n",
Expand Down Expand Up @@ -666,7 +732,10 @@
}
],
"source": [
"ratios = row_sums[achievable_mask] / targets_filtered.loc[achievable_mask, \"value\"].values\n",
"ratios = (\n",
" row_sums[achievable_mask]\n",
" / targets_filtered.loc[achievable_mask, \"value\"].values\n",
")\n",
"ratio_df = targets_filtered[achievable_mask].copy()\n",
"ratio_df[\"row_sum\"] = row_sums[achievable_mask]\n",
"ratio_df[\"ratio\"] = ratios\n",
Expand Down Expand Up @@ -704,7 +773,9 @@
"X_final = X_filtered[achievable_mask, :]\n",
"print(f\"Final matrix shape: {X_final.shape}\")\n",
"print(f\"Final non-zero entries: {X_final.nnz:,}\")\n",
"print(f\"Final density: {X_final.nnz / (X_final.shape[0] * X_final.shape[1]):.6f}\")\n",
"print(\n",
" f\"Final density: {X_final.nnz / (X_final.shape[0] * X_final.shape[1]):.6f}\"\n",
")\n",
"print(\"\\nThis is what the optimizer receives.\")"
]
},
Expand Down
37 changes: 12 additions & 25 deletions docs/hierarchical_uprating.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -264,8 +264,7 @@
],
"source": [
"snap_hh = raw[\n",
" (raw[\"domain_variable\"] == \"snap\")\n",
" & (raw[\"variable\"] == \"household_count\")\n",
" (raw[\"domain_variable\"] == \"snap\") & (raw[\"variable\"] == \"household_count\")\n",
"]\n",
"for level in [\"state\", \"district\"]:\n",
" total = snap_hh[snap_hh[\"geo_level\"] == level][\"value\"].sum()\n",
Expand Down Expand Up @@ -377,8 +376,7 @@
"\n",
"for fips, abbr in sample_states.items():\n",
" rows = raw[\n",
" (raw[\"geo_level\"] == \"state\")\n",
" & (raw[\"geographic_id\"] == str(fips))\n",
" (raw[\"geo_level\"] == \"state\") & (raw[\"geographic_id\"] == str(fips))\n",
" ]\n",
" for _, r in rows.iterrows():\n",
" print(\n",
Expand Down Expand Up @@ -412,9 +410,7 @@
"metadata": {},
"outputs": [],
"source": [
"result = builder._apply_hierarchical_uprating(\n",
" raw, DOMAINS, uprating_factors\n",
")"
"result = builder._apply_hierarchical_uprating(raw, DOMAINS, uprating_factors)"
]
},
{
Expand Down Expand Up @@ -455,9 +451,7 @@
" cd_state = cd_domain[\n",
" cd_domain[\"geographic_id\"].apply(\n",
" lambda g, s=fips: (\n",
" int(g) // 100 == s\n",
" if g not in (\"US\",)\n",
" else False\n",
" int(g) // 100 == s if g not in (\"US\",) else False\n",
" )\n",
" )\n",
" ]\n",
Expand All @@ -474,11 +468,7 @@
" & (raw[\"variable\"] == var)\n",
" & (raw[\"domain_variable\"] == domain)\n",
" ]\n",
" uprated_state = (\n",
" st_row[\"value\"].iloc[0]\n",
" if len(st_row)\n",
" else np.nan\n",
" )\n",
" uprated_state = st_row[\"value\"].iloc[0] if len(st_row) else np.nan\n",
" print(\n",
" f\" {abbr} {var:20s} \"\n",
" f\"hif={hif:.6f} \"\n",
Expand All @@ -487,6 +477,7 @@
" f\"uprated_state={uprated_state:>14,.0f}\"\n",
" )\n",
"\n",
"\n",
"show_reconciliation(result, raw, \"aca_ptc\", sample_states)"
]
},
Expand Down Expand Up @@ -527,17 +518,17 @@
"]\n",
"\n",
"state_ufs = (\n",
" aca_cds.assign(state_fips=aca_cds[\"geographic_id\"].apply(\n",
" lambda g: int(g) // 100\n",
" ))\n",
" aca_cds.assign(\n",
" state_fips=aca_cds[\"geographic_id\"].apply(lambda g: int(g) // 100)\n",
" )\n",
" .groupby(\"state_fips\")[\"state_uprating_factor\"]\n",
" .first()\n",
" .sort_values()\n",
")\n",
"\n",
"print(\"ACA PTC uprating factors (aca_ptc = vol_mult * val_mult):\")\n",
"print(f\" {'State FIPS':>12s} {'Factor':>8s}\")\n",
"print(f\" {'─'*12} {'─'*8}\")\n",
"print(f\" {'─' * 12} {'─' * 8}\")\n",
"for fips in list(state_ufs.index[:5]) + [\"...\"] + list(state_ufs.index[-5:]):\n",
" if fips == \"...\":\n",
" print(f\" {'...':>12s}\")\n",
Expand Down Expand Up @@ -749,19 +740,15 @@
"checks = 0\n",
"for domain in DOMAINS:\n",
" domain_result = result[result[\"domain_variable\"] == domain]\n",
" cd_result = domain_result[\n",
" domain_result[\"geo_level\"] == \"district\"\n",
" ]\n",
" cd_result = domain_result[domain_result[\"geo_level\"] == \"district\"]\n",
" if cd_result.empty:\n",
" continue\n",
"\n",
" for fips, abbr in sorted(STATE_CODES.items()):\n",
" cd_rows = cd_result[\n",
" cd_result[\"geographic_id\"].apply(\n",
" lambda g, s=fips: (\n",
" int(g) // 100 == s\n",
" if g not in (\"US\",)\n",
" else False\n",
" int(g) // 100 == s if g not in (\"US\",) else False\n",
" )\n",
" )\n",
" ]\n",
Expand Down
12 changes: 4 additions & 8 deletions docs/local_area_calibration_setup.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -576,9 +576,7 @@
" f\"{col in cd_to_cols.get(cd, [])}\"\n",
" )\n",
" # Check an unrelated state\n",
" print(\n",
" f\" Visible to NC (37) targets: \" f\"{col in state_to_cols.get(37, [])}\"\n",
" )\n",
" print(f\" Visible to NC (37) targets: {col in state_to_cols.get(37, [])}\")\n",
" print()"
]
},
Expand Down Expand Up @@ -639,8 +637,7 @@
" else f\"dict ({len(rate)} entries)\"\n",
" )\n",
" print(\n",
" f\" {spec['variable']:40s} \"\n",
" f\"entity={spec['entity']:10s} rate={rate_str}\"\n",
" f\" {spec['variable']:40s} entity={spec['entity']:10s} rate={rate_str}\"\n",
" )"
]
},
Expand Down Expand Up @@ -966,8 +963,7 @@
"output_path = os.path.join(output_dir, \"results.h5\")\n",
"\n",
"print(\n",
" f\"Weight vector: {len(w):,} entries \"\n",
" f\"({n_demo_cds} CDs x {n_records:,} HH)\"\n",
" f\"Weight vector: {len(w):,} entries ({n_demo_cds} CDs x {n_records:,} HH)\"\n",
")\n",
"print(f\"Non-zero weights: {(w > 0).sum()}\")\n",
"print(\n",
Expand Down Expand Up @@ -1124,7 +1120,7 @@
"example_mapping = mapping_df.loc[\n",
" mapping_df.original_household_id == example_hh_id\n",
"]\n",
"print(f\"Example household (original_id={example_hh_id}) \" f\"in mapping:\\n\")\n",
"print(f\"Example household (original_id={example_hh_id}) in mapping:\\n\")\n",
"print(example_mapping.to_string(index=False))\n",
"\n",
"new_ids = example_mapping.new_household_id\n",
Expand Down
3 changes: 1 addition & 2 deletions modal_app/data_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,8 +391,7 @@ def build_datasets(
# GROUP 3: After extended_cps - run in parallel
# enhanced_cps and stratified_cps both depend on extended_cps
print(
"=== Phase 4: Building enhanced and stratified CPS (parallel)"
" ==="
"=== Phase 4: Building enhanced and stratified CPS (parallel) ==="
)
with ThreadPoolExecutor(max_workers=2) as executor:
futures = [
Expand Down
2 changes: 1 addition & 1 deletion paper/scripts/calculate_distributional_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def calculate_top_shares(values, weights, percentiles=[90, 99]):
threshold = weighted_percentile(values, weights, p)
mask = values >= threshold
top_income = np.sum(values[mask] * weights[mask])
shares[f"top_{100-p}%"] = top_income / total_income
shares[f"top_{100 - p}%"] = top_income / total_income

return shares

Expand Down
5 changes: 3 additions & 2 deletions paper/scripts/calculate_target_performance.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,9 @@ def compare_dataset_performance(

# Calculate average improvement by target category
categories = {
"IRS Income": lambda x: "employment_income" in x
or "capital_gains" in x,
"IRS Income": lambda x: (
"employment_income" in x or "capital_gains" in x
),
"Demographics": lambda x: "age_" in x or "population" in x,
"Programs": lambda x: "snap" in x or "social_security" in x,
"Tax Expenditures": lambda x: "salt" in x or "charitable" in x,
Expand Down
Loading