Merged
4 changes: 1 addition & 3 deletions .github/bump_version.py
@@ -19,9 +19,7 @@ def get_current_version(pyproject_path: Path) -> str:
 
 def infer_bump(changelog_dir: Path) -> str:
     fragments = [
-        f
-        for f in changelog_dir.iterdir()
-        if f.is_file() and f.name != ".gitkeep"
+        f for f in changelog_dir.iterdir() if f.is_file() and f.name != ".gitkeep"
     ]
     if not fragments:
         print("No changelog fragments found", file=sys.stderr)
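Note: most of the reformatting in this PR follows one rule: a construct collapses onto a single line whenever it fits within the formatter's line length, and is exploded otherwise. Black ran here at 79 columns; `ruff format` defaults to 88, so code black had to split now fits inline. A minimal sketch of the comprehension above, with a hypothetical directory standing in for the real one:

    from pathlib import Path

    changelog_dir = Path("changelog.d")  # hypothetical stand-in

    # black at 79 columns exploded the comprehension body across three lines;
    # ruff format at its default 88 keeps the body on one line, wrapping only
    # the enclosing brackets because the full statement still exceeds the limit.
    fragments = [
        f for f in changelog_dir.iterdir() if f.is_file() and f.name != ".gitkeep"
    ]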
6 changes: 3 additions & 3 deletions .github/workflows/reusable_lint.yaml
@@ -8,7 +8,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
+      - name: Install ruff
+        run: pip install ruff>=0.9.0
       - name: Check formatting
-        uses: "lgeiger/black-action@master"
-        with:
-          args: ". -l 79 --check"
+        run: ruff format --check .
2 changes: 1 addition & 1 deletion Makefile
@@ -5,7 +5,7 @@ HF_CLONE_DIR ?= $(HOME)/huggingface/policyengine-us-data
 all: data test
 
 format:
-	black . -l 79
+	ruff format .
 
 test:
 	pytest
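Note the dropped `-l 79`: `ruff format .` takes its line length from project configuration rather than a flag, defaulting to 88 columns. The collapsed lines in the notebook diffs below run past 79, which suggests the repo adopted ruff's default width. A project wanting to keep black's old 79-column limit would pin it in pyproject.toml instead; a sketch of that setting (an assumption, not part of this diff):

    [tool.ruff]
    # Only needed to preserve the old 79-column width; omit to accept ruff's default of 88.
    line-length = 79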
1 change: 1 addition & 0 deletions changelog.d/switch-to-ruff.changed.md
@@ -0,0 +1 @@
+Switch from black to ruff format.
101 changes: 81 additions & 20 deletions docs/calibration_matrix.ipynb
@@ -27,7 +27,28 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": "import numpy as np\nimport pandas as pd\nfrom policyengine_us import Microsimulation\nfrom policyengine_us_data.storage import STORAGE_FOLDER\nfrom policyengine_us_data.calibration.unified_matrix_builder import (\n    UnifiedMatrixBuilder,\n)\nfrom policyengine_us_data.calibration.clone_and_assign import (\n    assign_random_geography,\n)\nfrom policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (\n    create_target_groups,\n    drop_target_groups,\n    get_geo_level,\n    STATE_CODES,\n)\n\ndb_path = STORAGE_FOLDER / \"calibration\" / \"policy_data.db\"\ndb_uri = f\"sqlite:///{db_path}\"\ndataset_path = STORAGE_FOLDER / \"stratified_extended_cps_2024.h5\""
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from policyengine_us import Microsimulation\n",
+    "from policyengine_us_data.storage import STORAGE_FOLDER\n",
+    "from policyengine_us_data.calibration.unified_matrix_builder import (\n",
+    "    UnifiedMatrixBuilder,\n",
+    ")\n",
+    "from policyengine_us_data.calibration.clone_and_assign import (\n",
+    "    assign_random_geography,\n",
+    ")\n",
+    "from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (\n",
+    "    create_target_groups,\n",
+    "    drop_target_groups,\n",
+    "    get_geo_level,\n",
+    "    STATE_CODES,\n",
+    ")\n",
+    "\n",
+    "db_path = STORAGE_FOLDER / \"calibration\" / \"policy_data.db\"\n",
+    "db_uri = f\"sqlite:///{db_path}\"\n",
+    "dataset_path = STORAGE_FOLDER / \"stratified_extended_cps_2024.h5\""
+   ]
   },
   {
    "cell_type": "code",
@@ -82,7 +103,19 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": "print(f\"Targets: {X_sparse.shape[0]}\")\nprint(f\"Columns: {X_sparse.shape[1]:,} ({N_CLONES} clones x {n_records:,} records)\")\nprint(f\"Non-zeros: {X_sparse.nnz:,}\")\nprint(f\"Density: {X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]):.6f}\")\n\ngeo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\nlevel_names = {0: \"National\", 1: \"State\", 2: \"District\"}\nfor level in [0, 1, 2]:\n    n = (geo_levels == level).sum()\n    if n > 0:\n        print(f\"  {level_names[level]}: {n} targets\")"
+   "source": [
+    "print(f\"Targets: {X_sparse.shape[0]}\")\n",
+    "print(f\"Columns: {X_sparse.shape[1]:,} ({N_CLONES} clones x {n_records:,} records)\")\n",
+    "print(f\"Non-zeros: {X_sparse.nnz:,}\")\n",
+    "print(f\"Density: {X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]):.6f}\")\n",
+    "\n",
+    "geo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\n",
+    "level_names = {0: \"National\", 1: \"State\", 2: \"District\"}\n",
+    "for level in [0, 1, 2]:\n",
+    "    n = (geo_levels == level).sum()\n",
+    "    if n > 0:\n",
+    "        print(f\"  {level_names[level]}: {n} targets\")"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -294,14 +327,16 @@
     "for gid, info in enumerate(group_info):\n",
     "    mask = target_groups == gid\n",
     "    vals = targets_df.loc[mask, \"value\"]\n",
-    "    records.append({\n",
-    "        \"group_id\": gid,\n",
-    "        \"description\": info,\n",
-    "        \"n_targets\": mask.sum(),\n",
-    "        \"min_value\": vals.min(),\n",
-    "        \"median_value\": vals.median(),\n",
-    "        \"max_value\": vals.max(),\n",
-    "    })\n",
+    "    records.append(\n",
+    "        {\n",
+    "            \"group_id\": gid,\n",
+    "            \"description\": info,\n",
+    "            \"n_targets\": mask.sum(),\n",
+    "            \"min_value\": vals.min(),\n",
+    "            \"median_value\": vals.median(),\n",
+    "            \"max_value\": vals.max(),\n",
+    "        }\n",
+    "    )\n",
     "\n",
     "group_df = pd.DataFrame(records)\n",
     "print(group_df.to_string(index=False))"
@@ -431,8 +466,7 @@
     "    for r in nz_rows[:5]:\n",
     "        row = targets_df.iloc[r]\n",
     "        print(\n",
-    "            f\"  {row['variable']} (geo={row['geographic_id']}): \"\n",
-    "            f\"{X_sparse[r, col]:.2f}\"\n",
+    "            f\"  {row['variable']} (geo={row['geographic_id']}): {X_sparse[r, col]:.2f}\"\n",
     "        )\n",
     "    if len(nz_rows) > 5:\n",
     "        print(f\"  ... and {len(nz_rows) - 5} more\")"
@@ -475,7 +509,28 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": "nnz_per_row = np.diff(X_sparse.indptr)\nprint(f\"Non-zeros per row:\")\nprint(f\"  min:    {nnz_per_row.min():,}\")\nprint(f\"  median: {int(np.median(nnz_per_row)):,}\")\nprint(f\"  mean:   {nnz_per_row.mean():,.0f}\")\nprint(f\"  max:    {nnz_per_row.max():,}\")\n\ngeo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\nlevel_names = {0: \"National\", 1: \"State\", 2: \"District\"}\nprint(\"\\nBy geographic level:\")\nfor level in [0, 1, 2]:\n    mask = (geo_levels == level).values\n    if mask.any():\n        vals = nnz_per_row[mask]\n        print(\n            f\"  {level_names[level]:10s}: \"\n            f\"n={mask.sum():>4d}, \"\n            f\"median nnz={int(np.median(vals)):>7,}, \"\n            f\"range=[{vals.min():,}, {vals.max():,}]\"\n        )"
+   "source": [
+    "nnz_per_row = np.diff(X_sparse.indptr)\n",
+    "print(f\"Non-zeros per row:\")\n",
+    "print(f\"  min:    {nnz_per_row.min():,}\")\n",
+    "print(f\"  median: {int(np.median(nnz_per_row)):,}\")\n",
+    "print(f\"  mean:   {nnz_per_row.mean():,.0f}\")\n",
+    "print(f\"  max:    {nnz_per_row.max():,}\")\n",
+    "\n",
+    "geo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\n",
+    "level_names = {0: \"National\", 1: \"State\", 2: \"District\"}\n",
+    "print(\"\\nBy geographic level:\")\n",
+    "for level in [0, 1, 2]:\n",
+    "    mask = (geo_levels == level).values\n",
+    "    if mask.any():\n",
+    "        vals = nnz_per_row[mask]\n",
+    "        print(\n",
+    "            f\"  {level_names[level]:10s}: \"\n",
+    "            f\"n={mask.sum():>4d}, \"\n",
+    "            f\"median nnz={int(np.median(vals)):>7,}, \"\n",
+    "            f\"range=[{vals.min():,}, {vals.max():,}]\"\n",
+    "        )"
+   ]
   },
   {
    "cell_type": "code",
@@ -498,12 +553,16 @@
     "clone_nnz = []\n",
     "for ci in range(N_CLONES):\n",
     "    block = X_sparse[:, ci * n_records : (ci + 1) * n_records]\n",
-    "    n_states = len(np.unique(geography.state_fips[ci * n_records : (ci + 1) * n_records]))\n",
-    "    clone_nnz.append({\n",
-    "        \"clone\": ci,\n",
-    "        \"nnz\": block.nnz,\n",
-    "        \"unique_states\": n_states,\n",
-    "    })\n",
+    "    n_states = len(\n",
+    "        np.unique(geography.state_fips[ci * n_records : (ci + 1) * n_records])\n",
+    "    )\n",
+    "    clone_nnz.append(\n",
+    "        {\n",
+    "            \"clone\": ci,\n",
+    "            \"nnz\": block.nnz,\n",
+    "            \"unique_states\": n_states,\n",
+    "        }\n",
+    "    )\n",
     "\n",
     "clone_df = pd.DataFrame(clone_nnz)\n",
     "print(\"Non-zeros per clone block:\")\n",
@@ -666,7 +725,9 @@
     }
    ],
    "source": [
-    "ratios = row_sums[achievable_mask] / targets_filtered.loc[achievable_mask, \"value\"].values\n",
+    "ratios = (\n",
+    "    row_sums[achievable_mask] / targets_filtered.loc[achievable_mask, \"value\"].values\n",
+    ")\n",
     "ratio_df = targets_filtered[achievable_mask].copy()\n",
     "ratio_df[\"row_sum\"] = row_sums[achievable_mask]\n",
     "ratio_df[\"ratio\"] = ratios\n",
51 changes: 14 additions & 37 deletions docs/hierarchical_uprating.ipynb
@@ -264,8 +264,7 @@
    ],
    "source": [
     "snap_hh = raw[\n",
-    "    (raw[\"domain_variable\"] == \"snap\")\n",
-    "    & (raw[\"variable\"] == \"household_count\")\n",
+    "    (raw[\"domain_variable\"] == \"snap\") & (raw[\"variable\"] == \"household_count\")\n",
     "]\n",
     "for level in [\"state\", \"district\"]:\n",
     "    total = snap_hh[snap_hh[\"geo_level\"] == level][\"value\"].sum()\n",
@@ -333,9 +332,9 @@
    "source": [
     "raw[\"original_value\"] = raw[\"value\"].copy()\n",
     "raw[\"uprating_factor\"] = raw.apply(\n",
-    "    lambda r: builder._get_uprating_info(\n",
-    "        r[\"variable\"], r[\"period\"], uprating_factors\n",
-    "    )[0],\n",
+    "    lambda r: builder._get_uprating_info(r[\"variable\"], r[\"period\"], uprating_factors)[\n",
+    "        0\n",
+    "    ],\n",
     "    axis=1,\n",
     ")\n",
     "raw[\"value\"] = raw[\"original_value\"] * raw[\"uprating_factor\"]"
@@ -376,10 +375,7 @@
     "sample_states = {6: \"CA\", 48: \"TX\", 36: \"NY\"}\n",
     "\n",
     "for fips, abbr in sample_states.items():\n",
-    "    rows = raw[\n",
-    "        (raw[\"geo_level\"] == \"state\")\n",
-    "        & (raw[\"geographic_id\"] == str(fips))\n",
-    "    ]\n",
+    "    rows = raw[(raw[\"geo_level\"] == \"state\") & (raw[\"geographic_id\"] == str(fips))]\n",
     "    for _, r in rows.iterrows():\n",
     "        print(\n",
     "            f\"  {abbr} [{r['domain_variable']:8s}] \"\n",
@@ -412,9 +408,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "result = builder._apply_hierarchical_uprating(\n",
-    "    raw, DOMAINS, uprating_factors\n",
-    ")"
+    "result = builder._apply_hierarchical_uprating(raw, DOMAINS, uprating_factors)"
    ]
   },
   {
@@ -454,11 +448,7 @@
     "    for fips, abbr in sample_states.items():\n",
     "        cd_state = cd_domain[\n",
     "            cd_domain[\"geographic_id\"].apply(\n",
-    "                lambda g, s=fips: (\n",
-    "                    int(g) // 100 == s\n",
-    "                    if g not in (\"US\",)\n",
-    "                    else False\n",
-    "                )\n",
+    "                lambda g, s=fips: int(g) // 100 == s if g not in (\"US\",) else False\n",
     "            )\n",
     "        ]\n",
     "        if cd_state.empty:\n",
@@ -474,11 +464,7 @@
     "            & (raw[\"variable\"] == var)\n",
     "            & (raw[\"domain_variable\"] == domain)\n",
     "        ]\n",
-    "        uprated_state = (\n",
-    "            st_row[\"value\"].iloc[0]\n",
-    "            if len(st_row)\n",
-    "            else np.nan\n",
-    "        )\n",
+    "        uprated_state = st_row[\"value\"].iloc[0] if len(st_row) else np.nan\n",
     "        print(\n",
     "            f\"  {abbr} {var:20s} \"\n",
     "            f\"hif={hif:.6f} \"\n",
@@ -487,6 +473,7 @@
     "            f\"uprated_state={uprated_state:>14,.0f}\"\n",
     "        )\n",
     "\n",
+    "\n",
     "show_reconciliation(result, raw, \"aca_ptc\", sample_states)"
    ]
   },
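Note: the lone added `"\n"` is not noise: ruff format, like black, enforces PEP 8's two blank lines after a top-level function definition, so a second blank line lands between the end of `show_reconciliation` and the call below it. A minimal illustration:

    def show() -> None:
        print("hello")


    # ruff format guarantees exactly two blank lines above this top-level call.
    show()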
@@ -527,17 +514,15 @@
     "]\n",
     "\n",
     "state_ufs = (\n",
-    "    aca_cds.assign(state_fips=aca_cds[\"geographic_id\"].apply(\n",
-    "        lambda g: int(g) // 100\n",
-    "    ))\n",
+    "    aca_cds.assign(state_fips=aca_cds[\"geographic_id\"].apply(lambda g: int(g) // 100))\n",
     "    .groupby(\"state_fips\")[\"state_uprating_factor\"]\n",
     "    .first()\n",
     "    .sort_values()\n",
     ")\n",
     "\n",
     "print(\"ACA PTC uprating factors (aca_ptc = vol_mult * val_mult):\")\n",
     "print(f\"  {'State FIPS':>12s} {'Factor':>8s}\")\n",
-    "print(f\"  {'─'*12} {'─'*8}\")\n",
+    "print(f\"  {'─' * 12} {'─' * 8}\")\n",
     "for fips in list(state_ufs.index[:5]) + [\"...\"] + list(state_ufs.index[-5:]):\n",
     "    if fips == \"...\":\n",
     "        print(f\"  {'...':>12s}\")\n",
@@ -676,9 +661,7 @@
    ],
    "source": [
     "level_counts = (\n",
-    "    result.groupby([\"domain_variable\", \"geo_level\"])\n",
-    "    .size()\n",
-    "    .reset_index(name=\"count\")\n",
+    "    result.groupby([\"domain_variable\", \"geo_level\"]).size().reset_index(name=\"count\")\n",
     ")\n",
     "level_counts"
    ]
@@ -749,20 +732,14 @@
     "checks = 0\n",
     "for domain in DOMAINS:\n",
     "    domain_result = result[result[\"domain_variable\"] == domain]\n",
-    "    cd_result = domain_result[\n",
-    "        domain_result[\"geo_level\"] == \"district\"\n",
-    "    ]\n",
+    "    cd_result = domain_result[domain_result[\"geo_level\"] == \"district\"]\n",
     "    if cd_result.empty:\n",
     "        continue\n",
     "\n",
     "    for fips, abbr in sorted(STATE_CODES.items()):\n",
     "        cd_rows = cd_result[\n",
     "            cd_result[\"geographic_id\"].apply(\n",
-    "                lambda g, s=fips: (\n",
-    "                    int(g) // 100 == s\n",
-    "                    if g not in (\"US\",)\n",
-    "                    else False\n",
-    "                )\n",
+    "                lambda g, s=fips: int(g) // 100 == s if g not in (\"US\",) else False\n",
     "            )\n",
     "        ]\n",
     "        if cd_rows.empty:\n",