PolicyEngine · MaxGhenis · Mar 6, 2026 · Mar 6, 2026 · Mar 6, 2026
diff --git a/.github/workflows/reusable_lint.yaml b/.github/workflows/reusable_lint.yaml
@@ -8,7 +8,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
       - name: Check formatting
-        uses: "lgeiger/black-action@master"
-        with:
-          args: ". -l 79 --check"
+        run: uvx ruff format --check .
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -11,16 +11,16 @@
 - `make test` - Also runs all tests
 
 ## Formatting
-- `make format` - Format all code using Black with 79 char line length
-- `black . -l 79 --check` - Check formatting without changing files
+- `make format` - Format all code using ruff (79 char line length, set in pyproject.toml)
+- `ruff format --check .` - Check formatting without changing files
 
 ## Code Style Guidelines
 - **Imports**: Standard libraries first, then third-party, then internal
 - **Type Hints**: Use for all function parameters and return values
 - **Naming**: Classes: PascalCase, Functions/Variables: snake_case, Constants: UPPER_SNAKE_CASE
 - **Documentation**: Google-style docstrings with Args and Returns sections
 - **Error Handling**: Use validation checks with specific error messages
-- **Line Length**: 79 characters max (Black configured in pyproject.toml)
+- **Line Length**: 79 characters max (ruff configured in pyproject.toml)
 - **Python Version**: Targeting Python 3.11
 
 ## Git and PR Guidelines

diff --git a/Makefile b/Makefile
@@ -5,7 +5,7 @@ HF_CLONE_DIR ?= $(HOME)/huggingface/policyengine-us-data
 all: data test
 
 format:
-	black . -l 79
+	ruff format .
 
 test:
 	pytest

diff --git a/changelog.d/changed/switch-to-ruff.md b/changelog.d/changed/switch-to-ruff.md
@@ -0,0 +1 @@
+Switched code formatter from Black to Ruff.
diff --git a/docs/calibration_matrix.ipynb b/docs/calibration_matrix.ipynb
@@ -27,7 +27,28 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": "import numpy as np\nimport pandas as pd\nfrom policyengine_us import Microsimulation\nfrom policyengine_us_data.storage import STORAGE_FOLDER\nfrom policyengine_us_data.calibration.unified_matrix_builder import (\n    UnifiedMatrixBuilder,\n)\nfrom policyengine_us_data.calibration.clone_and_assign import (\n    assign_random_geography,\n)\nfrom policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (\n    create_target_groups,\n    drop_target_groups,\n    get_geo_level,\n    STATE_CODES,\n)\n\ndb_path = STORAGE_FOLDER / \"calibration\" / \"policy_data.db\"\ndb_uri = f\"sqlite:///{db_path}\"\ndataset_path = STORAGE_FOLDER / \"stratified_extended_cps_2024.h5\""
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from policyengine_us import Microsimulation\n",
+    "from policyengine_us_data.storage import STORAGE_FOLDER\n",
+    "from policyengine_us_data.calibration.unified_matrix_builder import (\n",
+    "    UnifiedMatrixBuilder,\n",
+    ")\n",
+    "from policyengine_us_data.calibration.clone_and_assign import (\n",
+    "    assign_random_geography,\n",
+    ")\n",
+    "from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (\n",
+    "    create_target_groups,\n",
+    "    drop_target_groups,\n",
+    "    get_geo_level,\n",
+    "    STATE_CODES,\n",
+    ")\n",
+    "\n",
+    "db_path = STORAGE_FOLDER / \"calibration\" / \"policy_data.db\"\n",
+    "db_uri = f\"sqlite:///{db_path}\"\n",
+    "dataset_path = STORAGE_FOLDER / \"stratified_extended_cps_2024.h5\""
+   ]
   },
   {
    "cell_type": "code",
@@ -65,7 +86,9 @@
     ")\n",
     "\n",
     "n_total = n_records * N_CLONES\n",
-    "print(f\"Records: {n_records:,}, Clones: {N_CLONES}, Total columns: {n_total:,}\")\n",
+    "print(\n",
+    "    f\"Records: {n_records:,}, Clones: {N_CLONES}, Total columns: {n_total:,}\"\n",
+    ")\n",
     "print(f\"Matrix shape: {X_sparse.shape}\")\n",
     "print(f\"Non-zero entries: {X_sparse.nnz:,}\")"
    ]
@@ -82,7 +105,21 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": "print(f\"Targets: {X_sparse.shape[0]}\")\nprint(f\"Columns: {X_sparse.shape[1]:,} ({N_CLONES} clones x {n_records:,} records)\")\nprint(f\"Non-zeros: {X_sparse.nnz:,}\")\nprint(f\"Density: {X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]):.6f}\")\n\ngeo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\nlevel_names = {0: \"National\", 1: \"State\", 2: \"District\"}\nfor level in [0, 1, 2]:\n    n = (geo_levels == level).sum()\n    if n > 0:\n        print(f\"  {level_names[level]}: {n} targets\")"
+   "source": [
+    "print(f\"Targets: {X_sparse.shape[0]}\")\n",
+    "print(\n",
+    "    f\"Columns: {X_sparse.shape[1]:,} ({N_CLONES} clones x {n_records:,} records)\"\n",
+    ")\n",
+    "print(f\"Non-zeros: {X_sparse.nnz:,}\")\n",
+    "print(f\"Density: {X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]):.6f}\")\n",
+    "\n",
+    "geo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\n",
+    "level_names = {0: \"National\", 1: \"State\", 2: \"District\"}\n",
+    "for level in [0, 1, 2]:\n",
+    "    n = (geo_levels == level).sum()\n",
+    "    if n > 0:\n",
+    "        print(f\"  {level_names[level]}: {n} targets\")"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -294,14 +331,16 @@
     "for gid, info in enumerate(group_info):\n",
     "    mask = target_groups == gid\n",
     "    vals = targets_df.loc[mask, \"value\"]\n",
-    "    records.append({\n",
-    "        \"group_id\": gid,\n",
-    "        \"description\": info,\n",
-    "        \"n_targets\": mask.sum(),\n",
-    "        \"min_value\": vals.min(),\n",
-    "        \"median_value\": vals.median(),\n",
-    "        \"max_value\": vals.max(),\n",
-    "    })\n",
+    "    records.append(\n",
+    "        {\n",
+    "            \"group_id\": gid,\n",
+    "            \"description\": info,\n",
+    "            \"n_targets\": mask.sum(),\n",
+    "            \"min_value\": vals.min(),\n",
+    "            \"median_value\": vals.median(),\n",
+    "            \"max_value\": vals.max(),\n",
+    "        }\n",
+    "    )\n",
     "\n",
     "group_df = pd.DataFrame(records)\n",
     "print(group_df.to_string(index=False))"
@@ -400,7 +439,9 @@
     "    col_vec = X_sparse[:, col]\n",
     "    nnz = col_vec.nnz\n",
     "    abbr = STATE_CODES.get(state, \"??\")\n",
-    "    print(f\"  col {col}: {abbr} (state={state}, CD={cd}) — {nnz} non-zero rows\")"
+    "    print(\n",
+    "        f\"  col {col}: {abbr} (state={state}, CD={cd}) — {nnz} non-zero rows\"\n",
+    "    )"
    ]
   },
   {
@@ -475,7 +516,28 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": "nnz_per_row = np.diff(X_sparse.indptr)\nprint(f\"Non-zeros per row:\")\nprint(f\"  min:    {nnz_per_row.min():,}\")\nprint(f\"  median: {int(np.median(nnz_per_row)):,}\")\nprint(f\"  mean:   {nnz_per_row.mean():,.0f}\")\nprint(f\"  max:    {nnz_per_row.max():,}\")\n\ngeo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\nlevel_names = {0: \"National\", 1: \"State\", 2: \"District\"}\nprint(\"\\nBy geographic level:\")\nfor level in [0, 1, 2]:\n    mask = (geo_levels == level).values\n    if mask.any():\n        vals = nnz_per_row[mask]\n        print(\n            f\"  {level_names[level]:10s}: \"\n            f\"n={mask.sum():>4d}, \"\n            f\"median nnz={int(np.median(vals)):>7,}, \"\n            f\"range=[{vals.min():,}, {vals.max():,}]\"\n        )"
+   "source": [
+    "nnz_per_row = np.diff(X_sparse.indptr)\n",
+    "print(f\"Non-zeros per row:\")\n",
+    "print(f\"  min:    {nnz_per_row.min():,}\")\n",
+    "print(f\"  median: {int(np.median(nnz_per_row)):,}\")\n",
+    "print(f\"  mean:   {nnz_per_row.mean():,.0f}\")\n",
+    "print(f\"  max:    {nnz_per_row.max():,}\")\n",
+    "\n",
+    "geo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\n",
+    "level_names = {0: \"National\", 1: \"State\", 2: \"District\"}\n",
+    "print(\"\\nBy geographic level:\")\n",
+    "for level in [0, 1, 2]:\n",
+    "    mask = (geo_levels == level).values\n",
+    "    if mask.any():\n",
+    "        vals = nnz_per_row[mask]\n",
+    "        print(\n",
+    "            f\"  {level_names[level]:10s}: \"\n",
+    "            f\"n={mask.sum():>4d}, \"\n",
+    "            f\"median nnz={int(np.median(vals)):>7,}, \"\n",
+    "            f\"range=[{vals.min():,}, {vals.max():,}]\"\n",
+    "        )"
+   ]
   },
   {
    "cell_type": "code",
@@ -498,12 +560,16 @@
     "clone_nnz = []\n",
     "for ci in range(N_CLONES):\n",
     "    block = X_sparse[:, ci * n_records : (ci + 1) * n_records]\n",
-    "    n_states = len(np.unique(geography.state_fips[ci * n_records : (ci + 1) * n_records]))\n",
-    "    clone_nnz.append({\n",
-    "        \"clone\": ci,\n",
-    "        \"nnz\": block.nnz,\n",
-    "        \"unique_states\": n_states,\n",
-    "    })\n",
+    "    n_states = len(\n",
+    "        np.unique(geography.state_fips[ci * n_records : (ci + 1) * n_records])\n",
+    "    )\n",
+    "    clone_nnz.append(\n",
+    "        {\n",
+    "            \"clone\": ci,\n",
+    "            \"nnz\": block.nnz,\n",
+    "            \"unique_states\": n_states,\n",
+    "        }\n",
+    "    )\n",
     "\n",
     "clone_df = pd.DataFrame(clone_nnz)\n",
     "print(\"Non-zeros per clone block:\")\n",
@@ -666,7 +732,10 @@
     }
    ],
    "source": [
-    "ratios = row_sums[achievable_mask] / targets_filtered.loc[achievable_mask, \"value\"].values\n",
+    "ratios = (\n",
+    "    row_sums[achievable_mask]\n",
+    "    / targets_filtered.loc[achievable_mask, \"value\"].values\n",
+    ")\n",
     "ratio_df = targets_filtered[achievable_mask].copy()\n",
     "ratio_df[\"row_sum\"] = row_sums[achievable_mask]\n",
     "ratio_df[\"ratio\"] = ratios\n",
@@ -704,7 +773,9 @@
     "X_final = X_filtered[achievable_mask, :]\n",
     "print(f\"Final matrix shape: {X_final.shape}\")\n",
     "print(f\"Final non-zero entries: {X_final.nnz:,}\")\n",
-    "print(f\"Final density: {X_final.nnz / (X_final.shape[0] * X_final.shape[1]):.6f}\")\n",
+    "print(\n",
+    "    f\"Final density: {X_final.nnz / (X_final.shape[0] * X_final.shape[1]):.6f}\"\n",
+    ")\n",
     "print(\"\\nThis is what the optimizer receives.\")"
    ]
   },

diff --git a/docs/hierarchical_uprating.ipynb b/docs/hierarchical_uprating.ipynb
@@ -264,8 +264,7 @@
    ],
    "source": [
     "snap_hh = raw[\n",
-    "    (raw[\"domain_variable\"] == \"snap\")\n",
-    "    & (raw[\"variable\"] == \"household_count\")\n",
+    "    (raw[\"domain_variable\"] == \"snap\") & (raw[\"variable\"] == \"household_count\")\n",
     "]\n",
     "for level in [\"state\", \"district\"]:\n",
     "    total = snap_hh[snap_hh[\"geo_level\"] == level][\"value\"].sum()\n",
@@ -377,8 +376,7 @@
     "\n",
     "for fips, abbr in sample_states.items():\n",
     "    rows = raw[\n",
-    "        (raw[\"geo_level\"] == \"state\")\n",
-    "        & (raw[\"geographic_id\"] == str(fips))\n",
+    "        (raw[\"geo_level\"] == \"state\") & (raw[\"geographic_id\"] == str(fips))\n",
     "    ]\n",
     "    for _, r in rows.iterrows():\n",
     "        print(\n",
@@ -412,9 +410,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "result = builder._apply_hierarchical_uprating(\n",
-    "    raw, DOMAINS, uprating_factors\n",
-    ")"
+    "result = builder._apply_hierarchical_uprating(raw, DOMAINS, uprating_factors)"
    ]
   },
   {
@@ -455,9 +451,7 @@
     "        cd_state = cd_domain[\n",
     "            cd_domain[\"geographic_id\"].apply(\n",
     "                lambda g, s=fips: (\n",
-    "                    int(g) // 100 == s\n",
-    "                    if g not in (\"US\",)\n",
-    "                    else False\n",
+    "                    int(g) // 100 == s if g not in (\"US\",) else False\n",
     "                )\n",
     "            )\n",
     "        ]\n",
@@ -474,11 +468,7 @@
     "                & (raw[\"variable\"] == var)\n",
     "                & (raw[\"domain_variable\"] == domain)\n",
     "            ]\n",
-    "            uprated_state = (\n",
-    "                st_row[\"value\"].iloc[0]\n",
-    "                if len(st_row)\n",
-    "                else np.nan\n",
-    "            )\n",
+    "            uprated_state = st_row[\"value\"].iloc[0] if len(st_row) else np.nan\n",
     "            print(\n",
     "                f\"  {abbr} {var:20s}  \"\n",
     "                f\"hif={hif:.6f}  \"\n",
@@ -487,6 +477,7 @@
     "                f\"uprated_state={uprated_state:>14,.0f}\"\n",
     "            )\n",
     "\n",
+    "\n",
     "show_reconciliation(result, raw, \"aca_ptc\", sample_states)"
    ]
   },
@@ -527,17 +518,17 @@
     "]\n",
     "\n",
     "state_ufs = (\n",
-    "    aca_cds.assign(state_fips=aca_cds[\"geographic_id\"].apply(\n",
-    "        lambda g: int(g) // 100\n",
-    "    ))\n",
+    "    aca_cds.assign(\n",
+    "        state_fips=aca_cds[\"geographic_id\"].apply(lambda g: int(g) // 100)\n",
+    "    )\n",
     "    .groupby(\"state_fips\")[\"state_uprating_factor\"]\n",
     "    .first()\n",
     "    .sort_values()\n",
     ")\n",
     "\n",
     "print(\"ACA PTC uprating factors (aca_ptc = vol_mult * val_mult):\")\n",
     "print(f\"  {'State FIPS':>12s}  {'Factor':>8s}\")\n",
-    "print(f\"  {'─'*12}  {'─'*8}\")\n",
+    "print(f\"  {'─' * 12}  {'─' * 8}\")\n",
     "for fips in list(state_ufs.index[:5]) + [\"...\"] + list(state_ufs.index[-5:]):\n",
     "    if fips == \"...\":\n",
     "        print(f\"  {'...':>12s}\")\n",
@@ -749,19 +740,15 @@
     "checks = 0\n",
     "for domain in DOMAINS:\n",
     "    domain_result = result[result[\"domain_variable\"] == domain]\n",
-    "    cd_result = domain_result[\n",
-    "        domain_result[\"geo_level\"] == \"district\"\n",
-    "    ]\n",
+    "    cd_result = domain_result[domain_result[\"geo_level\"] == \"district\"]\n",
     "    if cd_result.empty:\n",
     "        continue\n",
     "\n",
     "    for fips, abbr in sorted(STATE_CODES.items()):\n",
     "        cd_rows = cd_result[\n",
     "            cd_result[\"geographic_id\"].apply(\n",
     "                lambda g, s=fips: (\n",
-    "                    int(g) // 100 == s\n",
-    "                    if g not in (\"US\",)\n",
-    "                    else False\n",
+    "                    int(g) // 100 == s if g not in (\"US\",) else False\n",
     "                )\n",
     "            )\n",
     "        ]\n",

diff --git a/docs/local_area_calibration_setup.ipynb b/docs/local_area_calibration_setup.ipynb
@@ -576,9 +576,7 @@
     "        f\"{col in cd_to_cols.get(cd, [])}\"\n",
     "    )\n",
     "    # Check an unrelated state\n",
-    "    print(\n",
-    "        f\"  Visible to NC (37) targets: \" f\"{col in state_to_cols.get(37, [])}\"\n",
-    "    )\n",
+    "    print(f\"  Visible to NC (37) targets: {col in state_to_cols.get(37, [])}\")\n",
     "    print()"
    ]
   },
@@ -639,8 +637,7 @@
     "        else f\"dict ({len(rate)} entries)\"\n",
     "    )\n",
     "    print(\n",
-    "        f\"  {spec['variable']:40s} \"\n",
-    "        f\"entity={spec['entity']:10s} rate={rate_str}\"\n",
+    "        f\"  {spec['variable']:40s} entity={spec['entity']:10s} rate={rate_str}\"\n",
     "    )"
    ]
   },
@@ -966,8 +963,7 @@
     "output_path = os.path.join(output_dir, \"results.h5\")\n",
     "\n",
     "print(\n",
-    "    f\"Weight vector: {len(w):,} entries \"\n",
-    "    f\"({n_demo_cds} CDs x {n_records:,} HH)\"\n",
+    "    f\"Weight vector: {len(w):,} entries ({n_demo_cds} CDs x {n_records:,} HH)\"\n",
     ")\n",
     "print(f\"Non-zero weights: {(w > 0).sum()}\")\n",
     "print(\n",
@@ -1124,7 +1120,7 @@
     "example_mapping = mapping_df.loc[\n",
     "    mapping_df.original_household_id == example_hh_id\n",
     "]\n",
-    "print(f\"Example household (original_id={example_hh_id}) \" f\"in mapping:\\n\")\n",
+    "print(f\"Example household (original_id={example_hh_id}) in mapping:\\n\")\n",
     "print(example_mapping.to_string(index=False))\n",
     "\n",
     "new_ids = example_mapping.new_household_id\n",

diff --git a/modal_app/data_build.py b/modal_app/data_build.py
@@ -391,8 +391,7 @@ def build_datasets(
         # GROUP 3: After extended_cps - run in parallel
         # enhanced_cps and stratified_cps both depend on extended_cps
         print(
-            "=== Phase 4: Building enhanced and stratified CPS (parallel)"
-            " ==="
+            "=== Phase 4: Building enhanced and stratified CPS (parallel) ==="
         )
         with ThreadPoolExecutor(max_workers=2) as executor:
             futures = [

diff --git a/paper/scripts/calculate_distributional_metrics.py b/paper/scripts/calculate_distributional_metrics.py
@@ -82,7 +82,7 @@ def calculate_top_shares(values, weights, percentiles=[90, 99]):
         threshold = weighted_percentile(values, weights, p)
         mask = values >= threshold
         top_income = np.sum(values[mask] * weights[mask])
-        shares[f"top_{100-p}%"] = top_income / total_income
+        shares[f"top_{100 - p}%"] = top_income / total_income
 
     return shares
 

diff --git a/paper/scripts/calculate_target_performance.py b/paper/scripts/calculate_target_performance.py
@@ -79,8 +79,9 @@ def compare_dataset_performance(
 
     # Calculate average improvement by target category
     categories = {
-        "IRS Income": lambda x: "employment_income" in x
-        or "capital_gains" in x,
+        "IRS Income": lambda x: (
+            "employment_income" in x or "capital_gains" in x
+        ),
         "Demographics": lambda x: "age_" in x or "population" in x,
         "Programs": lambda x: "snap" in x or "social_security" in x,
         "Tax Expenditures": lambda x: "salt" in x or "charitable" in x,
diff --git a/Makefile b/Makefile
@@ -5,7 +5,7 @@ HF_CLONE_DIR ?= $(HOME)/huggingface/policyengine-us-data
 all: data test
 
 format:
-	black . -l 79
+	ruff format .
 
 test:
 	pytest