PolicyEngine · MaxGhenis · Jun 16, 2026 · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026
diff --git a/README.md b/README.md
@@ -8,10 +8,9 @@ and benefit outputs without tools.
 For benchmark scope, snapshot policy, and terminology, see the
 [benchmark card](https://github.com/PolicyEngine/policybench/blob/main/docs/benchmark_card.md).
 
-US benchmark scenarios are sampled from the certified PolicyEngine populace
-dataset and evaluated under tax year 2026 rules with PolicyEngine-US. The
-public UK path uses a UK-calibrated transfer dataset and PolicyEngine-UK
-reference outputs for fiscal year 2026-27.
+Current public benchmark scenarios are sampled from the certified PolicyEngine
+populace US dataset and evaluated under tax year 2026 rules with
+PolicyEngine-US.
 
 ## Condition
 
@@ -39,13 +38,13 @@ eligibility, not as a dollar amount.
 ## Programs evaluated
 
 The current public release covers selected federal taxes, credits, benefits,
-coverage labels, and state-tax outputs in the US, plus selected tax and transfer
-outputs in the UK. US federal income tax is scored as a compact decomposition:
-tax after nonrefundable credits and before refundable credits, plus refundable
-federal credits excluding the ACA Premium Tax Credit. The May 2026 source run
-requested ACA Premium Tax Credit responses, but they are excluded from the
-canonical scored leaderboard because explanation audits showed prompt ambiguity
-without plan-specific Marketplace information.
+coverage labels, and state-tax outputs in the US. US federal income tax is scored
+as a compact decomposition: tax after nonrefundable credits and before
+refundable credits, plus refundable federal credits excluding the ACA Premium Tax
+Credit. The June 2026 populace source run requested ACA Premium Tax Credit
+responses, but they are excluded from the canonical scored leaderboard because
+explanation audits showed the prompt was ambiguous when households lacked
+plan-specific Marketplace information.
 
 ## Quick start
 

diff --git a/...ions/us_full_run_20260612_policyengine_4_16_1_populace/us_case_reference_explanations.csv b/...ions/us_full_run_20260612_policyengine_4_16_1_populace/us_case_reference_explanations.csv
diff --git a/app/public/paper/policybench.pdf b/app/public/paper/policybench.pdf
diff --git a/app/public/paper/web/index.html b/app/public/paper/web/index.html
diff --git a/app/src/app/model/[id]/page.tsx b/app/src/app/model/[id]/page.tsx
@@ -39,12 +39,12 @@ export async function generateMetadata({
   const scoreText = summaries
     .map(
       (summary) =>
-        `${summary.country.toUpperCase()} ${(summary.stat.within1pct ?? summary.stat.score).toFixed(1)}%`,
+        `${summary.country.toUpperCase()} ${(summary.stat.exact ?? summary.stat.score).toFixed(1)}%`,
     )
     .join(", ");
   const description =
     `How accurately ${label} estimates household tax and benefit amounts ` +
-    `without tools, scored against PolicyEngine (within-1% hit rate: ` +
+    `without tools, scored against PolicyEngine (exact-match rate: ` +
     `${scoreText}).`;
   return {
     title: label,
@@ -119,8 +119,8 @@ export default async function ModelPage({
         {summaries.map((summary) => (
           <ScorePill
             key={`score-${summary.country}`}
-            label={`${summary.country.toUpperCase()} within 1%`}
-            value={formatPct(summary.stat.within1pct ?? summary.stat.score)}
+            label={`${summary.country.toUpperCase()} exact`}
+            value={formatPct(summary.stat.exact ?? summary.stat.score)}
           />
         ))}
       </div>
@@ -202,10 +202,10 @@ export default async function ModelPage({
                             Program
                           </th>
                           <th className="px-4 py-2.5 text-right text-[10px] uppercase tracking-[0.14em] text-text-muted font-medium">
-                            Within 1%
+                            Exact
                           </th>
                           <th className="px-4 py-2.5 text-right text-[10px] uppercase tracking-[0.14em] text-text-muted font-medium">
-                            Exact
+                            Within 1%
                           </th>
                           <th className="px-4 py-2.5 text-right text-[10px] uppercase tracking-[0.14em] text-text-muted font-medium">
                             n
@@ -222,10 +222,10 @@ export default async function ModelPage({
                               {getVariableLabel(row.variable, summary.country)}
                             </td>
                             <td className="px-4 py-2 text-right font-[family-name:var(--font-mono)] text-text">
-                              {formatPct(row.within1pct)}
+                              {formatPct(row.exact)}
                             </td>
                             <td className="px-4 py-2 text-right font-[family-name:var(--font-mono)] text-text-secondary">
-                              {formatPct(row.exact)}
+                              {formatPct(row.within1pct)}
                             </td>
                             <td className="px-4 py-2 text-right font-[family-name:var(--font-mono)] text-text-muted">
                               {row.n}

diff --git a/app/src/app/paper/page.tsx b/app/src/app/paper/page.tsx
@@ -4,10 +4,10 @@ import Link from "next/link";
 
 import SiteHeader from "../../components/SiteHeader";
 
-const SNAPSHOT_DATE_LABEL = "Snapshot 2026-05-20";
+const SNAPSHOT_DATE_LABEL = "Snapshot 2026-06-14";
 
 const PAPER_DESCRIPTION =
-  "PolicyBench paper — 2026-05-20 household-equal scored manuscript snapshot with May 13-May 20 model responses and refreshed PolicyEngine reference outputs.";
+  "PolicyBench paper — the 2026-06-14 household-equal scored manuscript snapshot, with June 12-13 model responses and PolicyEngine reference outputs on the populace US microdata.";
 
 export const metadata: Metadata = {
   title: "Paper",
@@ -40,7 +40,7 @@ export const metadata: Metadata = {
 
 const manuscriptPaths = {
   pdf: "/paper/policybench.pdf",
-  web: "/paper/web/index.html?v=20260609-vals-related-work",
+  web: "/paper/web/index.html?v=20260616-populace-launch",
 };
 const ssrnUrl = process.env.NEXT_PUBLIC_POLICYBENCH_SSRN_URL;
 
@@ -49,13 +49,16 @@ export default function PaperPage() {
     <>
       <p className="max-w-2xl text-sm leading-relaxed text-text-secondary sm:text-base">
         Benchmarking no-tool tax-and-benefit estimation in frontier language
-        models. This page embeds the 2026-05-20 scored manuscript snapshot:
-        a 100-household-per-country public preview using household-equal impact
-        scores against PolicyEngine reference outputs.
+        models. This page embeds the 2026-06-14 scored manuscript snapshot: a
+        100-household public preview using household-equal impact scores against
+        PolicyEngine reference outputs.
       </p>
       <div className="mt-4 flex flex-wrap items-center gap-3">
         <span className="inline-flex items-center gap-1.5 rounded-full border border-border bg-card px-2.5 py-1 text-[10px] font-medium uppercase tracking-[0.12em] text-text-secondary">
-          <span aria-hidden className="h-1.5 w-1.5 rounded-full bg-primary/70" />
+          <span
+            aria-hidden
+            className="h-1.5 w-1.5 rounded-full bg-primary/70"
+          />
           {SNAPSHOT_DATE_LABEL}
         </span>
       </div>

diff --git a/app/src/components/Hero.tsx b/app/src/components/Hero.tsx
@@ -3,22 +3,6 @@ import SiteHeader, { type HeaderNavItem } from "./SiteHeader";
 
 const SNAPSHOT_DATE_LABEL = "Snapshot 2026-06-14";
 
-function PreReleaseBanner() {
-  return (
-    <div className="border-y border-warning/30 bg-warning-soft/90">
-      <div className="mx-auto flex max-w-7xl flex-col gap-1 px-4 py-3 text-sm text-text sm:flex-row sm:items-center sm:px-6">
-        <span className="font-[family-name:var(--font-mono)] text-[11px] font-semibold uppercase tracking-[0.12em] text-warning-text">
-          Pre-release
-        </span>
-        <span className="text-text-secondary">
-          These results are provisional. We plan to rerun PolicyBench with
-          updated data and improved prompts before the final release.
-        </span>
-      </div>
-    </div>
-  );
-}
-
 export default function Hero({
   selectedView,
   onSelectView,
@@ -60,7 +44,6 @@ export default function Hero({
         availableViews={availableViews}
         actionLink={{ label: "Paper", href: "/paper", type: "internal" }}
       />
-      <PreReleaseBanner />
 
       <section
         aria-labelledby="hero-title"
@@ -102,7 +85,10 @@ export default function Hero({
             </div>
 
             <span className="inline-flex items-center gap-1.5 rounded-full border border-border bg-card px-2.5 py-1 text-[10px] font-medium uppercase tracking-[0.12em] text-text-secondary">
-              <span aria-hidden className="h-1.5 w-1.5 rounded-full bg-primary/70" />
+              <span
+                aria-hidden
+                className="h-1.5 w-1.5 rounded-full bg-primary/70"
+              />
               {SNAPSHOT_DATE_LABEL}
             </span>
           </div>

diff --git a/app/src/components/Methodology.tsx b/app/src/components/Methodology.tsx
@@ -62,13 +62,13 @@ export default function Methodology({
   const benchData = data;
   const country = benchData.country;
   const noToolsModels = benchData.modelStats.filter(
-    (m) => m.condition === "no_tools"
+    (m) => m.condition === "no_tools",
   );
   const modelNames = noToolsModels.map((m) => MODEL_LABELS[m.model] || m.model);
   const variables = [...benchData.programStats]
     .map((program) => program.variable)
     .sort((a, b) =>
-      getVariableLabel(a, country).localeCompare(getVariableLabel(b, country))
+      getVariableLabel(a, country).localeCompare(getVariableLabel(b, country)),
     );
   const scenarioCount = Object.keys(benchData.scenarios).length;
   const scoredPoints =
@@ -138,15 +138,15 @@ export default function Methodology({
 
         <SectionCard title="Open-set status">
           The public scenario explorer exposes prompts and PolicyEngine
-          reference outputs, so future model releases or fine-tunes could
-          learn from the released cases. Treat this leaderboard as a public
-          preview; protected held-out claims would require a separate
-          rotating evaluation set.
+          reference outputs, so future model releases or fine-tunes could learn
+          from the released cases. Treat this leaderboard as a public preview;
+          protected held-out claims would require a separate rotating evaluation
+          set.
         </SectionCard>
 
         <SectionCard title="Households">
           {country === "uk"
-            ? "The UK benchmark samples one-benefit-unit households from the public UK calibrated transfer dataset with a fixed seed. That dataset maps benchmark-compatible US Enhanced CPS records into UK-facing inputs and recalibrates weights to selected UK targets. The prompt states the shared UK benefit-unit structure; nonzero promptable inputs are carried through into both the prompt and the PolicyEngine-UK input."
+            ? "The UK benchmark samples one-benefit-unit households from the public UK calibrated transfer dataset with a fixed seed. That dataset maps benchmark-compatible US household records into UK-facing inputs and recalibrates weights to selected UK targets. The prompt states the shared UK benefit-unit structure; nonzero promptable inputs are carried through into both the prompt and the PolicyEngine-UK input."
             : `The US benchmark samples households from PolicyEngine's populace US microdataset with a fixed seed. The current set is restricted to households with a single federal tax unit, a single family, and a single benefit-calculation unit. Adult dependents remain in scope when they satisfy those restrictions. Ages, roles, income sources, and other nonzero promptable inputs are carried through into both the prompt and the ${referenceOutputSource} input; filing status is inferred from household structure.`}
         </SectionCard>
 
@@ -162,48 +162,44 @@ export default function Methodology({
           facts. It excludes intermediate tax bases, payroll subcomponents, and
           outputs that mainly require unavailable history, restricted local
           market data, or program take-up assignment. WIC is scored as
-          person-level eligibility, not as a dollar amount. Local income tax
-          is retained as a displayed requested output, but currently receives
-          zero default population-impact weight because the full populace
-          dataset has no positive modeled local-income-tax records.
+          person-level eligibility, not as a dollar amount. Local income tax is
+          retained as a displayed requested output, but currently receives zero
+          default population-impact weight because the full populace dataset has
+          no positive modeled local-income-tax records.
           {country === "us"
             ? " The source run also requested the ACA Premium Tax Credit, but explanation audits showed the prompt could be misleading when households lacked plan-specific Marketplace information, so it is preserved in raw responses and excluded from the scored leaderboard."
             : ""}
         </SectionCard>
 
         <SectionCard title="Scoring and weighting">
-          The public leaderboard ranks models by the within-1% hit rate using
+          The public leaderboard ranks models by the exact-match rate using
           population household-impact weights. For each household-output row,
           the within-1% indicator is 1 when a currency answer is within 1% of
           the PolicyEngine reference value, with a one-currency-unit tolerance
           when the reference is zero. Binary eligibility flags are requested as
           integer 0/1 outputs and require exact 0/1 matching. The secondary
-          bounded score uses{" "}
-          <code>max(0, 1 − |pred − ref| / |ref|)</code> when the reference is
-          nonzero and exact zero matches when the reference is zero for amount
-          outputs, and the same exact 0/1 rule for binary outputs.
-          Each full source household&apos;s per-output share is{" "}
+          bounded score uses <code>max(0, 1 − |pred − ref| / |ref|)</code> when
+          the reference is nonzero and exact zero matches when the reference is
+          zero for amount outputs, and the same exact 0/1 rule for binary
+          outputs. Each full source household&apos;s per-output share is{" "}
           <code>|ref| / max(|household_net_income|, Σ |ref|)</code>, a value in
           [0, 1] that&apos;s strictly less than one when net income dominates
           the gross tax-benefit flow and equals one only when programs cancel
           each other out. Those shares are averaged using calibrated household
           weights in the full weighting population, then renormalized so the
-          output weights sum to one. US weights use the full populace dataset; UK
-          weights use the full enhanced FRS. This weighting source is separate
-          from the UK benchmark scenarios, which use the public calibrated
-          transfer dataset. The weights are then applied to the fixed benchmark
-          households and renormalized within each household over requested
-          outputs.
+          output weights sum to one. US weights use the full populace dataset.
+          The weights are then applied to the fixed benchmark households and
+          renormalized within each household over requested outputs.
           {country === "us"
             ? " Person-level eligibility flags like Medicaid carry weight through PolicyEngine's paired per-capita value (e.g. medicaid_value), so the LLM is graded only on the boolean call itself."
-            : " Person-level eligibility flags carry weight through PolicyEngine's paired per-capita value, so the LLM is graded only on the boolean call itself."}
-          {" "}Missing or unparseable answers count as misses through the
-          coverage multiplier. The leaderboard reports within-1% as the
-          headline, exact match as the deployability bar, and bounded score,
-          amount accuracy, and participation accuracy as diagnostic companions.
-          Equal-weight and budget-weighted variants are reported alongside for
-          transparency. The leaderboard is a point estimate on this fixed test
-          set.
+            : " Person-level eligibility flags carry weight through PolicyEngine's paired per-capita value, so the LLM is graded only on the boolean call itself."}{" "}
+          Missing or unparseable answers count as misses through the coverage
+          multiplier. The leaderboard reports the exact-match rate as the
+          headline deployability bar, with within-1% as a near-miss-tolerant
+          companion and bounded score, amount accuracy, and participation
+          accuracy as further diagnostics. Equal-weight and
+          budget-weighted variants are reported alongside for transparency. The
+          leaderboard is a point estimate on this fixed test set.
         </SectionCard>
 
         <SectionCard title="Sensitivity checks">
@@ -212,8 +208,7 @@ export default function Methodology({
           cases, zero-reference cases, and country-only results. In the
           equal-output-group view, person-level outputs are grouped by program
           before the country average. These checks are used to interpret rank
-          stability; they do not replace the public within-1%
-          leaderboard.
+          stability; they do not replace the public exact-match leaderboard.
         </SectionCard>
 
         <SectionCard title="Impact weighting">
@@ -242,7 +237,7 @@ export default function Methodology({
             </div>
           </div>
           <div className="text-text-muted text-xs">
-            Fixed test set, no tools, US tax year 2026 / UK fiscal year 2026-27
+            Fixed test set, no tools, US tax year 2026
           </div>
         </div>
 

diff --git a/app/src/components/ModelLeaderboard.tsx b/app/src/components/ModelLeaderboard.tsx
@@ -84,14 +84,19 @@ export default function ModelLeaderboard({
 }) {
   const [sensitivityView, setSensitivityView] =
     useState<SensitivityViewId>("household");
-  // Headline scoring: defaults to "within1pct". On UK, ~71% of references
-  // are £0 and Exact mode mostly measures "did you say £0?" — the
-  // within-1% bar restores meaningful separation. Exact remains a click
-  // away as the production-deployability bar; Continuous tracks
-  // conceptual progress year over year.
+  // Headline scoring: defaults to "exact" — the deployability bar. A
+  // prediction counts only if it matches the PolicyEngine reference to the
+  // dollar (for amounts) or to the eligibility flag (for booleans), which is
+  // the bar a tax filer, benefit estimator, or caseworker actually has to
+  // clear. Because this leaderboard is household-impact-weighted,
+  // zero-reference outputs — the ones a hedge-to-zero model gets for free —
+  // are down-weighted, so the weighted exact score is NOT compressed near the
+  // unweighted zero share and discriminates between models about as well as
+  // within-1%. Within-1% (near-miss tolerance) and Continuous (partial
+  // credit) are a click away.
   const [scoringMode, setScoringMode] = useState<
     "exact" | "within1pct" | "continuous"
-  >("within1pct");
+  >("exact");
   // Reference cases: All by default, but Positives is the right view when
   // (e.g.) UK references are 71% £0 and Exact mode mostly measures
   // "did you say £0?". Zeros surfaces the inverse — eligibility hedging.