From a1d28a597ad87559dad0e26a2f266cf516553d21 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 09:24:24 +0200
Subject: [PATCH 1/3] remove examples/flyradar_eval_example.py

---
 examples/flyradar_eval_example.py | 406 ------------------------------
 1 file changed, 406 deletions(-)
 delete mode 100644 examples/flyradar_eval_example.py

diff --git a/examples/flyradar_eval_example.py b/examples/flyradar_eval_example.py
deleted file mode 100644
index 706528f4..00000000
--- a/examples/flyradar_eval_example.py
+++ /dev/null
@@ -1,406 +0,0 @@
-# Copyright 2026 Firefly Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""FlyRadar evaluation example — gate-based process-mining quality gate.
-
-Demonstrates how to use ``fireflyframework_agentic.evaluation`` to replicate
-the flyradar experiment quality-gate workflow:
-
-1. Load a must-find registry (the gold standard items the model must discover).
-2. Load a DiscoveryResult produced by a flyradar pipeline run.
-3. Run gates G1-G5 to produce a structured verdict:
-     G1 -- Structural & Safe (schema validity, PII, empty-registry guard).
-     G2 -- Recall & Precision (must-find recall floor, NC precision).
-     G3 -- Grounded (finding-to-evidence anchoring).
-     G4 -- LLM-as-a-Judge (advisory only; never blocks promotion).
-     G5 -- No-regression / promotion (champion/challenger comparison).
-4. Render a human-readable scorecard and print the final verdict.
-5. Promote the challenger to champion when the verdict is PROMOTE.
-
-Usage::
-
-    # Minimal: deterministic gates only (no G4 judge, no baseline)
-    python examples/flyradar_eval_example.py \\
-        --result output.json \\
-        --registry registry.json
-
-    # With corpus verification and a champion baseline
-    python examples/flyradar_eval_example.py \\
-        --result output.json \\
-        --registry registry.json \\
-        --baseline baseline.json \\
-        --corpus input.json
-
-    # With the advisory G4 LLM judge (requires API key in environment)
-    FLYEVAL_JUDGE_MODEL=anthropic:claude-sonnet-4-6 \\
-    python examples/flyradar_eval_example.py \\
-        --result output.json \\
-        --registry registry.json \\
-        --judge-model anthropic:claude-sonnet-4-6
-
-Exit codes: 0 = PROMOTE, 1 = HOLD.
-
-Input file formats
-------------------
-``--result`` (output.json)
-    A DiscoveryResult JSON produced by a flyradar pipeline run.  Must contain
-    at minimum ``findings`` (list) and ``evidence_index`` (list).
-
-``--registry`` (registry.json)
-    A lean-1 registry JSON.  Each item has ``id``, ``tier`` (L0-L3), ``title``,
-    ``description``, and ``nc`` (bool, True for negative controls).
-
-``--baseline`` (baseline.json)
-    A ChampionRecord JSON written by a previous PROMOTE run.  When omitted the
-    gate runs in day-zero mode (G5 always passes and a new champion is minted).
-
-``--corpus`` (input.json)
-    The corpus bundle used during the run.  When supplied, G3 verifies that cited
-    evidence excerpts actually appear in the corpus documents.
-"""
-
-from __future__ import annotations
-
-import argparse
-import json
-import sys
-from pathlib import Path
-
-from fireflyframework_agentic.evaluation import (
-    ChampionRecord,
-    GateResult,
-    build_embedder,
-    load_champion,
-    load_corpus,
-    load_registry,
-    render_scorecard,
-    run_gates,
-    run_judge,
-    save_champion,
-    verdict,
-    VERDICT_PROMOTE,
-)
-from fireflyframework_agentic.evaluation.models import EvalConfig
-
-
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-
-def _load_json(path: str) -> dict:
-    """Read a JSON file and return its contents as a dict."""
-    return json.loads(Path(path).read_text(encoding="utf-8"))
-
-
-def _lexical_missed_ids(result: dict, registry) -> list[str]:
-    """Return the IDs of registry items not matched by any finding (lexically).
-
-    The G4 judge uses these to focus its coverage checks on items that
-    lexical recall missed — the places where semantic recovery matters most.
-    """
-    from fireflyframework_agentic.evaluation.matcher import matches
-
-    evidence_index = {ev["id"]: ev for ev in result.get("evidence_index", []) if ev.get("id")}
-    findings = result.get("findings", [])
-    # L3 items are informational-only and are never scored.
-    scored_items = [item for item in registry.real_items if item.tier != "L3"]
-    return [
-        item.id
-        for item in scored_items
-        if not any(matches(f, item, evidence_index) for f in findings)
-    ]
-
-
-# ---------------------------------------------------------------------------
-# Main evaluation flow
-# ---------------------------------------------------------------------------
-
-
-def run_evaluation(args: argparse.Namespace) -> int:
-    """Run the full flyradar gate evaluation and return an exit code."""
-
-    # ------------------------------------------------------------------
-    # Step 1 — Load inputs.
-    # ------------------------------------------------------------------
-    print(f"Loading result   : {args.result}")
-    result = _load_json(args.result)
-
-    print(f"Loading registry : {args.registry}")
-    registry = load_registry(args.registry)
-    print(f"  {len(registry.real_items)} real items, {len(registry.nc_items)} NC items")
-
-    # The EvalConfig captures provenance for the run record.
-    config = EvalConfig(
-        model_id=args.model_id,
-        corpus=registry.corpus,
-        run_id=args.run_id,
-        registry_path=args.registry,
-        corpus_path=args.corpus or "",
-        baseline_path=args.baseline or "",
-        judge_model=args.judge_model or "",
-    )
-
-    # Optional: corpus bundle for deterministic evidence verification (G3).
-    corpus = None
-    if args.corpus:
-        print(f"Loading corpus   : {args.corpus}")
-        corpus = load_corpus(args.corpus)
-
-    # Optional: champion record for regression detection (G5).
-    champion = None
-    champion_scores = None
-    aa_noise = None
-    if args.baseline:
-        print(f"Loading baseline : {args.baseline}")
-        champion = load_champion(args.baseline)
-        if champion:
-            champion_scores = champion.scores
-            aa_noise = champion.aa_noise
-            print(f"  Champion run   : {champion.run_id} ({champion.model_id})")
-        else:
-            print("  No champion found — running in day-zero mode.")
-
-    # Optional: embedder for semantic/hybrid recall (G2).
-    embed_fn = None
-    if args.embedder:
-        print(f"Building embedder: {args.embedder}")
-        embed_fn = build_embedder(args.embedder)
-
-    print()
-
-    # ------------------------------------------------------------------
-    # Step 2 — Run deterministic gates G1-G3 + G5.
-    #
-    # run_gates() returns a list of GateResult objects, one per gate.
-    # Each GateResult carries:
-    #   .gate   -- "G1" | "G2" | "G3" | "G5"
-    #   .passed -- bool
-    #   .details -- dict with per-metric values
-    #   .errors  -- list[str] of blocking error codes
-    # ------------------------------------------------------------------
-    print("Running gates G1-G3 + G5 ...")
-    gate_results: list[GateResult] = run_gates(
-        result,
-        registry,
-        args.registry,
-        pii_list=args.pii_list or [],
-        recall_floor=args.recall_floor,
-        grounding_floor=args.grounding_floor,
-        champion_scores=champion_scores,
-        aa_noise=aa_noise,
-        is_day_zero=(champion is None),
-        human_signed_off=args.human_signed_off,
-        signoff_count=args.signoffs,
-        embed_fn=embed_fn,
-        tau=args.tau,
-        recall_metric=args.recall_metric,
-        tau_nc=args.tau_nc,
-        corpus=corpus,
-    )
-
-    # Quick gate summary before the full scorecard.
-    for gr in gate_results:
-        status = "PASS" if gr.passed else "FAIL"
-        print(f"  {gr.gate}: {status}")
-
-    # ------------------------------------------------------------------
-    # Step 3 — Run the advisory G4 LLM-as-a-Judge (optional).
-    #
-    # G4 is non-blocking: it never changes the verdict or exit code.
-    # It produces an AdvisoryReport with per-finding quality signals
-    # (faithfulness, citation relevance, fabricated entities, etc.).
-    # ------------------------------------------------------------------
-    advisory = None
-    if args.judge_model:
-        print(f"\nRunning G4 judge ({args.judge_model}) ...")
-        missed_ids = _lexical_missed_ids(result, registry)
-        advisory = run_judge(
-            result,
-            registry,
-            judge_model=args.judge_model,
-            runs=args.judge_runs,
-            concurrency=args.judge_concurrency,
-            pipeline_model=args.model_id,
-            embed_fn=embed_fn,
-            tau=args.tau,
-            lexical_missed_ids=missed_ids,
-        )
-        print(f"  Judge completed ({args.judge_runs} run(s)).")
-    else:
-        print("\nG4 judge skipped (pass --judge-model to enable).")
-
-    # ------------------------------------------------------------------
-    # Step 4 — Render the scorecard.
-    #
-    # render_scorecard() produces a markdown-formatted human-readable
-    # report that mirrors the output of `flyeval gate` in the playground.
-    # ------------------------------------------------------------------
-    print()
-    scorecard = render_scorecard(
-        gate_results,
-        corpus=registry.corpus,
-        model_id=config.model_id,
-        run_id=config.run_id,
-        is_self_graded=True,
-        kappa_advisory=registry.is_kappa_advisory(),
-        evidence_unverified=(corpus is None),
-        advisory=advisory,
-    )
-    print(scorecard)
-
-    # ------------------------------------------------------------------
-    # Step 5 — Inspect the verdict and handle promotion.
-    #
-    # verdict() returns "PROMOTE" or "HOLD" based on the gate results.
-    # On PROMOTE, save the challenger as the new champion so future runs
-    # can detect regressions against this baseline.
-    # ------------------------------------------------------------------
-    v = verdict(gate_results)
-    print(f"\nFinal verdict: {v}")
-
-    if v == VERDICT_PROMOTE and args.baseline:
-        # Extract the key scores from G2 and G3 to store in the champion record.
-        g2 = next((g for g in gate_results if g.gate == "G2"), None)
-        g3 = next((g for g in gate_results if g.gate == "G3"), None)
-        scores: dict[str, float] = {}
-        if g2:
-            scores["recall"] = g2.details.get("recall", 0.0)
-        if g3:
-            scores["grounding_pct"] = g3.details.get("grounding_pct", 0.0)
-
-        new_champion = ChampionRecord(
-            corpus=registry.corpus,
-            run_id=config.run_id,
-            model_id=config.model_id,
-            registry_sha256=registry.sha256(),
-            scores=scores,
-            is_day_zero=(champion is None),
-        )
-        save_champion(
-            args.baseline,
-            new_champion,
-            summary=f"Promoted by flyradar_eval_example.py — {config.run_id}",
-        )
-        print(f"Champion saved to {args.baseline}")
-
-    # Exit 0 = PROMOTE, 1 = HOLD (mirrors `flyeval gate` convention).
-    return 0 if v == VERDICT_PROMOTE else 1
-
-
-# ---------------------------------------------------------------------------
-# CLI
-# ---------------------------------------------------------------------------
-
-
-def build_parser() -> argparse.ArgumentParser:
-    p = argparse.ArgumentParser(
-        prog="flyradar_eval_example",
-        description="FlyRadar gate evaluation — replicates the flyeval gate workflow.",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-
-    # Required inputs.
-    p.add_argument("--result", required=True, help="Path to DiscoveryResult JSON.")
-    p.add_argument("--registry", required=True, help="Path to lean-1 registry JSON.")
-
-    # Optional inputs.
-    p.add_argument(
-        "--baseline",
-        help="Path to baseline.json (champion store).  When absent, runs in day-zero mode.",
-    )
-    p.add_argument(
-        "--corpus",
-        help="Path to input.json corpus bundle for deterministic evidence verification (G3).",
-    )
-
-    # Run metadata.
-    p.add_argument("--model-id", default="unknown", help="Model identifier for the scorecard.")
-    p.add_argument("--run-id", default="example-run", help="Run identifier for the scorecard.")
-
-    # Gate thresholds.
-    p.add_argument(
-        "--recall-floor",
-        type=float,
-        default=0.70,
-        help="Minimum recall required for G2 to pass.",
-    )
-    p.add_argument(
-        "--grounding-floor",
-        type=float,
-        default=0.90,
-        help="Minimum grounding percentage required for G3 to pass.",
-    )
-    p.add_argument(
-        "--recall-metric",
-        choices=["lexical", "semantic", "hybrid"],
-        default="lexical",
-        help="Recall metric used by G2.  'semantic' and 'hybrid' require --embedder.",
-    )
-    p.add_argument(
-        "--tau",
-        type=float,
-        default=0.70,
-        help="Cosine similarity threshold for semantic recall (real items).",
-    )
-    p.add_argument(
-        "--tau-nc",
-        type=float,
-        default=0.85,
-        help="Cosine similarity threshold for NC item detection.",
-    )
-    p.add_argument("--pii-list", nargs="*", default=[], help="PII tokens to check for in findings.")
-    p.add_argument("--human-signed-off", action="store_true", help="Mark this run as human-reviewed.")
-    p.add_argument("--signoffs", type=int, default=0, help="Number of human sign-offs collected.")
-
-    # G4 judge options.
-    p.add_argument(
-        "--judge-model",
-        default=None,
-        help=(
-            "Provider:model string for the advisory G4 LLM judge "
-            "(e.g. 'anthropic:claude-sonnet-4-6').  Omit to skip G4."
-        ),
-    )
-    p.add_argument(
-        "--judge-runs",
-        type=int,
-        default=1,
-        help="Number of judge calls to aggregate (odd number recommended for median).",
-    )
-    p.add_argument(
-        "--judge-concurrency",
-        type=int,
-        default=1,
-        help="Thread fan-out for per-item G4 metrics (1 = sequential).",
-    )
-
-    # Embedder for semantic recall.
-    p.add_argument(
-        "--embedder",
-        default=None,
-        help="Embedder spec for semantic/hybrid recall (e.g. 'ollama:bge-m3').",
-    )
-
-    return p
-
-
-def main() -> None:
-    parser = build_parser()
-    args = parser.parse_args()
-    sys.exit(run_evaluation(args))
-
-
-if __name__ == "__main__":
-    main()

From 61617186f1ed103c783197784497dd841a260b43 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 09:24:27 +0200
Subject: [PATCH 2/3] ci: add --extra evaluation to typecheck and test sync
 steps

---
 .github/workflows/pr-gate.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pr-gate.yml b/.github/workflows/pr-gate.yml
index c0ef76d4..86e35717 100644
--- a/.github/workflows/pr-gate.yml
+++ b/.github/workflows/pr-gate.yml
@@ -57,7 +57,7 @@ jobs:
       - uses: actions/setup-python@v6
         with:
           python-version: '3.13'
-      - run: uv sync --extra dev --extra binary --extra vectorstores-sqlite-vec --extra openai-embeddings
+      - run: uv sync --extra dev --extra binary --extra vectorstores-sqlite-vec --extra openai-embeddings --extra evaluation
       - run: uv run pyright
 
   test:
@@ -72,7 +72,7 @@ jobs:
       - uses: actions/setup-python@v6
         with:
           python-version: '3.13'
-      - run: uv sync --extra dev --extra binary --extra vectorstores-sqlite-vec --extra vectorstores-pgvector --extra openai-embeddings
+      - run: uv sync --extra dev --extra binary --extra vectorstores-sqlite-vec --extra vectorstores-pgvector --extra openai-embeddings --extra evaluation
       - run: uv run pytest -m "not nightly" --cov --cov-report=term-missing
 
   build:

From 203134ca971377816c462b7d4c5125d9ebc9d4e0 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 09:24:32 +0200
Subject: [PATCH 3/3] fix(evaluation): resolve all ruff lint errors (import
 sort, SIM108, B905, N806, UP035)

---
 examples/flycanon_eval_example.py             | 13 +--
 .../evaluation/__init__.py                    | 21 ++++-
 fireflyframework_agentic/evaluation/cli.py    | 42 +++++-----
 fireflyframework_agentic/evaluation/corpus.py | 20 ++---
 fireflyframework_agentic/evaluation/gates.py  | 42 +++-------
 fireflyframework_agentic/evaluation/judge.py  | 79 ++++++++-----------
 .../evaluation/judge_client.py                | 25 ++----
 .../evaluation/matcher.py                     | 60 +++++++-------
 .../evaluation/registry.py                    | 40 +++++-----
 .../evaluation/run_config_snapshot.py         |  9 +--
 .../evaluation/scorecard.py                   | 44 +++--------
 fireflyframework_agentic/evaluation/stats.py  |  9 +--
 uv.lock                                       | 59 +++++++++++++-
 13 files changed, 220 insertions(+), 243 deletions(-)

diff --git a/examples/flycanon_eval_example.py b/examples/flycanon_eval_example.py
index 9d8d071b..856b520b 100644
--- a/examples/flycanon_eval_example.py
+++ b/examples/flycanon_eval_example.py
@@ -94,8 +94,7 @@
 import sys
 from pathlib import Path
 
-from fireflyframework_agentic.evaluation import RetrieverMetrics, compute_retrieval_metrics
-
+from fireflyframework_agentic.evaluation import RetrieverMetrics
 
 # ---------------------------------------------------------------------------
 # Helpers
@@ -171,10 +170,7 @@ def _print_metrics_table(metrics: RetrieverMetrics, baseline: dict | None) -> No
         if value is None:
             continue
         # Format floats as 4 decimal places; ints as plain integers.
-        if isinstance(value, float):
-            cur_str = f"{value:.4f}"
-        else:
-            cur_str = str(value)
+        cur_str = f"{value:.4f}" if isinstance(value, float) else str(value)
 
         row = f"{key:<{col_w}} {cur_str:>{num_w}}"
         if baseline and key in baseline and isinstance(value, float):
@@ -353,10 +349,7 @@ def build_parser() -> argparse.ArgumentParser:
     p.add_argument(
         "--baseline",
         default=None,
-        help=(
-            "Path to baseline.json (champion store).  When absent, scores are printed "
-            "without comparison."
-        ),
+        help=("Path to baseline.json (champion store).  When absent, scores are printed without comparison."),
     )
     p.add_argument(
         "--promote-if-better",
diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py
index ad01980c..d986d09f 100644
--- a/fireflyframework_agentic/evaluation/__init__.py
+++ b/fireflyframework_agentic/evaluation/__init__.py
@@ -31,16 +31,29 @@
 
 from importlib.metadata import PackageNotFoundError, version
 
-from fireflyframework_agentic.evaluation.corpus import EMPTY, FABRICATED, SOURCE_UNKNOWN, VERIFIED, corpus_sha256, load_corpus, verify_evidence_index
+from fireflyframework_agentic.evaluation.champion import (
+    ChampionRecord,
+    invalidate_champion,
+    load_champion,
+    save_champion,
+)
+from fireflyframework_agentic.evaluation.corpus import (
+    EMPTY,
+    FABRICATED,
+    SOURCE_UNKNOWN,
+    VERIFIED,
+    corpus_sha256,
+    load_corpus,
+    verify_evidence_index,
+)
 from fireflyframework_agentic.evaluation.gates import GateResult, Verdict, g2_recall_precision, run_gates
-from fireflyframework_agentic.evaluation.scorecard import render_scorecard, verdict, VERDICT_PROMOTE, VERDICT_HOLD
-from fireflyframework_agentic.evaluation.champion import ChampionRecord, invalidate_champion, load_champion, save_champion
 from fireflyframework_agentic.evaluation.judge import AdvisoryReport, run_judge
 from fireflyframework_agentic.evaluation.judge_client import JudgeClient, OllamaEmbedder, build_embedder, cosine
 from fireflyframework_agentic.evaluation.matcher import anchored, matches, source_stem, tokens
 from fireflyframework_agentic.evaluation.registry import Registry, RegistryItem, load_registry, registry_sha256
-from fireflyframework_agentic.lab.retrieval_metrics import RetrieverMetrics, compute_retrieval_metrics
+from fireflyframework_agentic.evaluation.scorecard import VERDICT_HOLD, VERDICT_PROMOTE, render_scorecard, verdict
 from fireflyframework_agentic.evaluation.stats import aa_band, aggregate_grounding, left_skew_flag
+from fireflyframework_agentic.lab.retrieval_metrics import RetrieverMetrics, compute_retrieval_metrics
 
 try:
     __version__ = version("fireflyframework-agentic")
diff --git a/fireflyframework_agentic/evaluation/cli.py b/fireflyframework_agentic/evaluation/cli.py
index 7ac868d9..80dc418a 100644
--- a/fireflyframework_agentic/evaluation/cli.py
+++ b/fireflyframework_agentic/evaluation/cli.py
@@ -48,7 +48,8 @@
 from fireflyframework_agentic.evaluation.judge_client import build_embedder
 from fireflyframework_agentic.evaluation.matcher import matches
 from fireflyframework_agentic.evaluation.registry import load_registry
-from fireflyframework_agentic.evaluation.scorecard import render_scorecard, verdict as get_verdict
+from fireflyframework_agentic.evaluation.scorecard import render_scorecard
+from fireflyframework_agentic.evaluation.scorecard import verdict as get_verdict
 from fireflyframework_agentic.evaluation.stats import aa_band, left_skew_flag
 
 
@@ -114,10 +115,8 @@ def _eval_config(args, registry, corpus=None) -> dict:
                     "champion (EMPTY_MUST_FIND)",
                     "registry_sha256_pin": "loaded registry matches its file hash (GOLD_DRIFT)",
                     "corpus_sha256_pin": "corpus matches its hash when supplied (CORPUS_DRIFT)",
-                    "schema_valid": "required top-level keys present in the result "
-                    "(SCHEMA_INVALID)",
-                    "pii_non_disclosure": "no corpus PII name appears in any finding/report text "
-                    "(PII_LEAK)",
+                    "schema_valid": "required top-level keys present in the result (SCHEMA_INVALID)",
+                    "pii_non_disclosure": "no corpus PII name appears in any finding/report text (PII_LEAK)",
                 },
             },
             "G2": {
@@ -142,14 +141,10 @@ def _eval_config(args, registry, corpus=None) -> dict:
                 "human_spot_check_n": 5,
                 "corpus_verification": corpus is not None,
                 "metrics": {
-                    "grounding_pct": "findings whose cited excerpt shares a topic token; blocks "
-                    "below grounding_floor",
-                    "evidence_verified": "cited excerpts located in the actual corpus "
-                    "(when supplied)",
-                    "evidence_fabricated": "populated excerpts not found in their cited source "
-                    "(EVIDENCE_FABRICATED)",
-                    "evidence_source_unknown": "locators resolving to no corpus document "
-                    "(EVIDENCE_SOURCE_UNKNOWN)",
+                    "grounding_pct": "findings whose cited excerpt shares a topic token; blocks below grounding_floor",
+                    "evidence_verified": "cited excerpts located in the actual corpus (when supplied)",
+                    "evidence_fabricated": "populated excerpts not found in their cited source (EVIDENCE_FABRICATED)",
+                    "evidence_source_unknown": "locators resolving to no corpus document (EVIDENCE_SOURCE_UNKNOWN)",
                     "excerpt_fill_rate": "evidence entries carrying a populated excerpt",
                     "source_coverage": "distinct corpus documents cited",
                 },
@@ -173,8 +168,7 @@ def _eval_config(args, registry, corpus=None) -> dict:
                     "severity_calibration": "stated severity matches the evidence",
                     "answer_relevancy": "output addresses the workspace intention",
                     "source_coverage": "distinct corpus documents cited (deterministic)",
-                    "excerpt_fill_rate": "evidence entries with a populated excerpt "
-                    "(deterministic)",
+                    "excerpt_fill_rate": "evidence entries with a populated excerpt (deterministic)",
                 },
             },
             "G5": {
@@ -305,9 +299,12 @@ def cmd_aa_band(args: argparse.Namespace) -> int:
     for rp in args.results:
         result = _load_json(rp)
         g2 = g2_recall_precision(
-            result, registry,
-            recall_metric=args.recall_metric, embed_fn=embed_fn,
-            tau=args.tau, tau_nc=args.tau_nc,
+            result,
+            registry,
+            recall_metric=args.recall_metric,
+            embed_fn=embed_fn,
+            tau=args.tau,
+            tau_nc=args.tau_nc,
             corpus=corpus,
         )
         if g2.passed or g2.details.get("recall") is not None:
@@ -468,15 +465,13 @@ def _add_common(p: argparse.ArgumentParser) -> None:
         "--tau",
         type=float,
         default=float(os.environ.get("FLYEVAL_TAU", "0.70")),
-        help="cosine similarity threshold for the semantic recall path (real items). "
-        "Env: FLYEVAL_TAU",
+        help="cosine similarity threshold for the semantic recall path (real items). Env: FLYEVAL_TAU",
     )
     p_gate.add_argument(
         "--tau-nc",
         type=float,
         default=float(os.environ.get("FLYEVAL_TAU_NC", "0.85")),
-        help="cosine similarity threshold for NC item detection (higher; no source anchor). "
-        "Env: FLYEVAL_TAU_NC",
+        help="cosine similarity threshold for NC item detection (higher; no source anchor). Env: FLYEVAL_TAU_NC",
     )
     p_gate.add_argument("--human-signed-off", action="store_true")
     p_gate.add_argument("--signoffs", type=int, default=0)
@@ -495,8 +490,7 @@ def _add_common(p: argparse.ArgumentParser) -> None:
         "--judge-runs",
         type=int,
         default=int(os.environ.get("FLYEVAL_JUDGE_RUNS", "1")),
-        help="G4 judge runs; the median of numeric scores is kept (odd recommended). "
-        "Env: FLYEVAL_JUDGE_RUNS",
+        help="G4 judge runs; the median of numeric scores is kept (odd recommended). Env: FLYEVAL_JUDGE_RUNS",
     )
     p_gate.add_argument(
         "--judge-concurrency",
diff --git a/fireflyframework_agentic/evaluation/corpus.py b/fireflyframework_agentic/evaluation/corpus.py
index 32835f2c..34926b41 100644
--- a/fireflyframework_agentic/evaluation/corpus.py
+++ b/fireflyframework_agentic/evaluation/corpus.py
@@ -80,7 +80,7 @@ def normalize(text: str) -> str:
     smart quotes, collapse whitespace, casefold."""
     text = unicodedata.normalize("NFKC", text)
     text = text.replace("**", "").replace("*", "")
-    text = re.sub(r"[\"""''']", "", text)
+    text = re.sub(r"[\"\u201c\u201d\u2018\u2019']", "", text)  # straight and curly quotes
     return re.sub(r"\s+", " ", text).strip().casefold()
 
 
@@ -129,9 +129,7 @@ def load_corpus(path: str | Path) -> Corpus:
 
 def _fragment_coverage(fragment: str, source: str) -> float:
     """Fraction of fragment covered by matching blocks of >= _MIN_BLOCK_CHARS chars."""
-    blocks = difflib.SequenceMatcher(
-        None, fragment, source, autojunk=False
-    ).get_matching_blocks()
+    blocks = difflib.SequenceMatcher(None, fragment, source, autojunk=False).get_matching_blocks()
     covered = sum(b.size for b in blocks if b.size >= _MIN_BLOCK_CHARS)
     return covered / len(fragment)
 
@@ -158,11 +156,9 @@ def verify_entry(corpus: Corpus, entry: dict) -> str:
     if not excerpt:
         return EMPTY
 
-    fragments = [
-        f.strip()
-        for f in _SPLICE_PATTERN.split(excerpt)
-        if len(f.strip()) >= _MIN_FRAGMENT_CHARS
-    ] or [excerpt]
+    fragments = [f.strip() for f in _SPLICE_PATTERN.split(excerpt) if len(f.strip()) >= _MIN_FRAGMENT_CHARS] or [
+        excerpt
+    ]
 
     for fragment in fragments:
         if fragment in source:
@@ -178,8 +174,4 @@ def verify_evidence_index(corpus: Corpus, result: dict) -> dict[str, str]:
     Returns {evidence_id: status} over all entries — referenced or not — so
     the gates share one verification pass.
     """
-    return {
-        ev["id"]: verify_entry(corpus, ev)
-        for ev in result.get("evidence_index", [])
-        if ev.get("id")
-    }
+    return {ev["id"]: verify_entry(corpus, ev) for ev in result.get("evidence_index", []) if ev.get("id")}
diff --git a/fireflyframework_agentic/evaluation/gates.py b/fireflyframework_agentic/evaluation/gates.py
index 057bfea7..fc98d311 100644
--- a/fireflyframework_agentic/evaluation/gates.py
+++ b/fireflyframework_agentic/evaluation/gates.py
@@ -93,11 +93,7 @@ def _build_evidence_index(result: dict, corpus: Corpus | None = None) -> dict[st
     if corpus is None:
         return index
     statuses = verify_evidence_index(corpus, result)
-    return {
-        eid: ev
-        for eid, ev in index.items()
-        if statuses[eid] in (VERIFIED, EMPTY)
-    }
+    return {eid: ev for eid, ev in index.items() if statuses[eid] in (VERIFIED, EMPTY)}
 
 
 # ── G1: Structural & Safe ────────────────────────────────────────────────────
@@ -322,8 +318,10 @@ def _finding_redundancy_rate(findings: list[dict]) -> float:
     """Fraction of findings that are near-duplicates of another (Jaccard ≥0.6 on ≥5-char tokens)."""
     if len(findings) < 2:
         return 0.0
+
     def _tok(text: str) -> frozenset[str]:
         return frozenset(t.lower() for t in text.split() if len(t) >= 5)
+
     token_sets = [_tok(f.get("description", "")) for f in findings]
     in_redundant: set[int] = set()
     for i in range(len(token_sets)):
@@ -381,9 +379,7 @@ def g2_recall_precision(
         if item.tier == "NC":
             lexical[item.id] = False
         elif item.scope == "dependency_graph" and item.from_node:
-            lexical[item.id] = matcher.matches_dependency_graph_relation(
-                item, result, evidence_index
-            )
+            lexical[item.id] = matcher.matches_dependency_graph_relation(item, result, evidence_index)
         else:
             lexical[item.id] = any(
                 matches(c, item, evidence_index, scope=scope)
@@ -394,14 +390,10 @@ def g2_recall_precision(
     if recall_metric not in ("lexical", "semantic", "hybrid"):
         raise ValueError(f"unknown recall_metric {recall_metric!r}")
     if recall_metric in ("semantic", "hybrid") and embed_fn is None:
-        raise ValueError(
-            f"recall_metric={recall_metric!r} requires an embedder; pass embed_fn"
-        )
+        raise ValueError(f"recall_metric={recall_metric!r} requires an embedder; pass embed_fn")
 
     if embed_fn is not None:
-        semantic = matcher.semantic_hits(
-            candidates, registry.items, evidence_index, embed_fn, tau, tau_nc=tau_nc
-        )
+        semantic = matcher.semantic_hits(candidates, registry.items, evidence_index, embed_fn, tau, tau_nc=tau_nc)
         # dependency_graph relation items have no embedding candidates (§5.3b uses
         # the endpoint matcher, not per-candidate text embeddings); mirror the
         # lexical result so semantic/hybrid never under-credits them.
@@ -424,8 +416,7 @@ def g2_recall_precision(
     finding_count = len(findings)
     finding_scoped_items = [i for i in registry.real_items if i.scope == "finding"]
     findings_matched = sum(
-        1 for f in findings
-        if any(matches(f, item, evidence_index, scope="finding") for item in finding_scoped_items)
+        1 for f in findings if any(matches(f, item, evidence_index, scope="finding") for item in finding_scoped_items)
     )
     _sn = {
         "finding_count": finding_count,
@@ -493,9 +484,7 @@ def _semantic_details() -> dict:
             "lexical_recall": round(_weighted_recall(scored_items, lexical), 4),
             "semantic_recall": round(_weighted_recall(scored_items, semantic), 4),
             "hybrid_recall": round(
-                _weighted_recall(
-                    scored_items, {iid: lexical[iid] or semantic[iid] for iid in lexical}
-                ),
+                _weighted_recall(scored_items, {iid: lexical[iid] or semantic[iid] for iid in lexical}),
                 4,
             ),
             "tau": tau,
@@ -577,8 +566,8 @@ def g3_grounded(
 
     grounded_ids: list[str] = []
     # Ungrounded split (§6.3): distinguish format issues from real faithfulness failures.
-    ungrounded_empty_only: list[str] = []    # every ref had an empty excerpt
-    ungrounded_populated: list[str] = []     # had populated excerpt(s) but none anchored
+    ungrounded_empty_only: list[str] = []  # every ref had an empty excerpt
+    ungrounded_populated: list[str] = []  # had populated excerpt(s) but none anchored
 
     # Excerpt fill: count all resolved refs and how many carry a non-empty excerpt.
     total_refs = 0
@@ -657,18 +646,14 @@ def g3_grounded(
             "Populated excerpt(s) not found in the cited corpus document — "
             "the run asserts evidence the source does not contain."
         )
-        return GateResult(
-            gate="G3", passed=False, reason_code="EVIDENCE_FABRICATED", details=details
-        )
+        return GateResult(gate="G3", passed=False, reason_code="EVIDENCE_FABRICATED", details=details)
 
     if unknown_source_ids:
         details["message"] = (
             "Evidence locator(s) resolve to no corpus document — either the "
             "corpus bundle is incomplete or the run invented a source."
         )
-        return GateResult(
-            gate="G3", passed=False, reason_code="EVIDENCE_SOURCE_UNKNOWN", details=details
-        )
+        return GateResult(gate="G3", passed=False, reason_code="EVIDENCE_SOURCE_UNKNOWN", details=details)
 
     if grounding_pct < grounding_floor:
         details["floor"] = grounding_floor
@@ -746,8 +731,7 @@ def g5_no_regression(
         band = noise.get(metric, 0.0)
         if delta < -band:
             regressions.append(
-                f"{metric}: candidate={cand_val:.4f} champion={champ_val:.4f} "
-                f"delta={delta:+.4f} < -band={-band:.4f}"
+                f"{metric}: candidate={cand_val:.4f} champion={champ_val:.4f} delta={delta:+.4f} < -band={-band:.4f}"
             )
         elif delta > band:
             improvements.append(f"{metric}: delta={delta:+.4f} > band={band:.4f}")
diff --git a/fireflyframework_agentic/evaluation/judge.py b/fireflyframework_agentic/evaluation/judge.py
index a347c8e1..80a90b04 100644
--- a/fireflyframework_agentic/evaluation/judge.py
+++ b/fireflyframework_agentic/evaluation/judge.py
@@ -142,10 +142,7 @@ def _map_chat(chat_fn, prompts, workers=1):
 
     results: list[dict] = [{} for _ in prompts]
     with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
-        futures = {
-            executor.submit(chat_fn, system, user): idx
-            for idx, (system, user) in enumerate(prompts)
-        }
+        futures = {executor.submit(chat_fn, system, user): idx for idx, (system, user) in enumerate(prompts)}
         for future in concurrent.futures.as_completed(futures):
             idx = futures[future]
             try:
@@ -165,11 +162,7 @@ def source_coverage(result: dict) -> dict:
     source stems present in evidence_index but cited by no finding.
     """
     evidence_index = _evidence_index(result)
-    all_stems = {
-        source_stem(ev.get("locator", ""))
-        for ev in result.get("evidence_index", [])
-        if ev.get("locator")
-    }
+    all_stems = {source_stem(ev.get("locator", "")) for ev in result.get("evidence_index", []) if ev.get("locator")}
     cited_stems: set[str] = set()
     for f in result.get("findings", []):
         for ref in f.get("evidence_refs", []):
@@ -245,7 +238,7 @@ def semantic_recovery(
     cand_vecs = np.asarray(embed_fn(candidate_texts), dtype=np.float64)
 
     recovered: list[dict] = []
-    for item, ivec in zip(missed_items, item_vecs):
+    for item, ivec in zip(missed_items, item_vecs, strict=False):
         best = max((cosine(ivec, cvec) for cvec in cand_vecs), default=0.0)
         if best >= tau:
             recovered.append({"id": item.id, "cosine": round(best, 4)})
@@ -307,11 +300,7 @@ def numeric_temporal_fidelity(result: dict, chat_fn, *, workers: int = 1) -> dic
     source}], count}.
     """
     evidence_index = _evidence_index(result)
-    scored = [
-        (f, excerpts)
-        for f in result.get("findings", [])
-        if (excerpts := _cited_excerpts(f, evidence_index))
-    ]
+    scored = [(f, excerpts) for f in result.get("findings", []) if (excerpts := _cited_excerpts(f, evidence_index))]
     prompts = [
         (
             SYSTEM,
@@ -326,7 +315,7 @@ def numeric_temporal_fidelity(result: dict, chat_fn, *, workers: int = 1) -> dic
     ]
     answers = _map_chat(chat_fn, prompts, workers)
     mismatches: list[dict] = []
-    for (f, _excerpts), answer in zip(scored, answers):
+    for (f, _excerpts), answer in zip(scored, answers, strict=False):
         for m in answer.get("mismatches", []) or []:
             mismatches.append(
                 {
@@ -395,7 +384,7 @@ def nc_semantic_precision(result: dict, registry, chat_fn, *, workers: int = 1)
     ]
     answers = _map_chat(chat_fn, prompts, workers)
     asserted_ids = [
-        item.id for item, a in zip(nc_items, answers) if str(a.get("asserted", "")).lower() == "yes"
+        item.id for item, a in zip(nc_items, answers, strict=False) if str(a.get("asserted", "")).lower() == "yes"
     ]
     return {"asserted": len(asserted_ids), "total": len(nc_items), "asserted_ids": asserted_ids}
 
@@ -407,10 +396,7 @@ def fabricated_entity(result: dict, chat_fn) -> dict:
     excerpts + locators.
     """
     output_text = _output_text(result)
-    corpus = "\n".join(
-        f"{ev.get('locator', '')} :: {ev.get('excerpt', '')}"
-        for ev in result.get("evidence_index", [])
-    )
+    corpus = "\n".join(f"{ev.get('locator', '')} :: {ev.get('excerpt', '')}" for ev in result.get("evidence_index", []))
     user = (
         "List any system, organization, or metric NAMED in the OUTPUT that does NOT "
         "appear anywhere in the CORPUS EVIDENCE.\n"
@@ -433,8 +419,7 @@ def contradiction(result: dict, chat_fn) -> dict:
         lines.append(f"{f.get('id', '?')}: {f.get('title', '')} — {f.get('description', '')}")
     user = (
         "Are any two of these FINDINGS mutually contradictory? List each contradicting pair.\n"
-        'Reply with ONLY {"pairs": [["<id_a>", "<id_b>"], ...]}.  Empty list if none.\n\n'
-        + "\n".join(lines)
+        'Reply with ONLY {"pairs": [["<id_a>", "<id_b>"], ...]}.  Empty list if none.\n\n' + "\n".join(lines)
     )
     pairs = chat_fn(SYSTEM, user).get("pairs", []) or []
     return {"count": len(pairs), "pairs": [list(p) for p in pairs]}
@@ -514,7 +499,7 @@ def severity_calibration(result: dict, chat_fn, *, workers: int = 1) -> dict:
     answers = _map_chat(chat_fn, prompts, workers)
     verdicts: dict[str, str] = {}
     miscalibrated = 0
-    for f, a in zip(findings, answers):
+    for f, a in zip(findings, answers, strict=False):
         verdict = str(a.get("calibration", "calibrated")).lower()
         verdicts[f.get("id", "?")] = verdict
         if verdict in ("under", "over"):
@@ -557,7 +542,7 @@ def surface_deduplication(result: dict, chat_fn, *, workers: int = 1) -> dict:
     def _toks(node: dict) -> frozenset[str]:
         return frozenset(node.get("name", "").lower().split())
 
-    PER_SURFACE_CAP = 10
+    per_surface_cap = 10
     # candidates: (surface, node_a, node_b, parent_process_name)
     candidates: list[tuple[str, dict, dict, str]] = []
 
@@ -574,7 +559,7 @@ def _toks(node: dict) -> frozenset[str]:
                 if jac >= 0.30:
                     pairs.append((jac, procs[i], procs[j]))
         pairs.sort(key=lambda x: x[0], reverse=True)
-        for _jac, a, b in pairs[:PER_SURFACE_CAP]:
+        for _jac, a, b in pairs[:per_surface_cap]:
             candidates.append(("process", a, b, ""))
 
     # Activities and decisions: within the same parent process only
@@ -595,7 +580,7 @@ def _toks(node: dict) -> frozenset[str]:
                     if jac >= 0.30:
                         all_pairs.append((jac, nodes[i], nodes[j], proc_name))
         all_pairs.sort(key=lambda x: x[0], reverse=True)
-        for _jac, a, b, proc_name in all_pairs[:PER_SURFACE_CAP]:
+        for _jac, a, b, proc_name in all_pairs[:per_surface_cap]:
             candidates.append((surface_key, a, b, proc_name))
 
     if not candidates:
@@ -604,33 +589,37 @@ def _toks(node: dict) -> frozenset[str]:
     prompts = []
     for surface, a, b, parent_proc in candidates:
         ctx = f"\nPARENT PROCESS: {parent_proc}\n" if parent_proc else ""
-        prompts.append((
-            SYSTEM,
-            f"Are these two {surface} nodes genuinely DISTINCT process concepts, or is one a "
-            f"duplicate / sub-case / restatement of the other?\n"
-            f"{ctx}"
-            'Reply with ONLY {"verdict": "DISTINCT" or "DUPLICATE", "reason": "<one line>"}.\n\n'
-            f"{surface.upper()} A: {a.get('name', '')} — {a.get('description', '')}\n"
-            f"{surface.upper()} B: {b.get('name', '')} — {b.get('description', '')}",
-        ))
+        prompts.append(
+            (
+                SYSTEM,
+                f"Are these two {surface} nodes genuinely DISTINCT process concepts, or is one a "
+                f"duplicate / sub-case / restatement of the other?\n"
+                f"{ctx}"
+                'Reply with ONLY {"verdict": "DISTINCT" or "DUPLICATE", "reason": "<one line>"}.\n\n'
+                f"{surface.upper()} A: {a.get('name', '')} — {a.get('description', '')}\n"
+                f"{surface.upper()} B: {b.get('name', '')} — {b.get('description', '')}",
+            )
+        )
 
     answers = _map_chat(chat_fn, prompts, workers)
 
     distinct = 0
     redundant = 0
     redundant_pairs: list[dict] = []
-    for (surface, a, b, _parent), answer in zip(candidates, answers):
+    for (surface, a, b, _parent), answer in zip(candidates, answers, strict=False):
         verdict = str(answer.get("verdict", "")).upper()
         if verdict == "DISTINCT":
             distinct += 1
         else:
             redundant += 1
-            redundant_pairs.append({
-                "surface": surface,
-                "a": a.get("name", ""),
-                "b": b.get("name", ""),
-                "reason": str(answer.get("reason", "")),
-            })
+            redundant_pairs.append(
+                {
+                    "surface": surface,
+                    "a": a.get("name", ""),
+                    "b": b.get("name", ""),
+                    "reason": str(answer.get("reason", "")),
+                }
+            )
 
     total = distinct + redundant
     return {
@@ -800,9 +789,7 @@ def _run_judge_metric(name: str, fn) -> None:
         "numeric_temporal_fidelity",
         lambda: numeric_temporal_fidelity(result, chat_fn, workers=concurrency),
     )
-    _run_judge_metric(
-        "citation_relevance", lambda: citation_relevance(result, chat_fn, workers=concurrency)
-    )
+    _run_judge_metric("citation_relevance", lambda: citation_relevance(result, chat_fn, workers=concurrency))
     _run_judge_metric(
         "nc_semantic_precision",
         lambda: nc_semantic_precision(result, registry, chat_fn, workers=concurrency),
diff --git a/fireflyframework_agentic/evaluation/judge_client.py b/fireflyframework_agentic/evaluation/judge_client.py
index 1af17f53..e4b58dea 100644
--- a/fireflyframework_agentic/evaluation/judge_client.py
+++ b/fireflyframework_agentic/evaluation/judge_client.py
@@ -245,8 +245,7 @@ def _dispatch(self, system: str, user: str, max_tokens: int) -> str:
         if self.provider == "ollama":
             return self._ollama(system, user, max_tokens)
         raise ValueError(
-            f"unknown judge provider {self.provider!r} in {self.model_spec!r}; "
-            "use anthropic:/openai:/azure:/ollama:"
+            f"unknown judge provider {self.provider!r} in {self.model_spec!r}; use anthropic:/openai:/azure:/ollama:"
         )
 
     def _anthropic(self, system: str, user: str, max_tokens: int) -> str:
@@ -262,9 +261,7 @@ def _anthropic(self, system: str, user: str, max_tokens: int) -> str:
         }
         headers = {"x-api-key": api_key, "anthropic-version": "2023-06-01"}
         resp = _http_post_json("https://api.anthropic.com/v1/messages", headers, body, self.timeout)
-        text = next(
-            (b.get("text") for b in resp.get("content", []) if b.get("type") == "text"), None
-        )
+        text = next((b.get("text") for b in resp.get("content", []) if b.get("type") == "text"), None)
         if not text:
             raise RuntimeError(f"judge returned no text: {resp}")
         return text
@@ -283,9 +280,7 @@ def _openai(self, system: str, user: str, max_tokens: int) -> str:
             ],
         }
         headers = {"Authorization": f"Bearer {api_key}"}
-        resp = _http_post_json(
-            "https://api.openai.com/v1/chat/completions", headers, body, self.timeout
-        )
+        resp = _http_post_json("https://api.openai.com/v1/chat/completions", headers, body, self.timeout)
         return _extract_openai_text(resp)
 
     def _azure(self, system: str, user: str, max_tokens: int) -> str:
@@ -297,10 +292,7 @@ def _azure(self, system: str, user: str, max_tokens: int) -> str:
             raise RuntimeError("AZURE_OPENAI_API_KEY not set")
         api_version = _env("AZURE_OPENAI_API_VERSION") or "2024-06-01"
         # Azure deployment lives in the URL path, not the JSON body.
-        url = (
-            f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/chat/completions"
-            f"?api-version={api_version}"
-        )
+        url = f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/chat/completions?api-version={api_version}"
         body = {
             "max_tokens": max_tokens,
             "temperature": 0.0,
@@ -373,10 +365,7 @@ def embed(self, texts: list[str]) -> np.ndarray:
         if not api_key:
             raise RuntimeError("AZURE_OPENAI_API_KEY not set")
         api_version = _env("AZURE_OPENAI_API_VERSION") or "2024-06-01"
-        url = (
-            f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/embeddings"
-            f"?api-version={api_version}"
-        )
+        url = f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/embeddings?api-version={api_version}"
         headers = {"api-key": api_key}
         vectors = self._embed_with_split(texts, url, headers)
         return np.asarray(vectors, dtype=np.float32)
@@ -438,9 +427,7 @@ def build_embedder(spec: str):
         return OpenAIEmbedder(model or "text-embedding-3-small").embed
     if provider == "azure":
         return AzureOpenAIEmbedder(model or "text-embedding-3-small").embed
-    raise NotImplementedError(
-        f"embedder backend {provider!r} not implemented yet; add it in build_embedder()"
-    )
+    raise NotImplementedError(f"embedder backend {provider!r} not implemented yet; add it in build_embedder()")
 
 
 def cosine(a, b) -> float:
diff --git a/fireflyframework_agentic/evaluation/matcher.py b/fireflyframework_agentic/evaluation/matcher.py
index b4d81f44..ccf61c96 100644
--- a/fireflyframework_agentic/evaluation/matcher.py
+++ b/fireflyframework_agentic/evaluation/matcher.py
@@ -113,9 +113,7 @@ def _keyword_anchored(desc: str, keywords: list[str]) -> bool:
     if not keywords:
         return False
     desc_lower = desc.lower()
-    return any(
-        re.search(r"\b" + re.escape(kw.lower()) + r"\b", desc_lower) for kw in keywords
-    )
+    return any(re.search(r"\b" + re.escape(kw.lower()) + r"\b", desc_lower) for kw in keywords)
 
 
 def candidate_text(candidate: dict, scope: str) -> str:
@@ -141,18 +139,28 @@ def candidate_text(candidate: dict, scope: str) -> str:
         pain = candidate.get("pain_points") or []
         goals_str = " ".join(goals) if isinstance(goals, list) else str(goals)
         pain_str = " ".join(pain) if isinstance(pain, list) else str(pain)
-        return " ".join(filter(None, [
-            candidate.get("name", ""),
-            candidate.get("role", ""),
-            goals_str,
-            pain_str,
-        ]))
+        return " ".join(
+            filter(
+                None,
+                [
+                    candidate.get("name", ""),
+                    candidate.get("role", ""),
+                    goals_str,
+                    pain_str,
+                ],
+            )
+        )
     if scope == "informal_channel":
-        return " ".join(filter(None, [
-            candidate.get("name", ""),
-            candidate.get("usage_context", ""),
-            candidate.get("notes", ""),
-        ]))
+        return " ".join(
+            filter(
+                None,
+                [
+                    candidate.get("name", ""),
+                    candidate.get("usage_context", ""),
+                    candidate.get("notes", ""),
+                ],
+            )
+        )
     # process, decision, system, dependency_graph (diagnostic nodes)
     return " ".join(filter(None, [candidate.get("name", ""), candidate.get("description", "")]))
 
@@ -246,9 +254,7 @@ def matches_dependency_graph_relation(
 
     def _anchor(endpoint_text: str) -> set[str]:
         return {
-            a["id"]
-            for a in all_activities
-            if a.get("id") and anchored(candidate_text(a, "activity"), endpoint_text)
+            a["id"] for a in all_activities if a.get("id") and anchored(candidate_text(a, "activity"), endpoint_text)
         }
 
     from_ids = _anchor(item.from_node)
@@ -268,9 +274,8 @@ def _node_stems(node: dict) -> set[str]:
     dg = result.get("dependency_graph", {})
 
     for edge in dg.get("activity_edges", []):
-        if edge.get("from_node") in from_ids and edge.get("to_node") in to_ids:
-            if _node_stems(edge) & item_stems:
-                return True
+        if edge.get("from_node") in from_ids and edge.get("to_node") in to_ids and _node_stems(edge) & item_stems:
+            return True
 
     for path in dg.get("critical_paths", []):
         if not (_node_stems(path) & item_stems):
@@ -325,19 +330,13 @@ def semantic_hits(
 
     # Flatten all candidates across scopes, preserving their scope tag for
     # text extraction and per-item filtering.
-    scoped: list[tuple[str, dict]] = [
-        (scope, cand)
-        for scope, cands in candidates.items()
-        for cand in cands
-    ]
+    scoped: list[tuple[str, dict]] = [(scope, cand) for scope, cands in candidates.items() for cand in cands]
 
     if not scoped:
         return {item.id: False for item in items}
 
     cand_texts = [candidate_text(cand, scope) for scope, cand in scoped]
-    item_texts = [
-        " ".join([item.description or ""] + list(item.keywords or [])).strip() for item in items
-    ]
+    item_texts = [" ".join([item.description or ""] + list(item.keywords or [])).strip() for item in items]
 
     cand_vecs = np.asarray(embed_fn(cand_texts))
     item_vecs = np.asarray(embed_fn(item_texts))
@@ -359,10 +358,7 @@ def semantic_hits(
                 if cosine(cand_vecs[k], item_vec) >= tau_nc:
                     hit = True
                     break
-            elif (
-                shares_source(cand, item, evidence_index)
-                and cosine(cand_vecs[k], item_vec) >= tau
-            ):
+            elif shares_source(cand, item, evidence_index) and cosine(cand_vecs[k], item_vec) >= tau:
                 hit = True
                 break
         hits[item.id] = hit
diff --git a/fireflyframework_agentic/evaluation/registry.py b/fireflyframework_agentic/evaluation/registry.py
index 2b869ba9..87c4beb1 100644
--- a/fireflyframework_agentic/evaluation/registry.py
+++ b/fireflyframework_agentic/evaluation/registry.py
@@ -24,6 +24,7 @@
 - kappa present (0.0 placeholder allowed; G2 advisory until >= 0.70)
 - ABANCA DILO items must target a single measured sub-population
 """
+
 from __future__ import annotations
 
 import hashlib
@@ -35,8 +36,15 @@
 
 VALID_TIERS = ("L0", "L1", "L2", "L3", "NC")
 VALID_SCOPES = (
-    "process", "activity", "decision", "finding", "action",
-    "persona", "system", "informal_channel", "dependency_graph",
+    "process",
+    "activity",
+    "decision",
+    "finding",
+    "action",
+    "persona",
+    "system",
+    "informal_channel",
+    "dependency_graph",
 )
 SCHEMA_VERSION = "lean-1"
 KAPPA_ADVISORY_THRESHOLD = 0.70
@@ -47,13 +55,13 @@ class RegistryItem:
     id: str
     tier: Literal["L0", "L1", "L2", "L3", "NC"]
     description: str
-    evidence: list[str]          # source file paths (path portion of locator, no #page=N)
-    scope: str = "finding"       # which DiscoveryResult surface to match against (§4.3)
+    evidence: list[str]  # source file paths (path portion of locator, no #page=N)
+    scope: str = "finding"  # which DiscoveryResult surface to match against (§4.3)
     keywords: list[str] = field(default_factory=list)
     weight: float = 1.0
-    from_node: str = ""   # dependency_graph relation items only
-    to_node: str = ""     # dependency_graph relation items only
-    relation: str = ""    # defaults to "precedes" when from/to present
+    from_node: str = ""  # dependency_graph relation items only
+    to_node: str = ""  # dependency_graph relation items only
+    relation: str = ""  # defaults to "precedes" when from/to present
 
 
 @dataclass(frozen=True)
@@ -87,10 +95,7 @@ def sha256(self) -> str:
 
 def _validate(raw: dict, path: Path) -> None:
     if raw.get("schema_version") != SCHEMA_VERSION:
-        raise ValueError(
-            f"{path.name}: schema_version must be '{SCHEMA_VERSION}', "
-            f"got {raw.get('schema_version')!r}"
-        )
+        raise ValueError(f"{path.name}: schema_version must be '{SCHEMA_VERSION}', got {raw.get('schema_version')!r}")
     for fname in ("corpus", "author", "date"):
         if not raw.get(fname):
             raise ValueError(f"{path.name}: missing required field '{fname}'")
@@ -116,20 +121,17 @@ def _validate(raw: dict, path: Path) -> None:
         tier = it.get("tier")
         if tier not in VALID_TIERS:
             raise ValueError(
-                f"{path.name}: item '{it.get('id')}' has invalid tier '{tier}'; "
-                f"must be one of {VALID_TIERS}"
+                f"{path.name}: item '{it.get('id')}' has invalid tier '{tier}'; must be one of {VALID_TIERS}"
             )
         scope = it.get("scope", "finding")
         if scope not in VALID_SCOPES:
             raise ValueError(
-                f"{path.name}: item '{it.get('id')}' has invalid scope '{scope}'; "
-                f"must be one of {VALID_SCOPES}"
+                f"{path.name}: item '{it.get('id')}' has invalid scope '{scope}'; must be one of {VALID_SCOPES}"
             )
         if scope == "dependency_graph":
             if not it.get("from") or not it.get("to"):
                 raise ValueError(
-                    f"{path.name}: dependency_graph item '{it.get('id')}' must have "
-                    "non-empty 'from' and 'to'"
+                    f"{path.name}: dependency_graph item '{it.get('id')}' must have non-empty 'from' and 'to'"
                 )
         else:
             if "from" in it or "to" in it or "relation" in it:
@@ -153,13 +155,13 @@ def _validate(raw: dict, path: Path) -> None:
     # ABANCA DILO blend guard: items must assert a single sub-population target.
     # Checks for phrases that would indicate a blended numeric target is asserted.
     # "blend" alone is too broad (items may reference it negatively).
-    BLEND_PHRASES = ("combined distribution", "across all offices regardless of segment")
+    blend_phrases = ("combined distribution", "across all offices regardless of segment")
     for it in items:
         if it.get("tier") == "NC":
             continue
         desc = it.get("description", "").lower()
         iid = it.get("id", "")
-        if any(phrase in desc for phrase in BLEND_PHRASES):
+        if any(phrase in desc for phrase in blend_phrases):
             raise ValueError(
                 f"{path.name}: item '{iid}' description targets a blended distribution; "
                 "ABANCA DILO items must target a single measured sub-population "
diff --git a/fireflyframework_agentic/evaluation/run_config_snapshot.py b/fireflyframework_agentic/evaluation/run_config_snapshot.py
index db543129..c029e8e6 100644
--- a/fireflyframework_agentic/evaluation/run_config_snapshot.py
+++ b/fireflyframework_agentic/evaluation/run_config_snapshot.py
@@ -32,6 +32,7 @@
         --options    request_options.json \
         --commit     c107918
 """
+
 from __future__ import annotations
 
 import argparse
@@ -133,12 +134,8 @@ def write_snapshot(output_dir: str | Path, config: dict) -> Path:
 def main(argv: list[str] | None = None) -> int:
     parser = argparse.ArgumentParser(description="Capture the flyradar run configuration.")
     parser.add_argument("--output-dir", required=True, help="Run directory holding output.json.")
-    parser.add_argument(
-        "--options", required=True, help="JSON file of the DiscoveryRequest options that were sent."
-    )
-    parser.add_argument(
-        "--base-url", default=None, help="flyradar base URL (default: $FLYRADAR_BASE_URL)."
-    )
+    parser.add_argument("--options", required=True, help="JSON file of the DiscoveryRequest options that were sent.")
+    parser.add_argument("--base-url", default=None, help="flyradar base URL (default: $FLYRADAR_BASE_URL).")
     parser.add_argument("--commit", default=None, help="Deployed flyradar git commit, if known.")
     args = parser.parse_args(argv)
 
diff --git a/fireflyframework_agentic/evaluation/scorecard.py b/fireflyframework_agentic/evaluation/scorecard.py
index b34885e8..da3c4a87 100644
--- a/fireflyframework_agentic/evaluation/scorecard.py
+++ b/fireflyframework_agentic/evaluation/scorecard.py
@@ -188,13 +188,9 @@ def _render_advisory(report) -> list[str]:
         d = m["faithfulness"]
         u = d.get("unsupported_ids", [])
         extra = f"   (unsupported: {', '.join(u)})" if u else ""
-        lines.append(
-            f"Faithfulness (entailment):       {d.get('supported')}/{d.get('total')} supported{extra}"
-        )
+        lines.append(f"Faithfulness (entailment):       {d.get('supported')}/{d.get('total')} supported{extra}")
     if "numeric_temporal_fidelity" in m:
-        lines.append(
-            f"Numeric/temporal fidelity:       {m['numeric_temporal_fidelity'].get('count', 0)} mismatch(es)"
-        )
+        lines.append(f"Numeric/temporal fidelity:       {m['numeric_temporal_fidelity'].get('count', 0)} mismatch(es)")
     if "citation_relevance" in m:
         d = m["citation_relevance"]
         lines.append(
@@ -218,14 +214,10 @@ def _render_advisory(report) -> list[str]:
         lines.append(f"Contradiction detection:         {m['contradiction'].get('count', 0)}")
     if "actionability" in m:
         d = m["actionability"]
-        lines.append(
-            f"Actionability:                   {_num(d.get('score'))}   (rated {d.get('rated', 0)})"
-        )
+        lines.append(f"Actionability:                   {_num(d.get('score'))}   (rated {d.get('rated', 0)})")
     if "severity_calibration" in m:
         d = m["severity_calibration"]
-        lines.append(
-            f"Severity calibration:            {d.get('miscalibrated', 0)}/{d.get('total', 0)} miscalibrated"
-        )
+        lines.append(f"Severity calibration:            {d.get('miscalibrated', 0)}/{d.get('total', 0)} miscalibrated")
     if "answer_relevancy" in m:
         lines.append(f"Answer relevancy:                {_num(m['answer_relevancy'].get('score'))}")
     if "comparative_vs_champion" in m:
@@ -236,14 +228,10 @@ def _render_advisory(report) -> list[str]:
         d = m["source_coverage"]
         o = d.get("orphaned", [])
         extra = f"   (orphaned: {', '.join(o)})" if o else ""
-        lines.append(
-            f"Source coverage [D]:             {d.get('cited')}/{d.get('total')} documents cited{extra}"
-        )
+        lines.append(f"Source coverage [D]:             {d.get('cited')}/{d.get('total')} documents cited{extra}")
     if "excerpt_fill_rate" in m:
         d = m["excerpt_fill_rate"]
-        lines.append(
-            f"Evidence-excerpt fill [D]:       {d.get('populated')}/{d.get('total')} populated"
-        )
+        lines.append(f"Evidence-excerpt fill [D]:       {d.get('populated')}/{d.get('total')} populated")
     if "open_gap" in m:
         gap = (m["open_gap"].get("gap") or "").strip()
         if gap:
@@ -259,9 +247,7 @@ def _render_advisory(report) -> list[str]:
         json.dumps({"metrics": report.metrics, "details": report.details}, indent=2, default=str),
         "```",
     ]
-    lines.append(
-        "> Decision support for the G5 human sign-off; advisory until LLM-as-a-Judge calibration (§10)."
-    )
+    lines.append("> Decision support for the G5 human sign-off; advisory until LLM-as-a-Judge calibration (§10).")
     lines.append("")
     return lines
 
@@ -284,9 +270,7 @@ def _render_analysis(gate_results: list, advisory=None) -> list[str]:
         matched = d.get("findings_matched_to_registry", {}).get("fraction", 0.0)
 
         tier_summary = ", ".join(
-            f"{t} {v['hit']}/{v['total']}"
-            for t, v in tiers.items()
-            if "hit" in v and "total" in v
+            f"{t} {v['hit']}/{v['total']}" for t, v in tiers.items() if "hit" in v and "total" in v
         )
         lines.append(
             f"Lexical recall is **{recall:.3f}** ({tier_summary}). "
@@ -300,9 +284,7 @@ def _render_analysis(gate_results: list, advisory=None) -> list[str]:
                 "The run is covering the same ground multiple times rather than broadening coverage."
             )
         else:
-            lines.append(
-                f"Finding redundancy is low ({redundancy:.0%}): each finding addresses a distinct topic."
-            )
+            lines.append(f"Finding redundancy is low ({redundancy:.0%}): each finding addresses a distinct topic.")
         lines.append(
             "_G2 is a topic-level test. A recall of 1.000 means every required topic was "
             "mentioned somewhere — it does not verify that the specific claims about those "
@@ -453,14 +435,10 @@ def _render_analysis(gate_results: list, advisory=None) -> list[str]:
     flag_names = [g.gate for g in flags]
 
     if not flags:
-        lines.append(
-            "All deterministic gates pass. The run is ready for G5 human sign-off."
-        )
+        lines.append("All deterministic gates pass. The run is ready for G5 human sign-off.")
     else:
         flag_str = ", ".join(flag_names)
-        lines.append(
-            f"The run is at **HOLD** due to flags on: {flag_str}. "
-        )
+        lines.append(f"The run is at **HOLD** due to flags on: {flag_str}. ")
         for g in flags:
             if g.gate == "G3" and g.reason_code == "EVIDENCE_SOURCE_UNKNOWN":
                 lines.append(
diff --git a/fireflyframework_agentic/evaluation/stats.py b/fireflyframework_agentic/evaluation/stats.py
index e70c629a..c622588c 100644
--- a/fireflyframework_agentic/evaluation/stats.py
+++ b/fireflyframework_agentic/evaluation/stats.py
@@ -23,10 +23,11 @@
 aggregation bug where the previous runner inherited run 0's grounding report
 unchanged instead of merging across all runs.
 """
+
 from __future__ import annotations
 
 import statistics
-from typing import Sequence
+from collections.abc import Sequence
 
 
 def aa_band(scores: Sequence[float], *, percentile: int = 95) -> float:
@@ -49,11 +50,7 @@ def aa_band(scores: Sequence[float], *, percentile: int = 95) -> float:
     scores = list(scores)
     if len(scores) < 2:
         raise ValueError(f"aa_band requires >= 2 reruns; got {len(scores)}")
-    deltas = [
-        abs(x - y)
-        for i, x in enumerate(scores)
-        for y in scores[i + 1:]
-    ]
+    deltas = [abs(x - y) for i, x in enumerate(scores) for y in scores[i + 1 :]]
     sorted_deltas = sorted(deltas)
     # Index for the requested percentile; clamp to valid range
     idx = max(0, min(len(sorted_deltas) - 1, int(len(sorted_deltas) * percentile / 100)))
diff --git a/uv.lock b/uv.lock
index 7e3b501c..93e18075 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1209,6 +1209,10 @@ dev = [
 embeddings = [
     { name = "numpy" },
 ]
+evaluation = [
+    { name = "numpy" },
+    { name = "scipy" },
+]
 google-embeddings = [
     { name = "google-generativeai" },
 ]
@@ -1279,6 +1283,7 @@ requires-dist = [
     { name = "mistralai", marker = "extra == 'mistral-embeddings'", specifier = ">=1.0.0" },
     { name = "motor", marker = "extra == 'mongodb'", specifier = ">=3.6.0" },
     { name = "numpy", marker = "extra == 'embeddings'", specifier = ">=1.26.0" },
+    { name = "numpy", marker = "extra == 'evaluation'", specifier = ">=1.26.0" },
     { name = "numpy", marker = "extra == 'reasoning-eval'", specifier = ">=2.0.0" },
     { name = "openai", marker = "extra == 'azure-embeddings'", specifier = ">=1.0.0" },
     { name = "openai", marker = "extra == 'openai-embeddings'", specifier = ">=1.0.0" },
@@ -1304,13 +1309,14 @@ requires-dist = [
     { name = "python-dotenv", specifier = ">=1.0.0" },
     { name = "qdrant-client", marker = "extra == 'vectorstores-qdrant'", specifier = ">=1.12.0" },
     { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.9.0" },
+    { name = "scipy", marker = "extra == 'evaluation'", specifier = ">=1.11" },
     { name = "sqlalchemy", marker = "extra == 'postgres'", specifier = ">=2.0.0" },
     { name = "sqlite-vec", marker = "extra == 'vectorstores-sqlite-vec'", specifier = ">=0.1.6" },
     { name = "testcontainers", marker = "extra == 'dev'", specifier = ">=4.10.0" },
     { name = "voyageai", marker = "extra == 'voyage-embeddings'", specifier = ">=0.3.0" },
     { name = "watchfiles", marker = "extra == 'watch'", specifier = ">=0.24.0" },
 ]
-provides-extras = ["postgres", "mongodb", "security", "embeddings", "openai-embeddings", "cohere-embeddings", "google-embeddings", "mistral-embeddings", "voyage-embeddings", "azure-embeddings", "bedrock-embeddings", "ollama-embeddings", "reasoning-eval", "vectorstores-chroma", "vectorstores-sqlite-vec", "vectorstores-pinecone", "vectorstores-qdrant", "vectorstores-pgvector", "watch", "binary", "all", "dev"]
+provides-extras = ["postgres", "mongodb", "security", "embeddings", "openai-embeddings", "cohere-embeddings", "google-embeddings", "mistral-embeddings", "voyage-embeddings", "azure-embeddings", "bedrock-embeddings", "ollama-embeddings", "reasoning-eval", "vectorstores-chroma", "vectorstores-sqlite-vec", "vectorstores-pinecone", "vectorstores-qdrant", "vectorstores-pgvector", "watch", "binary", "all", "evaluation", "dev"]
 
 [[package]]
 name = "flatbuffers"
@@ -4489,6 +4495,57 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/87/72/c6c32d2b657fa3dad1de340254e14390b1e334ce38268b7ad51abda3c8c2/s3transfer-0.17.0-py3-none-any.whl", hash = "sha256:ce3801712acf4ad3e89fb9990df97b4972e93f4b3b0004d214be5bce12814c20", size = 86811, upload-time = "2026-04-29T22:07:34.966Z" },
 ]
 
+[[package]]
+name = "scipy"
+version = "1.17.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "numpy" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/7a/97/5a3609c4f8d58b039179648e62dd220f89864f56f7357f5d4f45c29eb2cc/scipy-1.17.1.tar.gz", hash = "sha256:95d8e012d8cb8816c226aef832200b1d45109ed4464303e997c5b13122b297c0", size = 30573822, upload-time = "2026-02-23T00:26:24.851Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/76/27/07ee1b57b65e92645f219b37148a7e7928b82e2b5dbeccecb4dff7c64f0b/scipy-1.17.1-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:5e3c5c011904115f88a39308379c17f91546f77c1667cea98739fe0fccea804c", size = 31590199, upload-time = "2026-02-23T00:19:17.192Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/ae/db19f8ab842e9b724bf5dbb7db29302a91f1e55bc4d04b1025d6d605a2c5/scipy-1.17.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:6fac755ca3d2c3edcb22f479fceaa241704111414831ddd3bc6056e18516892f", size = 28154001, upload-time = "2026-02-23T00:19:22.241Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/58/3ce96251560107b381cbd6e8413c483bbb1228a6b919fa8652b0d4090e7f/scipy-1.17.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:7ff200bf9d24f2e4d5dc6ee8c3ac64d739d3a89e2326ba68aaf6c4a2b838fd7d", size = 20325719, upload-time = "2026-02-23T00:19:26.329Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/83/15087d945e0e4d48ce2377498abf5ad171ae013232ae31d06f336e64c999/scipy-1.17.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:4b400bdc6f79fa02a4d86640310dde87a21fba0c979efff5248908c6f15fad1b", size = 22683595, upload-time = "2026-02-23T00:19:30.304Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/e0/e58fbde4a1a594c8be8114eb4aac1a55bcd6587047efc18a61eb1f5c0d30/scipy-1.17.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2b64ca7d4aee0102a97f3ba22124052b4bd2152522355073580bf4845e2550b6", size = 32896429, upload-time = "2026-02-23T00:19:35.536Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/5f/f17563f28ff03c7b6799c50d01d5d856a1d55f2676f537ca8d28c7f627cd/scipy-1.17.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:581b2264fc0aa555f3f435a5944da7504ea3a065d7029ad60e7c3d1ae09c5464", size = 35203952, upload-time = "2026-02-23T00:19:42.259Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/a5/9afd17de24f657fdfe4df9a3f1ea049b39aef7c06000c13db1530d81ccca/scipy-1.17.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:beeda3d4ae615106d7094f7e7cef6218392e4465cc95d25f900bebabfded0950", size = 34979063, upload-time = "2026-02-23T00:19:47.547Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/13/88b1d2384b424bf7c924f2038c1c409f8d88bb2a8d49d097861dd64a57b2/scipy-1.17.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6609bc224e9568f65064cfa72edc0f24ee6655b47575954ec6339534b2798369", size = 37598449, upload-time = "2026-02-23T00:19:53.238Z" },
+    { url = "https://files.pythonhosted.org/packages/35/e5/d6d0e51fc888f692a35134336866341c08655d92614f492c6860dc45bb2c/scipy-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:37425bc9175607b0268f493d79a292c39f9d001a357bebb6b88fdfaff13f6448", size = 36510943, upload-time = "2026-02-23T00:20:50.89Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/fd/3be73c564e2a01e690e19cc618811540ba5354c67c8680dce3281123fb79/scipy-1.17.1-cp313-cp313-win_arm64.whl", hash = "sha256:5cf36e801231b6a2059bf354720274b7558746f3b1a4efb43fcf557ccd484a87", size = 24545621, upload-time = "2026-02-23T00:20:55.871Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/6b/17787db8b8114933a66f9dcc479a8272e4b4da75fe03b0c282f7b0ade8cd/scipy-1.17.1-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:d59c30000a16d8edc7e64152e30220bfbd724c9bbb08368c054e24c651314f0a", size = 31936708, upload-time = "2026-02-23T00:19:58.694Z" },
+    { url = "https://files.pythonhosted.org/packages/38/2e/524405c2b6392765ab1e2b722a41d5da33dc5c7b7278184a8ad29b6cb206/scipy-1.17.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:010f4333c96c9bb1a4516269e33cb5917b08ef2166d5556ca2fd9f082a9e6ea0", size = 28570135, upload-time = "2026-02-23T00:20:03.934Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/c3/5bd7199f4ea8556c0c8e39f04ccb014ac37d1468e6cfa6a95c6b3562b76e/scipy-1.17.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:2ceb2d3e01c5f1d83c4189737a42d9cb2fc38a6eeed225e7515eef71ad301dce", size = 20741977, upload-time = "2026-02-23T00:20:07.935Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/b8/8ccd9b766ad14c78386599708eb745f6b44f08400a5fd0ade7cf89b6fc93/scipy-1.17.1-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:844e165636711ef41f80b4103ed234181646b98a53c8f05da12ca5ca289134f6", size = 23029601, upload-time = "2026-02-23T00:20:12.161Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/a0/3cb6f4d2fb3e17428ad2880333cac878909ad1a89f678527b5328b93c1d4/scipy-1.17.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:158dd96d2207e21c966063e1635b1063cd7787b627b6f07305315dd73d9c679e", size = 33019667, upload-time = "2026-02-23T00:20:17.208Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/c3/2d834a5ac7bf3a0c806ad1508efc02dda3c8c61472a56132d7894c312dea/scipy-1.17.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74cbb80d93260fe2ffa334efa24cb8f2f0f622a9b9febf8b483c0b865bfb3475", size = 35264159, upload-time = "2026-02-23T00:20:23.087Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/77/d3ed4becfdbd217c52062fafe35a72388d1bd82c2d0ba5ca19d6fcc93e11/scipy-1.17.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:dbc12c9f3d185f5c737d801da555fb74b3dcfa1a50b66a1a93e09190f41fab50", size = 35102771, upload-time = "2026-02-23T00:20:28.636Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/12/d19da97efde68ca1ee5538bb261d5d2c062f0c055575128f11a2730e3ac1/scipy-1.17.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:94055a11dfebe37c656e70317e1996dc197e1a15bbcc351bcdd4610e128fe1ca", size = 37665910, upload-time = "2026-02-23T00:20:34.743Z" },
+    { url = "https://files.pythonhosted.org/packages/06/1c/1172a88d507a4baaf72c5a09bb6c018fe2ae0ab622e5830b703a46cc9e44/scipy-1.17.1-cp313-cp313t-win_amd64.whl", hash = "sha256:e30bdeaa5deed6bc27b4cc490823cd0347d7dae09119b8803ae576ea0ce52e4c", size = 36562980, upload-time = "2026-02-23T00:20:40.575Z" },
+    { url = "https://files.pythonhosted.org/packages/70/b0/eb757336e5a76dfa7911f63252e3b7d1de00935d7705cf772db5b45ec238/scipy-1.17.1-cp313-cp313t-win_arm64.whl", hash = "sha256:a720477885a9d2411f94a93d16f9d89bad0f28ca23c3f8daa521e2dcc3f44d49", size = 24856543, upload-time = "2026-02-23T00:20:45.313Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/83/333afb452af6f0fd70414dc04f898647ee1423979ce02efa75c3b0f2c28e/scipy-1.17.1-cp314-cp314-macosx_10_14_x86_64.whl", hash = "sha256:a48a72c77a310327f6a3a920092fa2b8fd03d7deaa60f093038f22d98e096717", size = 31584510, upload-time = "2026-02-23T00:21:01.015Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/a6/d05a85fd51daeb2e4ea71d102f15b34fedca8e931af02594193ae4fd25f7/scipy-1.17.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:45abad819184f07240d8a696117a7aacd39787af9e0b719d00285549ed19a1e9", size = 28170131, upload-time = "2026-02-23T00:21:05.888Z" },
+    { url = "https://files.pythonhosted.org/packages/db/7b/8624a203326675d7746a254083a187398090a179335b2e4a20e2ddc46e83/scipy-1.17.1-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:3fd1fcdab3ea951b610dc4cef356d416d5802991e7e32b5254828d342f7b7e0b", size = 20342032, upload-time = "2026-02-23T00:21:09.904Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/35/2c342897c00775d688d8ff3987aced3426858fd89d5a0e26e020b660b301/scipy-1.17.1-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:7bdf2da170b67fdf10bca777614b1c7d96ae3ca5794fd9587dce41eb2966e866", size = 22678766, upload-time = "2026-02-23T00:21:14.313Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/f2/7cdb8eb308a1a6ae1e19f945913c82c23c0c442a462a46480ce487fdc0ac/scipy-1.17.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:adb2642e060a6549c343603a3851ba76ef0b74cc8c079a9a58121c7ec9fe2350", size = 32957007, upload-time = "2026-02-23T00:21:19.663Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/2e/7eea398450457ecb54e18e9d10110993fa65561c4f3add5e8eccd2b9cd41/scipy-1.17.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:eee2cfda04c00a857206a4330f0c5e3e56535494e30ca445eb19ec624ae75118", size = 35221333, upload-time = "2026-02-23T00:21:25.278Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/77/5b8509d03b77f093a0d52e606d3c4f79e8b06d1d38c441dacb1e26cacf46/scipy-1.17.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d2650c1fb97e184d12d8ba010493ee7b322864f7d3d00d3f9bb97d9c21de4068", size = 35042066, upload-time = "2026-02-23T00:21:31.358Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/df/18f80fb99df40b4070328d5ae5c596f2f00fffb50167e31439e932f29e7d/scipy-1.17.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:08b900519463543aa604a06bec02461558a6e1cef8fdbb8098f77a48a83c8118", size = 37612763, upload-time = "2026-02-23T00:21:37.247Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/39/f0e8ea762a764a9dc52aa7dabcfad51a354819de1f0d4652b6a1122424d6/scipy-1.17.1-cp314-cp314-win_amd64.whl", hash = "sha256:3877ac408e14da24a6196de0ddcace62092bfc12a83823e92e49e40747e52c19", size = 37290984, upload-time = "2026-02-23T00:22:35.023Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/56/fe201e3b0f93d1a8bcf75d3379affd228a63d7e2d80ab45467a74b494947/scipy-1.17.1-cp314-cp314-win_arm64.whl", hash = "sha256:f8885db0bc2bffa59d5c1b72fad7a6a92d3e80e7257f967dd81abb553a90d293", size = 25192877, upload-time = "2026-02-23T00:22:39.798Z" },
+    { url = "https://files.pythonhosted.org/packages/96/ad/f8c414e121f82e02d76f310f16db9899c4fcde36710329502a6b2a3c0392/scipy-1.17.1-cp314-cp314t-macosx_10_14_x86_64.whl", hash = "sha256:1cc682cea2ae55524432f3cdff9e9a3be743d52a7443d0cba9017c23c87ae2f6", size = 31949750, upload-time = "2026-02-23T00:21:42.289Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/b0/c741e8865d61b67c81e255f4f0a832846c064e426636cd7de84e74d209be/scipy-1.17.1-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:2040ad4d1795a0ae89bfc7e8429677f365d45aa9fd5e4587cf1ea737f927b4a1", size = 28585858, upload-time = "2026-02-23T00:21:47.706Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/1b/3985219c6177866628fa7c2595bfd23f193ceebbe472c98a08824b9466ff/scipy-1.17.1-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:131f5aaea57602008f9822e2115029b55d4b5f7c070287699fe45c661d051e39", size = 20757723, upload-time = "2026-02-23T00:21:52.039Z" },
+    { url = "https://files.pythonhosted.org/packages/c0/19/2a04aa25050d656d6f7b9e7b685cc83d6957fb101665bfd9369ca6534563/scipy-1.17.1-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:9cdc1a2fcfd5c52cfb3045feb399f7b3ce822abdde3a193a6b9a60b3cb5854ca", size = 23043098, upload-time = "2026-02-23T00:21:56.185Z" },
+    { url = "https://files.pythonhosted.org/packages/86/f1/3383beb9b5d0dbddd030335bf8a8b32d4317185efe495374f134d8be6cce/scipy-1.17.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6e3dcd57ab780c741fde8dc68619de988b966db759a3c3152e8e9142c26295ad", size = 33030397, upload-time = "2026-02-23T00:22:01.404Z" },
+    { url = "https://files.pythonhosted.org/packages/41/68/8f21e8a65a5a03f25a79165ec9d2b28c00e66dc80546cf5eb803aeeff35b/scipy-1.17.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a9956e4d4f4a301ebf6cde39850333a6b6110799d470dbbb1e25326ac447f52a", size = 35281163, upload-time = "2026-02-23T00:22:07.024Z" },
+    { url = "https://files.pythonhosted.org/packages/84/8d/c8a5e19479554007a5632ed7529e665c315ae7492b4f946b0deb39870e39/scipy-1.17.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:a4328d245944d09fd639771de275701ccadf5f781ba0ff092ad141e017eccda4", size = 35116291, upload-time = "2026-02-23T00:22:12.585Z" },
+    { url = "https://files.pythonhosted.org/packages/52/52/e57eceff0e342a1f50e274264ed47497b59e6a4e3118808ee58ddda7b74a/scipy-1.17.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a77cbd07b940d326d39a1d1b37817e2ee4d79cb30e7338f3d0cddffae70fcaa2", size = 37682317, upload-time = "2026-02-23T00:22:18.513Z" },
+    { url = "https://files.pythonhosted.org/packages/11/2f/b29eafe4a3fbc3d6de9662b36e028d5f039e72d345e05c250e121a230dd4/scipy-1.17.1-cp314-cp314t-win_amd64.whl", hash = "sha256:eb092099205ef62cd1782b006658db09e2fed75bffcae7cc0d44052d8aa0f484", size = 37345327, upload-time = "2026-02-23T00:22:24.442Z" },
+    { url = "https://files.pythonhosted.org/packages/07/39/338d9219c4e87f3e708f18857ecd24d22a0c3094752393319553096b98af/scipy-1.17.1-cp314-cp314t-win_arm64.whl", hash = "sha256:200e1050faffacc162be6a486a984a0497866ec54149a01270adc8a59b7c7d21", size = 25489165, upload-time = "2026-02-23T00:22:29.563Z" },
+]
+
 [[package]]
 name = "secretstorage"
 version = "3.5.0"