From a2d6770a2339abad096fe6670045a8e81ecdba86 Mon Sep 17 00:00:00 2001
From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com>
Date: Thu, 18 Jun 2026 23:33:03 +0200
Subject: [PATCH 01/48] feat(evaluation): add evaluation subpackage skeleton
 and pyproject entry point (#268)

* feat(evaluation): add evaluation subpackage __init__ with gate/champion/judge/retrieval exports

* feat(evaluation): add EvalConfig and GateVerdict models

* feat(evaluation): add evaluation optional-deps and flyeval CLI entry point to pyproject.toml

* feat(evaluation): note evaluation as optional subpackage in top-level __init__ docstring

---------

Co-authored-by: miguelgfierro <miguelgfierro@users.noreply.github.com>
---
 fireflyframework_agentic/__init__.py          |  7 ++
 .../evaluation/__init__.py                    | 57 +++++++++++++++
 fireflyframework_agentic/evaluation/models.py | 70 +++++++++++++++++++
 pyproject.toml                                |  7 ++
 4 files changed, 141 insertions(+)
 create mode 100644 fireflyframework_agentic/evaluation/__init__.py
 create mode 100644 fireflyframework_agentic/evaluation/models.py

diff --git a/fireflyframework_agentic/__init__.py b/fireflyframework_agentic/__init__.py
index 993b0248..1736f1f4 100644
--- a/fireflyframework_agentic/__init__.py
+++ b/fireflyframework_agentic/__init__.py
@@ -24,6 +24,13 @@
 
     config = get_config()
     print(config.default_model)
+
+Optional subpackages (not imported eagerly at the top level):
+    fireflyframework_agentic.lab          -- sessions, benchmarks, datasets, evaluation orchestration
+    fireflyframework_agentic.experiments  -- experiment tracking and comparison
+    fireflyframework_agentic.evaluation   -- gate-based quality gates, LLM-as-judge advisory,
+                                            champion/challenger tracking, retrieval metrics
+                                            (requires the ``evaluation`` optional extra)
 """
 
 from importlib.metadata import PackageNotFoundError, version
diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py
new file mode 100644
index 00000000..1c264f07
--- /dev/null
+++ b/fireflyframework_agentic/evaluation/__init__.py
@@ -0,0 +1,57 @@
+# Copyright 2026 Firefly Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Evaluation subpackage -- gate-based quality gates, LLM-as-judge advisory, champion/challenger tracking, and retrieval metrics.
+
+Gate pipeline (flags, not vetoes):
+    G1 -- Structural & Safe (schema + PII + empty-registry guard)
+    G2 -- Must-finds & negative controls (recall + NC precision)
+    G3 -- Evidence (grounding / token-anchoring)
+    G4 -- LLM-as-a-Judge (advisory, opt-in, never decides promotion)
+    G5 -- No-regression / promotion (champion/challenger comparison)
+
+Retrieval metrics:
+    Precision@k, Recall@k, MRR, NDCG -- computed over ranked retrieval results.
+
+Champion tracking:
+    Persists the best-known run record so that promotion decisions can be made
+    against a stable baseline rather than the most recent run.
+"""
+
+from importlib.metadata import PackageNotFoundError, version
+
+from fireflyframework_agentic.evaluation.gates import GateResult, Verdict, render_scorecard, run_gates
+from fireflyframework_agentic.evaluation.champion import ChampionRecord, invalidate_champion, load_champion, save_champion
+from fireflyframework_agentic.evaluation.judge import AdvisoryReport, run_judge
+from fireflyframework_agentic.evaluation.retrieval import RetrieverMetrics, compute_retrieval_metrics
+
+try:
+    __version__ = version("fireflyframework-agentic")
+except PackageNotFoundError:
+    __version__ = "0.0.0+dev"
+
+__all__ = [
+    "GateResult",
+    "Verdict",
+    "run_gates",
+    "render_scorecard",
+    "ChampionRecord",
+    "load_champion",
+    "save_champion",
+    "invalidate_champion",
+    "AdvisoryReport",
+    "run_judge",
+    "RetrieverMetrics",
+    "compute_retrieval_metrics",
+]
diff --git a/fireflyframework_agentic/evaluation/models.py b/fireflyframework_agentic/evaluation/models.py
new file mode 100644
index 00000000..a98cdf20
--- /dev/null
+++ b/fireflyframework_agentic/evaluation/models.py
@@ -0,0 +1,70 @@
+# Copyright 2026 Firefly Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Shared config and model classes for the evaluation framework.
+
+EvalConfig captures the parameters of a single evaluation run: which model
+is being tested, which corpus it runs against, and where the supporting
+artefacts (registry, baseline, judge config) live.
+
+GateVerdict constants define the two possible outcomes of the promotion gate:
+PROMOTE (the challenger beats or ties the champion and is safe to deploy)
+or HOLD (the challenger does not meet the bar and must be iterated on).
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from pydantic import BaseModel
+
+
+class EvalConfig(BaseModel):
+    """Pydantic model holding the settings of a single evaluation run.
+
+    Attributes:
+        model_id: Identifier of the model under evaluation (required).
+        corpus: Name of the evaluation corpus, e.g. "ms_marco_mini" or "finance_bench" (required).
+        run_id: Unique identifier for this run, e.g. a timestamp or git SHA (required).
+        registry_path: Path to the must-find / golden registry JSON file; defaults to "".
+        corpus_path: Path to the corpus directory or bundle; defaults to "".
+        baseline_path: Path to a baseline results file for regression comparison; defaults to "".
+        judge_model: Model identifier used for the LLM-as-judge advisory pass; defaults to "".
+        judge_runs: Number of independent judge calls to aggregate (majority vote); defaults to 3.
+        embed_model: Model identifier used for embedding-based retrieval metrics; defaults to "".
+        metadata: Arbitrary key/value pairs for run bookkeeping; defaults to an empty dict.
+    """
+
+    model_id: str
+    corpus: str
+    run_id: str
+    registry_path: str = ""
+    corpus_path: str = ""
+    baseline_path: str = ""
+    judge_model: str = ""
+    judge_runs: int = 3
+    embed_model: str = ""
+    metadata: dict[str, Any] = {}
+
+
+class GateVerdict:
+    """String constants naming the two outcomes of the promotion gate.
+
+    ``GateVerdict.PROMOTE`` means the challenger meets the quality bar and is safe
+    to become the new champion; ``GateVerdict.HOLD`` means it does not meet the
+    bar and must be iterated on.  Both are plain ``str`` class attributes.
+    """
+
+    PROMOTE: str = "PROMOTE"
+    HOLD: str = "HOLD"
diff --git a/pyproject.toml b/pyproject.toml
index e575323e..bb74201f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -119,6 +119,10 @@ binary = [
 all = [
     "fireflyframework-agentic[postgres,mongodb,security,embeddings,openai-embeddings,cohere-embeddings,google-embeddings,mistral-embeddings,voyage-embeddings,azure-embeddings,bedrock-embeddings,ollama-embeddings,vectorstores-chroma,vectorstores-pinecone,vectorstores-qdrant,vectorstores-pgvector,vectorstores-sqlite-vec,watch,binary]",
 ]
+evaluation = [
+    "scipy>=1.11",
+    "numpy>=1.26.0",
+]
 dev = [
     "pytest>=8.3.0",
     "pytest-asyncio>=0.24.0",
@@ -132,6 +136,9 @@ dev = [
     "pre-commit>=3.8.0",
 ]
 
+[project.scripts]
+flyeval = "fireflyframework_agentic.evaluation.cli:main"
+
 [project.urls]
 Homepage = "https://fireflyframework.org/"
 Documentation = "https://github.com/fireflyframework/fireflyframework-agentic/tree/main/docs"

From 8676b6adbc3319845dc1f7b2faede2e8d4b9cd56 Mon Sep 17 00:00:00 2001
From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com>
Date: Thu, 18 Jun 2026 23:36:17 +0200
Subject: [PATCH 02/48] feat(evaluation): add matcher primitives and statistics
 helpers (#269)

* feat(evaluation): add matcher primitives (anchored, matches, source_stem, tokens)

* feat(evaluation): add statistics helpers (aa_band, aggregate_grounding, left_skew_flag)

* feat(evaluation): export matcher and stats primitives from evaluation package

---------

Co-authored-by: miguelgfierro <miguelgfierro@users.noreply.github.com>
---
 .../evaluation/__init__.py                    |   9 +
 .../evaluation/matcher.py                     | 374 ++++++++++++++++++
 fireflyframework_agentic/evaluation/stats.py  | 110 ++++++
 3 files changed, 493 insertions(+)
 create mode 100644 fireflyframework_agentic/evaluation/matcher.py
 create mode 100644 fireflyframework_agentic/evaluation/stats.py

diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py
index 1c264f07..7d740b00 100644
--- a/fireflyframework_agentic/evaluation/__init__.py
+++ b/fireflyframework_agentic/evaluation/__init__.py
@@ -34,7 +34,9 @@
 from fireflyframework_agentic.evaluation.gates import GateResult, Verdict, render_scorecard, run_gates
 from fireflyframework_agentic.evaluation.champion import ChampionRecord, invalidate_champion, load_champion, save_champion
 from fireflyframework_agentic.evaluation.judge import AdvisoryReport, run_judge
+from fireflyframework_agentic.evaluation.matcher import anchored, matches, source_stem, tokens
 from fireflyframework_agentic.evaluation.retrieval import RetrieverMetrics, compute_retrieval_metrics
+from fireflyframework_agentic.evaluation.stats import aa_band, aggregate_grounding, left_skew_flag
 
 try:
     __version__ = version("fireflyframework-agentic")
@@ -54,4 +56,11 @@
     "run_judge",
     "RetrieverMetrics",
     "compute_retrieval_metrics",
+    "anchored",
+    "matches",
+    "source_stem",
+    "tokens",
+    "aa_band",
+    "aggregate_grounding",
+    "left_skew_flag",
 ]
diff --git a/fireflyframework_agentic/evaluation/matcher.py b/fireflyframework_agentic/evaluation/matcher.py
new file mode 100644
index 00000000..2f5065df
--- /dev/null
+++ b/fireflyframework_agentic/evaluation/matcher.py
@@ -0,0 +1,374 @@
+# Copyright 2026 Firefly Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Single matching primitive reused across G2 (recall/precision) and G3 (grounding).
+
+anchored() is topic-level lexical overlap.  matches() is the gate predicate.
+One function, three uses — do not write three matching functions.
+
+Known limitation (EVALUATION_FRAMEWORK.md): anchored() is topic-anchored, not claim-verified.
+A '45 days' claim cited to a '3 days' source passes if they share the process name.
+Real claim entailment (NLI/AIS) is Phase 2.  The G3 human spot-check is the
+binding faithfulness signal until then.
+"""
+
+from __future__ import annotations
+
+import re
+
+import numpy as np
+
+
+def cosine(a, b) -> float:
+    """Return the cosine similarity of ``a`` and ``b`` (1e-9 is added to the norm product)."""
+    u, v = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
+    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
+    return float(np.dot(u, v) / (norm_product + 1e-9))
+
+
+def tokens(text: str) -> list[str]:
+    return [m.group() for m in re.finditer(r"\b\w+\b", text.lower())]  # lowercase word tokens
+
+
+def anchored(claim: str, evidence: str, *, min_token: int = 5) -> bool:
+    """Report whether claim and evidence share a token of at least min_token chars.
+
+    Topic-level check only: it rejects a citation to an unrelated document but does
+    NOT verify the claimed value — that gap is left to the Phase 2 NLI/AIS check.
+    """
+    # Only tokens long enough to be non-trivial take part in the overlap test.
+    claim_terms = {tok for tok in tokens(claim) if len(tok) >= min_token}
+    return any(tok in claim_terms for tok in tokens(evidence) if len(tok) >= min_token)
+
+
+def source_stem(locator: str) -> str:
+    """Reduce a locator/source path to a lowercase document stem used for matching.
+
+    Handles both locator conventions seen across runs:
+    - file locators, with or without a directory prefix ('sops/SOP-002-kyc-edd.md'
+      and 'SOP-002-kyc-edd.md'), both become 'sop-002-kyc-edd';
+    - event-log row ids ('src-credit-underwriting:CU-2026-1003') become the process
+      stem 'credit-underwriting', matching the CSV the registry cites.
+
+    The result still identifies WHICH source document a finding cites, which keeps
+    the same-document anti-gaming property of matches(); only the directory prefix,
+    file extension, fragment, and case are normalized away.
+    """
+    without_fragment, _, _ = locator.partition("#")  # drop '#page=N' / '#anchor'
+    name = without_fragment.rpartition("/")[2]  # basename, without any directory prefix
+    if name.startswith("src-") and ":" in name:  # event-log row id: src-<process>:<case>
+        process, _, _ = name.partition(":")
+        return process[len("src-") :].lower()
+    head, dot, _ = name.rpartition(".")  # a trailing extension is dropped when present
+    return (head if dot else name).lower()
+
+
+def _finding_sources(finding: dict, evidence_index: dict[str, dict]) -> set[str]:
+    """Return the set of normalized source-document stems cited by a finding.
+
+    Refs with an unknown evidence_id are skipped, as are locators that are missing,
+    None, or normalize to an empty stem.  ``evidence_refs: None`` means no refs.
+    """
+    refs = finding.get("evidence_refs") or []  # tolerate an explicit None value
+    cited = (evidence_index.get(ref.get("evidence_id", "")) for ref in refs)
+    stems = {source_stem(ev.get("locator") or "") for ev in cited if ev}
+    return stems - {""}
+
+
+def shares_source(finding: dict, item, evidence_index: dict[str, dict]) -> bool:
+    """Report whether the finding and the item have a source document in common.
+
+    Both sides are reduced to normalized stems (source_stem), so a single registry
+    can score every run whatever locator convention the run used.  This check is
+    the anti-gaming anchor shared by the lexical predicate (matches) and the
+    semantic path (semantic_hits): a finding that cites only other documents can
+    never satisfy the item.
+
+    Spec-style NC items list their mirror source (§4.1); legacy NC items carry
+    evidence=[], so the result is always False for them.
+
+    Args:
+        finding: dict from DiscoveryResult.findings[i] (model_dump output).
+        item: RegistryItem dataclass from registry.py.
+        evidence_index: {evidence_id: Evidence dict} built from result['evidence_index'].
+    """
+    cited_stems = _finding_sources(finding, evidence_index)
+    return any(source_stem(entry) in cited_stems for entry in item.evidence)
+
+
+def _keyword_anchored(desc: str, keywords: list[str]) -> bool:
+    """True iff any non-blank keyword appears as a whole word in desc (case-insensitive).
+
+    Keyword rail: exempt from the 5-char token floor so short banking terms
+    (KYC, PEP, AML) can anchor a match even though they are too short for the
+    token rail.  Whole-word matching prevents false substring hits (e.g. "risk"
+    inside "asterisk"); a hyphen still ends a word, so "risk" does match in
+    "enterprise-risk-management".  Blank keywords are skipped: an empty pattern
+    would match at every word boundary and anchor any non-empty text.
+    """
+    desc_lower = desc.lower()
+    # Normalize once, dropping None/empty/whitespace-only entries.
+    terms = [kw.strip().lower() for kw in (keywords or []) if kw and kw.strip()]
+    return any(re.search(r"\b" + re.escape(term) + r"\b", desc_lower) for term in terms)
+
+
+def candidate_text(candidate: dict, scope: str) -> str:
+    """Build the match text for a candidate drawn from the given scope surface (§4.3).
+
+    The fields that make up the text depend on the surface:
+    - finding / action      : title + description
+    - process / decision    : name + description
+    - activity              : name + notes + regulatory_links
+    - persona               : name + role + goals + pain_points
+    - system                : name + description
+    - informal_channel      : name + usage_context + notes
+    - dependency_graph      : name + description (diagnostic nodes; relation items bypass this)
+
+    Missing or empty fields are skipped and the rest are joined with single spaces.
+    List-valued fields (regulatory_links, goals, pain_points) are space-joined first;
+    any scope not listed above falls back to name + description.
+    """
+    field = candidate.get
+    if scope in ("finding", "action"):
+        parts = [field("title", ""), field("description", "")]
+    elif scope == "activity":
+        links = field("regulatory_links") or []
+        links_text = " ".join(links) if isinstance(links, list) else str(links or "")
+        parts = [field("name", ""), field("notes", ""), links_text]
+    elif scope == "persona":
+        goals = field("goals") or []
+        pains = field("pain_points") or []
+        goals_text = " ".join(goals) if isinstance(goals, list) else str(goals)
+        pains_text = " ".join(pains) if isinstance(pains, list) else str(pains)
+        parts = [field("name", ""), field("role", ""), goals_text, pains_text]
+    elif scope == "informal_channel":
+        parts = [field("name", ""), field("usage_context", ""), field("notes", "")]
+    else:
+        # process, decision, system, dependency_graph (diagnostic nodes)
+        parts = [field("name", ""), field("description", "")]
+
+    # filter(None, ...) drops missing/empty parts before joining
+    return " ".join(filter(None, parts))
+
+
+INSIGHT_ITEM_SCOPES = ("finding", "action")
+INSIGHT_MATCH_SURFACES = ("finding", "action", "activity", "decision")
+
+
+def allowed_scopes(item) -> tuple[str, ...]:
+    """Return the candidate surfaces on which a registry item may be satisfied.
+
+    Insight items (finding / action) may be satisfied on any insight surface or on a
+    process-graph *leaf* surface (activity / decision): a run often grounds the same
+    operational fact on a different surface than the registry's scope tag anticipates
+    (the BBVA case — pain points the registry tags 'finding' that the run emitted as
+    decision/activity nodes).  shares_source is still REQUIRED on every candidate (see
+    matches / semantic_hits), so cross-scope matching widens WHERE we look, never the
+    source anchor.
+
+    Structural items (process / activity / decision) stay on their own surface: a
+    structural must-find requires the run to have actually built that node, not merely
+    mentioned the fact in a finding (test_process_scope_miss_when_no_matching_process).
+
+    NC items are scope-strict as well: widening a negative control's pool could only
+    make it easier to trip (a specificity regression), never recover a legitimate hit.
+
+    `process` is never a match surface for an insight item: _candidates_by_scope folds
+    every child's evidence_refs into the process node, so its citation set is a union of
+    many documents and shares_source goes vacuous (hence its exclusion from
+    INSIGHT_MATCH_SURFACES).
+    """
+    # Only non-NC insight items are widened to the cross-scope surfaces;
+    # NC items and structural items stay on the scope they were registered with.
+    widened = item.tier != "NC" and item.scope in INSIGHT_ITEM_SCOPES
+    return INSIGHT_MATCH_SURFACES if widened else (item.scope,)
+
+
+def matches(
+    candidate: dict,
+    item,
+    evidence_index: dict[str, dict],
+    scope: str = "finding",
+) -> bool:
+    """True iff candidate cites a shared source document AND is topic-anchored to item.
+
+    Two-rail anchor (either rail suffices):
+    - Token rail: ≥1 shared token of ≥5 chars between candidate text and item description.
+    - Keyword rail: ≥1 item keyword appears as a whole word in the candidate text.
+      Exempt from the 5-char floor so short banking terms (KYC, PEP, AML) can anchor.
+
+    ``scope`` selects which candidate fields form the match text (§4.3, see
+    candidate_text).  A missing (None) item description is treated as empty, so only
+    the keyword rail can anchor such an item.
+
+    Anti-gaming guard: a candidate on a different document cannot satisfy this item
+    even if its text happens to match; documents are compared by normalized stem.
+
+    Args:
+        candidate: dict from the DiscoveryResult surface matching ``scope``.
+        item: RegistryItem dataclass from registry.py.
+        evidence_index: {evidence_id: Evidence dict} built from result['evidence_index'].
+        scope: surface the candidate was drawn from (default "finding").
+    """
+    if not shares_source(candidate, item, evidence_index):
+        return False
+    desc = candidate_text(candidate, scope)
+    if _keyword_anchored(desc, list(item.keywords or [])):
+        return True
+    return anchored(desc, item.description or "")
+
+
+def matches_dependency_graph_relation(
+    item,
+    result: dict,
+    evidence_index: dict[str, dict],
+) -> bool:
+    """Endpoint matcher for dependency_graph relation items (§5.3b).
+
+    Stage 1: Anchor both endpoints to activity nodes via token rail.
+    Stage 2: Verify a directed edge or path connects them in the asserted direction,
+             behind the shared-source guard on the edge's/path's evidence_refs.
+
+    The guard reuses _finding_sources, so edges and paths follow the same
+    normalized-stem rule as findings (an empty stem never counts as a source).
+
+    Args:
+        item: registry item; reads from_node, to_node and evidence.
+        result: run output dict; reads process_graph and dependency_graph.
+        evidence_index: {evidence_id: Evidence dict}.
+
+    Returns False when either endpoint anchors to no activity, or when no connecting
+    edge/path shares a source document with the item.
+    """
+    if not item.from_node or not item.to_node:
+        return False
+
+    processes = result.get("process_graph", {}).get("processes", [])
+    all_activities = [a for p in processes for a in p.get("activities", [])]
+
+    def _anchor(endpoint_text: str) -> set[str]:
+        return {
+            a["id"]
+            for a in all_activities
+            if a.get("id") and anchored(candidate_text(a, "activity"), endpoint_text)
+        }
+
+    from_ids = _anchor(item.from_node)
+    to_ids = _anchor(item.to_node)
+    if not from_ids or not to_ids:
+        return False
+
+    item_stems = {source_stem(e) for e in item.evidence}
+    dg = result.get("dependency_graph", {})
+
+    for edge in dg.get("activity_edges", []):
+        if edge.get("from_node") in from_ids and edge.get("to_node") in to_ids:
+            if _finding_sources(edge, evidence_index) & item_stems:
+                return True
+
+    for path in dg.get("critical_paths", []):
+        if not (_finding_sources(path, evidence_index) & item_stems):
+            continue
+        node_ids = path.get("node_ids", [])
+        from_pos = [i for i, nid in enumerate(node_ids) if nid in from_ids]
+        to_pos = [i for i, nid in enumerate(node_ids) if nid in to_ids]
+        if any(fp < tp for fp in from_pos for tp in to_pos):
+            return True
+
+    return False
+
+
+def semantic_hits(
+    candidates: dict[str, list[dict]],
+    items,
+    evidence_index: dict[str, dict],
+    embed_fn,
+    tau: float = 0.70,
+    tau_nc: float = 0.85,
+) -> dict[str, bool]:
+    """Opt-in embedding-semantic recall: {item.id: found-by-some-shared-source candidate}.
+
+    Scope-aware: each registry item is evaluated against candidates from the surfaces
+    allowed_scopes() permits for it, using the same per-scope field extraction as the
+    lexical path (candidate_text).  Passing only the findings list (the previous
+    behaviour) would leave process/activity/decision/action items with an empty
+    candidate pool and a guaranteed False result.
+
+    Real items (L0–L3): hit iff some scope-matching candidate shares a source
+    document with the item (shares_source) AND is embedding-similar (cosine >= tau).
+    Source anchor is preserved — a candidate on a different document cannot recover
+    a real item.
+
+    NC items (tier=="NC"): hit iff some scope-matching candidate is embedding-similar
+    (cosine >= tau_nc).  When the NC lists its mirror source (§4.1) the shared-source
+    guard applies; legacy NC items with evidence=[] skip the anchor, with the higher
+    threshold (default 0.85) compensating.
+
+    Cost is two embed_fn calls (all candidate texts once, all item texts once), not
+    O(n*m) per-pair embeddings; with no items or no candidates embed_fn is never called.
+
+    Args:
+        candidates: {scope: [candidate dicts]} from _candidates_by_scope().
+        items: iterable of RegistryItem dataclasses.
+        evidence_index: {evidence_id: Evidence dict}.
+        embed_fn: callable(list[str]) -> array-like of row vectors.
+        tau: cosine threshold for real items (inclusive).
+        tau_nc: cosine threshold for NC items (inclusive; higher to compensate for no source anchor).
+    """
+    items = list(items)
+
+    # Flatten candidates across scopes, keeping each one's scope tag for later filtering.
+    scoped: list[tuple[str, dict]] = [
+        (scope, cand)
+        for scope, cands in candidates.items()
+        for cand in cands
+    ]
+
+    # Nothing to compare: return before embedding so embed_fn never gets an empty batch.
+    if not items or not scoped:
+        return {item.id: False for item in items}
+
+    cand_texts = [candidate_text(cand, scope) for scope, cand in scoped]
+    item_texts = [
+        " ".join([item.description or ""] + list(item.keywords or [])).strip() for item in items
+    ]
+
+    cand_vecs = np.asarray(embed_fn(cand_texts))
+    item_vecs = np.asarray(embed_fn(item_texts))
+
+    hits: dict[str, bool] = {}
+    for i, item in enumerate(items):
+        item_vec = item_vecs[i]
+        allowed = allowed_scopes(item)
+        hit = False
+        for k, (scope, cand) in enumerate(scoped):
+            if scope not in allowed:
+                continue
+            if item.tier == "NC":
+                # Shared-source guard applies when the NC lists its mirror source
+                # (§4.2/§6.2); legacy evidence=[] NCs stay unanchored, with the
+                # higher tau_nc compensating.
+                if item.evidence and not shares_source(cand, item, evidence_index):
+                    continue
+                if cosine(cand_vecs[k], item_vec) >= tau_nc:
+                    hit = True
+                    break
+            elif (
+                shares_source(cand, item, evidence_index)
+                and cosine(cand_vecs[k], item_vec) >= tau
+            ):
+                hit = True
+                break
+        hits[item.id] = hit
+    return hits
diff --git a/fireflyframework_agentic/evaluation/stats.py b/fireflyframework_agentic/evaluation/stats.py
new file mode 100644
index 00000000..e70c629a
--- /dev/null
+++ b/fireflyframework_agentic/evaluation/stats.py
@@ -0,0 +1,110 @@
+# Copyright 2026 Firefly Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Statistics helpers: A/A noise band + fixed aggregate_grounding.
+
+The A/A band replaces McNemar, Wilcoxon, BCa bootstrap, Cliff's delta, Holm
+correction, and MCID power analysis.  Four self-authored corpora with ~30-70
+non-independent items each cannot power those tests; gating on unpowered tests
+is false precision.  See EVALUATION_FRAMEWORK.md (regression statistics).
+
+This module also provides the fixed aggregate_grounding() that closes a prior
+aggregation bug where the previous runner inherited run 0's grounding report
+unchanged instead of merging across all runs.
+"""
+from __future__ import annotations
+
+import statistics
+from typing import Sequence
+
+
+def aa_band(scores: Sequence[float], *, percentile: int = 95) -> float:
+    """Compute the A/A noise floor: a percentile of pairwise deltas across champion reruns.
+
+    Rerun the champion ~10 times and take every pairwise absolute difference; the
+    requested percentile (95th by default) of those deltas is the noise floor.  A
+    candidate must beat the champion by more than this on EVERY seed to count as real.
+
+    This one number stands in for MCID, power analysis, McNemar, Wilcoxon, bootstrap
+    CIs, and Holm correction.  See EVALUATION_FRAMEWORK.md (the A/A noise band).
+
+    Args:
+        scores: Per-run primary metric scores from champion reruns (ValueError if < 2).
+        percentile: Percentile of the sorted deltas to return (default 95).
+
+    Returns:
+        The noise floor, in the same units as the input scores.
+    """
+    runs = list(scores)
+    if len(runs) < 2:
+        raise ValueError(f"aa_band requires >= 2 reruns; got {len(runs)}")
+    # Every unordered pair of reruns contributes one absolute delta.
+    pairwise_gaps: list[float] = []
+    for pos, left in enumerate(runs):
+        pairwise_gaps.extend(abs(left - right) for right in runs[pos + 1 :])
+    pairwise_gaps.sort()
+    # Position of the requested percentile, clamped into the valid index range.
+    rank = int(len(pairwise_gaps) * percentile / 100)
+    rank = min(max(rank, 0), len(pairwise_gaps) - 1)
+    return pairwise_gaps[rank]
+
+
+def aggregate_grounding(grounding_dicts: list[dict]) -> dict:
+    """Combine per-run grounding reports into one conservative aggregate.
+
+    Replaces the earlier behaviour of reusing run 0's report unchanged:
+    - support_pct is the mean across runs, rounded to 2 decimals;
+    - unsupported_ids is the sorted UNION across runs (flagged once, flagged always);
+    - any other key is carried over from the first report.
+
+    Args:
+        grounding_dicts: One grounding report dict per evaluation run, each with
+            'support_pct' (float 0-100) and optionally 'unsupported_ids' (list[str]).
+
+    Returns:
+        The merged dict plus '_aggregate_runs' and '_support_pct_per_run'; for an
+        empty input, just {"support_pct": 0.0, "unsupported_ids": []}.
+    """
+    if not grounding_dicts:
+        return {"support_pct": 0.0, "unsupported_ids": []}
+
+    per_run = [float(report.get("support_pct", 0.0)) for report in grounding_dicts]
+    flagged = {uid for report in grounding_dicts for uid in report.get("unsupported_ids", [])}
+
+    # Copy run 0 so its extra keys carry over, then overwrite the merged fields
+    # and append the bookkeeping keys.
+    merged = dict(grounding_dicts[0])
+    merged.update(
+        {
+            "support_pct": round(statistics.mean(per_run), 2),
+            "unsupported_ids": sorted(flagged),
+            "_aggregate_runs": len(grounding_dicts),
+            "_support_pct_per_run": [round(pct, 2) for pct in per_run],
+        }
+    )
+    return merged
+
+
+def left_skew_flag(scores: Sequence[float], *, threshold: float = 0.10) -> bool:
+    """True if min < median - threshold (HIGH_VARIANCE sentinel).
+
+    A single catastrophic run cannot hide inside a decent mean; fewer than two
+    scores never flag.  ``threshold`` is in score units (default 0.10); pass a
+    scaled value for metrics that are not on a 0-1 scale.  True => HIGH_VARIANCE;
+    block the run until investigated.  See EVALUATION_FRAMEWORK.md (anti-flakiness).
+    """
+    values = list(scores)
+    if len(values) < 2:
+        return False
+    return min(values) < statistics.median(values) - threshold

From 8eb2110ef25ad1579d0f093e011664dfe40935e6 Mon Sep 17 00:00:00 2001
From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com>
Date: Thu, 18 Jun 2026 23:39:04 +0200
Subject: [PATCH 03/48] feat(evaluation): add corpus loader and registry
 modules (#270)

* feat(evaluation): add corpus loader and evidence verification module

* feat(evaluation): add lean-1 registry loader and RegistryItem/Registry models

* feat(evaluation): re-export corpus and registry symbols from evaluation package

---------

Co-authored-by: miguelgfierro <miguelgfierro@users.noreply.github.com>
---
 .../evaluation/__init__.py                    |  13 ++
 fireflyframework_agentic/evaluation/corpus.py | 185 +++++++++++++++
 .../evaluation/registry.py                    | 214 ++++++++++++++++++
 3 files changed, 412 insertions(+)
 create mode 100644 fireflyframework_agentic/evaluation/corpus.py
 create mode 100644 fireflyframework_agentic/evaluation/registry.py

diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py
index 7d740b00..b6283d8b 100644
--- a/fireflyframework_agentic/evaluation/__init__.py
+++ b/fireflyframework_agentic/evaluation/__init__.py
@@ -31,10 +31,12 @@
 
 from importlib.metadata import PackageNotFoundError, version
 
+from fireflyframework_agentic.evaluation.corpus import EMPTY, FABRICATED, SOURCE_UNKNOWN, VERIFIED, corpus_sha256, load_corpus, verify_evidence_index
 from fireflyframework_agentic.evaluation.gates import GateResult, Verdict, render_scorecard, run_gates
 from fireflyframework_agentic.evaluation.champion import ChampionRecord, invalidate_champion, load_champion, save_champion
 from fireflyframework_agentic.evaluation.judge import AdvisoryReport, run_judge
 from fireflyframework_agentic.evaluation.matcher import anchored, matches, source_stem, tokens
+from fireflyframework_agentic.evaluation.registry import Registry, RegistryItem, load_registry, registry_sha256
 from fireflyframework_agentic.evaluation.retrieval import RetrieverMetrics, compute_retrieval_metrics
 from fireflyframework_agentic.evaluation.stats import aa_band, aggregate_grounding, left_skew_flag
 
@@ -44,6 +46,13 @@
     __version__ = "0.0.0+dev"
 
 __all__ = [
+    "EMPTY",
+    "FABRICATED",
+    "SOURCE_UNKNOWN",
+    "VERIFIED",
+    "corpus_sha256",
+    "load_corpus",
+    "verify_evidence_index",
     "GateResult",
     "Verdict",
     "run_gates",
@@ -54,6 +63,10 @@
     "invalidate_champion",
     "AdvisoryReport",
     "run_judge",
+    "Registry",
+    "RegistryItem",
+    "load_registry",
+    "registry_sha256",
     "RetrieverMetrics",
     "compute_retrieval_metrics",
     "anchored",
diff --git a/fireflyframework_agentic/evaluation/corpus.py b/fireflyframework_agentic/evaluation/corpus.py
new file mode 100644
index 00000000..32835f2c
--- /dev/null
+++ b/fireflyframework_agentic/evaluation/corpus.py
@@ -0,0 +1,185 @@
+# Copyright 2026 Firefly Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Corpus loading and evidence verification (EVALUATION_FRAMEWORK.md §6.3).
+
+The corpus is the third pinned evaluation input, next to the DiscoveryResult
+and the registry: the raw document bundle (input.json) the discovery pipeline
+read.  It is the trusted side of every evidence anchor — the registry tells
+the evaluator what *should* be found; only the corpus can tell it whether what
+a run cited is *real*.
+
+verify_entry() closes the fabricated-evidence channel: a run controls every
+byte of its own evidence_index, so any check computable from (result, registry)
+alone can be satisfied by self-reported evidence.  Checking each excerpt
+against the actual corpus text is the only deterministic counter.
+
+Excerpt contract: excerpts are verbatim quotes from the source document.
+Spliced quotes (fragments joined with '...' or '…') are supported — each
+fragment is verified independently.  Paraphrase belongs in the finding
+description, never in an excerpt.
+"""
+
+from __future__ import annotations
+
+import base64
+import difflib
+import hashlib
+import json
+import re
+import unicodedata
+from dataclasses import dataclass
+from pathlib import Path
+
+from fireflyframework_agentic.evaluation.matcher import source_stem
+
+# Verification statuses for one evidence_index entry.
+VERIFIED = "verified"  # excerpt found (verbatim or spliced) in the cited source
+EMPTY = "empty"  # entry carries no excerpt text — nothing to verify
+SOURCE_UNKNOWN = "source_unknown"  # locator resolves to no corpus document
+FABRICATED = "fabricated"  # populated excerpt not found in the cited source
+
+# A spliced excerpt is split on these joiners; fragments shorter than _MIN_FRAGMENT_CHARS
+# are too generic to verify and are skipped (if all are, the whole excerpt is checked).
+_SPLICE_PATTERN = re.compile(r"\.\.\.|…| -- ")
+_MIN_FRAGMENT_CHARS = 15
+
+# A fragment passes fuzzily when matching blocks (>= _MIN_BLOCK_CHARS chars)
+# cover at least _COVERAGE_THRESHOLD of it — tolerates punctuation/whitespace
+# drift while rejecting invented text (measured ~0.10-0.32 coverage).
+_COVERAGE_THRESHOLD = 0.85
+_MIN_BLOCK_CHARS = 4
+
+
+@dataclass
+class Corpus:
+    """The decoded, normalized corpus: {source stem: normalized text}.
+
+    sha256 pins the corpus file exactly like the registry pin (§4.6): the
+    champion record stores it, and G1 re-hashes the file at scoring time to
+    flag CORPUS_DRIFT.
+    """
+
+    texts: dict[str, str]  # source stem -> normalized document text
+    sha256: str  # hex SHA-256 of the corpus file as loaded
+    path: str  # location of the corpus file; G1 re-reads it to detect drift
+
+
+def normalize(text: str) -> str:
+    """Normalize text for excerpt matching: NFKC, strip markdown emphasis and
+    straight/curly quotes, collapse whitespace, casefold."""
+    text = unicodedata.normalize("NFKC", text).replace("*", "")
+    # Curly quotes are written as escapes so they survive ASCII-only tooling.
+    text = re.sub("[\"'\u201c\u201d\u2018\u2019]", "", text)
+    return re.sub(r"\s+", " ", text).strip().casefold()
+
+
+def corpus_sha256(path: str | Path) -> str:
+    """Hex SHA-256 digest of the corpus file's bytes (re-hashed to detect CORPUS_DRIFT)."""
+    return hashlib.new("sha256", Path(path).read_bytes()).hexdigest()
+
+
+def load_corpus(path: str | Path) -> Corpus:
+    """Load a FlyRadar input.json bundle into a stem-indexed normalized Corpus.
+
+    Decodes every artifacts[] file and signals[] event log (base64), normalizes
+    the text, and keys each by source_stem().  The file is read exactly once, so
+    the stored sha256 always describes the bytes that were actually parsed.
+
+    Raises:
+        ValueError: when the bundle contains no documents, or two documents
+            reduce to the same stem (a collision would let a fabricated
+            citation resolve against the wrong real file).
+    """
+    path = Path(path)
+    data = path.read_bytes()  # one read feeds both the JSON parser and the hash
+    raw = json.loads(data.decode("utf-8"))
+
+    named_contents: list[tuple[str, str]] = []
+    for artifact in raw.get("artifacts", []):
+        named_contents.append((artifact["filename"], artifact["content_base64"]))
+    for signal in raw.get("signals", []):
+        named_contents.append((signal["name"], signal["content_base64"]))
+
+    if not named_contents:
+        raise ValueError(f"corpus bundle {path} contains no artifacts or signals")
+
+    texts: dict[str, str] = {}
+    for name, content_b64 in named_contents:
+        stem = source_stem(name)
+        if stem in texts:
+            raise ValueError(
+                f"corpus stem collision: two documents reduce to {stem!r} — "
+                "rename one; a collision would verify citations against the wrong file"
+            )
+        texts[stem] = normalize(base64.b64decode(content_b64).decode("utf-8", errors="replace"))
+
+    return Corpus(texts=texts, sha256=hashlib.sha256(data).hexdigest(), path=str(path))
+
+
+def _fragment_coverage(fragment: str, source: str) -> float:
+    """Share of `fragment` characters that lie in blocks of at least
+    _MIN_BLOCK_CHARS characters matched against `source`."""
+    seq = difflib.SequenceMatcher(None, fragment, source, autojunk=False)
+    sizes = (blk.size for blk in seq.get_matching_blocks())
+    long_enough = sum(size for size in sizes if size >= _MIN_BLOCK_CHARS)
+    return long_enough / len(fragment)
+
+
+def verify_entry(corpus: Corpus, entry: dict) -> str:
+    """Verify one evidence_index entry against the corpus.
+
+    Returns SOURCE_UNKNOWN when the locator's source stem matches no corpus
+    document, EMPTY when the entry has no excerpt text, FABRICATED when a
+    checked fragment is neither a verbatim substring of the normalized source
+    nor covered >= _COVERAGE_THRESHOLD by matching blocks, and VERIFIED otherwise.
+
+    The excerpt is split on the splice joiners and fragments shorter than
+    _MIN_FRAGMENT_CHARS are skipped; if none is long enough the whole excerpt is
+    checked as one fragment.  A single failing fragment sinks a spliced excerpt.
+    A missing or null locator/excerpt is treated as the empty string.
+    """
+    stem = source_stem(entry.get("locator") or "")
+    source = corpus.texts.get(stem)
+    if source is None:
+        return SOURCE_UNKNOWN
+
+    excerpt = normalize(entry.get("excerpt") or "")
+    if not excerpt:
+        return EMPTY
+
+    fragments = [
+        f.strip()
+        for f in _SPLICE_PATTERN.split(excerpt)
+        if len(f.strip()) >= _MIN_FRAGMENT_CHARS
+    ] or [excerpt]
+
+    for fragment in fragments:
+        if fragment in source:
+            continue
+        if _fragment_coverage(fragment, source) < _COVERAGE_THRESHOLD:
+            return FABRICATED
+    return VERIFIED
+
+
+def verify_evidence_index(corpus: Corpus, result: dict) -> dict[str, str]:
+    """Run verify_entry() over the whole evidence_index of a DiscoveryResult.
+
+    Every entry with a truthy id is checked, whether or not anything refers to
+    it, and the outcome is returned as {evidence_id: status} for the gates.
+    """
+    statuses: dict[str, str] = {}
+    for entry in result.get("evidence_index", []):
+        if entry.get("id"):
+            statuses[entry["id"]] = verify_entry(corpus, entry)
+    return statuses
diff --git a/fireflyframework_agentic/evaluation/registry.py b/fireflyframework_agentic/evaluation/registry.py
new file mode 100644
index 00000000..2b869ba9
--- /dev/null
+++ b/fireflyframework_agentic/evaluation/registry.py
@@ -0,0 +1,214 @@
+# Copyright 2026 Firefly Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""lean-1 registry loader — one schema for all four corpora.
+
+Replaces the four mutually incompatible schemes in use today (L1-L5,
+documented/observed/pain-point, critical/important, and no tiers).
+Loader enforces all invariants; they are not documentation.
+
+Invariants (EVALUATION_FRAMEWORK.md, the must-find registry):
+- schema_version == "lean-1"
+- every tier is one of L0 L1 L2 L3 NC
+- negative_control_count >= ceil(real_items / 10)
+- kappa present (0.0 placeholder allowed; G2 advisory until >= 0.70)
+- ABANCA DILO items must target a single measured sub-population
+"""
+from __future__ import annotations
+
+import hashlib
+import json
+import math
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Literal
+
+VALID_TIERS = ("L0", "L1", "L2", "L3", "NC")
+VALID_SCOPES = (
+    "process", "activity", "decision", "finding", "action",
+    "persona", "system", "informal_channel", "dependency_graph",
+)
+SCHEMA_VERSION = "lean-1"
+KAPPA_ADVISORY_THRESHOLD = 0.70
+
+
+@dataclass(frozen=True)
+class RegistryItem:
+    id: str                      # unique within a registry (duplicates are rejected on load)
+    tier: Literal["L0", "L1", "L2", "L3", "NC"]  # "NC" marks a negative control, not a real item
+    description: str             # free text; also scanned by the blend-phrase guard on load
+    evidence: list[str]          # source file paths (path portion of locator, no #page=N)
+    scope: str = "finding"       # which DiscoveryResult surface to match against (§4.3)
+    keywords: list[str] = field(default_factory=list)  # NOTE(review): presumably matcher terms — confirm
+    weight: float = 1.0          # weight summed when computing weighted recall
+    from_node: str = ""   # dependency_graph relation items only
+    to_node: str = ""     # dependency_graph relation items only
+    relation: str = ""    # dependency_graph only; the loader fills "precedes" when absent
+
+
+@dataclass(frozen=True)
+class Registry:
+    schema_version: str  # the loader only accepts SCHEMA_VERSION ("lean-1")
+    corpus: str  # required and non-empty (enforced by the loader)
+    author: str  # required and non-empty (enforced by the loader)
+    date: str  # required and non-empty (enforced by the loader)
+    kappa: float  # 0.0 placeholder allowed; see is_kappa_advisory()
+    items: list[RegistryItem]  # real items and NC items together
+    _sha256: str = field(default="", compare=False)  # file hash set by load_registry; ignored by ==
+
+    @property
+    def real_items(self) -> list[RegistryItem]:  # every item whose tier is not "NC"
+        return [i for i in self.items if i.tier != "NC"]
+
+    @property
+    def nc_items(self) -> list[RegistryItem]:  # only the "NC" tier items
+        return [i for i in self.items if i.tier == "NC"]
+
+    @property
+    def l0_items(self) -> list[RegistryItem]:  # only the "L0" tier items
+        return [i for i in self.items if i.tier == "L0"]
+
+    def is_kappa_advisory(self) -> bool:  # True while kappa < KAPPA_ADVISORY_THRESHOLD
+        return self.kappa < KAPPA_ADVISORY_THRESHOLD
+
+    def sha256(self) -> str:  # the hash recorded at load time ("" if never set)
+        return self._sha256
+
+
+def _validate(raw: dict, path: Path) -> None:
+    """Raise ValueError (prefixed with path.name) for the first lean-1 invariant `raw` violates."""
+    if raw.get("schema_version") != SCHEMA_VERSION:
+        raise ValueError(
+            f"{path.name}: schema_version must be '{SCHEMA_VERSION}', "
+            f"got {raw.get('schema_version')!r}"
+        )
+    for fname in ("corpus", "author", "date"):
+        if not raw.get(fname):
+            raise ValueError(f"{path.name}: missing required field '{fname}'")
+    if "kappa" not in raw:
+        raise ValueError(f"{path.name}: missing 'kappa' field (use 0.0 as placeholder)")
+
+    items = raw.get("items", [])
+
+    # EMPTY_MUST_FIND guard — first of the item checks; kills fake-champion bug
+    if not items:
+        raise ValueError(
+            f"{path.name}: EMPTY_MUST_FIND — items list is empty; "
+            "cannot evaluate recall.  This guard exists to prevent the "
+            "fake-100%-champion failure."
+        )
+
+    ids = [it.get("id") for it in items]
+    if any(i is None or i == "" for i in ids):  # the loader would otherwise die on it["id"]
+        raise ValueError(f"{path.name}: every item must have a non-empty 'id'")
+    if len(ids) != len(set(ids)):
+        dupes = sorted({i for i in ids if ids.count(i) > 1}, key=str)
+        raise ValueError(f"{path.name}: duplicate item ids: {dupes}")
+
+    for it in items:
+        tier = it.get("tier")
+        if tier not in VALID_TIERS:
+            raise ValueError(
+                f"{path.name}: item '{it.get('id')}' has invalid tier '{tier}'; "
+                f"must be one of {VALID_TIERS}"
+            )
+        scope = it.get("scope", "finding")
+        if scope not in VALID_SCOPES:
+            raise ValueError(
+                f"{path.name}: item '{it.get('id')}' has invalid scope '{scope}'; "
+                f"must be one of {VALID_SCOPES}"
+            )
+        if scope == "dependency_graph":
+            if not it.get("from") or not it.get("to"):
+                raise ValueError(
+                    f"{path.name}: dependency_graph item '{it.get('id')}' must have "
+                    "non-empty 'from' and 'to'"
+                )
+        elif "from" in it or "to" in it or "relation" in it:
+            raise ValueError(
+                f"{path.name}: item '{it.get('id')}' has 'from'/'to'/'relation' "
+                f"but scope is '{scope}'; these fields are only valid on "
+                "dependency_graph-scoped items"
+            )
+
+    real_count = sum(1 for it in items if it.get("tier") != "NC")
+    nc_count = sum(1 for it in items if it.get("tier") == "NC")
+    required_nc = max(1, math.ceil(real_count / 10))
+    if nc_count < required_nc:
+        raise ValueError(
+            f"{path.name}: NC density too low — {nc_count} NC item(s) for "
+            f"{real_count} real items; need >= {required_nc} (ceil(real/10)).  "
+            "Without NC items the eval measures recall only; a verbose hallucinator "
+            "scores perfectly."
+        )
+
+    # ABANCA DILO blend guard: a real item must target a single sub-population, so reject
+    # phrases asserting a blended target ("blend" alone is too broad: it is also used negatively).
+    BLEND_PHRASES = ("combined distribution", "across all offices regardless of segment")
+    for it in items:
+        if it.get("tier") == "NC":
+            continue
+        desc = it.get("description", "").lower()
+        if any(phrase in desc for phrase in BLEND_PHRASES):
+            raise ValueError(
+                f"{path.name}: item '{it.get('id', '')}' description targets a blended distribution; "
+                "ABANCA DILO items must target a single measured sub-population "
+                "(Empresas or PyMEs).  Use segment-keyed items: "
+                "dilo-empresas-operativa-42pct AND dilo-pymes-operativa-29pct separately."
+            )
+
+
+def _compute_sha256(path: Path) -> str:  # hex SHA-256 of the file's raw bytes
+    return hashlib.new("sha256", path.read_bytes()).hexdigest()
+
+
+def load_registry(path: str | Path) -> Registry:
+    """Load and validate a lean-1 registry file.
+
+    Raises ValueError with a descriptive message on any invariant violation.
+    The file is read once: the SHA-256 pin is taken from the bytes that were parsed.
+    """
+    path = Path(path)
+    data = path.read_bytes()  # single read, so pin and parsed content cannot diverge
+    raw = json.loads(data.decode("utf-8"))
+    _validate(raw, path)
+
+    items = [
+        RegistryItem(
+            id=it["id"],
+            tier=it["tier"],
+            scope=it.get("scope", "finding"),
+            description=it.get("description", ""),
+            evidence=it.get("evidence", []),
+            keywords=it.get("keywords", []),
+            weight=float(it.get("weight", 1.0)),
+            from_node=it.get("from", "") if it.get("scope") == "dependency_graph" else "",
+            to_node=it.get("to", "") if it.get("scope") == "dependency_graph" else "",
+            relation=it.get("relation", "precedes") if it.get("scope") == "dependency_graph" else "",
+        )
+        for it in raw["items"]
+    ]
+
+    return Registry(
+        schema_version=raw["schema_version"],
+        corpus=raw["corpus"],
+        author=raw["author"],
+        date=raw["date"],
+        kappa=float(raw["kappa"] or 0.0),
+        items=items,
+        _sha256=hashlib.sha256(data).hexdigest(),
+    )
+
+
+def registry_sha256(path: str | Path) -> str:  # hex SHA-256 of the registry file's bytes
+    return hashlib.sha256(Path(path).read_bytes()).hexdigest()

From ee64cfad1881e32be569a28acd5981373c8cd04f Mon Sep 17 00:00:00 2001
From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com>
Date: Thu, 18 Jun 2026 23:43:24 +0200
Subject: [PATCH 04/48] feat(evaluation): add G1-G5 gate framework (#271)

* feat(evaluation): add G1-G5 gate framework (GateResult, run_gates, g2_recall_precision)

* feat(evaluation): export g2_recall_precision from evaluation package

---------

Co-authored-by: miguelgfierro <miguelgfierro@users.noreply.github.com>
---
 .../evaluation/__init__.py                    |   3 +-
 fireflyframework_agentic/evaluation/gates.py  | 840 ++++++++++++++++++
 2 files changed, 842 insertions(+), 1 deletion(-)
 create mode 100644 fireflyframework_agentic/evaluation/gates.py

diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py
index b6283d8b..401244c9 100644
--- a/fireflyframework_agentic/evaluation/__init__.py
+++ b/fireflyframework_agentic/evaluation/__init__.py
@@ -32,7 +32,7 @@
 from importlib.metadata import PackageNotFoundError, version
 
 from fireflyframework_agentic.evaluation.corpus import EMPTY, FABRICATED, SOURCE_UNKNOWN, VERIFIED, corpus_sha256, load_corpus, verify_evidence_index
-from fireflyframework_agentic.evaluation.gates import GateResult, Verdict, render_scorecard, run_gates
+from fireflyframework_agentic.evaluation.gates import GateResult, Verdict, g2_recall_precision, render_scorecard, run_gates
 from fireflyframework_agentic.evaluation.champion import ChampionRecord, invalidate_champion, load_champion, save_champion
 from fireflyframework_agentic.evaluation.judge import AdvisoryReport, run_judge
 from fireflyframework_agentic.evaluation.matcher import anchored, matches, source_stem, tokens
@@ -55,6 +55,7 @@
     "verify_evidence_index",
     "GateResult",
     "Verdict",
+    "g2_recall_precision",
     "run_gates",
     "render_scorecard",
     "ChampionRecord",
diff --git a/fireflyframework_agentic/evaluation/gates.py b/fireflyframework_agentic/evaluation/gates.py
new file mode 100644
index 00000000..057bfea7
--- /dev/null
+++ b/fireflyframework_agentic/evaluation/gates.py
@@ -0,0 +1,840 @@
+# Copyright 2026 Firefly Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Four gates — every gate always runs; a failure raises a flag, not a veto.
+
+Gate pipeline (EVALUATION_FRAMEWORK.md §6):
+    G1 — Structural & Safe
+    G2 — Must-finds & negative controls
+    G3 — Evidence (grounding)
+    G5 — No-regression / promotion (human decision)
+
+Each gate is a pure function of the result dict + supporting inputs.
+run_gates() always executes all four gates and returns all four results so
+the scorecard carries the complete picture regardless of which flags fire.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+from fireflyframework_agentic.evaluation import matcher
+from fireflyframework_agentic.evaluation.corpus import (
+    EMPTY,
+    FABRICATED,
+    SOURCE_UNKNOWN,
+    VERIFIED,
+    Corpus,
+    corpus_sha256,
+    verify_evidence_index,
+)
+from fireflyframework_agentic.evaluation.matcher import anchored, matches
+from fireflyframework_agentic.evaluation.registry import Registry, registry_sha256
+
+
+@dataclass
+class GateResult:
+    gate: str  # gate label, e.g. "G1"
+    passed: bool  # False means the gate raised a flag
+    reason_code: str = ""  # flag name rendered when the gate did not pass
+    details: dict = field(default_factory=dict)  # free-form supporting data
+
+    def __str__(self) -> str:
+        # Renders "[G1] PASS" or "[G2] FLAG:<reason_code>".
+        return f"[{self.gate}] " + ("PASS" if self.passed else f"FLAG:{self.reason_code}")
+
+
+class Verdict:
+    """Promotion gate verdict constants (plain strings, not an Enum).
+
+    Use ``Verdict.PROMOTE`` when the challenger meets the quality bar and
+    is safe to become the new champion.  Use ``Verdict.HOLD`` when the
+    challenger does not meet the bar and must be iterated on.
+    """
+
+    PROMOTE: str = "PROMOTE"  # rendered on the scorecard's final VERDICT line
+    HOLD: str = "HOLD"  # rendered on the scorecard's final VERDICT line
+
+
+def render_scorecard(gate_results: list[GateResult]) -> str:
+    """Render a human-readable scorecard from a list of GateResult objects.
+
+    Emits one line per gate (``[G1] PASS`` or ``[G2] FLAG:RECALL_BELOW_FLOOR``)
+    followed by a final ``VERDICT:`` line.  The verdict is PROMOTE only when at
+    least one gate ran and every gate passed; any flag, or an empty result
+    list, yields HOLD so that "nothing was checked" never reads as a pass.
+    """
+    lines = [str(r) for r in gate_results]
+    promote = bool(gate_results) and all(r.passed for r in gate_results)
+    lines.append(f"VERDICT: {Verdict.PROMOTE if promote else Verdict.HOLD}")
+    return "\n".join(lines)
+
+
+def _build_evidence_index(result: dict, corpus: Corpus | None = None) -> dict[str, dict]:
+    """Map evidence id -> entry, filtered by corpus verification when possible.
+
+    Without a corpus every entry with a truthy id is returned.  With one, only
+    VERIFIED and EMPTY entries survive: FABRICATED excerpts and SOURCE_UNKNOWN
+    locators are dropped so they can feed neither G2's shared-source guard nor
+    G3's grounding.  EMPTY stays because a missing excerpt is a format problem,
+    not fabrication, and its locator did resolve to a corpus document.
+    """
+    by_id: dict[str, dict] = {}
+    for entry in result.get("evidence_index", []):
+        if entry.get("id"):
+            by_id[entry["id"]] = entry
+    if corpus is not None:
+        statuses = verify_evidence_index(corpus, result)
+        kept = (VERIFIED, EMPTY)
+        by_id = {eid: entry for eid, entry in by_id.items() if statuses[eid] in kept}
+    return by_id
+
+
+# ── G1: Structural & Safe ────────────────────────────────────────────────────
+
+
+def _name_duplication_rate(nodes: list[dict]) -> float:
+    """Tier-1 + Tier-2 name clustering; returns 1 - clusters/count.
+
+    Tier 1: same normalized id (lower-case) merges nodes into one cluster.
+    Tier 2: name token-Jaccard >= 0.6 merges nodes into one cluster.
+
+    A missing, null or empty id/name carries no information: such a node is
+    never merged on that tier, so id-less nodes are not lumped together.
+    Report-only: no gate flag fires on any threshold.
+    """
+    n = len(nodes)
+    if n < 2:
+        return 0.0
+
+    group = list(range(n))  # parent table: group[i] == i marks a cluster root
+
+    def _root(i: int) -> int:
+        while group[i] != i:
+            group[i] = group[group[i]]  # shortcut to the grandparent while walking up
+            i = group[i]
+        return i
+
+    seen: dict[str, int] = {}
+    for i, node in enumerate(nodes):
+        nid = str(node.get("id") or "").lower()
+        if nid:  # setdefault yields i itself on first sighting (a harmless self-merge)
+            group[_root(i)] = _root(seen.setdefault(nid, i))
+
+    toks = [frozenset(str(node.get("name") or "").lower().split()) for node in nodes]
+    for i in range(n):
+        for j in range(i + 1, n):
+            a, b = toks[i], toks[j]
+            union_ab = a | b
+            if union_ab and len(a & b) / len(union_ab) >= 0.6:
+                group[_root(i)] = _root(j)
+
+    clusters = len({_root(i) for i in range(n)})
+    return round(1 - clusters / n, 4)
+
+
+def g1_structural(
+    result: dict,
+    registry: Registry,
+    registry_path: str,
+    *,
+    pii_list: list[str] | None = None,
+    corpus: Corpus | None = None,
+) -> GateResult:
+    """G1 — Structural & Safe (hard veto).
+
+    Guards run in order and the first failure is returned immediately:
+    1. EMPTY_MUST_FIND — registry has no real items (fake-100%-champion bug).
+    2. GOLD_DRIFT — registry file on disk no longer hashes to the loaded pin.
+    3. CORPUS_DRIFT — same pin check for the corpus, when one is supplied.
+    4. SCHEMA_INVALID — a required top-level key is missing from result.
+    5. PII_LEAK — a non-empty pii_list name occurs (case-insensitively) in a
+       finding title/description or report.  On success, details carries map stats.
+    """
+    # Guard 1: empty registry (fake-champion guard — always first)
+    if not registry.real_items:
+        return GateResult(
+            gate="G1",
+            passed=False,
+            reason_code="EMPTY_MUST_FIND",
+            details={"message": "Registry has zero real items — cannot evaluate recall."},
+        )
+
+    # Guard 2: registry SHA-256 pin
+    computed_sha = registry_sha256(registry_path)
+    if computed_sha != registry.sha256():
+        return GateResult(
+            gate="G1",
+            passed=False,
+            reason_code="GOLD_DRIFT",
+            details={
+                "message": "Registry file has changed since it was loaded.",
+                "expected": registry.sha256(),
+                "actual": computed_sha,
+            },
+        )
+
+    # Guard 3: corpus SHA-256 pin (CORPUS_DRIFT — the GOLD_DRIFT twin for evidence)
+    if corpus is not None:
+        current_corpus_sha = corpus_sha256(corpus.path)
+        if current_corpus_sha != corpus.sha256:
+            return GateResult(
+                gate="G1",
+                passed=False,
+                reason_code="CORPUS_DRIFT",
+                details={
+                    "message": "Corpus file has changed since it was loaded.",
+                    "expected": corpus.sha256,
+                    "actual": current_corpus_sha,
+                },
+            )
+
+    # Guard 4: required result keys
+    required = ("process_graph", "findings", "evidence_index")
+    missing = [k for k in required if k not in result]
+    if missing:
+        return GateResult(
+            gate="G1",
+            passed=False,
+            reason_code="SCHEMA_INVALID",
+            details={"missing_keys": missing},
+        )
+
+    # Guard 5: PII check (blank names are skipped: "" is a substring of any text)
+    if pii_list:
+        free_text: list[str] = []
+        for finding in result.get("findings", []):
+            free_text.extend(str(finding.get(key) or "") for key in ("title", "description"))
+        for report in result.get("reports", []):
+            free_text.append(str(report))
+        combined = " ".join(free_text).lower()
+        hits = [name for name in pii_list if name and name.lower() in combined]
+        if hits:
+            return GateResult(
+                gate="G1",
+                passed=False,
+                reason_code="PII_LEAK",
+                details={
+                    "message": "Corpus PII names found in findings/reports.",
+                    "matches": hits[:5],
+                },
+            )
+
+    pg = result.get("process_graph", {})
+    processes = pg.get("processes", [])
+    activities = [a for p in processes for a in p.get("activities", [])]
+    decisions = [d for p in processes for d in p.get("decisions", [])]
+    dg = result.get("dependency_graph", {})
+
+    details = {
+        "registry_sha256": registry.sha256(),
+        "real_items": len(registry.real_items),
+        "nc_items": len(registry.nc_items),
+        "map": {
+            "processes": {
+                "count": len(processes),
+                "duplication_rate": _name_duplication_rate(processes),
+            },
+            "activities": {
+                "count": len(activities),
+                "duplication_rate": _name_duplication_rate(activities),
+            },
+            "decisions": {
+                "count": len(decisions),
+                "duplication_rate": _name_duplication_rate(decisions),
+            },
+            "personas": {
+                "count": len(result.get("personas", [])),
+                "duplication_rate": _name_duplication_rate(result.get("personas", [])),
+            },
+            "systems": {
+                "count": len(result.get("systems", [])),
+                "duplication_rate": _name_duplication_rate(result.get("systems", [])),
+            },
+            "informal_channels": {
+                "count": len(result.get("informal_channels", [])),
+                "duplication_rate": _name_duplication_rate(result.get("informal_channels", [])),
+            },
+            "dependency_graph_edges": len(dg.get("activity_edges", [])),
+        },
+    }
+    if corpus is not None:
+        details["corpus_sha256"] = corpus.sha256
+    return GateResult(gate="G1", passed=True, details=details)
+
+
+# ── G2: Recall & Precision ───────────────────────────────────────────────────
+
+
+def _candidates_by_scope(result: dict) -> dict[str, list[dict]]:
+    """Group a DiscoveryResult's nodes into one candidate list per scope (§4.3).
+
+    Each process candidate is a shallow copy whose evidence_refs are its own
+    refs followed by those of all its activities and then all its decisions:
+    process nodes typically carry no refs of their own, so the source-document
+    guard works on that union.
+
+    There is deliberately no "dependency_graph" key — those registry items are
+    relation items (all carry from/to) matched through
+    matcher.matches_dependency_graph_relation(), not per-candidate iteration.
+    """
+    processes = result.get("process_graph", {}).get("processes", [])
+    by_scope: dict[str, list[dict]] = {"process": [], "activity": [], "decision": []}
+
+    for proc in processes:
+        activities = proc.get("activities", [])
+        decisions = proc.get("decisions", [])
+        # Own refs first, then children in activity-then-decision order.
+        merged_refs = list(proc.get("evidence_refs", []))
+        for child in [*activities, *decisions]:
+            merged_refs.extend(child.get("evidence_refs", []))
+        by_scope["process"].append({**proc, "evidence_refs": merged_refs})
+        by_scope["activity"].extend(activities)
+        by_scope["decision"].extend(decisions)
+
+    # The remaining scopes are taken from the result's top-level lists as-is.
+    by_scope["finding"] = result.get("findings", [])
+    by_scope["action"] = result.get("proposed_actions", [])
+    by_scope["persona"] = result.get("personas", [])
+    by_scope["system"] = result.get("systems", [])
+    by_scope["informal_channel"] = result.get("informal_channels", [])
+    return by_scope
+
+
+def _weighted_recall(scored_items: list, hits: dict[str, bool]) -> float:
+    """Weight of the hit items divided by the total weight (divisor 1.0 when that total is 0)."""
+    gained = sum(entry.weight for entry in scored_items if hits[entry.id])
+    possible = sum(entry.weight for entry in scored_items)
+    return gained / (possible or 1.0)
+
+
+def _finding_redundancy_rate(findings: list[dict]) -> float:
+    """Fraction of findings whose description near-duplicates another's (Jaccard >= 0.6 on >=5-char tokens)."""
+    total = len(findings)
+    if total < 2:
+        return 0.0
+    # Lower-cased whitespace tokens of 5+ chars; two empty token sets count as identical.
+    bags = [
+        frozenset(word.lower() for word in item.get("description", "").split() if len(word) >= 5)
+        for item in findings
+    ]
+    flagged: set[int] = set()
+    for right in range(1, total):
+        for left in range(right):
+            merged = bags[left] | bags[right]
+            if not merged or len(bags[left] & bags[right]) / len(merged) >= 0.6:
+                flagged.update((left, right))
+    return round(len(flagged) / total, 4)
+
+
+def g2_recall_precision(
+    result: dict,
+    registry: Registry,
+    *,
+    recall_floor: float = 0.70,
+    embed_fn=None,
+    tau: float = 0.70,
+    tau_nc: float = 0.85,
+    recall_metric: str = "lexical",
+    corpus: Corpus | None = None,
+) -> GateResult:
+    """G2 — Recall & Precision (hard veto).
+
+    - L0 miss  -> BLOCK (zeros the evaluation; regulatory-mandatory item absent)
+    - NC hit   -> BLOCK (precision failure; plausible-but-false item was emitted)
+    - recall < floor -> BLOCK
+
+    With a ``corpus``, evidence entries that fail verification (fabricated
+    excerpt or unknown source) are excluded from the evidence index before
+    matching, so the shared-source guard only accepts citations to real
+    corpus documents — a fabricated locator cannot satisfy any item.
+
+    ``recall_metric`` ("lexical"/"semantic"/"hybrid") selects which hit map GATES.
+    "lexical" is matcher.matches (shared-source + topic-anchored token overlap) and
+    needs no embedder.  "semantic"/"hybrid" add the embedding path (matcher.semantic_hits,
+    threshold ``tau`` for real items, ``tau_nc`` for NC items) and REQUIRE ``embed_fn``
+    — passing them without one raises ValueError (use "lexical" for the offline path).
+    When an embedder is supplied, all three recalls (lexical/semantic/hybrid) are
+    reported in details regardless of which one gates.
+    """
+    # Validate the metric selection up front so a bad argument fails fast,
+    # before any evidence indexing or matching work is done.
+    if recall_metric not in ("lexical", "semantic", "hybrid"):
+        raise ValueError(f"unknown recall_metric {recall_metric!r}")
+    if recall_metric in ("semantic", "hybrid") and embed_fn is None:
+        raise ValueError(f"recall_metric={recall_metric!r} requires an embedder; pass embed_fn")
+
+    evidence_index = _build_evidence_index(result, corpus)
+    candidates = _candidates_by_scope(result)
+    findings = candidates["finding"]
+
+    # NC items anchor via the embedding path only (§6.2): a correct finding about
+    # the true mirror fact shares vocabulary with the false description, so a
+    # token or keyword match would falsely convict it.  Lexical NC is always False.
+    # dependency_graph relation items (those with from_node) use the endpoint
+    # matcher (§5.3b) instead of the per-candidate text predicate.
+    lexical: dict[str, bool] = {}
+    for item in registry.items:
+        if item.tier == "NC":
+            lexical[item.id] = False
+        elif item.scope == "dependency_graph" and item.from_node:
+            lexical[item.id] = matcher.matches_dependency_graph_relation(
+                item, result, evidence_index
+            )
+        else:
+            lexical[item.id] = any(
+                matches(c, item, evidence_index, scope=scope)
+                for scope in matcher.allowed_scopes(item)
+                for c in candidates.get(scope, [])
+            )
+
+    if embed_fn is not None:
+        semantic = matcher.semantic_hits(
+            candidates, registry.items, evidence_index, embed_fn, tau, tau_nc=tau_nc
+        )
+        # dependency_graph relation items have no embedding candidates (§5.3b uses
+        # the endpoint matcher, not per-candidate text embeddings); mirror the
+        # lexical result so semantic/hybrid never under-credits them.
+        for item in registry.items:
+            if item.scope == "dependency_graph" and item.from_node:
+                semantic[item.id] = lexical[item.id]
+    else:
+        semantic = None
+
+    metric = recall_metric
+
+    if semantic is None or metric == "lexical":
+        hits = lexical
+    elif metric == "semantic":
+        hits = semantic
+    else:  # hybrid
+        hits = {iid: lexical[iid] or semantic[iid] for iid in lexical}
+
+    # Signal-to-noise panel — report-only, §6.2 item 3
+    finding_count = len(findings)
+    finding_scoped_items = [i for i in registry.real_items if i.scope == "finding"]
+    findings_matched = sum(
+        1 for f in findings
+        if any(matches(f, item, evidence_index, scope="finding") for item in finding_scoped_items)
+    )
+    _sn = {
+        "finding_count": finding_count,
+        "findings_matched_to_registry": {
+            "count": findings_matched,
+            "fraction": round(findings_matched / finding_count, 4) if finding_count else 0.0,
+        },
+        "finding_redundancy_rate": _finding_redundancy_rate(findings),
+    }
+    if corpus is not None:
+        excluded = len(_build_evidence_index(result)) - len(evidence_index)
+        _sn["evidence_entries_excluded_unverified"] = excluded
+
+    # L0 misses
+    l0_misses = [item.id for item in registry.l0_items if not hits[item.id]]
+    if l0_misses:
+        return GateResult(
+            gate="G2",
+            passed=False,
+            reason_code="L0_MISSING",
+            details={
+                "l0_misses": l0_misses,
+                "message": "Regulatory-mandatory items not found — evaluation zeroed.",
+                **_sn,
+            },
+        )
+
+    # NC precision
+    nc_hits = [item.id for item in registry.nc_items if hits[item.id]]
+    if nc_hits:
+        return GateResult(
+            gate="G2",
+            passed=False,
+            reason_code="NC_HIT",
+            details={
+                "nc_hits": nc_hits,
+                "message": "Plausible-but-false negative control items were matched — precision failure.",
+                **_sn,
+            },
+        )
+
+    # Weighted recall — over scored items only (L0/L1/L2).  L3 is a bonus tier
+    # ("extra credit"): an L3 miss must not lower recall, so L3 is excluded from
+    # the denominator and only reported in per_tier below.  Recall is computed over
+    # the GATING hit map so the gate is internally consistent with the chosen metric.
+    real_items = registry.real_items
+    scored_items = [item for item in real_items if item.tier != "L3"]
+    recall = _weighted_recall(scored_items, hits)
+
+    per_tier: dict[str, dict] = {}
+    for tier in ("L0", "L1", "L2", "L3"):
+        tier_items = [i for i in real_items if i.tier == tier]
+        if not tier_items:
+            continue
+        per_tier[tier] = {
+            "hit": sum(1 for i in tier_items if hits[i.id]),
+            "total": len(tier_items),
+        }
+
+    def _semantic_details() -> dict:
+        """The extra recall-breakdown keys, only emitted when an embedder is given."""
+        if semantic is None:
+            return {}
+        return {
+            "lexical_recall": round(_weighted_recall(scored_items, lexical), 4),
+            "semantic_recall": round(_weighted_recall(scored_items, semantic), 4),
+            "hybrid_recall": round(
+                _weighted_recall(
+                    scored_items, {iid: lexical[iid] or semantic[iid] for iid in lexical}
+                ),
+                4,
+            ),
+            "tau": tau,
+        }
+
+    if recall < recall_floor:
+        return GateResult(
+            gate="G2",
+            passed=False,
+            reason_code="RECALL_BELOW_FLOOR",
+            details={
+                "recall": round(recall, 4),
+                "recall_metric": metric,
+                "floor": recall_floor,
+                "per_tier": per_tier,
+                "misses": [item.id for item in scored_items if not hits[item.id]],
+                **_semantic_details(),
+                **_sn,
+            },
+        )
+
+    return GateResult(
+        gate="G2",
+        passed=True,
+        details={
+            "recall": round(recall, 4),
+            "recall_metric": metric,
+            "floor": recall_floor,
+            "per_tier": per_tier,
+            "nc_items_checked": len(registry.nc_items),
+            **_semantic_details(),
+            **_sn,
+        },
+    )
+
+
+# ── G3: Grounded ─────────────────────────────────────────────────────────────
+
+
+def g3_grounded(
+    result: dict,
+    *,
+    grounding_floor: float = 0.90,
+    human_spot_check_n: int = 5,
+    corpus: Corpus | None = None,
+) -> GateResult:
+    """G3 — Grounded (automated portion; human spot-check triggered on pass).
+
+    For each finding, verifies that at least one cited evidence excerpt shares a
+    non-trivial token with the finding description (topic-anchoring).
+
+    With a ``corpus``, the gate also looks in a third direction — cited ->
+    exists: every evidence entry is verified against the actual corpus text
+    (corpus.verify_entry).  A populated excerpt not found in its cited source
+    raises EVIDENCE_FABRICATED; a locator resolving to no corpus document
+    raises EVIDENCE_SOURCE_UNKNOWN; and only verified excerpts can ground a
+    finding, so a run cannot ground itself on evidence it invented.
+
+    Also reports excerpt fill rate and source coverage so the reviewer can tell
+    whether ungrounded findings are a format problem (empty excerpts) or a real
+    faithfulness signal (populated excerpts that do not anchor).
+
+    Known limitation: topic-anchoring, not claim entailment.  A '45 days' claim
+    cited to a '3 days' source passes if they share the process name (excerpt
+    verification confirms the quote is real, not that the claim matches it).
+    The human spot-check is the binding faithfulness signal until NLI/AIS lands.
+    """
+    evidence_index = _build_evidence_index(result)
+    findings = result.get("findings", [])
+    statuses = verify_evidence_index(corpus, result) if corpus is not None else None
+
+    if not findings:
+        return GateResult(
+            gate="G3",
+            passed=False,
+            reason_code="NO_FINDINGS",
+            details={"message": "Result has zero findings — cannot compute grounding."},
+        )
+
+    grounded_ids: list[str] = []
+    # Ungrounded split (§6.3): distinguish format issues from real faithfulness failures.
+    ungrounded_empty_only: list[str] = []    # every ref had an empty excerpt
+    ungrounded_populated: list[str] = []     # had populated excerpt(s) but none anchored
+
+    # Excerpt fill: count all resolved refs and how many carry a non-empty excerpt.
+    total_refs = 0
+    populated_refs = 0
+
+    # Source coverage: which source stems are cited by at least one finding.
+    cited_stems: set[str] = set()
+
+    for finding in findings:
+        fid = finding.get("id", "?")
+        desc = finding.get("description", "")
+        is_grounded = False
+        had_populated = False
+        for ref in finding.get("evidence_refs", []):
+            ev = evidence_index.get(ref.get("evidence_id", ""))
+            if ev:
+                total_refs += 1
+                excerpt = ev.get("excerpt") or ""
+                if excerpt:
+                    populated_refs += 1
+                    had_populated = True
+                    # Track source coverage (even for ungrounded findings).
+                    stem = matcher.source_stem(ev.get("locator", ""))
+                    if stem:
+                        cited_stems.add(stem)
+                    # Only a corpus-verified excerpt can ground a finding.
+                    if statuses is not None and statuses.get(ev.get("id")) != VERIFIED:
+                        continue
+                    # No break: later refs must still count toward fill/coverage.
+                    if not is_grounded and anchored(desc, excerpt):
+                        is_grounded = True
+        if is_grounded:
+            grounded_ids.append(fid)
+        elif had_populated:
+            ungrounded_populated.append(fid)
+        else:
+            ungrounded_empty_only.append(fid)
+
+    grounding_pct = len(grounded_ids) / len(findings)
+
+    # All source stems present in the evidence index (not just those cited).
+    all_stems: set[str] = set()
+    for ev in result.get("evidence_index", []):
+        stem = matcher.source_stem(ev.get("locator", ""))
+        if stem:
+            all_stems.add(stem)
+    orphaned = sorted(all_stems - cited_stems)
+
+    excerpt_fill = f"{populated_refs}/{total_refs}" if total_refs else "0/0"
+    source_coverage = f"{len(cited_stems)}/{len(all_stems)}" if all_stems else "0/0"
+
+    details = {
+        "grounding_pct": round(grounding_pct, 4),
+        "grounded": len(grounded_ids),
+        "total": len(findings),
+        "excerpt_fill": excerpt_fill,
+        "source_coverage": source_coverage,
+        "orphaned_sources": orphaned,
+    }
+
+    fabricated_ids: list[str] = []
+    unknown_source_ids: list[str] = []
+    if statuses is not None:
+        fabricated_ids = sorted(e for e, s in statuses.items() if s == FABRICATED)
+        unknown_source_ids = sorted(e for e, s in statuses.items() if s == SOURCE_UNKNOWN)
+        details["evidence_verification"] = {
+            "entries": len(statuses),
+            "verified": sum(1 for s in statuses.values() if s == VERIFIED),
+            "empty_excerpt": sum(1 for s in statuses.values() if s == EMPTY),
+            "fabricated": fabricated_ids,
+            "source_unknown": unknown_source_ids,
+        }
+
+    if fabricated_ids:
+        details["message"] = (
+            "Populated excerpt(s) not found in the cited corpus document — "
+            "the run asserts evidence the source does not contain."
+        )
+        return GateResult(
+            gate="G3", passed=False, reason_code="EVIDENCE_FABRICATED", details=details
+        )
+
+    if unknown_source_ids:
+        details["message"] = (
+            "Evidence locator(s) resolve to no corpus document — either the "
+            "corpus bundle is incomplete or the run invented a source."
+        )
+        return GateResult(
+            gate="G3", passed=False, reason_code="EVIDENCE_SOURCE_UNKNOWN", details=details
+        )
+
+    if grounding_pct < grounding_floor:
+        details["floor"] = grounding_floor
+        details["ungrounded_with_populated_excerpts"] = ungrounded_populated
+        details["ungrounded_with_empty_excerpts_only"] = ungrounded_empty_only
+        return GateResult(gate="G3", passed=False, reason_code="UNGROUNDED", details=details)
+
+    spot_n = min(human_spot_check_n, len(findings))
+    details["human_spot_check"] = (
+        f"ACTION REQUIRED: manually review {spot_n} sampled findings for "
+        "field-consistency, citation-accuracy, and client-readiness.  "
+        "This is the binding faithfulness signal until NLI/AIS lands."
+    )
+    return GateResult(gate="G3", passed=True, details=details)
+
+
+# ── G5: No-regression / promotion (human decision) ───────────────────────────
+
+
+def g5_no_regression(
+    candidate_scores: dict[str, float],
+    champion_scores: dict[str, float] | None,
+    aa_noise: dict[str, float] | None,
+    *,
+    is_day_zero: bool = False,
+    human_signed_off: bool = False,
+    signoff_count: int = 0,
+) -> GateResult:
+    """G5 — No-regression / promotion gate (human decision).
+
+    Day-Zero (``is_day_zero`` or no champion): passes only with >= 2 sign-offs.
+    Normal promotion: HOLD without ``human_signed_off``; otherwise HOLD iff any
+    shared metric drops more than its A/A noise band (default band 0.0).
+    NOTE(review): G1-G3 are not checked here, and gains are reported, not required.
+    Champions are per-corpus.  Do not compare across corpora.
+    """
+    if is_day_zero or champion_scores is None:
+        required = 2
+        if signoff_count < required:
+            return GateResult(
+                gate="G5",
+                passed=False,
+                reason_code="HOLD",
+                details={
+                    "reason": (
+                        f"Day-Zero requires {required} independent human sign-offs "
+                        f"(kappa >= 0.70); got {signoff_count}."
+                    ),
+                    "action": "Collect sign-offs, then re-run with --day-zero --signoffs 2",
+                },
+            )
+        return GateResult(
+            gate="G5",
+            passed=True,
+            details={"day_zero": True, "signoffs": signoff_count},
+        )
+
+    if not human_signed_off:
+        return GateResult(
+            gate="G5",
+            passed=False,
+            reason_code="HOLD",
+            details={"reason": "Human sign-off required for promotion."},
+        )
+
+    noise = aa_noise or {}
+    regressions: list[str] = []
+    improvements: list[str] = []
+
+    for metric, cand_val in candidate_scores.items():
+        champ_val = champion_scores.get(metric)
+        if champ_val is None:
+            continue
+        delta = cand_val - champ_val
+        band = noise.get(metric, 0.0)
+        if delta < -band:
+            regressions.append(
+                f"{metric}: candidate={cand_val:.4f} champion={champ_val:.4f} "
+                f"delta={delta:+.4f} < -band={-band:.4f}"
+            )
+        elif delta > band:
+            improvements.append(f"{metric}: delta={delta:+.4f} > band={band:.4f}")
+
+    if regressions:
+        return GateResult(
+            gate="G5",
+            passed=False,
+            reason_code="HOLD",
+            details={
+                "regressions": regressions,
+                "improvements": improvements,
+                "message": "Guardrail metric(s) regressed beyond A/A noise band.",
+            },
+        )
+
+    return GateResult(
+        gate="G5",
+        passed=True,
+        details={"improvements": improvements, "noise_band": noise},
+    )
+
+
+# ── Full gate pipeline ────────────────────────────────────────────────────────
+
+
+def run_gates(
+    result: dict,
+    registry: Registry,
+    registry_path: str,
+    *,
+    pii_list: list[str] | None = None,
+    recall_floor: float = 0.70,
+    grounding_floor: float = 0.90,
+    champion_scores: dict[str, float] | None = None,
+    aa_noise: dict[str, float] | None = None,
+    is_day_zero: bool = False,
+    human_signed_off: bool = False,
+    signoff_count: int = 0,
+    embed_fn=None,
+    tau: float = 0.70,
+    tau_nc: float = 0.85,
+    recall_metric: str = "lexical",
+    corpus: Corpus | None = None,
+) -> list[GateResult]:
+    """Run all gates G1 -> G2 -> G3 -> G5; every gate always executes.
+
+    A failed gate raises a flag in its GateResult but never prevents the
+    remaining gates from running, so the scorecard always carries the complete
+    picture.  See EVALUATION_FRAMEWORK.md §2 ('No gate vetoes').
+
+    ``corpus`` (optional) enables deterministic evidence verification: G1 pins
+    the corpus hash, G2 ignores unverified evidence entries, and G3 flags
+    fabricated excerpts and unknown sources.  Without it, evidence is taken at
+    face value from the run's own evidence_index (disclosed on the scorecard).
+
+    G5 scores G2's ``recall`` and G3's ``grounding_pct`` (0.0 when a gate omits
+    the metric).  Returns the four GateResult objects as [g1, g2, g3, g5].
+    """
+    g1 = g1_structural(result, registry, registry_path, pii_list=pii_list, corpus=corpus)
+
+    g2 = g2_recall_precision(
+        result,
+        registry,
+        recall_floor=recall_floor,
+        embed_fn=embed_fn,
+        tau=tau,
+        tau_nc=tau_nc,
+        recall_metric=recall_metric,
+        corpus=corpus,
+    )
+
+    g3 = g3_grounded(result, grounding_floor=grounding_floor, corpus=corpus)
+
+    # G5 uses whatever scores G2/G3 produced; 0.0 when a gate flagged and did
+    # not emit the metric (e.g. L0_MISSING returns before computing recall).
+    candidate_scores = {
+        "recall": g2.details.get("recall", 0.0),
+        "grounding_pct": g3.details.get("grounding_pct", 0.0),
+    }
+    g5 = g5_no_regression(
+        candidate_scores,
+        champion_scores,
+        aa_noise,
+        is_day_zero=is_day_zero,
+        human_signed_off=human_signed_off,
+        signoff_count=signoff_count,
+    )
+
+    return [g1, g2, g3, g5]

From d964ba10735b918ed8e62ae7a5b1533238696495 Mon Sep 17 00:00:00 2001
From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com>
Date: Thu, 18 Jun 2026 23:46:21 +0200
Subject: [PATCH 05/48] feat(evaluation): add scorecard renderer (#272)

* feat(evaluation): add scorecard renderer

* feat(evaluation): export render_scorecard, verdict, VERDICT_PROMOTE/HOLD from scorecard module

---------

Co-authored-by: miguelgfierro <miguelgfierro@users.noreply.github.com>
---
 .../evaluation/__init__.py                    |   6 +-
 .../evaluation/scorecard.py                   | 489 ++++++++++++++++++
 2 files changed, 494 insertions(+), 1 deletion(-)
 create mode 100644 fireflyframework_agentic/evaluation/scorecard.py

diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py
index 401244c9..61562db3 100644
--- a/fireflyframework_agentic/evaluation/__init__.py
+++ b/fireflyframework_agentic/evaluation/__init__.py
@@ -32,7 +32,8 @@
 from importlib.metadata import PackageNotFoundError, version
 
 from fireflyframework_agentic.evaluation.corpus import EMPTY, FABRICATED, SOURCE_UNKNOWN, VERIFIED, corpus_sha256, load_corpus, verify_evidence_index
-from fireflyframework_agentic.evaluation.gates import GateResult, Verdict, g2_recall_precision, render_scorecard, run_gates
+from fireflyframework_agentic.evaluation.gates import GateResult, Verdict, g2_recall_precision, run_gates
+from fireflyframework_agentic.evaluation.scorecard import render_scorecard, verdict, VERDICT_PROMOTE, VERDICT_HOLD
 from fireflyframework_agentic.evaluation.champion import ChampionRecord, invalidate_champion, load_champion, save_champion
 from fireflyframework_agentic.evaluation.judge import AdvisoryReport, run_judge
 from fireflyframework_agentic.evaluation.matcher import anchored, matches, source_stem, tokens
@@ -58,6 +59,9 @@
     "g2_recall_precision",
     "run_gates",
     "render_scorecard",
+    "verdict",
+    "VERDICT_PROMOTE",
+    "VERDICT_HOLD",
     "ChampionRecord",
     "load_champion",
     "save_champion",
diff --git a/fireflyframework_agentic/evaluation/scorecard.py b/fireflyframework_agentic/evaluation/scorecard.py
new file mode 100644
index 00000000..b34885e8
--- /dev/null
+++ b/fireflyframework_agentic/evaluation/scorecard.py
@@ -0,0 +1,489 @@
+# Copyright 2026 Firefly Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Scorecard renderer: gate results -> Markdown report.
+
+Every scorecard states whether it is self-graded.  Until Phase 3 independent
+re-annotation lands, all Lean-Core PROMOTE verdicts are self-graded against
+team-authored ground truth.  See EVALUATION_FRAMEWORK.md.
+"""
+
+from __future__ import annotations
+
+import json
+
+VERDICT_PROMOTE = "PROMOTE"
+VERDICT_HOLD = "HOLD"
+
+
+def verdict(gate_results: list) -> str:
+    """Return VERDICT_PROMOTE only when every gate passed and a G5 result is present."""
+    # An empty list has nothing to certify, so it can never promote.
+    if not gate_results:
+        return VERDICT_HOLD
+    every_gate_passed = all(entry.passed for entry in gate_results)
+    # The G5 presence check only matters once every gate has passed.
+    if every_gate_passed and any(entry.gate == "G5" for entry in gate_results):
+        return VERDICT_PROMOTE
+    return VERDICT_HOLD
+
+
+def render_scorecard(
+    gate_results: list,
+    *,
+    corpus: str = "unknown",
+    model_id: str = "unknown",
+    run_id: str = "run",
+    is_self_graded: bool = True,
+    kappa_advisory: bool = False,
+    evidence_unverified: bool = False,
+    bpi2017_f1: float | None = None,
+    advisory=None,
+    config: dict | None = None,
+    experiment_config: dict | None = None,
+) -> str:
+    """Render a Markdown scorecard: banners per flag, gates, then G5 and Analysis.
+
+    G5 is held back and rendered after the optional BPI-2017 anchor and G4 advisory.
+    """
+    v = verdict(gate_results)  # PROMOTE/HOLD string shown in the header
+    lines = [
+        "# FlyRadar Evaluation Scorecard",
+        "",
+        f"**Corpus**: {corpus}",
+        f"**Model**: {model_id}",
+        f"**Run**: {run_id}",
+        f"**Verdict**: **{v}**",
+        "",
+    ]
+
+    if is_self_graded:
+        lines += [
+            "> **SELF-GRADED**: All ground truth (must-find, gold, DILO, human sign-off) is",
+            "> authored by the FlyRadar team.  This PROMOTE has no contamination-free signal",
+            "> until Phase 3.  See EVALUATION_FRAMEWORK.md.",
+            "",
+        ]
+
+    if kappa_advisory:
+        lines += [
+            "> **ADVISORY**: Registry kappa < 0.70 — a second independent annotator has not",
+            "> verified the must-find items.  Promotion is advisory for this corpus until",
+            "> kappa >= 0.70 from an independent re-annotation.",
+            "",
+        ]
+
+    if evidence_unverified:
+        lines += [
+            "> **EVIDENCE UNVERIFIED**: no corpus supplied (--corpus) — evidence locators",
+            "> and excerpts are taken at face value from the run's own evidence_index.",
+            "> Grounding certifies self-consistency, not corpus reality.  Supply the run's",
+            "> input.json to enable deterministic excerpt verification (G3, §6.3).",
+            "",
+        ]
+
+    if experiment_config is not None:
+        lines += [
+            "## Experiment configuration",
+            "How this run was generated. Recorded fields (cost, tokens, latency, agents) are "
+            "read from the run's output.json; `model` is the value passed to the harness via "
+            "--model-id. Generation params (temperature, prompt/pipeline version, seed) are not "
+            "captured in output.json.",
+            "",
+            "```json",
+            json.dumps(experiment_config, indent=2, default=str),
+            "```",
+            "",
+        ]
+
+    if config is not None:
+        lines += [
+            "## Evaluation configuration",
+            "These are the parameters used to compute the evaluation.",
+            "",
+            "```json",
+            json.dumps(config, indent=2, default=str),
+            "```",
+            "",
+        ]
+
+    lines += ["## Gate Results", ""]
+    g5_result = None  # G5 is held back here and rendered after the G4 advisory
+    for g in gate_results:
+        if g.gate == "G5":
+            g5_result = g
+            continue
+        status = "PASS" if g.passed else f"FLAG ({g.reason_code})"
+        lines.append(f"### {g.gate}: {status}")
+        if g.details:
+            lines.append("```json")
+            lines.append(json.dumps(g.details, indent=2, default=str))
+            lines.append("```")
+        lines.append("")
+
+    if bpi2017_f1 is not None:
+        ok = bpi2017_f1 >= 0.60  # threshold only changes the label; never the verdict
+        anchor_status = "PASS (>= 0.60)" if ok else "BELOW THRESHOLD (< 0.60)"
+        lines += [
+            "## External Sanity Anchor (non-blocking)",
+            f"BPI-2017 variant-recovery F1: **{bpi2017_f1:.3f}** — {anchor_status}",
+            "_One non-self-graded signal.  Non-blocking; informational only._",
+            "",
+        ]
+
+    if advisory is not None:
+        lines += _render_advisory(advisory)
+
+    if g5_result is not None:
+        status = "PASS" if g5_result.passed else f"FLAG ({g5_result.reason_code})"
+        lines.append(f"### G5: {status}")
+        if g5_result.details:
+            lines.append("```json")
+            lines.append(json.dumps(g5_result.details, indent=2, default=str))
+            lines.append("```")
+        lines.append("")
+
+    lines += _render_analysis(gate_results, advisory)
+
+    return "\n".join(lines)
+
+
+def _num(x) -> str:
+    """Render a metric leaf as text: 'n/a' for None, 3 decimals for floats, str() otherwise."""
+    if isinstance(x, float):
+        return format(x, ".3f")
+    if x is None:
+        return "n/a"
+    return str(x)
+
+
+def _render_advisory(report) -> list[str]:
+    """Render the non-blocking G4 LLM-as-a-Judge section from an AdvisoryReport.
+
+    Best-effort: only metrics present in report.metrics are shown.  G4 never
+    affects the PROMOTE/HOLD verdict; this section is decision-support for the
+    G5 human sign-off, and is advisory until LLM-as-a-Judge calibration (§10).
+    """
+    m = report.metrics  # metric name -> field mapping; only keys present are rendered
+    cal = "calibrated" if report.calibrated else "uncalibrated"
+    lines = [
+        "## G4 — LLM-as-a-Judge (non-blocking — does NOT affect the PROMOTE/HOLD verdict)",
+        f"Judge: {report.judge_model} · {cal} · {report.runs}-run median",
+    ]
+    if report.same_provider_caveat:
+        lines.append("> same-provider as the pipeline — results may share blind spots.")
+    lines.append("```text")  # one summary line per metric goes inside this fenced block
+
+    if "faithfulness" in m:
+        d = m["faithfulness"]
+        u = d.get("unsupported_ids", [])
+        extra = f"   (unsupported: {', '.join(u)})" if u else ""
+        lines.append(
+            f"Faithfulness (entailment):       {d.get('supported')}/{d.get('total')} supported{extra}"
+        )
+    if "numeric_temporal_fidelity" in m:
+        lines.append(
+            f"Numeric/temporal fidelity:       {m['numeric_temporal_fidelity'].get('count', 0)} mismatch(es)"
+        )
+    if "citation_relevance" in m:
+        d = m["citation_relevance"]
+        lines.append(
+            f"Citation relevance (ctx-prec):   {_num(d.get('precision'))}   ({d.get('relevant')}/{d.get('total')})"
+        )
+    if "semantic_recovery" in m:
+        d = m["semantic_recovery"]
+        rec = d.get("recovered", [])
+        rids = ", ".join(r.get("id", "") for r in rec) if rec else "none"
+        lines.append(
+            f"Semantic recovery (ctx-recall):  lexical {_num(d.get('lexical_recall'))} -> {_num(d.get('recovered_recall'))}   (recovered: {rids})"
+        )
+    if "nc_semantic_precision" in m:
+        d = m["nc_semantic_precision"]
+        a = d.get("asserted_ids", [])
+        extra = f"   ({', '.join(a)})" if a else ""
+        lines.append(f"NC semantic precision:           {d.get('asserted', 0)} asserted{extra}")
+    if "fabricated_entity" in m:
+        lines.append(f"Fabricated-entity check:         {m['fabricated_entity'].get('count', 0)}")
+    if "contradiction" in m:
+        lines.append(f"Contradiction detection:         {m['contradiction'].get('count', 0)}")
+    if "actionability" in m:
+        d = m["actionability"]
+        lines.append(
+            f"Actionability:                   {_num(d.get('score'))}   (rated {d.get('rated', 0)})"
+        )
+    if "severity_calibration" in m:
+        d = m["severity_calibration"]
+        lines.append(
+            f"Severity calibration:            {d.get('miscalibrated', 0)}/{d.get('total', 0)} miscalibrated"
+        )
+    if "answer_relevancy" in m:
+        lines.append(f"Answer relevancy:                {_num(m['answer_relevancy'].get('score'))}")
+    if "comparative_vs_champion" in m:
+        lines.append(
+            f"Comparative vs champion:         more consistent -> {m['comparative_vs_champion'].get('more_consistent', 'n/a')}"
+        )
+    if "source_coverage" in m:
+        d = m["source_coverage"]
+        o = d.get("orphaned", [])
+        extra = f"   (orphaned: {', '.join(o)})" if o else ""
+        lines.append(
+            f"Source coverage [D]:             {d.get('cited')}/{d.get('total')} documents cited{extra}"
+        )
+    if "excerpt_fill_rate" in m:
+        d = m["excerpt_fill_rate"]
+        lines.append(
+            f"Evidence-excerpt fill [D]:       {d.get('populated')}/{d.get('total')} populated"
+        )
+    if "open_gap" in m:
+        gap = (m["open_gap"].get("gap") or "").strip()  # null/blank gap text is omitted
+        if gap:
+            lines.append(f"Open gap probe:                  {gap}")
+    if report.errors:  # surface failed metrics inside the summary block
+        lines.append(f"(errors: {len(report.errors)} metric(s) failed: {'; '.join(report.errors)})")
+    lines.append("```")
+    # Full detail — nothing truncated: every id, pair, verdict, and complete text.
+    lines += [
+        "",
+        "**G4 — full metric detail:**",
+        "```json",
+        json.dumps({"metrics": report.metrics, "details": report.details}, indent=2, default=str),
+        "```",
+    ]
+    lines.append(
+        "> Decision support for the G5 human sign-off; advisory until LLM-as-a-Judge calibration (§10)."
+    )
+    lines.append("")
+    return lines
+
+
+def _render_analysis(gate_results: list, advisory=None) -> list[str]:
+    """Render a plain-language interpretation of all evaluation signals.
+
+    Reads the G2, G3 and G5 entries of ``gate_results`` plus the optional G4
+    ``advisory`` report and returns the markdown lines of the Analysis section.
+    """
+    g2 = next((g for g in gate_results if g.gate == "G2"), None)
+    g3 = next((g for g in gate_results if g.gate == "G3"), None)
+
+    lines = ["## Analysis", ""]
+
+    # ── Topic coverage (G2) ──────────────────────────────────────────────────
+    lines.append("### Topic coverage (G2)")
+    if g2 and g2.details:
+        d = g2.details
+        recall = d.get("recall", 0.0)
+        tiers = d.get("per_tier", {})
+        finding_count = d.get("finding_count", 0)
+        redundancy = d.get("finding_redundancy_rate", 0.0)
+        matched = d.get("findings_matched_to_registry", {}).get("fraction", 0.0)
+
+        tier_summary = ", ".join(
+            f"{t} {v['hit']}/{v['total']}"
+            for t, v in tiers.items()
+            if "hit" in v and "total" in v
+        )
+        lines.append(
+            f"Lexical recall is **{recall:.3f}** ({tier_summary}). "
+            f"The run produced {finding_count} findings, "
+            f"all of which map to a registry item (match rate {matched:.0%}). "
+        )
+        if redundancy > 0.15:
+            lines.append(
+                f"Finding redundancy is **{redundancy:.0%}** — a meaningful share of "
+                "findings are near-duplicates of each other (Jaccard ≥ 0.6). "
+                "The run is covering the same ground multiple times rather than broadening coverage."
+            )
+        else:
+            lines.append(
+                f"Finding redundancy is low ({redundancy:.0%}): each finding addresses a distinct topic."
+            )
+        lines.append(
+            "_G2 is a topic-level test. A recall of 1.000 means every required topic was "
+            "mentioned somewhere — it does not verify that the specific claims about those "
+            "topics are accurate. Claim accuracy is G4 Faithfulness._"
+        )
+    else:
+        lines.append("G2 result unavailable.")
+    lines.append("")
+
+    # ── Evidence quality (G3) ────────────────────────────────────────────────
+    lines.append("### Evidence quality (G3)")
+    if g3 and g3.details:
+        d = g3.details
+        grounding = d.get("grounding_pct", 0.0)
+        ev = d.get("evidence_verification", {})
+        verified = ev.get("verified", 0)
+        entries = ev.get("entries", 0)
+        fabricated = ev.get("fabricated", [])
+        unknown = ev.get("source_unknown", [])
+        orphaned = d.get("orphaned_sources", [])
+        source_cov = d.get("source_coverage", "")
+
+        lines.append(
+            f"Grounding is **{grounding:.0%}**: every finding cites at least one "
+            "corpus document, and all excerpts are populated. "
+            f"Evidence verification checked {entries} entries against the raw corpus: "
+            f"{verified} verified"
+            + (f", **{len(fabricated)} fabricated** (locators that do not exist in the corpus)" if fabricated else "")
+            + (f", **{len(unknown)} source-unknown** (locators that resolve to no corpus file)" if unknown else "")
+            + "."
+        )
+        if unknown:
+            lines.append(
+                f"The source-unknown locator(s) are: `{'`, `'.join(unknown)}`. "
+                "This is most likely a corpus bundle gap rather than a hallucinated source — "
+                "verify that all expected files are included in `input.json`."
+            )
+        if orphaned:
+            lines.append(
+                f"**{len(orphaned)} corpus documents were never cited** by this run "
+                f"({', '.join(orphaned)}). These are blind spots: the run extracted nothing "
+                "from these sources, so any findings they contain are silently missed."
+            )
+        if source_cov:
+            try:  # a malformed "cited/total" value must not crash the whole render
+                cited, total = (int(x) for x in str(source_cov).split("/"))
+            except ValueError:
+                cited = total = 0
+            if cited < total:
+                lines.append(f"Overall source coverage is {cited}/{total} — {total - cited} corpus file(s) left entirely uncited.")
+    else:
+        lines.append("G3 result unavailable.")
+    lines.append("")
+
+    # ── Claim accuracy (G4) ──────────────────────────────────────────────────
+    if advisory is not None:
+        m = advisory.metrics
+        lines.append("### Claim accuracy (G4 — advisory)")
+
+        faith = m.get("faithfulness", {})
+        supported = faith.get("supported", 0)
+        total_f = faith.get("total", 0)
+        if total_f:
+            faith_pct = supported / total_f
+            lines.append(
+                f"**Faithfulness: {supported}/{total_f} findings ({faith_pct:.0%}) are entailed by their cited evidence.** "
+            )
+            if faith_pct < 0.5:
+                lines.append(
+                    "This is a critical signal: the majority of findings contain claims "
+                    "that the judge cannot verify from the cited sources. "
+                    "The run is presenting inferences, extrapolations, or hallucinated details "
+                    "as if they were directly evidenced. "
+                    "Each unsupported finding should be reviewed against its cited document before use."
+                )
+            elif faith_pct < 0.8:
+                lines.append(
+                    "A significant minority of findings contain claims not traceable to cited sources. "
+                    "These may be reasonable inferences, but they should be flagged for human verification."
+                )
+            else:
+                lines.append("Most findings are directly supported by their cited evidence.")
+
+        ntf = m.get("numeric_temporal_fidelity", {})
+        mismatch_count = ntf.get("count", 0)
+        if mismatch_count:
+            lines.append(
+                f"**Numeric/temporal fidelity: {mismatch_count} mismatches detected.** "
+                "Specific figures — FTE costs, durations, timestamps, percentages, case IDs — "
+                "appear in findings but cannot be traced to the cited evidence. "
+                "These numbers should be treated as estimates or fabrications until verified "
+                "against the source documents."
+            )
+
+        fab = m.get("fabricated_entity", {})
+        fab_count = fab.get("count", 0)
+        fab_entities = fab.get("entities", [])
+        if fab_count:
+            lines.append(
+                f"**Fabricated entities: {fab_count}** — the following names/identifiers appear "
+                f"in the output but are absent from the corpus: "
+                f"{', '.join(f'`{e}`' for e in fab_entities)}. "
+                "These should be removed or verified before sharing the output."
+            )
+
+        sev = m.get("severity_calibration", {})
+        misc = sev.get("miscalibrated", 0)
+        total_s = sev.get("total", 0)
+        verdicts = sev.get("verdicts", {})
+        over_count = sum(1 for v in verdicts.values() if v == "over")
+        under_count = sum(1 for v in verdicts.values() if v == "under")
+        if misc and total_s:
+            direction = ""
+            if over_count > under_count:
+                direction = f" (predominantly over-rated: {over_count} findings rated too high)"
+            elif under_count > over_count:
+                direction = f" (predominantly under-rated: {under_count} findings rated too low)"
+            lines.append(
+                f"**Severity calibration: {misc}/{total_s} findings miscalibrated{direction}.** "
+                "Over-rated findings inflate perceived urgency and can cause the client to "
+                "prioritise the wrong items."
+            )
+
+        act = m.get("actionability", {})
+        act_score = act.get("score")
+        if act_score is not None:
+            if act_score < 0.6:
+                lines.append(
+                    f"**Actionability score: {act_score:.3f}** — proposed actions are below the "
+                    "0.6 threshold for concrete, quantified recommendations. "
+                    "Actions tend to be generic rather than specific enough to assign and execute."
+                )
+            else:
+                lines.append(f"Actionability score: {act_score:.3f} — actions are sufficiently concrete.")
+
+        og = m.get("open_gap", {})
+        gap_text = (og.get("gap") or "").strip()
+        if gap_text:
+            lines.append(f"**Most important missed finding:** {gap_text}")
+
+        lines.append("")
+
+    # ── Bottom line ──────────────────────────────────────────────────────────
+    lines.append("### Bottom line")
+    g5 = next((g for g in gate_results if g.gate == "G5"), None)
+    g5_reason = (g5.details or {}).get("reason", "") if g5 else ""
+    flags = [g for g in gate_results if not g.passed]
+    flag_names = [g.gate for g in flags]
+
+    if not flags:
+        lines.append("All deterministic gates pass. The run is ready for G5 human sign-off.")
+    else:
+        flag_str = ", ".join(flag_names)
+        lines.append(f"The run is at **HOLD** due to flags on: {flag_str}. ")
+        for g in flags:
+            if g.gate == "G3" and g.reason_code == "EVIDENCE_SOURCE_UNKNOWN":
+                lines.append(
+                    "- **G3**: One evidence locator points to a file not in the corpus bundle. "
+                    "Regenerate `input.json` to include all corpus sources, then re-run."
+                )
+            elif g.gate == "G5":
+                lines.append(f"- **G5**: {g5_reason}")
+
+    if advisory is not None:
+        m = advisory.metrics
+        faith = m.get("faithfulness", {})
+        supported = faith.get("supported", 0)
+        total_f = faith.get("total", 0)  # same default as the G4 section; avoids a made-up "0/1"
+        ntf_count = m.get("numeric_temporal_fidelity", {}).get("count", 0)
+        fab_count = m.get("fabricated_entity", {}).get("count", 0)
+        lines.append(
+            f"\nG4 advisory signals (non-blocking but important for the G5 reviewer): "
+            f"faithfulness {supported}/{total_f}, "
+            f"{ntf_count} numeric mismatches, "
+            f"{fab_count} fabricated entities. "
+            "The G5 reviewer should focus on the unsupported findings and verify figures "
+            "against the source documents before certifying the output."
+        )
+    lines.append("")
+    return lines

From 09cfc34bd75869498a0d5a10216625a784d9e638 Mon Sep 17 00:00:00 2001
From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com>
Date: Thu, 18 Jun 2026 23:52:05 +0200
Subject: [PATCH 06/48] feat(evaluation): add LLM-as-judge and judge client
 (#273)

* feat(evaluation): add JudgeClient and OllamaEmbedder (judge_client.py)

* feat(evaluation): add AdvisoryReport and run_judge with [D]/[E]/[J] metric families (judge.py)

* feat(evaluation): import cosine from judge_client in matcher.py

* feat(evaluation): export JudgeClient, OllamaEmbedder, build_embedder, cosine from evaluation package

---------

Co-authored-by: miguelgfierro <miguelgfierro@users.noreply.github.com>
---
 .../evaluation/__init__.py                    |   5 +
 fireflyframework_agentic/evaluation/judge.py  | 829 ++++++++++++++++++
 .../evaluation/judge_client.py                | 454 ++++++++++
 .../evaluation/matcher.py                     |   7 +-
 4 files changed, 1289 insertions(+), 6 deletions(-)
 create mode 100644 fireflyframework_agentic/evaluation/judge.py
 create mode 100644 fireflyframework_agentic/evaluation/judge_client.py

diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py
index 61562db3..37093075 100644
--- a/fireflyframework_agentic/evaluation/__init__.py
+++ b/fireflyframework_agentic/evaluation/__init__.py
@@ -36,6 +36,7 @@
 from fireflyframework_agentic.evaluation.scorecard import render_scorecard, verdict, VERDICT_PROMOTE, VERDICT_HOLD
 from fireflyframework_agentic.evaluation.champion import ChampionRecord, invalidate_champion, load_champion, save_champion
 from fireflyframework_agentic.evaluation.judge import AdvisoryReport, run_judge
+from fireflyframework_agentic.evaluation.judge_client import JudgeClient, OllamaEmbedder, build_embedder, cosine
 from fireflyframework_agentic.evaluation.matcher import anchored, matches, source_stem, tokens
 from fireflyframework_agentic.evaluation.registry import Registry, RegistryItem, load_registry, registry_sha256
 from fireflyframework_agentic.evaluation.retrieval import RetrieverMetrics, compute_retrieval_metrics
@@ -68,6 +69,10 @@
     "invalidate_champion",
     "AdvisoryReport",
     "run_judge",
+    "JudgeClient",
+    "OllamaEmbedder",
+    "build_embedder",
+    "cosine",
     "Registry",
     "RegistryItem",
     "load_registry",
diff --git a/fireflyframework_agentic/evaluation/judge.py b/fireflyframework_agentic/evaluation/judge.py
new file mode 100644
index 00000000..a347c8e1
--- /dev/null
+++ b/fireflyframework_agentic/evaluation/judge.py
@@ -0,0 +1,829 @@
+# Copyright 2026 Firefly Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""G4 — LLM-as-a-Judge: an opt-in, NON-BLOCKING, NON-DETERMINISTIC advisory gate.
+
+G4 NEVER affects the PROMOTE/HOLD verdict and NEVER raises into the caller.
+run_judge() wraps every metric in try/except; a failing metric appends to
+report.errors and the run continues (best-effort).  The result is an
+AdvisoryReport, NOT a GateResult — it is carried separately so it can never
+enter verdict() or the Skipped tuple (see scorecard / verdict_unaffected_note).
+
+Three families of metric (matching the flyradar contracts):
+- [D] DETERMINISTIC — pure python, no LLM, printed even when the judge is off:
+      source_coverage, excerpt_fill_rate.
+- [E] EMBEDDING — needs an embed_fn (local Ollama BGE by default):
+      semantic_recovery (context recall).
+- [J] JUDGE — needs a chat_fn(system, user) -> dict; each [J] metric instructs
+      the model to reply with ONLY JSON: faithfulness, numeric_temporal_fidelity,
+      citation_relevance, nc_semantic_precision, fabricated_entity, contradiction,
+      open_gap, actionability, severity_calibration, answer_relevancy,
+      comparative_vs_champion.
+
+Aggregation follows the flycanon custom-judge design: run each [J] metric `runs`
+times and take the MEDIAN of its numeric scores (robust to an outlier vote).
+
+Zero new dependencies: stdlib (concurrent.futures, dataclasses, statistics) + numpy.  All imports at top.
+calibrated is ALWAYS False for now (LLM-as-a-Judge calibration is §14, future work).
+"""
+
+from __future__ import annotations
+
+import concurrent.futures
+import statistics
+from dataclasses import dataclass, field
+
+import numpy as np
+
+from fireflyframework_agentic.evaluation.judge_client import (
+    JudgeClient,
+    OllamaEmbedder,
+    cosine,
+    same_provider,
+)
+from fireflyframework_agentic.evaluation.matcher import source_stem
+
+SYSTEM = "You are a meticulous evaluator of a process-mining discovery report. Return ONLY a JSON object."
+
+
+@dataclass
+class AdvisoryReport:
+    """The G4 output: a plain metrics bag, never a GateResult.
+
+    metrics maps metric-name -> small dict (the per-metric summary).  details
+    carries supporting context (counts, ids).  errors lists per-metric failures
+    captured by run_judge's best-effort try/except so nothing propagates.
+    """
+
+    judge_model: str  # presumably the identifier of the judge model — confirm against run_judge
+    same_provider_caveat: bool  # NOTE(review): likely derived from same_provider(); confirm in run_judge
+    calibrated: bool  # ALWAYS False for now (§14)
+    runs: int  # repetitions per [J] metric; the module docstring says scores are median-aggregated
+    metrics: dict = field(default_factory=dict)  # metric-name -> per-metric summary dict
+    details: dict = field(default_factory=dict)  # supporting context (counts, ids)
+    errors: list[str] = field(default_factory=list)  # per-metric failure messages
+
+
+# ── shared accessors ───────────────────────────────────────────────────────────
+
+
+def _evidence_index(result: dict) -> dict[str, dict]:  # evidence id -> entry; entries with a falsy id are dropped
+    return {eid: entry for entry in result.get("evidence_index", []) if (eid := entry.get("id"))}
+
+
+def _cited_excerpts(finding: dict, evidence_index: dict[str, dict]) -> list[str]:
+    """Collect the non-empty excerpts of the evidence a finding cites.
+
+    Refs resolve via ``evidence_id``; unresolved refs and empty excerpts are skipped.
+    """
+    resolved = (
+        evidence_index.get(ref.get("evidence_id", ""))
+        for ref in finding.get("evidence_refs", [])
+    )
+    return [text for entry in resolved if entry and (text := entry.get("excerpt") or "")]
+
+
+def _output_text(result: dict) -> str:
+    """Join all free text the model emitted into one newline-separated string.
+
+    Covers each finding's title and description, then ``str()`` of each report.
+    """
+    fields = ("title", "description")
+    chunks = [finding.get(name, "") for finding in result.get("findings", []) for name in fields]
+    chunks.extend(str(report) for report in result.get("reports", []))
+    return "\n".join(chunk for chunk in chunks if chunk)
+
+
+def _workspace_intention(result: dict) -> str:  # workspace "name\ndescription", outer whitespace stripped
+    workspace = result.get("workspace") or {}
+    return "{}\n{}".format(workspace.get("name", ""), workspace.get("description", "")).strip()
+
+
+def _coerce_float(value, default=None):
+    """Coerce a model-returned number/numeric-string to float; total (never raises).
+
+    Returns ``default`` (None) on junk so one malformed vote drops that single
+    vote instead of discarding the whole metric.
+    """
+    try:
+        return float(value)
+    except (TypeError, ValueError, OverflowError):  # OverflowError: an int too large for a float
+        return default
+
+
+def _map_chat(chat_fn, prompts, workers=1):
+    """Send each (system, user) prompt to ``chat_fn``; return the replies in input order.
+
+    Sequential path (``workers <= 1``): ``chat_fn`` is called once per prompt in
+    a plain loop, and any exception it raises propagates to the caller
+    unchanged.
+
+    Threaded path (``workers >= 2``): the calls are submitted to a
+    ThreadPoolExecutor with ``max_workers=workers``.  A call that raises
+    contributes ``{}`` at its position instead of propagating, so one failed
+    vote cannot abort the rest of the batch.
+    """
+    batch = list(prompts)
+    if workers <= 1:
+        replies = []
+        for system, user in batch:
+            replies.append(chat_fn(system, user))
+        return replies
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
+        # Submission order equals prompt order, so ``pending`` stays aligned with ``batch``.
+        pending = [pool.submit(chat_fn, system, user) for system, user in batch]
+
+    # Collect in submission order; a failed call leaves {} in its slot.
+    replies = []
+    for task in pending:  # leaving the ``with`` block waited for every task to finish
+        try:
+            replies.append(task.result())
+        except Exception:  # best-effort: a dropped vote, never a raise
+            replies.append({})
+    return replies
+
+
+# ── [D] DETERMINISTIC — no LLM, always available ────────────────────────────────
+
+
+def source_coverage(result: dict) -> dict:
+    """Measure how many known source documents are cited by at least one finding.
+
+    A source document is the ``source_stem`` of an evidence_index locator.
+    Returns {cited, total, orphaned}: the number of stems some finding cites,
+    the number of distinct stems in evidence_index, and the sorted stems that
+    no finding cites.
+    """
+    by_id = _evidence_index(result)
+    # Every stem that appears in evidence_index, whether or not it is cited.
+    known = {
+        source_stem(entry["locator"])
+        for entry in result.get("evidence_index", [])
+        if entry.get("locator")
+    }
+    refs = (ref for f in result.get("findings", []) for ref in f.get("evidence_refs", []))
+    resolved = (by_id.get(ref.get("evidence_id", "")) for ref in refs)
+    referenced = {source_stem(entry["locator"]) for entry in resolved if entry and entry.get("locator")}
+    cited = referenced & known
+    missing = sorted(known - cited)
+    return {"cited": len(cited), "total": len(known), "orphaned": missing}
+
+
+def excerpt_fill_rate(result: dict) -> dict:
+    """Count the evidence_index entries whose excerpt is not blank.
+
+    Returns {populated, total}; an excerpt that is missing, None, or only
+    whitespace does not count as populated.
+    """
+    evidence = result.get("evidence_index", [])
+    filled = [entry for entry in evidence if (entry.get("excerpt") or "").strip()]
+    return {"populated": len(filled), "total": len(evidence)}
+
+
+# ── [E] EMBEDDING — needs embed_fn ───────────────────────────────────────────────
+
+
+def semantic_recovery(
+    result: dict,
+    registry,
+    lexical_missed_ids: list[str],
+    embed_fn,
+    tau: float = 0.70,
+) -> dict:
+    """Context recall: re-check G2's lexical misses with embedding similarity.
+
+    Scored items are ``registry.real_items`` whose tier is not "L3".  A scored
+    item listed in ``lexical_missed_ids`` counts as recovered when the highest
+    cosine between its embedded "description keywords" text and any candidate
+    text reaches ``tau``.  Candidate texts are every non-empty finding
+    description plus the excerpts each finding cites.
+    The recall denominator is the number of scored items (1 when there are none).
+
+    Args:
+        result: run output providing ``findings`` and ``evidence_index``.
+        registry: object exposing ``real_items``; each item is read for
+            ``id``, ``tier``, ``description`` and ``keywords``.
+        lexical_missed_ids: ids G2 flagged as lexical misses (None/empty ok).
+        embed_fn: callable taking a list of texts and returning their vectors.
+        tau: cosine threshold at or above which a missed item is recovered.
+
+    Returns:
+        dict with ``lexical_recall`` and ``recovered_recall`` (both rounded to
+        4 places), ``recovered`` (a list of {id, cosine}), ``tau`` and
+        ``scored_denominator``.  ``embed_fn`` is not called when there is
+        nothing missed or no candidate text.
+    """
+    missed_ids = set(lexical_missed_ids or [])
+    scored = [item for item in registry.real_items if item.tier != "L3"]
+    denominator = len(scored) or 1  # avoid dividing by zero when nothing is scored
+    hits = sum(1 for item in scored if item.id not in missed_ids)
+    to_recover = [item for item in scored if item.id in missed_ids]
+
+    # Texts the findings actually surfaced: descriptions, then cited excerpts.
+    by_id = _evidence_index(result)
+    candidates: list[str] = []
+    for finding in result.get("findings", []):
+        description = finding.get("description", "")
+        if description:
+            candidates.append(description)
+        candidates.extend(_cited_excerpts(finding, by_id))
+
+    # Only embed when there is something to recover and something to compare against.
+    recovered: list[dict] = []
+    if to_recover and candidates:
+        queries = [f"{item.description} {' '.join(item.keywords)}".strip() for item in to_recover]
+        query_vecs = np.asarray(embed_fn(queries), dtype=np.float64)
+        candidate_vecs = np.asarray(embed_fn(candidates), dtype=np.float64)
+        for item, query_vec in zip(to_recover, query_vecs):
+            top = max((cosine(query_vec, vec) for vec in candidate_vecs), default=0.0)
+            if top >= tau:
+                recovered.append({"id": item.id, "cosine": round(top, 4)})
+
+    return {
+        "lexical_recall": round(hits / denominator, 4),
+        "recovered_recall": round((hits + len(recovered)) / denominator, 4),
+        "recovered": recovered,
+        "tau": tau,
+        "scored_denominator": denominator,
+    }
+
+
+# ── [J] JUDGE — needs chat_fn(system, user) -> dict ──────────────────────────────
+
+
+def faithfulness(result: dict, chat_fn, *, workers: int = 1) -> dict:
+    """Entailment check: is each finding's claim SUPPORTED by the evidence it cites?
+
+    One prompt is sent per finding that has at least one cited excerpt; the
+    judge answers SUPPORTED or NOT_SUPPORTED.  A finding without cited excerpts
+    is never sent to the judge and is recorded as unsupported.  Any verdict
+    other than SUPPORTED (case-insensitive) also counts as unsupported.
+    Returns {supported, total, unsupported_ids}, where total counts every finding.
+    """
+    by_id = _evidence_index(result)
+    findings = result.get("findings", [])
+    pairs = [(finding, _cited_excerpts(finding, by_id)) for finding in findings]
+
+    prompts = []
+    for finding, excerpts in pairs:
+        if not excerpts:
+            continue
+        question = (
+            "Does the cited evidence span ENTAIL the claim made in this finding?\n"
+            'Reply with ONLY {"verdict": "SUPPORTED" or "NOT_SUPPORTED", "reason": "<one line>"}.\n\n'
+            f"FINDING: {finding.get('description', '')}\n"
+            f"CITED EVIDENCE: {' || '.join(excerpts)}"
+        )
+        prompts.append((SYSTEM, question))
+
+    # Replies arrive in prompt order, so consume one per finding that was prompted.
+    replies = iter(_map_chat(chat_fn, prompts, workers))
+    supported_count = 0
+    unsupported: list[str] = []
+    for finding, excerpts in pairs:
+        if excerpts and str(next(replies).get("verdict", "")).upper() == "SUPPORTED":
+            supported_count += 1
+        else:
+            unsupported.append(finding.get("id", "?"))
+    return {"supported": supported_count, "total": len(findings), "unsupported_ids": unsupported}
+
+
+def numeric_temporal_fidelity(result: dict, chat_fn, *, workers: int = 1) -> dict:
+    """Report numbers and dates a finding asserts that its cited evidence does not back.
+
+    Only findings with at least one cited excerpt are sent to the judge.  Every
+    mismatch the judge lists is tagged with the id of the finding it came from.
+    Returns {mismatches: [{finding_id, value, source}], count}.
+    """
+    by_id = _evidence_index(result)
+    scored = []
+    for finding in result.get("findings", []):
+        excerpts = _cited_excerpts(finding, by_id)
+        if excerpts:
+            scored.append((finding, excerpts))
+
+    prompts = []
+    for finding, excerpts in scored:
+        question = (
+            "List every specific number or date asserted in the FINDING that does "
+            "NOT match the CITED EVIDENCE.\n"
+            'Reply with ONLY {"mismatches": [{"value": "<claimed>", "source": "<what the evidence says>"}]}. '
+            "Empty list if all match.\n\n"
+            f"FINDING: {finding.get('description', '')}\n"
+            f"CITED EVIDENCE: {' || '.join(excerpts)}"
+        )
+        prompts.append((SYSTEM, question))
+    replies = _map_chat(chat_fn, prompts, workers)
+    found = [
+        {
+            "finding_id": finding.get("id", "?"),
+            "value": item.get("value", ""),
+            "source": item.get("source", ""),
+        }
+        for (finding, _), reply in zip(scored, replies)
+        for item in reply.get("mismatches", []) or []
+    ]
+    return {"mismatches": found, "count": len(found)}
+
+
+def citation_relevance(result: dict, chat_fn, *, workers: int = 1) -> dict:
+    """Context precision: the share of cited passages the judge deems relevant.
+
+    One yes/no prompt is built per evidence_ref that resolves to an evidence
+    entry with a non-empty excerpt; other refs are not scored.  Returns
+    {precision, relevant, total} with precision = relevant / total rounded to 4
+    places, or None when total is 0 so "nothing to score" is not read as a
+    perfect or a zero score.
+    """
+    by_id = _evidence_index(result)
+    # (claim, passage) for every resolvable reference with a non-empty excerpt.
+    scored = [
+        (finding.get("description", ""), passage)
+        for finding in result.get("findings", [])
+        for ref in finding.get("evidence_refs", [])
+        if (entry := by_id.get(ref.get("evidence_id", "")))
+        if (passage := entry.get("excerpt") or "")
+    ]
+    prompts = [
+        (
+            SYSTEM,
+            "Is this cited passage actually relevant to / used by this claim?\n"
+            'Reply with ONLY {"relevant": "yes" or "no"}.\n\n'
+            f"CLAIM: {claim}\n"
+            f"CITED PASSAGE: {text}",
+        )
+        for claim, text in scored
+    ]
+    replies = _map_chat(chat_fn, prompts, workers)
+    total = len(prompts)
+    hits = [reply for reply in replies if str(reply.get("relevant", "")).lower() == "yes"]
+    relevant = len(hits)
+    precision = round(relevant / total, 4) if total else None
+    return {"precision": precision, "relevant": relevant, "total": total}
+
+
+def nc_semantic_precision(result: dict, registry, chat_fn, *, workers: int = 1) -> dict:
+    """Count the negative-control (NC) statements the output asserts or endorses.
+
+    One yes/no question is asked per item in ``registry.nc_items``, each against
+    the full output text.  Returns {asserted, total, asserted_ids}, with the ids
+    kept in registry order.
+    """
+    text = _output_text(result)
+    controls = registry.nc_items
+    prompts = []
+    for control in controls:
+        question = (
+            "Does the OUTPUT assert or endorse the following FALSE statement?\n"
+            'Reply with ONLY {"asserted": "yes" or "no"}.\n\n'
+            f"FALSE STATEMENT: {control.description}\n"
+            f"OUTPUT:\n{text}"
+        )
+        prompts.append((SYSTEM, question))
+    replies = _map_chat(chat_fn, prompts, workers)
+    flagged = []
+    for control, reply in zip(controls, replies):
+        if str(reply.get("asserted", "")).lower() == "yes":
+            flagged.append(control.id)
+    return {"asserted": len(flagged), "total": len(controls), "asserted_ids": flagged}
+
+
+def fabricated_entity(result: dict, chat_fn) -> dict:
+    """Count systems/orgs/metrics named in the output but absent from the corpus.
+
+    The corpus is every evidence locator + excerpt (null fields render as empty
+    text).  Returns {count, entities}; a bare-string reply counts as one entity.
+    """
+    corpus = "\n".join(
+        f"{ev.get('locator') or ''} :: {ev.get('excerpt') or ''}"
+        for ev in result.get("evidence_index", [])
+    )
+    user = (
+        "List any system, organization, or metric NAMED in the OUTPUT that does NOT "
+        "appear anywhere in the CORPUS EVIDENCE.\n"
+        'Reply with ONLY {"fabricated": ["<entity>", ...]}.  Empty list if none.\n\n'
+        f"OUTPUT:\n{_output_text(result)}\n\n"
+        f"CORPUS EVIDENCE:\n{corpus}"
+    )
+    entities = chat_fn(SYSTEM, user).get("fabricated", []) or []
+    entities = [entities] if isinstance(entities, str) else list(entities)  # never split a string into chars
+    return {"count": len(entities), "entities": entities}
+
+
+def contradiction(result: dict, chat_fn) -> dict:
+    """Ask the judge which pairs of findings contradict each other.
+
+    Every finding is listed as "<id>: <title> — <description>" in one prompt.
+    Returns {count, pairs}, where pairs holds the reported finding-id pairs.
+    """
+    listing = "\n".join(
+        f"{f.get('id', '?')}: {f.get('title', '')} — {f.get('description', '')}"
+        for f in result.get("findings", [])
+    )
+    question = (
+        "Are any two of these FINDINGS mutually contradictory? List each contradicting pair.\n"
+        'Reply with ONLY {"pairs": [["<id_a>", "<id_b>"], ...]}.  Empty list if none.\n\n'
+    ) + listing
+    reported = chat_fn(SYSTEM, question).get("pairs", []) or []
+    return {"count": len(reported), "pairs": [list(pair) for pair in reported]}
+
+
+def open_gap(result: dict, chat_fn) -> dict:
+    """G-Eval open probe: the most important process issue the output missed.
+
+    Returns {gap} — a free-text advisory narrative (no score); "" when the judge gives none.
+    """
+    pg = result.get("process_graph") or {}
+    pg_summary = f"process_graph has {len(pg.get('processes', []))} processes"
+    user = (
+        "Given this corpus scope and output, what important process issue did the "
+        "output FAIL to surface?\n"
+        'Reply with ONLY {"gap": "<the most important missed issue, one short paragraph>"}.\n\n'
+        f"WORKSPACE SCOPE: {_workspace_intention(result)}\n"
+        f"{pg_summary}\n"
+        f"OUTPUT:\n{_output_text(result)}"
+    )
+    return {"gap": str(chat_fn(SYSTEM, user).get("gap") or "")}  # a null gap must not become the text "None"
+
+
+def actionability(result: dict, chat_fn, *, workers: int = 1) -> dict:
+    """Average 0-1 rating of whether proposed actions are specific+quantified+linked.
+
+    Returns {score, rated}; score is None when no vote coerces to a number.  An
+    action is LINKED only if its finding_id equals a real (non-missing) finding id.
+    """
+    actions = result.get("proposed_actions", []) or []
+    finding_ids = {f["id"] for f in result.get("findings") or [] if f.get("id") is not None}
+    prompts = [
+        (
+            SYSTEM,
+            "Rate whether this proposed action is SPECIFIC, QUANTIFIED, and LINKED to a "
+            "finding.\n"
+            'Reply with ONLY {"score": <number 0-1>}.\n\n'
+            f"TITLE: {a.get('title', '')}\n"
+            f"DESCRIPTION: {a.get('description', '')}\n"
+            f"OWNER: {a.get('owner_persona', '')}  HORIZON: {a.get('horizon', '')}  "
+            f"LEVER: {a.get('lever', '')}  EFFORT: {a.get('effort', '')}\n"
+            f"EXPECTED_SAVINGS_FTE: {a.get('expected_savings_fte', '')}  "
+            f"EXPECTED_SAVINGS_USD: {a.get('expected_savings_usd', '')}\n"
+            f"LINKED_TO_FINDING: {a.get('finding_id') in finding_ids}",
+        )
+        for a in actions
+    ]
+    answers = _map_chat(chat_fn, prompts, workers)
+    scores: list[float] = []
+    for a in answers:
+        value = _coerce_float(a.get("score"))
+        if value is None:  # malformed vote -> skip this action, keep the metric
+            continue
+        scores.append(value)
+    score = round(sum(scores) / len(scores), 4) if scores else None
+    return {"score": score, "rated": len(scores)}
+
+
+def severity_calibration(result: dict, chat_fn, *, workers: int = 1) -> dict:
+    """Per-finding judgment of whether stated severity matches the evidence.
+
+    Returns {miscalibrated, total, verdicts: {finding_id: under|over|calibrated}}.
+    """
+    evidence_index = _evidence_index(result)
+    findings = result.get("findings", [])
+    prompts = [
+        (
+            SYSTEM,
+            "Does the STATED SEVERITY match what the CITED EVIDENCE supports?\n"
+            'Reply with ONLY {"calibration": "under" or "over" or "calibrated"}.\n\n'
+            f"STATED SEVERITY: {f.get('severity', '')}  SCORE: {f.get('score', '')}\n"
+            f"FINDING: {f.get('description', '')}\n"
+            f"CITED EVIDENCE: {' || '.join(_cited_excerpts(f, evidence_index))}",
+        )
+        for f in findings
+    ]
+    answers = _map_chat(chat_fn, prompts, workers)
+    verdicts: dict[str, str] = {}
+    miscalibrated = 0
+    for f, a in zip(findings, answers):
+        verdict = str(a.get("calibration", "calibrated")).lower()
+        verdicts[f.get("id", "?")] = verdict
+        if verdict in ("under", "over"):
+            miscalibrated += 1
+    return {"miscalibrated": miscalibrated, "total": len(findings), "verdicts": verdicts}
+
+
+def answer_relevancy(result: dict, chat_fn) -> dict:
+    """RAGAS-style: does the output address the stated workspace intention?
+
+    Returns {score} in [0,1], or {"score": None} when the vote fails to coerce.
+    """
+    user = (
+        "Does the OUTPUT address the stated WORKSPACE INTENTION (on-topic, responsive)?\n"
+        'Reply with ONLY {"score": <number 0-1>}.\n\n'
+        f"WORKSPACE INTENTION: {_workspace_intention(result)}\n"
+        f"OUTPUT:\n{_output_text(result)}"
+    )
+    return {"score": _coerce_float(chat_fn(SYSTEM, user).get("score"))}
+
+
+def surface_deduplication(result: dict, chat_fn, *, workers: int = 1) -> dict:
+    """Fraction of near-duplicate process-graph node pairs that are genuinely distinct.
+
+    Scoping rules:
+    - Processes: all pairs compared (cross-process is valid at this level).
+    - Activities and decisions: ONLY within the same parent process.  The same
+      activity name appearing in two different processes is a legitimate repetition
+      (e.g. "Approve Request" in both Loan and Credit-Card flows), not a duplicate.
+
+    For each surface, the top-10 most name-similar pairs (token-Jaccard >= 0.30)
+    are selected.  For activities/decisions the parent process name is included in
+    the judge prompt so it can reason about intra-process context.  At most 30 pairs total.
+
+    Returns {distinct, redundant, total, distinct_rate, redundant_pairs}.
+    """
+    pg = result.get("process_graph") or {}  # key may be present but null (cf. open_gap)
+    procs = pg.get("processes") or []
+
+    def _toks(node: dict) -> frozenset[str]:
+        return frozenset((node.get("name") or "").lower().split())
+
+    PER_SURFACE_CAP = 10
+    # candidates: (surface, node_a, node_b, parent_process_name)
+    candidates: list[tuple[str, dict, dict, str]] = []
+
+    # Processes: compare all pairs
+    if len(procs) >= 2:
+        pairs: list[tuple[float, dict, dict]] = []
+        for i in range(len(procs)):
+            for j in range(i + 1, len(procs)):
+                a_t, b_t = _toks(procs[i]), _toks(procs[j])
+                union = a_t | b_t
+                if not union:
+                    continue
+                jac = len(a_t & b_t) / len(union)
+                if jac >= 0.30:
+                    pairs.append((jac, procs[i], procs[j]))
+        pairs.sort(key=lambda x: x[0], reverse=True)
+        for _jac, a, b in pairs[:PER_SURFACE_CAP]:
+            candidates.append(("process", a, b, ""))
+
+    # Activities and decisions: within the same parent process only
+    for surface_key, attr in (("activity", "activities"), ("decision", "decisions")):
+        all_pairs: list[tuple[float, dict, dict, str]] = []
+        for proc in procs:
+            nodes = proc.get(attr) or []
+            proc_name = proc.get("name") or ""
+            if len(nodes) < 2:
+                continue
+            for i in range(len(nodes)):
+                for j in range(i + 1, len(nodes)):
+                    a_t, b_t = _toks(nodes[i]), _toks(nodes[j])
+                    union = a_t | b_t
+                    if not union:
+                        continue
+                    jac = len(a_t & b_t) / len(union)
+                    if jac >= 0.30:
+                        all_pairs.append((jac, nodes[i], nodes[j], proc_name))
+        all_pairs.sort(key=lambda x: x[0], reverse=True)
+        for _jac, a, b, proc_name in all_pairs[:PER_SURFACE_CAP]:
+            candidates.append((surface_key, a, b, proc_name))
+
+    if not candidates:
+        return {"distinct": 0, "redundant": 0, "total": 0, "distinct_rate": None, "redundant_pairs": []}
+
+    prompts = []
+    for surface, a, b, parent_proc in candidates:
+        ctx = f"\nPARENT PROCESS: {parent_proc}\n" if parent_proc else ""
+        prompts.append((
+            SYSTEM,
+            f"Are these two {surface} nodes genuinely DISTINCT process concepts, or is one a "
+            f"duplicate / sub-case / restatement of the other?\n"
+            f"{ctx}"
+            'Reply with ONLY {"verdict": "DISTINCT" or "DUPLICATE", "reason": "<one line>"}.\n\n'
+            f"{surface.upper()} A: {a.get('name', '')} — {a.get('description', '')}\n"
+            f"{surface.upper()} B: {b.get('name', '')} — {b.get('description', '')}",
+        ))
+
+    answers = _map_chat(chat_fn, prompts, workers)
+
+    distinct = 0
+    redundant = 0
+    redundant_pairs: list[dict] = []
+    for (surface, a, b, _parent), answer in zip(candidates, answers):
+        verdict = str(answer.get("verdict", "")).upper()
+        if verdict == "DISTINCT":
+            distinct += 1
+        else:  # any non-DISTINCT answer, including a malformed vote, counts as redundant
+            redundant += 1
+            redundant_pairs.append({
+                "surface": surface,
+                "a": a.get("name", ""),
+                "b": b.get("name", ""),
+                "reason": str(answer.get("reason", "")),
+            })
+
+    total = distinct + redundant
+    return {
+        "distinct": distinct,
+        "redundant": redundant,
+        "total": total,
+        "distinct_rate": round(distinct / total, 4) if total else None,
+        "redundant_pairs": redundant_pairs,
+    }
+
+
+def comparative_vs_champion(result: dict, champion_result: dict, chat_fn) -> dict:
+    """Pairwise MT-Bench-style review of candidate vs champion (advisory only).
+
+    Returns {candidate, champion, more_consistent} where candidate/champion are
+    1-5 ratings on Coverage/Quality/Evidence/Actionability/Regression.  Never
+    feeds G5.
+    """
+    user = (
+        "Score the CANDIDATE and the CHAMPION outputs on five axes (1-5 each): "
+        "Coverage, Quality, Evidence, Actionability, Regression.  Then say which is "
+        "more internally consistent.\n"
+        "Reply with ONLY "
+        '{"candidate": {"coverage": x, "quality": x, "evidence": x, "actionability": x, "regression": x}, '
+        '"champion": {"coverage": x, "quality": x, "evidence": x, "actionability": x, "regression": x}, '
+        '"more_consistent": "candidate" or "champion"}.\n\n'
+        f"CANDIDATE:\n{_output_text(result)}\n\n"
+        f"CHAMPION:\n{_output_text(champion_result)}"
+    )
+    out = chat_fn(SYSTEM, user)
+    return {
+        "candidate": out.get("candidate", {}),
+        "champion": out.get("champion", {}),
+        "more_consistent": out.get("more_consistent", ""),
+    }
+
+
+# ── median-of-N for [J] metrics ──────────────────────────────────────────────────
+
+
+def _numeric_leaves(d: dict) -> dict[tuple, float]:
+    """Flatten a metric dict to {path: float} over its FLOAT score-leaves only.
+
+    Median applies to continuous scores only.  A leaf counts as numeric-for-median
+    only when its value is a ``float``; ``bool`` and ``int`` leaves (counts,
+    denominators, 1-5 axes, and other bookkeeping) are deliberately skipped and
+    taken from the first run unchanged — this avoids fractional counts (rated=0.5)
+    and count/len(list) disagreement under runs>1 with an even N.
+    """
+    out: dict[tuple, float] = {}
+
+    def walk(node, path: tuple) -> None:
+        if isinstance(node, float):
+            out[path] = node
+        elif isinstance(node, dict):
+            for k, v in node.items():
+                walk(v, path + (k,))
+
+    walk(d, ())
+    return out
+
+
+def _set_leaf(d: dict, path: tuple, value: float) -> None:
+    node = d
+    for key in path[:-1]:
+        node = node[key]
+    node[path[-1]] = value
+
+
+def _median_runs(samples: list[dict]) -> dict:
+    """Median across N metric-dicts: FLOAT score-leaves -> per-key median; rest = first.
+
+    Only continuous float scores are medianed; integer bookkeeping (counts,
+    denominators, 1-5 axes) and all non-numeric fields are taken from the first run.
+    """
+    samples = [s for s in samples if isinstance(s, dict)]
+    if not samples:
+        return {}
+    base = samples[0]
+    if len(samples) == 1:
+        return base
+    leaf_values: dict[tuple, list[float]] = {}
+    for s in samples:
+        for path, val in _numeric_leaves(s).items():
+            leaf_values.setdefault(path, []).append(val)
+    merged = dict(base)
+    for path, vals in leaf_values.items():
+        try:
+            _set_leaf(merged, path, round(statistics.median(vals), 4))
+        except (KeyError, TypeError):
+            continue
+    return merged
+
+
+# ── orchestrator ─────────────────────────────────────────────────────────────────
+
+
+def run_judge(
+    result: dict,
+    registry,
+    *,
+    judge_model: str,
+    runs: int = 1,
+    concurrency: int = 1,
+    pipeline_model: str = "",
+    champion_result: dict | None = None,
+    chat_fn=None,
+    embed_fn=None,
+    tau: float = 0.70,
+    lexical_missed_ids: list[str] | None = None,
+) -> AdvisoryReport:
+    """Run the G4 advisory gate, best-effort.  NEVER raises; NEVER affects verdict.
+
+    If chat_fn / embed_fn are None, real ones are built from JudgeClient /
+    OllamaEmbedder (tests inject stubs instead).  Each [J] metric runs `runs`
+    times and the median of its numeric scores is kept.  Every metric is wrapped
+    in try/except: a failure appends to report.errors and the run continues.
+
+    ``concurrency`` (opt-in, default 1) bounds the per-item [J] metrics' internal
+    fan-out: 1 keeps the sequential per-item loops; >=2 runs each metric's items
+    across a thread pool (order preserved).  The median-of-N ``runs`` loop stays
+    sequential and the single-call metrics are unaffected.  The result is
+    byte-for-byte identical at concurrency=1.
+
+    Returns an AdvisoryReport (a plain dict carrier) with calibrated=False and
+    same_provider_caveat = same_provider(pipeline_model, judge_model).
+    """
+    if chat_fn is None:
+        client = JudgeClient(judge_model)
+        chat_fn = client.chat_json
+    if embed_fn is None:
+        embed_fn = OllamaEmbedder().embed
+
+    report = AdvisoryReport(
+        judge_model=judge_model,
+        same_provider_caveat=same_provider(pipeline_model, judge_model),
+        calibrated=False,
+        runs=runs,
+    )
+
+    def _run_det(name: str, fn) -> None:
+        try:
+            report.metrics[name] = fn()
+        except Exception as exc:  # best-effort: never raise
+            report.errors.append(f"{name}: {type(exc).__name__}: {exc}")
+
+    def _run_judge_metric(name: str, fn) -> None:
+        try:
+            samples = [fn() for _ in range(max(1, runs))]
+            report.metrics[name] = _median_runs(samples)
+        except Exception as exc:  # best-effort: never raise
+            report.errors.append(f"{name}: {type(exc).__name__}: {exc}")
+
+    # [D] deterministic — always computed, no LLM.
+    _run_det("source_coverage", lambda: source_coverage(result))
+    _run_det("excerpt_fill_rate", lambda: excerpt_fill_rate(result))
+
+    # [E] embedding — context recall.
+    _run_det(
+        "semantic_recovery",
+        lambda: semantic_recovery(result, registry, lexical_missed_ids or [], embed_fn, tau=tau),
+    )
+
+    # [J] judge — median-of-N.  Per-item metrics fan out at workers=concurrency.
+    _run_judge_metric("faithfulness", lambda: faithfulness(result, chat_fn, workers=concurrency))
+    _run_judge_metric(
+        "numeric_temporal_fidelity",
+        lambda: numeric_temporal_fidelity(result, chat_fn, workers=concurrency),
+    )
+    _run_judge_metric(
+        "citation_relevance", lambda: citation_relevance(result, chat_fn, workers=concurrency)
+    )
+    _run_judge_metric(
+        "nc_semantic_precision",
+        lambda: nc_semantic_precision(result, registry, chat_fn, workers=concurrency),
+    )
+    _run_judge_metric("fabricated_entity", lambda: fabricated_entity(result, chat_fn))
+    _run_judge_metric("contradiction", lambda: contradiction(result, chat_fn))
+    _run_judge_metric("open_gap", lambda: open_gap(result, chat_fn))
+    _run_judge_metric("actionability", lambda: actionability(result, chat_fn, workers=concurrency))
+    _run_judge_metric(
+        "severity_calibration",
+        lambda: severity_calibration(result, chat_fn, workers=concurrency),
+    )
+    _run_judge_metric("answer_relevancy", lambda: answer_relevancy(result, chat_fn))
+    _run_judge_metric(
+        "surface_deduplication",
+        lambda: surface_deduplication(result, chat_fn, workers=concurrency),
+    )
+    if champion_result is not None:
+        _run_judge_metric(
+            "comparative_vs_champion",
+            lambda: comparative_vs_champion(result, champion_result, chat_fn),
+        )
+
+    return report
diff --git a/fireflyframework_agentic/evaluation/judge_client.py b/fireflyframework_agentic/evaluation/judge_client.py
new file mode 100644
index 00000000..1af17f53
--- /dev/null
+++ b/fireflyframework_agentic/evaluation/judge_client.py
@@ -0,0 +1,454 @@
+# Copyright 2026 Firefly Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Provider-agnostic LLM-as-a-Judge client for the G4 advisory gate.
+
+Zero new dependencies: stdlib (urllib.request, json, os, time, re) + numpy.
+The client is a thin POST wrapper over four chat providers (Anthropic, OpenAI,
+Azure OpenAI, Ollama) plus OpenAI / Azure OpenAI / Ollama embedders.  It is
+tolerant: chat_json extracts the FIRST JSON object from the model text (models
+wrap JSON in prose / code fences), and retries transient HTTP errors with backoff.
+
+This module is import-safe: importing it touches NO network and reads NO API
+key.  Keys are read lazily, per-call, only when a real request is made — so the
+judge tests can import and inject stubs without any secret present.
+
+Provider/model spec format: "<provider>:<model>", e.g. "anthropic:claude-sonnet-4-6",
+"openai:gpt-4o", "azure:gpt-4o", "ollama:llama3".  A bare model with no prefix is
+treated as provider "unknown" (see parse_model / same_provider).
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import re
+import time
+import urllib.error
+import urllib.request
+
+import numpy as np
+
+# Transient HTTP status codes worth retrying (rate limit + 5xx).
+_RETRY_STATUS = (429, 500, 502, 503, 504)
+
+# Hard cap on a honoured Retry-After sleep (a hostile header should not stall us).
+_MAX_RETRY_AFTER = 30.0
+
+
+def _env(name, default=None):
+    """Read an env var, stripping surrounding whitespace; empty-after-strip -> default.
+
+    Defensive against a ``.env`` value that arrives with a trailing ``\\r`` /
+    whitespace (CRLF), which would otherwise corrupt a request URL or header.
+    An unset OR blank value falls back to ``default`` so the existing
+    missing-key -> RuntimeError behaviour is preserved.
+    """
+    value = os.environ.get(name)
+    if value is None:
+        return default
+    value = value.strip()
+    return value if value else default
+
+
+def _retry_delay(exc: urllib.error.HTTPError, attempt: int) -> float:
+    """Seconds to sleep before retrying an HTTPError (never negative, never NaN).
+
+    On 429 honour a numeric ``Retry-After`` header clamped to [0, 30]: a negative
+    or NaN value would make time.sleep() raise.  Else backoff is 2 ** attempt.
+    """
+    if exc.code == 429:
+        headers = getattr(exc, "headers", None)
+        retry_after = headers.get("retry-after") if headers is not None else None
+        if retry_after is not None:
+            try:  # max(0.0, x) with 0.0 FIRST maps a negative or NaN x to 0.0
+                return min(max(0.0, float(retry_after)), _MAX_RETRY_AFTER)
+            except (TypeError, ValueError):  # e.g. an HTTP-date Retry-After
+                pass
+    return 2.0**attempt
+
+
+def parse_model(spec: str) -> tuple[str, str]:
+    """Split a "provider:model" spec into (provider, model).
+
+    A bare spec with no ':' is returned as provider "unknown" with the whole
+    string as the model, e.g. "claude-sonnet-4-6" -> ("unknown", "claude-sonnet-4-6").
+    The provider is lower-cased; the model keeps its original case.
+    """
+    spec = (spec or "").strip()
+    if ":" not in spec:
+        return "unknown", spec
+    provider, model = spec.split(":", 1)
+    return provider.strip().lower(), model.strip()
+
+
+def same_provider(pipeline_model: str, judge_model: str) -> bool:
+    """True iff both specs name the SAME known provider prefix.
+
+    A missing or "unknown" provider on either side -> not-same (False).  This is
+    the same-provider caveat signal: when the judge and the pipeline share a
+    provider the judged metrics are advisory (no cross-provider isolation).
+    """
+    p_provider, _ = parse_model(pipeline_model)
+    j_provider, _ = parse_model(judge_model)
+    if p_provider == "unknown" or j_provider == "unknown":
+        return False
+    return p_provider == j_provider
+
+
+def _first_json_object(text: str) -> dict:
+    """Extract and parse the FIRST balanced JSON object embedded in text.
+
+    Models wrap JSON in prose, preambles, or ```json code fences.  This scans
+    for the first '{' and walks the string tracking brace depth (string-aware,
+    so braces inside quoted values do not confuse the matcher) to find its
+    matching '}'.  Falls back to a greedy regex span if no balanced object is
+    found.  Raises ValueError when nothing parses.
+    """
+    if not text:
+        raise ValueError("empty model response")
+
+    # Fast path: a clean JSON object with no surrounding prose.  A non-dict
+    # clean parse (e.g. a top-level array) is intentionally ignored so the brace
+    # scanner can still find an embedded object rather than returning arr[0].
+    try:
+        parsed = json.loads(text.strip())
+    except (json.JSONDecodeError, ValueError):
+        parsed = None
+    if isinstance(parsed, dict):
+        return parsed
+
+    start = text.find("{")
+    while start != -1:
+        depth = 0
+        in_string = False
+        escape = False
+        for i in range(start, len(text)):
+            ch = text[i]
+            if in_string:
+                if escape:
+                    escape = False
+                elif ch == "\\":
+                    escape = True
+                elif ch == '"':
+                    in_string = False
+                continue
+            if ch == '"':
+                in_string = True
+            elif ch == "{":
+                depth += 1
+            elif ch == "}":
+                depth -= 1
+                if depth == 0:
+                    candidate = text[start : i + 1]
+                    try:
+                        return json.loads(candidate)
+                    except json.JSONDecodeError:
+                        break  # try the next '{'
+        start = text.find("{", start + 1)
+
+    # Greedy fallback: first '{' .. last '}' across newlines.
+    match = re.search(r"\{.*\}", text, re.DOTALL)
+    if match:
+        return json.loads(match.group(0))
+    raise ValueError("no JSON object found in model response")
+
+
+def _http_post_json(url: str, headers: dict, body: dict, timeout: int) -> dict:
+    """POST a JSON body and return the parsed JSON response (single attempt)."""
+    data = json.dumps(body).encode("utf-8")
+    req_headers = {"content-type": "application/json", **headers}
+    req = urllib.request.Request(url, data=data, headers=req_headers, method="POST")
+    with urllib.request.urlopen(req, timeout=timeout) as resp:
+        return json.loads(resp.read().decode("utf-8"))
+
+
+def _extract_openai_text(resp: dict) -> str:
+    """Pull the assistant text from an OpenAI/Azure chat-completions response.
+
+    Guards an empty ``choices`` list and a null ``message.content`` and raises a
+    descriptive RuntimeError (not a KeyError) when no text is present, so the
+    judge layer records a clean dropped-vote reason instead of a stack trace.
+    """
+    choices = resp.get("choices") or []
+    if choices:
+        text = (choices[0].get("message") or {}).get("content")
+        if text:
+            return text
+    raise RuntimeError(f"judge returned no text: {resp}")
+
+
+class JudgeClient:
+    """Minimal multi-provider chat client returning parsed JSON dicts.
+
+    Dispatch is by the provider prefix of the model spec.  temperature is pinned
+    to 0.0 for deterministic verdicts.  Transient HTTP errors (429/5xx) and URL
+    errors are retried up to max_retries: a 429 honours the ``Retry-After``
+    header (capped at 30s) when present, otherwise backoff is exponential
+    (2 ** attempt seconds).
+
+    The API key / endpoint env vars are read lazily inside chat_json, so
+    constructing a JudgeClient never requires a secret.
+    """
+
+    def __init__(self, model: str, timeout: int = 120, max_retries: int = 3) -> None:
+        self.model_spec = model
+        self.provider, self.model = parse_model(model)
+        self.timeout = timeout
+        self.max_retries = max_retries
+
+    def chat_json(self, system: str, user: str, max_tokens: int = 1024) -> dict:
+        """Send (system, user) to the provider and parse the first JSON object.
+
+        Raises on exhausted retries / unknown provider / unparseable output.
+        The judge module wraps every call in try/except, so a raise here becomes
+        a dropped vote rather than a crash.
+        """
+        last_exc: Exception | None = None
+        for attempt in range(self.max_retries):
+            try:
+                text = self._dispatch(system, user, max_tokens)
+                return _first_json_object(text)
+            except urllib.error.HTTPError as exc:
+                last_exc = exc
+                if exc.code not in _RETRY_STATUS or attempt == self.max_retries - 1:
+                    raise
+                time.sleep(_retry_delay(exc, attempt))
+            except (urllib.error.URLError, TimeoutError, ConnectionError) as exc:
+                last_exc = exc
+                if attempt == self.max_retries - 1:
+                    raise
+                time.sleep(2**attempt)
+        if last_exc is not None:
+            raise last_exc
+        raise RuntimeError("chat_json exhausted retries without a response")
+
+    def _dispatch(self, system: str, user: str, max_tokens: int) -> str:
+        """Route to the per-provider call and return the raw model text."""
+        if self.provider == "anthropic":
+            return self._anthropic(system, user, max_tokens)
+        if self.provider == "openai":
+            return self._openai(system, user, max_tokens)
+        if self.provider == "azure":
+            return self._azure(system, user, max_tokens)
+        if self.provider == "ollama":
+            return self._ollama(system, user, max_tokens)
+        raise ValueError(
+            f"unknown judge provider {self.provider!r} in {self.model_spec!r}; "
+            "use anthropic:/openai:/azure:/ollama:"
+        )
+
+    def _anthropic(self, system: str, user: str, max_tokens: int) -> str:
+        api_key = _env("ANTHROPIC_API_KEY")
+        if not api_key:
+            raise RuntimeError("ANTHROPIC_API_KEY not set")
+        body = {
+            "model": self.model,
+            "max_tokens": max_tokens,
+            "temperature": 0.0,
+            "system": system,
+            "messages": [{"role": "user", "content": user}],
+        }
+        headers = {"x-api-key": api_key, "anthropic-version": "2023-06-01"}
+        resp = _http_post_json("https://api.anthropic.com/v1/messages", headers, body, self.timeout)
+        text = next(
+            (b.get("text") for b in resp.get("content", []) if b.get("type") == "text"), None
+        )
+        if not text:
+            raise RuntimeError(f"judge returned no text: {resp}")
+        return text
+
+    def _openai(self, system: str, user: str, max_tokens: int) -> str:
+        api_key = _env("OPENAI_API_KEY")
+        if not api_key:
+            raise RuntimeError("OPENAI_API_KEY not set")
+        body = {
+            "model": self.model,
+            "max_tokens": max_tokens,
+            "temperature": 0.0,
+            "messages": [
+                {"role": "system", "content": system},
+                {"role": "user", "content": user},
+            ],
+        }
+        headers = {"Authorization": f"Bearer {api_key}"}
+        resp = _http_post_json(
+            "https://api.openai.com/v1/chat/completions", headers, body, self.timeout
+        )
+        return _extract_openai_text(resp)
+
+    def _azure(self, system: str, user: str, max_tokens: int) -> str:
+        endpoint = _env("AZURE_OPENAI_ENDPOINT")
+        api_key = _env("AZURE_OPENAI_API_KEY")
+        if not endpoint:
+            raise RuntimeError("AZURE_OPENAI_ENDPOINT not set")
+        if not api_key:
+            raise RuntimeError("AZURE_OPENAI_API_KEY not set")
+        api_version = _env("AZURE_OPENAI_API_VERSION") or "2024-06-01"
+        # Azure deployment lives in the URL path, not the JSON body.
+        url = (
+            f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/chat/completions"
+            f"?api-version={api_version}"
+        )
+        body = {
+            "max_tokens": max_tokens,
+            "temperature": 0.0,
+            "messages": [
+                {"role": "system", "content": system},
+                {"role": "user", "content": user},
+            ],
+        }
+        headers = {"api-key": api_key}
+        resp = _http_post_json(url, headers, body, self.timeout)
+        return _extract_openai_text(resp)
+
+    def _ollama(self, system: str, user: str, max_tokens: int) -> str:
+        host = _env("OLLAMA_HOST") or "http://localhost:11434"
+        body = {
+            "model": self.model,
+            "stream": False,
+            "options": {"temperature": 0.0, "num_predict": max_tokens},
+            "messages": [
+                {"role": "system", "content": system},
+                {"role": "user", "content": user},
+            ],
+        }
+        resp = _http_post_json(f"{host.rstrip('/')}/api/chat", {}, body, self.timeout)
+        text = (resp.get("message") or {}).get("content")
+        if not text:
+            raise RuntimeError(f"judge returned no text: {resp}")
+        return text
+
+
+class OpenAIEmbedder:
+    """OpenAI embeddings client over /v1/embeddings.
+
+    Reads OPENAI_API_KEY from the environment.  Default model: text-embedding-3-small.
+    """
+
+    def __init__(self, model: str = "text-embedding-3-small", timeout: int = 60) -> None:
+        self.model = model
+        self.timeout = timeout
+
+    def embed(self, texts: list[str]) -> np.ndarray:
+        api_key = _env("OPENAI_API_KEY")
+        if not api_key:
+            raise RuntimeError("OPENAI_API_KEY not set")
+        headers = {"Authorization": f"Bearer {api_key}"}
+        body = {"model": self.model, "input": texts}
+        resp = _http_post_json("https://api.openai.com/v1/embeddings", headers, body, self.timeout)
+        data = resp.get("data", [])
+        vectors = [item["embedding"] for item in sorted(data, key=lambda x: x["index"])]
+        return np.asarray(vectors, dtype=np.float32)
+
+
+class AzureOpenAIEmbedder:
+    """Azure OpenAI embeddings client.
+
+    Reads AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_KEY, and optionally
+    AZURE_OPENAI_API_VERSION from the environment.  The model name is the
+    deployment name.  Default model: text-embedding-3-small.
+    """
+
+    def __init__(self, model: str = "text-embedding-3-small", timeout: int = 60) -> None:
+        self.model = model
+        self.timeout = timeout
+
+    def embed(self, texts: list[str]) -> np.ndarray:
+        endpoint = _env("AZURE_OPENAI_ENDPOINT")
+        api_key = _env("AZURE_OPENAI_API_KEY")
+        if not endpoint:
+            raise RuntimeError("AZURE_OPENAI_ENDPOINT not set")
+        if not api_key:
+            raise RuntimeError("AZURE_OPENAI_API_KEY not set")
+        api_version = _env("AZURE_OPENAI_API_VERSION") or "2024-06-01"
+        url = (
+            f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/embeddings"
+            f"?api-version={api_version}"
+        )
+        headers = {"api-key": api_key}
+        vectors = self._embed_with_split(texts, url, headers)
+        return np.asarray(vectors, dtype=np.float32)
+
+    def _embed_with_split(self, texts: list[str], url: str, headers: dict) -> list[list[float]]:
+        """Send texts in one request; on HTTP 400 split in half and retry each half."""
+        try:
+            resp = _http_post_json(url, headers, {"input": texts}, self.timeout)
+            data = resp.get("data", [])
+            return [item["embedding"] for item in sorted(data, key=lambda x: x["index"])]
+        except urllib.error.HTTPError as exc:
+            if exc.code == 400 and len(texts) > 1:
+                mid = len(texts) // 2
+                left = self._embed_with_split(texts[:mid], url, headers)
+                right = self._embed_with_split(texts[mid:], url, headers)
+                return left + right
+            raise
+
+
+class OllamaEmbedder:
+    """Local Ollama embedding client (default model bge-m3) over /api/embeddings.
+
+    Posts one prompt per request and stacks the returned vectors into a numpy
+    array.  Constructing it touches no network; the host is fixed at construction
+    time: the ``host`` argument, else $OLLAMA_HOST, else http://localhost:11434.
+    """
+
+    def __init__(self, model: str = "bge-m3", host: str | None = None, timeout: int = 60) -> None:
+        self.model = model
+        self.host = (host or _env("OLLAMA_HOST") or "http://localhost:11434").rstrip("/")
+        self.timeout = timeout
+
+    def embed(self, texts: list[str]) -> np.ndarray:
+        """Embed a list of strings -> float32 ndarray of shape (len(texts), dim)."""
+        vectors: list[list[float]] = []
+        for text in texts:
+            body = {"model": self.model, "prompt": text}
+            resp = _http_post_json(f"{self.host}/api/embeddings", {}, body, self.timeout)
+            vectors.append(resp["embedding"])
+        return np.asarray(vectors, dtype=np.float32)
+
+
+def build_embedder(spec: str):
+    """Return an ``embed_fn(list[str]) -> np.ndarray`` for an embedder spec.
+
+    Dispatch is on the provider prefix of a "<provider>:<model>" spec:
+    - "ollama" / "ollama:<model>" / a bare "<model>" -> OllamaEmbedder (default "bge-m3").
+    - "openai:<model>" / "azure:<model>" -> OpenAIEmbedder / AzureOpenAIEmbedder.
+    - any other provider -> NotImplementedError (the extension point).
+
+    Add a new backend by adding a branch here.
+    """
+    if (spec or "").strip() == "ollama":  # bare provider, no model -> default model
+        return OllamaEmbedder("bge-m3").embed
+    provider, model = parse_model(spec)
+    if provider in ("unknown", "ollama"):  # bare "<model>" or "ollama:<model>"
+        return OllamaEmbedder(model or "bge-m3").embed
+    if provider == "openai":
+        return OpenAIEmbedder(model or "text-embedding-3-small").embed
+    if provider == "azure":
+        return AzureOpenAIEmbedder(model or "text-embedding-3-small").embed
+    raise NotImplementedError(
+        f"embedder backend {provider!r} not implemented yet; add it in build_embedder()"
+    )
+
+
+def cosine(a, b) -> float:
+    """Cosine similarity between two 1-D vectors; 0.0 if either is the zero vector."""
+    a = np.asarray(a, dtype=np.float64).ravel()
+    b = np.asarray(b, dtype=np.float64).ravel()
+    na = float(np.linalg.norm(a))
+    nb = float(np.linalg.norm(b))
+    if na == 0.0 or nb == 0.0:
+        return 0.0
+    return float(np.dot(a, b) / (na * nb))
diff --git a/fireflyframework_agentic/evaluation/matcher.py b/fireflyframework_agentic/evaluation/matcher.py
index 2f5065df..b4d81f44 100644
--- a/fireflyframework_agentic/evaluation/matcher.py
+++ b/fireflyframework_agentic/evaluation/matcher.py
@@ -29,12 +29,7 @@
 
 import numpy as np
 
-
-def cosine(a, b) -> float:
-    """Cosine similarity between two vectors."""
-    a = np.asarray(a, dtype=float)
-    b = np.asarray(b, dtype=float)
-    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))
+from fireflyframework_agentic.evaluation.judge_client import cosine
 
 
 def tokens(text: str) -> list[str]:

From 1906ede934bb82cca1b127341a2f457a66e59a3c Mon Sep 17 00:00:00 2001
From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com>
Date: Thu, 18 Jun 2026 23:56:09 +0200
Subject: [PATCH 07/48] feat(evaluation): add champion tracking and flyeval CLI
 (#274)

* feat(evaluation): add ChampionRecord and champion management functions

* feat(evaluation): add run_config_snapshot for flyradar run configuration capture

* feat(evaluation): add flyeval CLI with gate, aa-band, day-zero, invalidate subcommands

---------

Co-authored-by: miguelgfierro <miguelgfierro@users.noreply.github.com>
---
 .../evaluation/champion.py                    | 169 ++++++
 fireflyframework_agentic/evaluation/cli.py    | 573 ++++++++++++++++++
 .../evaluation/run_config_snapshot.py         | 160 +++++
 3 files changed, 902 insertions(+)
 create mode 100644 fireflyframework_agentic/evaluation/champion.py
 create mode 100644 fireflyframework_agentic/evaluation/cli.py
 create mode 100644 fireflyframework_agentic/evaluation/run_config_snapshot.py

diff --git a/fireflyframework_agentic/evaluation/champion.py b/fireflyframework_agentic/evaluation/champion.py
new file mode 100644
index 00000000..239429eb
--- /dev/null
+++ b/fireflyframework_agentic/evaluation/champion.py
@@ -0,0 +1,169 @@
+# Copyright 2026 Firefly Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Per-corpus champion management.
+
+Champions are per-corpus — mode 2A (conformance) and mode 2B (extraction)
+metrics live in incommensurable spaces.  There is no global champion.
+See EVALUATION_FRAMEWORK.md (per-corpus champions).
+
+The historical fake-100% incident: banca-cordobesa/baseline.json was populated
+with a champion scored against an EMPTY must-find registry.  The EMPTY_MUST_FIND
+guard in G1 prevents a recurrence; the invalidate_champion() function provides
+the corrective action when it does happen.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import json
+from dataclasses import dataclass, field
+from pathlib import Path
+
+
+@dataclass
+class ChampionRecord:
+    """Per-corpus champion, stored as 'champion' key in baseline.json."""
+
+    corpus: str
+    run_id: str
+    model_id: str
+    registry_sha256: str
+    scores: dict  # {metric_name: float}
+    aa_noise: dict = field(default_factory=dict)  # {metric_name: noise_floor}
+    is_day_zero: bool = False
+    human_sign_offs: list[str] = field(default_factory=list)
+    config: dict = field(default_factory=dict)  # evaluation config snapshot
+    corpus_sha256: str = ""  # pin of the evidence corpus the champion was verified against
+
+    def primary_metric(self) -> str:  # first key of ``scores`` in dict order; "" when empty
+        return next(iter(self.scores)) if self.scores else ""
+
+    def primary_score(self) -> float:  # score stored under primary_metric(); 0.0 if no scores
+        return float(self.scores.get(self.primary_metric(), 0.0))
+
+
+def load_champion(baseline_path: str | Path) -> ChampionRecord | None:
+    """Load the current per-corpus champion from baseline.json.
+
+    Returns None when:
+    - The file does not exist (normal Day-Zero state).
+    - The file exists but 'champion' is null or missing (post-invalidation state).
+    """
+    path = Path(baseline_path)
+    if not path.exists():
+        return None
+    raw = json.loads(path.read_text(encoding="utf-8"))
+    champ_raw = raw.get("champion")
+    if champ_raw is None:
+        return None
+    return ChampionRecord(
+        corpus=champ_raw["corpus"],  # the first four keys are required: KeyError if absent
+        run_id=champ_raw["run_id"],
+        model_id=champ_raw["model_id"],
+        registry_sha256=champ_raw["registry_sha256"],
+        scores=champ_raw.get("scores", {}),  # the remaining keys fall back to empty defaults
+        aa_noise=champ_raw.get("aa_noise", {}),
+        is_day_zero=champ_raw.get("is_day_zero", False),
+        human_sign_offs=champ_raw.get("human_sign_offs", []),
+        config=champ_raw.get("config", {}),
+        corpus_sha256=champ_raw.get("corpus_sha256", ""),
+    )
+
+
+def save_champion(
+    baseline_path: str | Path,
+    champion: ChampionRecord,
+    *,
+    summary: str = "",
+    date: str = "",
+) -> None:
+    """Persist a new champion and append a promotion log entry.
+
+    Reads the existing file if it exists (to preserve the log), then rewrites it with
+    only 'champion' and 'promotion_log'.  The promotion log is append-only.
+    """
+    path = Path(baseline_path)
+    if path.exists():
+        raw = json.loads(path.read_text(encoding="utf-8"))
+        log = raw.get("promotion_log", [])
+        prev_run = raw.get("champion", {})
+        prev_run_id = prev_run.get("run_id") if isinstance(prev_run, dict) else None  # null-safe
+    else:
+        log = []
+        prev_run_id = None
+
+    log.append(
+        {
+            "date": date or "unknown",  # an empty date is logged as "unknown"
+            "from": prev_run_id,
+            "to": champion.run_id,
+            "label": "day-zero" if champion.is_day_zero else "promotion",
+            "summary": summary,
+        }
+    )
+
+    payload = {
+        "champion": {
+            "corpus": champion.corpus,
+            "run_id": champion.run_id,
+            "model_id": champion.model_id,
+            "registry_sha256": champion.registry_sha256,
+            "scores": champion.scores,
+            "aa_noise": champion.aa_noise,
+            "is_day_zero": champion.is_day_zero,
+            "human_sign_offs": champion.human_sign_offs,
+            "config": champion.config,
+            "corpus_sha256": champion.corpus_sha256,
+        },
+        "promotion_log": log,
+    }
+    path.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")
+
+
+def invalidate_champion(
+    baseline_path: str | Path,
+    *,
+    reason: str,
+    date: str = "",
+) -> None:
+    """Null out the current champion and record the invalidation reason.
+
+    Used when a champion was locked in against an empty or tampered registry (the
+    banca-cordobesa fake-100% incident).  Silently does nothing if the file is missing.
+    """
+    path = Path(baseline_path)
+    if not path.exists():
+        return  # nothing to invalidate; no file is created
+    raw = json.loads(path.read_text(encoding="utf-8"))
+    log = raw.get("promotion_log", [])
+    prev_run = raw.get("champion", {})
+    prev_run_id = prev_run.get("run_id") if isinstance(prev_run, dict) else None
+    log.append(
+        {
+            "date": date or "unknown",
+            "from": prev_run_id,
+            "to": None,
+            "label": "INVALIDATED",
+            "summary": reason,
+        }
+    )
+    raw["champion"] = None  # other top-level keys already in the file are kept as-is
+    raw["promotion_log"] = log
+    path.write_text(json.dumps(raw, indent=2, ensure_ascii=False), encoding="utf-8")
+
+
+def input_hash(result_dict: dict) -> str:
+    """Stable 16-char SHA-256 prefix of the DiscoveryResult for provenance."""
+    canonical = json.dumps(result_dict, sort_keys=True, ensure_ascii=False)  # key-order independent
+    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()[:16]
diff --git a/fireflyframework_agentic/evaluation/cli.py b/fireflyframework_agentic/evaluation/cli.py
new file mode 100644
index 00000000..7ac868d9
--- /dev/null
+++ b/fireflyframework_agentic/evaluation/cli.py
@@ -0,0 +1,573 @@
+# Copyright 2026 Firefly Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""flyeval — FlyRadar Lean Core evaluation CLI.
+
+Usage
+-----
+    flyeval gate      --result R.json --registry REG.json [--baseline B.json] [--judge-model P:M]
+    flyeval aa-band   --results R1.json R2.json ... --registry REG.json
+    flyeval day-zero  --result R.json --registry REG.json --baseline B.json --signoffs 2
+    flyeval invalidate --baseline B.json --reason "..."
+
+The deterministic gates G1-G3 + G5 (human sign-off) decide the verdict: gate and
+day-zero (which also requires --corpus) exit 0 on PROMOTE, 1 on HOLD, 2 on a usage
+error.  G4 (the --judge-model LLM-as-a-Judge, on by default, --no-judge to skip) is
+non-blocking — it prints advisory signals and never changes the verdict or exit code.
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import os
+import sys
+from pathlib import Path
+
+from fireflyframework_agentic.evaluation import __version__
+from fireflyframework_agentic.evaluation.champion import (
+    ChampionRecord,
+    invalidate_champion,
+    load_champion,
+    save_champion,
+)
+from fireflyframework_agentic.evaluation.corpus import load_corpus
+from fireflyframework_agentic.evaluation.gates import g2_recall_precision, run_gates
+from fireflyframework_agentic.evaluation.judge import run_judge
+from fireflyframework_agentic.evaluation.judge_client import build_embedder
+from fireflyframework_agentic.evaluation.matcher import matches
+from fireflyframework_agentic.evaluation.registry import load_registry
+from fireflyframework_agentic.evaluation.scorecard import render_scorecard, verdict as get_verdict
+from fireflyframework_agentic.evaluation.stats import aa_band, left_skew_flag
+
+
+def _load_json(path: str) -> dict:  # read ``path`` as UTF-8 text and parse it as JSON
+    return json.loads(Path(path).read_text(encoding="utf-8"))
+
+
+def _lexical_missed_ids(result: dict, registry) -> list[str]:
+    """Scored (non-L3) real-item ids matched by no finding — the G2 lexical misses G4 recovers."""
+    evidence_index = {ev["id"]: ev for ev in result.get("evidence_index", []) if ev.get("id")}
+    findings = result.get("findings", [])
+    scored = [i for i in registry.real_items if i.tier != "L3"]  # L3 items are not scored
+    return [i.id for i in scored if not any(matches(f, i, evidence_index) for f in findings)]
+
+
+def _read_experiment_config(result_path: str) -> dict | None:
+    """Read the experiment_configuration.json recorded next to the run's output.json.
+
+    The experiment config records how the run was generated; it is authored by the
+    generation step at run time.  Evaluation only reads it for display and never
+    writes or overwrites it.  Returns None when the run has no recorded config.
+    """
+    path = Path(result_path).parent / "experiment_configuration.json"  # sibling of the result
+    if not path.exists():
+        return None
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def _write_eval_config(result_path: str, config: dict) -> Path:
+    """Write evaluation_configuration.json next to the run's output.json.
+
+    The evaluation config is authored by flyeval at gate time (registry/corpus SHAs,
+    recall metric, floors, judge settings), so unlike the experiment config it is
+    owned here and safe to (over)write each run.  It mirrors the block embedded in
+    the scorecard, as a machine-readable artifact.
+    """
+    path = Path(result_path).parent / "evaluation_configuration.json"  # sibling of the result
+    path.write_text(json.dumps(config, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
+    return path  # returned so the caller can report where it was written
+
+
+def _eval_config(args, registry, corpus=None) -> dict:
+    """Capture the run's evaluation configuration for provenance.
+
+    Uses getattr defaults so it works for both `gate` (has every flag) and
+    `day-zero` (lacks the gate-only flags, falling back to the lexical/no-judge
+    defaults, which honestly reflects how day-zero scores).
+    """
+    jm = getattr(args, "judge_model", None)  # None: --no-judge, or a subcommand without G4
+    baseline = getattr(args, "baseline", None)
+    tau = getattr(args, "tau", 0.70)
+    return {
+        "evaluator_version": __version__,
+        "registry_sha256": registry.sha256(),
+        "corpus_sha256": corpus.sha256 if corpus else None,
+        "model_id": getattr(args, "model_id", None) or "unknown",
+        "gates": {
+            "G1": {
+                "name": "Structural & Safe",
+                "pii_list": getattr(args, "pii_list", None) or [],
+                "metrics": {
+                    "empty_must_find": "registry has >=1 must-find item; guards the fake-100% "
+                    "champion (EMPTY_MUST_FIND)",
+                    "registry_sha256_pin": "loaded registry matches its file hash (GOLD_DRIFT)",
+                    "corpus_sha256_pin": "corpus matches its hash when supplied (CORPUS_DRIFT)",
+                    "schema_valid": "required top-level keys present in the result "
+                    "(SCHEMA_INVALID)",
+                    "pii_non_disclosure": "no corpus PII name appears in any finding/report text "
+                    "(PII_LEAK)",
+                },
+            },
+            "G2": {
+                "name": "Recall & Precision",
+                "recall_metric": getattr(args, "recall_metric", "lexical"),
+                "recall_floor": getattr(args, "recall_floor", 0.70),
+                "tau": tau,
+                "tau_nc": getattr(args, "tau_nc", 0.85),
+                "embedder": getattr(args, "embedder", None),
+                "metrics": {
+                    "lexical_recall": "token-overlap recall (always reported)",
+                    "semantic_recall": "embedding-similarity recall at >= tau (needs embedder)",
+                    "hybrid_recall": "per item, a lexical OR semantic match",
+                    "per_tier_recall": "hit/total per tier L0-L3; an L0 miss blocks",
+                    "nc_precision": "negative-control items wrongly emitted; an NC hit blocks",
+                    "finding_redundancy_rate": "fraction of findings duplicating another's topic",
+                },
+            },
+            "G3": {
+                "name": "Grounded",
+                "grounding_floor": getattr(args, "grounding_floor", 0.90),
+                "human_spot_check_n": 5,  # fixed value, not read from args
+                "corpus_verification": corpus is not None,
+                "metrics": {
+                    "grounding_pct": "findings whose cited excerpt shares a topic token; blocks "
+                    "below grounding_floor",
+                    "evidence_verified": "cited excerpts located in the actual corpus "
+                    "(when supplied)",
+                    "evidence_fabricated": "populated excerpts not found in their cited source "
+                    "(EVIDENCE_FABRICATED)",
+                    "evidence_source_unknown": "locators resolving to no corpus document "
+                    "(EVIDENCE_SOURCE_UNKNOWN)",
+                    "excerpt_fill_rate": "evidence entries carrying a populated excerpt",
+                    "source_coverage": "distinct corpus documents cited",
+                },
+            },
+            "G4": {
+                "name": "LLM Judge (advisory, non-blocking)",
+                "judge_model": jm,  # the judge_* fields below are None when no judge ran
+                "judge_runs": getattr(args, "judge_runs", 1) if jm else None,
+                "judge_concurrency": getattr(args, "judge_concurrency", 1) if jm else None,
+                "judge_temperature": 0.0 if jm else None,
+                "tau": tau if jm else None,
+                "metrics": {
+                    "faithfulness": "each finding's claim entailed by its cited evidence",
+                    "numeric_temporal_fidelity": "numbers and dates in findings match the evidence",
+                    "citation_relevance": "cited evidence refs are on-topic (context precision)",
+                    "nc_semantic_precision": "negative-control items semantically asserted",
+                    "fabricated_entity": "named entities absent from the corpus",
+                    "contradiction": "findings contradicting the evidence or each other",
+                    "open_gap": "a consequential issue the output failed to surface",
+                    "actionability": "proposed actions are specific and actionable",
+                    "severity_calibration": "stated severity matches the evidence",
+                    "answer_relevancy": "output addresses the workspace intention",
+                    "source_coverage": "distinct corpus documents cited (deterministic)",
+                    "excerpt_fill_rate": "evidence entries with a populated excerpt "
+                    "(deterministic)",
+                },
+            },
+            "G5": {
+                "name": "No-regression / promotion",
+                "is_day_zero": baseline is None,  # NOTE(review): cmd_gate uses `champion is None`
+                "human_signed_off": getattr(args, "human_signed_off", False),  # no day-zero flag
+                "signoffs": getattr(args, "signoffs", 0),
+                "baseline": baseline,
+                "baseline_sha256": _file_sha256(baseline) if baseline else None,
+                "metrics": {
+                    "improvements": "metrics beating the champion by more than the AA noise band",
+                    "regressions": "metrics that regressed versus the champion",
+                    "noise_band": "per-metric AA noise floor a candidate must exceed",
+                    "guardrail_regression": "any guardrail metric that dropped",
+                    "signoffs": "independent human sign-offs recorded",
+                },
+            },
+        },
+    }
+
+
+def _file_sha256(path: str) -> str | None:
+    """SHA-256 of a file's bytes, or None when it can't be read."""
+    try:
+        return hashlib.sha256(Path(path).read_bytes()).hexdigest()
+    except OSError:  # e.g. a baseline path that does not exist yet
+        return None
+
+
+# ── gate ──────────────────────────────────────────────────────────────────────
+
+
+def cmd_gate(args: argparse.Namespace) -> int:  # exit: 0 PROMOTE, 1 otherwise, 2 usage error
+    if getattr(args, "no_judge", False):
+        args.judge_model = None  # explicit opt-out; G4 runs by default otherwise
+    result = _load_json(args.result)
+    registry = load_registry(args.registry)
+    corpus = load_corpus(args.corpus) if args.corpus else None
+    champion = load_champion(args.baseline) if args.baseline else None
+    champion_scores = champion.scores if champion else None
+    aa_noise = champion.aa_noise if champion else None
+
+    embed_fn = build_embedder(args.embedder) if args.embedder else None
+
+    if args.recall_metric in ("hybrid", "semantic") and embed_fn is None:
+        print(
+            f"ERROR: --recall-metric {args.recall_metric} requires --embedder.\n"
+            "  Example: --embedder openai:text-embedding-3-small",
+            file=sys.stderr,
+        )
+        return 2
+
+    gate_results = run_gates(
+        result,
+        registry,
+        args.registry,
+        pii_list=args.pii_list or [],
+        recall_floor=args.recall_floor,
+        grounding_floor=args.grounding_floor,
+        champion_scores=champion_scores,
+        aa_noise=aa_noise,
+        is_day_zero=(champion is None),  # no --baseline, no file, or a nulled champion
+        human_signed_off=args.human_signed_off,
+        signoff_count=args.signoffs,
+        embed_fn=embed_fn,
+        tau=args.tau,
+        recall_metric=args.recall_metric,
+        tau_nc=args.tau_nc,
+        corpus=corpus,
+    )
+
+    # G4 — on by default, non-blocking.  Skipped only with --no-judge; never affects the verdict.
+    advisory = None
+    if args.judge_model:
+        champion_result = _load_json(args.champion_result) if args.champion_result else None
+        advisory = run_judge(
+            result,
+            registry,
+            judge_model=args.judge_model,
+            runs=args.judge_runs,
+            concurrency=args.judge_concurrency,
+            pipeline_model=args.model_id or "",
+            champion_result=champion_result,
+            embed_fn=embed_fn,
+            tau=args.tau,
+            lexical_missed_ids=_lexical_missed_ids(result, registry),
+        )
+
+    config = _eval_config(args, registry, corpus)
+    _write_eval_config(args.result, config)  # side effect: writes next to the result file
+    experiment_config = _read_experiment_config(args.result)
+    scorecard = render_scorecard(
+        gate_results,
+        corpus=registry.corpus,
+        model_id=args.model_id or "unknown",
+        run_id=args.run_id or "run",
+        is_self_graded=True,
+        kappa_advisory=registry.is_kappa_advisory(),
+        evidence_unverified=corpus is None,
+        advisory=advisory,
+        config=config,
+        experiment_config=experiment_config,
+    )
+    print(scorecard)
+
+    v = get_verdict(gate_results)  # computed from gate_results only; advisory is not passed
+    return 0 if v == "PROMOTE" else 1
+
+
+# ── aa-band ───────────────────────────────────────────────────────────────────
+
+
+def cmd_aa_band(args: argparse.Namespace) -> int:  # exit: 0 ok, 1 < 2 usable runs, 2 usage
+    registry = load_registry(args.registry)
+
+    if args.recall_metric in ("hybrid", "semantic") and not args.embedder:
+        print(
+            f"ERROR: --recall-metric {args.recall_metric} requires --embedder.\n"
+            "  Example: --embedder openai:text-embedding-3-small",
+            file=sys.stderr,
+        )
+        return 2
+
+    embed_fn = build_embedder(args.embedder) if args.embedder else None
+    corpus = load_corpus(args.corpus) if args.corpus else None
+    scores: list[float] = []
+
+    for rp in args.results:
+        result = _load_json(rp)
+        g2 = g2_recall_precision(
+            result, registry,
+            recall_metric=args.recall_metric, embed_fn=embed_fn,
+            tau=args.tau, tau_nc=args.tau_nc,
+            corpus=corpus,
+        )
+        if g2.passed or g2.details.get("recall") is not None:  # skip runs with no recall value
+            scores.append(g2.details.get("recall", 0.0))
+
+    if len(scores) < 2:
+        print(
+            f"ERROR: need >= 2 runs for aa_band; got {len(scores)}.  "
+            "Make sure the registry is non-empty and the runs are valid.",
+            file=sys.stderr,
+        )
+        return 1
+
+    band = aa_band(scores)
+    high_var = left_skew_flag(scores)
+    print(f"A/A noise band (95th-pct pairwise delta): {band:.4f}")
+    print(f"Scores across reruns: {[round(s, 4) for s in scores]}")
+    if high_var:  # a warning only; the exit code stays 0
+        print("WARNING: HIGH_VARIANCE — min < median - 0.10.  Investigate before using this band.")
+    return 0
+
+
+# ── day-zero ──────────────────────────────────────────────────────────────────
+
+
+def cmd_day_zero(args: argparse.Namespace) -> int:  # exit: 0 PROMOTE, 1 otherwise, 2 no --corpus
+    result = _load_json(args.result)
+    registry = load_registry(args.registry)
+
+    if not args.corpus:
+        print(
+            "ERROR: day-zero (a promotion decision) requires --corpus for evidence\n"
+            "verification — a champion must not be minted on unverified evidence.\n"
+            "  Supply the run's input bundle, e.g.  --corpus experiments/<corpus>/input.json",
+            file=sys.stderr,
+        )
+        return 2
+    corpus = load_corpus(args.corpus)
+
+    if args.signoffs < 2:  # checked before any gate runs
+        print(
+            f"ERROR: Day-Zero requires 2 independent human sign-offs; got {args.signoffs}.",
+            file=sys.stderr,
+        )
+        return 1
+
+    gate_results = run_gates(
+        result,
+        registry,
+        args.registry,
+        is_day_zero=True,
+        human_signed_off=True,  # implied by the sign-off count check above
+        signoff_count=args.signoffs,
+        corpus=corpus,
+    )
+
+    config = _eval_config(args, registry, corpus)
+    _write_eval_config(args.result, config)  # side effect: writes next to the result file
+    experiment_config = _read_experiment_config(args.result)
+    v = get_verdict(gate_results)
+    scorecard = render_scorecard(
+        gate_results,
+        corpus=registry.corpus,
+        model_id=args.model_id or "unknown",
+        run_id=args.run_id or "day-zero",
+        is_self_graded=True,
+        kappa_advisory=registry.is_kappa_advisory(),
+        config=config,
+        experiment_config=experiment_config,
+    )
+    print(scorecard)
+
+    if v == "PROMOTE" and args.baseline:  # without --baseline nothing is persisted
+        g2 = next((g for g in gate_results if g.gate == "G2"), None)
+        g3 = next((g for g in gate_results if g.gate == "G3"), None)
+        scores = {}
+        if g2:
+            scores["recall"] = g2.details.get("recall", 0.0)
+        if g3:
+            scores["grounding_pct"] = g3.details.get("grounding_pct", 0.0)
+
+        champion = ChampionRecord(
+            corpus=registry.corpus,
+            run_id=args.run_id or "day-zero",
+            model_id=args.model_id or "unknown",
+            registry_sha256=registry.sha256(),
+            scores=scores,
+            is_day_zero=True,
+            human_sign_offs=[f"signoff-{i + 1}" for i in range(args.signoffs)],  # placeholder ids
+            config=config,
+            corpus_sha256=corpus.sha256,
+        )
+        save_champion(
+            args.baseline,
+            champion,
+            summary=f"Day-Zero champion for {registry.corpus}",
+            date=args.date or "unknown",
+        )
+        print(f"\nDay-Zero champion saved to {args.baseline}")
+
+    return 0 if v == "PROMOTE" else 1
+
+
+# ── invalidate ────────────────────────────────────────────────────────────────
+
+
+def cmd_invalidate(args: argparse.Namespace) -> int:  # always exits 0
+    invalidate_champion(args.baseline, reason=args.reason, date=args.date or "unknown")
+    print(f"Champion invalidated in {args.baseline}.  Reason: {args.reason}")
+    return 0  # NOTE(review): success is also reported when the baseline file is missing
+
+
+# ── parser ────────────────────────────────────────────────────────────────────
+
+
+def build_parser() -> argparse.ArgumentParser:  # gate / aa-band / day-zero / invalidate
+    parser = argparse.ArgumentParser(
+        prog="flyeval",
+        description="FlyRadar Lean Core eval: G1-G3 + G5 deterministic, G4 judge on by default",
+    )
+    sub = parser.add_subparsers(dest="command", required=True)
+
+    def _add_common(p: argparse.ArgumentParser) -> None:  # flags shared by gate and day-zero
+        p.add_argument("--result", required=True, help="Path to DiscoveryResult JSON")
+        p.add_argument("--registry", required=True, help="Path to lean-1 registry JSON")
+        p.add_argument(
+            "--corpus",
+            help="Path to the run's input.json corpus bundle — enables deterministic "
+            "evidence verification (required for day-zero; without it, gate runs "
+            "carry an EVIDENCE UNVERIFIED disclosure)",
+        )
+        p.add_argument("--baseline", help="Path to baseline.json (per-corpus champion store)")
+        p.add_argument("--model-id", default="unknown")
+        p.add_argument("--run-id", default="run")
+        p.add_argument("--date", default="", help="ISO date for promotion log")
+
+    # gate
+    p_gate = sub.add_parser("gate", help="Run the gates and print a scorecard")
+    _add_common(p_gate)
+    p_gate.add_argument("--recall-floor", type=float, default=0.70)
+    p_gate.add_argument("--grounding-floor", type=float, default=0.90)
+    p_gate.add_argument("--pii-list", nargs="*", default=[])
+    p_gate.add_argument(
+        "--embedder",
+        default=os.environ.get("FLYEVAL_EMBEDDER"),
+        help="opt-in embedder spec for the semantic recall path "
+        '(e.g. "azure:text-embedding-3-small"); omit for pure-lexical recall. '
+        "Env: FLYEVAL_EMBEDDER",
+    )
+    p_gate.add_argument(
+        "--recall-metric",
+        choices=["lexical", "semantic", "hybrid"],
+        default=os.environ.get("FLYEVAL_RECALL_METRIC", "hybrid"),  # NOTE(review): env unchecked?
+        help="which recall metric GATES (default hybrid; hybrid/semantic require --embedder). "
+        "Env: FLYEVAL_RECALL_METRIC",
+    )
+    p_gate.add_argument(
+        "--tau",
+        type=float,
+        default=float(os.environ.get("FLYEVAL_TAU", "0.70")),  # converted when the parser is built
+        help="cosine similarity threshold for the semantic recall path (real items). "
+        "Env: FLYEVAL_TAU",
+    )
+    p_gate.add_argument(
+        "--tau-nc",
+        type=float,
+        default=float(os.environ.get("FLYEVAL_TAU_NC", "0.85")),
+        help="cosine similarity threshold for NC item detection (higher; no source anchor). "
+        "Env: FLYEVAL_TAU_NC",
+    )
+    p_gate.add_argument("--human-signed-off", action="store_true")
+    p_gate.add_argument("--signoffs", type=int, default=0)
+    p_gate.add_argument(
+        "--judge-model",
+        default=os.environ.get("FLYEVAL_JUDGE_MODEL", "anthropic:claude-sonnet-4-6"),
+        help="provider:model for the non-blocking G4 LLM-as-a-Judge (e.g. azure:gpt-4o). "
+        "Runs by default; pass --no-judge to skip G4. Env: FLYEVAL_JUDGE_MODEL",
+    )
+    p_gate.add_argument(
+        "--no-judge",
+        action="store_true",
+        help="skip the G4 LLM-as-a-Judge (it runs by default).",
+    )
+    p_gate.add_argument(
+        "--judge-runs",
+        type=int,
+        default=int(os.environ.get("FLYEVAL_JUDGE_RUNS", "1")),
+        help="G4 judge runs; the median of numeric scores is kept (odd recommended). "
+        "Env: FLYEVAL_JUDGE_RUNS",
+    )
+    p_gate.add_argument(
+        "--judge-concurrency",
+        type=int,
+        default=int(os.environ.get("FLYEVAL_JUDGE_CONCURRENCY", "1")),
+        help="bounded fan-out for the per-item G4 [J] metrics (1 = sequential; "
+        ">=2 runs each metric's chat calls across a thread pool, order preserved). "
+        "Env: FLYEVAL_JUDGE_CONCURRENCY",
+    )
+    p_gate.add_argument(
+        "--champion-result",
+        help="Path to the champion's output.json for the G4 comparative-review metric",
+    )
+    p_gate.set_defaults(func=cmd_gate)
+
+    # aa-band
+    p_aa = sub.add_parser("aa-band", help="Compute A/A noise band from champion reruns")
+    p_aa.add_argument(
+        "--results",
+        nargs="+",
+        required=True,
+        help="Paths to champion-rerun result JSON files (>= 2)",
+    )
+    p_aa.add_argument("--registry", required=True)
+    p_aa.add_argument(
+        "--recall-metric",
+        choices=["lexical", "semantic", "hybrid"],
+        default="hybrid",  # unlike gate, the aa-band flags do not read FLYEVAL_* env vars
+        help="recall metric to use — must match the champion's metric (default hybrid; "
+        "hybrid/semantic require --embedder)",
+    )
+    p_aa.add_argument(
+        "--embedder",
+        default=None,
+        help="embedder spec for semantic/hybrid recall (e.g. ollama:bge-m3)",
+    )
+    p_aa.add_argument("--tau", type=float, default=0.70)
+    p_aa.add_argument("--tau-nc", type=float, default=0.85)
+    p_aa.add_argument(
+        "--corpus",
+        help="Path to input.json — must match the gate's corpus setting so the "
+        "band is computed under the same evidence filtering as the champion",
+    )
+    p_aa.set_defaults(func=cmd_aa_band)
+
+    # day-zero
+    p_dz = sub.add_parser("day-zero", help="Promote the inaugural champion (Day-Zero protocol)")
+    _add_common(p_dz)
+    p_dz.add_argument(
+        "--signoffs",
+        type=int,
+        default=0,
+        help="Number of independent human sign-offs collected (need 2)",
+    )
+    p_dz.set_defaults(func=cmd_day_zero)
+
+    # invalidate
+    p_inv = sub.add_parser("invalidate", help="Invalidate the current champion")
+    p_inv.add_argument("--baseline", required=True)
+    p_inv.add_argument("--reason", required=True)
+    p_inv.add_argument("--date", default="")
+    p_inv.set_defaults(func=cmd_invalidate)
+
+    return parser
+
+
+def main() -> None:  # CLI entry point: parse argv, dispatch to the subcommand handler
+    parser = build_parser()
+    args = parser.parse_args()
+    sys.exit(args.func(args))  # the handler's int return value becomes the exit code
+
+
+if __name__ == "__main__":
+    main()
diff --git a/fireflyframework_agentic/evaluation/run_config_snapshot.py b/fireflyframework_agentic/evaluation/run_config_snapshot.py
new file mode 100644
index 00000000..db543129
--- /dev/null
+++ b/fireflyframework_agentic/evaluation/run_config_snapshot.py
@@ -0,0 +1,160 @@
+# Copyright 2026 Firefly Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Capture the effective flyradar run configuration into experiment_configuration.json.
+
+Non-invasive snapshot: it records how a run was generated by reading what flyradar
+already exposes as data — the request options the caller sent, the ``/api/v1/version``
+endpoint, ``RadarSettings``, and the prompt catalog — without modifying flyradar.  The
+snapshot is written next to the run's ``output.json`` at generation time, which is the
+moment the configuration is known.
+
+This is the bridge: the durable fix is for flyradar to stamp the same config into
+``DiscoveryResult`` itself (the one place that knows the effective values and cannot
+drift).  See the "flyradar improvements" issue.  ``temperature`` and ``seed`` are not
+exposed by ``RadarSettings`` and are recorded as ``null`` here.
+
+Usage:
+    cd flyradar_experiments
+    set -a && source .env && set +a
+    uv run python -m fireflyframework_agentic.evaluation.run_config_snapshot \
+        --output-dir experiments/bbva_españa/runs/2026-06-12-sonnet-01 \
+        --options    request_options.json \
+        --commit     c107918
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import urllib.request
+from importlib.resources import files
+from pathlib import Path
+
+try:
+    from flyradar.config import RadarSettings
+except ImportError:  # flyradar is an optional dependency of this snapshot.
+    RadarSettings = None
+
+#: Path of the flyradar version endpoint (whitelisted in the service middleware).
+VERSION_PATH = "/api/v1/version"
+
+#: RadarSettings fields that define scoring / dedup behaviour, captured verbatim.
+_SETTINGS_KEYS = (
+    "model",
+    "fallback_model",
+    "duplicity_similarity_threshold",
+    "rootcause_cost_weight",
+    "rootcause_frequency_weight",
+    "rootcause_actionability_weight",
+)
+
+
+def fetch_version(base_url: str, *, timeout: float = 10.0) -> dict:
+    """GET the flyradar version endpoint; return ``{}`` on any failure."""
+    url = base_url.rstrip("/") + VERSION_PATH
+    try:
+        with urllib.request.urlopen(url, timeout=timeout) as resp:
+            return json.loads(resp.read().decode("utf-8"))
+    except Exception:
+        return {}
+
+
+def load_radar_settings() -> dict | None:
+    """Dump the scoring / dedup RadarSettings, or ``None`` if flyradar isn't importable."""
+    if RadarSettings is None:
+        return None
+    settings = RadarSettings()
+    return {key: getattr(settings, key, None) for key in _SETTINGS_KEYS}
+
+
+def load_prompt_versions() -> dict | None:
+    """Read each stage prompt's ``version`` from the flyradar prompt catalog, or ``None``."""
+    try:
+        catalog = files("flyradar.resources.prompts")
+    except ModuleNotFoundError:
+        return None
+    versions: dict[str, str] = {}
+    for entry in catalog.iterdir():
+        if not entry.name.endswith(".yaml"):
+            continue
+        for line in entry.read_text(encoding="utf-8").splitlines():
+            if line.strip().startswith("version:"):
+                versions[entry.name[:-5]] = line.split(":", 1)[1].strip().strip('"')
+                break
+    return versions or None
+
+
+def build_run_config(
+    options: dict,
+    *,
+    version: dict,
+    settings: dict | None,
+    prompt_versions: dict | None,
+    commit: str | None = None,
+) -> dict:
+    """Assemble the snapshot dict; an explicit ``commit`` overrides ``version["commit"]``."""
+    return {
+        "captured_by": "config-snapshot (non-invasive)",
+        "flyradar_version": version.get("version"),
+        "flyradar_commit": commit or version.get("commit"),  # falsy commit -> endpoint value
+        "options": options,
+        "settings": settings,
+        "prompt_versions": prompt_versions,
+        "temperature": None,  # always null here, see ``_note``
+        "seed": None,  # always null here, see ``_note``
+        "_note": (
+            "Non-invasive snapshot captured at generation time. `options` is the request "
+            "the caller sent; `settings` and `prompt_versions` are read from flyradar when "
+            "importable at the deployed commit. `temperature` and `seed` are not exposed by "
+            "RadarSettings and are recorded as null. The durable fix is for flyradar to stamp "
+            "this config into DiscoveryResult (see the 'flyradar improvements' issue)."
+        ),
+    }
+
+
+def write_snapshot(output_dir: str | Path, config: dict) -> Path:
+    """Write ``experiment_configuration.json`` into the run's output directory."""
+    path = Path(output_dir) / "experiment_configuration.json"
+    path.write_text(json.dumps(config, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
+    return path
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(description="Capture the flyradar run configuration.")
+    parser.add_argument("--output-dir", required=True, help="Run directory holding output.json.")
+    parser.add_argument(
+        "--options", required=True, help="JSON file of the DiscoveryRequest options that were sent."
+    )
+    parser.add_argument(
+        "--base-url", default=None, help="flyradar base URL (default: $FLYRADAR_BASE_URL)."
+    )
+    parser.add_argument("--commit", default=None, help="Deployed flyradar git commit, if known.")
+    args = parser.parse_args(argv)
+
+    base_url = args.base_url or os.environ.get("FLYRADAR_BASE_URL", "")
+    options = json.loads(Path(args.options).read_text(encoding="utf-8"))
+    config = build_run_config(
+        options,
+        version=fetch_version(base_url) if base_url else {},
+        settings=load_radar_settings(),
+        prompt_versions=load_prompt_versions(),
+        commit=args.commit,
+    )
+    path = write_snapshot(args.output_dir, config)
+    print(f"Wrote {path}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

From 4ab1d859d16d4ae92d6a6d3a4a283a236d25d29d Mon Sep 17 00:00:00 2001
From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com>
Date: Thu, 18 Jun 2026 23:58:49 +0200
Subject: [PATCH 08/48] feat(lab): add retrieval metrics (hit@k, recall@k, MRR,
 MAP, nDCG) (#275)

* feat(lab): add retrieval_metrics module with compute_retrieval_metrics and RetrieverMetrics

* feat(lab): export RetrieverMetrics and compute_retrieval_metrics from lab package

* feat(evaluation): import RetrieverMetrics and compute_retrieval_metrics from lab.retrieval_metrics

---------

Co-authored-by: miguelgfierro <miguelgfierro@users.noreply.github.com>
---
 .../evaluation/__init__.py                    |   2 +-
 fireflyframework_agentic/lab/__init__.py      |   3 +
 .../lab/retrieval_metrics.py                  | 200 ++++++++++++++++++
 3 files changed, 204 insertions(+), 1 deletion(-)
 create mode 100644 fireflyframework_agentic/lab/retrieval_metrics.py

diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py
index 37093075..ad01980c 100644
--- a/fireflyframework_agentic/evaluation/__init__.py
+++ b/fireflyframework_agentic/evaluation/__init__.py
@@ -39,7 +39,7 @@
 from fireflyframework_agentic.evaluation.judge_client import JudgeClient, OllamaEmbedder, build_embedder, cosine
 from fireflyframework_agentic.evaluation.matcher import anchored, matches, source_stem, tokens
 from fireflyframework_agentic.evaluation.registry import Registry, RegistryItem, load_registry, registry_sha256
-from fireflyframework_agentic.evaluation.retrieval import RetrieverMetrics, compute_retrieval_metrics
+from fireflyframework_agentic.lab.retrieval_metrics import RetrieverMetrics, compute_retrieval_metrics
 from fireflyframework_agentic.evaluation.stats import aa_band, aggregate_grounding, left_skew_flag
 
 try:
diff --git a/fireflyframework_agentic/lab/__init__.py b/fireflyframework_agentic/lab/__init__.py
index 46cc08dc..8e127d8a 100644
--- a/fireflyframework_agentic/lab/__init__.py
+++ b/fireflyframework_agentic/lab/__init__.py
@@ -18,6 +18,7 @@
 from fireflyframework_agentic.lab.comparison import ComparisonEntry, ModelComparison
 from fireflyframework_agentic.lab.dataset import EvalCase, EvalDataset
 from fireflyframework_agentic.lab.evaluator import EvalOrchestrator, EvalReport, EvalResult
+from fireflyframework_agentic.lab.retrieval_metrics import RetrieverMetrics, compute_retrieval_metrics
 from fireflyframework_agentic.lab.session import LabSession, SessionEntry
 
 __all__ = [
@@ -31,5 +32,7 @@
     "EvalResult",
     "LabSession",
     "ModelComparison",
+    "RetrieverMetrics",
     "SessionEntry",
+    "compute_retrieval_metrics",
 ]
diff --git a/fireflyframework_agentic/lab/retrieval_metrics.py b/fireflyframework_agentic/lab/retrieval_metrics.py
new file mode 100644
index 00000000..5f3e2373
--- /dev/null
+++ b/fireflyframework_agentic/lab/retrieval_metrics.py
@@ -0,0 +1,200 @@
+# Copyright 2026 Firefly Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Deterministic IR evaluation metrics for ranked retrieval results (no LLM, no network).
+
+Industry-standard information-retrieval metrics computed over a ranked list of
+retrieved chunks vs the gold set each result carries (``gold`` + per-hit
+``is_gold``).  Metrics are reported at cut-offs k ∈ {1, 5, 10}:
+
+* **Hit@k** -- at least one gold document appears in the top-k results.
+* **Recall@k** -- fraction of gold documents found in top-k.
+* **Precision@k** -- fraction of top-k results that are gold.
+* **MRR@10** -- mean reciprocal rank of the first gold hit (up to k=10).
+* **MAP@10** -- mean average precision (up to k=10).
+* **nDCG@10** -- normalised discounted cumulative gain (up to k=10).
+
+Optional fields (populated when the raw result rows contain them):
+
+* ``no_answer_rate`` -- fraction of rows where the model produced no answer.
+* ``citation_precision`` -- precision of in-answer citations vs gold set.
+* ``mean_search_ms`` / ``mean_answer_ms`` -- mean retrieval and generation latencies.
+
+Ported from ``flycanon_experiments/scripts/deterministic_eval.py``.
+"""
+
+from __future__ import annotations
+
+import math
+
+from pydantic import BaseModel
+
+KS = (1, 5, 10)
+
+
+def _dedup(retrieved: list[dict]) -> list[dict]:
+    """Return one entry per source, first chunk wins, preserving rank order.
+
+    flycanon splits each ingested document into many chunks; a single gold
+    filing can therefore appear multiple times in the ranked list.  Without
+    deduplication nDCG/MAP/Recall count every chunk separately, inflating
+    scores past 1.0 when a good embedding model retrieves several chunks from
+    the same filing.  Taking only the first (highest-ranked) chunk per
+    source_id makes the list item-unique, matching the recommenders-library
+    contract that all IR formulae assume.
+    """
+    seen: set[str] = set()
+    out: list[dict] = []
+    for r in sorted(retrieved, key=lambda x: x["rank"]):
+        key = r.get("source_id") or "|".join(r.get("identities", []))
+        if key not in seen:
+            seen.add(key)
+            out.append(r)
+    return out
+
+
+def _ndcg(retrieved: list[dict], n_gold: int, k: int = 10) -> float:
+    """Return nDCG@k for a single query."""
+    dcg = sum(
+        1.0 / math.log2(r["rank"] + 1)
+        for r in retrieved
+        if r.get("is_gold") and r["rank"] <= k
+    )
+    ideal = sum(1.0 / math.log2(i + 2) for i in range(min(n_gold, k)))
+    return dcg / ideal if ideal else 0.0
+
+
+def _ap(retrieved: list[dict], n_gold: int, k: int = 10) -> float:
+    """Return average precision@k for a single query."""
+    hits, precisions = 0, []
+    for r in sorted(retrieved, key=lambda x: x["rank"]):
+        if r["rank"] > k:
+            break
+        if r.get("is_gold"):
+            hits += 1
+            precisions.append(hits / r["rank"])
+    return sum(precisions) / min(n_gold, k) if n_gold else 0.0
+
+
+def compute_retrieval_metrics(results: list[dict]) -> dict:
+    """Compute deterministic IR metrics over a list of retrieval result rows.
+
+    Each element of *results* must be a dict with at least:
+
+    * ``retrieved`` -- list of dicts with ``rank`` (int, 1-based), ``source_id``
+      (str) or ``identities`` (list[str]), and ``is_gold`` (bool).
+    * ``gold`` -- list of gold source identifiers (used to compute ``n_gold``).
+
+    Optional keys per row:
+
+    * ``no_answer`` (bool) / ``answer`` (str) -- used for ``no_answer_rate``.
+    * ``citations`` (list[dict]) -- each with ``is_gold`` (bool) for citation precision.
+    * ``search_ms`` (float) / ``answer_ms`` (float) -- latency in milliseconds.
+
+    Returns a flat dict with keys: ``n_queries``, ``hit@1``, ``hit@5``,
+    ``hit@10``, ``recall@1``, ``recall@5``, ``recall@10``, ``precision@1``,
+    ``precision@5``, ``precision@10``, ``mrr@10``, ``map@10``, ``ndcg@10``,
+    ``no_answer_rate``, ``citation_precision``, ``mean_search_ms``,
+    ``mean_answer_ms``.
+    """
+    n = len(results)
+    agg = {f"{m}@{k}": 0.0 for k in KS for m in ("hit", "recall", "precision")}
+    agg.update({"mrr@10": 0.0, "map@10": 0.0, "ndcg@10": 0.0})
+    no_answer = 0
+    cite_num = cite_den = 0.0
+    search_ms: list[float] = []
+    answer_ms: list[float] = []
+
+    for row in results:
+        retrieved = _dedup(row["retrieved"])
+        n_gold = max(len(set(row["gold"])), 1)
+        gold_ranks = [r["rank"] for r in retrieved if r.get("is_gold")]
+        for k in KS:
+            in_k = [g for g in gold_ranks if g <= k]
+            agg[f"hit@{k}"] += 1.0 if in_k else 0.0
+            agg[f"recall@{k}"] += len(in_k) / n_gold
+            agg[f"precision@{k}"] += len(in_k) / k
+        agg["mrr@10"] += (1.0 / min(gold_ranks)) if gold_ranks else 0.0
+        agg["map@10"] += _ap(retrieved, n_gold)
+        agg["ndcg@10"] += _ndcg(retrieved, n_gold)
+
+        if row.get("no_answer") or not row.get("answer", "").strip():
+            no_answer += 1
+        cites = row.get("citations", [])
+        if cites:
+            cite_num += sum(1 for c in cites if c.get("is_gold"))
+            cite_den += len(cites)
+        if row.get("search_ms") is not None:
+            search_ms.append(row["search_ms"])
+        if row.get("answer_ms") is not None:
+            answer_ms.append(row["answer_ms"])
+
+    out = {k: round(v / n, 4) for k, v in agg.items()} if n else {}
+    out["n_queries"] = n
+    out["no_answer_rate"] = round(no_answer / n, 4) if n else None
+    out["citation_precision"] = round(cite_num / cite_den, 4) if cite_den else None
+    out["mean_search_ms"] = round(sum(search_ms) / len(search_ms)) if search_ms else None
+    out["mean_answer_ms"] = round(sum(answer_ms) / len(answer_ms)) if answer_ms else None
+    return out
+
+
+class RetrieverMetrics(BaseModel):
+    """Typed container for the aggregate metrics of one retrieval evaluation run.
+
+    Field names are the keys of :func:`compute_retrieval_metrics` with ``@`` spelled
+    ``_at_`` (``hit@1`` -> ``hit_at_1``).  The four optional fields are ``None`` whenever
+    that function reports ``None`` for them (e.g. no citations or latencies in the rows).
+    """
+
+    n_queries: int = 0  # number of result rows scored
+    hit_at_1: float = 0.0
+    hit_at_5: float = 0.0
+    hit_at_10: float = 0.0
+    recall_at_1: float = 0.0
+    recall_at_5: float = 0.0
+    recall_at_10: float = 0.0
+    precision_at_1: float = 0.0
+    precision_at_5: float = 0.0
+    precision_at_10: float = 0.0
+    mrr_at_10: float = 0.0
+    map_at_10: float = 0.0
+    ndcg_at_10: float = 0.0
+    no_answer_rate: float | None = None
+    citation_precision: float | None = None
+    mean_search_ms: float | None = None  # mean of the rows' ``search_ms`` values
+    mean_answer_ms: float | None = None  # mean of the rows' ``answer_ms`` values
+
+    @classmethod
+    def from_results(cls, results: list[dict]) -> "RetrieverMetrics":
+        """Score *results* with :func:`compute_retrieval_metrics`; absent metric keys become 0.0."""
+        m = compute_retrieval_metrics(results)
+        return cls(
+            n_queries=m.get("n_queries", 0),
+            hit_at_1=m.get("hit@1", 0.0),
+            hit_at_5=m.get("hit@5", 0.0),
+            hit_at_10=m.get("hit@10", 0.0),
+            recall_at_1=m.get("recall@1", 0.0),
+            recall_at_5=m.get("recall@5", 0.0),
+            recall_at_10=m.get("recall@10", 0.0),
+            precision_at_1=m.get("precision@1", 0.0),
+            precision_at_5=m.get("precision@5", 0.0),
+            precision_at_10=m.get("precision@10", 0.0),
+            mrr_at_10=m.get("mrr@10", 0.0),
+            map_at_10=m.get("map@10", 0.0),
+            ndcg_at_10=m.get("ndcg@10", 0.0),
+            no_answer_rate=m.get("no_answer_rate"),
+            citation_precision=m.get("citation_precision"),
+            mean_search_ms=m.get("mean_search_ms"),
+            mean_answer_ms=m.get("mean_answer_ms"),
+        )

From 0acac370f601451015b3f98366717b57a7c9c401 Mon Sep 17 00:00:00 2001
From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 00:02:20 +0200
Subject: [PATCH 09/48] feat(examples): add flyradar and flycanon evaluation
 examples (#276)

* feat(evaluation): add flyradar gate evaluation example

* feat(evaluation): add flycanon RAG retrieval evaluation example

---------

Co-authored-by: miguelgfierro <miguelgfierro@users.noreply.github.com>
---
 examples/flycanon_eval_example.py | 379 ++++++++++++++++++++++++++++
 examples/flyradar_eval_example.py | 406 ++++++++++++++++++++++++++++++
 2 files changed, 785 insertions(+)
 create mode 100644 examples/flycanon_eval_example.py
 create mode 100644 examples/flyradar_eval_example.py

diff --git a/examples/flycanon_eval_example.py b/examples/flycanon_eval_example.py
new file mode 100644
index 00000000..9d8d071b
--- /dev/null
+++ b/examples/flycanon_eval_example.py
@@ -0,0 +1,379 @@
+# Copyright 2026 Firefly Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""FlyCanon evaluation example — RAG retrieval benchmark with champion/challenger tracking.
+
+Demonstrates how to use ``fireflyframework_agentic.evaluation`` to replicate
+the flycanon experiment evaluation workflow:
+
+1. Load a results JSONL file produced by a flycanon retrieval pipeline.
+2. Compute deterministic IR metrics (Recall@k, Precision@k, MRR, nDCG, MAP).
+3. Compare against a saved baseline to detect regression.
+4. Print a formatted metrics table.
+5. Offer to promote the new run to champion when it beats the baseline.
+
+The champion/challenger pattern mirrors the flycanon_experiments harness:
+each run writes metrics to a file; ``approve`` promotes it by repointing
+baseline.json.  Here we replicate that flow using the framework's
+``compute_retrieval_metrics`` / ``RetrieverMetrics`` API directly.
+
+Usage::
+
+    # Score a results file (no baseline comparison)
+    python examples/flycanon_eval_example.py --results-file results.jsonl
+
+    # Compare against a saved baseline
+    python examples/flycanon_eval_example.py \\
+        --results-file results.jsonl \\
+        --baseline baseline.json
+
+    # Promote if better (write new champion to baseline.json)
+    python examples/flycanon_eval_example.py \\
+        --results-file results.jsonl \\
+        --baseline baseline.json \\
+        --promote-if-better
+
+Exit codes: 0 = scored successfully, 1 = regression detected vs baseline.
+
+Results JSONL format
+--------------------
+Each line is a JSON object representing one query's retrieval result::
+
+    {
+        "question": "What was Apple's revenue in Q4 2023?",
+        "gold": ["AAPL_10K_2023", "AAPL_10Q_Q4_2023"],
+        "retrieved": [
+            {"rank": 1, "source_id": "AAPL_10K_2023",  "is_gold": true},
+            {"rank": 2, "source_id": "MSFT_10K_2023",  "is_gold": false},
+            {"rank": 3, "source_id": "AAPL_10Q_Q4_2023", "is_gold": true}
+        ],
+        "answer": "Apple's revenue in Q4 2023 was $89.5 billion.",
+        "no_answer": false,
+        "citations": [
+            {"source_id": "AAPL_10K_2023", "is_gold": true}
+        ],
+        "search_ms": 142,
+        "answer_ms": 2310
+    }
+
+The ``gold`` list contains the source IDs that are considered correct answers.
+Each entry in ``retrieved`` must have a 1-based ``rank``, ``source_id`` (or
+``identities`` list), and ``is_gold`` bool.
+
+Baseline JSON format
+--------------------
+A flat JSON object with metric names as keys and float values::
+
+    {
+        "ndcg@10": 0.7234,
+        "mrr@10": 0.6891,
+        "recall@10": 0.8120,
+        "hit@10": 0.9100,
+        "map@10": 0.6543,
+        "n_queries": 200
+    }
+
+This is the same format written by ``--promote-if-better``.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+from fireflyframework_agentic.evaluation import RetrieverMetrics, compute_retrieval_metrics
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+# Metrics that form the primary quality signal for champion/challenger
+# comparisons.  These are listed in priority order: nDCG@10 is the primary
+# ranking metric; MRR@10 measures how quickly the first gold result appears;
+# Recall@10 measures overall coverage; Hit@10 measures binary success rate;
+# MAP@10 measures precision across the ranked list.
+PRIMARY_METRICS = ["ndcg@10", "mrr@10", "recall@10", "hit@10", "map@10"]
+
+# Regression threshold: a metric must drop by more than this fraction of its
+# baseline value to be flagged as a regression (guards against noise).
+REGRESSION_THRESHOLD = 0.01
+
+
+def _load_jsonl(path: str) -> list[dict]:
+    """Load a newline-delimited JSON file, one object per line."""
+    lines = Path(path).read_text(encoding="utf-8").strip().splitlines()
+    return [json.loads(line) for line in lines if line.strip()]
+
+
+def _load_baseline(path: str) -> dict | None:
+    """Load a baseline JSON file, returning None if it does not exist."""
+    p = Path(path)
+    if not p.exists():
+        return None
+    return json.loads(p.read_text(encoding="utf-8"))
+
+
+def _save_baseline(path: str, metrics: dict) -> None:
+    """Write a flat metrics dict to the baseline JSON file."""
+    Path(path).write_text(json.dumps(metrics, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
+
+
+def _metrics_to_flat(m: RetrieverMetrics) -> dict:
+    """Convert a RetrieverMetrics model to the flat dict stored in baseline.json."""
+    return {
+        "n_queries": m.n_queries,
+        "hit@1": m.hit_at_1,
+        "hit@5": m.hit_at_5,
+        "hit@10": m.hit_at_10,
+        "recall@1": m.recall_at_1,
+        "recall@5": m.recall_at_5,
+        "recall@10": m.recall_at_10,
+        "precision@1": m.precision_at_1,
+        "precision@5": m.precision_at_5,
+        "precision@10": m.precision_at_10,
+        "mrr@10": m.mrr_at_10,
+        "map@10": m.map_at_10,
+        "ndcg@10": m.ndcg_at_10,
+        "no_answer_rate": m.no_answer_rate,
+        "citation_precision": m.citation_precision,
+        "mean_search_ms": m.mean_search_ms,
+        "mean_answer_ms": m.mean_answer_ms,
+    }
+
+
+def _print_metrics_table(metrics: RetrieverMetrics, baseline: dict | None) -> None:
+    """Print a formatted table comparing current metrics vs baseline."""
+    flat = _metrics_to_flat(metrics)
+
+    col_w = 22
+    num_w = 10
+    header = f"{'Metric':<{col_w}} {'Current':>{num_w}}"
+    if baseline:
+        header += f" {'Baseline':>{num_w}} {'Delta':>{num_w}}"
+    print(header)
+    print("-" * (col_w + num_w + (num_w * 2 + 2 if baseline else 0)))
+
+    for key, value in flat.items():
+        if value is None:
+            continue
+        # Format floats as 4 decimal places; ints as plain integers.
+        if isinstance(value, float):
+            cur_str = f"{value:.4f}"
+        else:
+            cur_str = str(value)
+
+        row = f"{key:<{col_w}} {cur_str:>{num_w}}"
+        if baseline and key in baseline and isinstance(value, float):
+            base_val = baseline[key]
+            delta = value - base_val
+            delta_str = f"{delta:+.4f}"
+            row += f" {base_val:>{num_w}.4f} {delta_str:>{num_w}}"
+        print(row)
+
+    print()
+
+
+def _detect_regressions(flat: dict, baseline: dict) -> list[str]:
+    """Return the names of primary metrics that regressed vs baseline.
+
+    A regression is flagged when the new value drops by more than
+    REGRESSION_THRESHOLD * baseline_value (relative threshold).  This
+    guards against flagging noise as a regression.
+    """
+    regressions = []
+    for key in PRIMARY_METRICS:
+        new_val = flat.get(key)
+        base_val = baseline.get(key)
+        if new_val is None or base_val is None:
+            continue
+        if base_val > 0 and (base_val - new_val) / base_val > REGRESSION_THRESHOLD:
+            regressions.append(key)
+    return regressions
+
+
+def _beats_baseline(flat: dict, baseline: dict) -> bool:
+    """Return True if the new metrics are better than or equal to the baseline.
+
+    'Better' means no primary metric has regressed beyond REGRESSION_THRESHOLD
+    AND at least one primary metric has improved.
+    """
+    regressions = _detect_regressions(flat, baseline)
+    if regressions:
+        return False
+    # Check for at least one improvement.
+    for key in PRIMARY_METRICS:
+        new_val = flat.get(key)
+        base_val = baseline.get(key)
+        if new_val is not None and base_val is not None and new_val > base_val:
+            return True
+    return False
+
+
+# ---------------------------------------------------------------------------
+# Main evaluation flow
+# ---------------------------------------------------------------------------
+
+
+def run_evaluation(args: argparse.Namespace) -> int:
+    """Run retrieval metric scoring and optional champion/challenger comparison."""
+
+    # ------------------------------------------------------------------
+    # Step 1 — Load results from the JSONL file.
+    #
+    # Each line is one query's retrieval result.  The file is produced by
+    # a flycanon pipeline run (runner.run_queries writes results.jsonl).
+    # ------------------------------------------------------------------
+    print(f"Loading results  : {args.results_file}")
+    results = _load_jsonl(args.results_file)
+    print(f"  {len(results)} query results loaded.")
+
+    if not results:
+        print("ERROR: results file is empty.", file=sys.stderr)
+        return 1
+
+    # ------------------------------------------------------------------
+    # Step 2 — Compute deterministic IR metrics.
+    #
+    # compute_retrieval_metrics() returns a flat dict of standard IR metrics.
+    # RetrieverMetrics.from_results() wraps that into a typed Pydantic model
+    # for convenient attribute access.
+    #
+    # Metrics are computed at cut-offs k ∈ {1, 5, 10} and include:
+    #   hit@k       -- at least one gold doc in top-k (binary)
+    #   recall@k    -- fraction of gold docs found in top-k
+    #   precision@k -- fraction of top-k that are gold
+    #   mrr@10      -- mean reciprocal rank of first gold hit
+    #   map@10      -- mean average precision
+    #   ndcg@10     -- normalised discounted cumulative gain
+    # ------------------------------------------------------------------
+    print("\nComputing retrieval metrics ...")
+    metrics = RetrieverMetrics.from_results(results)
+
+    print(f"  nDCG@10    : {metrics.ndcg_at_10:.4f}")
+    print(f"  MRR@10     : {metrics.mrr_at_10:.4f}")
+    print(f"  Recall@10  : {metrics.recall_at_10:.4f}")
+    print(f"  Hit@10     : {metrics.hit_at_10:.4f}")
+    print(f"  MAP@10     : {metrics.map_at_10:.4f}")
+
+    # ------------------------------------------------------------------
+    # Step 3 — Load the baseline (champion) for regression detection.
+    # ------------------------------------------------------------------
+    baseline = None
+    if args.baseline:
+        baseline = _load_baseline(args.baseline)
+        if baseline:
+            print(f"\nLoaded baseline  : {args.baseline}")
+        else:
+            print(f"\nNo baseline found at {args.baseline} — first run, no comparison.")
+
+    # ------------------------------------------------------------------
+    # Step 4 — Print the full metrics table.
+    # ------------------------------------------------------------------
+    print("\n" + "=" * 56)
+    print("Retrieval Metrics")
+    print("=" * 56)
+    _print_metrics_table(metrics, baseline)
+
+    # ------------------------------------------------------------------
+    # Step 5 — Regression check.
+    #
+    # Compare against the baseline on primary metrics.  Regressions block
+    # promotion (exit code 1) unless --promote-if-better is set and the
+    # run actually improved overall.
+    # ------------------------------------------------------------------
+    flat = _metrics_to_flat(metrics)
+
+    if baseline:
+        regressions = _detect_regressions(flat, baseline)
+        if regressions:
+            print(f"REGRESSION detected on: {', '.join(regressions)}")
+            print(f"  Threshold: {REGRESSION_THRESHOLD * 100:.0f}% relative drop on any primary metric.")
+        else:
+            better = _beats_baseline(flat, baseline)
+            if better:
+                print("Challenger BEATS baseline on at least one primary metric.")
+            else:
+                print("Challenger is on-par with baseline (no regression, no improvement).")
+
+        if regressions and not args.promote_if_better:
+            print("\nVerdict: HOLD — regression detected.  Tune the pipeline and re-run.")
+            return 1
+
+    # ------------------------------------------------------------------
+    # Step 6 — Champion promotion.
+    #
+    # When --promote-if-better is set and the metrics beat (or equal) the
+    # baseline, save the new metrics as the champion.  Future runs will
+    # compare against this updated record.
+    # ------------------------------------------------------------------
+    if args.promote_if_better and args.baseline:
+        if baseline is None or _beats_baseline(flat, baseline):
+            _save_baseline(args.baseline, flat)
+            print(f"\nChampion PROMOTED — metrics saved to {args.baseline}")
+        else:
+            print("\nNot promoted — challenger did not beat baseline on primary metrics.")
+
+    print("\nVerdict: PROMOTE" if not (baseline and _detect_regressions(flat, baseline)) else "\nVerdict: HOLD")
+    return 0
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def build_parser() -> argparse.ArgumentParser:
+    p = argparse.ArgumentParser(
+        prog="flycanon_eval_example",
+        description=(
+            "FlyCanon RAG retrieval benchmark — computes IR metrics from a results JSONL "
+            "and compares against a champion baseline."
+        ),
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    p.add_argument(
+        "--results-file",
+        required=True,
+        help="Path to results.jsonl produced by the flycanon pipeline.",
+    )
+    p.add_argument(
+        "--baseline",
+        default=None,
+        help=(
+            "Path to baseline.json (champion store).  When absent, scores are printed "
+            "without comparison."
+        ),
+    )
+    p.add_argument(
+        "--promote-if-better",
+        action="store_true",
+        help=(
+            "When set, write new metrics to baseline.json if the challenger beats the "
+            "champion on primary metrics.  Has no effect when --baseline is omitted."
+        ),
+    )
+    return p
+
+
+def main() -> None:
+    parser = build_parser()
+    args = parser.parse_args()
+    sys.exit(run_evaluation(args))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/flyradar_eval_example.py b/examples/flyradar_eval_example.py
new file mode 100644
index 00000000..706528f4
--- /dev/null
+++ b/examples/flyradar_eval_example.py
@@ -0,0 +1,406 @@
+# Copyright 2026 Firefly Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""FlyRadar evaluation example — gate-based process-mining quality gate.
+
+Demonstrates how to use ``fireflyframework_agentic.evaluation`` to replicate
+the flyradar experiment quality-gate workflow:
+
+1. Load a must-find registry (the gold standard items the model must discover).
+2. Load a DiscoveryResult produced by a flyradar pipeline run.
+3. Run gates G1-G5 to produce a structured verdict:
+     G1 -- Structural & Safe (schema validity, PII, empty-registry guard).
+     G2 -- Recall & Precision (must-find recall floor, NC precision).
+     G3 -- Grounded (finding-to-evidence anchoring).
+     G4 -- LLM-as-a-Judge (advisory only; never blocks promotion).
+     G5 -- No-regression / promotion (champion/challenger comparison).
+4. Render a human-readable scorecard and print the final verdict.
+5. Promote the challenger to champion when the verdict is PROMOTE and ``--baseline`` is set.
+
+Usage::
+
+    # Minimal: deterministic gates only (no G4 judge, no baseline)
+    python examples/flyradar_eval_example.py \\
+        --result output.json \\
+        --registry registry.json
+
+    # With corpus verification and a champion baseline
+    python examples/flyradar_eval_example.py \\
+        --result output.json \\
+        --registry registry.json \\
+        --baseline baseline.json \\
+        --corpus input.json
+
+    # With the advisory G4 LLM judge (requires API key in environment)
+    FLYEVAL_JUDGE_MODEL=anthropic:claude-sonnet-4-6 \\
+    python examples/flyradar_eval_example.py \\
+        --result output.json \\
+        --registry registry.json \\
+        --judge-model anthropic:claude-sonnet-4-6
+
+Exit codes: 0 = PROMOTE, 1 = HOLD.
+
+Input file formats
+------------------
+``--result`` (output.json)
+    A DiscoveryResult JSON produced by a flyradar pipeline run.  Must contain
+    at minimum ``findings`` (list) and ``evidence_index`` (list).
+
+``--registry`` (registry.json)
+    A lean-1 registry JSON.  Each item has ``id``, ``tier`` (L0-L3), ``title``,
+    ``description``, and ``nc`` (bool, True for negative controls).
+
+``--baseline`` (baseline.json)
+    A ChampionRecord JSON written by a previous PROMOTE run.  When omitted the
+    gate runs in day-zero mode (G5 still needs sign-offs; no champion is saved).
+
+``--corpus`` (input.json)
+    The corpus bundle used during the run.  When supplied, G3 verifies that cited
+    evidence excerpts actually appear in the corpus documents.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+from fireflyframework_agentic.evaluation import (
+    ChampionRecord,
+    GateResult,
+    build_embedder,
+    load_champion,
+    load_corpus,
+    load_registry,
+    render_scorecard,
+    run_gates,
+    run_judge,
+    save_champion,
+    verdict,
+    VERDICT_PROMOTE,
+)
+from fireflyframework_agentic.evaluation.models import EvalConfig
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _load_json(path: str) -> dict:
+    """Decode the UTF-8 JSON file at *path* and return the parsed top-level value."""
+    return json.loads(Path(path).read_bytes().decode("utf-8"))
+
+
+def _lexical_missed_ids(result: dict, registry) -> list[str]:
+    """List IDs of non-L3 registry items that no finding matches lexically.
+
+    ``run_evaluation`` hands this list to the G4 judge as ``lexical_missed_ids``.
+    """
+    from fireflyframework_agentic.evaluation.matcher import matches
+
+    by_id = {}
+    for entry in result.get("evidence_index", []):
+        if entry.get("id"):  # entries with a missing or falsy id are not indexed
+            by_id[entry["id"]] = entry
+    findings = result.get("findings", [])
+    missed: list[str] = []
+    for target in registry.real_items:
+        if target.tier != "L3" and not any(matches(f, target, by_id) for f in findings):
+            missed.append(target.id)
+    return missed
+
+
+# ---------------------------------------------------------------------------
+# Main evaluation flow
+# ---------------------------------------------------------------------------
+
+
+def run_evaluation(args: argparse.Namespace) -> int:
+    """Run the flyradar gate evaluation; return exit code 0 for PROMOTE, 1 for HOLD."""
+
+    # ------------------------------------------------------------------
+    # Step 1 — Load inputs.
+    # ------------------------------------------------------------------
+    print(f"Loading result   : {args.result}")
+    result = _load_json(args.result)
+
+    print(f"Loading registry : {args.registry}")
+    registry = load_registry(args.registry)
+    print(f"  {len(registry.real_items)} real items, {len(registry.nc_items)} NC items")
+
+    # The EvalConfig captures provenance for the run record.
+    config = EvalConfig(
+        model_id=args.model_id,
+        corpus=registry.corpus,
+        run_id=args.run_id,
+        registry_path=args.registry,
+        corpus_path=args.corpus or "",
+        baseline_path=args.baseline or "",
+        judge_model=args.judge_model or "",
+    )
+
+    # Optional: corpus bundle for deterministic evidence verification (G3).
+    corpus = None
+    if args.corpus:
+        print(f"Loading corpus   : {args.corpus}")
+        corpus = load_corpus(args.corpus)
+
+    # Optional: champion record for regression detection (G5).
+    champion = None
+    champion_scores = None
+    aa_noise = None
+    if args.baseline:
+        print(f"Loading baseline : {args.baseline}")
+        champion = load_champion(args.baseline)
+        if champion:
+            champion_scores = champion.scores
+            aa_noise = champion.aa_noise
+            print(f"  Champion run   : {champion.run_id} ({champion.model_id})")
+        else:
+            print("  No champion found — running in day-zero mode.")
+
+    # Optional: embedder for semantic/hybrid recall (G2).
+    embed_fn = None
+    if args.embedder:
+        print(f"Building embedder: {args.embedder}")
+        embed_fn = build_embedder(args.embedder)
+
+    print()
+
+    # ------------------------------------------------------------------
+    # Step 2 — Run deterministic gates G1-G3 + G5.
+    #
+    # run_gates() returns a list of GateResult objects, one per gate.
+    # Each GateResult carries:
+    #   .gate   -- "G1" | "G2" | "G3" | "G5"
+    #   .passed -- bool
+    #   .details -- dict with per-metric values
+    #   .errors  -- list[str] of blocking error codes (NOTE(review): confirm field name)
+    # ------------------------------------------------------------------
+    print("Running gates G1-G3 + G5 ...")
+    gate_results: list[GateResult] = run_gates(
+        result,
+        registry,
+        args.registry,
+        pii_list=args.pii_list or [],
+        recall_floor=args.recall_floor,
+        grounding_floor=args.grounding_floor,
+        champion_scores=champion_scores,
+        aa_noise=aa_noise,
+        is_day_zero=(champion is None),
+        human_signed_off=args.human_signed_off,
+        signoff_count=args.signoffs,
+        embed_fn=embed_fn,
+        tau=args.tau,
+        recall_metric=args.recall_metric,
+        tau_nc=args.tau_nc,
+        corpus=corpus,
+    )
+
+    # Quick gate summary before the full scorecard.
+    for gr in gate_results:
+        status = "PASS" if gr.passed else "FAIL"
+        print(f"  {gr.gate}: {status}")
+
+    # ------------------------------------------------------------------
+    # Step 3 — Run the advisory G4 LLM-as-a-Judge (optional).
+    #
+    # G4 is non-blocking: it never changes the verdict or exit code.
+    # It produces an AdvisoryReport with per-finding quality signals
+    # (faithfulness, citation relevance, fabricated entities, etc.).
+    # ------------------------------------------------------------------
+    advisory = None
+    if args.judge_model:
+        print(f"\nRunning G4 judge ({args.judge_model}) ...")
+        missed_ids = _lexical_missed_ids(result, registry)
+        advisory = run_judge(
+            result,
+            registry,
+            judge_model=args.judge_model,
+            runs=args.judge_runs,
+            concurrency=args.judge_concurrency,
+            pipeline_model=args.model_id,
+            embed_fn=embed_fn,
+            tau=args.tau,
+            lexical_missed_ids=missed_ids,
+        )
+        print(f"  Judge completed ({args.judge_runs} run(s)).")
+    else:
+        print("\nG4 judge skipped (pass --judge-model to enable).")
+
+    # ------------------------------------------------------------------
+    # Step 4 — Render the scorecard.
+    #
+    # render_scorecard() produces a markdown-formatted human-readable
+    # report that mirrors the output of `flyeval gate` in the playground.
+    # ------------------------------------------------------------------
+    print()
+    scorecard = render_scorecard(
+        gate_results,
+        corpus=registry.corpus,
+        model_id=config.model_id,
+        run_id=config.run_id,
+        is_self_graded=True,
+        kappa_advisory=registry.is_kappa_advisory(),
+        evidence_unverified=(corpus is None),
+        advisory=advisory,
+    )
+    print(scorecard)
+
+    # ------------------------------------------------------------------
+    # Step 5 — Inspect the verdict and handle promotion.
+    #
+    # verdict() returns "PROMOTE" or "HOLD" based on the gate results.
+    # On PROMOTE with --baseline set, the challenger is saved as the new
+    # champion so future runs can detect regressions against it.
+    # ------------------------------------------------------------------
+    v = verdict(gate_results)
+    print(f"\nFinal verdict: {v}")
+
+    if v == VERDICT_PROMOTE and args.baseline:
+        # Store recall (G2) and grounding (G3). NOTE(review): aa_noise is not carried over.
+        g2 = next((g for g in gate_results if g.gate == "G2"), None)
+        g3 = next((g for g in gate_results if g.gate == "G3"), None)
+        scores: dict[str, float] = {}
+        if g2:
+            scores["recall"] = g2.details.get("recall", 0.0)
+        if g3:
+            scores["grounding_pct"] = g3.details.get("grounding_pct", 0.0)
+
+        new_champion = ChampionRecord(
+            corpus=registry.corpus,
+            run_id=config.run_id,
+            model_id=config.model_id,
+            registry_sha256=registry.sha256(),
+            scores=scores,
+            is_day_zero=(champion is None),
+        )
+        save_champion(
+            args.baseline,
+            new_champion,
+            summary=f"Promoted by flyradar_eval_example.py — {config.run_id}",
+        )
+        print(f"Champion saved to {args.baseline}")
+
+    # Exit 0 = PROMOTE, 1 = HOLD (mirrors `flyeval gate` convention).
+    return 0 if v == VERDICT_PROMOTE else 1
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Build the command-line parser for the FlyRadar gate evaluation example.
+
+    ``--result`` and ``--registry`` are the only required options.  ``--baseline``
+    and ``--corpus`` declare no default, so they are ``None`` when not passed.
+    """
+    parser = argparse.ArgumentParser(
+        prog="flyradar_eval_example",
+        description="FlyRadar gate evaluation — replicates the flyeval gate workflow.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    add = parser.add_argument  # short alias for the registrations below
+
+    # Required input files.
+    add("--result", required=True, help="Path to DiscoveryResult JSON.")
+    add("--registry", required=True, help="Path to lean-1 registry JSON.")
+
+    # Optional input files.
+    add(
+        "--baseline",
+        help="Path to baseline.json (champion store).  When absent, runs in day-zero mode.",
+    )
+    add(
+        "--corpus",
+        help="Path to input.json corpus bundle for deterministic evidence verification (G3).",
+    )
+
+    # Identifiers printed on the scorecard.
+    add("--model-id", default="unknown", help="Model identifier for the scorecard.")
+    add("--run-id", default="example-run", help="Run identifier for the scorecard.")
+
+    # Gate floors and recall settings.
+    add("--recall-floor", type=float, default=0.70, help="Minimum recall required for G2 to pass.")
+    add(
+        "--grounding-floor",
+        type=float,
+        default=0.90,
+        help="Minimum grounding percentage required for G3 to pass.",
+    )
+    add(
+        "--recall-metric",
+        choices=["lexical", "semantic", "hybrid"],
+        default="lexical",
+        help="Recall metric used by G2.  'semantic' and 'hybrid' require --embedder.",
+    )
+    add(
+        "--tau",
+        type=float,
+        default=0.70,
+        help="Cosine similarity threshold for semantic recall (real items).",
+    )
+    add(
+        "--tau-nc",
+        type=float,
+        default=0.85,
+        help="Cosine similarity threshold for NC item detection.",
+    )
+
+    # PII tokens and human sign-off inputs.
+    add("--pii-list", nargs="*", default=[], help="PII tokens to check for in findings.")
+    add("--human-signed-off", action="store_true", help="Mark this run as human-reviewed.")
+    add("--signoffs", type=int, default=0, help="Number of human sign-offs collected.")
+
+    # Advisory G4 judge.
+    judge_model_help = (
+        "Provider:model string for the advisory G4 LLM judge "
+        "(e.g. 'anthropic:claude-sonnet-4-6').  Omit to skip G4."
+    )
+    add("--judge-model", default=None, help=judge_model_help)
+    add(
+        "--judge-runs",
+        type=int,
+        default=1,
+        help="Number of judge calls to aggregate (odd number recommended for median).",
+    )
+    add(
+        "--judge-concurrency",
+        type=int,
+        default=1,
+        help="Thread fan-out for per-item G4 metrics (1 = sequential).",
+    )
+
+    # Embedder spec for semantic/hybrid recall.
+    add(
+        "--embedder",
+        default=None,
+        help="Embedder spec for semantic/hybrid recall (e.g. 'ollama:bge-m3').",
+    )
+
+    return parser
+
+
+def main() -> None:
+    """Parse the command line, run the gate evaluation, and exit with its return code."""
+    cli_args = build_parser().parse_args()
+    sys.exit(run_evaluation(cli_args))
+
+
+if __name__ == "__main__":
+    main()

From cc048cf187371d99927072d03dd3016c8e765777 Mon Sep 17 00:00:00 2001
From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 00:08:13 +0200
Subject: [PATCH 10/48] test(evaluation): add unit tests for evaluation package
 and retrieval metrics (#277)

* feat(evaluation): add tests/unit/evaluation package init

* feat(evaluation): add unit tests for matcher (anchored, source_stem, tokens, matches)

* feat(evaluation): add unit tests for stats (aa_band, aggregate_grounding, left_skew_flag)

* feat(evaluation): add unit tests for gates (GateResult, verdict, render_scorecard, g5_no_regression)

* feat(evaluation): add unit tests for champion (ChampionRecord, load/save/invalidate, input_hash)

* feat(evaluation): add unit tests for retrieval_metrics (compute_retrieval_metrics, RetrieverMetrics)

* feat(evaluation): fix boundary test for left_skew_flag (floating-point precision)

* feat(evaluation): fix no_answer_rate test to match implementation behaviour

---------

Co-authored-by: miguelgfierro <miguelgfierro@users.noreply.github.com>
---
 tests/unit/evaluation/__init__.py        |   0
 tests/unit/evaluation/test_champion.py   | 199 ++++++++++++++++++
 tests/unit/evaluation/test_gates.py      | 219 ++++++++++++++++++++
 tests/unit/evaluation/test_matcher.py    | 221 ++++++++++++++++++++
 tests/unit/evaluation/test_stats.py      | 183 +++++++++++++++++
 tests/unit/lab/test_retrieval_metrics.py | 247 +++++++++++++++++++++++
 6 files changed, 1069 insertions(+)
 create mode 100644 tests/unit/evaluation/__init__.py
 create mode 100644 tests/unit/evaluation/test_champion.py
 create mode 100644 tests/unit/evaluation/test_gates.py
 create mode 100644 tests/unit/evaluation/test_matcher.py
 create mode 100644 tests/unit/evaluation/test_stats.py
 create mode 100644 tests/unit/lab/test_retrieval_metrics.py

diff --git a/tests/unit/evaluation/__init__.py b/tests/unit/evaluation/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/unit/evaluation/test_champion.py b/tests/unit/evaluation/test_champion.py
new file mode 100644
index 00000000..948a9639
--- /dev/null
+++ b/tests/unit/evaluation/test_champion.py
@@ -0,0 +1,199 @@
+# Copyright 2026 Firefly Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for evaluation.champion: ChampionRecord, load/save/invalidate_champion, input_hash."""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+
+from fireflyframework_agentic.evaluation.champion import (
+    ChampionRecord,
+    input_hash,
+    invalidate_champion,
+    load_champion,
+    save_champion,
+)
+
+
+def _make_champion(**overrides) -> ChampionRecord:
+    """Build a ChampionRecord from fixed test defaults, with ``overrides`` applied on top."""
+    base_fields = {
+        "corpus": "test-corpus",
+        "run_id": "run-2026-01",
+        "model_id": "claude-sonnet-4-5",
+        "registry_sha256": "abc123",
+        "scores": {"recall": 0.85, "grounding_pct": 0.92},
+        "aa_noise": {"recall": 0.02},
+        "is_day_zero": False,
+        "human_sign_offs": ["reviewer-1"],
+    }
+    return ChampionRecord(**{**base_fields, **overrides})
+
+
+# ── load_champion ─────────────────────────────────────────────────────────────
+
+
+def test_load_champion_nonexistent_file_returns_none(tmp_path):
+    """Loading from a path that does not exist returns None."""
+    assert load_champion(tmp_path / "baseline.json") is None
+
+
+def test_load_champion_file_with_null_champion_returns_none(tmp_path):
+    """A baseline file whose ``champion`` entry is null loads as None."""
+    (tmp_path / "baseline.json").write_text(json.dumps({"champion": None, "promotion_log": []}), encoding="utf-8")
+    assert load_champion(tmp_path / "baseline.json") is None
+
+
+# ── save_champion / load_champion round-trip ──────────────────────────────────
+
+
+def test_save_then_load_round_trips_all_fields(tmp_path):
+    """Every ChampionRecord field checked below survives a save/load cycle."""
+    store = tmp_path / "baseline.json"
+    original = _make_champion()
+    save_champion(store, original, summary="initial champion", date="2026-01-01")
+
+    restored = load_champion(store)
+    assert restored is not None
+    # Compare the stored fields one by one.
+    field_names = (
+        "corpus", "run_id", "model_id", "registry_sha256",
+        "scores", "aa_noise", "is_day_zero", "human_sign_offs",
+    )
+    for name in field_names:
+        assert getattr(restored, name) == getattr(original, name)
+
+
+def test_save_champion_appends_promotion_log_entry(tmp_path):
+    """Two consecutive saves leave two log entries, the second linked to the first."""
+    store = tmp_path / "baseline.json"
+    save_champion(store, _make_champion(), summary="first", date="2026-01-01")
+
+    successor = _make_champion(run_id="run-2026-02", scores={"recall": 0.90})
+    save_champion(store, successor, summary="second", date="2026-02-01")
+
+    entries = json.loads(store.read_text(encoding="utf-8"))["promotion_log"]
+    assert len(entries) == 2
+    first_entry, second_entry = entries
+    assert first_entry["to"] == "run-2026-01"
+    assert second_entry["to"] == "run-2026-02"
+    assert second_entry["from"] == "run-2026-01"
+
+
+def test_save_champion_creates_file_when_missing(tmp_path):
+    target = tmp_path / "baseline.json"  # nothing has been written here yet
+    assert target.exists() is False
+    save_champion(target, _make_champion())
+    assert target.exists() is True
+
+
+def test_save_champion_day_zero_flag_preserved(tmp_path):
+    """The is_day_zero flag is still True after a save/load cycle."""
+    store = tmp_path / "baseline.json"
+    save_champion(store, _make_champion(is_day_zero=True))
+    restored = load_champion(store)
+    assert restored.is_day_zero is True
+
+
+def test_save_champion_label_is_day_zero_when_flag_set(tmp_path):
+    """A day-zero champion is logged with the ``day-zero`` label."""
+    store = tmp_path / "baseline.json"
+    save_champion(store, _make_champion(is_day_zero=True))
+    first_entry = json.loads(store.read_text(encoding="utf-8"))["promotion_log"][0]
+    assert first_entry["label"] == "day-zero"
+
+
+def test_save_champion_label_is_promotion_when_flag_not_set(tmp_path):
+    """A champion that is not day-zero is logged with the ``promotion`` label."""
+    store = tmp_path / "baseline.json"
+    save_champion(store, _make_champion(is_day_zero=False))
+    assert json.loads(store.read_text(encoding="utf-8"))["promotion_log"][0]["label"] == "promotion"
+
+
+# ── invalidate_champion ───────────────────────────────────────────────────────
+
+
+def test_invalidate_champion_sets_champion_to_null(tmp_path):
+    """After invalidation the champion is gone from the loader and the raw file."""
+    store = tmp_path / "baseline.json"
+    save_champion(store, _make_champion())
+    invalidate_champion(store, reason="EMPTY_MUST_FIND fake champion", date="2026-03-01")
+
+    assert load_champion(store) is None
+
+    on_disk = json.loads(store.read_text(encoding="utf-8"))
+    assert on_disk["champion"] is None
+
+
+def test_invalidate_champion_appends_invalidation_log(tmp_path):
+    """Invalidation appends an INVALIDATED entry carrying the reason and a null target."""
+    store = tmp_path / "baseline.json"
+    save_champion(store, _make_champion(), date="2026-01-01")
+    invalidate_champion(store, reason="fake champion", date="2026-03-01")
+
+    last_entry = json.loads(store.read_text(encoding="utf-8"))["promotion_log"][-1]
+    assert last_entry["label"] == "INVALIDATED"
+    assert "fake champion" in last_entry["summary"]
+    assert last_entry["to"] is None
+
+
+def test_invalidate_champion_noop_when_file_missing(tmp_path):
+    """Invalidating a baseline path that does not exist must not raise."""
+    invalidate_champion(tmp_path / "no-file.json", reason="test")
+
+
+# ── ChampionRecord helpers ────────────────────────────────────────────────────
+
+
+def test_primary_metric_returns_first_key():
+    """primary_metric() names the first key of the scores mapping."""
+    assert _make_champion(scores={"recall": 0.85, "grounding_pct": 0.92}).primary_metric() == "recall"
+
+
+def test_primary_score_returns_first_value():
+    """primary_score() gives the value stored under the first scores key."""
+    assert _make_champion(scores={"recall": 0.85, "grounding_pct": 0.92}).primary_score() == 0.85
+
+
+def test_primary_metric_empty_scores():
+    """With no scores the helpers fall back to "" and 0.0."""
+    scoreless = _make_champion(scores={})
+    assert (scoreless.primary_metric(), scoreless.primary_score()) == ("", 0.0)
+
+
+# ── input_hash ────────────────────────────────────────────────────────────────
+
+
+def test_input_hash_is_16_chars():
+    """The digest of a small dict is exactly 16 characters long."""
+    assert len(input_hash({"key": "value"})) == 16
+
+
+def test_input_hash_is_deterministic():
+    """Hashing the same payload twice yields the same digest."""
+    payload = {"process_graph": {"processes": []}, "findings": []}
+    first, second = input_hash(payload), input_hash(payload)
+    assert first == second
+
+
+def test_input_hash_differs_for_different_inputs():
+    assert not input_hash({"a": 1}) == input_hash({"a": 2})  # one changed value, new digest
+
+
+def test_input_hash_key_order_independent():
+    forward, backward = {"a": 1, "b": 2}, {"b": 2, "a": 1}  # same items, different order
+    assert input_hash(forward) == input_hash(backward)
diff --git a/tests/unit/evaluation/test_gates.py b/tests/unit/evaluation/test_gates.py
new file mode 100644
index 00000000..2edc3b99
--- /dev/null
+++ b/tests/unit/evaluation/test_gates.py
@@ -0,0 +1,219 @@
+# Copyright 2026 Firefly Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for evaluation.gates: GateResult, verdict, render_scorecard, g5_no_regression."""
+
+from __future__ import annotations
+
+from fireflyframework_agentic.evaluation.gates import (
+    GateResult,
+    Verdict,
+    g5_no_regression,
+    render_scorecard,
+)
+from fireflyframework_agentic.evaluation.scorecard import verdict
+
+
+# ── GateResult ────────────────────────────────────────────────────────────────
+
+
+def test_gate_result_str_pass():
+    """A passing gate renders as "[G1] PASS"."""
+    assert str(GateResult(gate="G1", passed=True)) == "[G1] PASS"
+
+
+def test_gate_result_str_flag():
+    flagged = GateResult(gate="G2", passed=False, reason_code="RECALL_BELOW_FLOOR")  # failing gate
+    assert str(flagged) == "[G2] FLAG:RECALL_BELOW_FLOOR"
+
+
+def test_gate_result_flag_without_reason_code():
+    bare_flag = GateResult(gate="G3", passed=False, reason_code="")  # empty reason code
+    assert str(bare_flag) == "[G3] FLAG:"
+
+
+def test_gate_result_passed_true():
+    """The constructor keeps the passed flag and the details mapping as given."""
+    passing = GateResult(gate="G5", passed=True, details={"note": "ok"})
+    assert passing.passed is True and passing.details["note"] == "ok"
+
+
+def test_gate_result_default_details_is_empty_dict():
+    """Omitting ``details`` gives an empty dict."""
+    assert GateResult(gate="G1", passed=True).details == {}
+
+
+# ── verdict ───────────────────────────────────────────────────────────────────
+
+
+def test_verdict_promote_when_all_pass_and_g5_present():
+    """G1, G2, G3 and G5 all passing gives PROMOTE."""
+    gate_names = ("G1", "G2", "G3", "G5")
+    all_passing = [
+        GateResult(gate=name, passed=True)
+        for name in gate_names
+    ]
+    assert verdict(all_passing) == "PROMOTE"
+
+
+def test_verdict_hold_when_any_gate_fails():
+    """One failing gate (G2 here) is enough for HOLD, even with G5 passing."""
+    failing_g2 = GateResult(gate="G2", passed=False, reason_code="RECALL_BELOW_FLOOR")
+    passing = {name: GateResult(gate=name, passed=True) for name in ("G1", "G3", "G5")}
+
+    # Keep the G1, G2, G3, G5 order used by the other verdict tests.
+    gate_list = [passing["G1"], failing_g2, passing["G3"], passing["G5"]]
+    assert verdict(gate_list) == "HOLD"
+
+
+def test_verdict_hold_when_g5_missing():
+    """G1-G3 passing without any G5 result still gives HOLD."""
+    # No G5 entry is built at all, so the list stops at G3.
+    without_g5 = [
+        GateResult(gate=name, passed=True)
+        for name in ("G1", "G2", "G3")
+    ]
+    assert verdict(without_g5) == "HOLD"
+
+
+def test_verdict_hold_on_empty_list():
+    assert verdict(list()) == "HOLD"  # no gate results at all
+
+
+def test_verdict_hold_when_g5_fails():
+    """A failing G5 gives HOLD even though G1-G3 pass."""
+    gate_list = [
+        GateResult(gate=name, passed=True)
+        for name in ("G1", "G2", "G3")
+    ]
+    gate_list.append(GateResult(gate="G5", passed=False, reason_code="HOLD"))
+    assert verdict(gate_list) == "HOLD"
+
+
+# ── render_scorecard (from gates module) ──────────────────────────────────────
+
+
+def test_render_scorecard_contains_verdict_line():
+    """All four gates passing puts "VERDICT: PROMOTE" in the scorecard text."""
+    gate_names = ("G1", "G2", "G3", "G5")
+    all_passing = [
+        GateResult(gate=name, passed=True)
+        for name in gate_names
+    ]
+    rendered = render_scorecard(all_passing)
+    assert "VERDICT: PROMOTE" in rendered
+
+
+def test_render_scorecard_hold_when_flag():
+    """A flagged G1 puts "VERDICT: HOLD" in the scorecard text."""
+    gate_list = [GateResult(gate="G1", passed=False, reason_code="SCHEMA_INVALID")]
+    gate_list.extend(
+        GateResult(gate=name, passed=True)
+        for name in ("G2", "G3", "G5")
+    )
+    rendered = render_scorecard(gate_list)
+    assert "VERDICT: HOLD" in rendered
+
+
+def test_render_scorecard_includes_all_gate_lines():
+    """Each gate's bracketed label appears somewhere in the scorecard text."""
+    gate_names = ("G1", "G2", "G3", "G5")
+    rendered = render_scorecard(
+        [GateResult(gate=name, passed=True) for name in gate_names]
+    )
+    # Check the labels one at a time, in gate order.
+    for name in gate_names:
+        expected_label = f"[{name}]"
+        assert expected_label in rendered
+
+
+# ── g5_no_regression ──────────────────────────────────────────────────────────
+
+
+def test_g5_day_zero_insufficient_signoffs():
+    """Day zero with a single sign-off is held."""
+    outcome = g5_no_regression(
+        candidate_scores={"recall": 0.85},
+        champion_scores=None,
+        aa_noise=None,
+        is_day_zero=True,
+        human_signed_off=False,
+        signoff_count=1,
+    )
+    assert outcome.passed is False and outcome.reason_code == "HOLD"
+
+
+def test_g5_day_zero_sufficient_signoffs():
+    """Day zero with two sign-offs passes and is marked as day zero in details."""
+    outcome = g5_no_regression(
+        candidate_scores={"recall": 0.85},
+        champion_scores=None,
+        aa_noise=None,
+        is_day_zero=True,
+        human_signed_off=False,
+        signoff_count=2,
+    )
+    assert outcome.passed is True and outcome.details["day_zero"] is True
+
+
+def test_g5_hold_when_no_human_signoff():
+    """An improved candidate is still held when no human has signed off."""
+    outcome = g5_no_regression(
+        candidate_scores={"recall": 0.90},
+        champion_scores={"recall": 0.80},
+        aa_noise={"recall": 0.02},
+        human_signed_off=False,
+    )
+    assert outcome.passed is False and outcome.reason_code == "HOLD"
+
+
+def test_g5_hold_when_regression_beyond_band():
+    """A recall drop of 0.05 against a 0.02 noise band is reported as a regression."""
+    scores = {"candidate": {"recall": 0.75}, "champion": {"recall": 0.80}}
+    outcome = g5_no_regression(
+        candidate_scores=scores["candidate"],
+        champion_scores=scores["champion"],
+        aa_noise={"recall": 0.02},
+        human_signed_off=True,
+    )
+    assert outcome.passed is False and outcome.reason_code == "HOLD"
+    assert any("recall" in entry for entry in outcome.details["regressions"])
+
+
+def test_g5_promote_when_candidate_beats_champion():
+    """A signed-off candidate above the champion passes with improvements listed."""
+    outcome = g5_no_regression(
+        candidate_scores={"recall": 0.90},
+        champion_scores={"recall": 0.80},
+        aa_noise={"recall": 0.02},
+        human_signed_off=True,
+    )
+    assert outcome.passed is True and outcome.details["improvements"]
+
+
+def test_g5_promote_when_within_noise_band():
+    """A +0.01 recall change inside the 0.02 band passes but lists no improvement."""
+    outcome = g5_no_regression(
+        candidate_scores={"recall": 0.81},
+        champion_scores={"recall": 0.80},
+        aa_noise={"recall": 0.02},
+        human_signed_off=True,
+    )
+    assert outcome.passed is True
+    assert outcome.details["improvements"] == []
+
+
+def test_g5_verdict_constants():
+    """The Verdict constants compare equal to their plain string names."""
+    assert (Verdict.PROMOTE, Verdict.HOLD) == ("PROMOTE", "HOLD")
diff --git a/tests/unit/evaluation/test_matcher.py b/tests/unit/evaluation/test_matcher.py
new file mode 100644
index 00000000..cc87564b
--- /dev/null
+++ b/tests/unit/evaluation/test_matcher.py
@@ -0,0 +1,221 @@
+# Copyright 2026 Firefly Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for evaluation.matcher: anchored, source_stem, tokens, matches."""
+
+from __future__ import annotations
+
+import pytest
+
+from fireflyframework_agentic.evaluation.matcher import (
+    anchored,
+    matches,
+    source_stem,
+    tokens,
+)
+from fireflyframework_agentic.evaluation.registry import RegistryItem
+
+
+# ── tokens ───────────────────────────────────────────────────────────────────
+
+
+def test_tokens_basic():
+    result = tokens("Hello World")
+    assert result == ["hello", "world"]
+
+
+def test_tokens_lowercases():
+    result = tokens("KYC AML PEP")
+    assert result == ["kyc", "aml", "pep"]
+
+
+def test_tokens_strips_punctuation():
+    result = tokens("risk-management: cost (FTE).")
+    assert "risk" in result
+    assert "management" in result
+    assert "cost" in result
+    assert "fte" in result
+
+
+def test_tokens_empty_string():
+    assert tokens("") == []
+
+
+def test_tokens_numbers_included():
+    result = tokens("case-id CU-2026-1003")
+    assert "2026" in result or "cu" in result
+
+
+def test_tokens_unicode():
+    result = tokens("análisis de crédito")
+    assert "análisis" in result or "an" in result
+
+
+# ── anchored ─────────────────────────────────────────────────────────────────
+
+
+def test_anchored_overlapping_long_token():
+    # "underwriting" is 12 chars — well above the 5-char floor.
+    assert anchored("credit underwriting risk", "underwriting process steps") is True
+
+
+def test_anchored_no_overlap():
+    # No token >= 5 chars shared between claim and evidence.
+    assert anchored("cat sat", "dog ran") is False
+
+
+def test_anchored_short_tokens_ignored():
+    # All tokens in both strings are < 5 chars; no overlap counts.
+    assert anchored("a big cat", "a big dog") is False
+
+
+def test_anchored_mixed_lengths_match():
+    # "kyc" is < 5, but "compliance" is long enough.
+    assert anchored("kyc compliance review", "compliance framework") is True
+
+
+def test_anchored_custom_min_token():
+    # Lower the floor so short tokens can anchor.
+    assert anchored("kyc check", "kyc process", min_token=3) is True
+
+
+def test_anchored_both_empty():
+    assert anchored("", "") is False
+
+
+def test_anchored_partial_token_no_match():
+    # "risk" (4 chars) is below the default 5-char floor.
+    assert anchored("risk alert", "risk factor") is False
+
+
+def test_anchored_returns_bool():
+    result = anchored("credit underwriting", "underwriting model")
+    assert isinstance(result, bool)
+
+
+# ── source_stem ───────────────────────────────────────────────────────────────
+
+
+def test_source_stem_bare_filename_with_extension():
+    assert source_stem("SOP-002-kyc-edd.md") == "sop-002-kyc-edd"
+
+
+def test_source_stem_directory_prefixed():
+    assert source_stem("sops/SOP-002-kyc-edd.md") == "sop-002-kyc-edd"
+
+
+def test_source_stem_deep_path_prefix():
+    assert source_stem("docs/policies/SOP-002-kyc-edd.md") == "sop-002-kyc-edd"
+
+
+def test_source_stem_lowercase():
+    # Stems are always lowercased.
+    assert source_stem("REPORT-FINAL.pdf") == "report-final"
+
+
+def test_source_stem_event_log_row_id():
+    # src-<process>:<case> → process stem.
+    assert source_stem("src-credit-underwriting:CU-2026-1003") == "credit-underwriting"
+
+
+def test_source_stem_event_log_row_id_preserves_hyphens():
+    assert source_stem("src-kyc-onboarding:KYC-001") == "kyc-onboarding"
+
+
+def test_source_stem_strips_fragment():
+    # #page=N should be removed before stemming.
+    assert source_stem("docs/report.pdf#page=5") == "report"
+
+
+def test_source_stem_strips_anchor():
+    assert source_stem("sops/SOP-001.md#section-3") == "sop-001"
+
+
+def test_source_stem_bare_no_extension():
+    # No extension, no directory — stem is just the lowercase name.
+    assert source_stem("my-document") == "my-document"
+
+
+def test_source_stem_no_directory_no_extension_lowercase():
+    assert source_stem("Signal") == "signal"
+
+
+def test_source_stem_csv_extension():
+    assert source_stem("activity-cost-fte.csv") == "activity-cost-fte"
+
+
+# ── matches ───────────────────────────────────────────────────────────────────
+
+
+def _make_item(description: str, evidence: list[str], keywords: list[str] | None = None) -> RegistryItem:
+    """Construct a minimal RegistryItem for matching tests."""
+    return RegistryItem(
+        id="test-item",
+        tier="L1",
+        description=description,
+        evidence=evidence,
+        scope="finding",
+        keywords=keywords or [],
+    )
+
+
+def _make_finding(title: str, description: str, evidence_id: str) -> dict:
+    return {
+        "title": title,
+        "description": description,
+        "evidence_refs": [{"evidence_id": evidence_id}],
+    }
+
+
+def _make_evidence_index(evidence_id: str, locator: str, excerpt: str = "") -> dict:
+    return {evidence_id: {"id": evidence_id, "locator": locator, "excerpt": excerpt}}
+
+
+def test_matches_true_when_source_and_topic_match():
+    # Finding title shares a long token with item description and cites the same source.
+    item = _make_item("credit underwriting process", ["sop-kyc-credit.md"])
+    finding = _make_finding("credit underwriting review", "credit underwriting risk assessment", "ev-1")
+    evidence_index = _make_evidence_index("ev-1", "sop-kyc-credit.md")
+    assert matches(finding, item, evidence_index, scope="finding") is True
+
+
+def test_matches_false_when_source_differs():
+    # Token match exists but sources don't overlap — anti-gaming guard fires.
+    item = _make_item("credit underwriting process", ["sop-credit.md"])
+    finding = _make_finding("credit underwriting review", "credit underwriting details", "ev-1")
+    evidence_index = _make_evidence_index("ev-1", "other-document.md")
+    assert matches(finding, item, evidence_index, scope="finding") is False
+
+
+def test_matches_false_when_no_token_overlap():
+    # Same source, but no shared long token between finding text and item description.
+    item = _make_item("regulatory capital requirement", ["sop-capital.md"])
+    finding = _make_finding("kyc identity check", "client onboarding steps", "ev-1")
+    evidence_index = _make_evidence_index("ev-1", "sop-capital.md")
+    assert matches(finding, item, evidence_index, scope="finding") is False
+
+
+def test_matches_keyword_rail_short_token():
+    # "KYC" is 3 chars — below the 5-char token floor but valid as a keyword.
+    item = _make_item("some description about identity", ["sop-kyc.md"], keywords=["KYC"])
+    finding = _make_finding("KYC onboarding", "KYC onboarding process", "ev-1")
+    evidence_index = _make_evidence_index("ev-1", "sop-kyc.md")
+    assert matches(finding, item, evidence_index, scope="finding") is True
+
+
+def test_matches_empty_evidence_refs_returns_false():
+    # Finding with no evidence refs cannot share a source with any item.
+    item = _make_item("credit underwriting", ["sop-credit.md"])
+    finding = {"title": "credit underwriting", "description": "credit underwriting risk", "evidence_refs": []}
+    assert matches(finding, item, {}, scope="finding") is False
diff --git a/tests/unit/evaluation/test_stats.py b/tests/unit/evaluation/test_stats.py
new file mode 100644
index 00000000..9523be8c
--- /dev/null
+++ b/tests/unit/evaluation/test_stats.py
@@ -0,0 +1,183 @@
+# Copyright 2026 Firefly Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for evaluation.stats: aa_band, aggregate_grounding, left_skew_flag."""
+
+from __future__ import annotations
+
+import pytest
+
+from fireflyframework_agentic.evaluation.stats import (
+    aa_band,
+    aggregate_grounding,
+    left_skew_flag,
+)
+
+
+# ── aa_band ──────────────────────────────────────────────────────────────────
+
+
+def test_aa_band_two_identical_scores():
+    # Two identical scores produce zero pairwise delta.
+    assert aa_band([0.80, 0.80]) == 0.0
+
+
+def test_aa_band_two_different_scores():
+    # Single delta = |0.90 - 0.80| = 0.10; 95th percentile of one value is that value.
+    result = aa_band([0.80, 0.90])
+    assert abs(result - 0.10) < 1e-9
+
+
+def test_aa_band_three_scores_known_deltas():
+    # Scores: 0.70, 0.80, 0.90
+    # Pairwise deltas: |0.70-0.80|=0.10, |0.70-0.90|=0.20, |0.80-0.90|=0.10
+    # Sorted: [0.10, 0.10, 0.20] → 95th pct index = int(3 * 95 / 100) = 2 → 0.20
+    result = aa_band([0.70, 0.80, 0.90])
+    assert abs(result - 0.20) < 1e-9
+
+
+def test_aa_band_large_spread():
+    # Max delta in [0.0, 1.0] is 1.0.
+    result = aa_band([0.0, 1.0])
+    assert abs(result - 1.0) < 1e-9
+
+
+def test_aa_band_requires_at_least_two_scores():
+    with pytest.raises(ValueError, match="aa_band requires >= 2 reruns"):
+        aa_band([0.80])
+
+
+def test_aa_band_empty_raises():
+    with pytest.raises(ValueError, match="aa_band requires >= 2 reruns"):
+        aa_band([])
+
+
+def test_aa_band_custom_percentile():
+    # 50th percentile of [0.10, 0.10, 0.20] at idx=1 → 0.10.
+    result = aa_band([0.70, 0.80, 0.90], percentile=50)
+    assert abs(result - 0.10) < 1e-9
+
+
+def test_aa_band_returns_float():
+    result = aa_band([0.80, 0.85, 0.90])
+    assert isinstance(result, float)
+
+
+# ── aggregate_grounding ───────────────────────────────────────────────────────
+
+
+def test_aggregate_grounding_single_dict():
+    g = {"support_pct": 90.0, "unsupported_ids": ["ev-1"]}
+    result = aggregate_grounding([g])
+    assert result["support_pct"] == 90.0
+    assert result["unsupported_ids"] == ["ev-1"]
+    assert result["_aggregate_runs"] == 1
+
+
+def test_aggregate_grounding_mean_support_pct():
+    dicts = [
+        {"support_pct": 80.0, "unsupported_ids": []},
+        {"support_pct": 100.0, "unsupported_ids": []},
+    ]
+    result = aggregate_grounding(dicts)
+    assert result["support_pct"] == 90.0
+
+
+def test_aggregate_grounding_union_of_unsupported_ids():
+    dicts = [
+        {"support_pct": 90.0, "unsupported_ids": ["ev-1", "ev-2"]},
+        {"support_pct": 85.0, "unsupported_ids": ["ev-2", "ev-3"]},
+    ]
+    result = aggregate_grounding(dicts)
+    assert set(result["unsupported_ids"]) == {"ev-1", "ev-2", "ev-3"}
+
+
+def test_aggregate_grounding_union_sorted():
+    dicts = [
+        {"support_pct": 90.0, "unsupported_ids": ["ev-b"]},
+        {"support_pct": 90.0, "unsupported_ids": ["ev-a"]},
+    ]
+    result = aggregate_grounding(dicts)
+    assert result["unsupported_ids"] == ["ev-a", "ev-b"]
+
+
+def test_aggregate_grounding_empty_input():
+    result = aggregate_grounding([])
+    assert result["support_pct"] == 0.0
+    assert result["unsupported_ids"] == []
+
+
+def test_aggregate_grounding_records_run_count():
+    dicts = [
+        {"support_pct": 80.0, "unsupported_ids": []},
+        {"support_pct": 90.0, "unsupported_ids": []},
+        {"support_pct": 100.0, "unsupported_ids": []},
+    ]
+    result = aggregate_grounding(dicts)
+    assert result["_aggregate_runs"] == 3
+
+
+def test_aggregate_grounding_per_run_pct_recorded():
+    dicts = [
+        {"support_pct": 80.0, "unsupported_ids": []},
+        {"support_pct": 100.0, "unsupported_ids": []},
+    ]
+    result = aggregate_grounding(dicts)
+    assert result["_support_pct_per_run"] == [80.0, 100.0]
+
+
+def test_aggregate_grounding_missing_unsupported_ids_treated_as_empty():
+    dicts = [
+        {"support_pct": 90.0},  # no unsupported_ids key
+        {"support_pct": 80.0, "unsupported_ids": ["ev-1"]},
+    ]
+    result = aggregate_grounding(dicts)
+    assert result["unsupported_ids"] == ["ev-1"]
+
+
+# ── left_skew_flag ────────────────────────────────────────────────────────────
+
+
+def test_left_skew_flag_true_when_catastrophic_run():
+    # median([0.60, 0.80, 0.80]) = 0.80; min = 0.60 < 0.80 - 0.10 = 0.70.
+    assert left_skew_flag([0.60, 0.80, 0.80]) is True
+
+
+def test_left_skew_flag_false_when_min_close_to_median():
+    # median = 0.80; min = 0.75; 0.75 >= 0.80 - 0.10 = 0.70 → no flag.
+    assert left_skew_flag([0.75, 0.80, 0.85]) is False
+
+
+def test_left_skew_flag_false_when_all_equal():
+    assert left_skew_flag([0.85, 0.85, 0.85]) is False
+
+
+def test_left_skew_flag_boundary_just_above_threshold():
+    # min = 0.71, median = 0.80; 0.71 >= 0.80 - 0.10 = 0.70 → no flag.
+    assert left_skew_flag([0.71, 0.80, 0.80]) is False
+
+
+def test_left_skew_flag_single_score_always_false():
+    # A single score has no meaningful distribution; function returns False.
+    assert left_skew_flag([0.50]) is False
+
+
+def test_left_skew_flag_two_scores_with_large_gap():
+    # median([0.50, 0.90]) = 0.70; min = 0.50 < 0.70 - 0.10 = 0.60.
+    assert left_skew_flag([0.50, 0.90]) is True
+
+
+def test_left_skew_flag_returns_bool():
+    result = left_skew_flag([0.80, 0.85, 0.90])
+    assert isinstance(result, bool)
diff --git a/tests/unit/lab/test_retrieval_metrics.py b/tests/unit/lab/test_retrieval_metrics.py
new file mode 100644
index 00000000..a018a08b
--- /dev/null
+++ b/tests/unit/lab/test_retrieval_metrics.py
@@ -0,0 +1,247 @@
+# Copyright 2026 Firefly Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for lab.retrieval_metrics: compute_retrieval_metrics and RetrieverMetrics."""
+
+from __future__ import annotations
+
+import math
+
+import pytest
+
+from fireflyframework_agentic.lab.retrieval_metrics import (
+    RetrieverMetrics,
+    compute_retrieval_metrics,
+)
+
+
+# ── helpers ───────────────────────────────────────────────────────────────────
+
+
+def _row(gold_rank: int | None = None, total: int = 5, n_gold: int = 1) -> dict:
+    """Build one result row with ``total`` retrieved items.
+
+    If ``gold_rank`` is not None, the item at that rank is marked as gold and its id fills
+    ``gold`` (``n_gold`` times). All items get a unique ``source_id`` so dedup leaves them all.
+    """
+    retrieved = []
+    for rank in range(1, total + 1):
+        retrieved.append({
+            "rank": rank,
+            "source_id": f"doc-{rank}",
+            "is_gold": rank == gold_rank,
+        })
+    gold_ids = [f"doc-{gold_rank}"] if gold_rank is not None else []
+    return {
+        "retrieved": retrieved,
+        "gold": gold_ids * n_gold,
+    }
+
+
+# ── hit@k ─────────────────────────────────────────────────────────────────────
+
+
+def test_hit_at_1_perfect_when_gold_is_rank1():
+    results = [_row(gold_rank=1)]
+    m = compute_retrieval_metrics(results)
+    assert m["hit@1"] == 1.0
+
+
+def test_hit_at_1_zero_when_gold_not_in_top1():
+    results = [_row(gold_rank=2)]
+    m = compute_retrieval_metrics(results)
+    assert m["hit@1"] == 0.0
+
+
+def test_hit_at_5_one_when_gold_at_rank5():
+    results = [_row(gold_rank=5)]
+    m = compute_retrieval_metrics(results)
+    assert m["hit@5"] == 1.0
+
+
+def test_hit_at_5_zero_when_gold_not_in_top5():
+    # Ten items are retrieved and none of them is gold, so nothing can hit inside the top-5 window.
+    results = [_row(gold_rank=None, total=10)]  # no gold in retrieved
+    m = compute_retrieval_metrics(results)
+    assert m["hit@5"] == 0.0
+
+
+def test_hit_at_10_one_when_gold_at_rank10():
+    results = [_row(gold_rank=10, total=10)]
+    m = compute_retrieval_metrics(results)
+    assert m["hit@10"] == 1.0
+
+
+# ── recall@k ──────────────────────────────────────────────────────────────────
+
+
+def test_recall_at_k_increases_with_k():
+    # Gold at rank 3: recall@1=0, recall@5>=recall@1.
+    results = [_row(gold_rank=3)]
+    m = compute_retrieval_metrics(results)
+    assert m["recall@1"] <= m["recall@5"] <= m["recall@10"]
+
+
+def test_recall_at_1_full_when_single_gold_at_rank1():
+    results = [_row(gold_rank=1, n_gold=1)]
+    m = compute_retrieval_metrics(results)
+    assert m["recall@1"] == 1.0
+
+
+def test_recall_at_1_zero_when_no_gold_in_rank1():
+    results = [_row(gold_rank=5)]
+    m = compute_retrieval_metrics(results)
+    assert m["recall@1"] == 0.0
+
+
+# ── MRR ───────────────────────────────────────────────────────────────────────
+
+
+def test_mrr_is_1_when_gold_at_rank1():
+    results = [_row(gold_rank=1)]
+    m = compute_retrieval_metrics(results)
+    assert m["mrr@10"] == 1.0
+
+
+def test_mrr_is_half_when_gold_at_rank2():
+    results = [_row(gold_rank=2)]
+    m = compute_retrieval_metrics(results)
+    assert abs(m["mrr@10"] - 0.5) < 1e-9
+
+
+def test_mrr_is_zero_when_no_gold():
+    results = [_row(gold_rank=None)]
+    m = compute_retrieval_metrics(results)
+    assert m["mrr@10"] == 0.0
+
+
+def test_mrr_average_across_queries():
+    # Query 1: gold at rank 1 (MRR=1.0); Query 2: gold at rank 2 (MRR=0.5).
+    results = [_row(gold_rank=1), _row(gold_rank=2)]
+    m = compute_retrieval_metrics(results)
+    assert abs(m["mrr@10"] - 0.75) < 1e-3
+
+
+# ── nDCG ──────────────────────────────────────────────────────────────────────
+
+
+def test_ndcg_is_1_when_gold_at_rank1():
+    results = [_row(gold_rank=1, n_gold=1)]
+    m = compute_retrieval_metrics(results)
+    assert abs(m["ndcg@10"] - 1.0) < 1e-9
+
+
+def test_ndcg_is_less_than_1_when_gold_not_at_rank1():
+    results = [_row(gold_rank=3, n_gold=1)]
+    m = compute_retrieval_metrics(results)
+    assert m["ndcg@10"] < 1.0
+    assert m["ndcg@10"] > 0.0
+
+
+def test_ndcg_is_zero_when_no_gold():
+    results = [_row(gold_rank=None)]
+    m = compute_retrieval_metrics(results)
+    assert m["ndcg@10"] == 0.0
+
+
+# ── n_queries ─────────────────────────────────────────────────────────────────
+
+
+def test_n_queries_matches_input_length():
+    results = [_row(gold_rank=1), _row(gold_rank=2), _row(gold_rank=3)]
+    m = compute_retrieval_metrics(results)
+    assert m["n_queries"] == 3
+
+
+def test_empty_results_returns_zero_n_queries():
+    m = compute_retrieval_metrics([])
+    assert m["n_queries"] == 0
+
+
+# ── optional fields ───────────────────────────────────────────────────────────
+
+
+def test_no_answer_rate_is_zero_when_answer_present():
+    # Rows with a non-empty answer string are counted as answered.
+    results = [{**_row(gold_rank=1), "answer": "some answer text"}]
+    m = compute_retrieval_metrics(results)
+    assert m["no_answer_rate"] == 0.0
+
+
+def test_no_answer_rate_is_one_when_no_answer_field():
+    # Rows without an answer field are treated as no-answer by the implementation.
+    results = [_row(gold_rank=1)]
+    m = compute_retrieval_metrics(results)
+    assert m["no_answer_rate"] == 1.0
+
+
+def test_citation_precision_is_none_when_no_citations():
+    results = [_row(gold_rank=1)]
+    m = compute_retrieval_metrics(results)
+    assert m["citation_precision"] is None
+
+
+def test_latency_fields_are_none_when_absent():
+    results = [_row(gold_rank=1)]
+    m = compute_retrieval_metrics(results)
+    assert m["mean_search_ms"] is None
+    assert m["mean_answer_ms"] is None
+
+
+def test_mean_search_ms_computed_when_present():
+    results = [{**_row(gold_rank=1), "search_ms": 100.0, "answer_ms": 200.0}]
+    m = compute_retrieval_metrics(results)
+    assert m["mean_search_ms"] == 100
+    assert m["mean_answer_ms"] == 200
+
+
+# ── RetrieverMetrics.from_results ─────────────────────────────────────────────
+
+
+def test_retriever_metrics_from_results_hit_at_1():
+    results = [_row(gold_rank=1)]
+    rm = RetrieverMetrics.from_results(results)
+    assert rm.hit_at_1 == 1.0
+
+
+def test_retriever_metrics_from_results_n_queries():
+    results = [_row(gold_rank=1), _row(gold_rank=2)]
+    rm = RetrieverMetrics.from_results(results)
+    assert rm.n_queries == 2
+
+
+def test_retriever_metrics_from_results_mrr():
+    results = [_row(gold_rank=1)]
+    rm = RetrieverMetrics.from_results(results)
+    assert rm.mrr_at_10 == 1.0
+
+
+def test_retriever_metrics_from_results_defaults_on_empty():
+    rm = RetrieverMetrics.from_results([])
+    assert rm.n_queries == 0
+    assert rm.hit_at_1 == 0.0
+    assert rm.mrr_at_10 == 0.0
+
+
+def test_retriever_metrics_is_pydantic_model():
+    rm = RetrieverMetrics()
+    assert rm.n_queries == 0
+    assert rm.hit_at_1 == 0.0
+    assert rm.no_answer_rate is None
+
+
+def test_retriever_metrics_recall_increases_with_k():
+    results = [_row(gold_rank=3)]
+    rm = RetrieverMetrics.from_results(results)
+    assert rm.recall_at_1 <= rm.recall_at_5 <= rm.recall_at_10

From f79439b0abec86dac42ae96ad3e61856a39cea60 Mon Sep 17 00:00:00 2001
From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 00:12:26 +0200
Subject: [PATCH 11/48] docs(evaluation): add evaluation package documentation
 (#278)

* feat(evaluation): add evaluation package documentation

* docs(evaluation): mention evaluation subpackage in README

---------

Co-authored-by: miguelgfierro <miguelgfierro@users.noreply.github.com>
---
 README.md          |   7 +
 docs/evaluation.md | 435 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 442 insertions(+)
 create mode 100644 docs/evaluation.md

diff --git a/README.md b/README.md
index 9d005b23..904237da 100644
--- a/README.md
+++ b/README.md
@@ -412,6 +412,12 @@ classDiagram
   `EvalDataset` loads/saves test cases from JSON. `ModelComparison` runs the
   same prompts across multiple agents for side-by-side analysis.
 
+- **Evaluation** — A five-gate quality pipeline (G1–G5), LLM-as-judge advisory scoring,
+  champion/challenger tracking, and deterministic retrieval metrics for assessing
+  agent and pipeline outputs. The `flyeval` CLI drives the full gate pipeline from
+  the command line. Install with `pip install "fireflyframework-agentic[evaluation]"`.
+  See [docs/evaluation.md](docs/evaluation.md) for the full guide.
+
   > **Optional developer tooling.** `fireflyframework_agentic.experiments` (A/B
   > experiments) and `fireflyframework_agentic.lab` (offline evaluation /
   > benchmarking) are leaf modules — nothing in the core imports them and they add
@@ -817,6 +823,7 @@ Detailed guides for each module:
 - [Security](docs/security.md) — Prompt/output guards, at-rest encryption
 - [Experiments](docs/experiments.md) — A/B testing, variant comparison
 - [Lab](docs/lab.md) — Benchmarks, datasets, evaluators
+- [Evaluation](docs/evaluation.md) — Gate pipeline, flyeval CLI, champion/challenger, retrieval metrics
 - Studio — moved to [fireflyframework-agentic-studio](https://github.com/fireflyframework/fireflyframework-agentic-studio)
 ---
 
diff --git a/docs/evaluation.md b/docs/evaluation.md
new file mode 100644
index 00000000..c2abe319
--- /dev/null
+++ b/docs/evaluation.md
@@ -0,0 +1,435 @@
+# Evaluation Guide
+
+Copyright 2026 Firefly Software Foundation. Licensed under the Apache License 2.0.
+
+The Evaluation subpackage provides a five-gate quality pipeline, LLM-as-judge advisory scoring,
+champion/challenger tracking, and deterministic retrieval metrics for assessing agent outputs.
+
+---
+
+## Concepts
+
+### Gate pipeline
+
+The evaluation framework runs **five gates** in sequence. Every gate always runs — a failed
+gate raises a *flag*, not a veto, so the scorecard always carries the complete picture.
+
+| Gate | Name | Kind | Description |
+|------|------|------|-------------|
+| G1 | Structural & Safe | Deterministic | Schema validity, PII non-disclosure, empty-registry guard. |
+| G2 | Must-finds & Negative Controls | Deterministic | Lexical/semantic recall against the must-find registry; NC precision. |
+| G3 | Evidence (Grounding) | Deterministic | Excerpt-to-corpus anchoring; fabricated-evidence detection. |
+| G4 | LLM-as-a-Judge | Advisory (non-blocking) | Semantic faithfulness, entailment, gap detection — never changes the verdict. |
+| G5 | No-regression / Promotion | Human decision | Champion/challenger comparison with A/A noise band; collects sign-offs. |
+
+**No gate vetoes.** A failing gate is recorded as a flag in its `GateResult` and scoring continues.
+The scorecard carries every signal regardless of which gates fired.
+
+### GateResult
+
+`GateResult` is a dataclass returned by each gate:
+
+```python
+@dataclass
+class GateResult:
+    gate: str       # "G1", "G2", …, "G5"
+    passed: bool
+    reason_code: str = ""   # e.g. "SCHEMA_INVALID", "NC_HIT", "UNGROUNDED"
+    details: dict = field(default_factory=dict)
+```
+
+`str(gate_result)` prints `[G2] PASS` or `[G2] FLAG:NC_HIT`.
+
+### Verdict
+
+`verdict(gate_results)` returns `VERDICT_PROMOTE` or `VERDICT_HOLD`:
+
+- `VERDICT_PROMOTE` — all gates passed **and** G5 (the human sign-off gate) is present.
+- `VERDICT_HOLD` — any gate flagged, or G5 is missing.
+
+The CLI exits `0` on PROMOTE and `1` on HOLD, so it composes into CI.
+
+### Must-find registry
+
+A registry (`lean-1` schema) is a JSON file listing items the discovery output is
+expected to surface (`tier` L0–L3) and negative controls (NC) it must *not* assert.
+
+```json
+{
+  "schema_version": "lean-1",
+  "corpus": "banca-cordobesa",
+  "items": [
+    { "id": "ao-pep-4eyes", "tier": "L0", "scope": "decision",
+      "description": "PEP cases require a second analyst sign-off (4-eyes)",
+      "keywords": ["PEP", "4-eyes"],
+      "evidence": ["SOP-002-kyc-edd.md"] },
+    { "id": "ao-nc-realtime", "tier": "NC", "scope": "finding",
+      "description": "KYC-Hub synchronises in real time — factually false" }
+  ]
+}
+```
+
+Tier semantics: L0 = must-find control (a single miss flags the run), L1 = high-priority,
+L2 = important, L3 = nice-to-have (not counted in the recall floor).
+
+### Advisory judge (G4)
+
+G4 calls a chat LLM (or local Ollama model) for semantic checks the deterministic gates
+cannot perform: faithfulness, entailment, numeric/temporal fidelity, actionability,
+fabricated-entity detection, and more. It is:
+
+- **Non-blocking** — `AdvisoryReport` is carried separately and never enters `verdict()`.
+- **Non-deterministic** — each metric runs `judge_runs` times (default: 3) and the
+  median score is reported.
+- **Opt-in** — pass `--judge-model provider:model` to activate it; omit the flag to skip.
+
+### Champion/challenger pattern
+
+Champions are **per-corpus**. `ChampionRecord` persists the best-known run so that
+promotion decisions are made against a stable, signed baseline rather than the last run.
+
+```
+               ┌──────────────────────────────────────────┐
+               │  run result JSON (challenger)            │
+               └──────────────┬───────────────────────────┘
+                              │
+              ┌───────────────▼───────────────┐
+              │  G1 · G2 · G3 (deterministic) │
+              │  G4 (advisory, opt-in)         │
+              └───────────────┬───────────────┘
+                              │  flags + scores
+              ┌───────────────▼───────────────┐
+              │  G5 — no-regression vs        │
+              │  champion baseline + A/A band │
+              └───────────────┬───────────────┘
+                              │
+              ┌───────────────▼───────────────┐
+              │  Markdown scorecard           │
+              │  PROMOTE / HOLD               │
+              └───────────────────────────────┘
+```
+
+`invalidate_champion()` marks a baseline invalid. The `EMPTY_MUST_FIND` guard in G1
+prevents a fake-100% champion from being created against an empty registry.
+
+---
+
+## Installation
+
+The evaluation subpackage requires `scipy` and `numpy`. Install the optional extra:
+
+```bash
+pip install "fireflyframework-agentic[evaluation]"
+```
+
+The `flyeval` CLI entry-point is registered automatically by the package. Verify:
+
+```bash
+flyeval --version
+```
+
+---
+
+## CLI
+
+All subcommands exit `0` on PROMOTE and `1` on HOLD.
+
+### `flyeval gate`
+
+Run the full gate pipeline against a result JSON and print a Markdown scorecard.
+
+```bash
+flyeval gate \
+  --result      runs/2026-06-18/output.json \
+  --registry    registries/banca-cordobesa.json \
+  --baseline    baselines/banca-cordobesa.json \
+  --judge-model anthropic:claude-3-5-haiku \
+  --judge-runs  3
+```
+
+Key flags:
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--result` | required | Path to the run's `output.json`. |
+| `--registry` | required | Must-find registry (lean-1 JSON). |
+| `--baseline` | — | Champion baseline JSON for G5 regression check. |
+| `--judge-model` | — | `provider:model` for G4 advisory judge. |
+| `--judge-runs` | 3 | Number of independent judge calls (median aggregation). |
+| `--no-judge` | — | Skip G4 entirely. |
+| `--recall-floor` | 0.70 | Minimum G2 recall before flagging. |
+| `--grounding-floor` | 0.90 | Minimum G3 grounding rate before flagging. |
+| `--corpus` | — | Path to the evidence corpus bundle for G3 verification. |
+| `--pii-list` | — | Path to a JSON array of names to scan for PII leaks (G1). |
+| `--embedder` | — | `provider:model` for semantic recall (G2 embedding path). |
+| `--model-id` | "unknown" | Identifier of the model under evaluation (for scorecard). |
+
+### `flyeval aa-band`
+
+Compute the A/A noise band from multiple repeated runs of the same model to establish
+the noise floor before setting up the champion comparison.
+
+```bash
+flyeval aa-band \
+  --results runs/aa-run-1/output.json runs/aa-run-2/output.json runs/aa-run-3/output.json \
+  --registry registries/banca-cordobesa.json
+```
+
+The command prints per-metric variance and recommended noise floors.
+
+### `flyeval day-zero`
+
+Promote the very first champion for a corpus (Day-Zero protocol). Requires at least
+`--signoffs` sign-offs (default: 2) before PROMOTE is issued.
+
+```bash
+flyeval day-zero \
+  --result   runs/2026-06-18/output.json \
+  --registry registries/banca-cordobesa.json \
+  --baseline baselines/banca-cordobesa.json \
+  --signoffs 2
+```
+
+The command writes the new `ChampionRecord` into `--baseline` on success.
+
+### `flyeval invalidate`
+
+Mark the current champion invalid with a documented reason. Use this when the registry
+changes in a way that makes the existing champion incommensurable.
+
+```bash
+flyeval invalidate \
+  --baseline baselines/banca-cordobesa.json \
+  --reason   "Registry expanded from 39 to 94 items (lean-1 v2)."
+```
+
+---
+
+## Python API
+
+### Running gates
+
+```python
+import json
+from fireflyframework_agentic.evaluation import (
+    run_gates,
+    render_scorecard,
+    verdict,
+    load_registry,
+    VERDICT_PROMOTE,
+)
+
+result = json.loads(open("runs/2026-06-18/output.json").read())
+registry = load_registry("registries/banca-cordobesa.json")
+
+gate_results = run_gates(result, registry)
+scorecard_md = render_scorecard(
+    gate_results,
+    corpus="banca-cordobesa",
+    model_id="anthropic:claude-3-5-sonnet",
+    run_id="2026-06-18-sonnet-01",
+)
+print(scorecard_md)
+
+v = verdict(gate_results)
+print("Verdict:", v)  # "PROMOTE" or "HOLD"
+assert v == VERDICT_PROMOTE
+```
+
+### Champion management
+
+```python
+from fireflyframework_agentic.evaluation import (
+    load_champion,
+    save_champion,
+    invalidate_champion,
+    ChampionRecord,
+)
+
+# Load the current champion (returns None on Day Zero).
+champ = load_champion("baselines/banca-cordobesa.json")
+if champ is None:
+    print("Day Zero — no champion yet.")
+else:
+    print(f"Champion: {champ.run_id} | {champ.primary_metric()}={champ.primary_score():.3f}")
+
+# Save a new champion after a successful PROMOTE.
+new_champ = ChampionRecord(
+    corpus="banca-cordobesa",
+    run_id="2026-06-18-sonnet-01",
+    model_id="anthropic:claude-3-5-sonnet",
+    registry_sha256=registry.sha256(),
+    scores={"lexical_recall": 0.857, "grounding_pct": 0.941},
+    human_sign_offs=["alice", "bob"],
+)
+save_champion("baselines/banca-cordobesa.json", new_champ)
+
+# Invalidate when the registry changes materially.
+invalidate_champion(
+    "baselines/banca-cordobesa.json",
+    reason="Registry expanded from 39 to 94 items.",
+)
+```
+
+### EvalConfig
+
+`EvalConfig` is a Pydantic model that captures the parameters of a single evaluation run.
+Use it to build reproducible, serialisable run records.
+
+```python
+from fireflyframework_agentic.evaluation.models import EvalConfig
+
+cfg = EvalConfig(
+    model_id="anthropic:claude-3-5-sonnet",
+    corpus="banca-cordobesa",
+    run_id="2026-06-18-sonnet-01",
+    registry_path="registries/banca-cordobesa.json",
+    corpus_path="corpora/banca-cordobesa/",
+    baseline_path="baselines/banca-cordobesa.json",
+    judge_model="anthropic:claude-3-5-haiku",
+    judge_runs=3,
+)
+print(cfg.model_dump_json(indent=2))
+```
+
+### Advisory judge (G4)
+
+```python
+from fireflyframework_agentic.evaluation import run_judge, JudgeClient, build_embedder
+
+client = JudgeClient(
+    chat_fn=my_chat_fn,        # callable(system: str, user: str) -> dict
+    embed_fn=build_embedder("ollama:bge-m3"),
+)
+
+advisory = run_judge(
+    result=result,
+    registry=registry,
+    client=client,
+    runs=3,
+    missed_ids=[],   # IDs the deterministic G2 missed — judge tries to recover them
+)
+print(advisory.scores)   # dict of metric -> float
+print(advisory.errors)   # any metrics that failed (best-effort, never raises)
+```
+
+---
+
+## Retrieval Metrics
+
+The `compute_retrieval_metrics()` function computes standard IR metrics over ranked
+retrieval results. It is imported from `fireflyframework_agentic.lab.retrieval_metrics`
+and re-exported by the evaluation package.
+
+Supported metrics (Hit/Recall/Precision at cut-offs k ∈ {1, 5, 10}; MRR, MAP, and nDCG at k = 10):
+
+- **Hit@k** — at least one gold document in top-k.
+- **Recall@k** — fraction of gold documents in top-k.
+- **Precision@k** — fraction of top-k results that are gold.
+- **MRR@10** — mean reciprocal rank of the first gold hit.
+- **MAP@10** — mean average precision.
+- **nDCG@10** — normalised discounted cumulative gain.
+
+```python
+from fireflyframework_agentic.evaluation import compute_retrieval_metrics, RetrieverMetrics
+
+# Each row is a query; each row's "retrieved" list is ranked (rank=1 is top).
+rows = [
+    {
+        "query": "KYC enhanced due diligence steps",
+        "gold": ["SOP-002-kyc-edd.md", "INT-002-KYC-Jaime.md"],
+        "retrieved": [
+            {"rank": 1, "source_id": "SOP-002-kyc-edd.md", "is_gold": True},
+            {"rank": 2, "source_id": "SOP-001-account-opening.md", "is_gold": False},
+            {"rank": 3, "source_id": "INT-002-KYC-Jaime.md", "is_gold": True},
+        ],
+    },
+]
+
+metrics: RetrieverMetrics = compute_retrieval_metrics(rows)
+print(f"Recall@5:  {metrics.recall_5:.3f}")
+print(f"nDCG@10:   {metrics.ndcg_10:.3f}")
+print(f"MRR@10:    {metrics.mrr_10:.3f}")
+```
+
+`RetrieverMetrics` also carries optional fields when the raw rows include them:
+`no_answer_rate`, `citation_precision`, `mean_search_ms`, `mean_answer_ms`.
+
+---
+
+## Architecture
+
+```mermaid
+flowchart TD
+    R["result JSON\n(DiscoveryResult / output.json)"]
+    REG["Registry JSON\n(lean-1 must-find)"]
+    CORP["Corpus bundle\n(raw evidence documents)"]
+    BASE["Baseline JSON\n(champion record)"]
+
+    R --> G1["G1 · Structural & Safe\n(schema, PII, empty-registry)"]
+    REG --> G1
+    R --> G2["G2 · Recall & NC Precision\n(lexical + optional semantic)"]
+    REG --> G2
+    R --> G3["G3 · Grounding\n(excerpt anchoring, fabrication)"]
+    CORP --> G3
+    R --> G4["G4 · LLM Judge advisory\n(faithfulness, entailment, gaps)"]
+    REG --> G4
+    G1 --> SC["Markdown Scorecard\nrender_scorecard()"]
+    G2 --> SC
+    G3 --> SC
+    G4 -.advisory.-> SC
+    BASE --> G5["G5 · No-regression\n(A/A band, sign-offs)"]
+    G1 --> G5
+    G2 --> G5
+    G3 --> G5
+    G5 --> SC
+    SC --> V["verdict()\nPROMOTE / HOLD"]
+    V --> CHAMP["save_champion()\nor invalidate_champion()"]
+```
+
+---
+
+## Reference
+
+### Exports
+
+All symbols below are importable from `fireflyframework_agentic.evaluation`.
+
+| Symbol | Kind | Description |
+|--------|------|-------------|
+| `EvalConfig` | Pydantic model | Parameters for a single evaluation run. |
+| `GateResult` | Dataclass | Result of one gate: `gate`, `passed`, `reason_code`, `details`. |
+| `Verdict` | Constants class | `Verdict.PROMOTE`, `Verdict.HOLD`. |
+| `VERDICT_PROMOTE` | `str` | `"PROMOTE"`. |
+| `VERDICT_HOLD` | `str` | `"HOLD"`. |
+| `run_gates()` | Function | Run all four deterministic gates (G1–G3, G5 shape) and return results. |
+| `g2_recall_precision()` | Function | Run only G2 (recall + NC precision) and return `GateResult`. |
+| `verdict()` | Function | Derive PROMOTE/HOLD from a list of `GateResult`. |
+| `render_scorecard()` | Function | Render a Markdown scorecard from gate results and metadata. |
+| `ChampionRecord` | Dataclass | Per-corpus champion metadata and scores. |
+| `load_champion()` | Function | Load the current champion from `baseline.json`; returns `None` on Day Zero. |
+| `save_champion()` | Function | Persist a new champion to `baseline.json`. |
+| `invalidate_champion()` | Function | Mark the champion invalid with a reason string. |
+| `AdvisoryReport` | Dataclass | G4 judge output: `scores`, `errors`, `raw`. |
+| `run_judge()` | Function | Run the LLM-as-a-Judge advisory pass. |
+| `JudgeClient` | Dataclass | Holds `chat_fn` and `embed_fn` for the judge. |
+| `OllamaEmbedder` | Class | Local Ollama embedding callable (default BGE-M3). |
+| `build_embedder()` | Function | Factory: `"ollama:bge-m3"` → `OllamaEmbedder`. |
+| `cosine()` | Function | Cosine similarity between two numpy vectors. |
+| `Registry` | Dataclass | Parsed must-find registry with real items and NC items. |
+| `RegistryItem` | Dataclass | One must-find or NC item: `id`, `tier`, `scope`, `description`, and other fields. |
+| `load_registry()` | Function | Parse and validate a lean-1 registry JSON file. |
+| `registry_sha256()` | Function | SHA-256 digest of the registry file at the given path. |
+| `load_corpus()` | Function | Load and index a corpus bundle for G3 evidence verification. |
+| `corpus_sha256()` | Function | SHA-256 of a corpus directory or bundle. |
+| `verify_evidence_index()` | Function | Check each `evidence_index` entry against the corpus. |
+| `EMPTY` / `FABRICATED` / `SOURCE_UNKNOWN` / `VERIFIED` | `str` | Evidence verification status constants. |
+| `RetrieverMetrics` | Pydantic model | IR metrics: `recall_k`, `precision_k`, `ndcg_10`, `mrr_10`, `map_10`. |
+| `compute_retrieval_metrics()` | Function | Compute IR metrics from a list of ranked-retrieval result rows. |
+| `anchored()` | Function | True if claim and evidence share at least one non-trivial token. |
+| `matches()` | Function | Gate predicate: does a candidate match a registry item? |
+| `source_stem()` | Function | Normalise a `locator` path to its file stem for dedup. |
+| `tokens()` | Function | Tokenise text to a list of lowercase word strings. |
+| `aa_band()` | Function | Compute per-metric A/A noise floor from repeated runs. |
+| `aggregate_grounding()` | Function | Summarise grounding stats across a result's findings. |
+| `left_skew_flag()` | Function | True when the score distribution is left-skewed (over-optimistic). |

From a1d28a597ad87559dad0e26a2f266cf516553d21 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 09:24:24 +0200
Subject: [PATCH 12/48] remove examples/flyradar_eval_example.py

---
 examples/flyradar_eval_example.py | 406 ------------------------------
 1 file changed, 406 deletions(-)
 delete mode 100644 examples/flyradar_eval_example.py

diff --git a/examples/flyradar_eval_example.py b/examples/flyradar_eval_example.py
deleted file mode 100644
index 706528f4..00000000
--- a/examples/flyradar_eval_example.py
+++ /dev/null
@@ -1,406 +0,0 @@
-# Copyright 2026 Firefly Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""FlyRadar evaluation example — gate-based process-mining quality gate.
-
-Demonstrates how to use ``fireflyframework_agentic.evaluation`` to replicate
-the flyradar experiment quality-gate workflow:
-
-1. Load a must-find registry (the gold standard items the model must discover).
-2. Load a DiscoveryResult produced by a flyradar pipeline run.
-3. Run gates G1-G5 to produce a structured verdict:
-     G1 -- Structural & Safe (schema validity, PII, empty-registry guard).
-     G2 -- Recall & Precision (must-find recall floor, NC precision).
-     G3 -- Grounded (finding-to-evidence anchoring).
-     G4 -- LLM-as-a-Judge (advisory only; never blocks promotion).
-     G5 -- No-regression / promotion (champion/challenger comparison).
-4. Render a human-readable scorecard and print the final verdict.
-5. Promote the challenger to champion when the verdict is PROMOTE.
-
-Usage::
-
-    # Minimal: deterministic gates only (no G4 judge, no baseline)
-    python examples/flyradar_eval_example.py \\
-        --result output.json \\
-        --registry registry.json
-
-    # With corpus verification and a champion baseline
-    python examples/flyradar_eval_example.py \\
-        --result output.json \\
-        --registry registry.json \\
-        --baseline baseline.json \\
-        --corpus input.json
-
-    # With the advisory G4 LLM judge (requires API key in environment)
-    FLYEVAL_JUDGE_MODEL=anthropic:claude-sonnet-4-6 \\
-    python examples/flyradar_eval_example.py \\
-        --result output.json \\
-        --registry registry.json \\
-        --judge-model anthropic:claude-sonnet-4-6
-
-Exit codes: 0 = PROMOTE, 1 = HOLD.
-
-Input file formats
-------------------
-``--result`` (output.json)
-    A DiscoveryResult JSON produced by a flyradar pipeline run.  Must contain
-    at minimum ``findings`` (list) and ``evidence_index`` (list).
-
-``--registry`` (registry.json)
-    A lean-1 registry JSON.  Each item has ``id``, ``tier`` (L0-L3), ``title``,
-    ``description``, and ``nc`` (bool, True for negative controls).
-
-``--baseline`` (baseline.json)
-    A ChampionRecord JSON written by a previous PROMOTE run.  When omitted the
-    gate runs in day-zero mode (G5 always passes and a new champion is minted).
-
-``--corpus`` (input.json)
-    The corpus bundle used during the run.  When supplied, G3 verifies that cited
-    evidence excerpts actually appear in the corpus documents.
-"""
-
-from __future__ import annotations
-
-import argparse
-import json
-import sys
-from pathlib import Path
-
-from fireflyframework_agentic.evaluation import (
-    ChampionRecord,
-    GateResult,
-    build_embedder,
-    load_champion,
-    load_corpus,
-    load_registry,
-    render_scorecard,
-    run_gates,
-    run_judge,
-    save_champion,
-    verdict,
-    VERDICT_PROMOTE,
-)
-from fireflyframework_agentic.evaluation.models import EvalConfig
-
-
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-
-def _load_json(path: str) -> dict:
-    """Read a JSON file and return its contents as a dict."""
-    return json.loads(Path(path).read_text(encoding="utf-8"))
-
-
-def _lexical_missed_ids(result: dict, registry) -> list[str]:
-    """Return the IDs of registry items not matched by any finding (lexically).
-
-    The G4 judge uses these to focus its coverage checks on items that
-    lexical recall missed — the places where semantic recovery matters most.
-    """
-    from fireflyframework_agentic.evaluation.matcher import matches
-
-    evidence_index = {ev["id"]: ev for ev in result.get("evidence_index", []) if ev.get("id")}
-    findings = result.get("findings", [])
-    # L3 items are informational-only and are never scored.
-    scored_items = [item for item in registry.real_items if item.tier != "L3"]
-    return [
-        item.id
-        for item in scored_items
-        if not any(matches(f, item, evidence_index) for f in findings)
-    ]
-
-
-# ---------------------------------------------------------------------------
-# Main evaluation flow
-# ---------------------------------------------------------------------------
-
-
-def run_evaluation(args: argparse.Namespace) -> int:
-    """Run the full flyradar gate evaluation and return an exit code."""
-
-    # ------------------------------------------------------------------
-    # Step 1 — Load inputs.
-    # ------------------------------------------------------------------
-    print(f"Loading result   : {args.result}")
-    result = _load_json(args.result)
-
-    print(f"Loading registry : {args.registry}")
-    registry = load_registry(args.registry)
-    print(f"  {len(registry.real_items)} real items, {len(registry.nc_items)} NC items")
-
-    # The EvalConfig captures provenance for the run record.
-    config = EvalConfig(
-        model_id=args.model_id,
-        corpus=registry.corpus,
-        run_id=args.run_id,
-        registry_path=args.registry,
-        corpus_path=args.corpus or "",
-        baseline_path=args.baseline or "",
-        judge_model=args.judge_model or "",
-    )
-
-    # Optional: corpus bundle for deterministic evidence verification (G3).
-    corpus = None
-    if args.corpus:
-        print(f"Loading corpus   : {args.corpus}")
-        corpus = load_corpus(args.corpus)
-
-    # Optional: champion record for regression detection (G5).
-    champion = None
-    champion_scores = None
-    aa_noise = None
-    if args.baseline:
-        print(f"Loading baseline : {args.baseline}")
-        champion = load_champion(args.baseline)
-        if champion:
-            champion_scores = champion.scores
-            aa_noise = champion.aa_noise
-            print(f"  Champion run   : {champion.run_id} ({champion.model_id})")
-        else:
-            print("  No champion found — running in day-zero mode.")
-
-    # Optional: embedder for semantic/hybrid recall (G2).
-    embed_fn = None
-    if args.embedder:
-        print(f"Building embedder: {args.embedder}")
-        embed_fn = build_embedder(args.embedder)
-
-    print()
-
-    # ------------------------------------------------------------------
-    # Step 2 — Run deterministic gates G1-G3 + G5.
-    #
-    # run_gates() returns a list of GateResult objects, one per gate.
-    # Each GateResult carries:
-    #   .gate   -- "G1" | "G2" | "G3" | "G5"
-    #   .passed -- bool
-    #   .details -- dict with per-metric values
-    #   .errors  -- list[str] of blocking error codes
-    # ------------------------------------------------------------------
-    print("Running gates G1-G3 + G5 ...")
-    gate_results: list[GateResult] = run_gates(
-        result,
-        registry,
-        args.registry,
-        pii_list=args.pii_list or [],
-        recall_floor=args.recall_floor,
-        grounding_floor=args.grounding_floor,
-        champion_scores=champion_scores,
-        aa_noise=aa_noise,
-        is_day_zero=(champion is None),
-        human_signed_off=args.human_signed_off,
-        signoff_count=args.signoffs,
-        embed_fn=embed_fn,
-        tau=args.tau,
-        recall_metric=args.recall_metric,
-        tau_nc=args.tau_nc,
-        corpus=corpus,
-    )
-
-    # Quick gate summary before the full scorecard.
-    for gr in gate_results:
-        status = "PASS" if gr.passed else "FAIL"
-        print(f"  {gr.gate}: {status}")
-
-    # ------------------------------------------------------------------
-    # Step 3 — Run the advisory G4 LLM-as-a-Judge (optional).
-    #
-    # G4 is non-blocking: it never changes the verdict or exit code.
-    # It produces an AdvisoryReport with per-finding quality signals
-    # (faithfulness, citation relevance, fabricated entities, etc.).
-    # ------------------------------------------------------------------
-    advisory = None
-    if args.judge_model:
-        print(f"\nRunning G4 judge ({args.judge_model}) ...")
-        missed_ids = _lexical_missed_ids(result, registry)
-        advisory = run_judge(
-            result,
-            registry,
-            judge_model=args.judge_model,
-            runs=args.judge_runs,
-            concurrency=args.judge_concurrency,
-            pipeline_model=args.model_id,
-            embed_fn=embed_fn,
-            tau=args.tau,
-            lexical_missed_ids=missed_ids,
-        )
-        print(f"  Judge completed ({args.judge_runs} run(s)).")
-    else:
-        print("\nG4 judge skipped (pass --judge-model to enable).")
-
-    # ------------------------------------------------------------------
-    # Step 4 — Render the scorecard.
-    #
-    # render_scorecard() produces a markdown-formatted human-readable
-    # report that mirrors the output of `flyeval gate` in the playground.
-    # ------------------------------------------------------------------
-    print()
-    scorecard = render_scorecard(
-        gate_results,
-        corpus=registry.corpus,
-        model_id=config.model_id,
-        run_id=config.run_id,
-        is_self_graded=True,
-        kappa_advisory=registry.is_kappa_advisory(),
-        evidence_unverified=(corpus is None),
-        advisory=advisory,
-    )
-    print(scorecard)
-
-    # ------------------------------------------------------------------
-    # Step 5 — Inspect the verdict and handle promotion.
-    #
-    # verdict() returns "PROMOTE" or "HOLD" based on the gate results.
-    # On PROMOTE, save the challenger as the new champion so future runs
-    # can detect regressions against this baseline.
-    # ------------------------------------------------------------------
-    v = verdict(gate_results)
-    print(f"\nFinal verdict: {v}")
-
-    if v == VERDICT_PROMOTE and args.baseline:
-        # Extract the key scores from G2 and G3 to store in the champion record.
-        g2 = next((g for g in gate_results if g.gate == "G2"), None)
-        g3 = next((g for g in gate_results if g.gate == "G3"), None)
-        scores: dict[str, float] = {}
-        if g2:
-            scores["recall"] = g2.details.get("recall", 0.0)
-        if g3:
-            scores["grounding_pct"] = g3.details.get("grounding_pct", 0.0)
-
-        new_champion = ChampionRecord(
-            corpus=registry.corpus,
-            run_id=config.run_id,
-            model_id=config.model_id,
-            registry_sha256=registry.sha256(),
-            scores=scores,
-            is_day_zero=(champion is None),
-        )
-        save_champion(
-            args.baseline,
-            new_champion,
-            summary=f"Promoted by flyradar_eval_example.py — {config.run_id}",
-        )
-        print(f"Champion saved to {args.baseline}")
-
-    # Exit 0 = PROMOTE, 1 = HOLD (mirrors `flyeval gate` convention).
-    return 0 if v == VERDICT_PROMOTE else 1
-
-
-# ---------------------------------------------------------------------------
-# CLI
-# ---------------------------------------------------------------------------
-
-
-def build_parser() -> argparse.ArgumentParser:
-    p = argparse.ArgumentParser(
-        prog="flyradar_eval_example",
-        description="FlyRadar gate evaluation — replicates the flyeval gate workflow.",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-
-    # Required inputs.
-    p.add_argument("--result", required=True, help="Path to DiscoveryResult JSON.")
-    p.add_argument("--registry", required=True, help="Path to lean-1 registry JSON.")
-
-    # Optional inputs.
-    p.add_argument(
-        "--baseline",
-        help="Path to baseline.json (champion store).  When absent, runs in day-zero mode.",
-    )
-    p.add_argument(
-        "--corpus",
-        help="Path to input.json corpus bundle for deterministic evidence verification (G3).",
-    )
-
-    # Run metadata.
-    p.add_argument("--model-id", default="unknown", help="Model identifier for the scorecard.")
-    p.add_argument("--run-id", default="example-run", help="Run identifier for the scorecard.")
-
-    # Gate thresholds.
-    p.add_argument(
-        "--recall-floor",
-        type=float,
-        default=0.70,
-        help="Minimum recall required for G2 to pass.",
-    )
-    p.add_argument(
-        "--grounding-floor",
-        type=float,
-        default=0.90,
-        help="Minimum grounding percentage required for G3 to pass.",
-    )
-    p.add_argument(
-        "--recall-metric",
-        choices=["lexical", "semantic", "hybrid"],
-        default="lexical",
-        help="Recall metric used by G2.  'semantic' and 'hybrid' require --embedder.",
-    )
-    p.add_argument(
-        "--tau",
-        type=float,
-        default=0.70,
-        help="Cosine similarity threshold for semantic recall (real items).",
-    )
-    p.add_argument(
-        "--tau-nc",
-        type=float,
-        default=0.85,
-        help="Cosine similarity threshold for NC item detection.",
-    )
-    p.add_argument("--pii-list", nargs="*", default=[], help="PII tokens to check for in findings.")
-    p.add_argument("--human-signed-off", action="store_true", help="Mark this run as human-reviewed.")
-    p.add_argument("--signoffs", type=int, default=0, help="Number of human sign-offs collected.")
-
-    # G4 judge options.
-    p.add_argument(
-        "--judge-model",
-        default=None,
-        help=(
-            "Provider:model string for the advisory G4 LLM judge "
-            "(e.g. 'anthropic:claude-sonnet-4-6').  Omit to skip G4."
-        ),
-    )
-    p.add_argument(
-        "--judge-runs",
-        type=int,
-        default=1,
-        help="Number of judge calls to aggregate (odd number recommended for median).",
-    )
-    p.add_argument(
-        "--judge-concurrency",
-        type=int,
-        default=1,
-        help="Thread fan-out for per-item G4 metrics (1 = sequential).",
-    )
-
-    # Embedder for semantic recall.
-    p.add_argument(
-        "--embedder",
-        default=None,
-        help="Embedder spec for semantic/hybrid recall (e.g. 'ollama:bge-m3').",
-    )
-
-    return p
-
-
-def main() -> None:
-    parser = build_parser()
-    args = parser.parse_args()
-    sys.exit(run_evaluation(args))
-
-
-if __name__ == "__main__":
-    main()

From 61617186f1ed103c783197784497dd841a260b43 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 09:24:27 +0200
Subject: [PATCH 13/48] ci: add --extra evaluation to typecheck and test sync
 steps

---
 .github/workflows/pr-gate.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pr-gate.yml b/.github/workflows/pr-gate.yml
index c0ef76d4..86e35717 100644
--- a/.github/workflows/pr-gate.yml
+++ b/.github/workflows/pr-gate.yml
@@ -57,7 +57,7 @@ jobs:
       - uses: actions/setup-python@v6
         with:
           python-version: '3.13'
-      - run: uv sync --extra dev --extra binary --extra vectorstores-sqlite-vec --extra openai-embeddings
+      - run: uv sync --extra dev --extra binary --extra vectorstores-sqlite-vec --extra openai-embeddings --extra evaluation
       - run: uv run pyright
 
   test:
@@ -72,7 +72,7 @@ jobs:
       - uses: actions/setup-python@v6
         with:
           python-version: '3.13'
-      - run: uv sync --extra dev --extra binary --extra vectorstores-sqlite-vec --extra vectorstores-pgvector --extra openai-embeddings
+      - run: uv sync --extra dev --extra binary --extra vectorstores-sqlite-vec --extra vectorstores-pgvector --extra openai-embeddings --extra evaluation
       - run: uv run pytest -m "not nightly" --cov --cov-report=term-missing
 
   build:

From 203134ca971377816c462b7d4c5125d9ebc9d4e0 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 09:24:32 +0200
Subject: [PATCH 14/48] fix(evaluation): resolve all ruff lint errors (import
 sort, SIM108, B905, N806, UP035)

---
 examples/flycanon_eval_example.py             | 13 +--
 .../evaluation/__init__.py                    | 21 ++++-
 fireflyframework_agentic/evaluation/cli.py    | 42 +++++-----
 fireflyframework_agentic/evaluation/corpus.py | 20 ++---
 fireflyframework_agentic/evaluation/gates.py  | 42 +++-------
 fireflyframework_agentic/evaluation/judge.py  | 79 ++++++++-----------
 .../evaluation/judge_client.py                | 25 ++----
 .../evaluation/matcher.py                     | 60 +++++++-------
 .../evaluation/registry.py                    | 40 +++++-----
 .../evaluation/run_config_snapshot.py         |  9 +--
 .../evaluation/scorecard.py                   | 44 +++--------
 fireflyframework_agentic/evaluation/stats.py  |  9 +--
 uv.lock                                       | 59 +++++++++++++-
 13 files changed, 220 insertions(+), 243 deletions(-)

diff --git a/examples/flycanon_eval_example.py b/examples/flycanon_eval_example.py
index 9d8d071b..856b520b 100644
--- a/examples/flycanon_eval_example.py
+++ b/examples/flycanon_eval_example.py
@@ -94,8 +94,7 @@
 import sys
 from pathlib import Path
 
-from fireflyframework_agentic.evaluation import RetrieverMetrics, compute_retrieval_metrics
-
+from fireflyframework_agentic.evaluation import RetrieverMetrics
 
 # ---------------------------------------------------------------------------
 # Helpers
@@ -171,10 +170,7 @@ def _print_metrics_table(metrics: RetrieverMetrics, baseline: dict | None) -> No
         if value is None:
             continue
         # Format floats as 4 decimal places; ints as plain integers.
-        if isinstance(value, float):
-            cur_str = f"{value:.4f}"
-        else:
-            cur_str = str(value)
+        cur_str = f"{value:.4f}" if isinstance(value, float) else str(value)
 
         row = f"{key:<{col_w}} {cur_str:>{num_w}}"
         if baseline and key in baseline and isinstance(value, float):
@@ -353,10 +349,7 @@ def build_parser() -> argparse.ArgumentParser:
     p.add_argument(
         "--baseline",
         default=None,
-        help=(
-            "Path to baseline.json (champion store).  When absent, scores are printed "
-            "without comparison."
-        ),
+        help=("Path to baseline.json (champion store).  When absent, scores are printed without comparison."),
     )
     p.add_argument(
         "--promote-if-better",
diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py
index ad01980c..d986d09f 100644
--- a/fireflyframework_agentic/evaluation/__init__.py
+++ b/fireflyframework_agentic/evaluation/__init__.py
@@ -31,16 +31,29 @@
 
 from importlib.metadata import PackageNotFoundError, version
 
-from fireflyframework_agentic.evaluation.corpus import EMPTY, FABRICATED, SOURCE_UNKNOWN, VERIFIED, corpus_sha256, load_corpus, verify_evidence_index
+from fireflyframework_agentic.evaluation.champion import (
+    ChampionRecord,
+    invalidate_champion,
+    load_champion,
+    save_champion,
+)
+from fireflyframework_agentic.evaluation.corpus import (
+    EMPTY,
+    FABRICATED,
+    SOURCE_UNKNOWN,
+    VERIFIED,
+    corpus_sha256,
+    load_corpus,
+    verify_evidence_index,
+)
 from fireflyframework_agentic.evaluation.gates import GateResult, Verdict, g2_recall_precision, run_gates
-from fireflyframework_agentic.evaluation.scorecard import render_scorecard, verdict, VERDICT_PROMOTE, VERDICT_HOLD
-from fireflyframework_agentic.evaluation.champion import ChampionRecord, invalidate_champion, load_champion, save_champion
 from fireflyframework_agentic.evaluation.judge import AdvisoryReport, run_judge
 from fireflyframework_agentic.evaluation.judge_client import JudgeClient, OllamaEmbedder, build_embedder, cosine
 from fireflyframework_agentic.evaluation.matcher import anchored, matches, source_stem, tokens
 from fireflyframework_agentic.evaluation.registry import Registry, RegistryItem, load_registry, registry_sha256
-from fireflyframework_agentic.lab.retrieval_metrics import RetrieverMetrics, compute_retrieval_metrics
+from fireflyframework_agentic.evaluation.scorecard import VERDICT_HOLD, VERDICT_PROMOTE, render_scorecard, verdict
 from fireflyframework_agentic.evaluation.stats import aa_band, aggregate_grounding, left_skew_flag
+from fireflyframework_agentic.lab.retrieval_metrics import RetrieverMetrics, compute_retrieval_metrics
 
 try:
     __version__ = version("fireflyframework-agentic")
diff --git a/fireflyframework_agentic/evaluation/cli.py b/fireflyframework_agentic/evaluation/cli.py
index 7ac868d9..80dc418a 100644
--- a/fireflyframework_agentic/evaluation/cli.py
+++ b/fireflyframework_agentic/evaluation/cli.py
@@ -48,7 +48,8 @@
 from fireflyframework_agentic.evaluation.judge_client import build_embedder
 from fireflyframework_agentic.evaluation.matcher import matches
 from fireflyframework_agentic.evaluation.registry import load_registry
-from fireflyframework_agentic.evaluation.scorecard import render_scorecard, verdict as get_verdict
+from fireflyframework_agentic.evaluation.scorecard import render_scorecard
+from fireflyframework_agentic.evaluation.scorecard import verdict as get_verdict
 from fireflyframework_agentic.evaluation.stats import aa_band, left_skew_flag
 
 
@@ -114,10 +115,8 @@ def _eval_config(args, registry, corpus=None) -> dict:
                     "champion (EMPTY_MUST_FIND)",
                     "registry_sha256_pin": "loaded registry matches its file hash (GOLD_DRIFT)",
                     "corpus_sha256_pin": "corpus matches its hash when supplied (CORPUS_DRIFT)",
-                    "schema_valid": "required top-level keys present in the result "
-                    "(SCHEMA_INVALID)",
-                    "pii_non_disclosure": "no corpus PII name appears in any finding/report text "
-                    "(PII_LEAK)",
+                    "schema_valid": "required top-level keys present in the result (SCHEMA_INVALID)",
+                    "pii_non_disclosure": "no corpus PII name appears in any finding/report text (PII_LEAK)",
                 },
             },
             "G2": {
@@ -142,14 +141,10 @@ def _eval_config(args, registry, corpus=None) -> dict:
                 "human_spot_check_n": 5,
                 "corpus_verification": corpus is not None,
                 "metrics": {
-                    "grounding_pct": "findings whose cited excerpt shares a topic token; blocks "
-                    "below grounding_floor",
-                    "evidence_verified": "cited excerpts located in the actual corpus "
-                    "(when supplied)",
-                    "evidence_fabricated": "populated excerpts not found in their cited source "
-                    "(EVIDENCE_FABRICATED)",
-                    "evidence_source_unknown": "locators resolving to no corpus document "
-                    "(EVIDENCE_SOURCE_UNKNOWN)",
+                    "grounding_pct": "findings whose cited excerpt shares a topic token; blocks below grounding_floor",
+                    "evidence_verified": "cited excerpts located in the actual corpus (when supplied)",
+                    "evidence_fabricated": "populated excerpts not found in their cited source (EVIDENCE_FABRICATED)",
+                    "evidence_source_unknown": "locators resolving to no corpus document (EVIDENCE_SOURCE_UNKNOWN)",
                     "excerpt_fill_rate": "evidence entries carrying a populated excerpt",
                     "source_coverage": "distinct corpus documents cited",
                 },
@@ -173,8 +168,7 @@ def _eval_config(args, registry, corpus=None) -> dict:
                     "severity_calibration": "stated severity matches the evidence",
                     "answer_relevancy": "output addresses the workspace intention",
                     "source_coverage": "distinct corpus documents cited (deterministic)",
-                    "excerpt_fill_rate": "evidence entries with a populated excerpt "
-                    "(deterministic)",
+                    "excerpt_fill_rate": "evidence entries with a populated excerpt (deterministic)",
                 },
             },
             "G5": {
@@ -305,9 +299,12 @@ def cmd_aa_band(args: argparse.Namespace) -> int:
     for rp in args.results:
         result = _load_json(rp)
         g2 = g2_recall_precision(
-            result, registry,
-            recall_metric=args.recall_metric, embed_fn=embed_fn,
-            tau=args.tau, tau_nc=args.tau_nc,
+            result,
+            registry,
+            recall_metric=args.recall_metric,
+            embed_fn=embed_fn,
+            tau=args.tau,
+            tau_nc=args.tau_nc,
             corpus=corpus,
         )
         if g2.passed or g2.details.get("recall") is not None:
@@ -468,15 +465,13 @@ def _add_common(p: argparse.ArgumentParser) -> None:
         "--tau",
         type=float,
         default=float(os.environ.get("FLYEVAL_TAU", "0.70")),
-        help="cosine similarity threshold for the semantic recall path (real items). "
-        "Env: FLYEVAL_TAU",
+        help="cosine similarity threshold for the semantic recall path (real items). Env: FLYEVAL_TAU",
     )
     p_gate.add_argument(
         "--tau-nc",
         type=float,
         default=float(os.environ.get("FLYEVAL_TAU_NC", "0.85")),
-        help="cosine similarity threshold for NC item detection (higher; no source anchor). "
-        "Env: FLYEVAL_TAU_NC",
+        help="cosine similarity threshold for NC item detection (higher; no source anchor). Env: FLYEVAL_TAU_NC",
     )
     p_gate.add_argument("--human-signed-off", action="store_true")
     p_gate.add_argument("--signoffs", type=int, default=0)
@@ -495,8 +490,7 @@ def _add_common(p: argparse.ArgumentParser) -> None:
         "--judge-runs",
         type=int,
         default=int(os.environ.get("FLYEVAL_JUDGE_RUNS", "1")),
-        help="G4 judge runs; the median of numeric scores is kept (odd recommended). "
-        "Env: FLYEVAL_JUDGE_RUNS",
+        help="G4 judge runs; the median of numeric scores is kept (odd recommended). Env: FLYEVAL_JUDGE_RUNS",
     )
     p_gate.add_argument(
         "--judge-concurrency",
diff --git a/fireflyframework_agentic/evaluation/corpus.py b/fireflyframework_agentic/evaluation/corpus.py
index 32835f2c..34926b41 100644
--- a/fireflyframework_agentic/evaluation/corpus.py
+++ b/fireflyframework_agentic/evaluation/corpus.py
@@ -80,7 +80,7 @@ def normalize(text: str) -> str:
     smart quotes, collapse whitespace, casefold."""
     text = unicodedata.normalize("NFKC", text)
     text = text.replace("**", "").replace("*", "")
-    text = re.sub(r"[\"""''']", "", text)
+    text = re.sub(r"[\"" "''']", "", text)
     return re.sub(r"\s+", " ", text).strip().casefold()
 
 
@@ -129,9 +129,7 @@ def load_corpus(path: str | Path) -> Corpus:
 
 def _fragment_coverage(fragment: str, source: str) -> float:
     """Fraction of fragment covered by matching blocks of >= _MIN_BLOCK_CHARS chars."""
-    blocks = difflib.SequenceMatcher(
-        None, fragment, source, autojunk=False
-    ).get_matching_blocks()
+    blocks = difflib.SequenceMatcher(None, fragment, source, autojunk=False).get_matching_blocks()
     covered = sum(b.size for b in blocks if b.size >= _MIN_BLOCK_CHARS)
     return covered / len(fragment)
 
@@ -158,11 +156,9 @@ def verify_entry(corpus: Corpus, entry: dict) -> str:
     if not excerpt:
         return EMPTY
 
-    fragments = [
-        f.strip()
-        for f in _SPLICE_PATTERN.split(excerpt)
-        if len(f.strip()) >= _MIN_FRAGMENT_CHARS
-    ] or [excerpt]
+    fragments = [f.strip() for f in _SPLICE_PATTERN.split(excerpt) if len(f.strip()) >= _MIN_FRAGMENT_CHARS] or [
+        excerpt
+    ]
 
     for fragment in fragments:
         if fragment in source:
@@ -178,8 +174,4 @@ def verify_evidence_index(corpus: Corpus, result: dict) -> dict[str, str]:
     Returns {evidence_id: status} over all entries — referenced or not — so
     the gates share one verification pass.
     """
-    return {
-        ev["id"]: verify_entry(corpus, ev)
-        for ev in result.get("evidence_index", [])
-        if ev.get("id")
-    }
+    return {ev["id"]: verify_entry(corpus, ev) for ev in result.get("evidence_index", []) if ev.get("id")}
diff --git a/fireflyframework_agentic/evaluation/gates.py b/fireflyframework_agentic/evaluation/gates.py
index 057bfea7..fc98d311 100644
--- a/fireflyframework_agentic/evaluation/gates.py
+++ b/fireflyframework_agentic/evaluation/gates.py
@@ -93,11 +93,7 @@ def _build_evidence_index(result: dict, corpus: Corpus | None = None) -> dict[st
     if corpus is None:
         return index
     statuses = verify_evidence_index(corpus, result)
-    return {
-        eid: ev
-        for eid, ev in index.items()
-        if statuses[eid] in (VERIFIED, EMPTY)
-    }
+    return {eid: ev for eid, ev in index.items() if statuses[eid] in (VERIFIED, EMPTY)}
 
 
 # ── G1: Structural & Safe ────────────────────────────────────────────────────
@@ -322,8 +318,10 @@ def _finding_redundancy_rate(findings: list[dict]) -> float:
     """Fraction of findings that are near-duplicates of another (Jaccard ≥0.6 on ≥5-char tokens)."""
     if len(findings) < 2:
         return 0.0
+
     def _tok(text: str) -> frozenset[str]:
         return frozenset(t.lower() for t in text.split() if len(t) >= 5)
+
     token_sets = [_tok(f.get("description", "")) for f in findings]
     in_redundant: set[int] = set()
     for i in range(len(token_sets)):
@@ -381,9 +379,7 @@ def g2_recall_precision(
         if item.tier == "NC":
             lexical[item.id] = False
         elif item.scope == "dependency_graph" and item.from_node:
-            lexical[item.id] = matcher.matches_dependency_graph_relation(
-                item, result, evidence_index
-            )
+            lexical[item.id] = matcher.matches_dependency_graph_relation(item, result, evidence_index)
         else:
             lexical[item.id] = any(
                 matches(c, item, evidence_index, scope=scope)
@@ -394,14 +390,10 @@ def g2_recall_precision(
     if recall_metric not in ("lexical", "semantic", "hybrid"):
         raise ValueError(f"unknown recall_metric {recall_metric!r}")
     if recall_metric in ("semantic", "hybrid") and embed_fn is None:
-        raise ValueError(
-            f"recall_metric={recall_metric!r} requires an embedder; pass embed_fn"
-        )
+        raise ValueError(f"recall_metric={recall_metric!r} requires an embedder; pass embed_fn")
 
     if embed_fn is not None:
-        semantic = matcher.semantic_hits(
-            candidates, registry.items, evidence_index, embed_fn, tau, tau_nc=tau_nc
-        )
+        semantic = matcher.semantic_hits(candidates, registry.items, evidence_index, embed_fn, tau, tau_nc=tau_nc)
         # dependency_graph relation items have no embedding candidates (§5.3b uses
         # the endpoint matcher, not per-candidate text embeddings); mirror the
         # lexical result so semantic/hybrid never under-credits them.
@@ -424,8 +416,7 @@ def g2_recall_precision(
     finding_count = len(findings)
     finding_scoped_items = [i for i in registry.real_items if i.scope == "finding"]
     findings_matched = sum(
-        1 for f in findings
-        if any(matches(f, item, evidence_index, scope="finding") for item in finding_scoped_items)
+        1 for f in findings if any(matches(f, item, evidence_index, scope="finding") for item in finding_scoped_items)
     )
     _sn = {
         "finding_count": finding_count,
@@ -493,9 +484,7 @@ def _semantic_details() -> dict:
             "lexical_recall": round(_weighted_recall(scored_items, lexical), 4),
             "semantic_recall": round(_weighted_recall(scored_items, semantic), 4),
             "hybrid_recall": round(
-                _weighted_recall(
-                    scored_items, {iid: lexical[iid] or semantic[iid] for iid in lexical}
-                ),
+                _weighted_recall(scored_items, {iid: lexical[iid] or semantic[iid] for iid in lexical}),
                 4,
             ),
             "tau": tau,
@@ -577,8 +566,8 @@ def g3_grounded(
 
     grounded_ids: list[str] = []
     # Ungrounded split (§6.3): distinguish format issues from real faithfulness failures.
-    ungrounded_empty_only: list[str] = []    # every ref had an empty excerpt
-    ungrounded_populated: list[str] = []     # had populated excerpt(s) but none anchored
+    ungrounded_empty_only: list[str] = []  # every ref had an empty excerpt
+    ungrounded_populated: list[str] = []  # had populated excerpt(s) but none anchored
 
     # Excerpt fill: count all resolved refs and how many carry a non-empty excerpt.
     total_refs = 0
@@ -657,18 +646,14 @@ def g3_grounded(
             "Populated excerpt(s) not found in the cited corpus document — "
             "the run asserts evidence the source does not contain."
         )
-        return GateResult(
-            gate="G3", passed=False, reason_code="EVIDENCE_FABRICATED", details=details
-        )
+        return GateResult(gate="G3", passed=False, reason_code="EVIDENCE_FABRICATED", details=details)
 
     if unknown_source_ids:
         details["message"] = (
             "Evidence locator(s) resolve to no corpus document — either the "
             "corpus bundle is incomplete or the run invented a source."
         )
-        return GateResult(
-            gate="G3", passed=False, reason_code="EVIDENCE_SOURCE_UNKNOWN", details=details
-        )
+        return GateResult(gate="G3", passed=False, reason_code="EVIDENCE_SOURCE_UNKNOWN", details=details)
 
     if grounding_pct < grounding_floor:
         details["floor"] = grounding_floor
@@ -746,8 +731,7 @@ def g5_no_regression(
         band = noise.get(metric, 0.0)
         if delta < -band:
             regressions.append(
-                f"{metric}: candidate={cand_val:.4f} champion={champ_val:.4f} "
-                f"delta={delta:+.4f} < -band={-band:.4f}"
+                f"{metric}: candidate={cand_val:.4f} champion={champ_val:.4f} delta={delta:+.4f} < -band={-band:.4f}"
             )
         elif delta > band:
             improvements.append(f"{metric}: delta={delta:+.4f} > band={band:.4f}")
diff --git a/fireflyframework_agentic/evaluation/judge.py b/fireflyframework_agentic/evaluation/judge.py
index a347c8e1..80a90b04 100644
--- a/fireflyframework_agentic/evaluation/judge.py
+++ b/fireflyframework_agentic/evaluation/judge.py
@@ -142,10 +142,7 @@ def _map_chat(chat_fn, prompts, workers=1):
 
     results: list[dict] = [{} for _ in prompts]
     with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
-        futures = {
-            executor.submit(chat_fn, system, user): idx
-            for idx, (system, user) in enumerate(prompts)
-        }
+        futures = {executor.submit(chat_fn, system, user): idx for idx, (system, user) in enumerate(prompts)}
         for future in concurrent.futures.as_completed(futures):
             idx = futures[future]
             try:
@@ -165,11 +162,7 @@ def source_coverage(result: dict) -> dict:
     source stems present in evidence_index but cited by no finding.
     """
     evidence_index = _evidence_index(result)
-    all_stems = {
-        source_stem(ev.get("locator", ""))
-        for ev in result.get("evidence_index", [])
-        if ev.get("locator")
-    }
+    all_stems = {source_stem(ev.get("locator", "")) for ev in result.get("evidence_index", []) if ev.get("locator")}
     cited_stems: set[str] = set()
     for f in result.get("findings", []):
         for ref in f.get("evidence_refs", []):
@@ -245,7 +238,7 @@ def semantic_recovery(
     cand_vecs = np.asarray(embed_fn(candidate_texts), dtype=np.float64)
 
     recovered: list[dict] = []
-    for item, ivec in zip(missed_items, item_vecs):
+    for item, ivec in zip(missed_items, item_vecs, strict=False):
         best = max((cosine(ivec, cvec) for cvec in cand_vecs), default=0.0)
         if best >= tau:
             recovered.append({"id": item.id, "cosine": round(best, 4)})
@@ -307,11 +300,7 @@ def numeric_temporal_fidelity(result: dict, chat_fn, *, workers: int = 1) -> dic
     source}], count}.
     """
     evidence_index = _evidence_index(result)
-    scored = [
-        (f, excerpts)
-        for f in result.get("findings", [])
-        if (excerpts := _cited_excerpts(f, evidence_index))
-    ]
+    scored = [(f, excerpts) for f in result.get("findings", []) if (excerpts := _cited_excerpts(f, evidence_index))]
     prompts = [
         (
             SYSTEM,
@@ -326,7 +315,7 @@ def numeric_temporal_fidelity(result: dict, chat_fn, *, workers: int = 1) -> dic
     ]
     answers = _map_chat(chat_fn, prompts, workers)
     mismatches: list[dict] = []
-    for (f, _excerpts), answer in zip(scored, answers):
+    for (f, _excerpts), answer in zip(scored, answers, strict=False):
         for m in answer.get("mismatches", []) or []:
             mismatches.append(
                 {
@@ -395,7 +384,7 @@ def nc_semantic_precision(result: dict, registry, chat_fn, *, workers: int = 1)
     ]
     answers = _map_chat(chat_fn, prompts, workers)
     asserted_ids = [
-        item.id for item, a in zip(nc_items, answers) if str(a.get("asserted", "")).lower() == "yes"
+        item.id for item, a in zip(nc_items, answers, strict=False) if str(a.get("asserted", "")).lower() == "yes"
     ]
     return {"asserted": len(asserted_ids), "total": len(nc_items), "asserted_ids": asserted_ids}
 
@@ -407,10 +396,7 @@ def fabricated_entity(result: dict, chat_fn) -> dict:
     excerpts + locators.
     """
     output_text = _output_text(result)
-    corpus = "\n".join(
-        f"{ev.get('locator', '')} :: {ev.get('excerpt', '')}"
-        for ev in result.get("evidence_index", [])
-    )
+    corpus = "\n".join(f"{ev.get('locator', '')} :: {ev.get('excerpt', '')}" for ev in result.get("evidence_index", []))
     user = (
         "List any system, organization, or metric NAMED in the OUTPUT that does NOT "
         "appear anywhere in the CORPUS EVIDENCE.\n"
@@ -433,8 +419,7 @@ def contradiction(result: dict, chat_fn) -> dict:
         lines.append(f"{f.get('id', '?')}: {f.get('title', '')} — {f.get('description', '')}")
     user = (
         "Are any two of these FINDINGS mutually contradictory? List each contradicting pair.\n"
-        'Reply with ONLY {"pairs": [["<id_a>", "<id_b>"], ...]}.  Empty list if none.\n\n'
-        + "\n".join(lines)
+        'Reply with ONLY {"pairs": [["<id_a>", "<id_b>"], ...]}.  Empty list if none.\n\n' + "\n".join(lines)
     )
     pairs = chat_fn(SYSTEM, user).get("pairs", []) or []
     return {"count": len(pairs), "pairs": [list(p) for p in pairs]}
@@ -514,7 +499,7 @@ def severity_calibration(result: dict, chat_fn, *, workers: int = 1) -> dict:
     answers = _map_chat(chat_fn, prompts, workers)
     verdicts: dict[str, str] = {}
     miscalibrated = 0
-    for f, a in zip(findings, answers):
+    for f, a in zip(findings, answers, strict=False):
         verdict = str(a.get("calibration", "calibrated")).lower()
         verdicts[f.get("id", "?")] = verdict
         if verdict in ("under", "over"):
@@ -557,7 +542,7 @@ def surface_deduplication(result: dict, chat_fn, *, workers: int = 1) -> dict:
     def _toks(node: dict) -> frozenset[str]:
         return frozenset(node.get("name", "").lower().split())
 
-    PER_SURFACE_CAP = 10
+    per_surface_cap = 10
     # candidates: (surface, node_a, node_b, parent_process_name)
     candidates: list[tuple[str, dict, dict, str]] = []
 
@@ -574,7 +559,7 @@ def _toks(node: dict) -> frozenset[str]:
                 if jac >= 0.30:
                     pairs.append((jac, procs[i], procs[j]))
         pairs.sort(key=lambda x: x[0], reverse=True)
-        for _jac, a, b in pairs[:PER_SURFACE_CAP]:
+        for _jac, a, b in pairs[:per_surface_cap]:
             candidates.append(("process", a, b, ""))
 
     # Activities and decisions: within the same parent process only
@@ -595,7 +580,7 @@ def _toks(node: dict) -> frozenset[str]:
                     if jac >= 0.30:
                         all_pairs.append((jac, nodes[i], nodes[j], proc_name))
         all_pairs.sort(key=lambda x: x[0], reverse=True)
-        for _jac, a, b, proc_name in all_pairs[:PER_SURFACE_CAP]:
+        for _jac, a, b, proc_name in all_pairs[:per_surface_cap]:
             candidates.append((surface_key, a, b, proc_name))
 
     if not candidates:
@@ -604,33 +589,37 @@ def _toks(node: dict) -> frozenset[str]:
     prompts = []
     for surface, a, b, parent_proc in candidates:
         ctx = f"\nPARENT PROCESS: {parent_proc}\n" if parent_proc else ""
-        prompts.append((
-            SYSTEM,
-            f"Are these two {surface} nodes genuinely DISTINCT process concepts, or is one a "
-            f"duplicate / sub-case / restatement of the other?\n"
-            f"{ctx}"
-            'Reply with ONLY {"verdict": "DISTINCT" or "DUPLICATE", "reason": "<one line>"}.\n\n'
-            f"{surface.upper()} A: {a.get('name', '')} — {a.get('description', '')}\n"
-            f"{surface.upper()} B: {b.get('name', '')} — {b.get('description', '')}",
-        ))
+        prompts.append(
+            (
+                SYSTEM,
+                f"Are these two {surface} nodes genuinely DISTINCT process concepts, or is one a "
+                f"duplicate / sub-case / restatement of the other?\n"
+                f"{ctx}"
+                'Reply with ONLY {"verdict": "DISTINCT" or "DUPLICATE", "reason": "<one line>"}.\n\n'
+                f"{surface.upper()} A: {a.get('name', '')} — {a.get('description', '')}\n"
+                f"{surface.upper()} B: {b.get('name', '')} — {b.get('description', '')}",
+            )
+        )
 
     answers = _map_chat(chat_fn, prompts, workers)
 
     distinct = 0
     redundant = 0
     redundant_pairs: list[dict] = []
-    for (surface, a, b, _parent), answer in zip(candidates, answers):
+    for (surface, a, b, _parent), answer in zip(candidates, answers, strict=False):
         verdict = str(answer.get("verdict", "")).upper()
         if verdict == "DISTINCT":
             distinct += 1
         else:
             redundant += 1
-            redundant_pairs.append({
-                "surface": surface,
-                "a": a.get("name", ""),
-                "b": b.get("name", ""),
-                "reason": str(answer.get("reason", "")),
-            })
+            redundant_pairs.append(
+                {
+                    "surface": surface,
+                    "a": a.get("name", ""),
+                    "b": b.get("name", ""),
+                    "reason": str(answer.get("reason", "")),
+                }
+            )
 
     total = distinct + redundant
     return {
@@ -800,9 +789,7 @@ def _run_judge_metric(name: str, fn) -> None:
         "numeric_temporal_fidelity",
         lambda: numeric_temporal_fidelity(result, chat_fn, workers=concurrency),
     )
-    _run_judge_metric(
-        "citation_relevance", lambda: citation_relevance(result, chat_fn, workers=concurrency)
-    )
+    _run_judge_metric("citation_relevance", lambda: citation_relevance(result, chat_fn, workers=concurrency))
     _run_judge_metric(
         "nc_semantic_precision",
         lambda: nc_semantic_precision(result, registry, chat_fn, workers=concurrency),
diff --git a/fireflyframework_agentic/evaluation/judge_client.py b/fireflyframework_agentic/evaluation/judge_client.py
index 1af17f53..e4b58dea 100644
--- a/fireflyframework_agentic/evaluation/judge_client.py
+++ b/fireflyframework_agentic/evaluation/judge_client.py
@@ -245,8 +245,7 @@ def _dispatch(self, system: str, user: str, max_tokens: int) -> str:
         if self.provider == "ollama":
             return self._ollama(system, user, max_tokens)
         raise ValueError(
-            f"unknown judge provider {self.provider!r} in {self.model_spec!r}; "
-            "use anthropic:/openai:/azure:/ollama:"
+            f"unknown judge provider {self.provider!r} in {self.model_spec!r}; use anthropic:/openai:/azure:/ollama:"
         )
 
     def _anthropic(self, system: str, user: str, max_tokens: int) -> str:
@@ -262,9 +261,7 @@ def _anthropic(self, system: str, user: str, max_tokens: int) -> str:
         }
         headers = {"x-api-key": api_key, "anthropic-version": "2023-06-01"}
         resp = _http_post_json("https://api.anthropic.com/v1/messages", headers, body, self.timeout)
-        text = next(
-            (b.get("text") for b in resp.get("content", []) if b.get("type") == "text"), None
-        )
+        text = next((b.get("text") for b in resp.get("content", []) if b.get("type") == "text"), None)
         if not text:
             raise RuntimeError(f"judge returned no text: {resp}")
         return text
@@ -283,9 +280,7 @@ def _openai(self, system: str, user: str, max_tokens: int) -> str:
             ],
         }
         headers = {"Authorization": f"Bearer {api_key}"}
-        resp = _http_post_json(
-            "https://api.openai.com/v1/chat/completions", headers, body, self.timeout
-        )
+        resp = _http_post_json("https://api.openai.com/v1/chat/completions", headers, body, self.timeout)
         return _extract_openai_text(resp)
 
     def _azure(self, system: str, user: str, max_tokens: int) -> str:
@@ -297,10 +292,7 @@ def _azure(self, system: str, user: str, max_tokens: int) -> str:
             raise RuntimeError("AZURE_OPENAI_API_KEY not set")
         api_version = _env("AZURE_OPENAI_API_VERSION") or "2024-06-01"
         # Azure deployment lives in the URL path, not the JSON body.
-        url = (
-            f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/chat/completions"
-            f"?api-version={api_version}"
-        )
+        url = f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/chat/completions?api-version={api_version}"
         body = {
             "max_tokens": max_tokens,
             "temperature": 0.0,
@@ -373,10 +365,7 @@ def embed(self, texts: list[str]) -> np.ndarray:
         if not api_key:
             raise RuntimeError("AZURE_OPENAI_API_KEY not set")
         api_version = _env("AZURE_OPENAI_API_VERSION") or "2024-06-01"
-        url = (
-            f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/embeddings"
-            f"?api-version={api_version}"
-        )
+        url = f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/embeddings?api-version={api_version}"
         headers = {"api-key": api_key}
         vectors = self._embed_with_split(texts, url, headers)
         return np.asarray(vectors, dtype=np.float32)
@@ -438,9 +427,7 @@ def build_embedder(spec: str):
         return OpenAIEmbedder(model or "text-embedding-3-small").embed
     if provider == "azure":
         return AzureOpenAIEmbedder(model or "text-embedding-3-small").embed
-    raise NotImplementedError(
-        f"embedder backend {provider!r} not implemented yet; add it in build_embedder()"
-    )
+    raise NotImplementedError(f"embedder backend {provider!r} not implemented yet; add it in build_embedder()")
 
 
 def cosine(a, b) -> float:
diff --git a/fireflyframework_agentic/evaluation/matcher.py b/fireflyframework_agentic/evaluation/matcher.py
index b4d81f44..ccf61c96 100644
--- a/fireflyframework_agentic/evaluation/matcher.py
+++ b/fireflyframework_agentic/evaluation/matcher.py
@@ -113,9 +113,7 @@ def _keyword_anchored(desc: str, keywords: list[str]) -> bool:
     if not keywords:
         return False
     desc_lower = desc.lower()
-    return any(
-        re.search(r"\b" + re.escape(kw.lower()) + r"\b", desc_lower) for kw in keywords
-    )
+    return any(re.search(r"\b" + re.escape(kw.lower()) + r"\b", desc_lower) for kw in keywords)
 
 
 def candidate_text(candidate: dict, scope: str) -> str:
@@ -141,18 +139,28 @@ def candidate_text(candidate: dict, scope: str) -> str:
         pain = candidate.get("pain_points") or []
         goals_str = " ".join(goals) if isinstance(goals, list) else str(goals)
         pain_str = " ".join(pain) if isinstance(pain, list) else str(pain)
-        return " ".join(filter(None, [
-            candidate.get("name", ""),
-            candidate.get("role", ""),
-            goals_str,
-            pain_str,
-        ]))
+        return " ".join(
+            filter(
+                None,
+                [
+                    candidate.get("name", ""),
+                    candidate.get("role", ""),
+                    goals_str,
+                    pain_str,
+                ],
+            )
+        )
     if scope == "informal_channel":
-        return " ".join(filter(None, [
-            candidate.get("name", ""),
-            candidate.get("usage_context", ""),
-            candidate.get("notes", ""),
-        ]))
+        return " ".join(
+            filter(
+                None,
+                [
+                    candidate.get("name", ""),
+                    candidate.get("usage_context", ""),
+                    candidate.get("notes", ""),
+                ],
+            )
+        )
     # process, decision, system, dependency_graph (diagnostic nodes)
     return " ".join(filter(None, [candidate.get("name", ""), candidate.get("description", "")]))
 
@@ -246,9 +254,7 @@ def matches_dependency_graph_relation(
 
     def _anchor(endpoint_text: str) -> set[str]:
         return {
-            a["id"]
-            for a in all_activities
-            if a.get("id") and anchored(candidate_text(a, "activity"), endpoint_text)
+            a["id"] for a in all_activities if a.get("id") and anchored(candidate_text(a, "activity"), endpoint_text)
         }
 
     from_ids = _anchor(item.from_node)
@@ -268,9 +274,8 @@ def _node_stems(node: dict) -> set[str]:
     dg = result.get("dependency_graph", {})
 
     for edge in dg.get("activity_edges", []):
-        if edge.get("from_node") in from_ids and edge.get("to_node") in to_ids:
-            if _node_stems(edge) & item_stems:
-                return True
+        if edge.get("from_node") in from_ids and edge.get("to_node") in to_ids and _node_stems(edge) & item_stems:
+            return True
 
     for path in dg.get("critical_paths", []):
         if not (_node_stems(path) & item_stems):
@@ -325,19 +330,13 @@ def semantic_hits(
 
     # Flatten all candidates across scopes, preserving their scope tag for
     # text extraction and per-item filtering.
-    scoped: list[tuple[str, dict]] = [
-        (scope, cand)
-        for scope, cands in candidates.items()
-        for cand in cands
-    ]
+    scoped: list[tuple[str, dict]] = [(scope, cand) for scope, cands in candidates.items() for cand in cands]
 
     if not scoped:
         return {item.id: False for item in items}
 
     cand_texts = [candidate_text(cand, scope) for scope, cand in scoped]
-    item_texts = [
-        " ".join([item.description or ""] + list(item.keywords or [])).strip() for item in items
-    ]
+    item_texts = [" ".join([item.description or ""] + list(item.keywords or [])).strip() for item in items]
 
     cand_vecs = np.asarray(embed_fn(cand_texts))
     item_vecs = np.asarray(embed_fn(item_texts))
@@ -359,10 +358,7 @@ def semantic_hits(
                 if cosine(cand_vecs[k], item_vec) >= tau_nc:
                     hit = True
                     break
-            elif (
-                shares_source(cand, item, evidence_index)
-                and cosine(cand_vecs[k], item_vec) >= tau
-            ):
+            elif shares_source(cand, item, evidence_index) and cosine(cand_vecs[k], item_vec) >= tau:
                 hit = True
                 break
         hits[item.id] = hit
diff --git a/fireflyframework_agentic/evaluation/registry.py b/fireflyframework_agentic/evaluation/registry.py
index 2b869ba9..87c4beb1 100644
--- a/fireflyframework_agentic/evaluation/registry.py
+++ b/fireflyframework_agentic/evaluation/registry.py
@@ -24,6 +24,7 @@
 - kappa present (0.0 placeholder allowed; G2 advisory until >= 0.70)
 - ABANCA DILO items must target a single measured sub-population
 """
+
 from __future__ import annotations
 
 import hashlib
@@ -35,8 +36,15 @@
 
 VALID_TIERS = ("L0", "L1", "L2", "L3", "NC")
 VALID_SCOPES = (
-    "process", "activity", "decision", "finding", "action",
-    "persona", "system", "informal_channel", "dependency_graph",
+    "process",
+    "activity",
+    "decision",
+    "finding",
+    "action",
+    "persona",
+    "system",
+    "informal_channel",
+    "dependency_graph",
 )
 SCHEMA_VERSION = "lean-1"
 KAPPA_ADVISORY_THRESHOLD = 0.70
@@ -47,13 +55,13 @@ class RegistryItem:
     id: str
     tier: Literal["L0", "L1", "L2", "L3", "NC"]
     description: str
-    evidence: list[str]          # source file paths (path portion of locator, no #page=N)
-    scope: str = "finding"       # which DiscoveryResult surface to match against (§4.3)
+    evidence: list[str]  # source file paths (path portion of locator, no #page=N)
+    scope: str = "finding"  # which DiscoveryResult surface to match against (§4.3)
     keywords: list[str] = field(default_factory=list)
     weight: float = 1.0
-    from_node: str = ""   # dependency_graph relation items only
-    to_node: str = ""     # dependency_graph relation items only
-    relation: str = ""    # defaults to "precedes" when from/to present
+    from_node: str = ""  # dependency_graph relation items only
+    to_node: str = ""  # dependency_graph relation items only
+    relation: str = ""  # defaults to "precedes" when from/to present
 
 
 @dataclass(frozen=True)
@@ -87,10 +95,7 @@ def sha256(self) -> str:
 
 def _validate(raw: dict, path: Path) -> None:
     if raw.get("schema_version") != SCHEMA_VERSION:
-        raise ValueError(
-            f"{path.name}: schema_version must be '{SCHEMA_VERSION}', "
-            f"got {raw.get('schema_version')!r}"
-        )
+        raise ValueError(f"{path.name}: schema_version must be '{SCHEMA_VERSION}', got {raw.get('schema_version')!r}")
     for fname in ("corpus", "author", "date"):
         if not raw.get(fname):
             raise ValueError(f"{path.name}: missing required field '{fname}'")
@@ -116,20 +121,17 @@ def _validate(raw: dict, path: Path) -> None:
         tier = it.get("tier")
         if tier not in VALID_TIERS:
             raise ValueError(
-                f"{path.name}: item '{it.get('id')}' has invalid tier '{tier}'; "
-                f"must be one of {VALID_TIERS}"
+                f"{path.name}: item '{it.get('id')}' has invalid tier '{tier}'; must be one of {VALID_TIERS}"
             )
         scope = it.get("scope", "finding")
         if scope not in VALID_SCOPES:
             raise ValueError(
-                f"{path.name}: item '{it.get('id')}' has invalid scope '{scope}'; "
-                f"must be one of {VALID_SCOPES}"
+                f"{path.name}: item '{it.get('id')}' has invalid scope '{scope}'; must be one of {VALID_SCOPES}"
             )
         if scope == "dependency_graph":
             if not it.get("from") or not it.get("to"):
                 raise ValueError(
-                    f"{path.name}: dependency_graph item '{it.get('id')}' must have "
-                    "non-empty 'from' and 'to'"
+                    f"{path.name}: dependency_graph item '{it.get('id')}' must have non-empty 'from' and 'to'"
                 )
         else:
             if "from" in it or "to" in it or "relation" in it:
@@ -153,13 +155,13 @@ def _validate(raw: dict, path: Path) -> None:
     # ABANCA DILO blend guard: items must assert a single sub-population target.
     # Checks for phrases that would indicate a blended numeric target is asserted.
     # "blend" alone is too broad (items may reference it negatively).
-    BLEND_PHRASES = ("combined distribution", "across all offices regardless of segment")
+    blend_phrases = ("combined distribution", "across all offices regardless of segment")
     for it in items:
         if it.get("tier") == "NC":
             continue
         desc = it.get("description", "").lower()
         iid = it.get("id", "")
-        if any(phrase in desc for phrase in BLEND_PHRASES):
+        if any(phrase in desc for phrase in blend_phrases):
             raise ValueError(
                 f"{path.name}: item '{iid}' description targets a blended distribution; "
                 "ABANCA DILO items must target a single measured sub-population "
diff --git a/fireflyframework_agentic/evaluation/run_config_snapshot.py b/fireflyframework_agentic/evaluation/run_config_snapshot.py
index db543129..c029e8e6 100644
--- a/fireflyframework_agentic/evaluation/run_config_snapshot.py
+++ b/fireflyframework_agentic/evaluation/run_config_snapshot.py
@@ -32,6 +32,7 @@
         --options    request_options.json \
         --commit     c107918
 """
+
 from __future__ import annotations
 
 import argparse
@@ -133,12 +134,8 @@ def write_snapshot(output_dir: str | Path, config: dict) -> Path:
 def main(argv: list[str] | None = None) -> int:
     parser = argparse.ArgumentParser(description="Capture the flyradar run configuration.")
     parser.add_argument("--output-dir", required=True, help="Run directory holding output.json.")
-    parser.add_argument(
-        "--options", required=True, help="JSON file of the DiscoveryRequest options that were sent."
-    )
-    parser.add_argument(
-        "--base-url", default=None, help="flyradar base URL (default: $FLYRADAR_BASE_URL)."
-    )
+    parser.add_argument("--options", required=True, help="JSON file of the DiscoveryRequest options that were sent.")
+    parser.add_argument("--base-url", default=None, help="flyradar base URL (default: $FLYRADAR_BASE_URL).")
     parser.add_argument("--commit", default=None, help="Deployed flyradar git commit, if known.")
     args = parser.parse_args(argv)
 
diff --git a/fireflyframework_agentic/evaluation/scorecard.py b/fireflyframework_agentic/evaluation/scorecard.py
index b34885e8..da3c4a87 100644
--- a/fireflyframework_agentic/evaluation/scorecard.py
+++ b/fireflyframework_agentic/evaluation/scorecard.py
@@ -188,13 +188,9 @@ def _render_advisory(report) -> list[str]:
         d = m["faithfulness"]
         u = d.get("unsupported_ids", [])
         extra = f"   (unsupported: {', '.join(u)})" if u else ""
-        lines.append(
-            f"Faithfulness (entailment):       {d.get('supported')}/{d.get('total')} supported{extra}"
-        )
+        lines.append(f"Faithfulness (entailment):       {d.get('supported')}/{d.get('total')} supported{extra}")
     if "numeric_temporal_fidelity" in m:
-        lines.append(
-            f"Numeric/temporal fidelity:       {m['numeric_temporal_fidelity'].get('count', 0)} mismatch(es)"
-        )
+        lines.append(f"Numeric/temporal fidelity:       {m['numeric_temporal_fidelity'].get('count', 0)} mismatch(es)")
     if "citation_relevance" in m:
         d = m["citation_relevance"]
         lines.append(
@@ -218,14 +214,10 @@ def _render_advisory(report) -> list[str]:
         lines.append(f"Contradiction detection:         {m['contradiction'].get('count', 0)}")
     if "actionability" in m:
         d = m["actionability"]
-        lines.append(
-            f"Actionability:                   {_num(d.get('score'))}   (rated {d.get('rated', 0)})"
-        )
+        lines.append(f"Actionability:                   {_num(d.get('score'))}   (rated {d.get('rated', 0)})")
     if "severity_calibration" in m:
         d = m["severity_calibration"]
-        lines.append(
-            f"Severity calibration:            {d.get('miscalibrated', 0)}/{d.get('total', 0)} miscalibrated"
-        )
+        lines.append(f"Severity calibration:            {d.get('miscalibrated', 0)}/{d.get('total', 0)} miscalibrated")
     if "answer_relevancy" in m:
         lines.append(f"Answer relevancy:                {_num(m['answer_relevancy'].get('score'))}")
     if "comparative_vs_champion" in m:
@@ -236,14 +228,10 @@ def _render_advisory(report) -> list[str]:
         d = m["source_coverage"]
         o = d.get("orphaned", [])
         extra = f"   (orphaned: {', '.join(o)})" if o else ""
-        lines.append(
-            f"Source coverage [D]:             {d.get('cited')}/{d.get('total')} documents cited{extra}"
-        )
+        lines.append(f"Source coverage [D]:             {d.get('cited')}/{d.get('total')} documents cited{extra}")
     if "excerpt_fill_rate" in m:
         d = m["excerpt_fill_rate"]
-        lines.append(
-            f"Evidence-excerpt fill [D]:       {d.get('populated')}/{d.get('total')} populated"
-        )
+        lines.append(f"Evidence-excerpt fill [D]:       {d.get('populated')}/{d.get('total')} populated")
     if "open_gap" in m:
         gap = (m["open_gap"].get("gap") or "").strip()
         if gap:
@@ -259,9 +247,7 @@ def _render_advisory(report) -> list[str]:
         json.dumps({"metrics": report.metrics, "details": report.details}, indent=2, default=str),
         "```",
     ]
-    lines.append(
-        "> Decision support for the G5 human sign-off; advisory until LLM-as-a-Judge calibration (§10)."
-    )
+    lines.append("> Decision support for the G5 human sign-off; advisory until LLM-as-a-Judge calibration (§10).")
     lines.append("")
     return lines
 
@@ -284,9 +270,7 @@ def _render_analysis(gate_results: list, advisory=None) -> list[str]:
         matched = d.get("findings_matched_to_registry", {}).get("fraction", 0.0)
 
         tier_summary = ", ".join(
-            f"{t} {v['hit']}/{v['total']}"
-            for t, v in tiers.items()
-            if "hit" in v and "total" in v
+            f"{t} {v['hit']}/{v['total']}" for t, v in tiers.items() if "hit" in v and "total" in v
         )
         lines.append(
             f"Lexical recall is **{recall:.3f}** ({tier_summary}). "
@@ -300,9 +284,7 @@ def _render_analysis(gate_results: list, advisory=None) -> list[str]:
                 "The run is covering the same ground multiple times rather than broadening coverage."
             )
         else:
-            lines.append(
-                f"Finding redundancy is low ({redundancy:.0%}): each finding addresses a distinct topic."
-            )
+            lines.append(f"Finding redundancy is low ({redundancy:.0%}): each finding addresses a distinct topic.")
         lines.append(
             "_G2 is a topic-level test. A recall of 1.000 means every required topic was "
             "mentioned somewhere — it does not verify that the specific claims about those "
@@ -453,14 +435,10 @@ def _render_analysis(gate_results: list, advisory=None) -> list[str]:
     flag_names = [g.gate for g in flags]
 
     if not flags:
-        lines.append(
-            "All deterministic gates pass. The run is ready for G5 human sign-off."
-        )
+        lines.append("All deterministic gates pass. The run is ready for G5 human sign-off.")
     else:
         flag_str = ", ".join(flag_names)
-        lines.append(
-            f"The run is at **HOLD** due to flags on: {flag_str}. "
-        )
+        lines.append(f"The run is at **HOLD** due to flags on: {flag_str}. ")
         for g in flags:
             if g.gate == "G3" and g.reason_code == "EVIDENCE_SOURCE_UNKNOWN":
                 lines.append(
diff --git a/fireflyframework_agentic/evaluation/stats.py b/fireflyframework_agentic/evaluation/stats.py
index e70c629a..c622588c 100644
--- a/fireflyframework_agentic/evaluation/stats.py
+++ b/fireflyframework_agentic/evaluation/stats.py
@@ -23,10 +23,11 @@
 aggregation bug where the previous runner inherited run 0's grounding report
 unchanged instead of merging across all runs.
 """
+
 from __future__ import annotations
 
 import statistics
-from typing import Sequence
+from collections.abc import Sequence
 
 
 def aa_band(scores: Sequence[float], *, percentile: int = 95) -> float:
@@ -49,11 +50,7 @@ def aa_band(scores: Sequence[float], *, percentile: int = 95) -> float:
     scores = list(scores)
     if len(scores) < 2:
         raise ValueError(f"aa_band requires >= 2 reruns; got {len(scores)}")
-    deltas = [
-        abs(x - y)
-        for i, x in enumerate(scores)
-        for y in scores[i + 1:]
-    ]
+    deltas = [abs(x - y) for i, x in enumerate(scores) for y in scores[i + 1 :]]
     sorted_deltas = sorted(deltas)
     # Index for the requested percentile; clamp to valid range
     idx = max(0, min(len(sorted_deltas) - 1, int(len(sorted_deltas) * percentile / 100)))
diff --git a/uv.lock b/uv.lock
index 7e3b501c..93e18075 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1209,6 +1209,10 @@ dev = [
 embeddings = [
     { name = "numpy" },
 ]
+evaluation = [
+    { name = "numpy" },
+    { name = "scipy" },
+]
 google-embeddings = [
     { name = "google-generativeai" },
 ]
@@ -1279,6 +1283,7 @@ requires-dist = [
     { name = "mistralai", marker = "extra == 'mistral-embeddings'", specifier = ">=1.0.0" },
     { name = "motor", marker = "extra == 'mongodb'", specifier = ">=3.6.0" },
     { name = "numpy", marker = "extra == 'embeddings'", specifier = ">=1.26.0" },
+    { name = "numpy", marker = "extra == 'evaluation'", specifier = ">=1.26.0" },
     { name = "numpy", marker = "extra == 'reasoning-eval'", specifier = ">=2.0.0" },
     { name = "openai", marker = "extra == 'azure-embeddings'", specifier = ">=1.0.0" },
     { name = "openai", marker = "extra == 'openai-embeddings'", specifier = ">=1.0.0" },
@@ -1304,13 +1309,14 @@ requires-dist = [
     { name = "python-dotenv", specifier = ">=1.0.0" },
     { name = "qdrant-client", marker = "extra == 'vectorstores-qdrant'", specifier = ">=1.12.0" },
     { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.9.0" },
+    { name = "scipy", marker = "extra == 'evaluation'", specifier = ">=1.11" },
     { name = "sqlalchemy", marker = "extra == 'postgres'", specifier = ">=2.0.0" },
     { name = "sqlite-vec", marker = "extra == 'vectorstores-sqlite-vec'", specifier = ">=0.1.6" },
     { name = "testcontainers", marker = "extra == 'dev'", specifier = ">=4.10.0" },
     { name = "voyageai", marker = "extra == 'voyage-embeddings'", specifier = ">=0.3.0" },
     { name = "watchfiles", marker = "extra == 'watch'", specifier = ">=0.24.0" },
 ]
-provides-extras = ["postgres", "mongodb", "security", "embeddings", "openai-embeddings", "cohere-embeddings", "google-embeddings", "mistral-embeddings", "voyage-embeddings", "azure-embeddings", "bedrock-embeddings", "ollama-embeddings", "reasoning-eval", "vectorstores-chroma", "vectorstores-sqlite-vec", "vectorstores-pinecone", "vectorstores-qdrant", "vectorstores-pgvector", "watch", "binary", "all", "dev"]
+provides-extras = ["postgres", "mongodb", "security", "embeddings", "openai-embeddings", "cohere-embeddings", "google-embeddings", "mistral-embeddings", "voyage-embeddings", "azure-embeddings", "bedrock-embeddings", "ollama-embeddings", "reasoning-eval", "vectorstores-chroma", "vectorstores-sqlite-vec", "vectorstores-pinecone", "vectorstores-qdrant", "vectorstores-pgvector", "watch", "binary", "all", "evaluation", "dev"]
 
 [[package]]
 name = "flatbuffers"
@@ -4489,6 +4495,57 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/87/72/c6c32d2b657fa3dad1de340254e14390b1e334ce38268b7ad51abda3c8c2/s3transfer-0.17.0-py3-none-any.whl", hash = "sha256:ce3801712acf4ad3e89fb9990df97b4972e93f4b3b0004d214be5bce12814c20", size = 86811, upload-time = "2026-04-29T22:07:34.966Z" },
 ]
 
+[[package]]
+name = "scipy"
+version = "1.17.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "numpy" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/7a/97/5a3609c4f8d58b039179648e62dd220f89864f56f7357f5d4f45c29eb2cc/scipy-1.17.1.tar.gz", hash = "sha256:95d8e012d8cb8816c226aef832200b1d45109ed4464303e997c5b13122b297c0", size = 30573822, upload-time = "2026-02-23T00:26:24.851Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/76/27/07ee1b57b65e92645f219b37148a7e7928b82e2b5dbeccecb4dff7c64f0b/scipy-1.17.1-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:5e3c5c011904115f88a39308379c17f91546f77c1667cea98739fe0fccea804c", size = 31590199, upload-time = "2026-02-23T00:19:17.192Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/ae/db19f8ab842e9b724bf5dbb7db29302a91f1e55bc4d04b1025d6d605a2c5/scipy-1.17.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:6fac755ca3d2c3edcb22f479fceaa241704111414831ddd3bc6056e18516892f", size = 28154001, upload-time = "2026-02-23T00:19:22.241Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/58/3ce96251560107b381cbd6e8413c483bbb1228a6b919fa8652b0d4090e7f/scipy-1.17.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:7ff200bf9d24f2e4d5dc6ee8c3ac64d739d3a89e2326ba68aaf6c4a2b838fd7d", size = 20325719, upload-time = "2026-02-23T00:19:26.329Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/83/15087d945e0e4d48ce2377498abf5ad171ae013232ae31d06f336e64c999/scipy-1.17.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:4b400bdc6f79fa02a4d86640310dde87a21fba0c979efff5248908c6f15fad1b", size = 22683595, upload-time = "2026-02-23T00:19:30.304Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/e0/e58fbde4a1a594c8be8114eb4aac1a55bcd6587047efc18a61eb1f5c0d30/scipy-1.17.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2b64ca7d4aee0102a97f3ba22124052b4bd2152522355073580bf4845e2550b6", size = 32896429, upload-time = "2026-02-23T00:19:35.536Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/5f/f17563f28ff03c7b6799c50d01d5d856a1d55f2676f537ca8d28c7f627cd/scipy-1.17.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:581b2264fc0aa555f3f435a5944da7504ea3a065d7029ad60e7c3d1ae09c5464", size = 35203952, upload-time = "2026-02-23T00:19:42.259Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/a5/9afd17de24f657fdfe4df9a3f1ea049b39aef7c06000c13db1530d81ccca/scipy-1.17.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:beeda3d4ae615106d7094f7e7cef6218392e4465cc95d25f900bebabfded0950", size = 34979063, upload-time = "2026-02-23T00:19:47.547Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/13/88b1d2384b424bf7c924f2038c1c409f8d88bb2a8d49d097861dd64a57b2/scipy-1.17.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6609bc224e9568f65064cfa72edc0f24ee6655b47575954ec6339534b2798369", size = 37598449, upload-time = "2026-02-23T00:19:53.238Z" },
+    { url = "https://files.pythonhosted.org/packages/35/e5/d6d0e51fc888f692a35134336866341c08655d92614f492c6860dc45bb2c/scipy-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:37425bc9175607b0268f493d79a292c39f9d001a357bebb6b88fdfaff13f6448", size = 36510943, upload-time = "2026-02-23T00:20:50.89Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/fd/3be73c564e2a01e690e19cc618811540ba5354c67c8680dce3281123fb79/scipy-1.17.1-cp313-cp313-win_arm64.whl", hash = "sha256:5cf36e801231b6a2059bf354720274b7558746f3b1a4efb43fcf557ccd484a87", size = 24545621, upload-time = "2026-02-23T00:20:55.871Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/6b/17787db8b8114933a66f9dcc479a8272e4b4da75fe03b0c282f7b0ade8cd/scipy-1.17.1-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:d59c30000a16d8edc7e64152e30220bfbd724c9bbb08368c054e24c651314f0a", size = 31936708, upload-time = "2026-02-23T00:19:58.694Z" },
+    { url = "https://files.pythonhosted.org/packages/38/2e/524405c2b6392765ab1e2b722a41d5da33dc5c7b7278184a8ad29b6cb206/scipy-1.17.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:010f4333c96c9bb1a4516269e33cb5917b08ef2166d5556ca2fd9f082a9e6ea0", size = 28570135, upload-time = "2026-02-23T00:20:03.934Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/c3/5bd7199f4ea8556c0c8e39f04ccb014ac37d1468e6cfa6a95c6b3562b76e/scipy-1.17.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:2ceb2d3e01c5f1d83c4189737a42d9cb2fc38a6eeed225e7515eef71ad301dce", size = 20741977, upload-time = "2026-02-23T00:20:07.935Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/b8/8ccd9b766ad14c78386599708eb745f6b44f08400a5fd0ade7cf89b6fc93/scipy-1.17.1-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:844e165636711ef41f80b4103ed234181646b98a53c8f05da12ca5ca289134f6", size = 23029601, upload-time = "2026-02-23T00:20:12.161Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/a0/3cb6f4d2fb3e17428ad2880333cac878909ad1a89f678527b5328b93c1d4/scipy-1.17.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:158dd96d2207e21c966063e1635b1063cd7787b627b6f07305315dd73d9c679e", size = 33019667, upload-time = "2026-02-23T00:20:17.208Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/c3/2d834a5ac7bf3a0c806ad1508efc02dda3c8c61472a56132d7894c312dea/scipy-1.17.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74cbb80d93260fe2ffa334efa24cb8f2f0f622a9b9febf8b483c0b865bfb3475", size = 35264159, upload-time = "2026-02-23T00:20:23.087Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/77/d3ed4becfdbd217c52062fafe35a72388d1bd82c2d0ba5ca19d6fcc93e11/scipy-1.17.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:dbc12c9f3d185f5c737d801da555fb74b3dcfa1a50b66a1a93e09190f41fab50", size = 35102771, upload-time = "2026-02-23T00:20:28.636Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/12/d19da97efde68ca1ee5538bb261d5d2c062f0c055575128f11a2730e3ac1/scipy-1.17.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:94055a11dfebe37c656e70317e1996dc197e1a15bbcc351bcdd4610e128fe1ca", size = 37665910, upload-time = "2026-02-23T00:20:34.743Z" },
+    { url = "https://files.pythonhosted.org/packages/06/1c/1172a88d507a4baaf72c5a09bb6c018fe2ae0ab622e5830b703a46cc9e44/scipy-1.17.1-cp313-cp313t-win_amd64.whl", hash = "sha256:e30bdeaa5deed6bc27b4cc490823cd0347d7dae09119b8803ae576ea0ce52e4c", size = 36562980, upload-time = "2026-02-23T00:20:40.575Z" },
+    { url = "https://files.pythonhosted.org/packages/70/b0/eb757336e5a76dfa7911f63252e3b7d1de00935d7705cf772db5b45ec238/scipy-1.17.1-cp313-cp313t-win_arm64.whl", hash = "sha256:a720477885a9d2411f94a93d16f9d89bad0f28ca23c3f8daa521e2dcc3f44d49", size = 24856543, upload-time = "2026-02-23T00:20:45.313Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/83/333afb452af6f0fd70414dc04f898647ee1423979ce02efa75c3b0f2c28e/scipy-1.17.1-cp314-cp314-macosx_10_14_x86_64.whl", hash = "sha256:a48a72c77a310327f6a3a920092fa2b8fd03d7deaa60f093038f22d98e096717", size = 31584510, upload-time = "2026-02-23T00:21:01.015Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/a6/d05a85fd51daeb2e4ea71d102f15b34fedca8e931af02594193ae4fd25f7/scipy-1.17.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:45abad819184f07240d8a696117a7aacd39787af9e0b719d00285549ed19a1e9", size = 28170131, upload-time = "2026-02-23T00:21:05.888Z" },
+    { url = "https://files.pythonhosted.org/packages/db/7b/8624a203326675d7746a254083a187398090a179335b2e4a20e2ddc46e83/scipy-1.17.1-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:3fd1fcdab3ea951b610dc4cef356d416d5802991e7e32b5254828d342f7b7e0b", size = 20342032, upload-time = "2026-02-23T00:21:09.904Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/35/2c342897c00775d688d8ff3987aced3426858fd89d5a0e26e020b660b301/scipy-1.17.1-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:7bdf2da170b67fdf10bca777614b1c7d96ae3ca5794fd9587dce41eb2966e866", size = 22678766, upload-time = "2026-02-23T00:21:14.313Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/f2/7cdb8eb308a1a6ae1e19f945913c82c23c0c442a462a46480ce487fdc0ac/scipy-1.17.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:adb2642e060a6549c343603a3851ba76ef0b74cc8c079a9a58121c7ec9fe2350", size = 32957007, upload-time = "2026-02-23T00:21:19.663Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/2e/7eea398450457ecb54e18e9d10110993fa65561c4f3add5e8eccd2b9cd41/scipy-1.17.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:eee2cfda04c00a857206a4330f0c5e3e56535494e30ca445eb19ec624ae75118", size = 35221333, upload-time = "2026-02-23T00:21:25.278Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/77/5b8509d03b77f093a0d52e606d3c4f79e8b06d1d38c441dacb1e26cacf46/scipy-1.17.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d2650c1fb97e184d12d8ba010493ee7b322864f7d3d00d3f9bb97d9c21de4068", size = 35042066, upload-time = "2026-02-23T00:21:31.358Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/df/18f80fb99df40b4070328d5ae5c596f2f00fffb50167e31439e932f29e7d/scipy-1.17.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:08b900519463543aa604a06bec02461558a6e1cef8fdbb8098f77a48a83c8118", size = 37612763, upload-time = "2026-02-23T00:21:37.247Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/39/f0e8ea762a764a9dc52aa7dabcfad51a354819de1f0d4652b6a1122424d6/scipy-1.17.1-cp314-cp314-win_amd64.whl", hash = "sha256:3877ac408e14da24a6196de0ddcace62092bfc12a83823e92e49e40747e52c19", size = 37290984, upload-time = "2026-02-23T00:22:35.023Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/56/fe201e3b0f93d1a8bcf75d3379affd228a63d7e2d80ab45467a74b494947/scipy-1.17.1-cp314-cp314-win_arm64.whl", hash = "sha256:f8885db0bc2bffa59d5c1b72fad7a6a92d3e80e7257f967dd81abb553a90d293", size = 25192877, upload-time = "2026-02-23T00:22:39.798Z" },
+    { url = "https://files.pythonhosted.org/packages/96/ad/f8c414e121f82e02d76f310f16db9899c4fcde36710329502a6b2a3c0392/scipy-1.17.1-cp314-cp314t-macosx_10_14_x86_64.whl", hash = "sha256:1cc682cea2ae55524432f3cdff9e9a3be743d52a7443d0cba9017c23c87ae2f6", size = 31949750, upload-time = "2026-02-23T00:21:42.289Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/b0/c741e8865d61b67c81e255f4f0a832846c064e426636cd7de84e74d209be/scipy-1.17.1-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:2040ad4d1795a0ae89bfc7e8429677f365d45aa9fd5e4587cf1ea737f927b4a1", size = 28585858, upload-time = "2026-02-23T00:21:47.706Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/1b/3985219c6177866628fa7c2595bfd23f193ceebbe472c98a08824b9466ff/scipy-1.17.1-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:131f5aaea57602008f9822e2115029b55d4b5f7c070287699fe45c661d051e39", size = 20757723, upload-time = "2026-02-23T00:21:52.039Z" },
+    { url = "https://files.pythonhosted.org/packages/c0/19/2a04aa25050d656d6f7b9e7b685cc83d6957fb101665bfd9369ca6534563/scipy-1.17.1-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:9cdc1a2fcfd5c52cfb3045feb399f7b3ce822abdde3a193a6b9a60b3cb5854ca", size = 23043098, upload-time = "2026-02-23T00:21:56.185Z" },
+    { url = "https://files.pythonhosted.org/packages/86/f1/3383beb9b5d0dbddd030335bf8a8b32d4317185efe495374f134d8be6cce/scipy-1.17.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6e3dcd57ab780c741fde8dc68619de988b966db759a3c3152e8e9142c26295ad", size = 33030397, upload-time = "2026-02-23T00:22:01.404Z" },
+    { url = "https://files.pythonhosted.org/packages/41/68/8f21e8a65a5a03f25a79165ec9d2b28c00e66dc80546cf5eb803aeeff35b/scipy-1.17.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a9956e4d4f4a301ebf6cde39850333a6b6110799d470dbbb1e25326ac447f52a", size = 35281163, upload-time = "2026-02-23T00:22:07.024Z" },
+    { url = "https://files.pythonhosted.org/packages/84/8d/c8a5e19479554007a5632ed7529e665c315ae7492b4f946b0deb39870e39/scipy-1.17.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:a4328d245944d09fd639771de275701ccadf5f781ba0ff092ad141e017eccda4", size = 35116291, upload-time = "2026-02-23T00:22:12.585Z" },
+    { url = "https://files.pythonhosted.org/packages/52/52/e57eceff0e342a1f50e274264ed47497b59e6a4e3118808ee58ddda7b74a/scipy-1.17.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a77cbd07b940d326d39a1d1b37817e2ee4d79cb30e7338f3d0cddffae70fcaa2", size = 37682317, upload-time = "2026-02-23T00:22:18.513Z" },
+    { url = "https://files.pythonhosted.org/packages/11/2f/b29eafe4a3fbc3d6de9662b36e028d5f039e72d345e05c250e121a230dd4/scipy-1.17.1-cp314-cp314t-win_amd64.whl", hash = "sha256:eb092099205ef62cd1782b006658db09e2fed75bffcae7cc0d44052d8aa0f484", size = 37345327, upload-time = "2026-02-23T00:22:24.442Z" },
+    { url = "https://files.pythonhosted.org/packages/07/39/338d9219c4e87f3e708f18857ecd24d22a0c3094752393319553096b98af/scipy-1.17.1-cp314-cp314t-win_arm64.whl", hash = "sha256:200e1050faffacc162be6a486a984a0497866ec54149a01270adc8a59b7c7d21", size = 25489165, upload-time = "2026-02-23T00:22:29.563Z" },
+]
+
 [[package]]
 name = "secretstorage"
 version = "3.5.0"

From 9c3555d03331bb8e05361dc49865df0355171d29 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 13:18:11 +0200
Subject: [PATCH 15/48] chore(evaluation): delete cli.py

---
 fireflyframework_agentic/evaluation/cli.py | 573 ---------------------
 1 file changed, 573 deletions(-)
 delete mode 100644 fireflyframework_agentic/evaluation/cli.py

diff --git a/fireflyframework_agentic/evaluation/cli.py b/fireflyframework_agentic/evaluation/cli.py
deleted file mode 100644
index 7ac868d9..00000000
--- a/fireflyframework_agentic/evaluation/cli.py
+++ /dev/null
@@ -1,573 +0,0 @@
-# Copyright 2026 Firefly Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""flyeval — FlyRadar Lean Core evaluation CLI.
-
-Usage
------
-    flyeval gate      --result R.json --registry REG.json [--baseline B.json] [--judge-model P:M]
-    flyeval aa-band   --results R1.json R2.json ... --registry REG.json
-    flyeval day-zero  --result R.json --registry REG.json --baseline B.json --signoffs 2
-    flyeval invalidate --baseline B.json --reason "..."
-
-The deterministic gates G1-G3 + G5 (human sign-off) decide the verdict: every
-subcommand exits 0 on PROMOTE, 1 on HOLD.  G4 (the --judge-model LLM-as-a-Judge,
-on by default, --no-judge to skip) is non-blocking — it prints advisory signals
-and never changes the verdict or the exit code.
-"""
-
-from __future__ import annotations
-
-import argparse
-import hashlib
-import json
-import os
-import sys
-from pathlib import Path
-
-from fireflyframework_agentic.evaluation import __version__
-from fireflyframework_agentic.evaluation.champion import (
-    ChampionRecord,
-    invalidate_champion,
-    load_champion,
-    save_champion,
-)
-from fireflyframework_agentic.evaluation.corpus import load_corpus
-from fireflyframework_agentic.evaluation.gates import g2_recall_precision, run_gates
-from fireflyframework_agentic.evaluation.judge import run_judge
-from fireflyframework_agentic.evaluation.judge_client import build_embedder
-from fireflyframework_agentic.evaluation.matcher import matches
-from fireflyframework_agentic.evaluation.registry import load_registry
-from fireflyframework_agentic.evaluation.scorecard import render_scorecard, verdict as get_verdict
-from fireflyframework_agentic.evaluation.stats import aa_band, left_skew_flag
-
-
-def _load_json(path: str) -> dict:
-    return json.loads(Path(path).read_text(encoding="utf-8"))
-
-
-def _lexical_missed_ids(result: dict, registry) -> list[str]:
-    """Scored (non-L3) real-item ids matched by no finding — the G2 lexical misses G4 recovers."""
-    evidence_index = {ev["id"]: ev for ev in result.get("evidence_index", []) if ev.get("id")}
-    findings = result.get("findings", [])
-    scored = [i for i in registry.real_items if i.tier != "L3"]
-    return [i.id for i in scored if not any(matches(f, i, evidence_index) for f in findings)]
-
-
-def _read_experiment_config(result_path: str) -> dict | None:
-    """Read the experiment_configuration.json recorded next to the run's output.json.
-
-    The experiment config records how the run was generated; it is authored by the
-    generation step at run time.  Evaluation only reads it for display and never
-    writes or overwrites it.  Returns None when the run has no recorded config.
-    """
-    path = Path(result_path).parent / "experiment_configuration.json"
-    if not path.exists():
-        return None
-    return json.loads(path.read_text(encoding="utf-8"))
-
-
-def _write_eval_config(result_path: str, config: dict) -> Path:
-    """Write evaluation_configuration.json next to the run's output.json.
-
-    The evaluation config is authored by flyeval at gate time (registry/corpus SHAs,
-    recall metric, floors, judge settings), so unlike the experiment config it is
-    owned here and safe to (over)write each run.  It mirrors the block embedded in
-    the scorecard, as a machine-readable artifact.
-    """
-    path = Path(result_path).parent / "evaluation_configuration.json"
-    path.write_text(json.dumps(config, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
-    return path
-
-
-def _eval_config(args, registry, corpus=None) -> dict:
-    """Capture the run's evaluation configuration for provenance.
-
-    Uses getattr defaults so it works for both `gate` (has every flag) and
-    `day-zero` (lacks the gate-only flags, falling back to the lexical/no-judge
-    defaults, which honestly reflects how day-zero scores).
-    """
-    jm = getattr(args, "judge_model", None)
-    baseline = getattr(args, "baseline", None)
-    tau = getattr(args, "tau", 0.70)
-    return {
-        "evaluator_version": __version__,
-        "registry_sha256": registry.sha256(),
-        "corpus_sha256": corpus.sha256 if corpus else None,
-        "model_id": getattr(args, "model_id", None) or "unknown",
-        "gates": {
-            "G1": {
-                "name": "Structural & Safe",
-                "pii_list": getattr(args, "pii_list", None) or [],
-                "metrics": {
-                    "empty_must_find": "registry has >=1 must-find item; guards the fake-100% "
-                    "champion (EMPTY_MUST_FIND)",
-                    "registry_sha256_pin": "loaded registry matches its file hash (GOLD_DRIFT)",
-                    "corpus_sha256_pin": "corpus matches its hash when supplied (CORPUS_DRIFT)",
-                    "schema_valid": "required top-level keys present in the result "
-                    "(SCHEMA_INVALID)",
-                    "pii_non_disclosure": "no corpus PII name appears in any finding/report text "
-                    "(PII_LEAK)",
-                },
-            },
-            "G2": {
-                "name": "Recall & Precision",
-                "recall_metric": getattr(args, "recall_metric", "lexical"),
-                "recall_floor": getattr(args, "recall_floor", 0.70),
-                "tau": tau,
-                "tau_nc": getattr(args, "tau_nc", 0.85),
-                "embedder": getattr(args, "embedder", None),
-                "metrics": {
-                    "lexical_recall": "token-overlap recall (always reported)",
-                    "semantic_recall": "embedding-similarity recall at >= tau (needs embedder)",
-                    "hybrid_recall": "per item, a lexical OR semantic match",
-                    "per_tier_recall": "hit/total per tier L0-L3; an L0 miss blocks",
-                    "nc_precision": "negative-control items wrongly emitted; an NC hit blocks",
-                    "finding_redundancy_rate": "fraction of findings duplicating another's topic",
-                },
-            },
-            "G3": {
-                "name": "Grounded",
-                "grounding_floor": getattr(args, "grounding_floor", 0.90),
-                "human_spot_check_n": 5,
-                "corpus_verification": corpus is not None,
-                "metrics": {
-                    "grounding_pct": "findings whose cited excerpt shares a topic token; blocks "
-                    "below grounding_floor",
-                    "evidence_verified": "cited excerpts located in the actual corpus "
-                    "(when supplied)",
-                    "evidence_fabricated": "populated excerpts not found in their cited source "
-                    "(EVIDENCE_FABRICATED)",
-                    "evidence_source_unknown": "locators resolving to no corpus document "
-                    "(EVIDENCE_SOURCE_UNKNOWN)",
-                    "excerpt_fill_rate": "evidence entries carrying a populated excerpt",
-                    "source_coverage": "distinct corpus documents cited",
-                },
-            },
-            "G4": {
-                "name": "LLM Judge (advisory, non-blocking)",
-                "judge_model": jm,
-                "judge_runs": getattr(args, "judge_runs", 1) if jm else None,
-                "judge_concurrency": getattr(args, "judge_concurrency", 1) if jm else None,
-                "judge_temperature": 0.0 if jm else None,
-                "tau": tau if jm else None,
-                "metrics": {
-                    "faithfulness": "each finding's claim entailed by its cited evidence",
-                    "numeric_temporal_fidelity": "numbers and dates in findings match the evidence",
-                    "citation_relevance": "cited evidence refs are on-topic (context precision)",
-                    "nc_semantic_precision": "negative-control items semantically asserted",
-                    "fabricated_entity": "named entities absent from the corpus",
-                    "contradiction": "findings contradicting the evidence or each other",
-                    "open_gap": "a consequential issue the output failed to surface",
-                    "actionability": "proposed actions are specific and actionable",
-                    "severity_calibration": "stated severity matches the evidence",
-                    "answer_relevancy": "output addresses the workspace intention",
-                    "source_coverage": "distinct corpus documents cited (deterministic)",
-                    "excerpt_fill_rate": "evidence entries with a populated excerpt "
-                    "(deterministic)",
-                },
-            },
-            "G5": {
-                "name": "No-regression / promotion",
-                "is_day_zero": baseline is None,
-                "human_signed_off": getattr(args, "human_signed_off", False),
-                "signoffs": getattr(args, "signoffs", 0),
-                "baseline": baseline,
-                "baseline_sha256": _file_sha256(baseline) if baseline else None,
-                "metrics": {
-                    "improvements": "metrics beating the champion by more than the AA noise band",
-                    "regressions": "metrics that regressed versus the champion",
-                    "noise_band": "per-metric AA noise floor a candidate must exceed",
-                    "guardrail_regression": "any guardrail metric that dropped",
-                    "signoffs": "independent human sign-offs recorded",
-                },
-            },
-        },
-    }
-
-
-def _file_sha256(path: str) -> str | None:
-    """SHA-256 of a file's bytes, or None when it can't be read."""
-    try:
-        return hashlib.sha256(Path(path).read_bytes()).hexdigest()
-    except OSError:
-        return None
-
-
-# ── gate ──────────────────────────────────────────────────────────────────────
-
-
-def cmd_gate(args: argparse.Namespace) -> int:
-    if getattr(args, "no_judge", False):
-        args.judge_model = None  # explicit opt-out; G4 runs by default otherwise
-    result = _load_json(args.result)
-    registry = load_registry(args.registry)
-    corpus = load_corpus(args.corpus) if args.corpus else None
-    champion = load_champion(args.baseline) if args.baseline else None
-    champion_scores = champion.scores if champion else None
-    aa_noise = champion.aa_noise if champion else None
-
-    embed_fn = build_embedder(args.embedder) if args.embedder else None
-
-    if args.recall_metric in ("hybrid", "semantic") and embed_fn is None:
-        print(
-            f"ERROR: --recall-metric {args.recall_metric} requires --embedder.\n"
-            "  Example: --embedder openai:text-embedding-3-small",
-            file=sys.stderr,
-        )
-        return 2
-
-    gate_results = run_gates(
-        result,
-        registry,
-        args.registry,
-        pii_list=args.pii_list or [],
-        recall_floor=args.recall_floor,
-        grounding_floor=args.grounding_floor,
-        champion_scores=champion_scores,
-        aa_noise=aa_noise,
-        is_day_zero=(champion is None),
-        human_signed_off=args.human_signed_off,
-        signoff_count=args.signoffs,
-        embed_fn=embed_fn,
-        tau=args.tau,
-        recall_metric=args.recall_metric,
-        tau_nc=args.tau_nc,
-        corpus=corpus,
-    )
-
-    # G4 — on by default, non-blocking.  Skipped only with --no-judge; never affects the verdict.
-    advisory = None
-    if args.judge_model:
-        champion_result = _load_json(args.champion_result) if args.champion_result else None
-        advisory = run_judge(
-            result,
-            registry,
-            judge_model=args.judge_model,
-            runs=args.judge_runs,
-            concurrency=args.judge_concurrency,
-            pipeline_model=args.model_id or "",
-            champion_result=champion_result,
-            embed_fn=embed_fn,
-            tau=args.tau,
-            lexical_missed_ids=_lexical_missed_ids(result, registry),
-        )
-
-    config = _eval_config(args, registry, corpus)
-    _write_eval_config(args.result, config)
-    experiment_config = _read_experiment_config(args.result)
-    scorecard = render_scorecard(
-        gate_results,
-        corpus=registry.corpus,
-        model_id=args.model_id or "unknown",
-        run_id=args.run_id or "run",
-        is_self_graded=True,
-        kappa_advisory=registry.is_kappa_advisory(),
-        evidence_unverified=corpus is None,
-        advisory=advisory,
-        config=config,
-        experiment_config=experiment_config,
-    )
-    print(scorecard)
-
-    v = get_verdict(gate_results)
-    return 0 if v == "PROMOTE" else 1
-
-
-# ── aa-band ───────────────────────────────────────────────────────────────────
-
-
-def cmd_aa_band(args: argparse.Namespace) -> int:
-    registry = load_registry(args.registry)
-
-    if args.recall_metric in ("hybrid", "semantic") and not args.embedder:
-        print(
-            f"ERROR: --recall-metric {args.recall_metric} requires --embedder.\n"
-            "  Example: --embedder openai:text-embedding-3-small",
-            file=sys.stderr,
-        )
-        return 2
-
-    embed_fn = build_embedder(args.embedder) if args.embedder else None
-    corpus = load_corpus(args.corpus) if args.corpus else None
-    scores: list[float] = []
-
-    for rp in args.results:
-        result = _load_json(rp)
-        g2 = g2_recall_precision(
-            result, registry,
-            recall_metric=args.recall_metric, embed_fn=embed_fn,
-            tau=args.tau, tau_nc=args.tau_nc,
-            corpus=corpus,
-        )
-        if g2.passed or g2.details.get("recall") is not None:
-            scores.append(g2.details.get("recall", 0.0))
-
-    if len(scores) < 2:
-        print(
-            f"ERROR: need >= 2 runs for aa_band; got {len(scores)}.  "
-            "Make sure the registry is non-empty and the runs are valid.",
-            file=sys.stderr,
-        )
-        return 1
-
-    band = aa_band(scores)
-    high_var = left_skew_flag(scores)
-    print(f"A/A noise band (95th-pct pairwise delta): {band:.4f}")
-    print(f"Scores across reruns: {[round(s, 4) for s in scores]}")
-    if high_var:
-        print("WARNING: HIGH_VARIANCE — min < median - 0.10.  Investigate before using this band.")
-    return 0
-
-
-# ── day-zero ──────────────────────────────────────────────────────────────────
-
-
-def cmd_day_zero(args: argparse.Namespace) -> int:
-    result = _load_json(args.result)
-    registry = load_registry(args.registry)
-
-    if not args.corpus:
-        print(
-            "ERROR: day-zero (a promotion decision) requires --corpus for evidence\n"
-            "verification — a champion must not be minted on unverified evidence.\n"
-            "  Supply the run's input bundle, e.g.  --corpus experiments/<corpus>/input.json",
-            file=sys.stderr,
-        )
-        return 2
-    corpus = load_corpus(args.corpus)
-
-    if args.signoffs < 2:
-        print(
-            f"ERROR: Day-Zero requires 2 independent human sign-offs; got {args.signoffs}.",
-            file=sys.stderr,
-        )
-        return 1
-
-    gate_results = run_gates(
-        result,
-        registry,
-        args.registry,
-        is_day_zero=True,
-        human_signed_off=True,
-        signoff_count=args.signoffs,
-        corpus=corpus,
-    )
-
-    config = _eval_config(args, registry, corpus)
-    _write_eval_config(args.result, config)
-    experiment_config = _read_experiment_config(args.result)
-    v = get_verdict(gate_results)
-    scorecard = render_scorecard(
-        gate_results,
-        corpus=registry.corpus,
-        model_id=args.model_id or "unknown",
-        run_id=args.run_id or "day-zero",
-        is_self_graded=True,
-        kappa_advisory=registry.is_kappa_advisory(),
-        config=config,
-        experiment_config=experiment_config,
-    )
-    print(scorecard)
-
-    if v == "PROMOTE" and args.baseline:
-        g2 = next((g for g in gate_results if g.gate == "G2"), None)
-        g3 = next((g for g in gate_results if g.gate == "G3"), None)
-        scores = {}
-        if g2:
-            scores["recall"] = g2.details.get("recall", 0.0)
-        if g3:
-            scores["grounding_pct"] = g3.details.get("grounding_pct", 0.0)
-
-        champion = ChampionRecord(
-            corpus=registry.corpus,
-            run_id=args.run_id or "day-zero",
-            model_id=args.model_id or "unknown",
-            registry_sha256=registry.sha256(),
-            scores=scores,
-            is_day_zero=True,
-            human_sign_offs=[f"signoff-{i + 1}" for i in range(args.signoffs)],
-            config=config,
-            corpus_sha256=corpus.sha256,
-        )
-        save_champion(
-            args.baseline,
-            champion,
-            summary=f"Day-Zero champion for {registry.corpus}",
-            date=args.date or "unknown",
-        )
-        print(f"\nDay-Zero champion saved to {args.baseline}")
-
-    return 0 if v == "PROMOTE" else 1
-
-
-# ── invalidate ────────────────────────────────────────────────────────────────
-
-
-def cmd_invalidate(args: argparse.Namespace) -> int:
-    invalidate_champion(args.baseline, reason=args.reason, date=args.date or "unknown")
-    print(f"Champion invalidated in {args.baseline}.  Reason: {args.reason}")
-    return 0
-
-
-# ── parser ────────────────────────────────────────────────────────────────────
-
-
-def build_parser() -> argparse.ArgumentParser:
-    parser = argparse.ArgumentParser(
-        prog="flyeval",
-        description="FlyRadar Lean Core eval: G1-G3 + G5 deterministic, G4 judge on by default",
-    )
-    sub = parser.add_subparsers(dest="command", required=True)
-
-    def _add_common(p: argparse.ArgumentParser) -> None:
-        p.add_argument("--result", required=True, help="Path to DiscoveryResult JSON")
-        p.add_argument("--registry", required=True, help="Path to lean-1 registry JSON")
-        p.add_argument(
-            "--corpus",
-            help="Path to the run's input.json corpus bundle — enables deterministic "
-            "evidence verification (required for day-zero; without it, gate runs "
-            "carry an EVIDENCE UNVERIFIED disclosure)",
-        )
-        p.add_argument("--baseline", help="Path to baseline.json (per-corpus champion store)")
-        p.add_argument("--model-id", default="unknown")
-        p.add_argument("--run-id", default="run")
-        p.add_argument("--date", default="", help="ISO date for promotion log")
-
-    # gate
-    p_gate = sub.add_parser("gate", help="Run the gates and print a scorecard")
-    _add_common(p_gate)
-    p_gate.add_argument("--recall-floor", type=float, default=0.70)
-    p_gate.add_argument("--grounding-floor", type=float, default=0.90)
-    p_gate.add_argument("--pii-list", nargs="*", default=[])
-    p_gate.add_argument(
-        "--embedder",
-        default=os.environ.get("FLYEVAL_EMBEDDER"),
-        help="opt-in embedder spec for the semantic recall path "
-        '(e.g. "azure:text-embedding-3-small"); omit for pure-lexical recall. '
-        "Env: FLYEVAL_EMBEDDER",
-    )
-    p_gate.add_argument(
-        "--recall-metric",
-        choices=["lexical", "semantic", "hybrid"],
-        default=os.environ.get("FLYEVAL_RECALL_METRIC", "hybrid"),
-        help="which recall metric GATES (default hybrid; hybrid/semantic require --embedder). "
-        "Env: FLYEVAL_RECALL_METRIC",
-    )
-    p_gate.add_argument(
-        "--tau",
-        type=float,
-        default=float(os.environ.get("FLYEVAL_TAU", "0.70")),
-        help="cosine similarity threshold for the semantic recall path (real items). "
-        "Env: FLYEVAL_TAU",
-    )
-    p_gate.add_argument(
-        "--tau-nc",
-        type=float,
-        default=float(os.environ.get("FLYEVAL_TAU_NC", "0.85")),
-        help="cosine similarity threshold for NC item detection (higher; no source anchor). "
-        "Env: FLYEVAL_TAU_NC",
-    )
-    p_gate.add_argument("--human-signed-off", action="store_true")
-    p_gate.add_argument("--signoffs", type=int, default=0)
-    p_gate.add_argument(
-        "--judge-model",
-        default=os.environ.get("FLYEVAL_JUDGE_MODEL", "anthropic:claude-sonnet-4-6"),
-        help="provider:model for the non-blocking G4 LLM-as-a-Judge (e.g. azure:gpt-4o). "
-        "Runs by default; pass --no-judge to skip G4. Env: FLYEVAL_JUDGE_MODEL",
-    )
-    p_gate.add_argument(
-        "--no-judge",
-        action="store_true",
-        help="skip the G4 LLM-as-a-Judge (it runs by default).",
-    )
-    p_gate.add_argument(
-        "--judge-runs",
-        type=int,
-        default=int(os.environ.get("FLYEVAL_JUDGE_RUNS", "1")),
-        help="G4 judge runs; the median of numeric scores is kept (odd recommended). "
-        "Env: FLYEVAL_JUDGE_RUNS",
-    )
-    p_gate.add_argument(
-        "--judge-concurrency",
-        type=int,
-        default=int(os.environ.get("FLYEVAL_JUDGE_CONCURRENCY", "1")),
-        help="bounded fan-out for the per-item G4 [J] metrics (1 = sequential; "
-        ">=2 runs each metric's chat calls across a thread pool, order preserved). "
-        "Env: FLYEVAL_JUDGE_CONCURRENCY",
-    )
-    p_gate.add_argument(
-        "--champion-result",
-        help="Path to the champion's output.json for the G4 comparative-review metric",
-    )
-    p_gate.set_defaults(func=cmd_gate)
-
-    # aa-band
-    p_aa = sub.add_parser("aa-band", help="Compute A/A noise band from champion reruns")
-    p_aa.add_argument(
-        "--results",
-        nargs="+",
-        required=True,
-        help="Paths to champion-rerun result JSON files (>= 2)",
-    )
-    p_aa.add_argument("--registry", required=True)
-    p_aa.add_argument(
-        "--recall-metric",
-        choices=["lexical", "semantic", "hybrid"],
-        default="hybrid",
-        help="recall metric to use — must match the champion's metric (default hybrid; "
-        "hybrid/semantic require --embedder)",
-    )
-    p_aa.add_argument(
-        "--embedder",
-        default=None,
-        help="embedder spec for semantic/hybrid recall (e.g. ollama:bge-m3)",
-    )
-    p_aa.add_argument("--tau", type=float, default=0.70)
-    p_aa.add_argument("--tau-nc", type=float, default=0.85)
-    p_aa.add_argument(
-        "--corpus",
-        help="Path to input.json — must match the gate's corpus setting so the "
-        "band is computed under the same evidence filtering as the champion",
-    )
-    p_aa.set_defaults(func=cmd_aa_band)
-
-    # day-zero
-    p_dz = sub.add_parser("day-zero", help="Promote the inaugural champion (Day-Zero protocol)")
-    _add_common(p_dz)
-    p_dz.add_argument(
-        "--signoffs",
-        type=int,
-        default=0,
-        help="Number of independent human sign-offs collected (need 2)",
-    )
-    p_dz.set_defaults(func=cmd_day_zero)
-
-    # invalidate
-    p_inv = sub.add_parser("invalidate", help="Invalidate the current champion")
-    p_inv.add_argument("--baseline", required=True)
-    p_inv.add_argument("--reason", required=True)
-    p_inv.add_argument("--date", default="")
-    p_inv.set_defaults(func=cmd_invalidate)
-
-    return parser
-
-
-def main() -> None:
-    parser = build_parser()
-    args = parser.parse_args()
-    sys.exit(args.func(args))
-
-
-if __name__ == "__main__":
-    main()

From e9fd9651a017a037330ff698f0768572d0d3f557 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 13:18:18 +0200
Subject: [PATCH 16/48] chore(evaluation): delete gates.py

---
 fireflyframework_agentic/evaluation/gates.py | 840 -------------------
 1 file changed, 840 deletions(-)
 delete mode 100644 fireflyframework_agentic/evaluation/gates.py

diff --git a/fireflyframework_agentic/evaluation/gates.py b/fireflyframework_agentic/evaluation/gates.py
deleted file mode 100644
index 057bfea7..00000000
--- a/fireflyframework_agentic/evaluation/gates.py
+++ /dev/null
@@ -1,840 +0,0 @@
-# Copyright 2026 Firefly Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Four gates — every gate always runs; a failure raises a flag, not a veto.
-
-Gate pipeline (EVALUATION_FRAMEWORK.md §6):
-    G1 — Structural & Safe
-    G2 — Must-finds & negative controls
-    G3 — Evidence (grounding)
-    G5 — No-regression / promotion (human decision)
-
-Each gate is a pure function of the result dict + supporting inputs.
-run_gates() always executes all four gates and returns all four results so
-the scorecard carries the complete picture regardless of which flags fire.
-"""
-
-from __future__ import annotations
-
-from dataclasses import dataclass, field
-
-from fireflyframework_agentic.evaluation import matcher
-from fireflyframework_agentic.evaluation.corpus import (
-    EMPTY,
-    FABRICATED,
-    SOURCE_UNKNOWN,
-    VERIFIED,
-    Corpus,
-    corpus_sha256,
-    verify_evidence_index,
-)
-from fireflyframework_agentic.evaluation.matcher import anchored, matches
-from fireflyframework_agentic.evaluation.registry import Registry, registry_sha256
-
-
-@dataclass
-class GateResult:
-    gate: str
-    passed: bool
-    reason_code: str = ""
-    details: dict = field(default_factory=dict)
-
-    def __str__(self) -> str:
-        status = "PASS" if self.passed else f"FLAG:{self.reason_code}"
-        return f"[{self.gate}] {status}"
-
-
-class Verdict:
-    """Promotion gate verdict constants.
-
-    Use ``Verdict.PROMOTE`` when the challenger meets the quality bar and
-    is safe to become the new champion.  Use ``Verdict.HOLD`` when the
-    challenger does not meet the bar and must be iterated on.
-    """
-
-    PROMOTE: str = "PROMOTE"
-    HOLD: str = "HOLD"
-
-
-def render_scorecard(gate_results: list[GateResult]) -> str:
-    """Render a human-readable scorecard from a list of GateResult objects.
-
-    Emits one line per gate: ``[G1] PASS`` or ``[G2] FLAG:RECALL_BELOW_FLOOR``.
-    The overall verdict (PROMOTE / HOLD) appears on the final line.  A run
-    promotes only when every gate passes; any flag signals HOLD.
-    """
-    lines = [str(r) for r in gate_results]
-    all_passed = all(r.passed for r in gate_results)
-    verdict = Verdict.PROMOTE if all_passed else Verdict.HOLD
-    lines.append(f"VERDICT: {verdict}")
-    return "\n".join(lines)
-
-
-def _build_evidence_index(result: dict, corpus: Corpus | None = None) -> dict[str, dict]:
-    """Index evidence by id; with a corpus, drop entries that fail verification.
-
-    Dropped entries (FABRICATED excerpt or SOURCE_UNKNOWN locator) cannot
-    contribute source stems to G2's shared-source guard or excerpts to G3's
-    grounding — a run cannot anchor anything on evidence it invented.  EMPTY
-    entries are kept: an empty excerpt is a format problem, not fabrication,
-    and its (verified) locator stem is still a legitimate citation.
-    """
-    index = {ev["id"]: ev for ev in result.get("evidence_index", []) if ev.get("id")}
-    if corpus is None:
-        return index
-    statuses = verify_evidence_index(corpus, result)
-    return {
-        eid: ev
-        for eid, ev in index.items()
-        if statuses[eid] in (VERIFIED, EMPTY)
-    }
-
-
-# ── G1: Structural & Safe ────────────────────────────────────────────────────
-
-
-def _name_duplication_rate(nodes: list[dict]) -> float:
-    """Tier-1 + Tier-2 name clustering; returns 1 - clusters/count.
-
-    Tier 1: same normalized id (lower-case) merges nodes into one cluster.
-    Tier 2: name token-Jaccard >= 0.6 merges nodes into one cluster.
-
-    Report-only: no gate flag fires on any threshold.
-    """
-    n = len(nodes)
-    if n < 2:
-        return 0.0
-
-    group = list(range(n))
-
-    def _root(i: int) -> int:
-        while group[i] != i:
-            group[i] = group[group[i]]
-            i = group[i]
-        return i
-
-    seen: dict[str, int] = {}
-    for i, node in enumerate(nodes):
-        nid = node.get("id", "").lower()
-        if nid in seen:
-            group[_root(i)] = _root(seen[nid])
-        else:
-            seen[nid] = i
-
-    toks = [frozenset(node.get("name", "").lower().split()) for node in nodes]
-    for i in range(n):
-        for j in range(i + 1, n):
-            a, b = toks[i], toks[j]
-            union_ab = a | b
-            if union_ab and len(a & b) / len(union_ab) >= 0.6:
-                group[_root(i)] = _root(j)
-
-    clusters = len({_root(i) for i in range(n)})
-    return round(1 - clusters / n, 4)
-
-
-def g1_structural(
-    result: dict,
-    registry: Registry,
-    registry_path: str,
-    *,
-    pii_list: list[str] | None = None,
-    corpus: Corpus | None = None,
-) -> GateResult:
-    """G1 — Structural & Safe (hard veto).
-
-    Checks (in order):
-    1. EMPTY_MUST_FIND — must run first; kills the fake-100%-champion bug.
-    2. Registry SHA-256 pin: loaded Registry matches the file on disk.
-    3. Corpus SHA-256 pin (when a corpus is supplied): same drift guard for
-       the evidence universe (CORPUS_DRIFT).
-    4. Required top-level keys present in result.
-    5. PII non-disclosure: no corpus PII name in any finding/report text.
-    """
-    # Guard 1: empty registry (fake-champion guard — always first)
-    if not registry.real_items:
-        return GateResult(
-            gate="G1",
-            passed=False,
-            reason_code="EMPTY_MUST_FIND",
-            details={"message": "Registry has zero real items — cannot evaluate recall."},
-        )
-
-    # Guard 2: registry SHA-256 pin
-    computed_sha = registry_sha256(registry_path)
-    if computed_sha != registry.sha256():
-        return GateResult(
-            gate="G1",
-            passed=False,
-            reason_code="GOLD_DRIFT",
-            details={
-                "message": "Registry file has changed since it was loaded.",
-                "expected": registry.sha256(),
-                "actual": computed_sha,
-            },
-        )
-
-    # Guard 3: corpus SHA-256 pin (CORPUS_DRIFT — the GOLD_DRIFT twin for evidence)
-    if corpus is not None:
-        current_corpus_sha = corpus_sha256(corpus.path)
-        if current_corpus_sha != corpus.sha256:
-            return GateResult(
-                gate="G1",
-                passed=False,
-                reason_code="CORPUS_DRIFT",
-                details={
-                    "message": "Corpus file has changed since it was loaded.",
-                    "expected": corpus.sha256,
-                    "actual": current_corpus_sha,
-                },
-            )
-
-    # Guard 4: required result keys
-    required = ("process_graph", "findings", "evidence_index")
-    missing = [k for k in required if k not in result]
-    if missing:
-        return GateResult(
-            gate="G1",
-            passed=False,
-            reason_code="SCHEMA_INVALID",
-            details={"missing_keys": missing},
-        )
-
-    # Guard 5: PII check
-    if pii_list:
-        free_text: list[str] = []
-        for finding in result.get("findings", []):
-            free_text.extend([finding.get("title", ""), finding.get("description", "")])
-        for report in result.get("reports", []):
-            free_text.append(str(report))
-        combined = " ".join(free_text).lower()
-        hits = [name for name in pii_list if name.lower() in combined]
-        if hits:
-            return GateResult(
-                gate="G1",
-                passed=False,
-                reason_code="PII_LEAK",
-                details={
-                    "message": "Corpus PII names found in findings/reports.",
-                    "matches": hits[:5],
-                },
-            )
-
-    pg = result.get("process_graph", {})
-    processes = pg.get("processes", [])
-    activities = [a for p in processes for a in p.get("activities", [])]
-    decisions = [d for p in processes for d in p.get("decisions", [])]
-    dg = result.get("dependency_graph", {})
-
-    details = {
-        "registry_sha256": registry.sha256(),
-        "real_items": len(registry.real_items),
-        "nc_items": len(registry.nc_items),
-        "map": {
-            "processes": {
-                "count": len(processes),
-                "duplication_rate": _name_duplication_rate(processes),
-            },
-            "activities": {
-                "count": len(activities),
-                "duplication_rate": _name_duplication_rate(activities),
-            },
-            "decisions": {
-                "count": len(decisions),
-                "duplication_rate": _name_duplication_rate(decisions),
-            },
-            "personas": {
-                "count": len(result.get("personas", [])),
-                "duplication_rate": _name_duplication_rate(result.get("personas", [])),
-            },
-            "systems": {
-                "count": len(result.get("systems", [])),
-                "duplication_rate": _name_duplication_rate(result.get("systems", [])),
-            },
-            "informal_channels": {
-                "count": len(result.get("informal_channels", [])),
-                "duplication_rate": _name_duplication_rate(result.get("informal_channels", [])),
-            },
-            "dependency_graph_edges": len(dg.get("activity_edges", [])),
-        },
-    }
-    if corpus is not None:
-        details["corpus_sha256"] = corpus.sha256
-    return GateResult(gate="G1", passed=True, details=details)
-
-
-# ── G2: Recall & Precision ───────────────────────────────────────────────────
-
-
-def _candidates_by_scope(result: dict) -> dict[str, list[dict]]:
-    """Build per-scope candidate lists from a DiscoveryResult (§4.3).
-
-    Process candidates are augmented with their children's evidence_refs because
-    process nodes typically carry no own refs — the source-document guard uses the
-    union of the process's own refs and all its activities' and decisions' refs.
-
-    dependency_graph-scoped items are relation items (all carry from/to) and are
-    matched via matcher.matches_dependency_graph_relation() — not through per-candidate
-    iteration — so no "dependency_graph" key is included here.
-    """
-    pg = result.get("process_graph", {})
-    processes = pg.get("processes", [])
-
-    def _merge_refs(proc: dict) -> dict:
-        children_refs = [
-            ref
-            for child_list in (proc.get("activities", []), proc.get("decisions", []))
-            for child in child_list
-            for ref in child.get("evidence_refs", [])
-        ]
-        return {**proc, "evidence_refs": list(proc.get("evidence_refs", [])) + children_refs}
-
-    return {
-        "process": [_merge_refs(p) for p in processes],
-        "activity": [a for p in processes for a in p.get("activities", [])],
-        "decision": [d for p in processes for d in p.get("decisions", [])],
-        "finding": result.get("findings", []),
-        "action": result.get("proposed_actions", []),
-        "persona": result.get("personas", []),
-        "system": result.get("systems", []),
-        "informal_channel": result.get("informal_channels", []),
-    }
-
-
-def _weighted_recall(scored_items: list, hits: dict[str, bool]) -> float:
-    """Weighted recall of a hit map over the scored (non-L3) items."""
-    total_weight = sum(item.weight for item in scored_items) or 1.0
-    weighted_hit = sum(item.weight for item in scored_items if hits[item.id])
-    return weighted_hit / total_weight
-
-
-def _finding_redundancy_rate(findings: list[dict]) -> float:
-    """Fraction of findings that are near-duplicates of another (Jaccard ≥0.6 on ≥5-char tokens)."""
-    if len(findings) < 2:
-        return 0.0
-    def _tok(text: str) -> frozenset[str]:
-        return frozenset(t.lower() for t in text.split() if len(t) >= 5)
-    token_sets = [_tok(f.get("description", "")) for f in findings]
-    in_redundant: set[int] = set()
-    for i in range(len(token_sets)):
-        for j in range(i + 1, len(token_sets)):
-            a, b = token_sets[i], token_sets[j]
-            union = a | b
-            sim = len(a & b) / len(union) if union else 1.0
-            if sim >= 0.6:
-                in_redundant.add(i)
-                in_redundant.add(j)
-    return round(len(in_redundant) / len(findings), 4)
-
-
-def g2_recall_precision(
-    result: dict,
-    registry: Registry,
-    *,
-    recall_floor: float = 0.70,
-    embed_fn=None,
-    tau: float = 0.70,
-    tau_nc: float = 0.85,
-    recall_metric: str = "lexical",
-    corpus: Corpus | None = None,
-) -> GateResult:
-    """G2 — Recall & Precision (hard veto).
-
-    - L0 miss  -> BLOCK (zeros the evaluation; regulatory-mandatory item absent)
-    - NC hit   -> BLOCK (precision failure; plausible-but-false item was emitted)
-    - recall < floor -> BLOCK
-
-    With a ``corpus``, evidence entries that fail verification (fabricated
-    excerpt or unknown source) are excluded from the evidence index before
-    matching, so the shared-source guard only accepts citations to real
-    corpus documents — a fabricated locator cannot satisfy any item.
-
-    ``recall_metric`` ("lexical"/"semantic"/"hybrid") selects which hit map GATES.
-    "lexical" is matcher.matches (shared-source + topic-anchored token overlap) and
-    needs no embedder.  "semantic"/"hybrid" add the embedding path (matcher.semantic_hits,
-    threshold ``tau`` for real items, ``tau_nc`` for NC items) and REQUIRE ``embed_fn``
-    — passing them without one raises ValueError (use "lexical" for the offline path).
-    When an embedder is supplied, all three recalls (lexical/semantic/hybrid) are
-    reported in details regardless of which one gates.
-    """
-    evidence_index = _build_evidence_index(result, corpus)
-    candidates = _candidates_by_scope(result)
-    findings = candidates["finding"]
-
-    # NC items anchor via the embedding path only (§6.2): a correct finding about
-    # the true mirror fact shares vocabulary with the false description, so a
-    # token or keyword match would falsely convict it.  Lexical NC is always False.
-    # dependency_graph relation items (those with from_node) use the endpoint
-    # matcher (§5.3b) instead of the per-candidate text predicate.
-    lexical: dict[str, bool] = {}
-    for item in registry.items:
-        if item.tier == "NC":
-            lexical[item.id] = False
-        elif item.scope == "dependency_graph" and item.from_node:
-            lexical[item.id] = matcher.matches_dependency_graph_relation(
-                item, result, evidence_index
-            )
-        else:
-            lexical[item.id] = any(
-                matches(c, item, evidence_index, scope=scope)
-                for scope in matcher.allowed_scopes(item)
-                for c in candidates.get(scope, [])
-            )
-
-    if recall_metric not in ("lexical", "semantic", "hybrid"):
-        raise ValueError(f"unknown recall_metric {recall_metric!r}")
-    if recall_metric in ("semantic", "hybrid") and embed_fn is None:
-        raise ValueError(
-            f"recall_metric={recall_metric!r} requires an embedder; pass embed_fn"
-        )
-
-    if embed_fn is not None:
-        semantic = matcher.semantic_hits(
-            candidates, registry.items, evidence_index, embed_fn, tau, tau_nc=tau_nc
-        )
-        # dependency_graph relation items have no embedding candidates (§5.3b uses
-        # the endpoint matcher, not per-candidate text embeddings); mirror the
-        # lexical result so semantic/hybrid never under-credits them.
-        for item in registry.items:
-            if item.scope == "dependency_graph" and item.from_node:
-                semantic[item.id] = lexical[item.id]
-    else:
-        semantic = None
-
-    metric = recall_metric
-
-    if semantic is None or metric == "lexical":
-        hits = lexical
-    elif metric == "semantic":
-        hits = semantic
-    else:  # hybrid
-        hits = {iid: lexical[iid] or semantic[iid] for iid in lexical}
-
-    # Signal-to-noise panel — report-only, §6.2 item 3
-    finding_count = len(findings)
-    finding_scoped_items = [i for i in registry.real_items if i.scope == "finding"]
-    findings_matched = sum(
-        1 for f in findings
-        if any(matches(f, item, evidence_index, scope="finding") for item in finding_scoped_items)
-    )
-    _sn = {
-        "finding_count": finding_count,
-        "findings_matched_to_registry": {
-            "count": findings_matched,
-            "fraction": round(findings_matched / finding_count, 4) if finding_count else 0.0,
-        },
-        "finding_redundancy_rate": _finding_redundancy_rate(findings),
-    }
-    if corpus is not None:
-        excluded = len(_build_evidence_index(result)) - len(evidence_index)
-        _sn["evidence_entries_excluded_unverified"] = excluded
-
-    # L0 misses
-    l0_misses = [item.id for item in registry.l0_items if not hits[item.id]]
-    if l0_misses:
-        return GateResult(
-            gate="G2",
-            passed=False,
-            reason_code="L0_MISSING",
-            details={
-                "l0_misses": l0_misses,
-                "message": "Regulatory-mandatory items not found — evaluation zeroed.",
-                **_sn,
-            },
-        )
-
-    # NC precision
-    nc_hits = [item.id for item in registry.nc_items if hits[item.id]]
-    if nc_hits:
-        return GateResult(
-            gate="G2",
-            passed=False,
-            reason_code="NC_HIT",
-            details={
-                "nc_hits": nc_hits,
-                "message": "Plausible-but-false negative control items were matched — precision failure.",
-                **_sn,
-            },
-        )
-
-    # Weighted recall — over scored items only (L0/L1/L2).  L3 is a bonus tier
-    # ("extra credit"): an L3 miss must not lower recall, so L3 is excluded from
-    # the denominator and only reported in per_tier below.  Recall is computed over
-    # the GATING hit map so the gate is internally consistent with the chosen metric.
-    real_items = registry.real_items
-    scored_items = [item for item in real_items if item.tier != "L3"]
-    recall = _weighted_recall(scored_items, hits)
-
-    per_tier: dict[str, dict] = {}
-    for tier in ("L0", "L1", "L2", "L3"):
-        tier_items = [i for i in real_items if i.tier == tier]
-        if not tier_items:
-            continue
-        per_tier[tier] = {
-            "hit": sum(1 for i in tier_items if hits[i.id]),
-            "total": len(tier_items),
-        }
-
-    def _semantic_details() -> dict:
-        """The extra recall-breakdown keys, only emitted when an embedder is given."""
-        if semantic is None:
-            return {}
-        return {
-            "lexical_recall": round(_weighted_recall(scored_items, lexical), 4),
-            "semantic_recall": round(_weighted_recall(scored_items, semantic), 4),
-            "hybrid_recall": round(
-                _weighted_recall(
-                    scored_items, {iid: lexical[iid] or semantic[iid] for iid in lexical}
-                ),
-                4,
-            ),
-            "tau": tau,
-        }
-
-    if recall < recall_floor:
-        return GateResult(
-            gate="G2",
-            passed=False,
-            reason_code="RECALL_BELOW_FLOOR",
-            details={
-                "recall": round(recall, 4),
-                "recall_metric": metric,
-                "floor": recall_floor,
-                "per_tier": per_tier,
-                "misses": [item.id for item in scored_items if not hits[item.id]],
-                **_semantic_details(),
-                **_sn,
-            },
-        )
-
-    return GateResult(
-        gate="G2",
-        passed=True,
-        details={
-            "recall": round(recall, 4),
-            "recall_metric": metric,
-            "floor": recall_floor,
-            "per_tier": per_tier,
-            "nc_items_checked": len(registry.nc_items),
-            **_semantic_details(),
-            **_sn,
-        },
-    )
-
-
-# ── G3: Grounded ─────────────────────────────────────────────────────────────
-
-
-def g3_grounded(
-    result: dict,
-    *,
-    grounding_floor: float = 0.90,
-    human_spot_check_n: int = 5,
-    corpus: Corpus | None = None,
-) -> GateResult:
-    """G3 — Grounded (automated portion; human spot-check triggered on pass).
-
-    For each finding, verifies that at least one cited evidence excerpt shares a
-    non-trivial token with the finding description (topic-anchoring).
-
-    With a ``corpus``, the gate also looks in a third direction — cited ->
-    exists: every evidence entry is verified against the actual corpus text
-    (corpus.verify_entry).  A populated excerpt not found in its cited source
-    raises EVIDENCE_FABRICATED; a locator resolving to no corpus document
-    raises EVIDENCE_SOURCE_UNKNOWN; and only verified excerpts can ground a
-    finding, so a run cannot ground itself on evidence it invented.
-
-    Also reports excerpt fill rate and source coverage so the reviewer can tell
-    whether ungrounded findings are a format problem (empty excerpts) or a real
-    faithfulness signal (populated excerpts that do not anchor).
-
-    Known limitation: topic-anchoring, not claim entailment.  A '45 days' claim
-    cited to a '3 days' source passes if they share the process name (excerpt
-    verification confirms the quote is real, not that the claim matches it).
-    The human spot-check is the binding faithfulness signal until NLI/AIS lands.
-    """
-    evidence_index = _build_evidence_index(result)
-    findings = result.get("findings", [])
-    statuses = verify_evidence_index(corpus, result) if corpus is not None else None
-
-    if not findings:
-        return GateResult(
-            gate="G3",
-            passed=False,
-            reason_code="NO_FINDINGS",
-            details={"message": "Result has zero findings — cannot compute grounding."},
-        )
-
-    grounded_ids: list[str] = []
-    # Ungrounded split (§6.3): distinguish format issues from real faithfulness failures.
-    ungrounded_empty_only: list[str] = []    # every ref had an empty excerpt
-    ungrounded_populated: list[str] = []     # had populated excerpt(s) but none anchored
-
-    # Excerpt fill: count all resolved refs and how many carry a non-empty excerpt.
-    total_refs = 0
-    populated_refs = 0
-
-    # Source coverage: which source stems are cited by at least one finding.
-    cited_stems: set[str] = set()
-
-    for finding in findings:
-        fid = finding.get("id", "?")
-        desc = finding.get("description", "")
-        is_grounded = False
-        had_populated = False
-        for ref in finding.get("evidence_refs", []):
-            ev = evidence_index.get(ref.get("evidence_id", ""))
-            if ev:
-                total_refs += 1
-                excerpt = ev.get("excerpt") or ""
-                if excerpt:
-                    populated_refs += 1
-                    had_populated = True
-                    # Track source coverage (even for ungrounded findings).
-                    stem = matcher.source_stem(ev.get("locator", ""))
-                    if stem:
-                        cited_stems.add(stem)
-                    # Only a corpus-verified excerpt can ground a finding.
-                    if statuses is not None and statuses.get(ev.get("id")) != VERIFIED:
-                        continue
-                    if anchored(desc, excerpt):
-                        is_grounded = True
-                        break
-        if is_grounded:
-            grounded_ids.append(fid)
-        elif had_populated:
-            ungrounded_populated.append(fid)
-        else:
-            ungrounded_empty_only.append(fid)
-
-    grounding_pct = len(grounded_ids) / len(findings)
-
-    # All source stems present in the evidence index (not just those cited).
-    all_stems: set[str] = set()
-    for ev in result.get("evidence_index", []):
-        stem = matcher.source_stem(ev.get("locator", ""))
-        if stem:
-            all_stems.add(stem)
-    orphaned = sorted(all_stems - cited_stems)
-
-    excerpt_fill = f"{populated_refs}/{total_refs}" if total_refs else "0/0"
-    source_coverage = f"{len(cited_stems)}/{len(all_stems)}" if all_stems else "0/0"
-
-    details = {
-        "grounding_pct": round(grounding_pct, 4),
-        "grounded": len(grounded_ids),
-        "total": len(findings),
-        "excerpt_fill": excerpt_fill,
-        "source_coverage": source_coverage,
-        "orphaned_sources": orphaned,
-    }
-
-    fabricated_ids: list[str] = []
-    unknown_source_ids: list[str] = []
-    if statuses is not None:
-        fabricated_ids = sorted(e for e, s in statuses.items() if s == FABRICATED)
-        unknown_source_ids = sorted(e for e, s in statuses.items() if s == SOURCE_UNKNOWN)
-        details["evidence_verification"] = {
-            "entries": len(statuses),
-            "verified": sum(1 for s in statuses.values() if s == VERIFIED),
-            "empty_excerpt": sum(1 for s in statuses.values() if s == EMPTY),
-            "fabricated": fabricated_ids,
-            "source_unknown": unknown_source_ids,
-        }
-
-    if fabricated_ids:
-        details["message"] = (
-            "Populated excerpt(s) not found in the cited corpus document — "
-            "the run asserts evidence the source does not contain."
-        )
-        return GateResult(
-            gate="G3", passed=False, reason_code="EVIDENCE_FABRICATED", details=details
-        )
-
-    if unknown_source_ids:
-        details["message"] = (
-            "Evidence locator(s) resolve to no corpus document — either the "
-            "corpus bundle is incomplete or the run invented a source."
-        )
-        return GateResult(
-            gate="G3", passed=False, reason_code="EVIDENCE_SOURCE_UNKNOWN", details=details
-        )
-
-    if grounding_pct < grounding_floor:
-        details["floor"] = grounding_floor
-        details["ungrounded_with_populated_excerpts"] = ungrounded_populated
-        details["ungrounded_with_empty_excerpts_only"] = ungrounded_empty_only
-        return GateResult(gate="G3", passed=False, reason_code="UNGROUNDED", details=details)
-
-    spot_n = min(human_spot_check_n, len(findings))
-    details["human_spot_check"] = (
-        f"ACTION REQUIRED: manually review {spot_n} sampled findings for "
-        "field-consistency, citation-accuracy, and client-readiness.  "
-        "This is the binding faithfulness signal until NLI/AIS lands."
-    )
-    return GateResult(gate="G3", passed=True, details=details)
-
-
-# ── G5: No-regression / promotion (human decision) ───────────────────────────
-
-
-def g5_no_regression(
-    candidate_scores: dict[str, float],
-    champion_scores: dict[str, float] | None,
-    aa_noise: dict[str, float] | None,
-    *,
-    is_day_zero: bool = False,
-    human_signed_off: bool = False,
-    signoff_count: int = 0,
-) -> GateResult:
-    """G5 — No-regression / promotion gate (human decision).
-
-    Day-Zero: no champion exists.  Requires G1-G3 pass + 2 independent sign-offs.
-    Normal promotion: candidate must beat champion by > aa_noise on every metric,
-    no guardrail regresses, + 1 human sign-off.
-
-    Champions are per-corpus.  Do not compare across corpora.
-    """
-    if is_day_zero or champion_scores is None:
-        required = 2
-        if signoff_count < required:
-            return GateResult(
-                gate="G5",
-                passed=False,
-                reason_code="HOLD",
-                details={
-                    "reason": (
-                        f"Day-Zero requires {required} independent human sign-offs "
-                        f"(kappa >= 0.70); got {signoff_count}."
-                    ),
-                    "action": "Collect sign-offs, then re-run with --day-zero --signoffs 2",
-                },
-            )
-        return GateResult(
-            gate="G5",
-            passed=True,
-            details={"day_zero": True, "signoffs": signoff_count},
-        )
-
-    if not human_signed_off:
-        return GateResult(
-            gate="G5",
-            passed=False,
-            reason_code="HOLD",
-            details={"reason": "Human sign-off required for promotion."},
-        )
-
-    noise = aa_noise or {}
-    regressions: list[str] = []
-    improvements: list[str] = []
-
-    for metric, cand_val in candidate_scores.items():
-        champ_val = champion_scores.get(metric)
-        if champ_val is None:
-            continue
-        delta = cand_val - champ_val
-        band = noise.get(metric, 0.0)
-        if delta < -band:
-            regressions.append(
-                f"{metric}: candidate={cand_val:.4f} champion={champ_val:.4f} "
-                f"delta={delta:+.4f} < -band={-band:.4f}"
-            )
-        elif delta > band:
-            improvements.append(f"{metric}: delta={delta:+.4f} > band={band:.4f}")
-
-    if regressions:
-        return GateResult(
-            gate="G5",
-            passed=False,
-            reason_code="HOLD",
-            details={
-                "regressions": regressions,
-                "improvements": improvements,
-                "message": "Guardrail metric(s) regressed beyond A/A noise band.",
-            },
-        )
-
-    return GateResult(
-        gate="G5",
-        passed=True,
-        details={"improvements": improvements, "noise_band": noise},
-    )
-
-
-# ── Full gate pipeline ────────────────────────────────────────────────────────
-
-
-def run_gates(
-    result: dict,
-    registry: Registry,
-    registry_path: str,
-    *,
-    pii_list: list[str] | None = None,
-    recall_floor: float = 0.70,
-    grounding_floor: float = 0.90,
-    champion_scores: dict[str, float] | None = None,
-    aa_noise: dict[str, float] | None = None,
-    is_day_zero: bool = False,
-    human_signed_off: bool = False,
-    signoff_count: int = 0,
-    embed_fn=None,
-    tau: float = 0.70,
-    tau_nc: float = 0.85,
-    recall_metric: str = "lexical",
-    corpus: Corpus | None = None,
-) -> list[GateResult]:
-    """Run all gates G1 -> G2 -> G3 -> G5; every gate always executes.
-
-    A failed gate raises a flag in its GateResult but never prevents the
-    remaining gates from running.  The scorecard therefore always carries the
-    complete picture: a run that misses a regulatory item *and* grounds poorly
-    shows both flags.  See EVALUATION_FRAMEWORK.md §2 ('No gate vetoes').
-
-    ``corpus`` (optional) enables deterministic evidence verification: G1 pins
-    the corpus hash, G2 ignores unverified evidence entries, and G3 flags
-    fabricated excerpts and unknown sources.  Without it, evidence is taken at
-    face value from the run's own evidence_index (disclosed on the scorecard).
-
-    Returns all four GateResult objects.
-    """
-    g1 = g1_structural(result, registry, registry_path, pii_list=pii_list, corpus=corpus)
-
-    g2 = g2_recall_precision(
-        result,
-        registry,
-        recall_floor=recall_floor,
-        embed_fn=embed_fn,
-        tau=tau,
-        tau_nc=tau_nc,
-        recall_metric=recall_metric,
-        corpus=corpus,
-    )
-
-    g3 = g3_grounded(result, grounding_floor=grounding_floor, corpus=corpus)
-
-    # G5 uses whatever scores G2/G3 produced; 0.0 when a gate flagged and did
-    # not emit the metric (e.g. L0_MISSING returns before computing recall).
-    candidate_scores = {
-        "recall": g2.details.get("recall", 0.0),
-        "grounding_pct": g3.details.get("grounding_pct", 0.0),
-    }
-    g5 = g5_no_regression(
-        candidate_scores,
-        champion_scores,
-        aa_noise,
-        is_day_zero=is_day_zero,
-        human_signed_off=human_signed_off,
-        signoff_count=signoff_count,
-    )
-
-    return [g1, g2, g3, g5]

From 38c3f60f5109d559c6fe385c1b12eea878282f2e Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 13:18:23 +0200
Subject: [PATCH 17/48] chore(evaluation): delete corpus.py

---
 fireflyframework_agentic/evaluation/corpus.py | 185 ------------------
 1 file changed, 185 deletions(-)
 delete mode 100644 fireflyframework_agentic/evaluation/corpus.py

diff --git a/fireflyframework_agentic/evaluation/corpus.py b/fireflyframework_agentic/evaluation/corpus.py
deleted file mode 100644
index 32835f2c..00000000
--- a/fireflyframework_agentic/evaluation/corpus.py
+++ /dev/null
@@ -1,185 +0,0 @@
-# Copyright 2026 Firefly Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Corpus loading and evidence verification (EVALUATION_FRAMEWORK.md §6.3).
-
-The corpus is the third pinned evaluation input, next to the DiscoveryResult
-and the registry: the raw document bundle (input.json) the discovery pipeline
-read.  It is the trusted side of every evidence anchor — the registry tells
-the evaluator what *should* be found; only the corpus can tell it whether what
-a run cited is *real*.
-
-verify_entry() closes the fabricated-evidence channel: a run controls every
-byte of its own evidence_index, so any check computable from (result, registry)
-alone can be satisfied by self-reported evidence.  Checking each excerpt
-against the actual corpus text is the only deterministic counter.
-
-Excerpt contract: excerpts are verbatim quotes from the source document.
-Spliced quotes (fragments joined with '...' or '…') are supported — each
-fragment is verified independently.  Paraphrase belongs in the finding
-description, never in an excerpt.
-"""
-
-from __future__ import annotations
-
-import base64
-import difflib
-import hashlib
-import json
-import re
-import unicodedata
-from dataclasses import dataclass
-from pathlib import Path
-
-from fireflyframework_agentic.evaluation.matcher import source_stem
-
-# Verification statuses for one evidence_index entry.
-VERIFIED = "verified"  # excerpt found (verbatim or spliced) in the cited source
-EMPTY = "empty"  # entry carries no excerpt text — nothing to verify
-SOURCE_UNKNOWN = "source_unknown"  # locator resolves to no corpus document
-FABRICATED = "fabricated"  # populated excerpt not found in the cited source
-
-# A spliced excerpt is split on these joiners; fragments shorter than
-# _MIN_FRAGMENT_CHARS are too generic to verify and are skipped.
-_SPLICE_PATTERN = re.compile(r"\.\.\.|…| -- ")
-_MIN_FRAGMENT_CHARS = 15
-
-# A fragment passes fuzzily when matching blocks (>= _MIN_BLOCK_CHARS chars)
-# cover at least _COVERAGE_THRESHOLD of it — tolerates punctuation/whitespace
-# drift while rejecting invented text (measured ~0.10-0.32 coverage).
-_COVERAGE_THRESHOLD = 0.85
-_MIN_BLOCK_CHARS = 4
-
-
-@dataclass
-class Corpus:
-    """The decoded, normalized corpus: {source stem: normalized text}.
-
-    sha256 pins the corpus file exactly like the registry pin (§4.6): the
-    champion record stores it, and G1 re-hashes the file at scoring time to
-    flag CORPUS_DRIFT.
-    """
-
-    texts: dict[str, str]
-    sha256: str
-    path: str
-
-
-def normalize(text: str) -> str:
-    """Normalize text for excerpt matching: NFKC, strip markdown emphasis and
-    smart quotes, collapse whitespace, casefold."""
-    text = unicodedata.normalize("NFKC", text)
-    text = text.replace("**", "").replace("*", "")
-    text = re.sub(r"[\"""''']", "", text)
-    return re.sub(r"\s+", " ", text).strip().casefold()
-
-
-def corpus_sha256(path: str | Path) -> str:
-    """SHA-256 of the corpus file on disk (the CORPUS_DRIFT re-hash)."""
-    return hashlib.sha256(Path(path).read_bytes()).hexdigest()
-
-
-def load_corpus(path: str | Path) -> Corpus:
-    """Load a FlyRadar input.json bundle into a stem-indexed normalized Corpus.
-
-    Decodes every artifacts[] file and signals[] event log (base64), normalizes
-    the text, and keys each by the same source_stem the matcher uses — so a
-    locator in any convention resolves to its document.
-
-    Raises:
-        ValueError: when the bundle contains no documents, or two documents
-            reduce to the same stem (a collision would let a fabricated
-            citation resolve against the wrong real file).
-    """
-    path = Path(path)
-    raw = json.loads(path.read_text(encoding="utf-8"))
-
-    named_contents: list[tuple[str, str]] = []
-    for artifact in raw.get("artifacts", []):
-        named_contents.append((artifact["filename"], artifact["content_base64"]))
-    for signal in raw.get("signals", []):
-        named_contents.append((signal["name"], signal["content_base64"]))
-
-    if not named_contents:
-        raise ValueError(f"corpus bundle {path} contains no artifacts or signals")
-
-    texts: dict[str, str] = {}
-    for name, content_b64 in named_contents:
-        stem = source_stem(name)
-        if stem in texts:
-            raise ValueError(
-                f"corpus stem collision: two documents reduce to {stem!r} — "
-                "rename one; a collision would verify citations against the wrong file"
-            )
-        decoded = base64.b64decode(content_b64).decode("utf-8", errors="replace")
-        texts[stem] = normalize(decoded)
-
-    return Corpus(texts=texts, sha256=corpus_sha256(path), path=str(path))
-
-
-def _fragment_coverage(fragment: str, source: str) -> float:
-    """Fraction of fragment covered by matching blocks of >= _MIN_BLOCK_CHARS chars."""
-    blocks = difflib.SequenceMatcher(
-        None, fragment, source, autojunk=False
-    ).get_matching_blocks()
-    covered = sum(b.size for b in blocks if b.size >= _MIN_BLOCK_CHARS)
-    return covered / len(fragment)
-
-
-def verify_entry(corpus: Corpus, entry: dict) -> str:
-    """Verify one evidence_index entry against the corpus.
-
-    Returns one of VERIFIED / EMPTY / SOURCE_UNKNOWN / FABRICATED:
-    - the locator must resolve (by source stem) to a corpus document, and
-    - every fragment of the excerpt must appear in that document's text,
-      verbatim after normalization or with matching-block coverage >=
-      _COVERAGE_THRESHOLD.
-
-    The score is the minimum over fragments, so one invented fragment sinks a
-    spliced excerpt.
-
-    """
-    stem = source_stem(entry.get("locator", ""))
-    source = corpus.texts.get(stem)
-    if source is None:
-        return SOURCE_UNKNOWN
-
-    excerpt = normalize(entry.get("excerpt") or "")
-    if not excerpt:
-        return EMPTY
-
-    fragments = [
-        f.strip()
-        for f in _SPLICE_PATTERN.split(excerpt)
-        if len(f.strip()) >= _MIN_FRAGMENT_CHARS
-    ] or [excerpt]
-
-    for fragment in fragments:
-        if fragment in source:
-            continue
-        if _fragment_coverage(fragment, source) < _COVERAGE_THRESHOLD:
-            return FABRICATED
-    return VERIFIED
-
-
-def verify_evidence_index(corpus: Corpus, result: dict) -> dict[str, str]:
-    """Verify every evidence_index entry of a DiscoveryResult.
-
-    Returns {evidence_id: status} over all entries — referenced or not — so
-    the gates share one verification pass.
-    """
-    return {
-        ev["id"]: verify_entry(corpus, ev)
-        for ev in result.get("evidence_index", [])
-        if ev.get("id")
-    }

From f81992336b5b0932a3d65c3acfc2b8a439a27a1b Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 13:18:30 +0200
Subject: [PATCH 18/48] chore(evaluation): delete registry.py

---
 .../evaluation/registry.py                    | 214 ------------------
 1 file changed, 214 deletions(-)
 delete mode 100644 fireflyframework_agentic/evaluation/registry.py

diff --git a/fireflyframework_agentic/evaluation/registry.py b/fireflyframework_agentic/evaluation/registry.py
deleted file mode 100644
index 2b869ba9..00000000
--- a/fireflyframework_agentic/evaluation/registry.py
+++ /dev/null
@@ -1,214 +0,0 @@
-# Copyright 2026 Firefly Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""lean-1 registry loader — one schema for all four corpora.
-
-Replaces the four mutually incompatible schemes in use today (L1-L5,
-documented/observed/pain-point, critical/important, and no tiers).
-Loader enforces all invariants; they are not documentation.
-
-Invariants (EVALUATION_FRAMEWORK.md, the must-find registry):
-- schema_version == "lean-1"
-- every tier is one of L0 L1 L2 L3 NC
-- negative_control_count >= ceil(real_items / 10)
-- kappa present (0.0 placeholder allowed; G2 advisory until >= 0.70)
-- ABANCA DILO items must target a single measured sub-population
-"""
-from __future__ import annotations
-
-import hashlib
-import json
-import math
-from dataclasses import dataclass, field
-from pathlib import Path
-from typing import Literal
-
-VALID_TIERS = ("L0", "L1", "L2", "L3", "NC")
-VALID_SCOPES = (
-    "process", "activity", "decision", "finding", "action",
-    "persona", "system", "informal_channel", "dependency_graph",
-)
-SCHEMA_VERSION = "lean-1"
-KAPPA_ADVISORY_THRESHOLD = 0.70
-
-
-@dataclass(frozen=True)
-class RegistryItem:
-    id: str
-    tier: Literal["L0", "L1", "L2", "L3", "NC"]
-    description: str
-    evidence: list[str]          # source file paths (path portion of locator, no #page=N)
-    scope: str = "finding"       # which DiscoveryResult surface to match against (§4.3)
-    keywords: list[str] = field(default_factory=list)
-    weight: float = 1.0
-    from_node: str = ""   # dependency_graph relation items only
-    to_node: str = ""     # dependency_graph relation items only
-    relation: str = ""    # defaults to "precedes" when from/to present
-
-
-@dataclass(frozen=True)
-class Registry:
-    schema_version: str
-    corpus: str
-    author: str
-    date: str
-    kappa: float
-    items: list[RegistryItem]
-    _sha256: str = field(default="", compare=False)
-
-    @property
-    def real_items(self) -> list[RegistryItem]:
-        return [i for i in self.items if i.tier != "NC"]
-
-    @property
-    def nc_items(self) -> list[RegistryItem]:
-        return [i for i in self.items if i.tier == "NC"]
-
-    @property
-    def l0_items(self) -> list[RegistryItem]:
-        return [i for i in self.items if i.tier == "L0"]
-
-    def is_kappa_advisory(self) -> bool:
-        return self.kappa < KAPPA_ADVISORY_THRESHOLD
-
-    def sha256(self) -> str:
-        return self._sha256
-
-
-def _validate(raw: dict, path: Path) -> None:
-    if raw.get("schema_version") != SCHEMA_VERSION:
-        raise ValueError(
-            f"{path.name}: schema_version must be '{SCHEMA_VERSION}', "
-            f"got {raw.get('schema_version')!r}"
-        )
-    for fname in ("corpus", "author", "date"):
-        if not raw.get(fname):
-            raise ValueError(f"{path.name}: missing required field '{fname}'")
-    if "kappa" not in raw:
-        raise ValueError(f"{path.name}: missing 'kappa' field (use 0.0 as placeholder)")
-
-    items = raw.get("items", [])
-
-    # EMPTY_MUST_FIND guard — must be first; kills fake-champion bug
-    if not items:
-        raise ValueError(
-            f"{path.name}: EMPTY_MUST_FIND — items list is empty; "
-            "cannot evaluate recall.  This guard exists to prevent the "
-            "fake-100%-champion failure."
-        )
-
-    ids = [it.get("id") for it in items]
-    if len(ids) != len(set(ids)):
-        dupes = sorted({i for i in ids if ids.count(i) > 1})
-        raise ValueError(f"{path.name}: duplicate item ids: {dupes}")
-
-    for it in items:
-        tier = it.get("tier")
-        if tier not in VALID_TIERS:
-            raise ValueError(
-                f"{path.name}: item '{it.get('id')}' has invalid tier '{tier}'; "
-                f"must be one of {VALID_TIERS}"
-            )
-        scope = it.get("scope", "finding")
-        if scope not in VALID_SCOPES:
-            raise ValueError(
-                f"{path.name}: item '{it.get('id')}' has invalid scope '{scope}'; "
-                f"must be one of {VALID_SCOPES}"
-            )
-        if scope == "dependency_graph":
-            if not it.get("from") or not it.get("to"):
-                raise ValueError(
-                    f"{path.name}: dependency_graph item '{it.get('id')}' must have "
-                    "non-empty 'from' and 'to'"
-                )
-        else:
-            if "from" in it or "to" in it or "relation" in it:
-                raise ValueError(
-                    f"{path.name}: item '{it.get('id')}' has 'from'/'to'/'relation' "
-                    f"but scope is '{scope}'; these fields are only valid on "
-                    "dependency_graph-scoped items"
-                )
-
-    real_count = sum(1 for it in items if it.get("tier") != "NC")
-    nc_count = sum(1 for it in items if it.get("tier") == "NC")
-    required_nc = max(1, math.ceil(real_count / 10))
-    if nc_count < required_nc:
-        raise ValueError(
-            f"{path.name}: NC density too low — {nc_count} NC item(s) for "
-            f"{real_count} real items; need >= {required_nc} (ceil(real/10)).  "
-            "Without NC items the eval measures recall only; a verbose hallucinator "
-            "scores perfectly."
-        )
-
-    # ABANCA DILO blend guard: items must assert a single sub-population target.
-    # Checks for phrases that would indicate a blended numeric target is asserted.
-    # "blend" alone is too broad (items may reference it negatively).
-    BLEND_PHRASES = ("combined distribution", "across all offices regardless of segment")
-    for it in items:
-        if it.get("tier") == "NC":
-            continue
-        desc = it.get("description", "").lower()
-        iid = it.get("id", "")
-        if any(phrase in desc for phrase in BLEND_PHRASES):
-            raise ValueError(
-                f"{path.name}: item '{iid}' description targets a blended distribution; "
-                "ABANCA DILO items must target a single measured sub-population "
-                "(Empresas or PyMEs).  Use segment-keyed items: "
-                "dilo-empresas-operativa-42pct AND dilo-pymes-operativa-29pct separately."
-            )
-
-
-def _compute_sha256(path: Path) -> str:
-    return hashlib.sha256(path.read_bytes()).hexdigest()
-
-
-def load_registry(path: str | Path) -> Registry:
-    """Load and validate a lean-1 registry file.
-
-    Raises ValueError with a descriptive message on any invariant violation.
-    The EMPTY_MUST_FIND check runs first — it is the fake-champion guard.
-    """
-    path = Path(path)
-    raw = json.loads(path.read_text(encoding="utf-8"))
-    _validate(raw, path)
-    sha = _compute_sha256(path)
-
-    items = [
-        RegistryItem(
-            id=it["id"],
-            tier=it["tier"],
-            scope=it.get("scope", "finding"),
-            description=it.get("description", ""),
-            evidence=it.get("evidence", []),
-            keywords=it.get("keywords", []),
-            weight=float(it.get("weight", 1.0)),
-            from_node=it.get("from", "") if it.get("scope") == "dependency_graph" else "",
-            to_node=it.get("to", "") if it.get("scope") == "dependency_graph" else "",
-            relation=it.get("relation", "precedes") if it.get("scope") == "dependency_graph" else "",
-        )
-        for it in raw["items"]
-    ]
-
-    return Registry(
-        schema_version=raw["schema_version"],
-        corpus=raw["corpus"],
-        author=raw["author"],
-        date=raw["date"],
-        kappa=float(raw["kappa"] or 0.0),
-        items=items,
-        _sha256=sha,
-    )
-
-
-def registry_sha256(path: str | Path) -> str:
-    return _compute_sha256(Path(path))

From 3bc07861bafefab41c04e6e4697be779097b3f49 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 13:18:36 +0200
Subject: [PATCH 19/48] chore(evaluation): delete matcher.py

---
 .../evaluation/matcher.py                     | 369 ------------------
 1 file changed, 369 deletions(-)
 delete mode 100644 fireflyframework_agentic/evaluation/matcher.py

diff --git a/fireflyframework_agentic/evaluation/matcher.py b/fireflyframework_agentic/evaluation/matcher.py
deleted file mode 100644
index b4d81f44..00000000
--- a/fireflyframework_agentic/evaluation/matcher.py
+++ /dev/null
@@ -1,369 +0,0 @@
-# Copyright 2026 Firefly Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Single matching primitive reused across G2 (recall/precision) and G3 (grounding).
-
-anchored() is topic-level lexical overlap.  matches() is the gate predicate.
-One function, three uses — do not write three matching functions.
-
-Known limitation (EVALUATION_FRAMEWORK.md): anchored() is topic-anchored, not claim-verified.
-A '45 days' claim cited to a '3 days' source passes if they share the process name.
-Real claim entailment (NLI/AIS) is Phase 2.  The G3 human spot-check is the
-binding faithfulness signal until then.
-"""
-
-from __future__ import annotations
-
-import re
-
-import numpy as np
-
-from fireflyframework_agentic.evaluation.judge_client import cosine
-
-
-def tokens(text: str) -> list[str]:
-    return re.findall(r"\b\w+\b", text.lower())
-
-
-def anchored(claim: str, evidence: str, *, min_token: int = 5) -> bool:
-    """True if claim and evidence share at least one non-trivial token (>= min_token chars).
-
-    Rejects a citation to an unrelated document.  Does NOT verify the claim value —
-    that gap is closed by the deferred NLI/AIS check in Phase 2.
-    """
-    a = {t for t in tokens(claim) if len(t) >= min_token}
-    b = {t for t in tokens(evidence) if len(t) >= min_token}
-    return bool(a & b)
-
-
-def source_stem(locator: str) -> str:
-    """Normalize a locator/source path to a stable document stem for matching.
-
-    Robust to the two locator conventions observed across runs:
-    - directory-prefixed ('sops/SOP-002-kyc-edd.md') and bare ('SOP-002-kyc-edd.md')
-      both reduce to 'sop-002-kyc-edd';
-    - event-log row ids ('src-credit-underwriting:CU-2026-1003') reduce to the
-      process stem 'credit-underwriting', so they join the CSV the registry cites.
-
-    Preserves the same-document anti-gaming property of matches(): it still keys
-    on which source document a finding cites — just independent of directory
-    prefix, file extension, and case, so one registry scores every run.
-    """
-    s = locator.split("#")[0]  # drop the locator fragment (#page=N, #anchor)
-    s = s.rsplit("/", 1)[-1]  # basename — strip any directory prefix
-    if s.startswith("src-") and ":" in s:  # event-log row id: src-<process>:<case>
-        return s.split(":", 1)[0][len("src-") :].lower()
-    if "." in s:  # strip a trailing file extension
-        s = s.rsplit(".", 1)[0]
-    return s.lower()
-
-
-def _finding_sources(finding: dict, evidence_index: dict[str, dict]) -> set[str]:
-    """Return the set of normalized source-document stems cited by a finding."""
-    sources: set[str] = set()
-    for ref in finding.get("evidence_refs", []):
-        ev = evidence_index.get(ref.get("evidence_id", ""))
-        if ev:
-            stem = source_stem(ev.get("locator", ""))
-            if stem:
-                sources.add(stem)
-    return sources
-
-
-def shares_source(finding: dict, item, evidence_index: dict[str, dict]) -> bool:
-    """True iff the finding cites at least one source document the item lists as evidence.
-
-    Source documents are compared by normalized stem (source_stem) so one registry
-    scores every run regardless of locator convention.  This is the anti-gaming
-    anchor reused by both the lexical predicate (matches) and the semantic path
-    (semantic_hits): a finding on a different document cannot satisfy this item.
-
-    Spec-style NC items list their mirror source (§4.1); legacy NC items carry
-    evidence=[], which makes this always False for them.
-
-    Args:
-        finding: dict from DiscoveryResult.findings[i] (model_dump output).
-        item: RegistryItem dataclass from registry.py.
-        evidence_index: {evidence_id: Evidence dict} built from result['evidence_index'].
-    """
-    finding_sources = _finding_sources(finding, evidence_index)
-    item_sources = {source_stem(e) for e in item.evidence}
-    return bool(finding_sources & item_sources)
-
-
-def _keyword_anchored(desc: str, keywords: list[str]) -> bool:
-    """True iff any keyword appears as a whole word in desc (case-insensitive).
-
-    Keyword rail: exempt from the 5-char token floor so short banking terms
-    (KYC, PEP, AML) can anchor a match even though they are too short for the
-    token rail.  Whole-word matching prevents false substring hits (e.g. "risk"
-    inside "enterprise-risk-management").
-    """
-    if not keywords:
-        return False
-    desc_lower = desc.lower()
-    return any(
-        re.search(r"\b" + re.escape(kw.lower()) + r"\b", desc_lower) for kw in keywords
-    )
-
-
-def candidate_text(candidate: dict, scope: str) -> str:
-    """Extract the searchable text from a candidate on the given scope surface (§4.3).
-
-    Each scope surface uses different fields as the match text:
-    - finding / action      : title + description
-    - process / decision    : name + description
-    - activity              : name + notes + regulatory_links
-    - persona               : name + role + goals + pain_points
-    - system                : name + description
-    - informal_channel      : name + usage_context + notes
-    - dependency_graph      : name + description (diagnostic nodes; relation items bypass this)
-    """
-    if scope in ("finding", "action"):
-        return " ".join(filter(None, [candidate.get("title", ""), candidate.get("description", "")]))
-    if scope == "activity":
-        rl = candidate.get("regulatory_links") or []
-        rl_str = " ".join(rl) if isinstance(rl, list) else str(rl or "")
-        return " ".join(filter(None, [candidate.get("name", ""), candidate.get("notes", ""), rl_str]))
-    if scope == "persona":
-        goals = candidate.get("goals") or []
-        pain = candidate.get("pain_points") or []
-        goals_str = " ".join(goals) if isinstance(goals, list) else str(goals)
-        pain_str = " ".join(pain) if isinstance(pain, list) else str(pain)
-        return " ".join(filter(None, [
-            candidate.get("name", ""),
-            candidate.get("role", ""),
-            goals_str,
-            pain_str,
-        ]))
-    if scope == "informal_channel":
-        return " ".join(filter(None, [
-            candidate.get("name", ""),
-            candidate.get("usage_context", ""),
-            candidate.get("notes", ""),
-        ]))
-    # process, decision, system, dependency_graph (diagnostic nodes)
-    return " ".join(filter(None, [candidate.get("name", ""), candidate.get("description", "")]))
-
-
-INSIGHT_ITEM_SCOPES = ("finding", "action")
-INSIGHT_MATCH_SURFACES = ("finding", "action", "activity", "decision")
-
-
-def allowed_scopes(item) -> tuple[str, ...]:
-    """Candidate surfaces that may satisfy a registry item.
-
-    Insight items (finding / action) may be satisfied by any insight or process-graph
-    *leaf* surface (activity / decision): a run often grounds the same operational fact
-    on a different surface than the registry's scope tag anticipates (the BBVA case —
-    pain points the registry tags 'finding' that the run emitted as decision/activity
-    nodes).  shares_source is still REQUIRED on every candidate (see matches /
-    semantic_hits), so a candidate on the wrong document never counts — cross-scope
-    widens WHERE we look, never the source anchor.
-
-    Structural items (process / activity / decision) stay on their own surface: a
-    structural must-find requires the run to have actually built that node, not merely
-    mentioned the fact in a finding (test_process_scope_miss_when_no_matching_process).
-    NC items are likewise scope-strict — widening a negative control's pool could only
-    make it easier to trip (a specificity regression), never recover a legitimate hit.
-
-    `process` is never a match surface for an insight item: _candidates_by_scope folds
-    every child's evidence_refs into the process node, so its citation set is a union of
-    many documents and shares_source goes vacuous (hence its exclusion from
-    INSIGHT_MATCH_SURFACES).
-    """
-    if item.tier == "NC":
-        return (item.scope,)
-    if item.scope in INSIGHT_ITEM_SCOPES:
-        return INSIGHT_MATCH_SURFACES
-    return (item.scope,)
-
-
-def matches(
-    candidate: dict,
-    item,
-    evidence_index: dict[str, dict],
-    scope: str = "finding",
-) -> bool:
-    """True iff candidate cites a shared source document AND is topic-anchored to item.
-
-    Two-rail anchor (either rail suffices):
-    - Token rail: ≥1 shared token of ≥5 chars between candidate text and item description.
-    - Keyword rail: ≥1 item keyword appears as a whole word in the candidate text.
-      Exempt from the 5-char floor so short banking terms (KYC, PEP, AML) can anchor.
-
-    The ``scope`` controls which fields are read as the candidate's match text (§4.3):
-    findings and actions use ``title + description``; processes and decisions use
-    ``name + description``; activities use ``name + notes + regulatory_links``.
-
-    Anti-gaming guard: a candidate on a different document cannot satisfy this item
-    even if its text happens to match.  Source documents are compared by
-    normalized stem (source_stem) so one registry scores every run regardless of
-    locator convention.
-
-    Args:
-        candidate: dict from the DiscoveryResult surface matching ``scope``.
-        item: RegistryItem dataclass from registry.py.
-        evidence_index: {evidence_id: Evidence dict} built from result['evidence_index'].
-        scope: surface the candidate was drawn from (default "finding").
-    """
-    if not shares_source(candidate, item, evidence_index):
-        return False
-    desc = candidate_text(candidate, scope)
-    return _keyword_anchored(desc, list(item.keywords or [])) or anchored(desc, item.description)
-
-
-def matches_dependency_graph_relation(
-    item,
-    result: dict,
-    evidence_index: dict[str, dict],
-) -> bool:
-    """Endpoint matcher for dependency_graph relation items (§5.3b).
-
-    Stage 1: Anchor both endpoints to activity nodes via token rail.
-    Stage 2: Verify a directed edge or path connects them in the asserted direction,
-             behind the shared-source guard on the edge's/path's evidence_refs.
-
-    Returns False when either endpoint anchors to no activity, or when no connecting
-    edge/path shares a source document with the item.
-    """
-    if not item.from_node or not item.to_node:
-        return False
-
-    processes = result.get("process_graph", {}).get("processes", [])
-    all_activities = [a for p in processes for a in p.get("activities", [])]
-
-    def _anchor(endpoint_text: str) -> set[str]:
-        return {
-            a["id"]
-            for a in all_activities
-            if a.get("id") and anchored(candidate_text(a, "activity"), endpoint_text)
-        }
-
-    from_ids = _anchor(item.from_node)
-    to_ids = _anchor(item.to_node)
-    if not from_ids or not to_ids:
-        return False
-
-    item_stems = {source_stem(e) for e in item.evidence}
-
-    def _node_stems(node: dict) -> set[str]:
-        return {
-            source_stem(evidence_index[r["evidence_id"]].get("locator", ""))
-            for r in node.get("evidence_refs", [])
-            if r.get("evidence_id") in evidence_index
-        }
-
-    dg = result.get("dependency_graph", {})
-
-    for edge in dg.get("activity_edges", []):
-        if edge.get("from_node") in from_ids and edge.get("to_node") in to_ids:
-            if _node_stems(edge) & item_stems:
-                return True
-
-    for path in dg.get("critical_paths", []):
-        if not (_node_stems(path) & item_stems):
-            continue
-        node_ids = path.get("node_ids", [])
-        from_pos = [i for i, nid in enumerate(node_ids) if nid in from_ids]
-        to_pos = [i for i, nid in enumerate(node_ids) if nid in to_ids]
-        if any(fp < tp for fp in from_pos for tp in to_pos):
-            return True
-
-    return False
-
-
-def semantic_hits(
-    candidates: dict[str, list[dict]],
-    items,
-    evidence_index: dict[str, dict],
-    embed_fn,
-    tau: float = 0.70,
-    tau_nc: float = 0.85,
-) -> dict[str, bool]:
-    """Opt-in embedding-semantic recall: {item.id: found-by-some-shared-source candidate}.
-
-    Scope-aware: each registry item is evaluated against candidates from its own
-    scope surface (finding, process, activity, decision, action) using the same
-    per-scope field extraction as the lexical path (candidate_text).  Passing only
-    the findings list (the previous behaviour) would leave process/activity/decision/
-    action items with an empty candidate pool and a guaranteed False result.
-
-    Real items (L0–L3): hit iff some scope-matching candidate shares a source
-    document with the item (shares_source) AND is embedding-similar (cosine >= tau).
-    Source anchor is preserved — a candidate on a different document cannot recover
-    a real item.
-
-    NC items (tier=="NC"): hit iff some scope-matching candidate is embedding-similar
-    (cosine >= tau_nc).  When the NC lists its mirror source (§4.1) the shared-source
-    guard applies; legacy NC items with evidence=[] skip the anchor, with the higher
-    threshold (default 0.85) compensating.
-
-    Cost is two embed_fn calls — all scope-appropriate candidate texts once and all
-    item texts once — not O(n*m) per-pair embeddings.
-
-    Args:
-        candidates: {scope: [candidate dicts]} from _candidates_by_scope().
-        items: iterable of RegistryItem dataclasses.
-        evidence_index: {evidence_id: Evidence dict}.
-        embed_fn: callable(list[str]) -> array-like of row vectors.
-        tau: cosine threshold for real items (inclusive).
-        tau_nc: cosine threshold for NC items (inclusive; higher to compensate for no source anchor).
-    """
-    items = list(items)
-
-    # Flatten all candidates across scopes, preserving their scope tag for
-    # text extraction and per-item filtering.
-    scoped: list[tuple[str, dict]] = [
-        (scope, cand)
-        for scope, cands in candidates.items()
-        for cand in cands
-    ]
-
-    if not scoped:
-        return {item.id: False for item in items}
-
-    cand_texts = [candidate_text(cand, scope) for scope, cand in scoped]
-    item_texts = [
-        " ".join([item.description or ""] + list(item.keywords or [])).strip() for item in items
-    ]
-
-    cand_vecs = np.asarray(embed_fn(cand_texts))
-    item_vecs = np.asarray(embed_fn(item_texts))
-
-    hits: dict[str, bool] = {}
-    for i, item in enumerate(items):
-        item_vec = item_vecs[i]
-        allowed = allowed_scopes(item)
-        hit = False
-        for k, (scope, cand) in enumerate(scoped):
-            if scope not in allowed:
-                continue
-            if item.tier == "NC":
-                # Shared-source guard applies when the NC lists its mirror source
-                # (§4.2/§6.2); legacy evidence=[] NCs stay unanchored, with the
-                # higher tau_nc compensating.
-                if item.evidence and not shares_source(cand, item, evidence_index):
-                    continue
-                if cosine(cand_vecs[k], item_vec) >= tau_nc:
-                    hit = True
-                    break
-            elif (
-                shares_source(cand, item, evidence_index)
-                and cosine(cand_vecs[k], item_vec) >= tau
-            ):
-                hit = True
-                break
-        hits[item.id] = hit
-    return hits

From 9c43a323da1f9929216593be23ae9366bcb67de2 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 13:18:41 +0200
Subject: [PATCH 20/48] chore(evaluation): delete scorecard.py

---
 .../evaluation/scorecard.py                   | 489 ------------------
 1 file changed, 489 deletions(-)
 delete mode 100644 fireflyframework_agentic/evaluation/scorecard.py

diff --git a/fireflyframework_agentic/evaluation/scorecard.py b/fireflyframework_agentic/evaluation/scorecard.py
deleted file mode 100644
index b34885e8..00000000
--- a/fireflyframework_agentic/evaluation/scorecard.py
+++ /dev/null
@@ -1,489 +0,0 @@
-# Copyright 2026 Firefly Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Scorecard renderer: gate results -> Markdown report.
-
-Every scorecard states whether it is self-graded.  Until Phase 3 independent
-re-annotation lands, all Lean-Core PROMOTE verdicts are self-graded against
-team-authored ground truth.  See EVALUATION_FRAMEWORK.md.
-"""
-
-from __future__ import annotations
-
-import json
-
-VERDICT_PROMOTE = "PROMOTE"
-VERDICT_HOLD = "HOLD"
-
-
-def verdict(gate_results: list) -> str:
-    """PROMOTE iff all gates passed and G5 is in the list; HOLD otherwise."""
-    if not gate_results:
-        return VERDICT_HOLD
-    if not all(g.passed for g in gate_results):
-        return VERDICT_HOLD
-    gate_names = {g.gate for g in gate_results}
-    if "G5" not in gate_names:
-        return VERDICT_HOLD
-    return VERDICT_PROMOTE
-
-
-def render_scorecard(
-    gate_results: list,
-    *,
-    corpus: str = "unknown",
-    model_id: str = "unknown",
-    run_id: str = "run",
-    is_self_graded: bool = True,
-    kappa_advisory: bool = False,
-    evidence_unverified: bool = False,
-    bpi2017_f1: float | None = None,
-    advisory=None,
-    config: dict | None = None,
-    experiment_config: dict | None = None,
-) -> str:
-    """Render a Markdown evaluation scorecard.
-
-    The scorecard always discloses self-graded status and advisory flags.
-    """
-    v = verdict(gate_results)
-    lines = [
-        "# FlyRadar Evaluation Scorecard",
-        "",
-        f"**Corpus**: {corpus}",
-        f"**Model**: {model_id}",
-        f"**Run**: {run_id}",
-        f"**Verdict**: **{v}**",
-        "",
-    ]
-
-    if is_self_graded:
-        lines += [
-            "> **SELF-GRADED**: All ground truth (must-find, gold, DILO, human sign-off) is",
-            "> authored by the FlyRadar team.  This PROMOTE has no contamination-free signal",
-            "> until Phase 3.  See EVALUATION_FRAMEWORK.md.",
-            "",
-        ]
-
-    if kappa_advisory:
-        lines += [
-            "> **ADVISORY**: Registry kappa < 0.70 — a second independent annotator has not",
-            "> verified the must-find items.  Promotion is advisory for this corpus until",
-            "> kappa >= 0.70 from an independent re-annotation.",
-            "",
-        ]
-
-    if evidence_unverified:
-        lines += [
-            "> **EVIDENCE UNVERIFIED**: no corpus supplied (--corpus) — evidence locators",
-            "> and excerpts are taken at face value from the run's own evidence_index.",
-            "> Grounding certifies self-consistency, not corpus reality.  Supply the run's",
-            "> input.json to enable deterministic excerpt verification (G3, §6.3).",
-            "",
-        ]
-
-    if experiment_config is not None:
-        lines += [
-            "## Experiment configuration",
-            "How this run was generated. Recorded fields (cost, tokens, latency, agents) are "
-            "read from the run's output.json; `model` is the value passed to the harness via "
-            "--model-id. Generation params (temperature, prompt/pipeline version, seed) are not "
-            "captured in output.json.",
-            "",
-            "```json",
-            json.dumps(experiment_config, indent=2, default=str),
-            "```",
-            "",
-        ]
-
-    if config is not None:
-        lines += [
-            "## Evaluation configuration",
-            "These are the parameters used to compute the evaluation.",
-            "",
-            "```json",
-            json.dumps(config, indent=2, default=str),
-            "```",
-            "",
-        ]
-
-    lines += ["## Gate Results", ""]
-    g5_result = None
-    for g in gate_results:
-        if g.gate == "G5":
-            g5_result = g
-            continue
-        status = "PASS" if g.passed else f"FLAG ({g.reason_code})"
-        lines.append(f"### {g.gate}: {status}")
-        if g.details:
-            lines.append("```json")
-            lines.append(json.dumps(g.details, indent=2, default=str))
-            lines.append("```")
-        lines.append("")
-
-    if bpi2017_f1 is not None:
-        ok = bpi2017_f1 >= 0.60
-        anchor_status = "PASS (>= 0.60)" if ok else "BELOW THRESHOLD (< 0.60)"
-        lines += [
-            "## External Sanity Anchor (non-blocking)",
-            f"BPI-2017 variant-recovery F1: **{bpi2017_f1:.3f}** — {anchor_status}",
-            "_One non-self-graded signal.  Non-blocking; informational only._",
-            "",
-        ]
-
-    if advisory is not None:
-        lines += _render_advisory(advisory)
-
-    if g5_result is not None:
-        status = "PASS" if g5_result.passed else f"FLAG ({g5_result.reason_code})"
-        lines.append(f"### G5: {status}")
-        if g5_result.details:
-            lines.append("```json")
-            lines.append(json.dumps(g5_result.details, indent=2, default=str))
-            lines.append("```")
-        lines.append("")
-
-    lines += _render_analysis(gate_results, advisory)
-
-    return "\n".join(lines)
-
-
-def _num(x) -> str:
-    """Format a metric leaf: None -> 'n/a', float -> 3dp, else str."""
-    if x is None:
-        return "n/a"
-    if isinstance(x, float):
-        return f"{x:.3f}"
-    return str(x)
-
-
-def _render_advisory(report) -> list[str]:
-    """Render the non-blocking G4 LLM-as-a-Judge section from an AdvisoryReport.
-
-    Best-effort: only metrics present in report.metrics are shown.  G4 never
-    affects the PROMOTE/HOLD verdict; this section is decision-support for the
-    G5 human sign-off, and is advisory until LLM-as-a-Judge calibration (§10).
-    """
-    m = report.metrics
-    cal = "calibrated" if report.calibrated else "uncalibrated"
-    lines = [
-        "## G4 — LLM-as-a-Judge (non-blocking — does NOT affect the PROMOTE/HOLD verdict)",
-        f"Judge: {report.judge_model} · {cal} · {report.runs}-run median",
-    ]
-    if report.same_provider_caveat:
-        lines.append("> same-provider as the pipeline — results may share blind spots.")
-    lines.append("```text")
-
-    if "faithfulness" in m:
-        d = m["faithfulness"]
-        u = d.get("unsupported_ids", [])
-        extra = f"   (unsupported: {', '.join(u)})" if u else ""
-        lines.append(
-            f"Faithfulness (entailment):       {d.get('supported')}/{d.get('total')} supported{extra}"
-        )
-    if "numeric_temporal_fidelity" in m:
-        lines.append(
-            f"Numeric/temporal fidelity:       {m['numeric_temporal_fidelity'].get('count', 0)} mismatch(es)"
-        )
-    if "citation_relevance" in m:
-        d = m["citation_relevance"]
-        lines.append(
-            f"Citation relevance (ctx-prec):   {_num(d.get('precision'))}   ({d.get('relevant')}/{d.get('total')})"
-        )
-    if "semantic_recovery" in m:
-        d = m["semantic_recovery"]
-        rec = d.get("recovered", [])
-        rids = ", ".join(r.get("id", "") for r in rec) if rec else "none"
-        lines.append(
-            f"Semantic recovery (ctx-recall):  lexical {_num(d.get('lexical_recall'))} -> {_num(d.get('recovered_recall'))}   (recovered: {rids})"
-        )
-    if "nc_semantic_precision" in m:
-        d = m["nc_semantic_precision"]
-        a = d.get("asserted_ids", [])
-        extra = f"   ({', '.join(a)})" if a else ""
-        lines.append(f"NC semantic precision:           {d.get('asserted', 0)} asserted{extra}")
-    if "fabricated_entity" in m:
-        lines.append(f"Fabricated-entity check:         {m['fabricated_entity'].get('count', 0)}")
-    if "contradiction" in m:
-        lines.append(f"Contradiction detection:         {m['contradiction'].get('count', 0)}")
-    if "actionability" in m:
-        d = m["actionability"]
-        lines.append(
-            f"Actionability:                   {_num(d.get('score'))}   (rated {d.get('rated', 0)})"
-        )
-    if "severity_calibration" in m:
-        d = m["severity_calibration"]
-        lines.append(
-            f"Severity calibration:            {d.get('miscalibrated', 0)}/{d.get('total', 0)} miscalibrated"
-        )
-    if "answer_relevancy" in m:
-        lines.append(f"Answer relevancy:                {_num(m['answer_relevancy'].get('score'))}")
-    if "comparative_vs_champion" in m:
-        lines.append(
-            f"Comparative vs champion:         more consistent -> {m['comparative_vs_champion'].get('more_consistent', 'n/a')}"
-        )
-    if "source_coverage" in m:
-        d = m["source_coverage"]
-        o = d.get("orphaned", [])
-        extra = f"   (orphaned: {', '.join(o)})" if o else ""
-        lines.append(
-            f"Source coverage [D]:             {d.get('cited')}/{d.get('total')} documents cited{extra}"
-        )
-    if "excerpt_fill_rate" in m:
-        d = m["excerpt_fill_rate"]
-        lines.append(
-            f"Evidence-excerpt fill [D]:       {d.get('populated')}/{d.get('total')} populated"
-        )
-    if "open_gap" in m:
-        gap = (m["open_gap"].get("gap") or "").strip()
-        if gap:
-            lines.append(f"Open gap probe:                  {gap}")
-    if report.errors:
-        lines.append(f"(errors: {len(report.errors)} metric(s) failed: {'; '.join(report.errors)})")
-    lines.append("```")
-    # Full detail — nothing truncated: every id, pair, verdict, and complete text.
-    lines += [
-        "",
-        "**G4 — full metric detail:**",
-        "```json",
-        json.dumps({"metrics": report.metrics, "details": report.details}, indent=2, default=str),
-        "```",
-    ]
-    lines.append(
-        "> Decision support for the G5 human sign-off; advisory until LLM-as-a-Judge calibration (§10)."
-    )
-    lines.append("")
-    return lines
-
-
-def _render_analysis(gate_results: list, advisory=None) -> list[str]:
-    """Render a plain-language interpretation of all evaluation signals."""
-    g2 = next((g for g in gate_results if g.gate == "G2"), None)
-    g3 = next((g for g in gate_results if g.gate == "G3"), None)
-
-    lines = ["## Analysis", ""]
-
-    # ── Topic coverage (G2) ──────────────────────────────────────────────────
-    lines.append("### Topic coverage (G2)")
-    if g2 and g2.details:
-        d = g2.details
-        recall = d.get("recall", 0.0)
-        tiers = d.get("per_tier", {})
-        finding_count = d.get("finding_count", 0)
-        redundancy = d.get("finding_redundancy_rate", 0.0)
-        matched = d.get("findings_matched_to_registry", {}).get("fraction", 0.0)
-
-        tier_summary = ", ".join(
-            f"{t} {v['hit']}/{v['total']}"
-            for t, v in tiers.items()
-            if "hit" in v and "total" in v
-        )
-        lines.append(
-            f"Lexical recall is **{recall:.3f}** ({tier_summary}). "
-            f"The run produced {finding_count} findings, "
-            f"all of which map to a registry item (match rate {matched:.0%}). "
-        )
-        if redundancy > 0.15:
-            lines.append(
-                f"Finding redundancy is **{redundancy:.0%}** — a meaningful share of "
-                "findings are near-duplicates of each other (Jaccard ≥ 0.6). "
-                "The run is covering the same ground multiple times rather than broadening coverage."
-            )
-        else:
-            lines.append(
-                f"Finding redundancy is low ({redundancy:.0%}): each finding addresses a distinct topic."
-            )
-        lines.append(
-            "_G2 is a topic-level test. A recall of 1.000 means every required topic was "
-            "mentioned somewhere — it does not verify that the specific claims about those "
-            "topics are accurate. Claim accuracy is G4 Faithfulness._"
-        )
-    else:
-        lines.append("G2 result unavailable.")
-    lines.append("")
-
-    # ── Evidence quality (G3) ────────────────────────────────────────────────
-    lines.append("### Evidence quality (G3)")
-    if g3 and g3.details:
-        d = g3.details
-        grounding = d.get("grounding_pct", 0.0)
-        ev = d.get("evidence_verification", {})
-        verified = ev.get("verified", 0)
-        entries = ev.get("entries", 0)
-        fabricated = ev.get("fabricated", [])
-        unknown = ev.get("source_unknown", [])
-        orphaned = d.get("orphaned_sources", [])
-        source_cov = d.get("source_coverage", "")
-
-        lines.append(
-            f"Grounding is **{grounding:.0%}**: every finding cites at least one "
-            "corpus document, and all excerpts are populated. "
-            f"Evidence verification checked {entries} entries against the raw corpus: "
-            f"{verified} verified"
-            + (f", **{len(fabricated)} fabricated** (locators that do not exist in the corpus)" if fabricated else "")
-            + (f", **{len(unknown)} source-unknown** (locators that resolve to no corpus file)" if unknown else "")
-            + "."
-        )
-        if unknown:
-            lines.append(
-                f"The source-unknown locator(s) are: `{'`, `'.join(unknown)}`. "
-                "This is most likely a corpus bundle gap rather than a hallucinated source — "
-                "verify that all expected files are included in `input.json`."
-            )
-        if orphaned:
-            lines.append(
-                f"**{len(orphaned)} corpus documents were never cited** by this run "
-                f"({', '.join(orphaned)}). These are blind spots: the run extracted nothing "
-                "from these sources, so any findings they contain are silently missed."
-            )
-        if source_cov:
-            cited, total = (int(x) for x in source_cov.split("/"))
-            if cited < total:
-                lines.append(
-                    f"Overall source coverage is {cited}/{total} — "
-                    f"{total - cited} corpus file(s) left entirely uncited."
-                )
-    else:
-        lines.append("G3 result unavailable.")
-    lines.append("")
-
-    # ── Claim accuracy (G4) ──────────────────────────────────────────────────
-    if advisory is not None:
-        m = advisory.metrics
-        lines.append("### Claim accuracy (G4 — advisory)")
-
-        faith = m.get("faithfulness", {})
-        supported = faith.get("supported", 0)
-        total_f = faith.get("total", 0)
-        if total_f:
-            faith_pct = supported / total_f
-            lines.append(
-                f"**Faithfulness: {supported}/{total_f} findings ({faith_pct:.0%}) are entailed by their cited evidence.** "
-            )
-            if faith_pct < 0.5:
-                lines.append(
-                    "This is a critical signal: the majority of findings contain claims "
-                    "that the judge cannot verify from the cited sources. "
-                    "The run is presenting inferences, extrapolations, or hallucinated details "
-                    "as if they were directly evidenced. "
-                    "Each unsupported finding should be reviewed against its cited document before use."
-                )
-            elif faith_pct < 0.8:
-                lines.append(
-                    "A significant minority of findings contain claims not traceable to cited sources. "
-                    "These may be reasonable inferences, but they should be flagged for human verification."
-                )
-            else:
-                lines.append("Most findings are directly supported by their cited evidence.")
-
-        ntf = m.get("numeric_temporal_fidelity", {})
-        mismatch_count = ntf.get("count", 0)
-        if mismatch_count:
-            lines.append(
-                f"**Numeric/temporal fidelity: {mismatch_count} mismatches detected.** "
-                "Specific figures — FTE costs, durations, timestamps, percentages, case IDs — "
-                "appear in findings but cannot be traced to the cited evidence. "
-                "These numbers should be treated as estimates or fabrications until verified "
-                "against the source documents."
-            )
-
-        fab = m.get("fabricated_entity", {})
-        fab_count = fab.get("count", 0)
-        fab_entities = fab.get("entities", [])
-        if fab_count:
-            lines.append(
-                f"**Fabricated entities: {fab_count}** — the following names/identifiers appear "
-                f"in the output but are absent from the corpus: "
-                f"{', '.join(f'`{e}`' for e in fab_entities)}. "
-                "These should be removed or verified before sharing the output."
-            )
-
-        sev = m.get("severity_calibration", {})
-        misc = sev.get("miscalibrated", 0)
-        total_s = sev.get("total", 0)
-        verdicts = sev.get("verdicts", {})
-        over_count = sum(1 for v in verdicts.values() if v == "over")
-        under_count = sum(1 for v in verdicts.values() if v == "under")
-        if misc and total_s:
-            direction = ""
-            if over_count > under_count:
-                direction = f" (predominantly over-rated: {over_count} findings rated too high)"
-            elif under_count > over_count:
-                direction = f" (predominantly under-rated: {under_count} findings rated too low)"
-            lines.append(
-                f"**Severity calibration: {misc}/{total_s} findings miscalibrated{direction}.** "
-                "Over-rated findings inflate perceived urgency and can cause the client to "
-                "prioritise the wrong items."
-            )
-
-        act = m.get("actionability", {})
-        act_score = act.get("score")
-        if act_score is not None:
-            if act_score < 0.6:
-                lines.append(
-                    f"**Actionability score: {act_score:.3f}** — proposed actions are below the "
-                    "0.6 threshold for concrete, quantified recommendations. "
-                    "Actions tend to be generic rather than specific enough to assign and execute."
-                )
-            else:
-                lines.append(f"Actionability score: {act_score:.3f} — actions are sufficiently concrete.")
-
-        og = m.get("open_gap", {})
-        gap_text = (og.get("gap") or "").strip()
-        if gap_text:
-            lines.append(f"**Most important missed finding:** {gap_text}")
-
-        lines.append("")
-
-    # ── Bottom line ──────────────────────────────────────────────────────────
-    lines.append("### Bottom line")
-    g5 = next((g for g in gate_results if g.gate == "G5"), None)
-    g5_reason = (g5.details or {}).get("reason", "") if g5 else ""
-    flags = [g for g in gate_results if not g.passed]
-    flag_names = [g.gate for g in flags]
-
-    if not flags:
-        lines.append(
-            "All deterministic gates pass. The run is ready for G5 human sign-off."
-        )
-    else:
-        flag_str = ", ".join(flag_names)
-        lines.append(
-            f"The run is at **HOLD** due to flags on: {flag_str}. "
-        )
-        for g in flags:
-            if g.gate == "G3" and g.reason_code == "EVIDENCE_SOURCE_UNKNOWN":
-                lines.append(
-                    "- **G3**: One evidence locator points to a file not in the corpus bundle. "
-                    "Regenerate `input.json` to include all corpus sources, then re-run."
-                )
-            elif g.gate == "G5":
-                lines.append(f"- **G5**: {g5_reason}")
-
-    if advisory is not None:
-        m = advisory.metrics
-        faith = m.get("faithfulness", {})
-        supported = faith.get("supported", 0)
-        total_f = faith.get("total", 1)
-        ntf_count = m.get("numeric_temporal_fidelity", {}).get("count", 0)
-        fab_count = m.get("fabricated_entity", {}).get("count", 0)
-        lines.append(
-            f"\nG4 advisory signals (non-blocking but important for the G5 reviewer): "
-            f"faithfulness {supported}/{total_f}, "
-            f"{ntf_count} numeric mismatches, "
-            f"{fab_count} fabricated entities. "
-            "The G5 reviewer should focus on the unsupported findings and verify figures "
-            "against the source documents before certifying the output."
-        )
-    lines.append("")
-    return lines

From a3673b5c5a441192b99ba7ecd40ab2d5d4bdc57a Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 13:18:45 +0200
Subject: [PATCH 21/48] chore(evaluation): delete run_config_snapshot.py

---
 .../evaluation/run_config_snapshot.py         | 160 ------------------
 1 file changed, 160 deletions(-)
 delete mode 100644 fireflyframework_agentic/evaluation/run_config_snapshot.py

diff --git a/fireflyframework_agentic/evaluation/run_config_snapshot.py b/fireflyframework_agentic/evaluation/run_config_snapshot.py
deleted file mode 100644
index db543129..00000000
--- a/fireflyframework_agentic/evaluation/run_config_snapshot.py
+++ /dev/null
@@ -1,160 +0,0 @@
-# Copyright 2026 Firefly Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Capture the effective flyradar run configuration into experiment_configuration.json.
-
-Non-invasive snapshot: it records how a run was generated by reading what flyradar
-already exposes as data — the request options the caller sent, the ``/api/v1/version``
-endpoint, ``RadarSettings``, and the prompt catalog — without modifying flyradar.  The
-snapshot is written next to the run's ``output.json`` at generation time, which is the
-moment the configuration is known.
-
-This is the bridge: the durable fix is for flyradar to stamp the same config into
-``DiscoveryResult`` itself (the one place that knows the effective values and cannot
-drift).  See the "flyradar improvements" issue.  ``temperature`` and ``seed`` are not
-exposed by ``RadarSettings`` and are recorded as ``null`` here.
-
-Usage:
-    cd flyradar_experiments
-    set -a && source .env && set +a
-    uv run python -m fireflyframework_agentic.evaluation.run_config_snapshot \
-        --output-dir experiments/bbva_españa/runs/2026-06-12-sonnet-01 \
-        --options    request_options.json \
-        --commit     c107918
-"""
-from __future__ import annotations
-
-import argparse
-import json
-import os
-import urllib.request
-from importlib.resources import files
-from pathlib import Path
-
-try:
-    from flyradar.config import RadarSettings
-except ImportError:  # flyradar is an optional dependency of this snapshot.
-    RadarSettings = None
-
-#: Path of the flyradar version endpoint (whitelisted in the service middleware).
-VERSION_PATH = "/api/v1/version"
-
-#: RadarSettings fields that define scoring / dedup behaviour, captured verbatim.
-_SETTINGS_KEYS = (
-    "model",
-    "fallback_model",
-    "duplicity_similarity_threshold",
-    "rootcause_cost_weight",
-    "rootcause_frequency_weight",
-    "rootcause_actionability_weight",
-)
-
-
-def fetch_version(base_url: str, *, timeout: float = 10.0) -> dict:
-    """GET the flyradar version endpoint; return ``{}`` on any failure."""
-    url = base_url.rstrip("/") + VERSION_PATH
-    try:
-        with urllib.request.urlopen(url, timeout=timeout) as resp:
-            return json.loads(resp.read().decode("utf-8"))
-    except Exception:
-        return {}
-
-
-def load_radar_settings() -> dict | None:
-    """Dump the scoring / dedup RadarSettings, or ``None`` if flyradar isn't importable."""
-    if RadarSettings is None:
-        return None
-    settings = RadarSettings()
-    return {key: getattr(settings, key, None) for key in _SETTINGS_KEYS}
-
-
-def load_prompt_versions() -> dict | None:
-    """Read each stage prompt's ``version`` from the flyradar prompt catalog, or ``None``."""
-    try:
-        catalog = files("flyradar.resources.prompts")
-    except ModuleNotFoundError:
-        return None
-    versions: dict[str, str] = {}
-    for entry in catalog.iterdir():
-        if not entry.name.endswith(".yaml"):
-            continue
-        for line in entry.read_text(encoding="utf-8").splitlines():
-            if line.strip().startswith("version:"):
-                versions[entry.name[:-5]] = line.split(":", 1)[1].strip().strip('"')
-                break
-    return versions or None
-
-
-def build_run_config(
-    options: dict,
-    *,
-    version: dict,
-    settings: dict | None,
-    prompt_versions: dict | None,
-    commit: str | None = None,
-) -> dict:
-    """Assemble the experiment-configuration snapshot from its captured parts."""
-    return {
-        "captured_by": "config-snapshot (non-invasive)",
-        "flyradar_version": version.get("version"),
-        "flyradar_commit": commit or version.get("commit"),
-        "options": options,
-        "settings": settings,
-        "prompt_versions": prompt_versions,
-        "temperature": None,
-        "seed": None,
-        "_note": (
-            "Non-invasive snapshot captured at generation time. `options` is the request "
-            "the caller sent; `settings` and `prompt_versions` are read from flyradar when "
-            "importable at the deployed commit. `temperature` and `seed` are not exposed by "
-            "RadarSettings and are recorded as null. The durable fix is for flyradar to stamp "
-            "this config into DiscoveryResult (see the 'flyradar improvements' issue)."
-        ),
-    }
-
-
-def write_snapshot(output_dir: str | Path, config: dict) -> Path:
-    """Write ``experiment_configuration.json`` into the run's output directory."""
-    path = Path(output_dir) / "experiment_configuration.json"
-    path.write_text(json.dumps(config, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
-    return path
-
-
-def main(argv: list[str] | None = None) -> int:
-    parser = argparse.ArgumentParser(description="Capture the flyradar run configuration.")
-    parser.add_argument("--output-dir", required=True, help="Run directory holding output.json.")
-    parser.add_argument(
-        "--options", required=True, help="JSON file of the DiscoveryRequest options that were sent."
-    )
-    parser.add_argument(
-        "--base-url", default=None, help="flyradar base URL (default: $FLYRADAR_BASE_URL)."
-    )
-    parser.add_argument("--commit", default=None, help="Deployed flyradar git commit, if known.")
-    args = parser.parse_args(argv)
-
-    base_url = args.base_url or os.environ.get("FLYRADAR_BASE_URL", "")
-    options = json.loads(Path(args.options).read_text(encoding="utf-8"))
-    config = build_run_config(
-        options,
-        version=fetch_version(base_url) if base_url else {},
-        settings=load_radar_settings(),
-        prompt_versions=load_prompt_versions(),
-        commit=args.commit,
-    )
-    path = write_snapshot(args.output_dir, config)
-    print(f"Wrote {path}")
-    return 0
-
-
-if __name__ == "__main__":
-    raise SystemExit(main())

From a51115e8e933c3fe6acecd75ea3610995251644c Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 13:18:50 +0200
Subject: [PATCH 22/48] chore(evaluation): delete models.py

---
 fireflyframework_agentic/evaluation/models.py | 70 -------------------
 1 file changed, 70 deletions(-)
 delete mode 100644 fireflyframework_agentic/evaluation/models.py

diff --git a/fireflyframework_agentic/evaluation/models.py b/fireflyframework_agentic/evaluation/models.py
deleted file mode 100644
index a98cdf20..00000000
--- a/fireflyframework_agentic/evaluation/models.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Copyright 2026 Firefly Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Shared config and model classes for the evaluation framework.
-
-EvalConfig captures the parameters of a single evaluation run: which model
-is being tested, which corpus it runs against, and where the supporting
-artefacts (registry, baseline, judge config) live.
-
-GateVerdict constants define the two possible outcomes of the promotion gate:
-PROMOTE (the challenger beats or ties the champion and is safe to deploy)
-or HOLD (the challenger does not meet the bar and must be iterated on).
-"""
-
-from __future__ import annotations
-
-from typing import Any
-
-from pydantic import BaseModel
-
-
-class EvalConfig(BaseModel):
-    """Configuration for a single evaluation run.
-
-    Parameters:
-        model_id: Identifier of the model under evaluation.
-        corpus: Name of the evaluation corpus (e.g. "ms_marco_mini", "finance_bench").
-        run_id: Unique identifier for this run (e.g. a timestamp or git SHA).
-        registry_path: Path to the must-find / golden registry JSON file.
-        corpus_path: Path to the corpus directory or bundle.
-        baseline_path: Path to a baseline results file for regression comparison.
-        judge_model: Model identifier used for the LLM-as-judge advisory pass.
-        judge_runs: Number of independent judge calls to aggregate (majority vote).
-        embed_model: Model identifier used for embedding-based retrieval metrics.
-        metadata: Arbitrary key/value pairs for run bookkeeping.
-    """
-
-    model_id: str
-    corpus: str
-    run_id: str
-    registry_path: str = ""
-    corpus_path: str = ""
-    baseline_path: str = ""
-    judge_model: str = ""
-    judge_runs: int = 3
-    embed_model: str = ""
-    metadata: dict[str, Any] = {}
-
-
-class GateVerdict:
-    """Promotion gate verdict constants.
-
-    Use ``GateVerdict.PROMOTE`` when the challenger meets the quality bar and
-    is safe to become the new champion.  Use ``GateVerdict.HOLD`` when the
-    challenger does not meet the bar and must be iterated on.
-    """
-
-    PROMOTE: str = "PROMOTE"
-    HOLD: str = "HOLD"

From 5074d14eb91506d4d9367808d0d3196775571760 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 13:18:56 +0200
Subject: [PATCH 23/48] chore(evaluation): delete stats.py

---
 fireflyframework_agentic/evaluation/stats.py | 110 -------------------
 1 file changed, 110 deletions(-)
 delete mode 100644 fireflyframework_agentic/evaluation/stats.py

diff --git a/fireflyframework_agentic/evaluation/stats.py b/fireflyframework_agentic/evaluation/stats.py
deleted file mode 100644
index e70c629a..00000000
--- a/fireflyframework_agentic/evaluation/stats.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# Copyright 2026 Firefly Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Statistics helpers: A/A noise band + fixed aggregate_grounding.
-
-The A/A band replaces McNemar, Wilcoxon, BCa bootstrap, Cliff's delta, Holm
-correction, and MCID power analysis.  Four self-authored corpora with ~30-70
-non-independent items each cannot power those tests; gating on unpowered tests
-is false precision.  See EVALUATION_FRAMEWORK.md (regression statistics).
-
-This module also provides the fixed aggregate_grounding() that closes a prior
-aggregation bug where the previous runner inherited run 0's grounding report
-unchanged instead of merging across all runs.
-"""
-from __future__ import annotations
-
-import statistics
-from typing import Sequence
-
-
-def aa_band(scores: Sequence[float], *, percentile: int = 95) -> float:
-    """95th-percentile pairwise delta from champion reruns — the noise floor.
-
-    Rerun the champion ~10 times; the 95th-percentile of all pairwise absolute
-    differences is the A/A noise floor.  A candidate must beat the champion by
-    more than this number on EVERY seed to count as a real improvement.
-
-    This single number replaces MCID, power analysis, McNemar, Wilcoxon,
-    bootstrap CIs, and Holm correction.  See EVALUATION_FRAMEWORK.md (the A/A noise band).
-
-    Args:
-        scores: Per-run primary metric scores from champion reruns (>= 2 required).
-        percentile: Which percentile (default 95).
-
-    Returns:
-        Noise floor as a float in the same units as the input scores.
-    """
-    scores = list(scores)
-    if len(scores) < 2:
-        raise ValueError(f"aa_band requires >= 2 reruns; got {len(scores)}")
-    deltas = [
-        abs(x - y)
-        for i, x in enumerate(scores)
-        for y in scores[i + 1:]
-    ]
-    sorted_deltas = sorted(deltas)
-    # Index for the requested percentile; clamp to valid range
-    idx = max(0, min(len(sorted_deltas) - 1, int(len(sorted_deltas) * percentile / 100)))
-    return sorted_deltas[idx]
-
-
-def aggregate_grounding(grounding_dicts: list[dict]) -> dict:
-    """Merge per-run grounding reports into a conservative aggregate.
-
-    Fixes a prior aggregation bug where the previous runner inherited run 0's grounding
-    report unchanged.  Correct behaviour:
-    - support_pct: mean across runs
-    - unsupported_ids: UNION across all runs (anything flagged in any run stays flagged)
-
-    Args:
-        grounding_dicts: List of grounding report dicts, one per evaluation run.
-            Each must have 'support_pct' (float 0-100) and optionally
-            'unsupported_ids' (list[str]).
-
-    Returns:
-        Merged grounding dict.
-    """
-    if not grounding_dicts:
-        return {"support_pct": 0.0, "unsupported_ids": []}
-
-    support_pcts = [float(g.get("support_pct", 0.0)) for g in grounding_dicts]
-    mean_pct = statistics.mean(support_pcts)
-
-    unsupported: set[str] = set()
-    for g in grounding_dicts:
-        unsupported.update(g.get("unsupported_ids", []))
-
-    first = grounding_dicts[0]
-    return {
-        **first,
-        "support_pct": round(mean_pct, 2),
-        "unsupported_ids": sorted(unsupported),
-        "_aggregate_runs": len(grounding_dicts),
-        "_support_pct_per_run": [round(p, 2) for p in support_pcts],
-    }
-
-
-def left_skew_flag(scores: Sequence[float]) -> bool:
-    """True if min < median - 0.10 (HIGH_VARIANCE sentinel).
-
-    A single catastrophic run cannot hide inside a decent mean.
-    True => HIGH_VARIANCE; block the run until investigated.
-    See EVALUATION_FRAMEWORK.md (anti-flakiness).
-    """
-    scores = list(scores)
-    if len(scores) < 2:
-        return False
-    med = statistics.median(scores)
-    return min(scores) < med - 0.10

From 8716be93d143f846920a6ad85820c082eccb6ccf Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 13:19:01 +0200
Subject: [PATCH 24/48] chore(evaluation): delete champion.py

---
 .../evaluation/champion.py                    | 169 ------------------
 1 file changed, 169 deletions(-)
 delete mode 100644 fireflyframework_agentic/evaluation/champion.py

diff --git a/fireflyframework_agentic/evaluation/champion.py b/fireflyframework_agentic/evaluation/champion.py
deleted file mode 100644
index 239429eb..00000000
--- a/fireflyframework_agentic/evaluation/champion.py
+++ /dev/null
@@ -1,169 +0,0 @@
-# Copyright 2026 Firefly Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Per-corpus champion management.
-
-Champions are per-corpus — mode 2A (conformance) and mode 2B (extraction)
-metrics live in incommensurable spaces.  There is no global champion.
-See EVALUATION_FRAMEWORK.md (per-corpus champions).
-
-The historical fake-100% incident: banca-cordobesa/baseline.json was populated
-with a champion scored against an EMPTY must-find registry.  The EMPTY_MUST_FIND
-guard in G1 prevents a recurrence; the invalidate_champion() function provides
-the corrective action when it does happen.
-"""
-
-from __future__ import annotations
-
-import hashlib
-import json
-from dataclasses import dataclass, field
-from pathlib import Path
-
-
-@dataclass
-class ChampionRecord:
-    """Per-corpus champion, stored as 'champion' key in baseline.json."""
-
-    corpus: str
-    run_id: str
-    model_id: str
-    registry_sha256: str
-    scores: dict  # {metric_name: float}
-    aa_noise: dict = field(default_factory=dict)  # {metric_name: noise_floor}
-    is_day_zero: bool = False
-    human_sign_offs: list[str] = field(default_factory=list)
-    config: dict = field(default_factory=dict)  # evaluation config snapshot
-    corpus_sha256: str = ""  # pin of the evidence corpus the champion was verified against
-
-    def primary_metric(self) -> str:
-        return next(iter(self.scores)) if self.scores else ""
-
-    def primary_score(self) -> float:
-        return float(self.scores.get(self.primary_metric(), 0.0))
-
-
-def load_champion(baseline_path: str | Path) -> ChampionRecord | None:
-    """Load the current per-corpus champion from baseline.json.
-
-    Returns None when:
-    - The file does not exist (normal Day-Zero state).
-    - The file exists but 'champion' is null (post-invalidation state).
-    """
-    path = Path(baseline_path)
-    if not path.exists():
-        return None
-    raw = json.loads(path.read_text(encoding="utf-8"))
-    champ_raw = raw.get("champion")
-    if champ_raw is None:
-        return None
-    return ChampionRecord(
-        corpus=champ_raw["corpus"],
-        run_id=champ_raw["run_id"],
-        model_id=champ_raw["model_id"],
-        registry_sha256=champ_raw["registry_sha256"],
-        scores=champ_raw.get("scores", {}),
-        aa_noise=champ_raw.get("aa_noise", {}),
-        is_day_zero=champ_raw.get("is_day_zero", False),
-        human_sign_offs=champ_raw.get("human_sign_offs", []),
-        config=champ_raw.get("config", {}),
-        corpus_sha256=champ_raw.get("corpus_sha256", ""),
-    )
-
-
-def save_champion(
-    baseline_path: str | Path,
-    champion: ChampionRecord,
-    *,
-    summary: str = "",
-    date: str = "",
-) -> None:
-    """Persist a new champion and append a promotion log entry.
-
-    Reads the existing file if it exists (to preserve the log), then writes
-    the new champion.  The promotion log is append-only.
-    """
-    path = Path(baseline_path)
-    if path.exists():
-        raw = json.loads(path.read_text(encoding="utf-8"))
-        log = raw.get("promotion_log", [])
-        prev_run = raw.get("champion", {})
-        prev_run_id = prev_run.get("run_id") if isinstance(prev_run, dict) else None
-    else:
-        log = []
-        prev_run_id = None
-
-    log.append(
-        {
-            "date": date or "unknown",
-            "from": prev_run_id,
-            "to": champion.run_id,
-            "label": "day-zero" if champion.is_day_zero else "promotion",
-            "summary": summary,
-        }
-    )
-
-    payload = {
-        "champion": {
-            "corpus": champion.corpus,
-            "run_id": champion.run_id,
-            "model_id": champion.model_id,
-            "registry_sha256": champion.registry_sha256,
-            "scores": champion.scores,
-            "aa_noise": champion.aa_noise,
-            "is_day_zero": champion.is_day_zero,
-            "human_sign_offs": champion.human_sign_offs,
-            "config": champion.config,
-            "corpus_sha256": champion.corpus_sha256,
-        },
-        "promotion_log": log,
-    }
-    path.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")
-
-
-def invalidate_champion(
-    baseline_path: str | Path,
-    *,
-    reason: str,
-    date: str = "",
-) -> None:
-    """Null out the current champion and record the invalidation reason.
-
-    Used when a champion was locked in against an empty or tampered registry
-    (the banca-cordobesa fake-100% incident).
-    """
-    path = Path(baseline_path)
-    if not path.exists():
-        return
-    raw = json.loads(path.read_text(encoding="utf-8"))
-    log = raw.get("promotion_log", [])
-    prev_run = raw.get("champion", {})
-    prev_run_id = prev_run.get("run_id") if isinstance(prev_run, dict) else None
-    log.append(
-        {
-            "date": date or "unknown",
-            "from": prev_run_id,
-            "to": None,
-            "label": "INVALIDATED",
-            "summary": reason,
-        }
-    )
-    raw["champion"] = None
-    raw["promotion_log"] = log
-    path.write_text(json.dumps(raw, indent=2, ensure_ascii=False), encoding="utf-8")
-
-
-def input_hash(result_dict: dict) -> str:
-    """Stable 16-char SHA-256 prefix of the DiscoveryResult for provenance."""
-    canonical = json.dumps(result_dict, sort_keys=True, ensure_ascii=False)
-    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()[:16]

From 5c8fe8e4450013f47754803e99e19cac3a4cb1bd Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 13:19:07 +0200
Subject: [PATCH 25/48] chore(evaluation): delete test_champion.py

---
 tests/unit/evaluation/test_champion.py | 199 -------------------------
 1 file changed, 199 deletions(-)
 delete mode 100644 tests/unit/evaluation/test_champion.py

diff --git a/tests/unit/evaluation/test_champion.py b/tests/unit/evaluation/test_champion.py
deleted file mode 100644
index 948a9639..00000000
--- a/tests/unit/evaluation/test_champion.py
+++ /dev/null
@@ -1,199 +0,0 @@
-# Copyright 2026 Firefly Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Unit tests for evaluation.champion: ChampionRecord, load/save/invalidate_champion, input_hash."""
-
-from __future__ import annotations
-
-import json
-
-import pytest
-
-from fireflyframework_agentic.evaluation.champion import (
-    ChampionRecord,
-    input_hash,
-    invalidate_champion,
-    load_champion,
-    save_champion,
-)
-
-
-def _make_champion(**overrides) -> ChampionRecord:
-    defaults = dict(
-        corpus="test-corpus",
-        run_id="run-2026-01",
-        model_id="claude-sonnet-4-5",
-        registry_sha256="abc123",
-        scores={"recall": 0.85, "grounding_pct": 0.92},
-        aa_noise={"recall": 0.02},
-        is_day_zero=False,
-        human_sign_offs=["reviewer-1"],
-    )
-    defaults.update(overrides)
-    return ChampionRecord(**defaults)
-
-
-# ── load_champion ─────────────────────────────────────────────────────────────
-
-
-def test_load_champion_nonexistent_file_returns_none(tmp_path):
-    result = load_champion(tmp_path / "baseline.json")
-    assert result is None
-
-
-def test_load_champion_file_with_null_champion_returns_none(tmp_path):
-    baseline = tmp_path / "baseline.json"
-    baseline.write_text(json.dumps({"champion": None, "promotion_log": []}), encoding="utf-8")
-    assert load_champion(baseline) is None
-
-
-# ── save_champion / load_champion round-trip ──────────────────────────────────
-
-
-def test_save_then_load_round_trips_all_fields(tmp_path):
-    baseline = tmp_path / "baseline.json"
-    champ = _make_champion()
-    save_champion(baseline, champ, summary="initial champion", date="2026-01-01")
-
-    loaded = load_champion(baseline)
-    assert loaded is not None
-    assert loaded.corpus == champ.corpus
-    assert loaded.run_id == champ.run_id
-    assert loaded.model_id == champ.model_id
-    assert loaded.registry_sha256 == champ.registry_sha256
-    assert loaded.scores == champ.scores
-    assert loaded.aa_noise == champ.aa_noise
-    assert loaded.is_day_zero == champ.is_day_zero
-    assert loaded.human_sign_offs == champ.human_sign_offs
-
-
-def test_save_champion_appends_promotion_log_entry(tmp_path):
-    baseline = tmp_path / "baseline.json"
-    champ = _make_champion()
-    save_champion(baseline, champ, summary="first", date="2026-01-01")
-
-    champ2 = _make_champion(run_id="run-2026-02", scores={"recall": 0.90})
-    save_champion(baseline, champ2, summary="second", date="2026-02-01")
-
-    raw = json.loads(baseline.read_text(encoding="utf-8"))
-    log = raw["promotion_log"]
-    assert len(log) == 2
-    assert log[0]["to"] == "run-2026-01"
-    assert log[1]["to"] == "run-2026-02"
-    assert log[1]["from"] == "run-2026-01"
-
-
-def test_save_champion_creates_file_when_missing(tmp_path):
-    baseline = tmp_path / "baseline.json"
-    assert not baseline.exists()
-    save_champion(baseline, _make_champion())
-    assert baseline.exists()
-
-
-def test_save_champion_day_zero_flag_preserved(tmp_path):
-    baseline = tmp_path / "baseline.json"
-    champ = _make_champion(is_day_zero=True)
-    save_champion(baseline, champ)
-    loaded = load_champion(baseline)
-    assert loaded.is_day_zero is True
-
-
-def test_save_champion_label_is_day_zero_when_flag_set(tmp_path):
-    baseline = tmp_path / "baseline.json"
-    champ = _make_champion(is_day_zero=True)
-    save_champion(baseline, champ)
-    raw = json.loads(baseline.read_text(encoding="utf-8"))
-    assert raw["promotion_log"][0]["label"] == "day-zero"
-
-
-def test_save_champion_label_is_promotion_when_flag_not_set(tmp_path):
-    baseline = tmp_path / "baseline.json"
-    save_champion(baseline, _make_champion(is_day_zero=False))
-    raw = json.loads(baseline.read_text(encoding="utf-8"))
-    assert raw["promotion_log"][0]["label"] == "promotion"
-
-
-# ── invalidate_champion ───────────────────────────────────────────────────────
-
-
-def test_invalidate_champion_sets_champion_to_null(tmp_path):
-    baseline = tmp_path / "baseline.json"
-    save_champion(baseline, _make_champion())
-    invalidate_champion(baseline, reason="EMPTY_MUST_FIND fake champion", date="2026-03-01")
-
-    loaded = load_champion(baseline)
-    assert loaded is None
-
-    raw = json.loads(baseline.read_text(encoding="utf-8"))
-    assert raw["champion"] is None
-
-
-def test_invalidate_champion_appends_invalidation_log(tmp_path):
-    baseline = tmp_path / "baseline.json"
-    save_champion(baseline, _make_champion(), date="2026-01-01")
-    invalidate_champion(baseline, reason="fake champion", date="2026-03-01")
-
-    raw = json.loads(baseline.read_text(encoding="utf-8"))
-    log = raw["promotion_log"]
-    assert log[-1]["label"] == "INVALIDATED"
-    assert "fake champion" in log[-1]["summary"]
-    assert log[-1]["to"] is None
-
-
-def test_invalidate_champion_noop_when_file_missing(tmp_path):
-    # Should not raise when file does not exist.
-    invalidate_champion(tmp_path / "no-file.json", reason="test")
-
-
-# ── ChampionRecord helpers ────────────────────────────────────────────────────
-
-
-def test_primary_metric_returns_first_key():
-    champ = _make_champion(scores={"recall": 0.85, "grounding_pct": 0.92})
-    assert champ.primary_metric() == "recall"
-
-
-def test_primary_score_returns_first_value():
-    champ = _make_champion(scores={"recall": 0.85, "grounding_pct": 0.92})
-    assert champ.primary_score() == 0.85
-
-
-def test_primary_metric_empty_scores():
-    champ = _make_champion(scores={})
-    assert champ.primary_metric() == ""
-    assert champ.primary_score() == 0.0
-
-
-# ── input_hash ────────────────────────────────────────────────────────────────
-
-
-def test_input_hash_is_16_chars():
-    result = input_hash({"key": "value"})
-    assert len(result) == 16
-
-
-def test_input_hash_is_deterministic():
-    data = {"process_graph": {"processes": []}, "findings": []}
-    h1 = input_hash(data)
-    h2 = input_hash(data)
-    assert h1 == h2
-
-
-def test_input_hash_differs_for_different_inputs():
-    assert input_hash({"a": 1}) != input_hash({"a": 2})
-
-
-def test_input_hash_key_order_independent():
-    # sort_keys=True in input_hash should make {"a":1, "b":2} == {"b":2, "a":1}.
-    assert input_hash({"a": 1, "b": 2}) == input_hash({"b": 2, "a": 1})

From fdc02771d8b3352a1031fc7ba5b3e1646b32f041 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 13:19:13 +0200
Subject: [PATCH 26/48] chore(evaluation): delete test_gates.py

---
 tests/unit/evaluation/test_gates.py | 219 ----------------------------
 1 file changed, 219 deletions(-)
 delete mode 100644 tests/unit/evaluation/test_gates.py

diff --git a/tests/unit/evaluation/test_gates.py b/tests/unit/evaluation/test_gates.py
deleted file mode 100644
index 2edc3b99..00000000
--- a/tests/unit/evaluation/test_gates.py
+++ /dev/null
@@ -1,219 +0,0 @@
-# Copyright 2026 Firefly Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Unit tests for evaluation.gates: GateResult, verdict, render_scorecard, g5_no_regression."""
-
-from __future__ import annotations
-
-from fireflyframework_agentic.evaluation.gates import (
-    GateResult,
-    Verdict,
-    g5_no_regression,
-    render_scorecard,
-)
-from fireflyframework_agentic.evaluation.scorecard import verdict
-
-
-# ── GateResult ────────────────────────────────────────────────────────────────
-
-
-def test_gate_result_str_pass():
-    gr = GateResult(gate="G1", passed=True)
-    assert str(gr) == "[G1] PASS"
-
-
-def test_gate_result_str_flag():
-    gr = GateResult(gate="G2", passed=False, reason_code="RECALL_BELOW_FLOOR")
-    assert str(gr) == "[G2] FLAG:RECALL_BELOW_FLOOR"
-
-
-def test_gate_result_flag_without_reason_code():
-    gr = GateResult(gate="G3", passed=False, reason_code="")
-    assert str(gr) == "[G3] FLAG:"
-
-
-def test_gate_result_passed_true():
-    gr = GateResult(gate="G5", passed=True, details={"note": "ok"})
-    assert gr.passed is True
-    assert gr.details["note"] == "ok"
-
-
-def test_gate_result_default_details_is_empty_dict():
-    gr = GateResult(gate="G1", passed=True)
-    assert gr.details == {}
-
-
-# ── verdict ───────────────────────────────────────────────────────────────────
-
-
-def test_verdict_promote_when_all_pass_and_g5_present():
-    gates = [
-        GateResult(gate="G1", passed=True),
-        GateResult(gate="G2", passed=True),
-        GateResult(gate="G3", passed=True),
-        GateResult(gate="G5", passed=True),
-    ]
-    assert verdict(gates) == "PROMOTE"
-
-
-def test_verdict_hold_when_any_gate_fails():
-    gates = [
-        GateResult(gate="G1", passed=True),
-        GateResult(gate="G2", passed=False, reason_code="RECALL_BELOW_FLOOR"),
-        GateResult(gate="G3", passed=True),
-        GateResult(gate="G5", passed=True),
-    ]
-    assert verdict(gates) == "HOLD"
-
-
-def test_verdict_hold_when_g5_missing():
-    # All G1/G2/G3 pass but G5 is absent — no promotion without sign-off.
-    gates = [
-        GateResult(gate="G1", passed=True),
-        GateResult(gate="G2", passed=True),
-        GateResult(gate="G3", passed=True),
-    ]
-    assert verdict(gates) == "HOLD"
-
-
-def test_verdict_hold_on_empty_list():
-    assert verdict([]) == "HOLD"
-
-
-def test_verdict_hold_when_g5_fails():
-    gates = [
-        GateResult(gate="G1", passed=True),
-        GateResult(gate="G2", passed=True),
-        GateResult(gate="G3", passed=True),
-        GateResult(gate="G5", passed=False, reason_code="HOLD"),
-    ]
-    assert verdict(gates) == "HOLD"
-
-
-# ── render_scorecard (from gates module) ──────────────────────────────────────
-
-
-def test_render_scorecard_contains_verdict_line():
-    gates = [
-        GateResult(gate="G1", passed=True),
-        GateResult(gate="G2", passed=True),
-        GateResult(gate="G3", passed=True),
-        GateResult(gate="G5", passed=True),
-    ]
-    output = render_scorecard(gates)
-    assert "VERDICT: PROMOTE" in output
-
-
-def test_render_scorecard_hold_when_flag():
-    gates = [
-        GateResult(gate="G1", passed=False, reason_code="SCHEMA_INVALID"),
-        GateResult(gate="G2", passed=True),
-        GateResult(gate="G3", passed=True),
-        GateResult(gate="G5", passed=True),
-    ]
-    output = render_scorecard(gates)
-    assert "VERDICT: HOLD" in output
-
-
-def test_render_scorecard_includes_all_gate_lines():
-    gates = [
-        GateResult(gate="G1", passed=True),
-        GateResult(gate="G2", passed=True),
-        GateResult(gate="G3", passed=True),
-        GateResult(gate="G5", passed=True),
-    ]
-    output = render_scorecard(gates)
-    for gate_label in ("[G1]", "[G2]", "[G3]", "[G5]"):
-        assert gate_label in output
-
-
-# ── g5_no_regression ──────────────────────────────────────────────────────────
-
-
-def test_g5_day_zero_insufficient_signoffs():
-    result = g5_no_regression(
-        candidate_scores={"recall": 0.85},
-        champion_scores=None,
-        aa_noise=None,
-        is_day_zero=True,
-        human_signed_off=False,
-        signoff_count=1,
-    )
-    assert result.passed is False
-    assert result.reason_code == "HOLD"
-
-
-def test_g5_day_zero_sufficient_signoffs():
-    result = g5_no_regression(
-        candidate_scores={"recall": 0.85},
-        champion_scores=None,
-        aa_noise=None,
-        is_day_zero=True,
-        human_signed_off=False,
-        signoff_count=2,
-    )
-    assert result.passed is True
-    assert result.details["day_zero"] is True
-
-
-def test_g5_hold_when_no_human_signoff():
-    result = g5_no_regression(
-        candidate_scores={"recall": 0.90},
-        champion_scores={"recall": 0.80},
-        aa_noise={"recall": 0.02},
-        human_signed_off=False,
-    )
-    assert result.passed is False
-    assert result.reason_code == "HOLD"
-
-
-def test_g5_hold_when_regression_beyond_band():
-    # Candidate recall 0.75 vs champion 0.80; delta=-0.05 < -band=-0.02.
-    result = g5_no_regression(
-        candidate_scores={"recall": 0.75},
-        champion_scores={"recall": 0.80},
-        aa_noise={"recall": 0.02},
-        human_signed_off=True,
-    )
-    assert result.passed is False
-    assert result.reason_code == "HOLD"
-    assert any("recall" in r for r in result.details["regressions"])
-
-
-def test_g5_promote_when_candidate_beats_champion():
-    result = g5_no_regression(
-        candidate_scores={"recall": 0.90},
-        champion_scores={"recall": 0.80},
-        aa_noise={"recall": 0.02},
-        human_signed_off=True,
-    )
-    assert result.passed is True
-    assert result.details["improvements"]
-
-
-def test_g5_promote_when_within_noise_band():
-    # delta = 0.01 — positive but within band of 0.02; counts as no regression, no improvement.
-    result = g5_no_regression(
-        candidate_scores={"recall": 0.81},
-        champion_scores={"recall": 0.80},
-        aa_noise={"recall": 0.02},
-        human_signed_off=True,
-    )
-    assert result.passed is True
-    assert result.details["improvements"] == []
-
-
-def test_g5_verdict_constants():
-    assert Verdict.PROMOTE == "PROMOTE"
-    assert Verdict.HOLD == "HOLD"

From 0732f8582e9e40818e9a4d05da2dff00220652b4 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 13:19:17 +0200
Subject: [PATCH 27/48] chore(evaluation): delete test_matcher.py

---
 tests/unit/evaluation/test_matcher.py | 221 --------------------------
 1 file changed, 221 deletions(-)
 delete mode 100644 tests/unit/evaluation/test_matcher.py

diff --git a/tests/unit/evaluation/test_matcher.py b/tests/unit/evaluation/test_matcher.py
deleted file mode 100644
index cc87564b..00000000
--- a/tests/unit/evaluation/test_matcher.py
+++ /dev/null
@@ -1,221 +0,0 @@
-# Copyright 2026 Firefly Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Unit tests for evaluation.matcher: anchored, source_stem, tokens, matches."""
-
-from __future__ import annotations
-
-import pytest
-
-from fireflyframework_agentic.evaluation.matcher import (
-    anchored,
-    matches,
-    source_stem,
-    tokens,
-)
-from fireflyframework_agentic.evaluation.registry import RegistryItem
-
-
-# ── tokens ───────────────────────────────────────────────────────────────────
-
-
-def test_tokens_basic():
-    result = tokens("Hello World")
-    assert result == ["hello", "world"]
-
-
-def test_tokens_lowercases():
-    result = tokens("KYC AML PEP")
-    assert result == ["kyc", "aml", "pep"]
-
-
-def test_tokens_strips_punctuation():
-    result = tokens("risk-management: cost (FTE).")
-    assert "risk" in result
-    assert "management" in result
-    assert "cost" in result
-    assert "fte" in result
-
-
-def test_tokens_empty_string():
-    assert tokens("") == []
-
-
-def test_tokens_numbers_included():
-    result = tokens("case-id CU-2026-1003")
-    assert "2026" in result or "cu" in result
-
-
-def test_tokens_unicode():
-    result = tokens("análisis de crédito")
-    assert "análisis" in result or "an" in result
-
-
-# ── anchored ─────────────────────────────────────────────────────────────────
-
-
-def test_anchored_overlapping_long_token():
-    # "underwriting" is 12 chars — well above the 5-char floor.
-    assert anchored("credit underwriting risk", "underwriting process steps") is True
-
-
-def test_anchored_no_overlap():
-    # No token >= 5 chars shared between claim and evidence.
-    assert anchored("cat sat", "dog ran") is False
-
-
-def test_anchored_short_tokens_ignored():
-    # All tokens in both strings are < 5 chars; no overlap counts.
-    assert anchored("a big cat", "a big dog") is False
-
-
-def test_anchored_mixed_lengths_match():
-    # "kyc" is < 5, but "compliance" is long enough.
-    assert anchored("kyc compliance review", "compliance framework") is True
-
-
-def test_anchored_custom_min_token():
-    # Lower the floor so short tokens can anchor.
-    assert anchored("kyc check", "kyc process", min_token=3) is True
-
-
-def test_anchored_both_empty():
-    assert anchored("", "") is False
-
-
-def test_anchored_partial_token_no_match():
-    # "risk" (4 chars) is below the default 5-char floor.
-    assert anchored("risk alert", "risk factor") is False
-
-
-def test_anchored_returns_bool():
-    result = anchored("credit underwriting", "underwriting model")
-    assert isinstance(result, bool)
-
-
-# ── source_stem ───────────────────────────────────────────────────────────────
-
-
-def test_source_stem_bare_filename_with_extension():
-    assert source_stem("SOP-002-kyc-edd.md") == "sop-002-kyc-edd"
-
-
-def test_source_stem_directory_prefixed():
-    assert source_stem("sops/SOP-002-kyc-edd.md") == "sop-002-kyc-edd"
-
-
-def test_source_stem_deep_path_prefix():
-    assert source_stem("docs/policies/SOP-002-kyc-edd.md") == "sop-002-kyc-edd"
-
-
-def test_source_stem_lowercase():
-    # Stems are always lowercased.
-    assert source_stem("REPORT-FINAL.pdf") == "report-final"
-
-
-def test_source_stem_event_log_row_id():
-    # src-<process>:<case> → process stem.
-    assert source_stem("src-credit-underwriting:CU-2026-1003") == "credit-underwriting"
-
-
-def test_source_stem_event_log_row_id_preserves_hyphens():
-    assert source_stem("src-kyc-onboarding:KYC-001") == "kyc-onboarding"
-
-
-def test_source_stem_strips_fragment():
-    # #page=N should be removed before stemming.
-    assert source_stem("docs/report.pdf#page=5") == "report"
-
-
-def test_source_stem_strips_anchor():
-    assert source_stem("sops/SOP-001.md#section-3") == "sop-001"
-
-
-def test_source_stem_bare_no_extension():
-    # No extension, no directory — stem is just the lowercase name.
-    assert source_stem("my-document") == "my-document"
-
-
-def test_source_stem_no_directory_no_extension_lowercase():
-    assert source_stem("Signal") == "signal"
-
-
-def test_source_stem_csv_extension():
-    assert source_stem("activity-cost-fte.csv") == "activity-cost-fte"
-
-
-# ── matches ───────────────────────────────────────────────────────────────────
-
-
-def _make_item(description: str, evidence: list[str], keywords: list[str] | None = None) -> RegistryItem:
-    """Construct a minimal RegistryItem for matching tests."""
-    return RegistryItem(
-        id="test-item",
-        tier="L1",
-        description=description,
-        evidence=evidence,
-        scope="finding",
-        keywords=keywords or [],
-    )
-
-
-def _make_finding(title: str, description: str, evidence_id: str) -> dict:
-    return {
-        "title": title,
-        "description": description,
-        "evidence_refs": [{"evidence_id": evidence_id}],
-    }
-
-
-def _make_evidence_index(evidence_id: str, locator: str, excerpt: str = "") -> dict:
-    return {evidence_id: {"id": evidence_id, "locator": locator, "excerpt": excerpt}}
-
-
-def test_matches_true_when_source_and_topic_match():
-    # Finding title shares a long token with item description and cites the same source.
-    item = _make_item("credit underwriting process", ["sop-kyc-credit.md"])
-    finding = _make_finding("credit underwriting review", "credit underwriting risk assessment", "ev-1")
-    evidence_index = _make_evidence_index("ev-1", "sop-kyc-credit.md")
-    assert matches(finding, item, evidence_index, scope="finding") is True
-
-
-def test_matches_false_when_source_differs():
-    # Token match exists but sources don't overlap — anti-gaming guard fires.
-    item = _make_item("credit underwriting process", ["sop-credit.md"])
-    finding = _make_finding("credit underwriting review", "credit underwriting details", "ev-1")
-    evidence_index = _make_evidence_index("ev-1", "other-document.md")
-    assert matches(finding, item, evidence_index, scope="finding") is False
-
-
-def test_matches_false_when_no_token_overlap():
-    # Same source, but no shared long token between finding text and item description.
-    item = _make_item("regulatory capital requirement", ["sop-capital.md"])
-    finding = _make_finding("kyc identity check", "client onboarding steps", "ev-1")
-    evidence_index = _make_evidence_index("ev-1", "sop-capital.md")
-    assert matches(finding, item, evidence_index, scope="finding") is False
-
-
-def test_matches_keyword_rail_short_token():
-    # "KYC" is 3 chars — below the 5-char token floor but valid as a keyword.
-    item = _make_item("some description about identity", ["sop-kyc.md"], keywords=["KYC"])
-    finding = _make_finding("KYC onboarding", "KYC onboarding process", "ev-1")
-    evidence_index = _make_evidence_index("ev-1", "sop-kyc.md")
-    assert matches(finding, item, evidence_index, scope="finding") is True
-
-
-def test_matches_empty_evidence_refs_returns_false():
-    # Finding with no evidence refs cannot share a source with any item.
-    item = _make_item("credit underwriting", ["sop-credit.md"])
-    finding = {"title": "credit underwriting", "description": "credit underwriting risk", "evidence_refs": []}
-    assert matches(finding, item, {}, scope="finding") is False

From f769ef1c40d28067040faa8a5f662038d4765eb0 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 13:19:23 +0200
Subject: [PATCH 28/48] chore(evaluation): delete test_stats.py

---
 tests/unit/evaluation/test_stats.py | 183 ----------------------------
 1 file changed, 183 deletions(-)
 delete mode 100644 tests/unit/evaluation/test_stats.py

diff --git a/tests/unit/evaluation/test_stats.py b/tests/unit/evaluation/test_stats.py
deleted file mode 100644
index 9523be8c..00000000
--- a/tests/unit/evaluation/test_stats.py
+++ /dev/null
@@ -1,183 +0,0 @@
-# Copyright 2026 Firefly Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Unit tests for evaluation.stats: aa_band, aggregate_grounding, left_skew_flag."""
-
-from __future__ import annotations
-
-import pytest
-
-from fireflyframework_agentic.evaluation.stats import (
-    aa_band,
-    aggregate_grounding,
-    left_skew_flag,
-)
-
-
-# ── aa_band ──────────────────────────────────────────────────────────────────
-
-
-def test_aa_band_two_identical_scores():
-    # Two identical scores produce zero pairwise delta.
-    assert aa_band([0.80, 0.80]) == 0.0
-
-
-def test_aa_band_two_different_scores():
-    # Single delta = |0.90 - 0.80| = 0.10; 95th percentile of one value is that value.
-    result = aa_band([0.80, 0.90])
-    assert abs(result - 0.10) < 1e-9
-
-
-def test_aa_band_three_scores_known_deltas():
-    # Scores: 0.70, 0.80, 0.90
-    # Pairwise deltas: |0.70-0.80|=0.10, |0.70-0.90|=0.20, |0.80-0.90|=0.10
-    # Sorted: [0.10, 0.10, 0.20] → 95th pct index = int(3 * 95 / 100) = 2 → 0.20
-    result = aa_band([0.70, 0.80, 0.90])
-    assert abs(result - 0.20) < 1e-9
-
-
-def test_aa_band_large_spread():
-    # Max delta in [0.0, 1.0] is 1.0.
-    result = aa_band([0.0, 1.0])
-    assert abs(result - 1.0) < 1e-9
-
-
-def test_aa_band_requires_at_least_two_scores():
-    with pytest.raises(ValueError, match="aa_band requires >= 2 reruns"):
-        aa_band([0.80])
-
-
-def test_aa_band_empty_raises():
-    with pytest.raises(ValueError, match="aa_band requires >= 2 reruns"):
-        aa_band([])
-
-
-def test_aa_band_custom_percentile():
-    # 50th percentile of [0.10, 0.10, 0.20] at idx=1 → 0.10.
-    result = aa_band([0.70, 0.80, 0.90], percentile=50)
-    assert abs(result - 0.10) < 1e-9
-
-
-def test_aa_band_returns_float():
-    result = aa_band([0.80, 0.85, 0.90])
-    assert isinstance(result, float)
-
-
-# ── aggregate_grounding ───────────────────────────────────────────────────────
-
-
-def test_aggregate_grounding_single_dict():
-    g = {"support_pct": 90.0, "unsupported_ids": ["ev-1"]}
-    result = aggregate_grounding([g])
-    assert result["support_pct"] == 90.0
-    assert result["unsupported_ids"] == ["ev-1"]
-    assert result["_aggregate_runs"] == 1
-
-
-def test_aggregate_grounding_mean_support_pct():
-    dicts = [
-        {"support_pct": 80.0, "unsupported_ids": []},
-        {"support_pct": 100.0, "unsupported_ids": []},
-    ]
-    result = aggregate_grounding(dicts)
-    assert result["support_pct"] == 90.0
-
-
-def test_aggregate_grounding_union_of_unsupported_ids():
-    dicts = [
-        {"support_pct": 90.0, "unsupported_ids": ["ev-1", "ev-2"]},
-        {"support_pct": 85.0, "unsupported_ids": ["ev-2", "ev-3"]},
-    ]
-    result = aggregate_grounding(dicts)
-    assert set(result["unsupported_ids"]) == {"ev-1", "ev-2", "ev-3"}
-
-
-def test_aggregate_grounding_union_sorted():
-    dicts = [
-        {"support_pct": 90.0, "unsupported_ids": ["ev-b"]},
-        {"support_pct": 90.0, "unsupported_ids": ["ev-a"]},
-    ]
-    result = aggregate_grounding(dicts)
-    assert result["unsupported_ids"] == ["ev-a", "ev-b"]
-
-
-def test_aggregate_grounding_empty_input():
-    result = aggregate_grounding([])
-    assert result["support_pct"] == 0.0
-    assert result["unsupported_ids"] == []
-
-
-def test_aggregate_grounding_records_run_count():
-    dicts = [
-        {"support_pct": 80.0, "unsupported_ids": []},
-        {"support_pct": 90.0, "unsupported_ids": []},
-        {"support_pct": 100.0, "unsupported_ids": []},
-    ]
-    result = aggregate_grounding(dicts)
-    assert result["_aggregate_runs"] == 3
-
-
-def test_aggregate_grounding_per_run_pct_recorded():
-    dicts = [
-        {"support_pct": 80.0, "unsupported_ids": []},
-        {"support_pct": 100.0, "unsupported_ids": []},
-    ]
-    result = aggregate_grounding(dicts)
-    assert result["_support_pct_per_run"] == [80.0, 100.0]
-
-
-def test_aggregate_grounding_missing_unsupported_ids_treated_as_empty():
-    dicts = [
-        {"support_pct": 90.0},  # no unsupported_ids key
-        {"support_pct": 80.0, "unsupported_ids": ["ev-1"]},
-    ]
-    result = aggregate_grounding(dicts)
-    assert result["unsupported_ids"] == ["ev-1"]
-
-
-# ── left_skew_flag ────────────────────────────────────────────────────────────
-
-
-def test_left_skew_flag_true_when_catastrophic_run():
-    # median([0.80, 0.80, 0.80]) = 0.80; min = 0.60 < 0.80 - 0.10 = 0.70.
-    assert left_skew_flag([0.60, 0.80, 0.80]) is True
-
-
-def test_left_skew_flag_false_when_min_close_to_median():
-    # median = 0.80; min = 0.75; 0.75 >= 0.80 - 0.10 = 0.70 → no flag.
-    assert left_skew_flag([0.75, 0.80, 0.85]) is False
-
-
-def test_left_skew_flag_false_when_all_equal():
-    assert left_skew_flag([0.85, 0.85, 0.85]) is False
-
-
-def test_left_skew_flag_boundary_just_above_threshold():
-    # min = 0.71, median = 0.80; 0.71 >= 0.80 - 0.10 = 0.70 → no flag.
-    assert left_skew_flag([0.71, 0.80, 0.80]) is False
-
-
-def test_left_skew_flag_single_score_always_false():
-    # A single score has no meaningful distribution; function returns False.
-    assert left_skew_flag([0.50]) is False
-
-
-def test_left_skew_flag_two_scores_with_large_gap():
-    # median([0.50, 0.90]) = 0.70; min = 0.50 < 0.70 - 0.10 = 0.60.
-    assert left_skew_flag([0.50, 0.90]) is True
-
-
-def test_left_skew_flag_returns_bool():
-    result = left_skew_flag([0.80, 0.85, 0.90])
-    assert isinstance(result, bool)

From 251605211cff33f0e89fcbe26f8aefdbfab0fa72 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 13:20:17 +0200
Subject: [PATCH 29/48] feat(evaluation): rewrite judge_client.py as async
 (httpx.AsyncClient)

---
 .../evaluation/judge_client.py                | 382 +++++-------------
 1 file changed, 91 insertions(+), 291 deletions(-)

diff --git a/fireflyframework_agentic/evaluation/judge_client.py b/fireflyframework_agentic/evaluation/judge_client.py
index 1af17f53..7f050d16 100644
--- a/fireflyframework_agentic/evaluation/judge_client.py
+++ b/fireflyframework_agentic/evaluation/judge_client.py
@@ -1,60 +1,24 @@
-# Copyright 2026 Firefly Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+"""Async LLM scoring client for judge metrics.
 
-"""Provider-agnostic LLM-as-a-Judge client for the G4 advisory gate.
-
-Zero new dependencies: stdlib (urllib.request, json, os, time, re) + numpy.
-The client is a thin POST wrapper over four chat providers (Anthropic, OpenAI,
-Azure OpenAI, Ollama) plus an Ollama embedder.  It is deliberately tolerant:
-chat_json extracts the FIRST JSON object from the model text (models wrap JSON
-in prose / code fences), and retries transient HTTP errors with backoff.
-
-This module is import-safe: importing it touches NO network and reads NO API
-key.  Keys are read lazily, per-call, only when a real request is made — so the
-judge tests can import and inject stubs without any secret present.
-
-Provider/model spec format: "<provider>:<model>", e.g. "anthropic:claude-sonnet-4-6",
-"openai:gpt-4o", "azure:gpt-4o", "ollama:llama3".  A bare model with no prefix is
-treated as provider "unknown" (see parse_model / same_provider).
+Thin httpx-based wrapper over Anthropic / OpenAI / Azure OpenAI / Ollama.
+Reads API keys lazily (per-call) from env so importing never requires secrets.
+Provider/model spec: "<provider>:<model>", e.g. "anthropic:claude-sonnet-4-6".
 """
 
 from __future__ import annotations
 
+import asyncio
 import json
 import os
 import re
-import time
-import urllib.error
-import urllib.request
 
-import numpy as np
+import httpx
 
-# Transient HTTP status codes worth retrying (rate limit + 5xx).
 _RETRY_STATUS = (429, 500, 502, 503, 504)
-
-# Hard cap on a honoured Retry-After sleep (a hostile header should not stall us).
 _MAX_RETRY_AFTER = 30.0
 
 
-def _env(name, default=None):
-    """Read an env var, stripping surrounding whitespace; empty-after-strip -> default.
-
-    Defensive against a ``.env`` value that arrives with a trailing ``\\r`` /
-    whitespace (CRLF), which would otherwise corrupt a request URL or header.
-    An unset OR blank value falls back to ``default`` so the existing
-    missing-key -> RuntimeError behaviour is preserved.
-    """
+def _env(name: str, default: str | None = None) -> str | None:
     value = os.environ.get(name)
     if value is None:
         return default
@@ -62,30 +26,8 @@ def _env(name, default=None):
     return value if value else default
 
 
-def _retry_delay(exc: urllib.error.HTTPError, attempt: int) -> float:
-    """Seconds to sleep before retrying an HTTPError.
-
-    On 429 honour the ``Retry-After`` header (capped at 30s) when it is present
-    and numeric; otherwise fall back to exponential backoff (2 ** attempt).
-    """
-    if exc.code == 429:
-        headers = getattr(exc, "headers", None)
-        retry_after = headers.get("retry-after") if headers is not None else None
-        if retry_after is not None:
-            try:
-                return min(float(retry_after), _MAX_RETRY_AFTER)
-            except (TypeError, ValueError):
-                pass
-    return 2.0**attempt
-
-
 def parse_model(spec: str) -> tuple[str, str]:
-    """Split a "provider:model" spec into (provider, model).
-
-    A bare spec with no ':' is returned as provider "unknown" with the whole
-    string as the model, e.g. "claude-sonnet-4-6" -> ("unknown", "claude-sonnet-4-6").
-    The provider is lower-cased; the model keeps its original case.
-    """
+    """Split "provider:model" -> (provider, model). Bare spec -> ("unknown", spec)."""
     spec = (spec or "").strip()
     if ":" not in spec:
         return "unknown", spec
@@ -94,28 +36,16 @@ def parse_model(spec: str) -> tuple[str, str]:
 
 
 def same_provider(pipeline_model: str, judge_model: str) -> bool:
-    """True iff both specs name the SAME known provider prefix.
-
-    A missing or "unknown" provider on either side -> not-same (False).  This is
-    the same-provider caveat signal: when the judge and the pipeline share a
-    provider the judged metrics are advisory (no cross-provider isolation).
-    """
-    p_provider, _ = parse_model(pipeline_model)
-    j_provider, _ = parse_model(judge_model)
-    if p_provider == "unknown" or j_provider == "unknown":
+    """True iff both specs share the same known provider prefix."""
+    p, _ = parse_model(pipeline_model)
+    j, _ = parse_model(judge_model)
+    if p == "unknown" or j == "unknown":
         return False
-    return p_provider == j_provider
+    return p == j
 
 
 def _first_json_object(text: str) -> dict:
-    """Extract and parse the FIRST balanced JSON object embedded in text.
-
-    Models wrap JSON in prose, preambles, or ```json code fences.  This scans
-    for the first '{' and walks the string tracking brace depth (string-aware,
-    so braces inside quoted values do not confuse the matcher) to find its
-    matching '}'.  Falls back to a greedy regex span if no balanced object is
-    found.  Raises ValueError when nothing parses.
-    """
+    """Extract the first balanced JSON object from text (handles prose/code-fence wrapping)."""
     if not text:
         raise ValueError("empty model response")
 
@@ -165,38 +95,12 @@ def _first_json_object(text: str) -> dict:
     raise ValueError("no JSON object found in model response")
 
 
-def _http_post_json(url: str, headers: dict, body: dict, timeout: int) -> dict:
-    """POST a JSON body and return the parsed JSON response (single attempt)."""
-    data = json.dumps(body).encode("utf-8")
-    req_headers = {"content-type": "application/json", **headers}
-    req = urllib.request.Request(url, data=data, headers=req_headers, method="POST")
-    with urllib.request.urlopen(req, timeout=timeout) as resp:
-        return json.loads(resp.read().decode("utf-8"))
-
-
-def _extract_openai_text(resp: dict) -> str:
-    """Pull the assistant text from an OpenAI/Azure chat-completions response.
-
-    Guards an empty ``choices`` list and a null ``message.content`` and raises a
-    descriptive RuntimeError (not a KeyError) when no text is present, so the
-    judge layer records a clean dropped-vote reason instead of a stack trace.
-    """
-    choices = resp.get("choices") or []
-    if choices:
-        text = (choices[0].get("message") or {}).get("content")
-        if text:
-            return text
-    raise RuntimeError(f"judge returned no text: {resp}")
-
-
 class JudgeClient:
-    """Minimal multi-provider chat client returning parsed JSON dicts.
+    """Async multi-provider chat client returning parsed JSON dicts.
 
     Dispatch is by the provider prefix of the model spec.  temperature is pinned
-    to 0.0 for deterministic verdicts.  Transient HTTP errors (429/5xx) and URL
-    errors are retried up to max_retries: a 429 honours the ``Retry-After``
-    header (capped at 30s) when present, otherwise backoff is exponential
-    (2 ** attempt seconds).
+    to 0.0 for deterministic verdicts.  Transient HTTP errors (429/5xx) and network
+    errors are retried up to max_retries with backoff.
 
     The API key / endpoint env vars are read lazily inside chat_json, so
     constructing a JudgeClient never requires a secret.
@@ -208,48 +112,49 @@ def __init__(self, model: str, timeout: int = 120, max_retries: int = 3) -> None
         self.timeout = timeout
         self.max_retries = max_retries
 
-    def chat_json(self, system: str, user: str, max_tokens: int = 1024) -> dict:
+    async def chat_json(self, system: str, user: str, max_tokens: int = 1024) -> dict:
         """Send (system, user) to the provider and parse the first JSON object.
 
         Raises on exhausted retries / unknown provider / unparseable output.
-        The judge module wraps every call in try/except, so a raise here becomes
-        a dropped vote rather than a crash.
         """
         last_exc: Exception | None = None
         for attempt in range(self.max_retries):
             try:
-                text = self._dispatch(system, user, max_tokens)
-                return _first_json_object(text)
-            except urllib.error.HTTPError as exc:
+                if self.provider == "anthropic":
+                    return await self._anthropic(system, user, max_tokens)
+                if self.provider == "openai":
+                    return await self._openai(system, user, max_tokens)
+                if self.provider == "azure":
+                    return await self._azure(system, user, max_tokens)
+                if self.provider == "ollama":
+                    return await self._ollama(system, user, max_tokens)
+                raise ValueError(
+                    f"unknown judge provider {self.provider!r} in {self.model_spec!r}; "
+                    "use anthropic:/openai:/azure:/ollama:"
+                )
+            except httpx.HTTPStatusError as exc:
                 last_exc = exc
-                if exc.code not in _RETRY_STATUS or attempt == self.max_retries - 1:
+                if exc.response.status_code not in _RETRY_STATUS or attempt == self.max_retries - 1:
                     raise
-                time.sleep(_retry_delay(exc, attempt))
-            except (urllib.error.URLError, TimeoutError, ConnectionError) as exc:
+                retry_after_header = exc.response.headers.get("retry-after")
+                if retry_after_header is not None:
+                    try:
+                        delay = min(float(retry_after_header), _MAX_RETRY_AFTER)
+                    except (TypeError, ValueError):
+                        delay = 2.0**attempt
+                else:
+                    delay = 2.0**attempt
+                await asyncio.sleep(delay)
+            except httpx.RequestError as exc:
                 last_exc = exc
                 if attempt == self.max_retries - 1:
                     raise
-                time.sleep(2**attempt)
+                await asyncio.sleep(2.0**attempt)  # exponential, same as the status-error fallback
         if last_exc is not None:
             raise last_exc
         raise RuntimeError("chat_json exhausted retries without a response")
 
-    def _dispatch(self, system: str, user: str, max_tokens: int) -> str:
-        """Route to the per-provider call and return the raw model text."""
-        if self.provider == "anthropic":
-            return self._anthropic(system, user, max_tokens)
-        if self.provider == "openai":
-            return self._openai(system, user, max_tokens)
-        if self.provider == "azure":
-            return self._azure(system, user, max_tokens)
-        if self.provider == "ollama":
-            return self._ollama(system, user, max_tokens)
-        raise ValueError(
-            f"unknown judge provider {self.provider!r} in {self.model_spec!r}; "
-            "use anthropic:/openai:/azure:/ollama:"
-        )
-
-    def _anthropic(self, system: str, user: str, max_tokens: int) -> str:
+    async def _anthropic(self, system: str, user: str, max_tokens: int) -> dict:
         api_key = _env("ANTHROPIC_API_KEY")
         if not api_key:
             raise RuntimeError("ANTHROPIC_API_KEY not set")
@@ -260,16 +165,21 @@ def _anthropic(self, system: str, user: str, max_tokens: int) -> str:
             "system": system,
             "messages": [{"role": "user", "content": user}],
         }
-        headers = {"x-api-key": api_key, "anthropic-version": "2023-06-01"}
-        resp = _http_post_json("https://api.anthropic.com/v1/messages", headers, body, self.timeout)
-        text = next(
-            (b.get("text") for b in resp.get("content", []) if b.get("type") == "text"), None
-        )
+        headers = {
+            "x-api-key": api_key,
+            "anthropic-version": "2023-06-01",
+            "content-type": "application/json",
+        }
+        async with httpx.AsyncClient(timeout=self.timeout) as client:
+            resp = await client.post("https://api.anthropic.com/v1/messages", json=body, headers=headers)
+            resp.raise_for_status()
+            data = resp.json()
+        text = next((b.get("text") for b in data.get("content", []) if b.get("type") == "text"), None)
         if not text:
-            raise RuntimeError(f"judge returned no text: {resp}")
-        return text
+            raise RuntimeError(f"judge returned no text: {data}")
+        return _first_json_object(text)
 
-    def _openai(self, system: str, user: str, max_tokens: int) -> str:
+    async def _openai(self, system: str, user: str, max_tokens: int) -> dict:
         api_key = _env("OPENAI_API_KEY")
         if not api_key:
             raise RuntimeError("OPENAI_API_KEY not set")
@@ -282,25 +192,27 @@ def _openai(self, system: str, user: str, max_tokens: int) -> str:
                 {"role": "user", "content": user},
             ],
         }
-        headers = {"Authorization": f"Bearer {api_key}"}
-        resp = _http_post_json(
-            "https://api.openai.com/v1/chat/completions", headers, body, self.timeout
-        )
-        return _extract_openai_text(resp)
+        headers = {"Authorization": f"Bearer {api_key}", "content-type": "application/json"}
+        async with httpx.AsyncClient(timeout=self.timeout) as client:
+            resp = await client.post("https://api.openai.com/v1/chat/completions", json=body, headers=headers)
+            resp.raise_for_status()
+            data = resp.json()
+        choices = data.get("choices") or []
+        if choices:
+            text = (choices[0].get("message") or {}).get("content")
+            if text:
+                return _first_json_object(text)
+        raise RuntimeError(f"judge returned no text: {data}")
 
-    def _azure(self, system: str, user: str, max_tokens: int) -> str:
+    async def _azure(self, system: str, user: str, max_tokens: int) -> dict:
         endpoint = _env("AZURE_OPENAI_ENDPOINT")
         api_key = _env("AZURE_OPENAI_API_KEY")
         if not endpoint:
             raise RuntimeError("AZURE_OPENAI_ENDPOINT not set")
         if not api_key:
             raise RuntimeError("AZURE_OPENAI_API_KEY not set")
-        api_version = _env("AZURE_OPENAI_API_VERSION") or "2024-06-01"
-        # Azure deployment lives in the URL path, not the JSON body.
-        url = (
-            f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/chat/completions"
-            f"?api-version={api_version}"
-        )
+        api_version = _env("AZURE_OPENAI_API_VERSION") or "2024-02-01"
+        url = f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/chat/completions?api-version={api_version}"
         body = {
             "max_tokens": max_tokens,
             "temperature": 0.0,
@@ -309,146 +221,34 @@ def _azure(self, system: str, user: str, max_tokens: int) -> str:
                 {"role": "user", "content": user},
             ],
         }
-        headers = {"api-key": api_key}
-        resp = _http_post_json(url, headers, body, self.timeout)
-        return _extract_openai_text(resp)
+        headers = {"api-key": api_key, "content-type": "application/json"}
+        async with httpx.AsyncClient(timeout=self.timeout) as client:
+            resp = await client.post(url, json=body, headers=headers)
+            resp.raise_for_status()
+            data = resp.json()
+        choices = data.get("choices") or []
+        if choices:
+            text = (choices[0].get("message") or {}).get("content")
+            if text:
+                return _first_json_object(text)
+        raise RuntimeError(f"judge returned no text: {data}")
 
-    def _ollama(self, system: str, user: str, max_tokens: int) -> str:
+    async def _ollama(self, system: str, user: str, max_tokens: int) -> dict:  # noqa: ARG002
         host = _env("OLLAMA_HOST") or "http://localhost:11434"
         body = {
             "model": self.model,
             "stream": False,
-            "options": {"temperature": 0.0, "num_predict": max_tokens},
+            "options": {"temperature": 0.0},
             "messages": [
                 {"role": "system", "content": system},
                 {"role": "user", "content": user},
             ],
         }
-        resp = _http_post_json(f"{host.rstrip('/')}/api/chat", {}, body, self.timeout)
-        text = (resp.get("message") or {}).get("content")
+        async with httpx.AsyncClient(timeout=self.timeout) as client:
+            resp = await client.post(f"{host.rstrip('/')}/api/chat", json=body)
+            resp.raise_for_status()
+            data = resp.json()
+        text = (data.get("message") or {}).get("content")
         if not text:
-            raise RuntimeError(f"judge returned no text: {resp}")
-        return text
-
-
-class OpenAIEmbedder:
-    """OpenAI embeddings client over /v1/embeddings.
-
-    Reads OPENAI_API_KEY from the environment.  Default model: text-embedding-3-small.
-    """
-
-    def __init__(self, model: str = "text-embedding-3-small", timeout: int = 60) -> None:
-        self.model = model
-        self.timeout = timeout
-
-    def embed(self, texts: list[str]) -> np.ndarray:
-        api_key = _env("OPENAI_API_KEY")
-        if not api_key:
-            raise RuntimeError("OPENAI_API_KEY not set")
-        headers = {"Authorization": f"Bearer {api_key}"}
-        body = {"model": self.model, "input": texts}
-        resp = _http_post_json("https://api.openai.com/v1/embeddings", headers, body, self.timeout)
-        data = resp.get("data", [])
-        vectors = [item["embedding"] for item in sorted(data, key=lambda x: x["index"])]
-        return np.asarray(vectors, dtype=np.float32)
-
-
-class AzureOpenAIEmbedder:
-    """Azure OpenAI embeddings client.
-
-    Reads AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_KEY, and optionally
-    AZURE_OPENAI_API_VERSION from the environment.  The model name is the
-    deployment name.  Default model: text-embedding-3-small.
-    """
-
-    def __init__(self, model: str = "text-embedding-3-small", timeout: int = 60) -> None:
-        self.model = model
-        self.timeout = timeout
-
-    def embed(self, texts: list[str]) -> np.ndarray:
-        endpoint = _env("AZURE_OPENAI_ENDPOINT")
-        api_key = _env("AZURE_OPENAI_API_KEY")
-        if not endpoint:
-            raise RuntimeError("AZURE_OPENAI_ENDPOINT not set")
-        if not api_key:
-            raise RuntimeError("AZURE_OPENAI_API_KEY not set")
-        api_version = _env("AZURE_OPENAI_API_VERSION") or "2024-06-01"
-        url = (
-            f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/embeddings"
-            f"?api-version={api_version}"
-        )
-        headers = {"api-key": api_key}
-        vectors = self._embed_with_split(texts, url, headers)
-        return np.asarray(vectors, dtype=np.float32)
-
-    def _embed_with_split(self, texts: list[str], url: str, headers: dict) -> list[list[float]]:
-        """Send texts in one request; on HTTP 400 split in half and retry each half."""
-        try:
-            resp = _http_post_json(url, headers, {"input": texts}, self.timeout)
-            data = resp.get("data", [])
-            return [item["embedding"] for item in sorted(data, key=lambda x: x["index"])]
-        except urllib.error.HTTPError as exc:
-            if exc.code == 400 and len(texts) > 1:
-                mid = len(texts) // 2
-                left = self._embed_with_split(texts[:mid], url, headers)
-                right = self._embed_with_split(texts[mid:], url, headers)
-                return left + right
-            raise
-
-
-class OllamaEmbedder:
-    """Local Ollama embedding client (default model bge-m3) over /api/embeddings.
-
-    Posts one prompt per call (the stable single-prompt form) and stacks the
-    returned vectors into a 2-D numpy array.  Constructing it touches no network;
-    the host is resolved from $OLLAMA_HOST at call time.
-    """
-
-    def __init__(self, model: str = "bge-m3", host: str | None = None, timeout: int = 60) -> None:
-        self.model = model
-        self.host = (host or _env("OLLAMA_HOST") or "http://localhost:11434").rstrip("/")
-        self.timeout = timeout
-
-    def embed(self, texts: list[str]) -> np.ndarray:
-        """Embed a list of strings -> float32 ndarray of shape (len(texts), dim)."""
-        vectors: list[list[float]] = []
-        for text in texts:
-            body = {"model": self.model, "prompt": text}
-            resp = _http_post_json(f"{self.host}/api/embeddings", {}, body, self.timeout)
-            vectors.append(resp["embedding"])
-        return np.asarray(vectors, dtype=np.float32)
-
-
-def build_embedder(spec: str):
-    """Return an ``embed_fn(list[str]) -> np.ndarray`` for an embedder spec.
-
-    Dispatch is on the provider prefix of a "<provider>:<model>" spec:
-    - "ollama" / "ollama:<model>" -> OllamaEmbedder(model or "bge-m3").embed.
-    - a bare "<model>" with no ':' -> treated as an Ollama model.
-    - any other provider -> NotImplementedError (the extension point).
-
-    Add a new backend by adding a branch here.
-    """
-    if (spec or "").strip() == "ollama":  # bare provider, no model -> default model
-        return OllamaEmbedder("bge-m3").embed
-    provider, model = parse_model(spec)
-    if provider in ("unknown", "ollama"):  # bare "<model>" or "ollama:<model>"
-        return OllamaEmbedder(model or "bge-m3").embed
-    if provider == "openai":
-        return OpenAIEmbedder(model or "text-embedding-3-small").embed
-    if provider == "azure":
-        return AzureOpenAIEmbedder(model or "text-embedding-3-small").embed
-    raise NotImplementedError(
-        f"embedder backend {provider!r} not implemented yet; add it in build_embedder()"
-    )
-
-
-def cosine(a, b) -> float:
-    """Cosine similarity between two 1-D vectors; 0.0 if either is the zero vector."""
-    a = np.asarray(a, dtype=np.float64).ravel()
-    b = np.asarray(b, dtype=np.float64).ravel()
-    na = float(np.linalg.norm(a))
-    nb = float(np.linalg.norm(b))
-    if na == 0.0 or nb == 0.0:
-        return 0.0
-    return float(np.dot(a, b) / (na * nb))
+            raise RuntimeError(f"judge returned no text: {data}")
+        return _first_json_object(text)

From 5609ab67d43af810c0bf29b8f7b9f7100d50de15 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 13:24:09 +0200
Subject: [PATCH 30/48] =?UTF-8?q?feat(evaluation):=20rewrite=20judge.py=20?=
 =?UTF-8?q?=E2=80=94=20async=20metrics=20+=20EvalContext=20+=20flycanon=20?=
 =?UTF-8?q?+=20RAGAS?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 fireflyframework_agentic/evaluation/judge.py | 769 ++++++++++---------
 1 file changed, 415 insertions(+), 354 deletions(-)

diff --git a/fireflyframework_agentic/evaluation/judge.py b/fireflyframework_agentic/evaluation/judge.py
index a347c8e1..9f24dc26 100644
--- a/fireflyframework_agentic/evaluation/judge.py
+++ b/fireflyframework_agentic/evaluation/judge.py
@@ -1,61 +1,48 @@
-# Copyright 2026 Firefly Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""G4 — LLM-as-a-Judge: an opt-in, NON-BLOCKING, NON-DETERMINISTIC advisory gate.
-
-G4 NEVER affects the PROMOTE/HOLD verdict and NEVER raises into the caller.
-run_judge() wraps every metric in try/except; a failing metric appends to
-report.errors and the run continues (best-effort).  The result is an
-AdvisoryReport, NOT a GateResult — it is carried separately so it can never
-enter verdict() or the Skipped tuple (see scorecard / verdict_unaffected_note).
-
-Three families of metric (matching the flyradar contracts):
-- [D] DETERMINISTIC — pure python, no LLM, printed even when the judge is off:
-      source_coverage, excerpt_fill_rate.
-- [E] EMBEDDING — needs an embed_fn (local Ollama BGE by default):
-      semantic_recovery (context recall).
-- [J] JUDGE — needs a chat_fn(system, user) -> dict; each [J] metric instructs
-      the model to reply with ONLY JSON: faithfulness, numeric_temporal_fidelity,
-      citation_relevance, nc_semantic_precision, fabricated_entity, contradiction,
-      open_gap, actionability, severity_calibration, answer_relevancy,
-      comparative_vs_champion.
-
-Aggregation follows the flycanon custom-judge design: run each [J] metric `runs`
-times and take the MEDIAN of its numeric scores (robust to an outlier vote).
-
-Zero new dependencies: stdlib (json, statistics) + numpy.  All imports at top.
-calibrated is ALWAYS False for now (LLM-as-a-Judge calibration is §14, future work).
+"""Evaluation judge — async metrics for flyradar and flycanon pipelines.
+
+Every metric: async def metric_name(item: dict, ctx: EvalContext) -> dict | float | None
+
+Flyradar item keys: findings, evidence_index, process_graph, proposed_actions,
+  workspace, reports, lexical_missed_ids, nc_items, champion
+Flycanon item keys: question, answer, reference, contexts
 """
 
 from __future__ import annotations
 
-import concurrent.futures
+import asyncio
+import math
+import os
 import statistics
+from collections.abc import Awaitable, Callable
 from dataclasses import dataclass, field
 
-import numpy as np
+from pydantic import BaseModel, ConfigDict
 
-from fireflyframework_agentic.evaluation.judge_client import (
-    JudgeClient,
-    OllamaEmbedder,
-    cosine,
-    same_provider,
-)
-from fireflyframework_agentic.evaluation.matcher import source_stem
+from fireflyframework_agentic.embeddings.providers.ollama import OllamaEmbedder
+from fireflyframework_agentic.embeddings.similarity import cosine_similarity
+from fireflyframework_agentic.evaluation.judge_client import JudgeClient, same_provider
+
+Metric = Callable[["dict", "EvalContext"], Awaitable["dict | float | None"]]
 
 SYSTEM = "You are a meticulous evaluator of a process-mining discovery report. Return ONLY a JSON object."
 
+SYSTEM_RAG = "You are an evaluator of a RAG system's answers. Return ONLY a JSON object."
+
+RUBRIC = (
+    "Score the ANSWER on two metrics:\n"
+    "- contains_answer (0.0-1.0): Does the answer contain the correct information from the REFERENCE?\n"
+    "- addresses_question (0.0-1.0): Does the answer directly address what the QUESTION is asking?\n"
+    'Reply with ONLY {"contains_answer": <float>, "addresses_question": <float>}.'
+)
+
+
+class EvalContext(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    client: JudgeClient
+    embedder: OllamaEmbedder | None = None
+    runs: int = 3
+
 
 @dataclass
 class AdvisoryReport:
@@ -68,7 +55,7 @@ class AdvisoryReport:
 
     judge_model: str
     same_provider_caveat: bool
-    calibrated: bool  # ALWAYS False for now (§14)
+    calibrated: bool  # ALWAYS False for now
     runs: int
     metrics: dict = field(default_factory=dict)
     details: dict = field(default_factory=dict)
@@ -78,8 +65,8 @@ class AdvisoryReport:
 # ── shared accessors ───────────────────────────────────────────────────────────
 
 
-def _evidence_index(result: dict) -> dict[str, dict]:
-    return {ev.get("id"): ev for ev in result.get("evidence_index", []) if ev.get("id")}
+def _evidence_index(item: dict) -> dict[str, dict]:
+    return {ev.get("id"): ev for ev in item.get("evidence_index", []) if ev.get("id")}
 
 
 def _cited_excerpts(finding: dict, evidence_index: dict[str, dict]) -> list[str]:
@@ -94,143 +81,116 @@ def _cited_excerpts(finding: dict, evidence_index: dict[str, dict]) -> list[str]
     return out
 
 
-def _output_text(result: dict) -> str:
+def _output_text(item: dict) -> str:
     """All free text the model emitted: finding titles+descriptions + reports."""
     parts: list[str] = []
-    for f in result.get("findings", []):
+    for f in item.get("findings", []):
         parts.append(f.get("title", ""))
         parts.append(f.get("description", ""))
-    for r in result.get("reports", []):
+    for r in item.get("reports", []):
         parts.append(str(r))
     return "\n".join(p for p in parts if p)
 
 
-def _workspace_intention(result: dict) -> str:
-    ws = result.get("workspace") or {}
+def _workspace_intention(item: dict) -> str:
+    ws = item.get("workspace") or {}
     return f"{ws.get('name', '')}\n{ws.get('description', '')}".strip()
 
 
 def _coerce_float(value, default=None):
-    """Coerce a model-returned number/numeric-string to float; total (never raises).
-
-    Returns ``default`` (None) on junk so one malformed vote drops that single
-    vote instead of discarding the whole metric.
-    """
+    """Coerce a model-returned number/numeric-string to float; total (never raises)."""
     try:
         return float(value)
     except (TypeError, ValueError):
         return default
 
 
-def _map_chat(chat_fn, prompts, workers=1):
-    """Run a list of (system, user) chat prompts, returning ordered result dicts.
+def _source_stem(locator: str) -> str:
+    """Return the part before the first '#', or the full string if no '#'."""
+    idx = locator.find("#")
+    return locator[:idx] if idx != -1 else locator
 
-    ``workers <= 1`` calls ``chat_fn`` SEQUENTIALLY — byte-for-byte identical to
-    the in-line loops it replaces, INCLUDING letting a raise propagate (so
-    run_judge's per-metric try/except still drops that whole metric, the
-    behaviour the suite locks in).
 
-    ``workers >= 2`` fans the calls out across a ThreadPoolExecutor while
-    PRESERVING input order in the returned list.  Concurrency cannot let one
-    raising future poison the batch, so in that path a raising call's slot
-    becomes ``{}`` — the metric's aggregation degrades for that one vote but
-    never raises (the same best-effort contract as run_judge).
-    """
-    prompts = list(prompts)
-    if workers <= 1:
-        return [chat_fn(system, user) for system, user in prompts]
-
-    results: list[dict] = [{} for _ in prompts]
-    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
-        futures = {
-            executor.submit(chat_fn, system, user): idx
-            for idx, (system, user) in enumerate(prompts)
-        }
-        for future in concurrent.futures.as_completed(futures):
-            idx = futures[future]
-            try:
-                results[idx] = future.result()
-            except Exception:  # best-effort: a dropped vote, never a raise
-                results[idx] = {}
-    return results
+async def _gather_chat(chat_fn, prompts: list[tuple[str, str]]) -> list[dict]:
+    """Run (system, user) prompts concurrently, in input order; a raised or non-dict result becomes {}."""
+    results = await asyncio.gather(*[chat_fn(s, u) for s, u in prompts], return_exceptions=True)
+    return [r if isinstance(r, dict) else {} for r in results]
 
 
 # ── [D] DETERMINISTIC — no LLM, always available ────────────────────────────────
 
 
-def source_coverage(result: dict) -> dict:
+async def source_coverage(item: dict, ctx: EvalContext) -> dict:  # noqa: ARG001
     """Distinct source documents cited by >=1 finding vs all source documents.
 
     Returns {cited, total, orphaned} where orphaned is the sorted list of
     source stems present in evidence_index but cited by no finding.
     """
-    evidence_index = _evidence_index(result)
-    all_stems = {
-        source_stem(ev.get("locator", ""))
-        for ev in result.get("evidence_index", [])
-        if ev.get("locator")
-    }
+    ev_idx = _evidence_index(item)
+    all_stems = {_source_stem(ev.get("locator", "")) for ev in item.get("evidence_index", []) if ev.get("locator")}
     cited_stems: set[str] = set()
-    for f in result.get("findings", []):
+    for f in item.get("findings", []):
         for ref in f.get("evidence_refs", []):
-            ev = evidence_index.get(ref.get("evidence_id", ""))
+            ev = ev_idx.get(ref.get("evidence_id", ""))
             if ev and ev.get("locator"):
-                cited_stems.add(source_stem(ev["locator"]))
+                cited_stems.add(_source_stem(ev["locator"]))
     cited_stems &= all_stems
     orphaned = sorted(all_stems - cited_stems)
     return {"cited": len(cited_stems), "total": len(all_stems), "orphaned": orphaned}
 
 
-def excerpt_fill_rate(result: dict) -> dict:
+async def excerpt_fill_rate(item: dict, ctx: EvalContext) -> dict:  # noqa: ARG001
     """Fraction of evidence_index entries with a non-empty excerpt.
 
-    Returns {populated, total}.  This is the signal behind older runs' low G3
-    grounding: empty excerpts cannot ground anything.
+    Returns {populated, total}.
     """
-    entries = result.get("evidence_index", [])
+    entries = item.get("evidence_index", [])
     populated = sum(1 for ev in entries if (ev.get("excerpt") or "").strip())
     return {"populated": populated, "total": len(entries)}
 
 
-# ── [E] EMBEDDING — needs embed_fn ───────────────────────────────────────────────
-
+# ── [E] EMBEDDING — needs embedder ───────────────────────────────────────────────
 
-def semantic_recovery(
-    result: dict,
-    registry,
-    lexical_missed_ids: list[str],
-    embed_fn,
-    tau: float = 0.70,
-) -> dict:
-    """Context-recall: recover G2 lexical misses by embedding similarity.
 
-    For each registry item flagged a LEXICAL MISS by G2, embed its
-    description+keywords and take the max cosine against the embeddings of every
-    finding description (and their cited excerpts).  If max cosine >= tau the
-    item is counted semantically present (recovered).
+async def semantic_recovery(item: dict, ctx: EvalContext, tau: float = 0.70) -> dict | None:
+    """Context-recall: recover lexical misses by embedding similarity.
 
-    recovered_recall = (lexical_hits + recovered) / scored_denominator, where
-    the scored denominator is the count of non-NC items scored by G2 (real
-    items, matching G2's recall denominator family).  Returns the lexical recall,
-    the recovered recall, the recovered item list (with cosine), and tau.
+    Reads item["lexical_missed_ids"] (list of str).
+    Returns None if ctx.embedder is None.
     """
+    if ctx.embedder is None:
+        return None
+
+    lexical_missed_ids: list[str] = item.get("lexical_missed_ids", [])
     missed = set(lexical_missed_ids or [])
-    real_items = registry.real_items
-    scored_items = [i for i in real_items if i.tier != "L3"]
-    denom = len(scored_items) or 1
-    lexical_hits = sum(1 for i in scored_items if i.id not in missed)
 
-    # Candidate texts the findings actually surfaced.
-    evidence_index = _evidence_index(result)
+    # This metric takes no registry, so the findings stand in for the scored items:
+    # the recall denominator is the finding count and a finding is a lexical hit when
+    # its id is absent from lexical_missed_ids.  nc_items is not read by this metric.
+    # Candidate surface: every finding description plus the excerpts that finding cites.
+    # NOTE(review): a missed finding's own description is also in the candidate surface,
+    # so it presumably self-matches and is always "recovered" -- confirm this is intended.
+    ev_idx = _evidence_index(item)
     candidate_texts: list[str] = []
-    for f in result.get("findings", []):
+    for f in item.get("findings", []):
         desc = f.get("description", "")
         if desc:
             candidate_texts.append(desc)
-        candidate_texts.extend(_cited_excerpts(f, evidence_index))
+        candidate_texts.extend(_cited_excerpts(f, ev_idx))
+
+    # lexical_missed_ids carries only ids, so the text to embed is looked up on the findings;
+    # when no missed finding has a description, the early return below reports lexical recall only.
+    all_findings = item.get("findings", [])
+    denom = max(len(all_findings), 1)
+    lexical_hits = sum(1 for f in all_findings if f.get("id") not in missed)
+
+    missed_descs: list[tuple[str, str]] = [
+        (f.get("id", ""), f.get("description", ""))
+        for f in all_findings
+        if f.get("id") in missed and f.get("description")
+    ]
 
-    missed_items = [i for i in scored_items if i.id in missed]
-    if not missed_items or not candidate_texts:
+    if not missed_descs or not candidate_texts:
         recovered_recall = lexical_hits / denom
         return {
             "lexical_recall": round(lexical_hits / denom, 4),
@@ -240,15 +200,15 @@ def semantic_recovery(
             "scored_denominator": denom,
         }
 
-    item_texts = [f"{i.description} {' '.join(i.keywords)}".strip() for i in missed_items]
-    item_vecs = np.asarray(embed_fn(item_texts), dtype=np.float64)
-    cand_vecs = np.asarray(embed_fn(candidate_texts), dtype=np.float64)
+    item_texts = [desc for _fid, desc in missed_descs]
+    item_vecs = await ctx.embedder._embed_batch(item_texts)
+    cand_vecs = await ctx.embedder._embed_batch(candidate_texts)
 
     recovered: list[dict] = []
-    for item, ivec in zip(missed_items, item_vecs):
-        best = max((cosine(ivec, cvec) for cvec in cand_vecs), default=0.0)
+    for (fid, _desc), ivec in zip(missed_descs, item_vecs, strict=False):
+        best = max((cosine_similarity(ivec, cvec) for cvec in cand_vecs), default=0.0)
         if best >= tau:
-            recovered.append({"id": item.id, "cosine": round(best, 4)})
+            recovered.append({"id": fid, "cosine": round(best, 4)})
 
     recovered_recall = (lexical_hits + len(recovered)) / denom
     return {
@@ -263,16 +223,14 @@ def semantic_recovery(
 # ── [J] JUDGE — needs chat_fn(system, user) -> dict ──────────────────────────────
 
 
-def faithfulness(result: dict, chat_fn, *, workers: int = 1) -> dict:
+async def faithfulness(item: dict, ctx: EvalContext) -> dict:
     """Entailment: does each finding's cited evidence SUPPORT its claim?
 
-    Per (finding, cited-excerpts) pair, ask SUPPORTED / NOT_SUPPORTED.  Returns
-    {supported, total, unsupported_ids}.  Findings with no cited evidence are
-    counted as not-supported (nothing to entail against).
+    Returns {supported, total, unsupported_ids}.
     """
-    evidence_index = _evidence_index(result)
-    findings = result.get("findings", [])
-    cited = [(f, _cited_excerpts(f, evidence_index)) for f in findings]
+    ev_idx = _evidence_index(item)
+    findings = item.get("findings", [])
+    cited = [(f, _cited_excerpts(f, ev_idx)) for f in findings]
     prompts = [
         (
             SYSTEM,
@@ -284,7 +242,7 @@ def faithfulness(result: dict, chat_fn, *, workers: int = 1) -> dict:
         for f, excerpts in cited
         if excerpts
     ]
-    answers = iter(_map_chat(chat_fn, prompts, workers))
+    answers = iter(await _gather_chat(ctx.client.chat_json, prompts))
     supported = 0
     unsupported_ids: list[str] = []
     for f, excerpts in cited:
@@ -300,18 +258,13 @@ def faithfulness(result: dict, chat_fn, *, workers: int = 1) -> dict:
     return {"supported": supported, "total": len(findings), "unsupported_ids": unsupported_ids}
 
 
-def numeric_temporal_fidelity(result: dict, chat_fn, *, workers: int = 1) -> dict:
+async def numeric_temporal_fidelity(item: dict, ctx: EvalContext) -> dict:
     """Flag numbers/dates asserted in a finding that do NOT match its evidence.
 
-    Closes the 45-days-vs-3-days gap.  Returns {mismatches: [{finding_id, value,
-    source}], count}.
+    Returns {mismatches: [{finding_id, value, source}], count}.
     """
-    evidence_index = _evidence_index(result)
-    scored = [
-        (f, excerpts)
-        for f in result.get("findings", [])
-        if (excerpts := _cited_excerpts(f, evidence_index))
-    ]
+    ev_idx = _evidence_index(item)
+    scored = [(f, excerpts) for f in item.get("findings", []) if (excerpts := _cited_excerpts(f, ev_idx))]
     prompts = [
         (
             SYSTEM,
@@ -324,9 +277,9 @@ def numeric_temporal_fidelity(result: dict, chat_fn, *, workers: int = 1) -> dic
         )
         for f, excerpts in scored
     ]
-    answers = _map_chat(chat_fn, prompts, workers)
+    answers = await _gather_chat(ctx.client.chat_json, prompts)
     mismatches: list[dict] = []
-    for (f, _excerpts), answer in zip(scored, answers):
+    for (f, _excerpts), answer in zip(scored, answers, strict=False):
         for m in answer.get("mismatches", []) or []:
             mismatches.append(
                 {
@@ -338,20 +291,17 @@ def numeric_temporal_fidelity(result: dict, chat_fn, *, workers: int = 1) -> dic
     return {"mismatches": mismatches, "count": len(mismatches)}
 
 
-def citation_relevance(result: dict, chat_fn, *, workers: int = 1) -> dict:
+async def citation_relevance(item: dict, ctx: EvalContext) -> dict:
     """Context precision: fraction of cited passages actually relevant to the claim.
 
-    Per evidence_ref, ask yes/no relevance.  precision = relevant / total_refs.
-    Returns {precision, relevant, total}; when total == 0 (no cited passages with
-    excerpts) precision is None — the kept ``total`` lets a reader tell "perfect"
-    apart from "nothing to score".
+    Returns {precision, relevant, total}.
     """
-    evidence_index = _evidence_index(result)
+    ev_idx = _evidence_index(item)
     prompts: list[tuple[str, str]] = []
-    for f in result.get("findings", []):
+    for f in item.get("findings", []):
         desc = f.get("description", "")
         for ref in f.get("evidence_refs", []):
-            ev = evidence_index.get(ref.get("evidence_id", ""))
+            ev = ev_idx.get(ref.get("evidence_id", ""))
             if not ev:
                 continue
             excerpt = ev.get("excerpt") or ""
@@ -366,7 +316,7 @@ def citation_relevance(result: dict, chat_fn, *, workers: int = 1) -> dict:
                     f"CITED PASSAGE: {excerpt}",
                 )
             )
-    answers = _map_chat(chat_fn, prompts, workers)
+    answers = await _gather_chat(ctx.client.chat_json, prompts)
     total = len(prompts)
     relevant = sum(1 for a in answers if str(a.get("relevant", "")).lower() == "yes")
     if not total:
@@ -374,43 +324,40 @@ def citation_relevance(result: dict, chat_fn, *, workers: int = 1) -> dict:
     return {"precision": round(relevant / total, 4), "relevant": relevant, "total": total}
 
 
-def nc_semantic_precision(result: dict, registry, chat_fn, *, workers: int = 1) -> dict:
+async def nc_semantic_precision(item: dict, ctx: EvalContext) -> dict:
     """Count negative-control falsehoods the output asserts or endorses.
 
-    Per NC item, ask yes/no whether the output endorses the false statement.
-    Returns {asserted, total, asserted_ids}.  The semantic counterpart to the
-    inert deterministic NC check.
+    Reads item["nc_items"] as list of {"id": ..., "description": ...} dicts.
+    Returns {asserted, total, asserted_ids}.
     """
-    output_text = _output_text(result)
-    nc_items = registry.nc_items
+    output_text = _output_text(item)
+    nc_items: list[dict] = item.get("nc_items", [])
     prompts = [
         (
             SYSTEM,
             "Does the OUTPUT assert or endorse the following FALSE statement?\n"
             'Reply with ONLY {"asserted": "yes" or "no"}.\n\n'
-            f"FALSE STATEMENT: {item.description}\n"
+            f"FALSE STATEMENT: {nc.get('description', '')}\n"
             f"OUTPUT:\n{output_text}",
         )
-        for item in nc_items
+        for nc in nc_items
     ]
-    answers = _map_chat(chat_fn, prompts, workers)
+    answers = await _gather_chat(ctx.client.chat_json, prompts)
     asserted_ids = [
-        item.id for item, a in zip(nc_items, answers) if str(a.get("asserted", "")).lower() == "yes"
+        nc.get("id", "?")
+        for nc, a in zip(nc_items, answers, strict=False)
+        if str(a.get("asserted", "")).lower() == "yes"
     ]
     return {"asserted": len(asserted_ids), "total": len(nc_items), "asserted_ids": asserted_ids}
 
 
-def fabricated_entity(result: dict, chat_fn) -> dict:
+async def fabricated_entity(item: dict, ctx: EvalContext) -> dict:
     """Count systems/orgs/metrics named in the output but absent from the corpus.
 
-    Returns {count, entities}.  The corpus universe is the set of evidence
-    excerpts + locators.
+    Returns {count, entities}.
     """
-    output_text = _output_text(result)
-    corpus = "\n".join(
-        f"{ev.get('locator', '')} :: {ev.get('excerpt', '')}"
-        for ev in result.get("evidence_index", [])
-    )
+    output_text = _output_text(item)
+    corpus = "\n".join(f"{ev.get('locator', '')} :: {ev.get('excerpt', '')}" for ev in item.get("evidence_index", []))
     user = (
         "List any system, organization, or metric NAMED in the OUTPUT that does NOT "
         "appear anywhere in the CORPUS EVIDENCE.\n"
@@ -418,54 +365,54 @@ def fabricated_entity(result: dict, chat_fn) -> dict:
         f"OUTPUT:\n{output_text}\n\n"
         f"CORPUS EVIDENCE:\n{corpus}"
     )
-    entities = chat_fn(SYSTEM, user).get("fabricated", []) or []
+    answer = await ctx.client.chat_json(SYSTEM, user)
+    entities = answer.get("fabricated", []) or []
     return {"count": len(entities), "entities": list(entities)}
 
 
-def contradiction(result: dict, chat_fn) -> dict:
+async def contradiction(item: dict, ctx: EvalContext) -> dict:
     """Count internally contradictory finding pairs.
 
-    Returns {count, pairs}.  pairs is the list of contradicting finding-id pairs
-    the judge reports.
+    Returns {count, pairs}.
     """
     lines = []
-    for f in result.get("findings", []):
+    for f in item.get("findings", []):
         lines.append(f"{f.get('id', '?')}: {f.get('title', '')} — {f.get('description', '')}")
     user = (
         "Are any two of these FINDINGS mutually contradictory? List each contradicting pair.\n"
-        'Reply with ONLY {"pairs": [["<id_a>", "<id_b>"], ...]}.  Empty list if none.\n\n'
-        + "\n".join(lines)
+        'Reply with ONLY {"pairs": [["<id_a>", "<id_b>"], ...]}.  Empty list if none.\n\n' + "\n".join(lines)
     )
-    pairs = chat_fn(SYSTEM, user).get("pairs", []) or []
+    answer = await ctx.client.chat_json(SYSTEM, user)
+    pairs = answer.get("pairs", []) or []
     return {"count": len(pairs), "pairs": [list(p) for p in pairs]}
 
 
-def open_gap(result: dict, chat_fn) -> dict:
+async def open_gap(item: dict, ctx: EvalContext) -> dict:
     """G-Eval open probe: the most important process issue the output missed.
 
     Returns {gap} — a free-text advisory narrative (no score).
     """
-    pg = result.get("process_graph") or {}
+    pg = item.get("process_graph") or {}
     pg_summary = f"process_graph has {len(pg.get('processes', []))} processes"
     user = (
         "Given this corpus scope and output, what important process issue did the "
         "output FAIL to surface?\n"
         'Reply with ONLY {"gap": "<the most important missed issue, one short paragraph>"}.\n\n'
-        f"WORKSPACE SCOPE: {_workspace_intention(result)}\n"
+        f"WORKSPACE SCOPE: {_workspace_intention(item)}\n"
         f"{pg_summary}\n"
-        f"OUTPUT:\n{_output_text(result)}"
+        f"OUTPUT:\n{_output_text(item)}"
     )
-    return {"gap": str(chat_fn(SYSTEM, user).get("gap", ""))}
+    answer = await ctx.client.chat_json(SYSTEM, user)
+    return {"gap": str(answer.get("gap", ""))}
 
 
-def actionability(result: dict, chat_fn, *, workers: int = 1) -> dict:
+async def actionability(item: dict, ctx: EvalContext) -> dict:
     """Average 0-1 rating of whether proposed actions are specific+quantified+linked.
 
-    Returns {score, rated}.  Each action is rated against whether it is specific,
-    quantified, and linked to a finding.
+    Returns {score, rated}.
     """
-    actions = result.get("proposed_actions", []) or []
-    finding_ids = {f.get("id") for f in result.get("findings", [])}
+    actions = item.get("proposed_actions", []) or []
+    finding_ids = {f.get("id") for f in item.get("findings", [])}
     prompts = [
         (
             SYSTEM,
@@ -482,24 +429,24 @@ def actionability(result: dict, chat_fn, *, workers: int = 1) -> dict:
         )
         for a in actions
     ]
-    answers = _map_chat(chat_fn, prompts, workers)
+    answers = await _gather_chat(ctx.client.chat_json, prompts)
     scores: list[float] = []
     for a in answers:
         value = _coerce_float(a.get("score"))
-        if value is None:  # malformed vote -> skip this action, keep the metric
+        if value is None:
             continue
         scores.append(value)
     score = round(sum(scores) / len(scores), 4) if scores else None
     return {"score": score, "rated": len(scores)}
 
 
-def severity_calibration(result: dict, chat_fn, *, workers: int = 1) -> dict:
+async def severity_calibration(item: dict, ctx: EvalContext) -> dict:
     """Per-finding judgment of whether stated severity matches the evidence.
 
     Returns {miscalibrated, total, verdicts: {finding_id: under|over|calibrated}}.
     """
-    evidence_index = _evidence_index(result)
-    findings = result.get("findings", [])
+    ev_idx = _evidence_index(item)
+    findings = item.get("findings", [])
     prompts = [
         (
             SYSTEM,
@@ -507,14 +454,14 @@ def severity_calibration(result: dict, chat_fn, *, workers: int = 1) -> dict:
             'Reply with ONLY {"calibration": "under" or "over" or "calibrated"}.\n\n'
             f"STATED SEVERITY: {f.get('severity', '')}  SCORE: {f.get('score', '')}\n"
             f"FINDING: {f.get('description', '')}\n"
-            f"CITED EVIDENCE: {' || '.join(_cited_excerpts(f, evidence_index))}",
+            f"CITED EVIDENCE: {' || '.join(_cited_excerpts(f, ev_idx))}",
         )
         for f in findings
     ]
-    answers = _map_chat(chat_fn, prompts, workers)
+    answers = await _gather_chat(ctx.client.chat_json, prompts)
     verdicts: dict[str, str] = {}
     miscalibrated = 0
-    for f, a in zip(findings, answers):
+    for f, a in zip(findings, answers, strict=False):
         verdict = str(a.get("calibration", "calibrated")).lower()
         verdicts[f.get("id", "?")] = verdict
         if verdict in ("under", "over"):
@@ -522,7 +469,7 @@ def severity_calibration(result: dict, chat_fn, *, workers: int = 1) -> dict:
     return {"miscalibrated": miscalibrated, "total": len(findings), "verdicts": verdicts}
 
 
-def answer_relevancy(result: dict, chat_fn) -> dict:
+async def answer_relevancy(item: dict, ctx: EvalContext) -> dict:
     """RAGAS-style: does the output address the stated workspace intention?
 
     Returns {score} in [0,1], or {"score": None} when the vote fails to coerce.
@@ -530,38 +477,27 @@ def answer_relevancy(result: dict, chat_fn) -> dict:
     user = (
         "Does the OUTPUT address the stated WORKSPACE INTENTION (on-topic, responsive)?\n"
         'Reply with ONLY {"score": <number 0-1>}.\n\n'
-        f"WORKSPACE INTENTION: {_workspace_intention(result)}\n"
-        f"OUTPUT:\n{_output_text(result)}"
+        f"WORKSPACE INTENTION: {_workspace_intention(item)}\n"
+        f"OUTPUT:\n{_output_text(item)}"
     )
-    return {"score": _coerce_float(chat_fn(SYSTEM, user).get("score"))}
+    answer = await ctx.client.chat_json(SYSTEM, user)
+    return {"score": _coerce_float(answer.get("score"))}
 
 
-def surface_deduplication(result: dict, chat_fn, *, workers: int = 1) -> dict:
+async def surface_deduplication(item: dict, ctx: EvalContext) -> dict:
     """Fraction of near-duplicate process-graph node pairs that are genuinely distinct.
 
-    Scoping rules:
-    - Processes: all pairs compared (cross-process is valid at this level).
-    - Activities and decisions: ONLY within the same parent process.  The same
-      activity name appearing in two different processes is a legitimate repetition
-      (e.g. "Approve Request" in both Loan and Credit-Card flows), not a duplicate.
-
-    For each surface, the top-10 most name-similar pairs (token-Jaccard >= 0.30)
-    are selected.  For activities/decisions the parent process name is included in
-    the judge prompt so it can reason about intra-process context.  30 pairs total.
-
     Returns {distinct, redundant, total, distinct_rate, redundant_pairs}.
     """
-    pg = result.get("process_graph", {})
+    pg = item.get("process_graph", {})
     procs = pg.get("processes", [])
 
     def _toks(node: dict) -> frozenset[str]:
         return frozenset(node.get("name", "").lower().split())
 
-    PER_SURFACE_CAP = 10
-    # candidates: (surface, node_a, node_b, parent_process_name)
+    per_surface_cap = 10
     candidates: list[tuple[str, dict, dict, str]] = []
 
-    # Processes: compare all pairs
     if len(procs) >= 2:
         pairs: list[tuple[float, dict, dict]] = []
         for i in range(len(procs)):
@@ -574,10 +510,9 @@ def _toks(node: dict) -> frozenset[str]:
                 if jac >= 0.30:
                     pairs.append((jac, procs[i], procs[j]))
         pairs.sort(key=lambda x: x[0], reverse=True)
-        for _jac, a, b in pairs[:PER_SURFACE_CAP]:
+        for _jac, a, b in pairs[:per_surface_cap]:
             candidates.append(("process", a, b, ""))
 
-    # Activities and decisions: within the same parent process only
     for surface_key, attr in (("activity", "activities"), ("decision", "decisions")):
         all_pairs: list[tuple[float, dict, dict, str]] = []
         for proc in procs:
@@ -595,7 +530,7 @@ def _toks(node: dict) -> frozenset[str]:
                     if jac >= 0.30:
                         all_pairs.append((jac, nodes[i], nodes[j], proc_name))
         all_pairs.sort(key=lambda x: x[0], reverse=True)
-        for _jac, a, b, proc_name in all_pairs[:PER_SURFACE_CAP]:
+        for _jac, a, b, proc_name in all_pairs[:per_surface_cap]:
             candidates.append((surface_key, a, b, proc_name))
 
     if not candidates:
@@ -603,34 +538,38 @@ def _toks(node: dict) -> frozenset[str]:
 
     prompts = []
     for surface, a, b, parent_proc in candidates:
-        ctx = f"\nPARENT PROCESS: {parent_proc}\n" if parent_proc else ""
-        prompts.append((
-            SYSTEM,
-            f"Are these two {surface} nodes genuinely DISTINCT process concepts, or is one a "
-            f"duplicate / sub-case / restatement of the other?\n"
-            f"{ctx}"
-            'Reply with ONLY {"verdict": "DISTINCT" or "DUPLICATE", "reason": "<one line>"}.\n\n'
-            f"{surface.upper()} A: {a.get('name', '')} — {a.get('description', '')}\n"
-            f"{surface.upper()} B: {b.get('name', '')} — {b.get('description', '')}",
-        ))
+        ctx_line = f"\nPARENT PROCESS: {parent_proc}\n" if parent_proc else ""
+        prompts.append(
+            (
+                SYSTEM,
+                f"Are these two {surface} nodes genuinely DISTINCT process concepts, or is one a "
+                f"duplicate / sub-case / restatement of the other?\n"
+                f"{ctx_line}"
+                'Reply with ONLY {"verdict": "DISTINCT" or "DUPLICATE", "reason": "<one line>"}.\n\n'
+                f"{surface.upper()} A: {a.get('name', '')} — {a.get('description', '')}\n"
+                f"{surface.upper()} B: {b.get('name', '')} — {b.get('description', '')}",
+            )
+        )
 
-    answers = _map_chat(chat_fn, prompts, workers)
+    answers = await _gather_chat(ctx.client.chat_json, prompts)
 
     distinct = 0
     redundant = 0
     redundant_pairs: list[dict] = []
-    for (surface, a, b, _parent), answer in zip(candidates, answers):
+    for (surface, a, b, _parent), answer in zip(candidates, answers, strict=False):
         verdict = str(answer.get("verdict", "")).upper()
         if verdict == "DISTINCT":
             distinct += 1
         else:
             redundant += 1
-            redundant_pairs.append({
-                "surface": surface,
-                "a": a.get("name", ""),
-                "b": b.get("name", ""),
-                "reason": str(answer.get("reason", "")),
-            })
+            redundant_pairs.append(
+                {
+                    "surface": surface,
+                    "a": a.get("name", ""),
+                    "b": b.get("name", ""),
+                    "reason": str(answer.get("reason", "")),
+                }
+            )
 
     total = distinct + redundant
     return {
@@ -642,13 +581,15 @@ def _toks(node: dict) -> frozenset[str]:
     }
 
 
-def comparative_vs_champion(result: dict, champion_result: dict, chat_fn) -> dict:
+async def comparative_vs_champion(item: dict, ctx: EvalContext) -> dict | None:
     """Pairwise MT-Bench-style review of candidate vs champion (advisory only).
 
-    Returns {candidate, champion, more_consistent} where candidate/champion are
-    1-5 ratings on Coverage/Quality/Evidence/Actionability/Regression.  Never
-    feeds G5.
+    Returns None if item["champion"] is not present.
+    Returns {candidate, champion, more_consistent}.
     """
+    champion = item.get("champion")
+    if champion is None:
+        return None
     user = (
         "Score the CANDIDATE and the CHAMPION outputs on five axes (1-5 each): "
         "Coverage, Quality, Evidence, Actionability, Regression.  Then say which is "
@@ -657,10 +598,10 @@ def comparative_vs_champion(result: dict, champion_result: dict, chat_fn) -> dic
         '{"candidate": {"coverage": x, "quality": x, "evidence": x, "actionability": x, "regression": x}, '
         '"champion": {"coverage": x, "quality": x, "evidence": x, "actionability": x, "regression": x}, '
         '"more_consistent": "candidate" or "champion"}.\n\n'
-        f"CANDIDATE:\n{_output_text(result)}\n\n"
-        f"CHAMPION:\n{_output_text(champion_result)}"
+        f"CANDIDATE:\n{_output_text(item)}\n\n"
+        f"CHAMPION:\n{_output_text(champion)}"
     )
-    out = chat_fn(SYSTEM, user)
+    out = await ctx.client.chat_json(SYSTEM, user)
     return {
         "candidate": out.get("candidate", {}),
         "champion": out.get("champion", {}),
@@ -668,18 +609,175 @@ def comparative_vs_champion(result: dict, champion_result: dict, chat_fn) -> dic
     }
 
 
-# ── median-of-N for [J] metrics ──────────────────────────────────────────────────
+# ── flycanon custom metrics ───────────────────────────────────────────────────────
 
 
-def _numeric_leaves(d: dict) -> dict[tuple, float]:
-    """Flatten a metric dict to {path: float} over its FLOAT score-leaves only.
+async def _rag_score_once(item: dict, ctx: EvalContext) -> dict | None:
+    """Ask the judge once to score the item's answer; None if question or answer is empty.
+
+    The judge's JSON reply is returned unmodified for the caller to pick scores from.
+    """
+    q, ref, ans = (item.get(key, "") for key in ("question", "reference", "answer"))
+    if not (q and ans):
+        return None
+    prompt = f"QUESTION: {q}\nREFERENCE: {ref}\nANSWER: {ans}\n\n{RUBRIC}"
+    return await ctx.client.chat_json(SYSTEM_RAG, prompt)
+
+
+async def contains_answer(item: dict, ctx: EvalContext) -> float | None:
+    """Flycanon: does the answer contain the correct information from the reference?
 
-    Median applies to continuous scores only.  A leaf counts as numeric-for-median
-    only when its value is a ``float``; ``bool`` and ``int`` leaves (counts,
-    denominators, 1-5 axes, and other bookkeeping) are deliberately skipped and
-    taken from the first run unchanged — this avoids fractional counts (rated=0.5)
-    and count/len(list) disagreement under runs>1 with an even N.
+    Runs ctx.runs times and returns the median score.
+    Returns None if the item lacks question/answer.
+    """
+    scores: list[float] = []
+    for _ in range(max(1, ctx.runs)):
+        result = await _rag_score_once(item, ctx)
+        if result is None:
+            return None
+        val = _coerce_float(result.get("contains_answer"))
+        if val is not None:
+            scores.append(val)
+    if not scores:
+        return None
+    return round(statistics.median(scores), 4)
+
+
+async def addresses_question(item: dict, ctx: EvalContext) -> float | None:
+    """Flycanon: does the answer directly address what the question is asking?
+
+    Runs ctx.runs times and returns the median score.
+    Returns None if the item lacks question/answer.
     """
+    scores: list[float] = []
+    for _ in range(max(1, ctx.runs)):
+        result = await _rag_score_once(item, ctx)
+        if result is None:
+            return None
+        val = _coerce_float(result.get("addresses_question"))
+        if val is not None:
+            scores.append(val)
+    if not scores:
+        return None
+    return round(statistics.median(scores), 4)
+
+
+# ── RAGAS metrics ─────────────────────────────────────────────────────────────────
+# ragas/langchain imports are inline inside _sync() since ragas is optional.
+
+
+def _make_ragas_sample(item: dict):
+    """Build a RAGAS SingleTurnSample from the item; ragas is imported lazily as it is optional."""
+    from ragas import SingleTurnSample  # type: ignore[import]  # noqa: PLC0415
+
+    return SingleTurnSample(
+        user_input=item.get("question", ""),  # missing text keys fall back to ""
+        response=item.get("answer", ""),
+        reference=item.get("reference", ""),
+        retrieved_contexts=item.get("contexts", []),  # missing key -> empty context list
+    )
+
+
+def _make_ragas_llm(ctx: EvalContext):
+    """Build the LangChain chat model for ctx.client's provider/model (ValueError if unsupported)."""
+    provider, model = ctx.client.provider, ctx.client.model
+    if provider == "anthropic":
+        from langchain_anthropic import ChatAnthropic  # type: ignore[import]  # noqa: PLC0415
+
+        api_key = os.environ.get("ANTHROPIC_API_KEY", "")  # empty string when the variable is unset
+        return ChatAnthropic(model=model, api_key=api_key, temperature=0.0)
+    if provider in ("openai", "azure"):  # NOTE(review): langchain-openai is not in the evaluation extra
+        from langchain_openai import ChatOpenAI  # type: ignore[import]  # noqa: PLC0415
+
+        api_key = os.environ.get("OPENAI_API_KEY", "")  # empty string when the variable is unset
+        return ChatOpenAI(model=model, api_key=api_key, temperature=0.0)
+    if provider == "ollama":
+        from langchain_ollama import ChatOllama  # type: ignore[import]  # noqa: PLC0415
+
+        return ChatOllama(model=model, temperature=0.0)  # no API key is passed for Ollama
+    raise ValueError(f"RAGAS: unsupported provider {provider!r}")
+
+
+def _make_ragas_embeddings(ctx: EvalContext):
+    """Build LangChain embeddings for RAGAS: Ollama when ctx.embedder is set, else Anthropic."""
+    if ctx.embedder is not None:
+        from langchain_ollama import OllamaEmbeddings  # type: ignore[import]  # noqa: PLC0415
+
+        return OllamaEmbeddings(model=ctx.embedder._model)  # reads the embedder's private _model
+    from langchain_anthropic import AnthropicEmbeddings  # type: ignore[import]  # noqa: PLC0415
+
+    return AnthropicEmbeddings()  # NOTE(review): confirm langchain_anthropic exports this class
+
+
+async def _ragas_score(metric_name: str, item: dict, ctx: EvalContext) -> float | None:
+    """Run one named RAGAS metric in a worker thread; return its rounded score or None."""
+
+    def _sync():
+        from ragas import evaluate  # type: ignore[import]  # noqa: PLC0415
+        from ragas.dataset_schema import EvaluationDataset  # type: ignore[import]  # noqa: PLC0415
+        from ragas.metrics import (  # type: ignore[import]  # noqa: PLC0415
+            AnswerCorrectness,
+            AnswerRelevancy,
+            ContextPrecision,
+            ContextRecall,
+            Faithfulness,
+        )
+
+        _metrics_map = {
+            "answer_correctness": AnswerCorrectness,
+            "answer_relevancy_ragas": AnswerRelevancy,
+            "ragas_faithfulness": Faithfulness,
+            "context_recall": ContextRecall,
+            "context_precision": ContextPrecision,
+        }
+        metric_cls = _metrics_map.get(metric_name)
+        if metric_cls is None:
+            return None
+        # Hand llm/embeddings to evaluate(): it adapts LangChain objects and fills metrics that need them.
+        llm = _make_ragas_llm(ctx)
+        embeddings = _make_ragas_embeddings(ctx)
+        dataset = EvaluationDataset(samples=[_make_ragas_sample(item)])
+        result = evaluate(dataset=dataset, metrics=[metric_cls()], llm=llm, embeddings=embeddings)
+        df = result.to_pandas()
+        # Result columns carry RAGAS' own metric names, so strip our "ragas" prefix as well as suffix.
+        ragas_name = metric_name.replace("_ragas", "").removeprefix("ragas_")
+        col = df.columns[df.columns.str.contains(ragas_name, case=False)]
+        if col.empty:
+            return None
+        val = df[col[0]].iloc[0]
+        if val is None or (isinstance(val, float) and math.isnan(val)):
+            return None
+        return round(float(val), 4)
+
+    loop = asyncio.get_running_loop()
+    return await loop.run_in_executor(None, _sync)
+
+
+async def answer_correctness(item: dict, ctx: EvalContext) -> float | None:
+    """Return the RAGAS answer-correctness score (semantic F1 against reference), or None."""
+    return await _ragas_score("answer_correctness", item, ctx)
+
+
+async def ragas_faithfulness(item: dict, ctx: EvalContext) -> float | None:
+    """Return the RAGAS faithfulness score (answer grounded in retrieved contexts), or None."""
+    return await _ragas_score("ragas_faithfulness", item, ctx)
+
+
+async def context_recall(item: dict, ctx: EvalContext) -> float | None:
+    """Return the RAGAS context-recall score (reference coverage by retrieved contexts), or None."""
+    return await _ragas_score("context_recall", item, ctx)
+
+
+async def context_precision(item: dict, ctx: EvalContext) -> float | None:
+    """Return the RAGAS context-precision score (contexts relevant to the question), or None."""
+    return await _ragas_score("context_precision", item, ctx)
+
+
+# ── median-of-N helpers ──────────────────────────────────────────────────────────
+
+
+def _numeric_leaves(d: dict) -> dict[tuple, float]:
+    """Flatten a metric dict to {path: float} over its FLOAT score-leaves only."""
     out: dict[tuple, float] = {}
 
     def walk(node, path: tuple) -> None:
@@ -701,11 +799,7 @@ def _set_leaf(d: dict, path: tuple, value: float) -> None:
 
 
 def _median_runs(samples: list[dict]) -> dict:
-    """Median across N metric-dicts: FLOAT score-leaves -> per-key median; rest = first.
-
-    Only continuous float scores are medianed; integer bookkeeping (counts,
-    denominators, 1-5 axes) and all non-numeric fields are taken from the first run.
-    """
+    """Median across N metric-dicts: FLOAT score-leaves -> per-key median; rest = first."""
     samples = [s for s in samples if isinstance(s, dict)]
     if not samples:
         return {}
@@ -728,102 +822,69 @@ def _median_runs(samples: list[dict]) -> dict:
 # ── orchestrator ─────────────────────────────────────────────────────────────────
 
 
-def run_judge(
-    result: dict,
-    registry,
+async def run_judge(
+    item: dict,
+    ctx: EvalContext,
     *,
-    judge_model: str,
-    runs: int = 1,
-    concurrency: int = 1,
     pipeline_model: str = "",
-    champion_result: dict | None = None,
-    chat_fn=None,
-    embed_fn=None,
-    tau: float = 0.70,
-    lexical_missed_ids: list[str] | None = None,
 ) -> AdvisoryReport:
-    """Run the G4 advisory gate, best-effort.  NEVER raises; NEVER affects verdict.
-
-    If chat_fn / embed_fn are None, real ones are built from JudgeClient /
-    OllamaEmbedder (tests inject stubs instead).  Each [J] metric runs `runs`
-    times and the median of its numeric scores is kept.  Every metric is wrapped
-    in try/except: a failure appends to report.errors and the run continues.
-
-    ``concurrency`` (opt-in, default 1) bounds the per-item [J] metrics' internal
-    fan-out: 1 keeps the sequential per-item loops; >=2 runs each metric's items
-    across a thread pool (order preserved).  The median-of-N ``runs`` loop stays
-    sequential and the single-call metrics are unaffected.  The result is
-    byte-for-byte identical at concurrency=1.
+    """Run all metrics concurrently and return an AdvisoryReport.
 
-    Returns an AdvisoryReport (a plain dict carrier) with calibrated=False and
-    same_provider_caveat = same_provider(pipeline_model, judge_model).
+    Best-effort: never raises. Failing metrics append to report.errors.
     """
-    if chat_fn is None:
-        client = JudgeClient(judge_model)
-        chat_fn = client.chat_json
-    if embed_fn is None:
-        embed_fn = OllamaEmbedder().embed
-
     report = AdvisoryReport(
-        judge_model=judge_model,
-        same_provider_caveat=same_provider(pipeline_model, judge_model),
+        judge_model=ctx.client.model_spec,
+        same_provider_caveat=same_provider(pipeline_model, ctx.client.model_spec),
         calibrated=False,
-        runs=runs,
+        runs=ctx.runs,
     )
 
-    def _run_det(name: str, fn) -> None:
-        try:
-            report.metrics[name] = fn()
-        except Exception as exc:  # best-effort: never raise
-            report.errors.append(f"{name}: {type(exc).__name__}: {exc}")
+    # [D] deterministic (no LLM)
+    det_metrics: list[tuple[str, Metric]] = [
+        ("source_coverage", source_coverage),
+        ("excerpt_fill_rate", excerpt_fill_rate),
+    ]
+    # [E] embedding
+    emb_metrics: list[tuple[str, Metric]] = [
+        ("semantic_recovery", semantic_recovery),
+    ]
+    # [J] judge metrics (median-of-runs handled externally for single-call ones)
+    judge_metrics: list[tuple[str, Metric]] = [
+        ("faithfulness", faithfulness),
+        ("numeric_temporal_fidelity", numeric_temporal_fidelity),
+        ("citation_relevance", citation_relevance),
+        ("nc_semantic_precision", nc_semantic_precision),
+        ("fabricated_entity", fabricated_entity),
+        ("contradiction", contradiction),
+        ("open_gap", open_gap),
+        ("actionability", actionability),
+        ("severity_calibration", severity_calibration),
+        ("answer_relevancy", answer_relevancy),
+        ("surface_deduplication", surface_deduplication),
+        ("comparative_vs_champion", comparative_vs_champion),
+    ]
+    # flycanon custom
+    flycanon_metrics: list[tuple[str, Metric]] = [
+        ("contains_answer", contains_answer),
+        ("addresses_question", addresses_question),
+    ]
+    # RAGAS
+    ragas_metrics: list[tuple[str, Metric]] = [
+        ("answer_correctness", answer_correctness),
+        ("ragas_faithfulness", ragas_faithfulness),
+        ("context_recall", context_recall),
+        ("context_precision", context_precision),
+    ]
 
-    def _run_judge_metric(name: str, fn) -> None:
+    all_metrics = det_metrics + emb_metrics + judge_metrics + flycanon_metrics + ragas_metrics
+
+    async def _run_one(name: str, fn: Metric) -> None:
         try:
-            samples = [fn() for _ in range(max(1, runs))]
-            report.metrics[name] = _median_runs(samples)
-        except Exception as exc:  # best-effort: never raise
+            result = await fn(item, ctx)
+            if result is not None:
+                report.metrics[name] = result
+        except Exception as exc:
             report.errors.append(f"{name}: {type(exc).__name__}: {exc}")
 
-    # [D] deterministic — always computed, no LLM.
-    _run_det("source_coverage", lambda: source_coverage(result))
-    _run_det("excerpt_fill_rate", lambda: excerpt_fill_rate(result))
-
-    # [E] embedding — context recall.
-    _run_det(
-        "semantic_recovery",
-        lambda: semantic_recovery(result, registry, lexical_missed_ids or [], embed_fn, tau=tau),
-    )
-
-    # [J] judge — median-of-N.  Per-item metrics fan out at workers=concurrency.
-    _run_judge_metric("faithfulness", lambda: faithfulness(result, chat_fn, workers=concurrency))
-    _run_judge_metric(
-        "numeric_temporal_fidelity",
-        lambda: numeric_temporal_fidelity(result, chat_fn, workers=concurrency),
-    )
-    _run_judge_metric(
-        "citation_relevance", lambda: citation_relevance(result, chat_fn, workers=concurrency)
-    )
-    _run_judge_metric(
-        "nc_semantic_precision",
-        lambda: nc_semantic_precision(result, registry, chat_fn, workers=concurrency),
-    )
-    _run_judge_metric("fabricated_entity", lambda: fabricated_entity(result, chat_fn))
-    _run_judge_metric("contradiction", lambda: contradiction(result, chat_fn))
-    _run_judge_metric("open_gap", lambda: open_gap(result, chat_fn))
-    _run_judge_metric("actionability", lambda: actionability(result, chat_fn, workers=concurrency))
-    _run_judge_metric(
-        "severity_calibration",
-        lambda: severity_calibration(result, chat_fn, workers=concurrency),
-    )
-    _run_judge_metric("answer_relevancy", lambda: answer_relevancy(result, chat_fn))
-    _run_judge_metric(
-        "surface_deduplication",
-        lambda: surface_deduplication(result, chat_fn, workers=concurrency),
-    )
-    if champion_result is not None:
-        _run_judge_metric(
-            "comparative_vs_champion",
-            lambda: comparative_vs_champion(result, champion_result, chat_fn),
-        )
-
+    await asyncio.gather(*[_run_one(name, fn) for name, fn in all_metrics])
     return report

From 7799185bf69777b8f680fe04667129e42fddddf1 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 13:24:58 +0200
Subject: [PATCH 31/48] feat(evaluation): slim __init__.py to 3-file exports

---
 .../evaluation/__init__.py                    | 179 +++++++++---------
 1 file changed, 90 insertions(+), 89 deletions(-)

diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py
index ad01980c..c2005e7a 100644
--- a/fireflyframework_agentic/evaluation/__init__.py
+++ b/fireflyframework_agentic/evaluation/__init__.py
@@ -1,89 +1,90 @@
-# Copyright 2026 Firefly Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Evaluation subpackage -- gate-based quality gates, LLM-as-judge advisory, champion/challenger tracking, and retrieval metrics.
-
-Gate pipeline (flags, not vetoes):
-    G1 -- Structural & Safe (schema + PII + empty-registry guard)
-    G2 -- Must-finds & negative controls (recall + NC precision)
-    G3 -- Evidence (grounding / token-anchoring)
-    G4 -- LLM-as-a-Judge (advisory, opt-in, never decides promotion)
-    G5 -- No-regression / promotion (champion/challenger comparison)
-
-Retrieval metrics:
-    Precision@k, Recall@k, MRR, NDCG -- computed over ranked retrieval results.
-
-Champion tracking:
-    Persists the best-known run record so that promotion decisions can be made
-    against a stable baseline rather than the most recent run.
-"""
-
-from importlib.metadata import PackageNotFoundError, version
-
-from fireflyframework_agentic.evaluation.corpus import EMPTY, FABRICATED, SOURCE_UNKNOWN, VERIFIED, corpus_sha256, load_corpus, verify_evidence_index
-from fireflyframework_agentic.evaluation.gates import GateResult, Verdict, g2_recall_precision, run_gates
-from fireflyframework_agentic.evaluation.scorecard import render_scorecard, verdict, VERDICT_PROMOTE, VERDICT_HOLD
-from fireflyframework_agentic.evaluation.champion import ChampionRecord, invalidate_champion, load_champion, save_champion
-from fireflyframework_agentic.evaluation.judge import AdvisoryReport, run_judge
-from fireflyframework_agentic.evaluation.judge_client import JudgeClient, OllamaEmbedder, build_embedder, cosine
-from fireflyframework_agentic.evaluation.matcher import anchored, matches, source_stem, tokens
-from fireflyframework_agentic.evaluation.registry import Registry, RegistryItem, load_registry, registry_sha256
-from fireflyframework_agentic.lab.retrieval_metrics import RetrieverMetrics, compute_retrieval_metrics
-from fireflyframework_agentic.evaluation.stats import aa_band, aggregate_grounding, left_skew_flag
-
-try:
-    __version__ = version("fireflyframework-agentic")
-except PackageNotFoundError:
-    __version__ = "0.0.0+dev"
-
-__all__ = [
-    "EMPTY",
-    "FABRICATED",
-    "SOURCE_UNKNOWN",
-    "VERIFIED",
-    "corpus_sha256",
-    "load_corpus",
-    "verify_evidence_index",
-    "GateResult",
-    "Verdict",
-    "g2_recall_precision",
-    "run_gates",
-    "render_scorecard",
-    "verdict",
-    "VERDICT_PROMOTE",
-    "VERDICT_HOLD",
-    "ChampionRecord",
-    "load_champion",
-    "save_champion",
-    "invalidate_champion",
-    "AdvisoryReport",
-    "run_judge",
-    "JudgeClient",
-    "OllamaEmbedder",
-    "build_embedder",
-    "cosine",
-    "Registry",
-    "RegistryItem",
-    "load_registry",
-    "registry_sha256",
-    "RetrieverMetrics",
-    "compute_retrieval_metrics",
-    "anchored",
-    "matches",
-    "source_stem",
-    "tokens",
-    "aa_band",
-    "aggregate_grounding",
-    "left_skew_flag",
-]
+from fireflyframework_agentic.evaluation.judge import (
+    AdvisoryReport as AdvisoryReport,
+)
+from fireflyframework_agentic.evaluation.judge import (
+    EvalContext as EvalContext,
+)
+from fireflyframework_agentic.evaluation.judge import (
+    Metric as Metric,
+)
+from fireflyframework_agentic.evaluation.judge import (
+    actionability as actionability,
+)
+from fireflyframework_agentic.evaluation.judge import (
+    addresses_question as addresses_question,
+)
+from fireflyframework_agentic.evaluation.judge import (
+    answer_correctness as answer_correctness,
+)
+from fireflyframework_agentic.evaluation.judge import (
+    answer_relevancy as answer_relevancy,
+)
+from fireflyframework_agentic.evaluation.judge import (
+    citation_relevance as citation_relevance,
+)
+from fireflyframework_agentic.evaluation.judge import (
+    comparative_vs_champion as comparative_vs_champion,
+)
+from fireflyframework_agentic.evaluation.judge import (
+    contains_answer as contains_answer,
+)
+from fireflyframework_agentic.evaluation.judge import (
+    context_precision as context_precision,
+)
+from fireflyframework_agentic.evaluation.judge import (
+    context_recall as context_recall,
+)
+from fireflyframework_agentic.evaluation.judge import (
+    contradiction as contradiction,
+)
+from fireflyframework_agentic.evaluation.judge import (
+    excerpt_fill_rate as excerpt_fill_rate,
+)
+from fireflyframework_agentic.evaluation.judge import (
+    fabricated_entity as fabricated_entity,
+)
+from fireflyframework_agentic.evaluation.judge import (
+    faithfulness as faithfulness,
+)
+from fireflyframework_agentic.evaluation.judge import (
+    nc_semantic_precision as nc_semantic_precision,
+)
+from fireflyframework_agentic.evaluation.judge import (
+    numeric_temporal_fidelity as numeric_temporal_fidelity,
+)
+from fireflyframework_agentic.evaluation.judge import (
+    open_gap as open_gap,
+)
+from fireflyframework_agentic.evaluation.judge import (
+    ragas_faithfulness as ragas_faithfulness,
+)
+from fireflyframework_agentic.evaluation.judge import (
+    run_judge as run_judge,
+)
+from fireflyframework_agentic.evaluation.judge import (
+    semantic_recovery as semantic_recovery,
+)
+from fireflyframework_agentic.evaluation.judge import (
+    severity_calibration as severity_calibration,
+)
+from fireflyframework_agentic.evaluation.judge import (
+    source_coverage as source_coverage,
+)
+from fireflyframework_agentic.evaluation.judge import (
+    surface_deduplication as surface_deduplication,
+)
+from fireflyframework_agentic.evaluation.judge_client import (
+    JudgeClient as JudgeClient,
+)
+from fireflyframework_agentic.evaluation.judge_client import (
+    parse_model as parse_model,
+)
+from fireflyframework_agentic.evaluation.judge_client import (
+    same_provider as same_provider,
+)
+from fireflyframework_agentic.lab.retrieval_metrics import (
+    RetrieverMetrics as RetrieverMetrics,
+)
+from fireflyframework_agentic.lab.retrieval_metrics import (
+    compute_retrieval_metrics as compute_retrieval_metrics,
+)

From 9526f43315f56324aba3173b75ceebd87d9c3d71 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 13:25:15 +0200
Subject: [PATCH 32/48] =?UTF-8?q?chore(evaluation):=20update=20pyproject.t?=
 =?UTF-8?q?oml=20=E2=80=94=20drop=20scipy,=20add=20ragas=20deps,=20remove?=
 =?UTF-8?q?=20flyeval=20entrypoint?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pyproject.toml | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index bb74201f..72a04fad 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -120,8 +120,10 @@ all = [
     "fireflyframework-agentic[postgres,mongodb,security,embeddings,openai-embeddings,cohere-embeddings,google-embeddings,mistral-embeddings,voyage-embeddings,azure-embeddings,bedrock-embeddings,ollama-embeddings,vectorstores-chroma,vectorstores-pinecone,vectorstores-qdrant,vectorstores-pgvector,vectorstores-sqlite-vec,watch,binary]",
 ]
 evaluation = [
-    "scipy>=1.11",
     "numpy>=1.26.0",
+    "ragas>=0.2",
+    "langchain-anthropic>=0.3",
+    "langchain-ollama>=0.3",
 ]
 dev = [
     "pytest>=8.3.0",
@@ -136,9 +138,6 @@ dev = [
     "pre-commit>=3.8.0",
 ]
 
-[project.scripts]
-flyeval = "fireflyframework_agentic.evaluation.cli:main"
-
 [project.urls]
 Homepage = "https://fireflyframework.org/"
 Documentation = "https://github.com/fireflyframework/fireflyframework-agentic/tree/main/docs"

From d56755228af64f4f5d0d24e5edbf5426853e6929 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 13:26:17 +0200
Subject: [PATCH 33/48] test(evaluation): add unit tests for judge.py metrics

---
 tests/unit/evaluation/test_judge.py | 248 ++++++++++++++++++++++++++++
 1 file changed, 248 insertions(+)
 create mode 100644 tests/unit/evaluation/test_judge.py

diff --git a/tests/unit/evaluation/test_judge.py b/tests/unit/evaluation/test_judge.py
new file mode 100644
index 00000000..7f27c125
--- /dev/null
+++ b/tests/unit/evaluation/test_judge.py
@@ -0,0 +1,248 @@
+from unittest.mock import MagicMock
+
+import pytest
+
+from fireflyframework_agentic.evaluation.judge import (
+    EvalContext,
+    addresses_question,
+    contains_answer,
+    excerpt_fill_rate,
+    faithfulness,
+    source_coverage,
+)
+from fireflyframework_agentic.evaluation.judge_client import JudgeClient
+
+
+def make_ctx(responses: list[dict]) -> EvalContext:
+    client = MagicMock(spec=JudgeClient)
+    client.model_spec = "anthropic:claude-sonnet-4-6"
+    client.provider = "anthropic"
+    client.model = "claude-sonnet-4-6"
+    call_iter = iter(responses)
+
+    async def mock_chat_json(system, user, max_tokens=1024):
+        return next(call_iter)
+
+    client.chat_json = mock_chat_json
+    return EvalContext(client=client, runs=1)
+
+
+# ── contains_answer ──────────────────────────────────────────────────────────────
+
+
+@pytest.mark.asyncio
+async def test_contains_answer_present():
+    ctx = make_ctx([{"contains_answer": 1.0, "addresses_question": 1.0}])
+    item = {"question": "Q", "reference": "R", "answer": "A"}
+    score = await contains_answer(item, ctx)
+    assert score == 1.0
+
+
+@pytest.mark.asyncio
+async def test_contains_answer_absent():
+    ctx = make_ctx([{"contains_answer": 0.0, "addresses_question": 0.5}])
+    item = {"question": "Q", "reference": "R", "answer": "wrong"}
+    score = await contains_answer(item, ctx)
+    assert score == 0.0
+
+
+@pytest.mark.asyncio
+async def test_contains_answer_partial():
+    ctx = make_ctx([{"contains_answer": 0.5, "addresses_question": 0.8}])
+    item = {"question": "Q", "reference": "R", "answer": "partial"}
+    score = await contains_answer(item, ctx)
+    assert score == 0.5
+
+
+@pytest.mark.asyncio
+async def test_contains_answer_missing_question_returns_none():
+    ctx = make_ctx([])
+    item = {"reference": "R", "answer": "A"}
+    score = await contains_answer(item, ctx)
+    assert score is None
+
+
+# ── addresses_question ───────────────────────────────────────────────────────────
+
+
+@pytest.mark.asyncio
+async def test_addresses_question_yes():
+    ctx = make_ctx([{"contains_answer": 0.5, "addresses_question": 1.0}])
+    item = {"question": "Q", "reference": "R", "answer": "A"}
+    score = await addresses_question(item, ctx)
+    assert score == 1.0
+
+
+@pytest.mark.asyncio
+async def test_addresses_question_no():
+    ctx = make_ctx([{"contains_answer": 0.0, "addresses_question": 0.0}])
+    item = {"question": "Q", "reference": "R", "answer": "irrelevant"}
+    score = await addresses_question(item, ctx)
+    assert score == 0.0
+
+
+@pytest.mark.asyncio
+async def test_addresses_question_missing_answer_returns_none():
+    ctx = make_ctx([])
+    item = {"question": "Q", "reference": "R"}
+    score = await addresses_question(item, ctx)
+    assert score is None
+
+
+# ── faithfulness ─────────────────────────────────────────────────────────────────
+
+
+@pytest.mark.asyncio
+async def test_faithfulness_all_supported():
+    # One finding with cited evidence, judge says SUPPORTED.
+    ctx = make_ctx([{"verdict": "SUPPORTED", "reason": "matches"}])
+    item = {
+        "findings": [
+            {
+                "id": "F1",
+                "description": "The process takes 3 days.",
+                "evidence_refs": [{"evidence_id": "E1"}],
+            }
+        ],
+        "evidence_index": [{"id": "E1", "locator": "doc.pdf#1", "excerpt": "The process takes 3 days as documented."}],
+    }
+    result = await faithfulness(item, ctx)
+    assert result["supported"] == 1
+    assert result["total"] == 1
+    assert result["unsupported_ids"] == []
+
+
+@pytest.mark.asyncio
+async def test_faithfulness_not_supported():
+    ctx = make_ctx([{"verdict": "NOT_SUPPORTED", "reason": "contradicts"}])
+    item = {
+        "findings": [
+            {
+                "id": "F1",
+                "description": "The process takes 45 days.",
+                "evidence_refs": [{"evidence_id": "E1"}],
+            }
+        ],
+        "evidence_index": [{"id": "E1", "locator": "doc.pdf#1", "excerpt": "The process takes 3 days."}],
+    }
+    result = await faithfulness(item, ctx)
+    assert result["supported"] == 0
+    assert result["total"] == 1
+    assert "F1" in result["unsupported_ids"]
+
+
+@pytest.mark.asyncio
+async def test_faithfulness_no_cited_evidence():
+    # Finding with no evidence_refs -> counted as unsupported without LLM call.
+    ctx = make_ctx([])
+    item = {
+        "findings": [{"id": "F1", "description": "Something.", "evidence_refs": []}],
+        "evidence_index": [],
+    }
+    result = await faithfulness(item, ctx)
+    assert result["supported"] == 0
+    assert result["total"] == 1
+    assert "F1" in result["unsupported_ids"]
+
+
+# ── source_coverage ───────────────────────────────────────────────────────────────
+
+
+@pytest.mark.asyncio
+async def test_source_coverage_all_cited():
+    ctx = make_ctx([])
+    item = {
+        "findings": [
+            {
+                "id": "F1",
+                "description": "X",
+                "evidence_refs": [{"evidence_id": "E1"}],
+            }
+        ],
+        "evidence_index": [{"id": "E1", "locator": "doc.pdf#section1", "excerpt": "text"}],
+    }
+    result = await source_coverage(item, ctx)
+    assert result["cited"] == 1
+    assert result["total"] == 1
+    assert result["orphaned"] == []
+
+
+@pytest.mark.asyncio
+async def test_source_coverage_orphaned():
+    ctx = make_ctx([])
+    item = {
+        "findings": [{"id": "F1", "description": "X", "evidence_refs": []}],
+        "evidence_index": [
+            {"id": "E1", "locator": "doc1.pdf#p1", "excerpt": "text"},
+            {"id": "E2", "locator": "doc2.pdf#p2", "excerpt": "text2"},
+        ],
+    }
+    result = await source_coverage(item, ctx)
+    assert result["cited"] == 0
+    assert result["total"] == 2
+    assert len(result["orphaned"]) == 2
+
+
+@pytest.mark.asyncio
+async def test_source_coverage_stem_dedup():
+    # Two evidence items from the same file (different fragments) -> 1 source stem.
+    ctx = make_ctx([])
+    item = {
+        "findings": [
+            {
+                "id": "F1",
+                "description": "X",
+                "evidence_refs": [{"evidence_id": "E1"}],
+            }
+        ],
+        "evidence_index": [
+            {"id": "E1", "locator": "doc.pdf#section1", "excerpt": "text1"},
+            {"id": "E2", "locator": "doc.pdf#section2", "excerpt": "text2"},
+        ],
+    }
+    result = await source_coverage(item, ctx)
+    # Both E1 and E2 share "doc.pdf" stem -> 1 total stem.
+    assert result["total"] == 1
+    # E1 is cited -> that stem is covered.
+    assert result["cited"] == 1
+
+
+# ── excerpt_fill_rate ──────────────────────────────────────────────────────────────
+
+
+@pytest.mark.asyncio
+async def test_excerpt_fill_rate_full():
+    ctx = make_ctx([])
+    item = {
+        "evidence_index": [
+            {"id": "E1", "excerpt": "has content"},
+            {"id": "E2", "excerpt": "also has content"},
+        ]
+    }
+    result = await excerpt_fill_rate(item, ctx)
+    assert result["populated"] == 2
+    assert result["total"] == 2
+
+
+@pytest.mark.asyncio
+async def test_excerpt_fill_rate_partial():
+    ctx = make_ctx([])
+    item = {
+        "evidence_index": [
+            {"id": "E1", "excerpt": "has content"},
+            {"id": "E2", "excerpt": ""},
+            {"id": "E3", "excerpt": "   "},
+        ]
+    }
+    result = await excerpt_fill_rate(item, ctx)
+    assert result["populated"] == 1
+    assert result["total"] == 3
+
+
+@pytest.mark.asyncio
+async def test_excerpt_fill_rate_empty():
+    ctx = make_ctx([])
+    item = {"evidence_index": []}
+    result = await excerpt_fill_rate(item, ctx)
+    assert result["populated"] == 0
+    assert result["total"] == 0

From 564697405a176b36b418283bee9bc1bf18a4c918 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 13:52:05 +0200
Subject: [PATCH 34/48] fix(lab): type-annotate out dict, remove quoted return
 type in retrieval_metrics

---
 fireflyframework_agentic/lab/retrieval_metrics.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/fireflyframework_agentic/lab/retrieval_metrics.py b/fireflyframework_agentic/lab/retrieval_metrics.py
index 5f3e2373..ee129eec 100644
--- a/fireflyframework_agentic/lab/retrieval_metrics.py
+++ b/fireflyframework_agentic/lab/retrieval_metrics.py
@@ -66,11 +66,7 @@ def _dedup(retrieved: list[dict]) -> list[dict]:
 
 def _ndcg(retrieved: list[dict], n_gold: int, k: int = 10) -> float:
     """Return nDCG@k for a single query."""
-    dcg = sum(
-        1.0 / math.log2(r["rank"] + 1)
-        for r in retrieved
-        if r.get("is_gold") and r["rank"] <= k
-    )
+    dcg = sum(1.0 / math.log2(r["rank"] + 1) for r in retrieved if r.get("is_gold") and r["rank"] <= k)
     ideal = sum(1.0 / math.log2(i + 2) for i in range(min(n_gold, k)))
     return dcg / ideal if ideal else 0.0
 
@@ -140,7 +136,7 @@ def compute_retrieval_metrics(results: list[dict]) -> dict:
         if row.get("answer_ms") is not None:
             answer_ms.append(row["answer_ms"])
 
-    out = {k: round(v / n, 4) for k, v in agg.items()} if n else {}
+    out: dict[str, object] = {k: round(v / n, 4) for k, v in agg.items()} if n else {}
     out["n_queries"] = n
     out["no_answer_rate"] = round(no_answer / n, 4) if n else None
     out["citation_precision"] = round(cite_num / cite_den, 4) if cite_den else None
@@ -176,7 +172,7 @@ class RetrieverMetrics(BaseModel):
     mean_answer_ms: float | None = None
 
     @classmethod
-    def from_results(cls, results: list[dict]) -> "RetrieverMetrics":
+    def from_results(cls, results: list[dict]) -> RetrieverMetrics:
         """Compute metrics from raw retrieval result rows and return a model instance."""
         m = compute_retrieval_metrics(results)
         return cls(

From 582d1c044609fc0544bb74ab93bf65051dbc59e5 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 13:52:11 +0200
Subject: [PATCH 35/48] fix(lab): remove unused import math, fix import sort in
 test_retrieval_metrics

---
 tests/unit/lab/test_retrieval_metrics.py | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/tests/unit/lab/test_retrieval_metrics.py b/tests/unit/lab/test_retrieval_metrics.py
index a018a08b..1053c550 100644
--- a/tests/unit/lab/test_retrieval_metrics.py
+++ b/tests/unit/lab/test_retrieval_metrics.py
@@ -16,16 +16,11 @@
 
 from __future__ import annotations
 
-import math
-
-import pytest
-
 from fireflyframework_agentic.lab.retrieval_metrics import (
     RetrieverMetrics,
     compute_retrieval_metrics,
 )
 
-
 # ── helpers ───────────────────────────────────────────────────────────────────
 
 
@@ -37,11 +32,13 @@ def _row(gold_rank: int | None = None, total: int = 5, n_gold: int = 1) -> dict:
     """
     retrieved = []
     for rank in range(1, total + 1):
-        retrieved.append({
-            "rank": rank,
-            "source_id": f"doc-{rank}",
-            "is_gold": rank == gold_rank,
-        })
+        retrieved.append(
+            {
+                "rank": rank,
+                "source_id": f"doc-{rank}",
+                "is_gold": rank == gold_rank,
+            }
+        )
     gold_ids = [f"doc-{gold_rank}"] if gold_rank is not None else []
     return {
         "retrieved": retrieved,

From 3e62b1f92697903909544350ab26d2eb69800f36 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 13:52:11 +0200
Subject: [PATCH 36/48] fix(evaluation): add type: ignore for pyright errors on
 RAGAS/langchain calls in judge.py

---
 fireflyframework_agentic/evaluation/judge.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fireflyframework_agentic/evaluation/judge.py b/fireflyframework_agentic/evaluation/judge.py
index 9f24dc26..d5bcad66 100644
--- a/fireflyframework_agentic/evaluation/judge.py
+++ b/fireflyframework_agentic/evaluation/judge.py
@@ -685,12 +685,12 @@ def _make_ragas_llm(ctx: EvalContext):
         from langchain_anthropic import ChatAnthropic  # type: ignore[import]  # noqa: PLC0415
 
         api_key = os.environ.get("ANTHROPIC_API_KEY", "")
-        return ChatAnthropic(model=model, api_key=api_key, temperature=0.0)
+        return ChatAnthropic(model=model, api_key=api_key, temperature=0.0)  # type: ignore[call-arg,arg-type]
     if provider in ("openai", "azure"):
         from langchain_openai import ChatOpenAI  # type: ignore[import]  # noqa: PLC0415
 
         api_key = os.environ.get("OPENAI_API_KEY", "")
-        return ChatOpenAI(model=model, api_key=api_key, temperature=0.0)
+        return ChatOpenAI(model=model, api_key=api_key, temperature=0.0)  # type: ignore[call-arg,arg-type]
     if provider == "ollama":
         from langchain_ollama import ChatOllama  # type: ignore[import]  # noqa: PLC0415
 
@@ -740,7 +740,7 @@ def _sync():
         sample = _make_ragas_sample(item)
         dataset = EvaluationDataset(samples=[sample])
         result = evaluate(dataset=dataset, metrics=[metric])
-        df = result.to_pandas()
+        df = result.to_pandas()  # type: ignore[attr-defined]
         col = df.columns[df.columns.str.contains(metric_name.replace("_ragas", ""), case=False)]
         if col.empty:
             return None

From 3679dbca4b2fea88cf9339b9c7aac279b0891def Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 14:58:02 +0200
Subject: [PATCH 37/48] refactor(evaluation): move retrieval_metrics.py from
 lab/ to evaluation/

---
 fireflyframework_agentic/{lab => evaluation}/retrieval_metrics.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename fireflyframework_agentic/{lab => evaluation}/retrieval_metrics.py (100%)

diff --git a/fireflyframework_agentic/lab/retrieval_metrics.py b/fireflyframework_agentic/evaluation/retrieval_metrics.py
similarity index 100%
rename from fireflyframework_agentic/lab/retrieval_metrics.py
rename to fireflyframework_agentic/evaluation/retrieval_metrics.py

From 6bce3748a7988907c3e39235cf96fda0b07b38ff Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 14:58:19 +0200
Subject: [PATCH 38/48] =?UTF-8?q?refactor(evaluation):=20update=20imports?=
 =?UTF-8?q?=20=E2=80=94=20retrieval=5Fmetrics=20now=20in=20evaluation/?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 fireflyframework_agentic/evaluation/__init__.py | 4 ++--
 fireflyframework_agentic/lab/__init__.py        | 3 ---
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py
index c2005e7a..c68f5a19 100644
--- a/fireflyframework_agentic/evaluation/__init__.py
+++ b/fireflyframework_agentic/evaluation/__init__.py
@@ -82,9 +82,9 @@
 from fireflyframework_agentic.evaluation.judge_client import (
     same_provider as same_provider,
 )
-from fireflyframework_agentic.lab.retrieval_metrics import (
+from fireflyframework_agentic.evaluation.retrieval_metrics import (
     RetrieverMetrics as RetrieverMetrics,
 )
-from fireflyframework_agentic.lab.retrieval_metrics import (
+from fireflyframework_agentic.evaluation.retrieval_metrics import (
     compute_retrieval_metrics as compute_retrieval_metrics,
 )
diff --git a/fireflyframework_agentic/lab/__init__.py b/fireflyframework_agentic/lab/__init__.py
index 8e127d8a..46cc08dc 100644
--- a/fireflyframework_agentic/lab/__init__.py
+++ b/fireflyframework_agentic/lab/__init__.py
@@ -18,7 +18,6 @@
 from fireflyframework_agentic.lab.comparison import ComparisonEntry, ModelComparison
 from fireflyframework_agentic.lab.dataset import EvalCase, EvalDataset
 from fireflyframework_agentic.lab.evaluator import EvalOrchestrator, EvalReport, EvalResult
-from fireflyframework_agentic.lab.retrieval_metrics import RetrieverMetrics, compute_retrieval_metrics
 from fireflyframework_agentic.lab.session import LabSession, SessionEntry
 
 __all__ = [
@@ -32,7 +31,5 @@
     "EvalResult",
     "LabSession",
     "ModelComparison",
-    "RetrieverMetrics",
     "SessionEntry",
-    "compute_retrieval_metrics",
 ]

From 9229c4348656c3e1e992780be6dc4fcdb06cea2f Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 14:58:46 +0200
Subject: [PATCH 39/48] refactor(evaluation): move test_retrieval_metrics.py to
 tests/unit/evaluation/

---
 tests/unit/{lab => evaluation}/test_retrieval_metrics.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename tests/unit/{lab => evaluation}/test_retrieval_metrics.py (100%)

diff --git a/tests/unit/lab/test_retrieval_metrics.py b/tests/unit/evaluation/test_retrieval_metrics.py
similarity index 100%
rename from tests/unit/lab/test_retrieval_metrics.py
rename to tests/unit/evaluation/test_retrieval_metrics.py

From 6cdd3db11edda4e42c57791e632a2b594e8510cc Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 15:00:19 +0200
Subject: [PATCH 40/48] refactor(evaluation): replace RetrieverMetrics class
 with plain functions

---
 .../evaluation/retrieval_metrics.py           | 270 +++++++++---------
 1 file changed, 140 insertions(+), 130 deletions(-)

diff --git a/fireflyframework_agentic/evaluation/retrieval_metrics.py b/fireflyframework_agentic/evaluation/retrieval_metrics.py
index ee129eec..5a318a2a 100644
--- a/fireflyframework_agentic/evaluation/retrieval_metrics.py
+++ b/fireflyframework_agentic/evaluation/retrieval_metrics.py
@@ -14,46 +14,48 @@
 
 """Deterministic IR evaluation metrics for ranked retrieval results (no LLM, no network).
 
-Industry-standard information-retrieval metrics computed over a ranked list of
-retrieved chunks vs the gold set each result carries (``gold`` + per-hit
-``is_gold``).  Metrics are reported at cut-offs k ∈ {1, 5, 10}:
-
-* **Hit@k** -- at least one gold document appears in the top-k results.
-* **Recall@k** -- fraction of gold documents found in top-k.
-* **Precision@k** -- fraction of top-k results that are gold.
-* **MRR@10** -- mean reciprocal rank of the first gold hit (up to k=10).
-* **MAP@10** -- mean average precision (up to k=10).
-* **nDCG@10** -- normalised discounted cumulative gain (up to k=10).
-
-Optional fields (populated when the raw result rows contain them):
-
-* ``no_answer_rate`` -- fraction of rows where the model produced no answer.
-* ``citation_precision`` -- precision of in-answer citations vs gold set.
-* ``mean_search_ms`` / ``mean_answer_ms`` -- mean retrieval and generation latencies.
-
-Ported from ``flycanon_experiments/scripts/deterministic_eval.py``.
+Each metric is a plain function that takes a list of result rows and returns a
+float — the same design as scikit-learn or MS MARCO evaluation scripts.
+
+Result row schema (dict)::
+
+    {
+        "retrieved": [{"rank": int, "source_id": str, "is_gold": bool}, ...],
+        "gold": [str, ...],          # gold source identifiers
+        # optional:
+        "no_answer": bool,           # model refused / produced no answer
+        "answer": str,               # used for no_answer detection when no_answer absent
+        "citations": [{"is_gold": bool}, ...],
+        "search_ms": float,
+        "answer_ms": float,
+    }
+
+Individual metrics (recommended for composability)::
+
+    hit_at_k(results, k)        -> float
+    recall_at_k(results, k)     -> float
+    precision_at_k(results, k)  -> float
+    mrr(results, k=10)          -> float
+    map_score(results, k=10)    -> float
+    ndcg(results, k=10)         -> float
+    no_answer_rate(results)     -> float | None
+    citation_precision(results) -> float | None
+    mean_latency_ms(results, field) -> float | None
+
+Convenience aggregate (all metrics in one call)::
+
+    compute_retrieval_metrics(results) -> dict
 """
 
 from __future__ import annotations
 
 import math
 
-from pydantic import BaseModel
-
 KS = (1, 5, 10)
 
 
 def _dedup(retrieved: list[dict]) -> list[dict]:
-    """Return one entry per source, first chunk wins, preserving rank order.
-
-    flycanon splits each ingested document into many chunks; a single gold
-    filing can therefore appear multiple times in the ranked list.  Without
-    deduplication nDCG/MAP/Recall count every chunk separately, inflating
-    scores past 1.0 when a good embedding model retrieves several chunks from
-    the same filing.  Taking only the first (highest-ranked) chunk per
-    source_id makes the list item-unique, matching the recommenders-library
-    contract that all IR formulae assume.
-    """
+    """Return one entry per source, first chunk wins, preserving rank order."""
     seen: set[str] = set()
     out: list[dict] = []
     for r in sorted(retrieved, key=lambda x: x["rank"]):
@@ -64,15 +66,13 @@ def _dedup(retrieved: list[dict]) -> list[dict]:
     return out
 
 
-def _ndcg(retrieved: list[dict], n_gold: int, k: int = 10) -> float:
-    """Return nDCG@k for a single query."""
+def _ndcg_single(retrieved: list[dict], n_gold: int, k: int = 10) -> float:
     dcg = sum(1.0 / math.log2(r["rank"] + 1) for r in retrieved if r.get("is_gold") and r["rank"] <= k)
     ideal = sum(1.0 / math.log2(i + 2) for i in range(min(n_gold, k)))
     return dcg / ideal if ideal else 0.0
 
 
-def _ap(retrieved: list[dict], n_gold: int, k: int = 10) -> float:
-    """Return average precision@k for a single query."""
+def _ap_single(retrieved: list[dict], n_gold: int, k: int = 10) -> float:
     hits, precisions = 0, []
     for r in sorted(retrieved, key=lambda x: x["rank"]):
         if r["rank"] > k:
@@ -83,114 +83,124 @@ def _ap(retrieved: list[dict], n_gold: int, k: int = 10) -> float:
     return sum(precisions) / min(n_gold, k) if n_gold else 0.0
 
 
-def compute_retrieval_metrics(results: list[dict]) -> dict:
-    """Compute deterministic IR metrics over a list of retrieval result rows.
+def hit_at_k(results: list[dict], k: int) -> float:
+    """Fraction of queries where at least one gold document appears in top-k."""
+    if not results:
+        return 0.0
+    hits = 0
+    for row in results:
+        retrieved = _dedup(row["retrieved"])
+        gold_ranks = [r["rank"] for r in retrieved if r.get("is_gold")]
+        if any(g <= k for g in gold_ranks):
+            hits += 1
+    return round(hits / len(results), 4)
+
 
-    Each element of *results* must be a dict with at least:
+def recall_at_k(results: list[dict], k: int) -> float:
+    """Mean fraction of gold documents found in top-k."""
+    if not results:
+        return 0.0
+    total = 0.0
+    for row in results:
+        retrieved = _dedup(row["retrieved"])
+        n_gold = max(len(set(row["gold"])), 1)
+        gold_ranks = [r["rank"] for r in retrieved if r.get("is_gold")]
+        total += len([g for g in gold_ranks if g <= k]) / n_gold
+    return round(total / len(results), 4)
 
-    * ``retrieved`` -- list of dicts with ``rank`` (int, 1-based), ``source_id``
-      (str) or ``identities`` (list[str]), and ``is_gold`` (bool).
-    * ``gold`` -- list of gold source identifiers (used to compute ``n_gold``).
 
-    Optional keys per row:
+def precision_at_k(results: list[dict], k: int) -> float:
+    """Mean fraction of top-k results that are gold."""
+    if not results:
+        return 0.0
+    total = 0.0
+    for row in results:
+        retrieved = _dedup(row["retrieved"])
+        gold_ranks = [r["rank"] for r in retrieved if r.get("is_gold")]
+        total += len([g for g in gold_ranks if g <= k]) / k
+    return round(total / len(results), 4)
 
-    * ``no_answer`` (bool) / ``answer`` (str) -- used for ``no_answer_rate``.
-    * ``citations`` (list[dict]) -- each with ``is_gold`` (bool) for citation precision.
-    * ``search_ms`` (float) / ``answer_ms`` (float) -- latency in milliseconds.
 
-    Returns a flat dict with keys: ``n_queries``, ``hit@1``, ``hit@5``,
-    ``hit@10``, ``recall@1``, ``recall@5``, ``recall@10``, ``precision@1``,
-    ``precision@5``, ``precision@10``, ``mrr@10``, ``map@10``, ``ndcg@10``,
-    ``no_answer_rate``, ``citation_precision``, ``mean_search_ms``,
-    ``mean_answer_ms``.
-    """
-    n = len(results)
-    agg = {f"{m}@{k}": 0.0 for k in KS for m in ("hit", "recall", "precision")}
-    agg.update({"mrr@10": 0.0, "map@10": 0.0, "ndcg@10": 0.0})
-    no_answer = 0
-    cite_num = cite_den = 0.0
-    search_ms: list[float] = []
-    answer_ms: list[float] = []
+def mrr(results: list[dict], k: int = 10) -> float:
+    """Mean reciprocal rank of the first gold hit (up to k)."""
+    if not results:
+        return 0.0
+    total = 0.0
+    for row in results:
+        retrieved = _dedup(row["retrieved"])
+        gold_ranks = sorted(r["rank"] for r in retrieved if r.get("is_gold") and r["rank"] <= k)
+        total += 1.0 / gold_ranks[0] if gold_ranks else 0.0
+    return round(total / len(results), 4)
+
 
+def map_score(results: list[dict], k: int = 10) -> float:
+    """Mean average precision at k."""
+    if not results:
+        return 0.0
+    total = 0.0
     for row in results:
         retrieved = _dedup(row["retrieved"])
         n_gold = max(len(set(row["gold"])), 1)
-        gold_ranks = [r["rank"] for r in retrieved if r.get("is_gold")]
-        for k in KS:
-            in_k = [g for g in gold_ranks if g <= k]
-            agg[f"hit@{k}"] += 1.0 if in_k else 0.0
-            agg[f"recall@{k}"] += len(in_k) / n_gold
-            agg[f"precision@{k}"] += len(in_k) / k
-        agg["mrr@10"] += (1.0 / min(gold_ranks)) if gold_ranks else 0.0
-        agg["map@10"] += _ap(retrieved, n_gold)
-        agg["ndcg@10"] += _ndcg(retrieved, n_gold)
-
-        if row.get("no_answer") or not row.get("answer", "").strip():
-            no_answer += 1
+        total += _ap_single(retrieved, n_gold, k)
+    return round(total / len(results), 4)
+
+
+def ndcg(results: list[dict], k: int = 10) -> float:
+    """Mean normalised discounted cumulative gain at k."""
+    if not results:
+        return 0.0
+    total = 0.0
+    for row in results:
+        retrieved = _dedup(row["retrieved"])
+        n_gold = max(len(set(row["gold"])), 1)
+        total += _ndcg_single(retrieved, n_gold, k)
+    return round(total / len(results), 4)
+
+
+def no_answer_rate(results: list[dict]) -> float | None:
+    """Fraction of queries where the model produced no answer. None if no results."""
+    if not results:
+        return None
+    count = sum(
+        1 for row in results if row.get("no_answer") or not row.get("answer", "").strip()
+    )
+    return round(count / len(results), 4)
+
+
+def citation_precision(results: list[dict]) -> float | None:
+    """Precision of in-answer citations vs gold set. None if no citations present."""
+    num = den = 0.0
+    for row in results:
         cites = row.get("citations", [])
         if cites:
-            cite_num += sum(1 for c in cites if c.get("is_gold"))
-            cite_den += len(cites)
-        if row.get("search_ms") is not None:
-            search_ms.append(row["search_ms"])
-        if row.get("answer_ms") is not None:
-            answer_ms.append(row["answer_ms"])
-
-    out: dict[str, object] = {k: round(v / n, 4) for k, v in agg.items()} if n else {}
-    out["n_queries"] = n
-    out["no_answer_rate"] = round(no_answer / n, 4) if n else None
-    out["citation_precision"] = round(cite_num / cite_den, 4) if cite_den else None
-    out["mean_search_ms"] = round(sum(search_ms) / len(search_ms)) if search_ms else None
-    out["mean_answer_ms"] = round(sum(answer_ms) / len(answer_ms)) if answer_ms else None
-    return out
+            num += sum(1 for c in cites if c.get("is_gold"))
+            den += len(cites)
+    return round(num / den, 4) if den else None
 
 
-class RetrieverMetrics(BaseModel):
-    """Structured IR metrics for a retrieval evaluation run.
+def mean_latency_ms(results: list[dict], field: str) -> float | None:
+    """Mean latency in ms for the given field (``search_ms`` or ``answer_ms``). None if absent."""
+    values = [row[field] for row in results if row.get(field) is not None]
+    return round(sum(values) / len(values)) if values else None
 
-    Fields mirror the flat dict returned by :func:`compute_retrieval_metrics`.
-    Optional fields are ``None`` when the raw result rows lack the required data
-    (e.g. no latency timestamps, no citations).
-    """
 
-    n_queries: int = 0
-    hit_at_1: float = 0.0
-    hit_at_5: float = 0.0
-    hit_at_10: float = 0.0
-    recall_at_1: float = 0.0
-    recall_at_5: float = 0.0
-    recall_at_10: float = 0.0
-    precision_at_1: float = 0.0
-    precision_at_5: float = 0.0
-    precision_at_10: float = 0.0
-    mrr_at_10: float = 0.0
-    map_at_10: float = 0.0
-    ndcg_at_10: float = 0.0
-    no_answer_rate: float | None = None
-    citation_precision: float | None = None
-    mean_search_ms: float | None = None
-    mean_answer_ms: float | None = None
-
-    @classmethod
-    def from_results(cls, results: list[dict]) -> RetrieverMetrics:
-        """Compute metrics from raw retrieval result rows and return a model instance."""
-        m = compute_retrieval_metrics(results)
-        return cls(
-            n_queries=m.get("n_queries", 0),
-            hit_at_1=m.get("hit@1", 0.0),
-            hit_at_5=m.get("hit@5", 0.0),
-            hit_at_10=m.get("hit@10", 0.0),
-            recall_at_1=m.get("recall@1", 0.0),
-            recall_at_5=m.get("recall@5", 0.0),
-            recall_at_10=m.get("recall@10", 0.0),
-            precision_at_1=m.get("precision@1", 0.0),
-            precision_at_5=m.get("precision@5", 0.0),
-            precision_at_10=m.get("precision@10", 0.0),
-            mrr_at_10=m.get("mrr@10", 0.0),
-            map_at_10=m.get("map@10", 0.0),
-            ndcg_at_10=m.get("ndcg@10", 0.0),
-            no_answer_rate=m.get("no_answer_rate"),
-            citation_precision=m.get("citation_precision"),
-            mean_search_ms=m.get("mean_search_ms"),
-            mean_answer_ms=m.get("mean_answer_ms"),
-        )
+def compute_retrieval_metrics(results: list[dict]) -> dict:
+    """Compute all IR metrics over a list of retrieval result rows and return a flat dict.
+
+    Convenience wrapper that calls each individual metric function. Prefer the
+    individual functions (``hit_at_k``, ``recall_at_k``, etc.) when you only
+    need a subset.
+    """
+    out: dict[str, object] = {"n_queries": len(results)}
+    for k in KS:
+        out[f"hit@{k}"] = hit_at_k(results, k)
+        out[f"recall@{k}"] = recall_at_k(results, k)
+        out[f"precision@{k}"] = precision_at_k(results, k)
+    out["mrr@10"] = mrr(results)
+    out["map@10"] = map_score(results)
+    out["ndcg@10"] = ndcg(results)
+    out["no_answer_rate"] = no_answer_rate(results)
+    out["citation_precision"] = citation_precision(results)
+    out["mean_search_ms"] = mean_latency_ms(results, "search_ms")
+    out["mean_answer_ms"] = mean_latency_ms(results, "answer_ms")
+    return out

From 3a3c35fbb775340ddc3c05f5265849610a90bac2 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 15:00:34 +0200
Subject: [PATCH 41/48] =?UTF-8?q?refactor(evaluation):=20update=20=5F=5Fin?=
 =?UTF-8?q?it=5F=5F.py=20exports=20=E2=80=94=20replace=20RetrieverMetrics?=
 =?UTF-8?q?=20with=20individual=20functions?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../evaluation/__init__.py                    | 26 ++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py
index c68f5a19..9f31ee7b 100644
--- a/fireflyframework_agentic/evaluation/__init__.py
+++ b/fireflyframework_agentic/evaluation/__init__.py
@@ -83,8 +83,32 @@
     same_provider as same_provider,
 )
 from fireflyframework_agentic.evaluation.retrieval_metrics import (
-    RetrieverMetrics as RetrieverMetrics,
+    citation_precision as citation_precision,
 )
 from fireflyframework_agentic.evaluation.retrieval_metrics import (
     compute_retrieval_metrics as compute_retrieval_metrics,
 )
+from fireflyframework_agentic.evaluation.retrieval_metrics import (
+    hit_at_k as hit_at_k,
+)
+from fireflyframework_agentic.evaluation.retrieval_metrics import (
+    map_score as map_score,
+)
+from fireflyframework_agentic.evaluation.retrieval_metrics import (
+    mean_latency_ms as mean_latency_ms,
+)
+from fireflyframework_agentic.evaluation.retrieval_metrics import (
+    mrr as mrr,
+)
+from fireflyframework_agentic.evaluation.retrieval_metrics import (
+    ndcg as ndcg,
+)
+from fireflyframework_agentic.evaluation.retrieval_metrics import (
+    no_answer_rate as no_answer_rate,
+)
+from fireflyframework_agentic.evaluation.retrieval_metrics import (
+    precision_at_k as precision_at_k,
+)
+from fireflyframework_agentic.evaluation.retrieval_metrics import (
+    recall_at_k as recall_at_k,
+)

From 26bfe3b0b56362039eb00c2d0859858ed52e542d Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 15:01:27 +0200
Subject: [PATCH 42/48] test(evaluation): rewrite test_retrieval_metrics for
 individual metric functions

---
 .../unit/evaluation/test_retrieval_metrics.py | 254 ++++++++----------
 1 file changed, 107 insertions(+), 147 deletions(-)

diff --git a/tests/unit/evaluation/test_retrieval_metrics.py b/tests/unit/evaluation/test_retrieval_metrics.py
index 1053c550..38fc07fe 100644
--- a/tests/unit/evaluation/test_retrieval_metrics.py
+++ b/tests/unit/evaluation/test_retrieval_metrics.py
@@ -12,233 +12,193 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Unit tests for lab.retrieval_metrics: compute_retrieval_metrics and RetrieverMetrics."""
+"""Unit tests for evaluation.retrieval_metrics."""
 
 from __future__ import annotations
 
-from fireflyframework_agentic.lab.retrieval_metrics import (
-    RetrieverMetrics,
+from fireflyframework_agentic.evaluation.retrieval_metrics import (
+    citation_precision,
     compute_retrieval_metrics,
+    hit_at_k,
+    map_score,
+    mean_latency_ms,
+    mrr,
+    ndcg,
+    no_answer_rate,
+    precision_at_k,
+    recall_at_k,
 )
 
-# ── helpers ───────────────────────────────────────────────────────────────────
-
 
 def _row(gold_rank: int | None = None, total: int = 5, n_gold: int = 1) -> dict:
-    """Build one result row with ``total`` retrieved items.
-
-    If ``gold_rank`` is not None, the item at that rank is marked as gold.
-    All items get a unique ``source_id`` so dedup leaves them all.
-    """
     retrieved = []
     for rank in range(1, total + 1):
-        retrieved.append(
-            {
-                "rank": rank,
-                "source_id": f"doc-{rank}",
-                "is_gold": rank == gold_rank,
-            }
-        )
+        retrieved.append({"rank": rank, "source_id": f"doc-{rank}", "is_gold": rank == gold_rank})
     gold_ids = [f"doc-{gold_rank}"] if gold_rank is not None else []
-    return {
-        "retrieved": retrieved,
-        "gold": gold_ids * n_gold,
-    }
+    return {"retrieved": retrieved, "gold": gold_ids * n_gold}
 
 
-# ── hit@k ─────────────────────────────────────────────────────────────────────
+# ── hit_at_k ──────────────────────────────────────────────────────────────────
 
 
-def test_hit_at_1_perfect_when_gold_is_rank1():
-    results = [_row(gold_rank=1)]
-    m = compute_retrieval_metrics(results)
-    assert m["hit@1"] == 1.0
+def test_hit_at_k_gold_at_rank1():
+    assert hit_at_k([_row(gold_rank=1)], k=1) == 1.0
 
 
-def test_hit_at_1_zero_when_gold_not_in_top1():
-    results = [_row(gold_rank=2)]
-    m = compute_retrieval_metrics(results)
-    assert m["hit@1"] == 0.0
+def test_hit_at_k_miss_at_rank1():
+    assert hit_at_k([_row(gold_rank=2)], k=1) == 0.0
+
+
+def test_hit_at_k_gold_at_rank5():
+    assert hit_at_k([_row(gold_rank=5)], k=5) == 1.0
 
 
-def test_hit_at_5_one_when_gold_at_rank5():
-    results = [_row(gold_rank=5)]
-    m = compute_retrieval_metrics(results)
-    assert m["hit@5"] == 1.0
+def test_hit_at_k_gold_at_rank10():
+    assert hit_at_k([_row(gold_rank=10, total=10)], k=10) == 1.0
 
 
-def test_hit_at_5_zero_when_gold_not_in_top5():
-    # Gold is at rank 10 — outside top-5 window with only 5 items, make 10.
-    results = [_row(gold_rank=None, total=10)]  # no gold in retrieved
-    m = compute_retrieval_metrics(results)
-    assert m["hit@5"] == 0.0
+def test_hit_at_k_empty():
+    assert hit_at_k([], k=5) == 0.0
 
 
-def test_hit_at_10_one_when_gold_at_rank10():
-    results = [_row(gold_rank=10, total=10)]
-    m = compute_retrieval_metrics(results)
-    assert m["hit@10"] == 1.0
+# ── recall_at_k ───────────────────────────────────────────────────────────────
 
 
-# ── recall@k ──────────────────────────────────────────────────────────────────
+def test_recall_at_k_full_when_gold_at_rank1():
+    assert recall_at_k([_row(gold_rank=1, n_gold=1)], k=1) == 1.0
+
+
+def test_recall_at_k_zero_when_gold_outside_k():
+    assert recall_at_k([_row(gold_rank=5)], k=1) == 0.0
 
 
 def test_recall_at_k_increases_with_k():
-    # Gold at rank 3: recall@1=0, recall@5>=recall@1.
-    results = [_row(gold_rank=3)]
-    m = compute_retrieval_metrics(results)
-    assert m["recall@1"] <= m["recall@5"] <= m["recall@10"]
+    rows = [_row(gold_rank=3)]
+    assert recall_at_k(rows, k=1) <= recall_at_k(rows, k=5) <= recall_at_k(rows, k=10)
+
+
+# ── precision_at_k ────────────────────────────────────────────────────────────
 
 
-def test_recall_at_1_full_when_single_gold_at_rank1():
-    results = [_row(gold_rank=1, n_gold=1)]
-    m = compute_retrieval_metrics(results)
-    assert m["recall@1"] == 1.0
+def test_precision_at_k_gold_at_rank1():
+    assert precision_at_k([_row(gold_rank=1)], k=1) == 1.0
 
 
-def test_recall_at_1_zero_when_no_gold_in_rank1():
-    results = [_row(gold_rank=5)]
-    m = compute_retrieval_metrics(results)
-    assert m["recall@1"] == 0.0
+def test_precision_at_k_decreases_when_k_larger():
+    rows = [_row(gold_rank=1)]
+    assert precision_at_k(rows, k=5) < precision_at_k(rows, k=1)
 
 
-# ── MRR ───────────────────────────────────────────────────────────────────────
+# ── mrr ───────────────────────────────────────────────────────────────────────
 
 
-def test_mrr_is_1_when_gold_at_rank1():
-    results = [_row(gold_rank=1)]
-    m = compute_retrieval_metrics(results)
-    assert m["mrr@10"] == 1.0
+def test_mrr_gold_at_rank1():
+    assert mrr([_row(gold_rank=1)]) == 1.0
 
 
-def test_mrr_is_half_when_gold_at_rank2():
-    results = [_row(gold_rank=2)]
-    m = compute_retrieval_metrics(results)
-    assert abs(m["mrr@10"] - 0.5) < 1e-9
+def test_mrr_gold_at_rank2():
+    assert abs(mrr([_row(gold_rank=2)]) - 0.5) < 1e-9
 
 
-def test_mrr_is_zero_when_no_gold():
-    results = [_row(gold_rank=None)]
-    m = compute_retrieval_metrics(results)
-    assert m["mrr@10"] == 0.0
+def test_mrr_no_gold():
+    assert mrr([_row(gold_rank=None)]) == 0.0
 
 
 def test_mrr_average_across_queries():
-    # Query 1: gold at rank 1 (MRR=1.0); Query 2: gold at rank 2 (MRR=0.5).
-    results = [_row(gold_rank=1), _row(gold_rank=2)]
-    m = compute_retrieval_metrics(results)
-    assert abs(m["mrr@10"] - 0.75) < 1e-3
+    rows = [_row(gold_rank=1), _row(gold_rank=2)]
+    assert abs(mrr(rows) - 0.75) < 1e-3
 
 
-# ── nDCG ──────────────────────────────────────────────────────────────────────
+# ── ndcg ──────────────────────────────────────────────────────────────────────
 
 
-def test_ndcg_is_1_when_gold_at_rank1():
-    results = [_row(gold_rank=1, n_gold=1)]
-    m = compute_retrieval_metrics(results)
-    assert abs(m["ndcg@10"] - 1.0) < 1e-9
+def test_ndcg_gold_at_rank1():
+    assert abs(ndcg([_row(gold_rank=1, n_gold=1)]) - 1.0) < 1e-9
 
 
-def test_ndcg_is_less_than_1_when_gold_not_at_rank1():
-    results = [_row(gold_rank=3, n_gold=1)]
-    m = compute_retrieval_metrics(results)
-    assert m["ndcg@10"] < 1.0
-    assert m["ndcg@10"] > 0.0
+def test_ndcg_less_than_1_when_not_at_rank1():
+    score = ndcg([_row(gold_rank=3, n_gold=1)])
+    assert 0.0 < score < 1.0
 
 
-def test_ndcg_is_zero_when_no_gold():
-    results = [_row(gold_rank=None)]
-    m = compute_retrieval_metrics(results)
-    assert m["ndcg@10"] == 0.0
+def test_ndcg_zero_when_no_gold():
+    assert ndcg([_row(gold_rank=None)]) == 0.0
 
 
-# ── n_queries ─────────────────────────────────────────────────────────────────
+# ── map_score ─────────────────────────────────────────────────────────────────
 
 
-def test_n_queries_matches_input_length():
-    results = [_row(gold_rank=1), _row(gold_rank=2), _row(gold_rank=3)]
-    m = compute_retrieval_metrics(results)
-    assert m["n_queries"] == 3
+def test_map_score_perfect_when_gold_at_rank1():
+    assert map_score([_row(gold_rank=1, n_gold=1)]) == 1.0
 
 
-def test_empty_results_returns_zero_n_queries():
-    m = compute_retrieval_metrics([])
-    assert m["n_queries"] == 0
+def test_map_score_zero_when_no_gold():
+    assert map_score([_row(gold_rank=None)]) == 0.0
+
+
+# ── no_answer_rate ────────────────────────────────────────────────────────────
+
 
+def test_no_answer_rate_zero_when_answer_present():
+    rows = [{**_row(gold_rank=1), "answer": "some answer"}]
+    assert no_answer_rate(rows) == 0.0
 
-# ── optional fields ───────────────────────────────────────────────────────────
 
+def test_no_answer_rate_one_when_no_answer_field():
+    assert no_answer_rate([_row(gold_rank=1)]) == 1.0
 
-def test_no_answer_rate_is_zero_when_answer_present():
-    # Rows with a non-empty answer string are counted as answered.
-    results = [{**_row(gold_rank=1), "answer": "some answer text"}]
-    m = compute_retrieval_metrics(results)
-    assert m["no_answer_rate"] == 0.0
 
+def test_no_answer_rate_none_when_empty():
+    assert no_answer_rate([]) is None
 
-def test_no_answer_rate_is_one_when_no_answer_field():
-    # Rows without an answer field are treated as no-answer by the implementation.
-    results = [_row(gold_rank=1)]
-    m = compute_retrieval_metrics(results)
-    assert m["no_answer_rate"] == 1.0
 
+# ── citation_precision ────────────────────────────────────────────────────────
 
-def test_citation_precision_is_none_when_no_citations():
-    results = [_row(gold_rank=1)]
-    m = compute_retrieval_metrics(results)
-    assert m["citation_precision"] is None
 
+def test_citation_precision_none_when_no_citations():
+    assert citation_precision([_row(gold_rank=1)]) is None
 
-def test_latency_fields_are_none_when_absent():
-    results = [_row(gold_rank=1)]
-    m = compute_retrieval_metrics(results)
-    assert m["mean_search_ms"] is None
-    assert m["mean_answer_ms"] is None
 
+def test_citation_precision_1_when_all_gold():
+    rows = [{**_row(gold_rank=1), "citations": [{"is_gold": True}, {"is_gold": True}]}]
+    assert citation_precision(rows) == 1.0
 
-def test_mean_search_ms_computed_when_present():
-    results = [{**_row(gold_rank=1), "search_ms": 100.0, "answer_ms": 200.0}]
-    m = compute_retrieval_metrics(results)
-    assert m["mean_search_ms"] == 100
-    assert m["mean_answer_ms"] == 200
 
+def test_citation_precision_half_when_half_gold():
+    rows = [{**_row(gold_rank=1), "citations": [{"is_gold": True}, {"is_gold": False}]}]
+    assert citation_precision(rows) == 0.5
 
-# ── RetrieverMetrics.from_results ─────────────────────────────────────────────
 
+# ── mean_latency_ms ───────────────────────────────────────────────────────────
 
-def test_retriever_metrics_from_results_hit_at_1():
-    results = [_row(gold_rank=1)]
-    rm = RetrieverMetrics.from_results(results)
-    assert rm.hit_at_1 == 1.0
 
+def test_mean_latency_none_when_field_absent():
+    assert mean_latency_ms([_row(gold_rank=1)], "search_ms") is None
 
-def test_retriever_metrics_from_results_n_queries():
-    results = [_row(gold_rank=1), _row(gold_rank=2)]
-    rm = RetrieverMetrics.from_results(results)
-    assert rm.n_queries == 2
 
+def test_mean_latency_computed_when_present():
+    rows = [{**_row(gold_rank=1), "search_ms": 100.0, "answer_ms": 200.0}]
+    assert mean_latency_ms(rows, "search_ms") == 100
+    assert mean_latency_ms(rows, "answer_ms") == 200
 
-def test_retriever_metrics_from_results_mrr():
-    results = [_row(gold_rank=1)]
-    rm = RetrieverMetrics.from_results(results)
-    assert rm.mrr_at_10 == 1.0
 
+# ── compute_retrieval_metrics (aggregate) ─────────────────────────────────────
 
-def test_retriever_metrics_from_results_defaults_on_empty():
-    rm = RetrieverMetrics.from_results([])
-    assert rm.n_queries == 0
-    assert rm.hit_at_1 == 0.0
-    assert rm.mrr_at_10 == 0.0
 
+def test_compute_retrieval_metrics_n_queries():
+    assert compute_retrieval_metrics([_row(1), _row(2), _row(3)])["n_queries"] == 3
 
-def test_retriever_metrics_is_pydantic_model():
-    rm = RetrieverMetrics()
-    assert rm.n_queries == 0
-    assert rm.hit_at_1 == 0.0
-    assert rm.no_answer_rate is None
+
+def test_compute_retrieval_metrics_empty():
+    m = compute_retrieval_metrics([])
+    assert m["n_queries"] == 0
+    assert m["hit@1"] == 0.0
 
 
-def test_retriever_metrics_recall_increases_with_k():
-    results = [_row(gold_rank=3)]
-    rm = RetrieverMetrics.from_results(results)
-    assert rm.recall_at_1 <= rm.recall_at_5 <= rm.recall_at_10
+def test_compute_retrieval_metrics_matches_individual_functions():
+    rows = [_row(gold_rank=1), _row(gold_rank=2)]
+    m = compute_retrieval_metrics(rows)
+    assert m["hit@1"] == hit_at_k(rows, 1)
+    assert m["recall@5"] == recall_at_k(rows, 5)
+    assert m["mrr@10"] == mrr(rows)
+    assert m["ndcg@10"] == ndcg(rows)

From feadcbdc28a70cc9bd0cf38b3793268108f845f3 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 15:09:06 +0200
Subject: [PATCH 43/48] Remove compute_retrieval_metrics() and KS constant from
 retrieval_metrics

---
 .../evaluation/retrieval_metrics.py           | 29 +------------------
 1 file changed, 1 insertion(+), 28 deletions(-)

diff --git a/fireflyframework_agentic/evaluation/retrieval_metrics.py b/fireflyframework_agentic/evaluation/retrieval_metrics.py
index 5a318a2a..df42ab24 100644
--- a/fireflyframework_agentic/evaluation/retrieval_metrics.py
+++ b/fireflyframework_agentic/evaluation/retrieval_metrics.py
@@ -30,7 +30,7 @@
         "answer_ms": float,
     }
 
-Individual metrics (recommended for composability)::
+Individual metrics::
 
     hit_at_k(results, k)        -> float
     recall_at_k(results, k)     -> float
@@ -41,19 +41,12 @@
     no_answer_rate(results)     -> float | None
     citation_precision(results) -> float | None
     mean_latency_ms(results, field) -> float | None
-
-Convenience aggregate (all metrics in one call)::
-
-    compute_retrieval_metrics(results) -> dict
 """
 
 from __future__ import annotations
 
 import math
 
-KS = (1, 5, 10)
-
-
 def _dedup(retrieved: list[dict]) -> list[dict]:
     """Return one entry per source, first chunk wins, preserving rank order."""
     seen: set[str] = set()
@@ -184,23 +177,3 @@ def mean_latency_ms(results: list[dict], field: str) -> float | None:
     return round(sum(values) / len(values)) if values else None
 
 
-def compute_retrieval_metrics(results: list[dict]) -> dict:
-    """Compute all IR metrics over a list of retrieval result rows and return a flat dict.
-
-    Convenience wrapper that calls each individual metric function. Prefer the
-    individual functions (``hit_at_k``, ``recall_at_k``, etc.) when you only
-    need a subset.
-    """
-    out: dict[str, object] = {"n_queries": len(results)}
-    for k in KS:
-        out[f"hit@{k}"] = hit_at_k(results, k)
-        out[f"recall@{k}"] = recall_at_k(results, k)
-        out[f"precision@{k}"] = precision_at_k(results, k)
-    out["mrr@10"] = mrr(results)
-    out["map@10"] = map_score(results)
-    out["ndcg@10"] = ndcg(results)
-    out["no_answer_rate"] = no_answer_rate(results)
-    out["citation_precision"] = citation_precision(results)
-    out["mean_search_ms"] = mean_latency_ms(results, "search_ms")
-    out["mean_answer_ms"] = mean_latency_ms(results, "answer_ms")
-    return out

From d54814fa98f85f42c8ef20be5f6f74db3b111f81 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 15:09:12 +0200
Subject: [PATCH 44/48] Remove compute_retrieval_metrics export from evaluation
 __init__

---
 fireflyframework_agentic/evaluation/__init__.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py
index 9f31ee7b..35dd32f7 100644
--- a/fireflyframework_agentic/evaluation/__init__.py
+++ b/fireflyframework_agentic/evaluation/__init__.py
@@ -85,9 +85,6 @@
 from fireflyframework_agentic.evaluation.retrieval_metrics import (
     citation_precision as citation_precision,
 )
-from fireflyframework_agentic.evaluation.retrieval_metrics import (
-    compute_retrieval_metrics as compute_retrieval_metrics,
-)
 from fireflyframework_agentic.evaluation.retrieval_metrics import (
     hit_at_k as hit_at_k,
 )

From 08536982e27522f6c3ade60db4c6e2716942e46e Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 15:09:35 +0200
Subject: [PATCH 45/48] Remove test_compute_retrieval_metrics_* tests

---
 .../unit/evaluation/test_retrieval_metrics.py | 21 -------------------
 1 file changed, 21 deletions(-)

diff --git a/tests/unit/evaluation/test_retrieval_metrics.py b/tests/unit/evaluation/test_retrieval_metrics.py
index 38fc07fe..ef38467f 100644
--- a/tests/unit/evaluation/test_retrieval_metrics.py
+++ b/tests/unit/evaluation/test_retrieval_metrics.py
@@ -18,7 +18,6 @@
 
 from fireflyframework_agentic.evaluation.retrieval_metrics import (
     citation_precision,
-    compute_retrieval_metrics,
     hit_at_k,
     map_score,
     mean_latency_ms,
@@ -182,23 +181,3 @@ def test_mean_latency_computed_when_present():
     assert mean_latency_ms(rows, "answer_ms") == 200
 
 
-# ── compute_retrieval_metrics (aggregate) ─────────────────────────────────────
-
-
-def test_compute_retrieval_metrics_n_queries():
-    assert compute_retrieval_metrics([_row(1), _row(2), _row(3)])["n_queries"] == 3
-
-
-def test_compute_retrieval_metrics_empty():
-    m = compute_retrieval_metrics([])
-    assert m["n_queries"] == 0
-    assert m["hit@1"] == 0.0
-
-
-def test_compute_retrieval_metrics_matches_individual_functions():
-    rows = [_row(gold_rank=1), _row(gold_rank=2)]
-    m = compute_retrieval_metrics(rows)
-    assert m["hit@1"] == hit_at_k(rows, 1)
-    assert m["recall@5"] == recall_at_k(rows, 5)
-    assert m["mrr@10"] == mrr(rows)
-    assert m["ndcg@10"] == ndcg(rows)

From a7b1b91843b8c1c848872375d0b01618ceb84143 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 15:10:32 +0200
Subject: [PATCH 46/48] Update flycanon_eval_example to use plain metric
 functions instead of RetrieverMetrics

---
 examples/flycanon_eval_example.py | 74 ++++++++++++++++---------------
 1 file changed, 39 insertions(+), 35 deletions(-)

diff --git a/examples/flycanon_eval_example.py b/examples/flycanon_eval_example.py
index 856b520b..30e66bd1 100644
--- a/examples/flycanon_eval_example.py
+++ b/examples/flycanon_eval_example.py
@@ -26,7 +26,7 @@
 The champion/challenger pattern mirrors the flycanon_experiments harness:
 each run writes metrics to a file; ``approve`` promotes it by repointing
 baseline.json.  Here we replicate that flow using the framework's
-``compute_retrieval_metrics`` / ``RetrieverMetrics`` API directly.
+individual retrieval metric functions directly.
 
 Usage::
 
@@ -94,7 +94,17 @@
 import sys
 from pathlib import Path
 
-from fireflyframework_agentic.evaluation import RetrieverMetrics
+from fireflyframework_agentic.evaluation import (
+    citation_precision,
+    hit_at_k,
+    map_score,
+    mean_latency_ms,
+    mrr,
+    ndcg,
+    no_answer_rate,
+    precision_at_k,
+    recall_at_k,
+)
 
 # ---------------------------------------------------------------------------
 # Helpers
@@ -131,32 +141,31 @@ def _save_baseline(path: str, metrics: dict) -> None:
     Path(path).write_text(json.dumps(metrics, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
 
 
-def _metrics_to_flat(m: RetrieverMetrics) -> dict:
-    """Convert a RetrieverMetrics model to the flat dict stored in baseline.json."""
+def _compute_metrics(results: list[dict]) -> dict:
+    """Compute all IR metrics and return a flat dict."""
     return {
-        "n_queries": m.n_queries,
-        "hit@1": m.hit_at_1,
-        "hit@5": m.hit_at_5,
-        "hit@10": m.hit_at_10,
-        "recall@1": m.recall_at_1,
-        "recall@5": m.recall_at_5,
-        "recall@10": m.recall_at_10,
-        "precision@1": m.precision_at_1,
-        "precision@5": m.precision_at_5,
-        "precision@10": m.precision_at_10,
-        "mrr@10": m.mrr_at_10,
-        "map@10": m.map_at_10,
-        "ndcg@10": m.ndcg_at_10,
-        "no_answer_rate": m.no_answer_rate,
-        "citation_precision": m.citation_precision,
-        "mean_search_ms": m.mean_search_ms,
-        "mean_answer_ms": m.mean_answer_ms,
+        "n_queries": len(results),
+        "hit@1": hit_at_k(results, 1),
+        "hit@5": hit_at_k(results, 5),
+        "hit@10": hit_at_k(results, 10),
+        "recall@1": recall_at_k(results, 1),
+        "recall@5": recall_at_k(results, 5),
+        "recall@10": recall_at_k(results, 10),
+        "precision@1": precision_at_k(results, 1),
+        "precision@5": precision_at_k(results, 5),
+        "precision@10": precision_at_k(results, 10),
+        "mrr@10": mrr(results),
+        "map@10": map_score(results),
+        "ndcg@10": ndcg(results),
+        "no_answer_rate": no_answer_rate(results),
+        "citation_precision": citation_precision(results),
+        "mean_search_ms": mean_latency_ms(results, "search_ms"),
+        "mean_answer_ms": mean_latency_ms(results, "answer_ms"),
     }
 
 
-def _print_metrics_table(metrics: RetrieverMetrics, baseline: dict | None) -> None:
+def _print_metrics_table(flat: dict, baseline: dict | None) -> None:
     """Print a formatted table comparing current metrics vs baseline."""
-    flat = _metrics_to_flat(metrics)
 
     col_w = 22
     num_w = 10
@@ -244,10 +253,6 @@ def run_evaluation(args: argparse.Namespace) -> int:
     # ------------------------------------------------------------------
     # Step 2 — Compute deterministic IR metrics.
     #
-    # compute_retrieval_metrics() returns a flat dict of standard IR metrics.
-    # RetrieverMetrics.from_results() wraps that into a typed Pydantic model
-    # for convenient attribute access.
-    #
     # Metrics are computed at cut-offs k ∈ {1, 5, 10} and include:
     #   hit@k       -- at least one gold doc in top-k (binary)
     #   recall@k    -- fraction of gold docs found in top-k
@@ -257,13 +262,13 @@ def run_evaluation(args: argparse.Namespace) -> int:
     #   ndcg@10     -- normalised discounted cumulative gain
     # ------------------------------------------------------------------
     print("\nComputing retrieval metrics ...")
-    metrics = RetrieverMetrics.from_results(results)
+    flat = _compute_metrics(results)
 
-    print(f"  nDCG@10    : {metrics.ndcg_at_10:.4f}")
-    print(f"  MRR@10     : {metrics.mrr_at_10:.4f}")
-    print(f"  Recall@10  : {metrics.recall_at_10:.4f}")
-    print(f"  Hit@10     : {metrics.hit_at_10:.4f}")
-    print(f"  MAP@10     : {metrics.map_at_10:.4f}")
+    print(f"  nDCG@10    : {flat['ndcg@10']:.4f}")
+    print(f"  MRR@10     : {flat['mrr@10']:.4f}")
+    print(f"  Recall@10  : {flat['recall@10']:.4f}")
+    print(f"  Hit@10     : {flat['hit@10']:.4f}")
+    print(f"  MAP@10     : {flat['map@10']:.4f}")
 
     # ------------------------------------------------------------------
     # Step 3 — Load the baseline (champion) for regression detection.
@@ -282,7 +287,7 @@ def run_evaluation(args: argparse.Namespace) -> int:
     print("\n" + "=" * 56)
     print("Retrieval Metrics")
     print("=" * 56)
-    _print_metrics_table(metrics, baseline)
+    _print_metrics_table(flat, baseline)
 
     # ------------------------------------------------------------------
     # Step 5 — Regression check.
@@ -291,7 +296,6 @@ def run_evaluation(args: argparse.Namespace) -> int:
     # promotion (exit code 1) unless --promote-if-better is set and the
     # run actually improved overall.
     # ------------------------------------------------------------------
-    flat = _metrics_to_flat(metrics)
 
     if baseline:
         regressions = _detect_regressions(flat, baseline)

From 0c911b3d5d0e02d8c47b829d63dedd133b0ed8f5 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 15:12:02 +0200
Subject: [PATCH 47/48] Apply ruff format to retrieval_metrics.py

---
 fireflyframework_agentic/evaluation/retrieval_metrics.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fireflyframework_agentic/evaluation/retrieval_metrics.py b/fireflyframework_agentic/evaluation/retrieval_metrics.py
index df42ab24..7c9c5cfe 100644
--- a/fireflyframework_agentic/evaluation/retrieval_metrics.py
+++ b/fireflyframework_agentic/evaluation/retrieval_metrics.py
@@ -47,6 +47,7 @@
 
 import math
 
+
 def _dedup(retrieved: list[dict]) -> list[dict]:
     """Return one entry per source, first chunk wins, preserving rank order."""
     seen: set[str] = set()
@@ -154,9 +155,7 @@ def no_answer_rate(results: list[dict]) -> float | None:
     """Fraction of queries where the model produced no answer. None if no results."""
     if not results:
         return None
-    count = sum(
-        1 for row in results if row.get("no_answer") or not row.get("answer", "").strip()
-    )
+    count = sum(1 for row in results if row.get("no_answer") or not row.get("answer", "").strip())
     return round(count / len(results), 4)
 
 
@@ -175,5 +174,3 @@ def mean_latency_ms(results: list[dict], field: str) -> float | None:
     """Mean latency in ms for the given field (``search_ms`` or ``answer_ms``). None if absent."""
     values = [row[field] for row in results if row.get(field) is not None]
     return round(sum(values) / len(values)) if values else None
-
-

From ef16882e83038856c182c67ad0818446c135ea2f Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 19 Jun 2026 15:15:10 +0200
Subject: [PATCH 48/48] Apply ruff format to test_retrieval_metrics.py

---
 tests/unit/evaluation/test_retrieval_metrics.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/unit/evaluation/test_retrieval_metrics.py b/tests/unit/evaluation/test_retrieval_metrics.py
index ef38467f..fa453e2d 100644
--- a/tests/unit/evaluation/test_retrieval_metrics.py
+++ b/tests/unit/evaluation/test_retrieval_metrics.py
@@ -179,5 +179,3 @@ def test_mean_latency_computed_when_present():
     rows = [{**_row(gold_rank=1), "search_ms": 100.0, "answer_ms": 200.0}]
     assert mean_latency_ms(rows, "search_ms") == 100
     assert mean_latency_ms(rows, "answer_ms") == 200
-
-