From a2d6770a2339abad096fe6670045a8e81ecdba86 Mon Sep 17 00:00:00 2001 From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com> Date: Thu, 18 Jun 2026 23:33:03 +0200 Subject: [PATCH 01/48] feat(evaluation): add evaluation subpackage skeleton and pyproject entry point (#268) * feat(evaluation): add evaluation subpackage __init__ with gate/champion/judge/retrieval exports * feat(evaluation): add EvalConfig and GateVerdict models * feat(evaluation): add evaluation optional-deps and flyeval CLI entry point to pyproject.toml * feat(evaluation): note evaluation as optional subpackage in top-level __init__ docstring --------- Co-authored-by: miguelgfierro --- fireflyframework_agentic/__init__.py | 7 ++ .../evaluation/__init__.py | 57 +++++++++++++++ fireflyframework_agentic/evaluation/models.py | 70 +++++++++++++++++++ pyproject.toml | 7 ++ 4 files changed, 141 insertions(+) create mode 100644 fireflyframework_agentic/evaluation/__init__.py create mode 100644 fireflyframework_agentic/evaluation/models.py diff --git a/fireflyframework_agentic/__init__.py b/fireflyframework_agentic/__init__.py index 993b0248..1736f1f4 100644 --- a/fireflyframework_agentic/__init__.py +++ b/fireflyframework_agentic/__init__.py @@ -24,6 +24,13 @@ config = get_config() print(config.default_model) + +Optional subpackages (not imported eagerly at the top level): + fireflyframework_agentic.lab -- sessions, benchmarks, datasets, evaluation orchestration + fireflyframework_agentic.experiments -- experiment tracking and comparison + fireflyframework_agentic.evaluation -- gate-based quality gates, LLM-as-judge advisory, + champion/challenger tracking, retrieval metrics + (requires the ``evaluation`` optional extra) """ from importlib.metadata import PackageNotFoundError, version diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py new file mode 100644 index 00000000..1c264f07 --- /dev/null +++ 
b/fireflyframework_agentic/evaluation/__init__.py @@ -0,0 +1,57 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Evaluation subpackage -- gate-based quality gates, LLM-as-judge advisory, champion/challenger tracking, and retrieval metrics. + +Gate pipeline (flags, not vetoes): + G1 -- Structural & Safe (schema + PII + empty-registry guard) + G2 -- Must-finds & negative controls (recall + NC precision) + G3 -- Evidence (grounding / token-anchoring) + G4 -- LLM-as-a-Judge (advisory, opt-in, never decides promotion) + G5 -- No-regression / promotion (champion/challenger comparison) + +Retrieval metrics: + Precision@k, Recall@k, MRR, NDCG -- computed over ranked retrieval results. + +Champion tracking: + Persists the best-known run record so that promotion decisions can be made + against a stable baseline rather than the most recent run. 
+""" + +from importlib.metadata import PackageNotFoundError, version + +from fireflyframework_agentic.evaluation.gates import GateResult, Verdict, render_scorecard, run_gates +from fireflyframework_agentic.evaluation.champion import ChampionRecord, invalidate_champion, load_champion, save_champion +from fireflyframework_agentic.evaluation.judge import AdvisoryReport, run_judge +from fireflyframework_agentic.evaluation.retrieval import RetrieverMetrics, compute_retrieval_metrics + +try: + __version__ = version("fireflyframework-agentic") +except PackageNotFoundError: + __version__ = "0.0.0+dev" + +__all__ = [ + "GateResult", + "Verdict", + "run_gates", + "render_scorecard", + "ChampionRecord", + "load_champion", + "save_champion", + "invalidate_champion", + "AdvisoryReport", + "run_judge", + "RetrieverMetrics", + "compute_retrieval_metrics", +] diff --git a/fireflyframework_agentic/evaluation/models.py b/fireflyframework_agentic/evaluation/models.py new file mode 100644 index 00000000..a98cdf20 --- /dev/null +++ b/fireflyframework_agentic/evaluation/models.py @@ -0,0 +1,70 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Shared config and model classes for the evaluation framework. + +EvalConfig captures the parameters of a single evaluation run: which model +is being tested, which corpus it runs against, and where the supporting +artefacts (registry, baseline, judge config) live. 
class EvalConfig(BaseModel):
    """Configuration for a single evaluation run.

    Only ``model_id``, ``corpus`` and ``run_id`` are required; every other
    field has a default, so a minimal config needs just those three values.

    Parameters:
        model_id: Identifier of the model under evaluation.
        corpus: Name of the evaluation corpus (e.g. "ms_marco_mini", "finance_bench").
        run_id: Unique identifier for this run (e.g. a timestamp or git SHA).
        registry_path: Path to the must-find / golden registry JSON file.
            Defaults to an empty string.
        corpus_path: Path to the corpus directory or bundle. Defaults to an
            empty string.
        baseline_path: Path to a baseline results file for regression comparison.
            Defaults to an empty string.
        judge_model: Model identifier used for the LLM-as-judge advisory pass.
            Defaults to an empty string.
        judge_runs: Number of independent judge calls to aggregate (majority vote).
            Defaults to 3.
        embed_model: Model identifier used for embedding-based retrieval metrics.
            Defaults to an empty string.
        metadata: Arbitrary key/value pairs for run bookkeeping.
    """

    # Required identity of the run.
    model_id: str
    corpus: str
    run_id: str
    # Optional artefact locations.
    # NOTE(review): "" presumably means "not provided" — confirm with the consumers.
    registry_path: str = ""
    corpus_path: str = ""
    baseline_path: str = ""
    # Optional judge / embedding settings.
    judge_model: str = ""
    judge_runs: int = 3
    embed_model: str = ""
    # NOTE(review): a literal ``{}`` default relies on pydantic copying mutable
    # defaults per instance — confirm for the pinned pydantic version.
    metadata: dict[str, Any] = {}


class GateVerdict:
    """Promotion gate verdict constants.

    Use ``GateVerdict.PROMOTE`` when the challenger meets the quality bar and
    is safe to become the new champion. Use ``GateVerdict.HOLD`` when the
    challenger does not meet the bar and must be iterated on.

    The values are plain strings held as class attributes; this is a namespace
    of constants, not an enum.
    """

    PROMOTE: str = "PROMOTE"
    HOLD: str = "HOLD"
a/fireflyframework_agentic/evaluation/__init__.py +++ b/fireflyframework_agentic/evaluation/__init__.py @@ -34,7 +34,9 @@ from fireflyframework_agentic.evaluation.gates import GateResult, Verdict, render_scorecard, run_gates from fireflyframework_agentic.evaluation.champion import ChampionRecord, invalidate_champion, load_champion, save_champion from fireflyframework_agentic.evaluation.judge import AdvisoryReport, run_judge +from fireflyframework_agentic.evaluation.matcher import anchored, matches, source_stem, tokens from fireflyframework_agentic.evaluation.retrieval import RetrieverMetrics, compute_retrieval_metrics +from fireflyframework_agentic.evaluation.stats import aa_band, aggregate_grounding, left_skew_flag try: __version__ = version("fireflyframework-agentic") @@ -54,4 +56,11 @@ "run_judge", "RetrieverMetrics", "compute_retrieval_metrics", + "anchored", + "matches", + "source_stem", + "tokens", + "aa_band", + "aggregate_grounding", + "left_skew_flag", ] diff --git a/fireflyframework_agentic/evaluation/matcher.py b/fireflyframework_agentic/evaluation/matcher.py new file mode 100644 index 00000000..2f5065df --- /dev/null +++ b/fireflyframework_agentic/evaluation/matcher.py @@ -0,0 +1,374 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Single matching primitive reused across G2 (recall/precision) and G3 (grounding). + +anchored() is topic-level lexical overlap. matches() is the gate predicate. 
+One function, three uses — do not write three matching functions. + +Known limitation (EVALUATION_FRAMEWORK.md): anchored() is topic-anchored, not claim-verified. +A '45 days' claim cited to a '3 days' source passes if they share the process name. +Real claim entailment (NLI/AIS) is Phase 2. The G3 human spot-check is the +binding faithfulness signal until then. +""" + +from __future__ import annotations + +import re + +import numpy as np + + +def cosine(a, b) -> float: + """Cosine similarity between two vectors.""" + a = np.asarray(a, dtype=float) + b = np.asarray(b, dtype=float) + return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9)) + + +def tokens(text: str) -> list[str]: + return re.findall(r"\b\w+\b", text.lower()) + + +def anchored(claim: str, evidence: str, *, min_token: int = 5) -> bool: + """True if claim and evidence share at least one non-trivial token (>= min_token chars). + + Rejects a citation to an unrelated document. Does NOT verify the claim value — + that gap is closed by the deferred NLI/AIS check in Phase 2. + """ + a = {t for t in tokens(claim) if len(t) >= min_token} + b = {t for t in tokens(evidence) if len(t) >= min_token} + return bool(a & b) + + +def source_stem(locator: str) -> str: + """Normalize a locator/source path to a stable document stem for matching. + + Robust to the two locator conventions observed across runs: + - directory-prefixed ('sops/SOP-002-kyc-edd.md') and bare ('SOP-002-kyc-edd.md') + both reduce to 'sop-002-kyc-edd'; + - event-log row ids ('src-credit-underwriting:CU-2026-1003') reduce to the + process stem 'credit-underwriting', so they join the CSV the registry cites. + + Preserves the same-document anti-gaming property of matches(): it still keys + on which source document a finding cites — just independent of directory + prefix, file extension, and case, so one registry scores every run. 
+ """ + s = locator.split("#")[0] # drop the locator fragment (#page=N, #anchor) + s = s.rsplit("/", 1)[-1] # basename — strip any directory prefix + if s.startswith("src-") and ":" in s: # event-log row id: src-: + return s.split(":", 1)[0][len("src-") :].lower() + if "." in s: # strip a trailing file extension + s = s.rsplit(".", 1)[0] + return s.lower() + + +def _finding_sources(finding: dict, evidence_index: dict[str, dict]) -> set[str]: + """Return the set of normalized source-document stems cited by a finding.""" + sources: set[str] = set() + for ref in finding.get("evidence_refs", []): + ev = evidence_index.get(ref.get("evidence_id", "")) + if ev: + stem = source_stem(ev.get("locator", "")) + if stem: + sources.add(stem) + return sources + + +def shares_source(finding: dict, item, evidence_index: dict[str, dict]) -> bool: + """True iff the finding cites at least one source document the item lists as evidence. + + Source documents are compared by normalized stem (source_stem) so one registry + scores every run regardless of locator convention. This is the anti-gaming + anchor reused by both the lexical predicate (matches) and the semantic path + (semantic_hits): a finding on a different document cannot satisfy this item. + + Spec-style NC items list their mirror source (§4.1); legacy NC items carry + evidence=[], which makes this always False for them. + + Args: + finding: dict from DiscoveryResult.findings[i] (model_dump output). + item: RegistryItem dataclass from registry.py. + evidence_index: {evidence_id: Evidence dict} built from result['evidence_index']. + """ + finding_sources = _finding_sources(finding, evidence_index) + item_sources = {source_stem(e) for e in item.evidence} + return bool(finding_sources & item_sources) + + +def _keyword_anchored(desc: str, keywords: list[str]) -> bool: + """True iff any keyword appears as a whole word in desc (case-insensitive). 
+ + Keyword rail: exempt from the 5-char token floor so short banking terms + (KYC, PEP, AML) can anchor a match even though they are too short for the + token rail. Whole-word matching prevents false substring hits (e.g. "risk" + inside "enterprise-risk-management"). + """ + if not keywords: + return False + desc_lower = desc.lower() + return any( + re.search(r"\b" + re.escape(kw.lower()) + r"\b", desc_lower) for kw in keywords + ) + + +def candidate_text(candidate: dict, scope: str) -> str: + """Extract the searchable text from a candidate on the given scope surface (§4.3). + + Each scope surface uses different fields as the match text: + - finding / action : title + description + - process / decision : name + description + - activity : name + notes + regulatory_links + - persona : name + role + goals + pain_points + - system : name + description + - informal_channel : name + usage_context + notes + - dependency_graph : name + description (diagnostic nodes; relation items bypass this) + """ + if scope in ("finding", "action"): + return " ".join(filter(None, [candidate.get("title", ""), candidate.get("description", "")])) + if scope == "activity": + rl = candidate.get("regulatory_links") or [] + rl_str = " ".join(rl) if isinstance(rl, list) else str(rl or "") + return " ".join(filter(None, [candidate.get("name", ""), candidate.get("notes", ""), rl_str])) + if scope == "persona": + goals = candidate.get("goals") or [] + pain = candidate.get("pain_points") or [] + goals_str = " ".join(goals) if isinstance(goals, list) else str(goals) + pain_str = " ".join(pain) if isinstance(pain, list) else str(pain) + return " ".join(filter(None, [ + candidate.get("name", ""), + candidate.get("role", ""), + goals_str, + pain_str, + ])) + if scope == "informal_channel": + return " ".join(filter(None, [ + candidate.get("name", ""), + candidate.get("usage_context", ""), + candidate.get("notes", ""), + ])) + # process, decision, system, dependency_graph (diagnostic nodes) + return " 
".join(filter(None, [candidate.get("name", ""), candidate.get("description", "")])) + + +INSIGHT_ITEM_SCOPES = ("finding", "action") +INSIGHT_MATCH_SURFACES = ("finding", "action", "activity", "decision") + + +def allowed_scopes(item) -> tuple[str, ...]: + """Candidate surfaces that may satisfy a registry item. + + Insight items (finding / action) may be satisfied by any insight or process-graph + *leaf* surface (activity / decision): a run often grounds the same operational fact + on a different surface than the registry's scope tag anticipates (the BBVA case — + pain points the registry tags 'finding' that the run emitted as decision/activity + nodes). shares_source is still REQUIRED on every candidate (see matches / + semantic_hits), so a candidate on the wrong document never counts — cross-scope + widens WHERE we look, never the source anchor. + + Structural items (process / activity / decision) stay on their own surface: a + structural must-find requires the run to have actually built that node, not merely + mentioned the fact in a finding (test_process_scope_miss_when_no_matching_process). + NC items are likewise scope-strict — widening a negative control's pool could only + make it easier to trip (a specificity regression), never recover a legitimate hit. + + `process` is never a match surface for an insight item: _candidates_by_scope folds + every child's evidence_refs into the process node, so its citation set is a union of + many documents and shares_source goes vacuous (hence its exclusion from + INSIGHT_MATCH_SURFACES). + """ + if item.tier == "NC": + return (item.scope,) + if item.scope in INSIGHT_ITEM_SCOPES: + return INSIGHT_MATCH_SURFACES + return (item.scope,) + + +def matches( + candidate: dict, + item, + evidence_index: dict[str, dict], + scope: str = "finding", +) -> bool: + """True iff candidate cites a shared source document AND is topic-anchored to item. 
+ + Two-rail anchor (either rail suffices): + - Token rail: ≥1 shared token of ≥5 chars between candidate text and item description. + - Keyword rail: ≥1 item keyword appears as a whole word in the candidate text. + Exempt from the 5-char floor so short banking terms (KYC, PEP, AML) can anchor. + + The ``scope`` controls which fields are read as the candidate's match text (§4.3): + findings and actions use ``title + description``; processes and decisions use + ``name + description``; activities use ``name + notes + regulatory_links``. + + Anti-gaming guard: a candidate on a different document cannot satisfy this item + even if its text happens to match. Source documents are compared by + normalized stem (source_stem) so one registry scores every run regardless of + locator convention. + + Args: + candidate: dict from the DiscoveryResult surface matching ``scope``. + item: RegistryItem dataclass from registry.py. + evidence_index: {evidence_id: Evidence dict} built from result['evidence_index']. + scope: surface the candidate was drawn from (default "finding"). + """ + if not shares_source(candidate, item, evidence_index): + return False + desc = candidate_text(candidate, scope) + return _keyword_anchored(desc, list(item.keywords or [])) or anchored(desc, item.description) + + +def matches_dependency_graph_relation( + item, + result: dict, + evidence_index: dict[str, dict], +) -> bool: + """Endpoint matcher for dependency_graph relation items (§5.3b). + + Stage 1: Anchor both endpoints to activity nodes via token rail. + Stage 2: Verify a directed edge or path connects them in the asserted direction, + behind the shared-source guard on the edge's/path's evidence_refs. + + Returns False when either endpoint anchors to no activity, or when no connecting + edge/path shares a source document with the item. 
+ """ + if not item.from_node or not item.to_node: + return False + + processes = result.get("process_graph", {}).get("processes", []) + all_activities = [a for p in processes for a in p.get("activities", [])] + + def _anchor(endpoint_text: str) -> set[str]: + return { + a["id"] + for a in all_activities + if a.get("id") and anchored(candidate_text(a, "activity"), endpoint_text) + } + + from_ids = _anchor(item.from_node) + to_ids = _anchor(item.to_node) + if not from_ids or not to_ids: + return False + + item_stems = {source_stem(e) for e in item.evidence} + + def _node_stems(node: dict) -> set[str]: + return { + source_stem(evidence_index[r["evidence_id"]].get("locator", "")) + for r in node.get("evidence_refs", []) + if r.get("evidence_id") in evidence_index + } + + dg = result.get("dependency_graph", {}) + + for edge in dg.get("activity_edges", []): + if edge.get("from_node") in from_ids and edge.get("to_node") in to_ids: + if _node_stems(edge) & item_stems: + return True + + for path in dg.get("critical_paths", []): + if not (_node_stems(path) & item_stems): + continue + node_ids = path.get("node_ids", []) + from_pos = [i for i, nid in enumerate(node_ids) if nid in from_ids] + to_pos = [i for i, nid in enumerate(node_ids) if nid in to_ids] + if any(fp < tp for fp in from_pos for tp in to_pos): + return True + + return False + + +def semantic_hits( + candidates: dict[str, list[dict]], + items, + evidence_index: dict[str, dict], + embed_fn, + tau: float = 0.70, + tau_nc: float = 0.85, +) -> dict[str, bool]: + """Opt-in embedding-semantic recall: {item.id: found-by-some-shared-source candidate}. + + Scope-aware: each registry item is evaluated against candidates from its own + scope surface (finding, process, activity, decision, action) using the same + per-scope field extraction as the lexical path (candidate_text). 
Passing only + the findings list (the previous behaviour) would leave process/activity/decision/ + action items with an empty candidate pool and a guaranteed False result. + + Real items (L0–L3): hit iff some scope-matching candidate shares a source + document with the item (shares_source) AND is embedding-similar (cosine >= tau). + Source anchor is preserved — a candidate on a different document cannot recover + a real item. + + NC items (tier=="NC"): hit iff some scope-matching candidate is embedding-similar + (cosine >= tau_nc). When the NC lists its mirror source (§4.1) the shared-source + guard applies; legacy NC items with evidence=[] skip the anchor, with the higher + threshold (default 0.85) compensating. + + Cost is two embed_fn calls — all scope-appropriate candidate texts once and all + item texts once — not O(n*m) per-pair embeddings. + + Args: + candidates: {scope: [candidate dicts]} from _candidates_by_scope(). + items: iterable of RegistryItem dataclasses. + evidence_index: {evidence_id: Evidence dict}. + embed_fn: callable(list[str]) -> array-like of row vectors. + tau: cosine threshold for real items (inclusive). + tau_nc: cosine threshold for NC items (inclusive; higher to compensate for no source anchor). + """ + items = list(items) + + # Flatten all candidates across scopes, preserving their scope tag for + # text extraction and per-item filtering. 
+ scoped: list[tuple[str, dict]] = [ + (scope, cand) + for scope, cands in candidates.items() + for cand in cands + ] + + if not scoped: + return {item.id: False for item in items} + + cand_texts = [candidate_text(cand, scope) for scope, cand in scoped] + item_texts = [ + " ".join([item.description or ""] + list(item.keywords or [])).strip() for item in items + ] + + cand_vecs = np.asarray(embed_fn(cand_texts)) + item_vecs = np.asarray(embed_fn(item_texts)) + + hits: dict[str, bool] = {} + for i, item in enumerate(items): + item_vec = item_vecs[i] + allowed = allowed_scopes(item) + hit = False + for k, (scope, cand) in enumerate(scoped): + if scope not in allowed: + continue + if item.tier == "NC": + # Shared-source guard applies when the NC lists its mirror source + # (§4.2/§6.2); legacy evidence=[] NCs stay unanchored, with the + # higher tau_nc compensating. + if item.evidence and not shares_source(cand, item, evidence_index): + continue + if cosine(cand_vecs[k], item_vec) >= tau_nc: + hit = True + break + elif ( + shares_source(cand, item, evidence_index) + and cosine(cand_vecs[k], item_vec) >= tau + ): + hit = True + break + hits[item.id] = hit + return hits diff --git a/fireflyframework_agentic/evaluation/stats.py b/fireflyframework_agentic/evaluation/stats.py new file mode 100644 index 00000000..e70c629a --- /dev/null +++ b/fireflyframework_agentic/evaluation/stats.py @@ -0,0 +1,110 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
def aa_band(scores: Sequence[float], *, percentile: int = 95) -> float:
    """Pairwise-delta percentile from champion reruns — the A/A noise floor.

    Rerun the champion ~10 times; the 95th-percentile of all pairwise absolute
    differences is the A/A noise floor. A candidate must beat the champion by
    more than this number on EVERY seed to count as a real improvement.

    This single number replaces MCID, power analysis, McNemar, Wilcoxon,
    bootstrap CIs, and Holm correction. See EVALUATION_FRAMEWORK.md (the A/A noise band).

    The percentile is taken without interpolation: with ``n`` pairwise deltas
    sorted ascending, the element at index ``int(n * percentile / 100)``
    (clamped to the last element) is returned.

    Args:
        scores: Per-run primary metric scores from champion reruns (>= 2 required).
        percentile: Which percentile, on a 0-100 scale (default 95).

    Returns:
        Noise floor as a float in the same units as the input scores.

    Raises:
        ValueError: If fewer than two scores are given, or ``percentile`` is
            outside [0, 100].
    """
    scores = list(scores)
    if len(scores) < 2:
        raise ValueError(f"aa_band requires >= 2 reruns; got {len(scores)}")
    # An out-of-range percentile used to be clamped silently, hiding caller mistakes.
    if not 0 <= percentile <= 100:
        raise ValueError(f"percentile must be within [0, 100]; got {percentile}")
    sorted_deltas = sorted(
        abs(x - y)
        for i, x in enumerate(scores)
        for y in scores[i + 1:]
    )
    # Index for the requested percentile; clamp so percentile=100 maps to the maximum.
    idx = max(0, min(len(sorted_deltas) - 1, int(len(sorted_deltas) * percentile / 100)))
    return sorted_deltas[idx]


def aggregate_grounding(grounding_dicts: list[dict]) -> dict:
    """Merge per-run grounding reports into a conservative aggregate.

    Fixes a prior aggregation bug where the previous runner inherited run 0's grounding
    report unchanged. Correct behaviour:
      - support_pct: mean across runs
      - unsupported_ids: UNION across all runs (anything flagged in any run stays flagged)

    Any other keys are inherited from the first run's report. A missing or
    ``None`` ``support_pct`` counts as 0.0 (pulling the mean down), and a
    missing or ``None`` ``unsupported_ids`` counts as an empty list.

    Args:
        grounding_dicts: List of grounding report dicts, one per evaluation run.
                         Each must have 'support_pct' (float 0-100) and optionally
                         'unsupported_ids' (list[str]).

    Returns:
        Merged grounding dict. For non-empty input it also carries
        '_aggregate_runs' (number of runs merged) and '_support_pct_per_run'
        (the rounded per-run values).
    """
    if not grounding_dicts:
        return {"support_pct": 0.0, "unsupported_ids": []}

    # ``or`` covers keys that are present but explicitly None.
    support_pcts = [float(g.get("support_pct") or 0.0) for g in grounding_dicts]
    mean_pct = statistics.mean(support_pcts)

    unsupported: set[str] = set()
    for g in grounding_dicts:
        unsupported.update(g.get("unsupported_ids") or [])

    first = grounding_dicts[0]
    return {
        **first,
        "support_pct": round(mean_pct, 2),
        "unsupported_ids": sorted(unsupported),
        "_aggregate_runs": len(grounding_dicts),
        "_support_pct_per_run": [round(p, 2) for p in support_pcts],
    }


def left_skew_flag(scores: Sequence[float], *, threshold: float = 0.10) -> bool:
    """True if min < median - threshold (HIGH_VARIANCE sentinel).

    A single catastrophic run cannot hide inside a decent mean.
    True => HIGH_VARIANCE; block the run until investigated.
    See EVALUATION_FRAMEWORK.md (anti-flakiness).

    Args:
        scores: Per-run scores. Fewer than two scores never raise the flag.
        threshold: How far below the median the worst run may fall, in the
            same units as ``scores``. The default 0.10 suits metrics on a 0-1
            scale; pass e.g. ``threshold=10`` for percentages.
    """
    scores = list(scores)
    if len(scores) < 2:
        return False
    med = statistics.median(scores)
    return min(scores) < med - threshold
from fireflyframework_agentic.evaluation.retrieval import RetrieverMetrics, compute_retrieval_metrics from fireflyframework_agentic.evaluation.stats import aa_band, aggregate_grounding, left_skew_flag @@ -44,6 +46,13 @@ __version__ = "0.0.0+dev" __all__ = [ + "EMPTY", + "FABRICATED", + "SOURCE_UNKNOWN", + "VERIFIED", + "corpus_sha256", + "load_corpus", + "verify_evidence_index", "GateResult", "Verdict", "run_gates", @@ -54,6 +63,10 @@ "invalidate_champion", "AdvisoryReport", "run_judge", + "Registry", + "RegistryItem", + "load_registry", + "registry_sha256", "RetrieverMetrics", "compute_retrieval_metrics", "anchored", diff --git a/fireflyframework_agentic/evaluation/corpus.py b/fireflyframework_agentic/evaluation/corpus.py new file mode 100644 index 00000000..32835f2c --- /dev/null +++ b/fireflyframework_agentic/evaluation/corpus.py @@ -0,0 +1,185 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Corpus loading and evidence verification (EVALUATION_FRAMEWORK.md §6.3). + +The corpus is the third pinned evaluation input, next to the DiscoveryResult +and the registry: the raw document bundle (input.json) the discovery pipeline +read. It is the trusted side of every evidence anchor — the registry tells +the evaluator what *should* be found; only the corpus can tell it whether what +a run cited is *real*. 
+ +verify_entry() closes the fabricated-evidence channel: a run controls every +byte of its own evidence_index, so any check computable from (result, registry) +alone can be satisfied by self-reported evidence. Checking each excerpt +against the actual corpus text is the only deterministic counter. + +Excerpt contract: excerpts are verbatim quotes from the source document. +Spliced quotes (fragments joined with '...' or '…') are supported — each +fragment is verified independently. Paraphrase belongs in the finding +description, never in an excerpt. +""" + +from __future__ import annotations + +import base64 +import difflib +import hashlib +import json +import re +import unicodedata +from dataclasses import dataclass +from pathlib import Path + +from fireflyframework_agentic.evaluation.matcher import source_stem + +# Verification statuses for one evidence_index entry. +VERIFIED = "verified" # excerpt found (verbatim or spliced) in the cited source +EMPTY = "empty" # entry carries no excerpt text — nothing to verify +SOURCE_UNKNOWN = "source_unknown" # locator resolves to no corpus document +FABRICATED = "fabricated" # populated excerpt not found in the cited source + +# A spliced excerpt is split on these joiners; fragments shorter than +# _MIN_FRAGMENT_CHARS are too generic to verify and are skipped. +_SPLICE_PATTERN = re.compile(r"\.\.\.|…| -- ") +_MIN_FRAGMENT_CHARS = 15 + +# A fragment passes fuzzily when matching blocks (>= _MIN_BLOCK_CHARS chars) +# cover at least _COVERAGE_THRESHOLD of it — tolerates punctuation/whitespace +# drift while rejecting invented text (measured ~0.10-0.32 coverage). +_COVERAGE_THRESHOLD = 0.85 +_MIN_BLOCK_CHARS = 4 + + +@dataclass +class Corpus: + """The decoded, normalized corpus: {source stem: normalized text}. + + sha256 pins the corpus file exactly like the registry pin (§4.6): the + champion record stores it, and G1 re-hashes the file at scoring time to + flag CORPUS_DRIFT. 
def normalize(text: str) -> str:
    """Normalize text for excerpt matching.

    Steps, in order: NFKC-normalize, delete markdown emphasis asterisks,
    delete straight and typographic quote characters, collapse every run of
    whitespace to a single space, strip the ends, and casefold.

    The same function is applied to corpus documents and to excerpts, so
    both sides must lose exactly the same characters. NFKC does not fold
    curly quotes into straight ones, which is why they are removed
    explicitly here.
    """
    text = unicodedata.normalize("NFKC", text)
    # One pass removes '*' (covers both '*' and '**' emphasis) and every
    # quote variant: ASCII " and ', plus U+2018/U+2019/U+201C/U+201D.
    dropped = str.maketrans("", "", "*\"'\u2018\u2019\u201c\u201d")
    text = text.translate(dropped)
    return re.sub(r"\s+", " ", text).strip().casefold()
def verify_entry(corpus: Corpus, entry: dict) -> str:
    """Verify one evidence_index entry against the corpus.

    The checks run in this order:

    1. The entry's locator is reduced to a source stem and looked up in
       ``corpus.texts``; no match returns SOURCE_UNKNOWN.
    2. The excerpt is normalized; an absent, null or blank excerpt returns
       EMPTY.
    3. The excerpt is split on the splice joiners. Fragments shorter than
       _MIN_FRAGMENT_CHARS are skipped; if that leaves nothing, the whole
       excerpt is checked as a single fragment.
    4. Each fragment must either occur verbatim in the normalized source or
       reach a matching-block coverage of at least _COVERAGE_THRESHOLD.
       The first fragment that does neither returns FABRICATED, so one
       invented fragment sinks a spliced excerpt.

    Returns:
        One of VERIFIED, EMPTY, SOURCE_UNKNOWN or FABRICATED.
    """
    # ``or ""`` mirrors the excerpt handling below: a locator key that is
    # present but null must not reach source_stem() as None.
    stem = source_stem(entry.get("locator") or "")
    source = corpus.texts.get(stem)
    if source is None:
        return SOURCE_UNKNOWN

    excerpt = normalize(entry.get("excerpt") or "")
    if not excerpt:
        return EMPTY

    pieces = (raw.strip() for raw in _SPLICE_PATTERN.split(excerpt))
    fragments = [p for p in pieces if len(p) >= _MIN_FRAGMENT_CHARS] or [excerpt]

    for fragment in fragments:
        if fragment in source:
            continue  # verbatim hit; the fuzzy comparison is not needed
        if _fragment_coverage(fragment, source) < _COVERAGE_THRESHOLD:
            return FABRICATED
    return VERIFIED
+Loader enforces all invariants; they are not documentation. + +Invariants (EVALUATION_FRAMEWORK.md, the must-find registry): +- schema_version == "lean-1" +- every tier is one of L0 L1 L2 L3 NC +- negative_control_count >= ceil(real_items / 10) +- kappa present (0.0 placeholder allowed; G2 advisory until >= 0.70) +- ABANCA DILO items must target a single measured sub-population +""" +from __future__ import annotations + +import hashlib +import json +import math +from dataclasses import dataclass, field +from pathlib import Path +from typing import Literal + +VALID_TIERS = ("L0", "L1", "L2", "L3", "NC") +VALID_SCOPES = ( + "process", "activity", "decision", "finding", "action", + "persona", "system", "informal_channel", "dependency_graph", +) +SCHEMA_VERSION = "lean-1" +KAPPA_ADVISORY_THRESHOLD = 0.70 + + +@dataclass(frozen=True) +class RegistryItem: + id: str + tier: Literal["L0", "L1", "L2", "L3", "NC"] + description: str + evidence: list[str] # source file paths (path portion of locator, no #page=N) + scope: str = "finding" # which DiscoveryResult surface to match against (§4.3) + keywords: list[str] = field(default_factory=list) + weight: float = 1.0 + from_node: str = "" # dependency_graph relation items only + to_node: str = "" # dependency_graph relation items only + relation: str = "" # defaults to "precedes" when from/to present + + +@dataclass(frozen=True) +class Registry: + schema_version: str + corpus: str + author: str + date: str + kappa: float + items: list[RegistryItem] + _sha256: str = field(default="", compare=False) + + @property + def real_items(self) -> list[RegistryItem]: + return [i for i in self.items if i.tier != "NC"] + + @property + def nc_items(self) -> list[RegistryItem]: + return [i for i in self.items if i.tier == "NC"] + + @property + def l0_items(self) -> list[RegistryItem]: + return [i for i in self.items if i.tier == "L0"] + + def is_kappa_advisory(self) -> bool: + return self.kappa < KAPPA_ADVISORY_THRESHOLD + + def sha256(self) 
def _validate(raw: dict, path: Path) -> None:
    """Enforce the lean-1 invariants on a parsed registry document.

    Checks run in this order and the first violation raises:

    1. ``schema_version`` equals SCHEMA_VERSION.
    2. ``corpus``, ``author`` and ``date`` are present and non-empty, and a
       ``kappa`` key exists (its value is not inspected here).
    3. EMPTY_MUST_FIND: ``items`` is present and non-empty.
    4. Every item carries a non-empty ``id`` and ids are unique.
    5. Every item has a valid tier and scope; dependency_graph items need
       non-empty ``from`` and ``to``, and no other scope may carry
       ``from``/``to``/``relation``.
    6. NC density: at least max(1, ceil(real_items / 10)) NC items.
    7. Blend guard: no non-NC description contains a blended-distribution
       phrase.

    Args:
        raw: The registry document as returned by ``json.loads``.
        path: Location of the registry file. Only ``path.name`` is used, as
            the prefix of every error message.

    Raises:
        ValueError: On the first invariant violation, with a message naming
            the file and the offending item where applicable.
    """
    name = path.name

    if raw.get("schema_version") != SCHEMA_VERSION:
        raise ValueError(
            f"{name}: schema_version must be '{SCHEMA_VERSION}', "
            f"got {raw.get('schema_version')!r}"
        )
    for fname in ("corpus", "author", "date"):
        if not raw.get(fname):
            raise ValueError(f"{name}: missing required field '{fname}'")
    if "kappa" not in raw:
        raise ValueError(f"{name}: missing 'kappa' field (use 0.0 as placeholder)")

    items = raw.get("items", [])

    # EMPTY_MUST_FIND guard — runs before every per-item check; kills the
    # fake-champion bug (an empty registry would score as perfect recall).
    if not items:
        raise ValueError(
            f"{name}: EMPTY_MUST_FIND — items list is empty; "
            "cannot evaluate recall. This guard exists to prevent the "
            "fake-100%-champion failure."
        )

    # Ids: present, non-empty, unique. Without the presence check an
    # id-less item would pass validation and surface later as a KeyError
    # when the item objects are built.
    seen_ids: set = set()
    duplicate_ids: set = set()
    for position, it in enumerate(items):
        iid = it.get("id")
        if iid is None or iid == "":
            raise ValueError(
                f"{name}: item at index {position} has no 'id'; "
                "every item needs a unique non-empty id"
            )
        if iid in seen_ids:
            duplicate_ids.add(iid)
        seen_ids.add(iid)
    if duplicate_ids:
        raise ValueError(f"{name}: duplicate item ids: {sorted(duplicate_ids)}")

    for it in items:
        tier = it.get("tier")
        if tier not in VALID_TIERS:
            raise ValueError(
                f"{name}: item '{it.get('id')}' has invalid tier '{tier}'; "
                f"must be one of {VALID_TIERS}"
            )
        scope = it.get("scope", "finding")
        if scope not in VALID_SCOPES:
            raise ValueError(
                f"{name}: item '{it.get('id')}' has invalid scope '{scope}'; "
                f"must be one of {VALID_SCOPES}"
            )
        if scope == "dependency_graph":
            if not it.get("from") or not it.get("to"):
                raise ValueError(
                    f"{name}: dependency_graph item '{it.get('id')}' must have "
                    "non-empty 'from' and 'to'"
                )
        else:
            if "from" in it or "to" in it or "relation" in it:
                raise ValueError(
                    f"{name}: item '{it.get('id')}' has 'from'/'to'/'relation' "
                    f"but scope is '{scope}'; these fields are only valid on "
                    "dependency_graph-scoped items"
                )

    real_count = sum(1 for it in items if it.get("tier") != "NC")
    nc_count = len(items) - real_count
    # max(1, ...) keeps at least one negative control even for tiny registries.
    required_nc = max(1, math.ceil(real_count / 10))
    if nc_count < required_nc:
        raise ValueError(
            f"{name}: NC density too low — {nc_count} NC item(s) for "
            f"{real_count} real items; need >= {required_nc} (ceil(real/10)). "
            "Without NC items the eval measures recall only; a verbose hallucinator "
            "scores perfectly."
        )

    # ABANCA DILO blend guard: items must assert a single sub-population target.
    # Only phrases that assert a blended numeric target are rejected; the word
    # "blend" alone is too broad (items may reference it negatively).
    BLEND_PHRASES = ("combined distribution", "across all offices regardless of segment")
    for it in items:
        if it.get("tier") == "NC":
            continue
        # ``or ""`` tolerates an explicit null description.
        desc = (it.get("description") or "").lower()
        iid = it.get("id", "")
        if any(phrase in desc for phrase in BLEND_PHRASES):
            raise ValueError(
                f"{name}: item '{iid}' description targets a blended distribution; "
                "ABANCA DILO items must target a single measured sub-population "
                "(Empresas or PyMEs). Use segment-keyed items: "
                "dilo-empresas-operativa-42pct AND dilo-pymes-operativa-29pct separately."
            )
def registry_sha256(path: str | Path) -> str:
    """Return the hex SHA-256 digest of the registry file's bytes on disk.

    Accepts either a string path or a ``Path``. The file is read in full;
    any I/O error from the read propagates to the caller.
    """
    file_bytes = Path(path).read_bytes()
    digest = hashlib.sha256(file_bytes)
    return digest.hexdigest()
fireflyframework_agentic.evaluation.corpus import EMPTY, FABRICATED, SOURCE_UNKNOWN, VERIFIED, corpus_sha256, load_corpus, verify_evidence_index -from fireflyframework_agentic.evaluation.gates import GateResult, Verdict, render_scorecard, run_gates +from fireflyframework_agentic.evaluation.gates import GateResult, Verdict, g2_recall_precision, render_scorecard, run_gates from fireflyframework_agentic.evaluation.champion import ChampionRecord, invalidate_champion, load_champion, save_champion from fireflyframework_agentic.evaluation.judge import AdvisoryReport, run_judge from fireflyframework_agentic.evaluation.matcher import anchored, matches, source_stem, tokens @@ -55,6 +55,7 @@ "verify_evidence_index", "GateResult", "Verdict", + "g2_recall_precision", "run_gates", "render_scorecard", "ChampionRecord", diff --git a/fireflyframework_agentic/evaluation/gates.py b/fireflyframework_agentic/evaluation/gates.py new file mode 100644 index 00000000..057bfea7 --- /dev/null +++ b/fireflyframework_agentic/evaluation/gates.py @@ -0,0 +1,840 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Four gates — every gate always runs; a failure raises a flag, not a veto. + +Gate pipeline (EVALUATION_FRAMEWORK.md §6): + G1 — Structural & Safe + G2 — Must-finds & negative controls + G3 — Evidence (grounding) + G5 — No-regression / promotion (human decision) + +Each gate is a pure function of the result dict + supporting inputs. 
@dataclass
class GateResult:
    """Outcome of a single gate.

    ``passed`` is False when the gate raised a flag; ``reason_code`` then
    names the flag. ``details`` carries the gate's report-only payload.
    """

    gate: str
    passed: bool
    reason_code: str = ""
    details: dict = field(default_factory=dict)

    def __str__(self) -> str:
        """Render as ``[G1] PASS`` or ``[G2] FLAG:<reason_code>``."""
        status = "PASS" if self.passed else f"FLAG:{self.reason_code}"
        return f"[{self.gate}] {status}"


class Verdict:
    """Promotion verdict constants.

    ``Verdict.PROMOTE``: the challenger met the quality bar and may become
    the new champion. ``Verdict.HOLD``: it did not and must be iterated on.
    """

    PROMOTE: str = "PROMOTE"
    HOLD: str = "HOLD"


def render_scorecard(gate_results: list[GateResult]) -> str:
    """Render a human-readable scorecard from gate results.

    Emits one line per gate (``[G1] PASS`` / ``[G2] FLAG:RECALL_BELOW_FLOOR``)
    followed by a final ``VERDICT:`` line.

    The verdict is PROMOTE only when there is at least one gate result and
    every result passed. An empty list yields HOLD: ``all()`` over nothing
    is vacuously true, and a scorecard with no gate evidence must never
    read as a promotion.

    Args:
        gate_results: The results to render, in display order.

    Returns:
        The scorecard as a newline-joined string.
    """
    results = list(gate_results)
    lines = [str(r) for r in results]
    promote = bool(results) and all(r.passed for r in results)
    verdict = Verdict.PROMOTE if promote else Verdict.HOLD
    lines.append(f"VERDICT: {verdict}")
    return "\n".join(lines)
def _name_duplication_rate(nodes: list[dict]) -> float:
    """Return ``1 - clusters / count`` after two tiers of node clustering.

    Tier 1: nodes whose ids are equal after lower-casing share a cluster.
        Nodes with a missing, null or empty id are left out of this tier,
        because an absent id is not a shared identity.
    Tier 2: nodes whose names have a token-Jaccard similarity >= 0.6
        (lower-cased, whitespace-split) share a cluster. Two nodes that
        both lack a name are not merged.

    Clusters are tracked with a small union-find. The result is rounded to
    4 places and is 0.0 for fewer than two nodes. Report-only: no gate
    flag fires on any threshold.
    """
    n = len(nodes)
    if n < 2:
        return 0.0

    parent = list(range(n))

    def find(i: int) -> int:
        # Path halving keeps the trees shallow.
        while parent[i] != i:
            parent[i] = parent[parent[i]]
            i = parent[i]
        return i

    # Tier 1: identical normalized ids.
    first_index_by_id: dict[str, int] = {}
    for i, node in enumerate(nodes):
        raw_id = node.get("id")
        node_id = "" if raw_id is None else str(raw_id).lower()
        if not node_id:
            continue  # no id: must not collapse into one "" cluster
        if node_id in first_index_by_id:
            parent[find(i)] = find(first_index_by_id[node_id])
        else:
            first_index_by_id[node_id] = i

    # Tier 2: near-identical names.
    name_tokens = [
        frozenset(str(node.get("name") or "").lower().split()) for node in nodes
    ]
    for i, a in enumerate(name_tokens):
        for j in range(i + 1, n):
            b = name_tokens[j]
            union_ab = a | b
            if union_ab and len(a & b) / len(union_ab) >= 0.6:
                parent[find(i)] = find(j)

    clusters = len({find(i) for i in range(n)})
    return round(1 - clusters / n, 4)
EMPTY_MUST_FIND — must run first; kills the fake-100%-champion bug. + 2. Registry SHA-256 pin: loaded Registry matches the file on disk. + 3. Corpus SHA-256 pin (when a corpus is supplied): same drift guard for + the evidence universe (CORPUS_DRIFT). + 4. Required top-level keys present in result. + 5. PII non-disclosure: no corpus PII name in any finding/report text. + """ + # Guard 1: empty registry (fake-champion guard — always first) + if not registry.real_items: + return GateResult( + gate="G1", + passed=False, + reason_code="EMPTY_MUST_FIND", + details={"message": "Registry has zero real items — cannot evaluate recall."}, + ) + + # Guard 2: registry SHA-256 pin + computed_sha = registry_sha256(registry_path) + if computed_sha != registry.sha256(): + return GateResult( + gate="G1", + passed=False, + reason_code="GOLD_DRIFT", + details={ + "message": "Registry file has changed since it was loaded.", + "expected": registry.sha256(), + "actual": computed_sha, + }, + ) + + # Guard 3: corpus SHA-256 pin (CORPUS_DRIFT — the GOLD_DRIFT twin for evidence) + if corpus is not None: + current_corpus_sha = corpus_sha256(corpus.path) + if current_corpus_sha != corpus.sha256: + return GateResult( + gate="G1", + passed=False, + reason_code="CORPUS_DRIFT", + details={ + "message": "Corpus file has changed since it was loaded.", + "expected": corpus.sha256, + "actual": current_corpus_sha, + }, + ) + + # Guard 4: required result keys + required = ("process_graph", "findings", "evidence_index") + missing = [k for k in required if k not in result] + if missing: + return GateResult( + gate="G1", + passed=False, + reason_code="SCHEMA_INVALID", + details={"missing_keys": missing}, + ) + + # Guard 5: PII check + if pii_list: + free_text: list[str] = [] + for finding in result.get("findings", []): + free_text.extend([finding.get("title", ""), finding.get("description", "")]) + for report in result.get("reports", []): + free_text.append(str(report)) + combined = " 
".join(free_text).lower() + hits = [name for name in pii_list if name.lower() in combined] + if hits: + return GateResult( + gate="G1", + passed=False, + reason_code="PII_LEAK", + details={ + "message": "Corpus PII names found in findings/reports.", + "matches": hits[:5], + }, + ) + + pg = result.get("process_graph", {}) + processes = pg.get("processes", []) + activities = [a for p in processes for a in p.get("activities", [])] + decisions = [d for p in processes for d in p.get("decisions", [])] + dg = result.get("dependency_graph", {}) + + details = { + "registry_sha256": registry.sha256(), + "real_items": len(registry.real_items), + "nc_items": len(registry.nc_items), + "map": { + "processes": { + "count": len(processes), + "duplication_rate": _name_duplication_rate(processes), + }, + "activities": { + "count": len(activities), + "duplication_rate": _name_duplication_rate(activities), + }, + "decisions": { + "count": len(decisions), + "duplication_rate": _name_duplication_rate(decisions), + }, + "personas": { + "count": len(result.get("personas", [])), + "duplication_rate": _name_duplication_rate(result.get("personas", [])), + }, + "systems": { + "count": len(result.get("systems", [])), + "duplication_rate": _name_duplication_rate(result.get("systems", [])), + }, + "informal_channels": { + "count": len(result.get("informal_channels", [])), + "duplication_rate": _name_duplication_rate(result.get("informal_channels", [])), + }, + "dependency_graph_edges": len(dg.get("activity_edges", [])), + }, + } + if corpus is not None: + details["corpus_sha256"] = corpus.sha256 + return GateResult(gate="G1", passed=True, details=details) + + +# ── G2: Recall & Precision ─────────────────────────────────────────────────── + + +def _candidates_by_scope(result: dict) -> dict[str, list[dict]]: + """Build per-scope candidate lists from a DiscoveryResult (§4.3). 
def _weighted_recall(scored_items: list, hits: dict[str, bool]) -> float:
    """Return the weight of hit items divided by the weight of all items.

    ``scored_items`` are the items that count toward recall; each exposes
    ``id`` and ``weight``. ``hits`` maps item id to whether it was found and
    must contain every scored id. A zero (or empty) total weight is
    replaced by 1.0 so the result is 0.0 rather than a division error.
    """
    total = sum(entry.weight for entry in scored_items)
    if not total:
        total = 1.0
    credited = sum(entry.weight for entry in scored_items if hits[entry.id])
    return credited / total
in_redundant: set[int] = set() + for i in range(len(token_sets)): + for j in range(i + 1, len(token_sets)): + a, b = token_sets[i], token_sets[j] + union = a | b + sim = len(a & b) / len(union) if union else 1.0 + if sim >= 0.6: + in_redundant.add(i) + in_redundant.add(j) + return round(len(in_redundant) / len(findings), 4) + + +def g2_recall_precision( + result: dict, + registry: Registry, + *, + recall_floor: float = 0.70, + embed_fn=None, + tau: float = 0.70, + tau_nc: float = 0.85, + recall_metric: str = "lexical", + corpus: Corpus | None = None, +) -> GateResult: + """G2 — Recall & Precision (hard veto). + + - L0 miss -> BLOCK (zeros the evaluation; regulatory-mandatory item absent) + - NC hit -> BLOCK (precision failure; plausible-but-false item was emitted) + - recall < floor -> BLOCK + + With a ``corpus``, evidence entries that fail verification (fabricated + excerpt or unknown source) are excluded from the evidence index before + matching, so the shared-source guard only accepts citations to real + corpus documents — a fabricated locator cannot satisfy any item. + + ``recall_metric`` ("lexical"/"semantic"/"hybrid") selects which hit map GATES. + "lexical" is matcher.matches (shared-source + topic-anchored token overlap) and + needs no embedder. "semantic"/"hybrid" add the embedding path (matcher.semantic_hits, + threshold ``tau`` for real items, ``tau_nc`` for NC items) and REQUIRE ``embed_fn`` + — passing them without one raises ValueError (use "lexical" for the offline path). + When an embedder is supplied, all three recalls (lexical/semantic/hybrid) are + reported in details regardless of which one gates. + """ + evidence_index = _build_evidence_index(result, corpus) + candidates = _candidates_by_scope(result) + findings = candidates["finding"] + + # NC items anchor via the embedding path only (§6.2): a correct finding about + # the true mirror fact shares vocabulary with the false description, so a + # token or keyword match would falsely convict it. 
Lexical NC is always False. + # dependency_graph relation items (those with from_node) use the endpoint + # matcher (§5.3b) instead of the per-candidate text predicate. + lexical: dict[str, bool] = {} + for item in registry.items: + if item.tier == "NC": + lexical[item.id] = False + elif item.scope == "dependency_graph" and item.from_node: + lexical[item.id] = matcher.matches_dependency_graph_relation( + item, result, evidence_index + ) + else: + lexical[item.id] = any( + matches(c, item, evidence_index, scope=scope) + for scope in matcher.allowed_scopes(item) + for c in candidates.get(scope, []) + ) + + if recall_metric not in ("lexical", "semantic", "hybrid"): + raise ValueError(f"unknown recall_metric {recall_metric!r}") + if recall_metric in ("semantic", "hybrid") and embed_fn is None: + raise ValueError( + f"recall_metric={recall_metric!r} requires an embedder; pass embed_fn" + ) + + if embed_fn is not None: + semantic = matcher.semantic_hits( + candidates, registry.items, evidence_index, embed_fn, tau, tau_nc=tau_nc + ) + # dependency_graph relation items have no embedding candidates (§5.3b uses + # the endpoint matcher, not per-candidate text embeddings); mirror the + # lexical result so semantic/hybrid never under-credits them. 
+ for item in registry.items: + if item.scope == "dependency_graph" and item.from_node: + semantic[item.id] = lexical[item.id] + else: + semantic = None + + metric = recall_metric + + if semantic is None or metric == "lexical": + hits = lexical + elif metric == "semantic": + hits = semantic + else: # hybrid + hits = {iid: lexical[iid] or semantic[iid] for iid in lexical} + + # Signal-to-noise panel — report-only, §6.2 item 3 + finding_count = len(findings) + finding_scoped_items = [i for i in registry.real_items if i.scope == "finding"] + findings_matched = sum( + 1 for f in findings + if any(matches(f, item, evidence_index, scope="finding") for item in finding_scoped_items) + ) + _sn = { + "finding_count": finding_count, + "findings_matched_to_registry": { + "count": findings_matched, + "fraction": round(findings_matched / finding_count, 4) if finding_count else 0.0, + }, + "finding_redundancy_rate": _finding_redundancy_rate(findings), + } + if corpus is not None: + excluded = len(_build_evidence_index(result)) - len(evidence_index) + _sn["evidence_entries_excluded_unverified"] = excluded + + # L0 misses + l0_misses = [item.id for item in registry.l0_items if not hits[item.id]] + if l0_misses: + return GateResult( + gate="G2", + passed=False, + reason_code="L0_MISSING", + details={ + "l0_misses": l0_misses, + "message": "Regulatory-mandatory items not found — evaluation zeroed.", + **_sn, + }, + ) + + # NC precision + nc_hits = [item.id for item in registry.nc_items if hits[item.id]] + if nc_hits: + return GateResult( + gate="G2", + passed=False, + reason_code="NC_HIT", + details={ + "nc_hits": nc_hits, + "message": "Plausible-but-false negative control items were matched — precision failure.", + **_sn, + }, + ) + + # Weighted recall — over scored items only (L0/L1/L2). L3 is a bonus tier + # ("extra credit"): an L3 miss must not lower recall, so L3 is excluded from + # the denominator and only reported in per_tier below. 
# ── G3: Grounded ─────────────────────────────────────────────────────────────


def g3_grounded(
    result: dict,
    *,
    grounding_floor: float = 0.90,
    human_spot_check_n: int = 5,
    corpus: Corpus | None = None,
) -> GateResult:
    """G3 — Grounded (automated portion; human spot-check triggered on pass).

    For each finding, verifies that at least one cited evidence excerpt shares a
    non-trivial token with the finding description (topic-anchoring).

    With a ``corpus``, the gate also looks in a third direction — cited ->
    exists: every evidence entry is verified against the actual corpus text
    (corpus.verify_entry). A populated excerpt not found in its cited source
    raises EVIDENCE_FABRICATED; a locator resolving to no corpus document
    raises EVIDENCE_SOURCE_UNKNOWN; and only verified excerpts can ground a
    finding, so a run cannot ground itself on evidence it invented.

    Also reports excerpt fill rate and source coverage so the reviewer can tell
    whether ungrounded findings are a format problem (empty excerpts) or a real
    faithfulness signal (populated excerpts that do not anchor). Both panels
    count *every* resolved evidence ref of every finding, including refs that
    come after the one that grounded the finding.

    Known limitation: topic-anchoring, not claim entailment. A '45 days' claim
    cited to a '3 days' source passes if they share the process name (excerpt
    verification confirms the quote is real, not that the claim matches it).
    The human spot-check is the binding faithfulness signal until NLI/AIS lands.

    Args:
        result: The run output; reads ``findings`` and ``evidence_index``.
        grounding_floor: Minimum fraction of grounded findings required to pass.
        human_spot_check_n: Upper bound on findings sampled for the manual review
            requested when the gate passes.
        corpus: Optional corpus enabling deterministic excerpt verification.

    Returns:
        A G3 ``GateResult``. Flag precedence: NO_FINDINGS, EVIDENCE_FABRICATED,
        EVIDENCE_SOURCE_UNKNOWN, UNGROUNDED; otherwise passed.
    """
    evidence_index = _build_evidence_index(result)
    findings = result.get("findings", [])
    statuses = verify_evidence_index(corpus, result) if corpus is not None else None

    if not findings:
        return GateResult(
            gate="G3",
            passed=False,
            reason_code="NO_FINDINGS",
            details={"message": "Result has zero findings — cannot compute grounding."},
        )

    grounded_ids: list[str] = []
    # Ungrounded split (§6.3): distinguish format issues from real faithfulness failures.
    ungrounded_empty_only: list[str] = []  # every ref had an empty excerpt
    ungrounded_populated: list[str] = []  # had populated excerpt(s) but none anchored

    # Excerpt fill: count all resolved refs and how many carry a non-empty excerpt.
    total_refs = 0
    populated_refs = 0

    # Source coverage: which source stems are cited by at least one finding.
    cited_stems: set[str] = set()

    for finding in findings:
        fid = finding.get("id", "?")
        desc = finding.get("description", "")
        is_grounded = False
        had_populated = False
        for ref in finding.get("evidence_refs", []):
            ev = evidence_index.get(ref.get("evidence_id", ""))
            if not ev:
                continue
            total_refs += 1
            excerpt = ev.get("excerpt") or ""
            if excerpt:
                populated_refs += 1
                had_populated = True
            # Track source coverage (even for ungrounded findings).
            stem = matcher.source_stem(ev.get("locator", ""))
            if stem:
                cited_stems.add(stem)
            # Do NOT stop at the first anchoring ref: the remaining refs must
            # still feed the excerpt-fill and source-coverage counters, or a
            # source cited only by a later ref is misreported as orphaned.
            if is_grounded:
                continue
            # Only a corpus-verified excerpt can ground a finding.
            if statuses is not None and statuses.get(ev.get("id")) != VERIFIED:
                continue
            if anchored(desc, excerpt):
                is_grounded = True
        if is_grounded:
            grounded_ids.append(fid)
        elif had_populated:
            ungrounded_populated.append(fid)
        else:
            ungrounded_empty_only.append(fid)

    grounding_pct = len(grounded_ids) / len(findings)

    # All source stems present in the evidence index (not just those cited).
    all_stems: set[str] = set()
    for ev in result.get("evidence_index", []):
        stem = matcher.source_stem(ev.get("locator", ""))
        if stem:
            all_stems.add(stem)
    orphaned = sorted(all_stems - cited_stems)

    excerpt_fill = f"{populated_refs}/{total_refs}" if total_refs else "0/0"
    source_coverage = f"{len(cited_stems)}/{len(all_stems)}" if all_stems else "0/0"

    details = {
        "grounding_pct": round(grounding_pct, 4),
        "grounded": len(grounded_ids),
        "total": len(findings),
        "excerpt_fill": excerpt_fill,
        "source_coverage": source_coverage,
        "orphaned_sources": orphaned,
    }

    fabricated_ids: list[str] = []
    unknown_source_ids: list[str] = []
    if statuses is not None:
        fabricated_ids = sorted(e for e, s in statuses.items() if s == FABRICATED)
        unknown_source_ids = sorted(e for e, s in statuses.items() if s == SOURCE_UNKNOWN)
        details["evidence_verification"] = {
            "entries": len(statuses),
            "verified": sum(1 for s in statuses.values() if s == VERIFIED),
            "empty_excerpt": sum(1 for s in statuses.values() if s == EMPTY),
            "fabricated": fabricated_ids,
            "source_unknown": unknown_source_ids,
        }

    if fabricated_ids:
        details["message"] = (
            "Populated excerpt(s) not found in the cited corpus document — "
            "the run asserts evidence the source does not contain."
        )
        return GateResult(
            gate="G3", passed=False, reason_code="EVIDENCE_FABRICATED", details=details
        )

    if unknown_source_ids:
        details["message"] = (
            "Evidence locator(s) resolve to no corpus document — either the "
            "corpus bundle is incomplete or the run invented a source."
        )
        return GateResult(
            gate="G3", passed=False, reason_code="EVIDENCE_SOURCE_UNKNOWN", details=details
        )

    if grounding_pct < grounding_floor:
        details["floor"] = grounding_floor
        details["ungrounded_with_populated_excerpts"] = ungrounded_populated
        details["ungrounded_with_empty_excerpts_only"] = ungrounded_empty_only
        return GateResult(gate="G3", passed=False, reason_code="UNGROUNDED", details=details)

    spot_n = min(human_spot_check_n, len(findings))
    details["human_spot_check"] = (
        f"ACTION REQUIRED: manually review {spot_n} sampled findings for "
        "field-consistency, citation-accuracy, and client-readiness. "
        "This is the binding faithfulness signal until NLI/AIS lands."
    )
    return GateResult(gate="G3", passed=True, details=details)


# ── G5: No-regression / promotion (human decision) ───────────────────────────


def g5_no_regression(
    candidate_scores: dict[str, float],
    champion_scores: dict[str, float] | None,
    aa_noise: dict[str, float] | None,
    *,
    is_day_zero: bool = False,
    human_signed_off: bool = False,
    signoff_count: int = 0,
) -> GateResult:
    """G5 — No-regression / promotion gate (human decision).

    Day-Zero (``is_day_zero`` set, or no champion scores): passes only with at
    least 2 sign-offs (``signoff_count``); otherwise HOLD.

    Normal promotion: requires ``human_signed_off``; then every metric present
    in BOTH ``candidate_scores`` and ``champion_scores`` is compared. A metric
    regresses when ``candidate - champion < -band`` (band = ``aa_noise[metric]``,
    0.0 when absent). Any regression yields HOLD. This is a *no-regression*
    check: a candidate that merely ties the champion within the noise band
    passes; metrics that moved up by more than the band are reported under
    ``improvements`` for the reviewer but are not required.

    Champions are per-corpus. Do not compare across corpora.

    Args:
        candidate_scores: Metric name -> candidate value.
        champion_scores: Metric name -> champion value, or None when no champion exists.
        aa_noise: Metric name -> A/A noise band; None is treated as no bands.
        is_day_zero: Force the Day-Zero path even if champion scores are given.
        human_signed_off: Whether the single sign-off for normal promotion was given.
        signoff_count: Number of sign-offs collected (Day-Zero path only).

    Returns:
        A G5 ``GateResult``; every failure uses reason code ``HOLD``.
    """
    if is_day_zero or champion_scores is None:
        required = 2
        if signoff_count < required:
            return GateResult(
                gate="G5",
                passed=False,
                reason_code="HOLD",
                details={
                    "reason": (
                        f"Day-Zero requires {required} independent human sign-offs "
                        f"(kappa >= 0.70); got {signoff_count}."
                    ),
                    "action": "Collect sign-offs, then re-run with --day-zero --signoffs 2",
                },
            )
        return GateResult(
            gate="G5",
            passed=True,
            details={"day_zero": True, "signoffs": signoff_count},
        )

    if not human_signed_off:
        return GateResult(
            gate="G5",
            passed=False,
            reason_code="HOLD",
            details={"reason": "Human sign-off required for promotion."},
        )

    noise = aa_noise or {}
    regressions: list[str] = []
    improvements: list[str] = []

    for metric, cand_val in candidate_scores.items():
        champ_val = champion_scores.get(metric)
        if champ_val is None:
            # The champion never recorded this metric — nothing to compare against.
            continue
        delta = cand_val - champ_val
        band = noise.get(metric, 0.0)
        if delta < -band:
            regressions.append(
                f"{metric}: candidate={cand_val:.4f} champion={champ_val:.4f} "
                f"delta={delta:+.4f} < -band={-band:.4f}"
            )
        elif delta > band:
            improvements.append(f"{metric}: delta={delta:+.4f} > band={band:.4f}")

    if regressions:
        return GateResult(
            gate="G5",
            passed=False,
            reason_code="HOLD",
            details={
                "regressions": regressions,
                "improvements": improvements,
                "message": "Guardrail metric(s) regressed beyond A/A noise band.",
            },
        )

    return GateResult(
        gate="G5",
        passed=True,
        details={"improvements": improvements, "noise_band": noise},
    )


# ── Full gate pipeline ────────────────────────────────────────────────────────


def run_gates(
    result: dict,
    registry: Registry,
    registry_path: str,
    *,
    pii_list: list[str] | None = None,
    recall_floor: float = 0.70,
    grounding_floor: float = 0.90,
    champion_scores: dict[str, float] | None = None,
    aa_noise: dict[str, float] | None = None,
    is_day_zero: bool = False,
    human_signed_off: bool = False,
    signoff_count: int = 0,
    embed_fn=None,
    tau: float = 0.70,
    tau_nc: float = 0.85,
    recall_metric: str = "lexical",
    corpus: Corpus | None = None,
) -> list[GateResult]:
    """Run all gates G1 -> G2 -> G3 -> G5; every gate always executes.

    A failed gate raises a flag in its GateResult but never prevents the
    remaining gates from running. The scorecard therefore always carries the
    complete picture: a run that misses a regulatory item *and* grounds poorly
    shows both flags. See EVALUATION_FRAMEWORK.md §2 ('No gate vetoes').

    ``corpus`` (optional) enables deterministic evidence verification: G1 pins
    the corpus hash, G2 ignores unverified evidence entries, and G3 flags
    fabricated excerpts and unknown sources. Without it, evidence is taken at
    face value from the run's own evidence_index (disclosed on the scorecard).

    Returns all four GateResult objects, in the order [G1, G2, G3, G5].
    """
    g1 = g1_structural(result, registry, registry_path, pii_list=pii_list, corpus=corpus)

    g2 = g2_recall_precision(
        result,
        registry,
        recall_floor=recall_floor,
        embed_fn=embed_fn,
        tau=tau,
        tau_nc=tau_nc,
        recall_metric=recall_metric,
        corpus=corpus,
    )

    g3 = g3_grounded(result, grounding_floor=grounding_floor, corpus=corpus)

    # G5 uses whatever scores G2/G3 produced; 0.0 when a gate flagged and did
    # not emit the metric (e.g. L0_MISSING returns before computing recall).
    candidate_scores = {
        "recall": g2.details.get("recall", 0.0),
        "grounding_pct": g3.details.get("grounding_pct", 0.0),
    }
    g5 = g5_no_regression(
        candidate_scores,
        champion_scores,
        aa_noise,
        is_day_zero=is_day_zero,
        human_signed_off=human_signed_off,
        signoff_count=signoff_count,
    )

    return [g1, g2, g3, g5]
from fireflyframework_agentic.evaluation.matcher import anchored, matches, source_stem, tokens @@ -58,6 +59,9 @@ "g2_recall_precision", "run_gates", "render_scorecard", + "verdict", + "VERDICT_PROMOTE", + "VERDICT_HOLD", "ChampionRecord", "load_champion", "save_champion", diff --git a/fireflyframework_agentic/evaluation/scorecard.py b/fireflyframework_agentic/evaluation/scorecard.py new file mode 100644 index 00000000..b34885e8 --- /dev/null +++ b/fireflyframework_agentic/evaluation/scorecard.py @@ -0,0 +1,489 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Scorecard renderer: gate results -> Markdown report. + +Every scorecard states whether it is self-graded. Until Phase 3 independent +re-annotation lands, all Lean-Core PROMOTE verdicts are self-graded against +team-authored ground truth. See EVALUATION_FRAMEWORK.md. 
+""" + +from __future__ import annotations + +import json + +VERDICT_PROMOTE = "PROMOTE" +VERDICT_HOLD = "HOLD" + + +def verdict(gate_results: list) -> str: + """PROMOTE iff all gates passed and G5 is in the list; HOLD otherwise.""" + if not gate_results: + return VERDICT_HOLD + if not all(g.passed for g in gate_results): + return VERDICT_HOLD + gate_names = {g.gate for g in gate_results} + if "G5" not in gate_names: + return VERDICT_HOLD + return VERDICT_PROMOTE + + +def render_scorecard( + gate_results: list, + *, + corpus: str = "unknown", + model_id: str = "unknown", + run_id: str = "run", + is_self_graded: bool = True, + kappa_advisory: bool = False, + evidence_unverified: bool = False, + bpi2017_f1: float | None = None, + advisory=None, + config: dict | None = None, + experiment_config: dict | None = None, +) -> str: + """Render a Markdown evaluation scorecard. + + The scorecard always discloses self-graded status and advisory flags. + """ + v = verdict(gate_results) + lines = [ + "# FlyRadar Evaluation Scorecard", + "", + f"**Corpus**: {corpus}", + f"**Model**: {model_id}", + f"**Run**: {run_id}", + f"**Verdict**: **{v}**", + "", + ] + + if is_self_graded: + lines += [ + "> **SELF-GRADED**: All ground truth (must-find, gold, DILO, human sign-off) is", + "> authored by the FlyRadar team. This PROMOTE has no contamination-free signal", + "> until Phase 3. See EVALUATION_FRAMEWORK.md.", + "", + ] + + if kappa_advisory: + lines += [ + "> **ADVISORY**: Registry kappa < 0.70 — a second independent annotator has not", + "> verified the must-find items. Promotion is advisory for this corpus until", + "> kappa >= 0.70 from an independent re-annotation.", + "", + ] + + if evidence_unverified: + lines += [ + "> **EVIDENCE UNVERIFIED**: no corpus supplied (--corpus) — evidence locators", + "> and excerpts are taken at face value from the run's own evidence_index.", + "> Grounding certifies self-consistency, not corpus reality. 
Supply the run's", + "> input.json to enable deterministic excerpt verification (G3, §6.3).", + "", + ] + + if experiment_config is not None: + lines += [ + "## Experiment configuration", + "How this run was generated. Recorded fields (cost, tokens, latency, agents) are " + "read from the run's output.json; `model` is the value passed to the harness via " + "--model-id. Generation params (temperature, prompt/pipeline version, seed) are not " + "captured in output.json.", + "", + "```json", + json.dumps(experiment_config, indent=2, default=str), + "```", + "", + ] + + if config is not None: + lines += [ + "## Evaluation configuration", + "These are the parameters used to compute the evaluation.", + "", + "```json", + json.dumps(config, indent=2, default=str), + "```", + "", + ] + + lines += ["## Gate Results", ""] + g5_result = None + for g in gate_results: + if g.gate == "G5": + g5_result = g + continue + status = "PASS" if g.passed else f"FLAG ({g.reason_code})" + lines.append(f"### {g.gate}: {status}") + if g.details: + lines.append("```json") + lines.append(json.dumps(g.details, indent=2, default=str)) + lines.append("```") + lines.append("") + + if bpi2017_f1 is not None: + ok = bpi2017_f1 >= 0.60 + anchor_status = "PASS (>= 0.60)" if ok else "BELOW THRESHOLD (< 0.60)" + lines += [ + "## External Sanity Anchor (non-blocking)", + f"BPI-2017 variant-recovery F1: **{bpi2017_f1:.3f}** — {anchor_status}", + "_One non-self-graded signal. 
Non-blocking; informational only._", + "", + ] + + if advisory is not None: + lines += _render_advisory(advisory) + + if g5_result is not None: + status = "PASS" if g5_result.passed else f"FLAG ({g5_result.reason_code})" + lines.append(f"### G5: {status}") + if g5_result.details: + lines.append("```json") + lines.append(json.dumps(g5_result.details, indent=2, default=str)) + lines.append("```") + lines.append("") + + lines += _render_analysis(gate_results, advisory) + + return "\n".join(lines) + + +def _num(x) -> str: + """Format a metric leaf: None -> 'n/a', float -> 3dp, else str.""" + if x is None: + return "n/a" + if isinstance(x, float): + return f"{x:.3f}" + return str(x) + + +def _render_advisory(report) -> list[str]: + """Render the non-blocking G4 LLM-as-a-Judge section from an AdvisoryReport. + + Best-effort: only metrics present in report.metrics are shown. G4 never + affects the PROMOTE/HOLD verdict; this section is decision-support for the + G5 human sign-off, and is advisory until LLM-as-a-Judge calibration (§10). 
+ """ + m = report.metrics + cal = "calibrated" if report.calibrated else "uncalibrated" + lines = [ + "## G4 — LLM-as-a-Judge (non-blocking — does NOT affect the PROMOTE/HOLD verdict)", + f"Judge: {report.judge_model} · {cal} · {report.runs}-run median", + ] + if report.same_provider_caveat: + lines.append("> same-provider as the pipeline — results may share blind spots.") + lines.append("```text") + + if "faithfulness" in m: + d = m["faithfulness"] + u = d.get("unsupported_ids", []) + extra = f" (unsupported: {', '.join(u)})" if u else "" + lines.append( + f"Faithfulness (entailment): {d.get('supported')}/{d.get('total')} supported{extra}" + ) + if "numeric_temporal_fidelity" in m: + lines.append( + f"Numeric/temporal fidelity: {m['numeric_temporal_fidelity'].get('count', 0)} mismatch(es)" + ) + if "citation_relevance" in m: + d = m["citation_relevance"] + lines.append( + f"Citation relevance (ctx-prec): {_num(d.get('precision'))} ({d.get('relevant')}/{d.get('total')})" + ) + if "semantic_recovery" in m: + d = m["semantic_recovery"] + rec = d.get("recovered", []) + rids = ", ".join(r.get("id", "") for r in rec) if rec else "none" + lines.append( + f"Semantic recovery (ctx-recall): lexical {_num(d.get('lexical_recall'))} -> {_num(d.get('recovered_recall'))} (recovered: {rids})" + ) + if "nc_semantic_precision" in m: + d = m["nc_semantic_precision"] + a = d.get("asserted_ids", []) + extra = f" ({', '.join(a)})" if a else "" + lines.append(f"NC semantic precision: {d.get('asserted', 0)} asserted{extra}") + if "fabricated_entity" in m: + lines.append(f"Fabricated-entity check: {m['fabricated_entity'].get('count', 0)}") + if "contradiction" in m: + lines.append(f"Contradiction detection: {m['contradiction'].get('count', 0)}") + if "actionability" in m: + d = m["actionability"] + lines.append( + f"Actionability: {_num(d.get('score'))} (rated {d.get('rated', 0)})" + ) + if "severity_calibration" in m: + d = m["severity_calibration"] + lines.append( + f"Severity 
calibration: {d.get('miscalibrated', 0)}/{d.get('total', 0)} miscalibrated" + ) + if "answer_relevancy" in m: + lines.append(f"Answer relevancy: {_num(m['answer_relevancy'].get('score'))}") + if "comparative_vs_champion" in m: + lines.append( + f"Comparative vs champion: more consistent -> {m['comparative_vs_champion'].get('more_consistent', 'n/a')}" + ) + if "source_coverage" in m: + d = m["source_coverage"] + o = d.get("orphaned", []) + extra = f" (orphaned: {', '.join(o)})" if o else "" + lines.append( + f"Source coverage [D]: {d.get('cited')}/{d.get('total')} documents cited{extra}" + ) + if "excerpt_fill_rate" in m: + d = m["excerpt_fill_rate"] + lines.append( + f"Evidence-excerpt fill [D]: {d.get('populated')}/{d.get('total')} populated" + ) + if "open_gap" in m: + gap = (m["open_gap"].get("gap") or "").strip() + if gap: + lines.append(f"Open gap probe: {gap}") + if report.errors: + lines.append(f"(errors: {len(report.errors)} metric(s) failed: {'; '.join(report.errors)})") + lines.append("```") + # Full detail — nothing truncated: every id, pair, verdict, and complete text. + lines += [ + "", + "**G4 — full metric detail:**", + "```json", + json.dumps({"metrics": report.metrics, "details": report.details}, indent=2, default=str), + "```", + ] + lines.append( + "> Decision support for the G5 human sign-off; advisory until LLM-as-a-Judge calibration (§10)." 
+ ) + lines.append("") + return lines + + +def _render_analysis(gate_results: list, advisory=None) -> list[str]: + """Render a plain-language interpretation of all evaluation signals.""" + g2 = next((g for g in gate_results if g.gate == "G2"), None) + g3 = next((g for g in gate_results if g.gate == "G3"), None) + + lines = ["## Analysis", ""] + + # ── Topic coverage (G2) ────────────────────────────────────────────────── + lines.append("### Topic coverage (G2)") + if g2 and g2.details: + d = g2.details + recall = d.get("recall", 0.0) + tiers = d.get("per_tier", {}) + finding_count = d.get("finding_count", 0) + redundancy = d.get("finding_redundancy_rate", 0.0) + matched = d.get("findings_matched_to_registry", {}).get("fraction", 0.0) + + tier_summary = ", ".join( + f"{t} {v['hit']}/{v['total']}" + for t, v in tiers.items() + if "hit" in v and "total" in v + ) + lines.append( + f"Lexical recall is **{recall:.3f}** ({tier_summary}). " + f"The run produced {finding_count} findings, " + f"all of which map to a registry item (match rate {matched:.0%}). " + ) + if redundancy > 0.15: + lines.append( + f"Finding redundancy is **{redundancy:.0%}** — a meaningful share of " + "findings are near-duplicates of each other (Jaccard ≥ 0.6). " + "The run is covering the same ground multiple times rather than broadening coverage." + ) + else: + lines.append( + f"Finding redundancy is low ({redundancy:.0%}): each finding addresses a distinct topic." + ) + lines.append( + "_G2 is a topic-level test. A recall of 1.000 means every required topic was " + "mentioned somewhere — it does not verify that the specific claims about those " + "topics are accurate. 
Claim accuracy is G4 Faithfulness._" + ) + else: + lines.append("G2 result unavailable.") + lines.append("") + + # ── Evidence quality (G3) ──────────────────────────────────────────────── + lines.append("### Evidence quality (G3)") + if g3 and g3.details: + d = g3.details + grounding = d.get("grounding_pct", 0.0) + ev = d.get("evidence_verification", {}) + verified = ev.get("verified", 0) + entries = ev.get("entries", 0) + fabricated = ev.get("fabricated", []) + unknown = ev.get("source_unknown", []) + orphaned = d.get("orphaned_sources", []) + source_cov = d.get("source_coverage", "") + + lines.append( + f"Grounding is **{grounding:.0%}**: every finding cites at least one " + "corpus document, and all excerpts are populated. " + f"Evidence verification checked {entries} entries against the raw corpus: " + f"{verified} verified" + + (f", **{len(fabricated)} fabricated** (locators that do not exist in the corpus)" if fabricated else "") + + (f", **{len(unknown)} source-unknown** (locators that resolve to no corpus file)" if unknown else "") + + "." + ) + if unknown: + lines.append( + f"The source-unknown locator(s) are: `{'`, `'.join(unknown)}`. " + "This is most likely a corpus bundle gap rather than a hallucinated source — " + "verify that all expected files are included in `input.json`." + ) + if orphaned: + lines.append( + f"**{len(orphaned)} corpus documents were never cited** by this run " + f"({', '.join(orphaned)}). These are blind spots: the run extracted nothing " + "from these sources, so any findings they contain are silently missed." + ) + if source_cov: + cited, total = (int(x) for x in source_cov.split("/")) + if cited < total: + lines.append( + f"Overall source coverage is {cited}/{total} — " + f"{total - cited} corpus file(s) left entirely uncited." 
+ ) + else: + lines.append("G3 result unavailable.") + lines.append("") + + # ── Claim accuracy (G4) ────────────────────────────────────────────────── + if advisory is not None: + m = advisory.metrics + lines.append("### Claim accuracy (G4 — advisory)") + + faith = m.get("faithfulness", {}) + supported = faith.get("supported", 0) + total_f = faith.get("total", 0) + if total_f: + faith_pct = supported / total_f + lines.append( + f"**Faithfulness: {supported}/{total_f} findings ({faith_pct:.0%}) are entailed by their cited evidence.** " + ) + if faith_pct < 0.5: + lines.append( + "This is a critical signal: the majority of findings contain claims " + "that the judge cannot verify from the cited sources. " + "The run is presenting inferences, extrapolations, or hallucinated details " + "as if they were directly evidenced. " + "Each unsupported finding should be reviewed against its cited document before use." + ) + elif faith_pct < 0.8: + lines.append( + "A significant minority of findings contain claims not traceable to cited sources. " + "These may be reasonable inferences, but they should be flagged for human verification." + ) + else: + lines.append("Most findings are directly supported by their cited evidence.") + + ntf = m.get("numeric_temporal_fidelity", {}) + mismatch_count = ntf.get("count", 0) + if mismatch_count: + lines.append( + f"**Numeric/temporal fidelity: {mismatch_count} mismatches detected.** " + "Specific figures — FTE costs, durations, timestamps, percentages, case IDs — " + "appear in findings but cannot be traced to the cited evidence. " + "These numbers should be treated as estimates or fabrications until verified " + "against the source documents." 
+ ) + + fab = m.get("fabricated_entity", {}) + fab_count = fab.get("count", 0) + fab_entities = fab.get("entities", []) + if fab_count: + lines.append( + f"**Fabricated entities: {fab_count}** — the following names/identifiers appear " + f"in the output but are absent from the corpus: " + f"{', '.join(f'`{e}`' for e in fab_entities)}. " + "These should be removed or verified before sharing the output." + ) + + sev = m.get("severity_calibration", {}) + misc = sev.get("miscalibrated", 0) + total_s = sev.get("total", 0) + verdicts = sev.get("verdicts", {}) + over_count = sum(1 for v in verdicts.values() if v == "over") + under_count = sum(1 for v in verdicts.values() if v == "under") + if misc and total_s: + direction = "" + if over_count > under_count: + direction = f" (predominantly over-rated: {over_count} findings rated too high)" + elif under_count > over_count: + direction = f" (predominantly under-rated: {under_count} findings rated too low)" + lines.append( + f"**Severity calibration: {misc}/{total_s} findings miscalibrated{direction}.** " + "Over-rated findings inflate perceived urgency and can cause the client to " + "prioritise the wrong items." + ) + + act = m.get("actionability", {}) + act_score = act.get("score") + if act_score is not None: + if act_score < 0.6: + lines.append( + f"**Actionability score: {act_score:.3f}** — proposed actions are below the " + "0.6 threshold for concrete, quantified recommendations. " + "Actions tend to be generic rather than specific enough to assign and execute." 
+ ) + else: + lines.append(f"Actionability score: {act_score:.3f} — actions are sufficiently concrete.") + + og = m.get("open_gap", {}) + gap_text = (og.get("gap") or "").strip() + if gap_text: + lines.append(f"**Most important missed finding:** {gap_text}") + + lines.append("") + + # ── Bottom line ────────────────────────────────────────────────────────── + lines.append("### Bottom line") + g5 = next((g for g in gate_results if g.gate == "G5"), None) + g5_reason = (g5.details or {}).get("reason", "") if g5 else "" + flags = [g for g in gate_results if not g.passed] + flag_names = [g.gate for g in flags] + + if not flags: + lines.append( + "All deterministic gates pass. The run is ready for G5 human sign-off." + ) + else: + flag_str = ", ".join(flag_names) + lines.append( + f"The run is at **HOLD** due to flags on: {flag_str}. " + ) + for g in flags: + if g.gate == "G3" and g.reason_code == "EVIDENCE_SOURCE_UNKNOWN": + lines.append( + "- **G3**: One evidence locator points to a file not in the corpus bundle. " + "Regenerate `input.json` to include all corpus sources, then re-run." + ) + elif g.gate == "G5": + lines.append(f"- **G5**: {g5_reason}") + + if advisory is not None: + m = advisory.metrics + faith = m.get("faithfulness", {}) + supported = faith.get("supported", 0) + total_f = faith.get("total", 1) + ntf_count = m.get("numeric_temporal_fidelity", {}).get("count", 0) + fab_count = m.get("fabricated_entity", {}).get("count", 0) + lines.append( + f"\nG4 advisory signals (non-blocking but important for the G5 reviewer): " + f"faithfulness {supported}/{total_f}, " + f"{ntf_count} numeric mismatches, " + f"{fab_count} fabricated entities. " + "The G5 reviewer should focus on the unsupported findings and verify figures " + "against the source documents before certifying the output." 
+ ) + lines.append("") + return lines From 09cfc34bd75869498a0d5a10216625a784d9e638 Mon Sep 17 00:00:00 2001 From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com> Date: Thu, 18 Jun 2026 23:52:05 +0200 Subject: [PATCH 06/48] feat(evaluation): add LLM-as-judge and judge client (#273) * feat(evaluation): add JudgeClient and OllamaEmbedder (judge_client.py) * feat(evaluation): add AdvisoryReport and run_judge with [D]/[E]/[J] metric families (judge.py) * feat(evaluation): import cosine from judge_client in matcher.py * feat(evaluation): export JudgeClient, OllamaEmbedder, build_embedder, cosine from evaluation package --------- Co-authored-by: miguelgfierro --- .../evaluation/__init__.py | 5 + fireflyframework_agentic/evaluation/judge.py | 829 ++++++++++++++++++ .../evaluation/judge_client.py | 454 ++++++++++ .../evaluation/matcher.py | 7 +- 4 files changed, 1289 insertions(+), 6 deletions(-) create mode 100644 fireflyframework_agentic/evaluation/judge.py create mode 100644 fireflyframework_agentic/evaluation/judge_client.py diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py index 61562db3..37093075 100644 --- a/fireflyframework_agentic/evaluation/__init__.py +++ b/fireflyframework_agentic/evaluation/__init__.py @@ -36,6 +36,7 @@ from fireflyframework_agentic.evaluation.scorecard import render_scorecard, verdict, VERDICT_PROMOTE, VERDICT_HOLD from fireflyframework_agentic.evaluation.champion import ChampionRecord, invalidate_champion, load_champion, save_champion from fireflyframework_agentic.evaluation.judge import AdvisoryReport, run_judge +from fireflyframework_agentic.evaluation.judge_client import JudgeClient, OllamaEmbedder, build_embedder, cosine from fireflyframework_agentic.evaluation.matcher import anchored, matches, source_stem, tokens from fireflyframework_agentic.evaluation.registry import Registry, RegistryItem, load_registry, registry_sha256 from 
fireflyframework_agentic.evaluation.retrieval import RetrieverMetrics, compute_retrieval_metrics @@ -68,6 +69,10 @@ "invalidate_champion", "AdvisoryReport", "run_judge", + "JudgeClient", + "OllamaEmbedder", + "build_embedder", + "cosine", "Registry", "RegistryItem", "load_registry", diff --git a/fireflyframework_agentic/evaluation/judge.py b/fireflyframework_agentic/evaluation/judge.py new file mode 100644 index 00000000..a347c8e1 --- /dev/null +++ b/fireflyframework_agentic/evaluation/judge.py @@ -0,0 +1,829 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""G4 — LLM-as-a-Judge: an opt-in, NON-BLOCKING, NON-DETERMINISTIC advisory gate. + +G4 NEVER affects the PROMOTE/HOLD verdict and NEVER raises into the caller. +run_judge() wraps every metric in try/except; a failing metric appends to +report.errors and the run continues (best-effort). The result is an +AdvisoryReport, NOT a GateResult — it is carried separately so it can never +enter verdict() or the Skipped tuple (see scorecard / verdict_unaffected_note). + +Three families of metric (matching the flyradar contracts): +- [D] DETERMINISTIC — pure python, no LLM, printed even when the judge is off: + source_coverage, excerpt_fill_rate. +- [E] EMBEDDING — needs an embed_fn (local Ollama BGE by default): + semantic_recovery (context recall). 
+- [J] JUDGE — needs a chat_fn(system, user) -> dict; each [J] metric instructs + the model to reply with ONLY JSON: faithfulness, numeric_temporal_fidelity, + citation_relevance, nc_semantic_precision, fabricated_entity, contradiction, + open_gap, actionability, severity_calibration, answer_relevancy, + comparative_vs_champion. + +Aggregation follows the flycanon custom-judge design: run each [J] metric `runs` +times and take the MEDIAN of its numeric scores (robust to an outlier vote). + +Zero new dependencies: stdlib (json, statistics) + numpy. All imports at top. +calibrated is ALWAYS False for now (LLM-as-a-Judge calibration is §14, future work). +""" + +from __future__ import annotations + +import concurrent.futures +import statistics +from dataclasses import dataclass, field + +import numpy as np + +from fireflyframework_agentic.evaluation.judge_client import ( + JudgeClient, + OllamaEmbedder, + cosine, + same_provider, +) +from fireflyframework_agentic.evaluation.matcher import source_stem + +SYSTEM = "You are a meticulous evaluator of a process-mining discovery report. Return ONLY a JSON object." + + +@dataclass +class AdvisoryReport: + """The G4 output: a plain metrics bag, never a GateResult. + + metrics maps metric-name -> small dict (the per-metric summary). details + carries supporting context (counts, ids). errors lists per-metric failures + captured by run_judge's best-effort try/except so nothing propagates. 
def _evidence_index(result: dict) -> dict[str, dict]:
    """Map evidence id -> evidence entry for ``result["evidence_index"]``.

    Entries whose ``id`` is missing or falsy are left out; a repeated id keeps
    the last entry seen.
    """
    index: dict[str, dict] = {}
    for entry in result.get("evidence_index", []):
        evidence_id = entry.get("id")
        if evidence_id:
            index[evidence_id] = entry
    return index


def _cited_excerpts(finding: dict, evidence_index: dict[str, dict]) -> list[str]:
    """Return the non-empty excerpts of the evidence a finding cites.

    Citations are read from ``finding["evidence_refs"][*]["evidence_id"]``;
    references to unknown ids and evidence with an empty excerpt are skipped.
    """
    excerpts: list[str] = []
    for ref in finding.get("evidence_refs", []):
        entry = evidence_index.get(ref.get("evidence_id", ""))
        text = (entry.get("excerpt") or "") if entry else ""
        if text:
            excerpts.append(text)
    return excerpts


def _output_text(result: dict) -> str:
    """Join all emitted free text: finding titles + descriptions, then reports.

    Empty pieces are dropped; the rest are newline-separated in document order.
    """
    parts: list[str] = []
    for finding in result.get("findings", []):
        parts += [finding.get("title", ""), finding.get("description", "")]
    parts += [str(report) for report in result.get("reports", [])]
    return "\n".join(part for part in parts if part)


def _workspace_intention(result: dict) -> str:
    """Return the workspace ``name`` and ``description`` as one stripped string."""
    workspace = result.get("workspace") or {}
    name = workspace.get("name", "")
    description = workspace.get("description", "")
    return f"{name}\n{description}".strip()


def _coerce_float(value, default=None):
    """Coerce a model-returned number/numeric-string to a finite float; never raises.

    Returns ``default`` (None) on junk so one malformed vote drops that single
    vote instead of discarding the whole metric. Junk covers:

    - values ``float()`` rejects (``TypeError`` / ``ValueError``),
    - integers too large for a float (``OverflowError``),
    - NaN and +/-infinity, which parse fine but would poison any average or
      median they are folded into.
    """
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        return default
    # ``number != number`` is the import-free NaN test.
    if number != number or abs(number) == float("inf"):
        return default
    return number


def _map_chat(chat_fn, prompts, workers=1):
    """Run a list of (system, user) chat prompts, returning results in input order.

    ``workers <= 1`` calls ``chat_fn`` sequentially and lets an exception
    propagate to the caller.

    ``workers >= 2`` fans the calls out across a ThreadPoolExecutor. Each result
    is stored at its prompt's index, so input order is preserved; a call that
    raises leaves ``{}`` in its slot instead of raising (best-effort: a dropped
    vote, not a failed batch).
    """
    prompts = list(prompts)
    if workers <= 1:
        return [chat_fn(system, user) for system, user in prompts]

    results: list[dict] = [{} for _ in prompts]
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
        slot_of = {
            pool.submit(chat_fn, system, user): slot
            for slot, (system, user) in enumerate(prompts)
        }
        for finished in concurrent.futures.as_completed(slot_of):
            slot = slot_of[finished]
            try:
                results[slot] = finished.result()
            except Exception:  # best-effort: a dropped vote, never a raise
                results[slot] = {}
    return results
def excerpt_fill_rate(result: dict) -> dict:
    """Count evidence_index entries that carry a non-blank excerpt.

    Returns {populated, total}. A missing or null ``evidence_index`` is treated
    as empty and yields ``{"populated": 0, "total": 0}`` rather than raising.
    """
    entries = result.get("evidence_index") or []
    populated = sum(1 for entry in entries if (entry.get("excerpt") or "").strip())
    return {"populated": populated, "total": len(entries)}


# ── [E] EMBEDDING — needs embed_fn ───────────────────────────────────────────────


def semantic_recovery(
    result: dict,
    registry,
    lexical_missed_ids: list[str],
    embed_fn,
    tau: float = 0.70,
) -> dict:
    """Context-recall: recover lexical misses by embedding similarity.

    Scored items are ``registry.real_items`` whose ``tier`` is not ``"L3"``.
    For each scored item listed in ``lexical_missed_ids``, its
    ``description`` + ``keywords`` text is embedded and compared (max cosine)
    against every finding description and cited excerpt. An item whose best
    cosine is >= ``tau`` counts as recovered.

    Returns a dict with:
        lexical_recall     -- lexical hits / scored denominator
        recovered_recall   -- (lexical hits + recovered) / scored denominator
        recovered          -- [{"id", "cosine"}] for each recovered item
        tau                -- the threshold used
        scored_denominator -- number of scored items (1 when there are none,
                              to keep the division defined)

    ``embed_fn`` is only called when there is at least one missed item and at
    least one candidate text.
    """
    missed = set(lexical_missed_ids or [])
    scored_items = [item for item in registry.real_items if item.tier != "L3"]
    denom = len(scored_items) or 1
    lexical_hits = sum(1 for item in scored_items if item.id not in missed)

    def summary(recovered: list[dict]) -> dict:
        # Single place that shapes the result, shared by both return paths.
        return {
            "lexical_recall": round(lexical_hits / denom, 4),
            "recovered_recall": round((lexical_hits + len(recovered)) / denom, 4),
            "recovered": recovered,
            "tau": tau,
            "scored_denominator": denom,
        }

    # Candidate texts the findings actually surfaced.
    evidence_index = _evidence_index(result)
    candidate_texts: list[str] = []
    for finding in result.get("findings", []):
        description = finding.get("description", "")
        if description:
            candidate_texts.append(description)
        candidate_texts.extend(_cited_excerpts(finding, evidence_index))

    missed_items = [item for item in scored_items if item.id in missed]
    if not missed_items or not candidate_texts:
        return summary([])

    item_texts = [f"{item.description} {' '.join(item.keywords)}".strip() for item in missed_items]
    item_vecs = np.asarray(embed_fn(item_texts), dtype=np.float64)
    cand_vecs = np.asarray(embed_fn(candidate_texts), dtype=np.float64)

    recovered: list[dict] = []
    for item, item_vec in zip(missed_items, item_vecs):
        best = max((cosine(item_vec, cand_vec) for cand_vec in cand_vecs), default=0.0)
        if best >= tau:
            recovered.append({"id": item.id, "cosine": round(best, 4)})
    return summary(recovered)
def numeric_temporal_fidelity(result: dict, chat_fn, *, workers: int = 1) -> dict:
    """Flag numbers/dates asserted in a finding that do NOT match its evidence.

    Only findings with at least one cited excerpt are sent to the judge.
    Returns {mismatches: [{finding_id, value, source}], count}.

    The judge's ``mismatches`` payload is read tolerantly so one malformed
    entry cannot discard the whole metric:
    - a single dict (not wrapped in a list) is treated as one mismatch,
    - a bare string entry is kept as the ``value`` with an empty ``source``,
    - any other shape is ignored.
    """
    evidence_index = _evidence_index(result)
    scored = []
    for finding in result.get("findings", []):
        excerpts = _cited_excerpts(finding, evidence_index)
        if excerpts:
            scored.append((finding, excerpts))

    prompts = [
        (
            SYSTEM,
            "List every specific number or date asserted in the FINDING that does "
            "NOT match the CITED EVIDENCE.\n"
            'Reply with ONLY {"mismatches": [{"value": "", "source": ""}]}. '
            "Empty list if all match.\n\n"
            f"FINDING: {finding.get('description', '')}\n"
            f"CITED EVIDENCE: {' || '.join(excerpts)}",
        )
        for finding, excerpts in scored
    ]
    answers = _map_chat(chat_fn, prompts, workers)

    mismatches: list[dict] = []
    for (finding, _excerpts), answer in zip(scored, answers):
        reported = answer.get("mismatches") or []
        if isinstance(reported, dict):
            reported = [reported]
        elif not isinstance(reported, list):
            reported = []
        for entry in reported:
            if isinstance(entry, dict):
                value, source = entry.get("value", ""), entry.get("source", "")
            elif isinstance(entry, str):
                value, source = entry, ""
            else:
                continue  # unusable entry: drop it, keep the metric
            mismatches.append(
                {"finding_id": finding.get("id", "?"), "value": value, "source": source}
            )
    return {"mismatches": mismatches, "count": len(mismatches)}


def citation_relevance(result: dict, chat_fn, *, workers: int = 1) -> dict:
    """Context precision: fraction of cited passages the judge deems relevant.

    One yes/no question is asked per evidence_ref that resolves to evidence
    with a non-empty excerpt. Returns {precision, relevant, total}; when
    ``total`` is 0 (nothing to score) ``precision`` is None, so a reader can
    tell "perfect" apart from "nothing to score".
    """
    evidence_index = _evidence_index(result)
    prompts: list[tuple[str, str]] = []
    for finding in result.get("findings", []):
        claim = finding.get("description", "")
        for ref in finding.get("evidence_refs", []):
            entry = evidence_index.get(ref.get("evidence_id", ""))
            passage = (entry.get("excerpt") or "") if entry else ""
            if not passage:
                continue
            prompts.append(
                (
                    SYSTEM,
                    "Is this cited passage actually relevant to / used by this claim?\n"
                    'Reply with ONLY {"relevant": "yes" or "no"}.\n\n'
                    f"CLAIM: {claim}\n"
                    f"CITED PASSAGE: {passage}",
                )
            )

    answers = _map_chat(chat_fn, prompts, workers)
    total = len(prompts)
    relevant = sum(1 for answer in answers if str(answer.get("relevant", "")).lower() == "yes")
    precision = round(relevant / total, 4) if total else None
    return {"precision": precision, "relevant": relevant, "total": total}
def fabricated_entity(result: dict, chat_fn) -> dict:
    """Count systems/orgs/metrics named in the output but absent from the corpus.

    Returns {count, entities}. The corpus shown to the judge is every
    evidence_index entry rendered as ``locator :: excerpt``.

    The judge's ``fabricated`` payload is normalised before counting: a bare
    string is treated as one entity (not split into characters) and blank
    string entries are dropped, so an echoed empty template does not register
    as a fabrication.
    """
    output_text = _output_text(result)
    corpus = "\n".join(
        f"{ev.get('locator', '')} :: {ev.get('excerpt', '')}"
        for ev in result.get("evidence_index", [])
    )
    user = (
        "List any system, organization, or metric NAMED in the OUTPUT that does NOT "
        "appear anywhere in the CORPUS EVIDENCE.\n"
        'Reply with ONLY {"fabricated": ["", ...]}. Empty list if none.\n\n'
        f"OUTPUT:\n{output_text}\n\n"
        f"CORPUS EVIDENCE:\n{corpus}"
    )
    reported = chat_fn(SYSTEM, user).get("fabricated") or []
    if isinstance(reported, str):
        reported = [reported]
    entities = [
        entity for entity in reported if not (isinstance(entity, str) and not entity.strip())
    ]
    return {"count": len(entities), "entities": entities}


def contradiction(result: dict, chat_fn) -> dict:
    """Count internally contradictory finding pairs.

    Each finding is rendered as ``id: title — description`` and the judge is
    asked for contradicting pairs. Returns {count, pairs}, where pairs is the
    judge-reported list with each pair converted to a list.
    """
    rendered = [
        f"{f.get('id', '?')}: {f.get('title', '')} — {f.get('description', '')}"
        for f in result.get("findings", [])
    ]
    user = (
        "Are any two of these FINDINGS mutually contradictory? List each contradicting pair.\n"
        'Reply with ONLY {"pairs": [["", ""], ...]}. Empty list if none.\n\n'
        + "\n".join(rendered)
    )
    pairs = chat_fn(SYSTEM, user).get("pairs", []) or []
    return {"count": len(pairs), "pairs": [list(pair) for pair in pairs]}


def open_gap(result: dict, chat_fn) -> dict:
    """Open probe: the most important process issue the output missed.

    Returns {gap} — a free-text advisory narrative (no score). A missing or
    null ``process_graph`` / ``processes`` is summarised as 0 processes.
    """
    graph = result.get("process_graph") or {}
    process_count = len(graph.get("processes") or [])
    pg_summary = f"process_graph has {process_count} processes"
    user = (
        "Given this corpus scope and output, what important process issue did the "
        "output FAIL to surface?\n"
        'Reply with ONLY {"gap": ""}.\n\n'
        f"WORKSPACE SCOPE: {_workspace_intention(result)}\n"
        f"{pg_summary}\n"
        f"OUTPUT:\n{_output_text(result)}"
    )
    return {"gap": str(chat_fn(SYSTEM, user).get("gap", ""))}
+ """ + actions = result.get("proposed_actions", []) or [] + finding_ids = {f.get("id") for f in result.get("findings", [])} + prompts = [ + ( + SYSTEM, + "Rate whether this proposed action is SPECIFIC, QUANTIFIED, and LINKED to a " + "finding.\n" + 'Reply with ONLY {"score": }.\n\n' + f"TITLE: {a.get('title', '')}\n" + f"DESCRIPTION: {a.get('description', '')}\n" + f"OWNER: {a.get('owner_persona', '')} HORIZON: {a.get('horizon', '')} " + f"LEVER: {a.get('lever', '')} EFFORT: {a.get('effort', '')}\n" + f"EXPECTED_SAVINGS_FTE: {a.get('expected_savings_fte', '')} " + f"EXPECTED_SAVINGS_USD: {a.get('expected_savings_usd', '')}\n" + f"LINKED_TO_FINDING: {a.get('finding_id') in finding_ids}", + ) + for a in actions + ] + answers = _map_chat(chat_fn, prompts, workers) + scores: list[float] = [] + for a in answers: + value = _coerce_float(a.get("score")) + if value is None: # malformed vote -> skip this action, keep the metric + continue + scores.append(value) + score = round(sum(scores) / len(scores), 4) if scores else None + return {"score": score, "rated": len(scores)} + + +def severity_calibration(result: dict, chat_fn, *, workers: int = 1) -> dict: + """Per-finding judgment of whether stated severity matches the evidence. + + Returns {miscalibrated, total, verdicts: {finding_id: under|over|calibrated}}. 
def answer_relevancy(result: dict, chat_fn) -> dict:
    """Ask the judge whether the output addresses the stated workspace intention.

    Returns {score}; the score is whatever number the judge replies with
    (the prompt requests a value between 0 and 1), or None when the reply does
    not coerce to a float.
    """
    user = (
        "Does the OUTPUT address the stated WORKSPACE INTENTION (on-topic, responsive)?\n"
        # State the scale explicitly: the bare template '{"score": }' gave none.
        'Reply with ONLY {"score": s} where s is a number between 0 and 1.\n\n'
        f"WORKSPACE INTENTION: {_workspace_intention(result)}\n"
        f"OUTPUT:\n{_output_text(result)}"
    )
    return {"score": _coerce_float(chat_fn(SYSTEM, user).get("score"))}


def surface_deduplication(result: dict, chat_fn, *, workers: int = 1) -> dict:
    """Fraction of near-duplicate process-graph node pairs that are genuinely distinct.

    Scoping rules:
    - Processes: all pairs are compared.
    - Activities and decisions: ONLY pairs within the same parent process, so
      the same name appearing in two different processes is never compared.

    Per surface, the 10 most name-similar pairs (token-Jaccard >= 0.30 on the
    lower-cased, whitespace-split ``name``) are sent to the judge — at most 30
    pairs. For activities/decisions the parent process name is included in the
    prompt. Any verdict other than ``DISTINCT`` is counted as redundant.

    A missing or null ``process_graph``, ``processes``, node list, or node
    ``name`` is treated as empty instead of raising.

    Returns {distinct, redundant, total, distinct_rate, redundant_pairs};
    ``distinct_rate`` is None when no pair qualified (and no judge call is made).
    """
    per_surface_cap = 10
    min_jaccard = 0.30

    graph = result.get("process_graph") or {}
    procs = graph.get("processes") or []

    def _toks(node: dict) -> frozenset[str]:
        return frozenset((node.get("name") or "").lower().split())

    def _similar_pairs(nodes: list, parent: str) -> list[tuple[float, dict, dict, str]]:
        """Every (jaccard, a, b, parent) pair in ``nodes`` at or above the threshold."""
        token_sets = [_toks(node) for node in nodes]  # tokenise each node once
        found: list[tuple[float, dict, dict, str]] = []
        for i, first in enumerate(nodes):
            for j in range(i + 1, len(nodes)):
                union = token_sets[i] | token_sets[j]
                if not union:
                    continue
                jac = len(token_sets[i] & token_sets[j]) / len(union)
                if jac >= min_jaccard:
                    found.append((jac, first, nodes[j], parent))
        return found

    def _top(pairs: list) -> list:
        # Stable sort: ties keep discovery order, then cap per surface.
        return sorted(pairs, key=lambda pair: pair[0], reverse=True)[:per_surface_cap]

    # candidates: (surface, node_a, node_b, parent_process_name)
    candidates: list[tuple[str, dict, dict, str]] = [
        ("process", a, b, "") for _jac, a, b, _parent in _top(_similar_pairs(procs, ""))
    ]
    for surface_key, attr in (("activity", "activities"), ("decision", "decisions")):
        surface_pairs: list[tuple[float, dict, dict, str]] = []
        for proc in procs:
            surface_pairs.extend(_similar_pairs(proc.get(attr) or [], proc.get("name", "")))
        candidates.extend(
            (surface_key, a, b, parent) for _jac, a, b, parent in _top(surface_pairs)
        )

    if not candidates:
        return {"distinct": 0, "redundant": 0, "total": 0, "distinct_rate": None, "redundant_pairs": []}

    prompts = []
    for surface, a, b, parent_proc in candidates:
        ctx = f"\nPARENT PROCESS: {parent_proc}\n" if parent_proc else ""
        prompts.append((
            SYSTEM,
            f"Are these two {surface} nodes genuinely DISTINCT process concepts, or is one a "
            f"duplicate / sub-case / restatement of the other?\n"
            f"{ctx}"
            'Reply with ONLY {"verdict": "DISTINCT" or "DUPLICATE", "reason": ""}.\n\n'
            f"{surface.upper()} A: {a.get('name', '')} — {a.get('description', '')}\n"
            f"{surface.upper()} B: {b.get('name', '')} — {b.get('description', '')}",
        ))

    answers = _map_chat(chat_fn, prompts, workers)

    distinct = 0
    redundant_pairs: list[dict] = []
    for (surface, a, b, _parent), answer in zip(candidates, answers):
        if str(answer.get("verdict", "")).upper() == "DISTINCT":
            distinct += 1
            continue
        redundant_pairs.append({
            "surface": surface,
            "a": a.get("name", ""),
            "b": b.get("name", ""),
            "reason": str(answer.get("reason", "")),
        })

    redundant = len(redundant_pairs)
    total = distinct + redundant
    return {
        "distinct": distinct,
        "redundant": redundant,
        "total": total,
        "distinct_rate": round(distinct / total, 4) if total else None,
        "redundant_pairs": redundant_pairs,
    }
def _numeric_leaves(d: dict) -> dict[tuple, float]:
    """Flatten a metric dict to {path: float} over its FLOAT leaves only.

    ``path`` is the tuple of dict keys leading to the leaf. Only values that
    are instances of ``float`` are collected; ``int`` and ``bool`` leaves
    (counts, denominators and other bookkeeping) and anything inside a
    non-dict container are skipped.
    """
    leaves: dict[tuple, float] = {}

    def visit(node, path: tuple) -> None:
        if isinstance(node, float):
            leaves[path] = node
        elif isinstance(node, dict):
            for key, child in node.items():
                visit(child, path + (key,))

    visit(d, ())
    return leaves


def _set_leaf(d: dict, path: tuple, value: float) -> None:
    """Assign ``value`` at the nested key ``path`` inside ``d`` (in place)."""
    node = d
    for key in path[:-1]:
        node = node[key]
    node[path[-1]] = value


def _median_runs(samples: list[dict]) -> dict:
    """Merge N metric dicts: float leaves -> per-path median; the rest = first run.

    Non-dict samples are ignored. With no usable sample the result is ``{}``;
    with exactly one, that sample is returned as-is. Otherwise the result
    starts from a copy of the first sample and every float leaf found in any
    sample is replaced by the median (rounded to 4 places) of the values seen
    at that path. Paths that cannot be written into the first sample's shape
    are skipped.

    The nested dicts of the first sample are copied before writing, so the
    caller's samples are never mutated.
    """
    samples = [sample for sample in samples if isinstance(sample, dict)]
    if not samples:
        return {}
    base = samples[0]
    if len(samples) == 1:
        return base

    leaf_values: dict[tuple, list[float]] = {}
    for sample in samples:
        for path, value in _numeric_leaves(sample).items():
            leaf_values.setdefault(path, []).append(value)

    def clone(node):
        # Copy dicts only: _set_leaf navigates and writes through dict keys,
        # so these are the only containers a median write can touch.
        if isinstance(node, dict):
            return {key: clone(child) for key, child in node.items()}
        return node

    merged = clone(base)
    for path, values in leaf_values.items():
        try:
            _set_leaf(merged, path, round(statistics.median(values), 4))
        except (KeyError, TypeError):
            continue
    return merged
+ """ + if chat_fn is None: + client = JudgeClient(judge_model) + chat_fn = client.chat_json + if embed_fn is None: + embed_fn = OllamaEmbedder().embed + + report = AdvisoryReport( + judge_model=judge_model, + same_provider_caveat=same_provider(pipeline_model, judge_model), + calibrated=False, + runs=runs, + ) + + def _run_det(name: str, fn) -> None: + try: + report.metrics[name] = fn() + except Exception as exc: # best-effort: never raise + report.errors.append(f"{name}: {type(exc).__name__}: {exc}") + + def _run_judge_metric(name: str, fn) -> None: + try: + samples = [fn() for _ in range(max(1, runs))] + report.metrics[name] = _median_runs(samples) + except Exception as exc: # best-effort: never raise + report.errors.append(f"{name}: {type(exc).__name__}: {exc}") + + # [D] deterministic — always computed, no LLM. + _run_det("source_coverage", lambda: source_coverage(result)) + _run_det("excerpt_fill_rate", lambda: excerpt_fill_rate(result)) + + # [E] embedding — context recall. + _run_det( + "semantic_recovery", + lambda: semantic_recovery(result, registry, lexical_missed_ids or [], embed_fn, tau=tau), + ) + + # [J] judge — median-of-N. Per-item metrics fan out at workers=concurrency. 
+ _run_judge_metric("faithfulness", lambda: faithfulness(result, chat_fn, workers=concurrency)) + _run_judge_metric( + "numeric_temporal_fidelity", + lambda: numeric_temporal_fidelity(result, chat_fn, workers=concurrency), + ) + _run_judge_metric( + "citation_relevance", lambda: citation_relevance(result, chat_fn, workers=concurrency) + ) + _run_judge_metric( + "nc_semantic_precision", + lambda: nc_semantic_precision(result, registry, chat_fn, workers=concurrency), + ) + _run_judge_metric("fabricated_entity", lambda: fabricated_entity(result, chat_fn)) + _run_judge_metric("contradiction", lambda: contradiction(result, chat_fn)) + _run_judge_metric("open_gap", lambda: open_gap(result, chat_fn)) + _run_judge_metric("actionability", lambda: actionability(result, chat_fn, workers=concurrency)) + _run_judge_metric( + "severity_calibration", + lambda: severity_calibration(result, chat_fn, workers=concurrency), + ) + _run_judge_metric("answer_relevancy", lambda: answer_relevancy(result, chat_fn)) + _run_judge_metric( + "surface_deduplication", + lambda: surface_deduplication(result, chat_fn, workers=concurrency), + ) + if champion_result is not None: + _run_judge_metric( + "comparative_vs_champion", + lambda: comparative_vs_champion(result, champion_result, chat_fn), + ) + + return report diff --git a/fireflyframework_agentic/evaluation/judge_client.py b/fireflyframework_agentic/evaluation/judge_client.py new file mode 100644 index 00000000..1af17f53 --- /dev/null +++ b/fireflyframework_agentic/evaluation/judge_client.py @@ -0,0 +1,454 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Provider-agnostic LLM-as-a-Judge client for the G4 advisory gate. + +Zero new dependencies: stdlib (urllib.request, json, os, time, re) + numpy. +The client is a thin POST wrapper over four chat providers (Anthropic, OpenAI, +Azure OpenAI, Ollama) plus an Ollama embedder. It is deliberately tolerant: +chat_json extracts the FIRST JSON object from the model text (models wrap JSON +in prose / code fences), and retries transient HTTP errors with backoff. + +This module is import-safe: importing it touches NO network and reads NO API +key. Keys are read lazily, per-call, only when a real request is made — so the +judge tests can import and inject stubs without any secret present. + +Provider/model spec format: ":", e.g. "anthropic:claude-sonnet-4-6", +"openai:gpt-4o", "azure:gpt-4o", "ollama:llama3". A bare model with no prefix is +treated as provider "unknown" (see parse_model / same_provider). +""" + +from __future__ import annotations + +import json +import os +import re +import time +import urllib.error +import urllib.request + +import numpy as np + +# Transient HTTP status codes worth retrying (rate limit + 5xx). +_RETRY_STATUS = (429, 500, 502, 503, 504) + +# Hard cap on a honoured Retry-After sleep (a hostile header should not stall us). +_MAX_RETRY_AFTER = 30.0 + + +def _env(name, default=None): + """Read an env var, stripping surrounding whitespace; empty-after-strip -> default. + + Defensive against a ``.env`` value that arrives with a trailing ``\\r`` / + whitespace (CRLF), which would otherwise corrupt a request URL or header. 
+ An unset OR blank value falls back to ``default`` so the existing + missing-key -> RuntimeError behaviour is preserved. + """ + value = os.environ.get(name) + if value is None: + return default + value = value.strip() + return value if value else default + + +def _retry_delay(exc: urllib.error.HTTPError, attempt: int) -> float: + """Seconds to sleep before retrying an HTTPError. + + On 429 honour the ``Retry-After`` header (capped at 30s) when it is present + and numeric; otherwise fall back to exponential backoff (2 ** attempt). + """ + if exc.code == 429: + headers = getattr(exc, "headers", None) + retry_after = headers.get("retry-after") if headers is not None else None + if retry_after is not None: + try: + return min(float(retry_after), _MAX_RETRY_AFTER) + except (TypeError, ValueError): + pass + return 2.0**attempt + + +def parse_model(spec: str) -> tuple[str, str]: + """Split a "provider:model" spec into (provider, model). + + A bare spec with no ':' is returned as provider "unknown" with the whole + string as the model, e.g. "claude-sonnet-4-6" -> ("unknown", "claude-sonnet-4-6"). + The provider is lower-cased; the model keeps its original case. + """ + spec = (spec or "").strip() + if ":" not in spec: + return "unknown", spec + provider, model = spec.split(":", 1) + return provider.strip().lower(), model.strip() + + +def same_provider(pipeline_model: str, judge_model: str) -> bool: + """True iff both specs name the SAME known provider prefix. + + A missing or "unknown" provider on either side -> not-same (False). This is + the same-provider caveat signal: when the judge and the pipeline share a + provider the judged metrics are advisory (no cross-provider isolation). 
+ """ + p_provider, _ = parse_model(pipeline_model) + j_provider, _ = parse_model(judge_model) + if p_provider == "unknown" or j_provider == "unknown": + return False + return p_provider == j_provider + + +def _first_json_object(text: str) -> dict: + """Extract and parse the FIRST balanced JSON object embedded in text. + + Models wrap JSON in prose, preambles, or ```json code fences. This scans + for the first '{' and walks the string tracking brace depth (string-aware, + so braces inside quoted values do not confuse the matcher) to find its + matching '}'. Falls back to a greedy regex span if no balanced object is + found. Raises ValueError when nothing parses. + """ + if not text: + raise ValueError("empty model response") + + # Fast path: a clean JSON object with no surrounding prose. A non-dict + # clean parse (e.g. a top-level array) is intentionally ignored so the brace + # scanner can still find an embedded object rather than returning arr[0]. + try: + parsed = json.loads(text.strip()) + except (json.JSONDecodeError, ValueError): + parsed = None + if isinstance(parsed, dict): + return parsed + + start = text.find("{") + while start != -1: + depth = 0 + in_string = False + escape = False + for i in range(start, len(text)): + ch = text[i] + if in_string: + if escape: + escape = False + elif ch == "\\": + escape = True + elif ch == '"': + in_string = False + continue + if ch == '"': + in_string = True + elif ch == "{": + depth += 1 + elif ch == "}": + depth -= 1 + if depth == 0: + candidate = text[start : i + 1] + try: + return json.loads(candidate) + except json.JSONDecodeError: + break # try the next '{' + start = text.find("{", start + 1) + + # Greedy fallback: first '{' .. last '}' across newlines. 
+ match = re.search(r"\{.*\}", text, re.DOTALL) + if match: + return json.loads(match.group(0)) + raise ValueError("no JSON object found in model response") + + +def _http_post_json(url: str, headers: dict, body: dict, timeout: int) -> dict: + """POST a JSON body and return the parsed JSON response (single attempt).""" + data = json.dumps(body).encode("utf-8") + req_headers = {"content-type": "application/json", **headers} + req = urllib.request.Request(url, data=data, headers=req_headers, method="POST") + with urllib.request.urlopen(req, timeout=timeout) as resp: + return json.loads(resp.read().decode("utf-8")) + + +def _extract_openai_text(resp: dict) -> str: + """Pull the assistant text from an OpenAI/Azure chat-completions response. + + Guards an empty ``choices`` list and a null ``message.content`` and raises a + descriptive RuntimeError (not a KeyError) when no text is present, so the + judge layer records a clean dropped-vote reason instead of a stack trace. + """ + choices = resp.get("choices") or [] + if choices: + text = (choices[0].get("message") or {}).get("content") + if text: + return text + raise RuntimeError(f"judge returned no text: {resp}") + + +class JudgeClient: + """Minimal multi-provider chat client returning parsed JSON dicts. + + Dispatch is by the provider prefix of the model spec. temperature is pinned + to 0.0 for deterministic verdicts. Transient HTTP errors (429/5xx) and URL + errors are retried up to max_retries: a 429 honours the ``Retry-After`` + header (capped at 30s) when present, otherwise backoff is exponential + (2 ** attempt seconds). + + The API key / endpoint env vars are read lazily inside chat_json, so + constructing a JudgeClient never requires a secret. 
+ """ + + def __init__(self, model: str, timeout: int = 120, max_retries: int = 3) -> None: + self.model_spec = model + self.provider, self.model = parse_model(model) + self.timeout = timeout + self.max_retries = max_retries + + def chat_json(self, system: str, user: str, max_tokens: int = 1024) -> dict: + """Send (system, user) to the provider and parse the first JSON object. + + Raises on exhausted retries / unknown provider / unparseable output. + The judge module wraps every call in try/except, so a raise here becomes + a dropped vote rather than a crash. + """ + last_exc: Exception | None = None + for attempt in range(self.max_retries): + try: + text = self._dispatch(system, user, max_tokens) + return _first_json_object(text) + except urllib.error.HTTPError as exc: + last_exc = exc + if exc.code not in _RETRY_STATUS or attempt == self.max_retries - 1: + raise + time.sleep(_retry_delay(exc, attempt)) + except (urllib.error.URLError, TimeoutError, ConnectionError) as exc: + last_exc = exc + if attempt == self.max_retries - 1: + raise + time.sleep(2**attempt) + if last_exc is not None: + raise last_exc + raise RuntimeError("chat_json exhausted retries without a response") + + def _dispatch(self, system: str, user: str, max_tokens: int) -> str: + """Route to the per-provider call and return the raw model text.""" + if self.provider == "anthropic": + return self._anthropic(system, user, max_tokens) + if self.provider == "openai": + return self._openai(system, user, max_tokens) + if self.provider == "azure": + return self._azure(system, user, max_tokens) + if self.provider == "ollama": + return self._ollama(system, user, max_tokens) + raise ValueError( + f"unknown judge provider {self.provider!r} in {self.model_spec!r}; " + "use anthropic:/openai:/azure:/ollama:" + ) + + def _anthropic(self, system: str, user: str, max_tokens: int) -> str: + api_key = _env("ANTHROPIC_API_KEY") + if not api_key: + raise RuntimeError("ANTHROPIC_API_KEY not set") + body = { + "model": 
self.model, + "max_tokens": max_tokens, + "temperature": 0.0, + "system": system, + "messages": [{"role": "user", "content": user}], + } + headers = {"x-api-key": api_key, "anthropic-version": "2023-06-01"} + resp = _http_post_json("https://api.anthropic.com/v1/messages", headers, body, self.timeout) + text = next( + (b.get("text") for b in resp.get("content", []) if b.get("type") == "text"), None + ) + if not text: + raise RuntimeError(f"judge returned no text: {resp}") + return text + + def _openai(self, system: str, user: str, max_tokens: int) -> str: + api_key = _env("OPENAI_API_KEY") + if not api_key: + raise RuntimeError("OPENAI_API_KEY not set") + body = { + "model": self.model, + "max_tokens": max_tokens, + "temperature": 0.0, + "messages": [ + {"role": "system", "content": system}, + {"role": "user", "content": user}, + ], + } + headers = {"Authorization": f"Bearer {api_key}"} + resp = _http_post_json( + "https://api.openai.com/v1/chat/completions", headers, body, self.timeout + ) + return _extract_openai_text(resp) + + def _azure(self, system: str, user: str, max_tokens: int) -> str: + endpoint = _env("AZURE_OPENAI_ENDPOINT") + api_key = _env("AZURE_OPENAI_API_KEY") + if not endpoint: + raise RuntimeError("AZURE_OPENAI_ENDPOINT not set") + if not api_key: + raise RuntimeError("AZURE_OPENAI_API_KEY not set") + api_version = _env("AZURE_OPENAI_API_VERSION") or "2024-06-01" + # Azure deployment lives in the URL path, not the JSON body. 
+ url = ( + f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/chat/completions" + f"?api-version={api_version}" + ) + body = { + "max_tokens": max_tokens, + "temperature": 0.0, + "messages": [ + {"role": "system", "content": system}, + {"role": "user", "content": user}, + ], + } + headers = {"api-key": api_key} + resp = _http_post_json(url, headers, body, self.timeout) + return _extract_openai_text(resp) + + def _ollama(self, system: str, user: str, max_tokens: int) -> str: + host = _env("OLLAMA_HOST") or "http://localhost:11434" + body = { + "model": self.model, + "stream": False, + "options": {"temperature": 0.0, "num_predict": max_tokens}, + "messages": [ + {"role": "system", "content": system}, + {"role": "user", "content": user}, + ], + } + resp = _http_post_json(f"{host.rstrip('/')}/api/chat", {}, body, self.timeout) + text = (resp.get("message") or {}).get("content") + if not text: + raise RuntimeError(f"judge returned no text: {resp}") + return text + + +class OpenAIEmbedder: + """OpenAI embeddings client over /v1/embeddings. + + Reads OPENAI_API_KEY from the environment. Default model: text-embedding-3-small. + """ + + def __init__(self, model: str = "text-embedding-3-small", timeout: int = 60) -> None: + self.model = model + self.timeout = timeout + + def embed(self, texts: list[str]) -> np.ndarray: + api_key = _env("OPENAI_API_KEY") + if not api_key: + raise RuntimeError("OPENAI_API_KEY not set") + headers = {"Authorization": f"Bearer {api_key}"} + body = {"model": self.model, "input": texts} + resp = _http_post_json("https://api.openai.com/v1/embeddings", headers, body, self.timeout) + data = resp.get("data", []) + vectors = [item["embedding"] for item in sorted(data, key=lambda x: x["index"])] + return np.asarray(vectors, dtype=np.float32) + + +class AzureOpenAIEmbedder: + """Azure OpenAI embeddings client. + + Reads AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_KEY, and optionally + AZURE_OPENAI_API_VERSION from the environment. 
The model name is the + deployment name. Default model: text-embedding-3-small. + """ + + def __init__(self, model: str = "text-embedding-3-small", timeout: int = 60) -> None: + self.model = model + self.timeout = timeout + + def embed(self, texts: list[str]) -> np.ndarray: + endpoint = _env("AZURE_OPENAI_ENDPOINT") + api_key = _env("AZURE_OPENAI_API_KEY") + if not endpoint: + raise RuntimeError("AZURE_OPENAI_ENDPOINT not set") + if not api_key: + raise RuntimeError("AZURE_OPENAI_API_KEY not set") + api_version = _env("AZURE_OPENAI_API_VERSION") or "2024-06-01" + url = ( + f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/embeddings" + f"?api-version={api_version}" + ) + headers = {"api-key": api_key} + vectors = self._embed_with_split(texts, url, headers) + return np.asarray(vectors, dtype=np.float32) + + def _embed_with_split(self, texts: list[str], url: str, headers: dict) -> list[list[float]]: + """Send texts in one request; on HTTP 400 split in half and retry each half.""" + try: + resp = _http_post_json(url, headers, {"input": texts}, self.timeout) + data = resp.get("data", []) + return [item["embedding"] for item in sorted(data, key=lambda x: x["index"])] + except urllib.error.HTTPError as exc: + if exc.code == 400 and len(texts) > 1: + mid = len(texts) // 2 + left = self._embed_with_split(texts[:mid], url, headers) + right = self._embed_with_split(texts[mid:], url, headers) + return left + right + raise + + +class OllamaEmbedder: + """Local Ollama embedding client (default model bge-m3) over /api/embeddings. + + Posts one prompt per call (the stable single-prompt form) and stacks the + returned vectors into a 2-D numpy array. Constructing it touches no network; + the host is resolved from $OLLAMA_HOST at call time. 
+ """ + + def __init__(self, model: str = "bge-m3", host: str | None = None, timeout: int = 60) -> None: + self.model = model + self.host = (host or _env("OLLAMA_HOST") or "http://localhost:11434").rstrip("/") + self.timeout = timeout + + def embed(self, texts: list[str]) -> np.ndarray: + """Embed a list of strings -> float32 ndarray of shape (len(texts), dim).""" + vectors: list[list[float]] = [] + for text in texts: + body = {"model": self.model, "prompt": text} + resp = _http_post_json(f"{self.host}/api/embeddings", {}, body, self.timeout) + vectors.append(resp["embedding"]) + return np.asarray(vectors, dtype=np.float32) + + +def build_embedder(spec: str): + """Return an ``embed_fn(list[str]) -> np.ndarray`` for an embedder spec. + + Dispatch is on the provider prefix of a ":" spec: + - "ollama" / "ollama:" -> OllamaEmbedder(model or "bge-m3").embed. + - a bare "" with no ':' -> treated as an Ollama model. + - any other provider -> NotImplementedError (the extension point). + + Add a new backend by adding a branch here. 
+ """ + if (spec or "").strip() == "ollama": # bare provider, no model -> default model + return OllamaEmbedder("bge-m3").embed + provider, model = parse_model(spec) + if provider in ("unknown", "ollama"): # bare "" or "ollama:" + return OllamaEmbedder(model or "bge-m3").embed + if provider == "openai": + return OpenAIEmbedder(model or "text-embedding-3-small").embed + if provider == "azure": + return AzureOpenAIEmbedder(model or "text-embedding-3-small").embed + raise NotImplementedError( + f"embedder backend {provider!r} not implemented yet; add it in build_embedder()" + ) + + +def cosine(a, b) -> float: + """Cosine similarity between two 1-D vectors; 0.0 if either is the zero vector.""" + a = np.asarray(a, dtype=np.float64).ravel() + b = np.asarray(b, dtype=np.float64).ravel() + na = float(np.linalg.norm(a)) + nb = float(np.linalg.norm(b)) + if na == 0.0 or nb == 0.0: + return 0.0 + return float(np.dot(a, b) / (na * nb)) diff --git a/fireflyframework_agentic/evaluation/matcher.py b/fireflyframework_agentic/evaluation/matcher.py index 2f5065df..b4d81f44 100644 --- a/fireflyframework_agentic/evaluation/matcher.py +++ b/fireflyframework_agentic/evaluation/matcher.py @@ -29,12 +29,7 @@ import numpy as np - -def cosine(a, b) -> float: - """Cosine similarity between two vectors.""" - a = np.asarray(a, dtype=float) - b = np.asarray(b, dtype=float) - return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9)) +from fireflyframework_agentic.evaluation.judge_client import cosine def tokens(text: str) -> list[str]: From 1906ede934bb82cca1b127341a2f457a66e59a3c Mon Sep 17 00:00:00 2001 From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com> Date: Thu, 18 Jun 2026 23:56:09 +0200 Subject: [PATCH 07/48] feat(evaluation): add champion tracking and flyeval CLI (#274) * feat(evaluation): add ChampionRecord and champion management functions * feat(evaluation): add run_config_snapshot for flyradar run configuration capture * feat(evaluation): add 
@dataclass
class ChampionRecord:
    """Per-corpus champion, stored as 'champion' key in baseline.json."""

    corpus: str
    run_id: str
    model_id: str
    registry_sha256: str
    scores: dict  # {metric_name: float}
    aa_noise: dict = field(default_factory=dict)  # {metric_name: noise_floor}
    is_day_zero: bool = False
    human_sign_offs: list[str] = field(default_factory=list)
    config: dict = field(default_factory=dict)  # evaluation config snapshot
    corpus_sha256: str = ""  # pin of the evidence corpus the champion was verified against

    def primary_metric(self) -> str:
        """Name of the first metric in ``scores`` (insertion order), or "" if empty."""
        return next(iter(self.scores)) if self.scores else ""

    def primary_score(self) -> float:
        """Score of the primary metric as a float; 0.0 when there are no scores."""
        return float(self.scores.get(self.primary_metric(), 0.0))


def _previous_run_id(raw: dict):
    """Return the run_id of the champion stored in a parsed baseline, or None."""
    # A null champion (post-invalidation) or a missing key both mean "no previous run".
    previous = raw.get("champion")
    return previous.get("run_id") if isinstance(previous, dict) else None


def load_champion(baseline_path: str | Path) -> ChampionRecord | None:
    """Load the current per-corpus champion from baseline.json.

    Returns None when:
      - The file does not exist (normal Day-Zero state).
      - The file exists but 'champion' is null (post-invalidation state).

    Raises KeyError when a stored champion lacks one of the required fields
    (corpus, run_id, model_id, registry_sha256).
    """
    path = Path(baseline_path)
    if not path.exists():
        return None
    raw = json.loads(path.read_text(encoding="utf-8"))
    champ_raw = raw.get("champion")
    if champ_raw is None:
        return None
    return ChampionRecord(
        corpus=champ_raw["corpus"],
        run_id=champ_raw["run_id"],
        model_id=champ_raw["model_id"],
        registry_sha256=champ_raw["registry_sha256"],
        scores=champ_raw.get("scores", {}),
        aa_noise=champ_raw.get("aa_noise", {}),
        is_day_zero=champ_raw.get("is_day_zero", False),
        human_sign_offs=champ_raw.get("human_sign_offs", []),
        config=champ_raw.get("config", {}),
        corpus_sha256=champ_raw.get("corpus_sha256", ""),
    )


def save_champion(
    baseline_path: str | Path,
    champion: ChampionRecord,
    *,
    summary: str = "",
    date: str = "",
) -> None:
    """Persist a new champion and append a promotion log entry.

    Reads the existing file if it exists, replaces its 'champion', and appends
    to its 'promotion_log' (append-only). Any other top-level keys already in
    the file are kept, matching invalidate_champion. A missing parent directory
    is created so a first (Day-Zero) save into a new corpus folder succeeds.
    """
    path = Path(baseline_path)
    raw = json.loads(path.read_text(encoding="utf-8")) if path.exists() else {}
    # "promotion_log": null is treated like a missing log rather than crashing.
    log = raw.get("promotion_log") or []

    log.append(
        {
            "date": date or "unknown",
            "from": _previous_run_id(raw),
            "to": champion.run_id,
            "label": "day-zero" if champion.is_day_zero else "promotion",
            "summary": summary,
        }
    )

    raw["champion"] = {
        "corpus": champion.corpus,
        "run_id": champion.run_id,
        "model_id": champion.model_id,
        "registry_sha256": champion.registry_sha256,
        "scores": champion.scores,
        "aa_noise": champion.aa_noise,
        "is_day_zero": champion.is_day_zero,
        "human_sign_offs": champion.human_sign_offs,
        "config": champion.config,
        "corpus_sha256": champion.corpus_sha256,
    }
    raw["promotion_log"] = log
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(raw, indent=2, ensure_ascii=False), encoding="utf-8")


def invalidate_champion(
    baseline_path: str | Path,
    *,
    reason: str,
    date: str = "",
) -> None:
    """Null out the current champion and record the invalidation reason.

    Used when a champion was locked in against an empty or tampered registry
    (the banca-cordobesa fake-100% incident). Does nothing when the baseline
    file does not exist.
    """
    path = Path(baseline_path)
    if not path.exists():
        return
    raw = json.loads(path.read_text(encoding="utf-8"))
    # "promotion_log": null is treated like a missing log rather than crashing.
    log = raw.get("promotion_log") or []
    log.append(
        {
            "date": date or "unknown",
            "from": _previous_run_id(raw),
            "to": None,
            "label": "INVALIDATED",
            "summary": reason,
        }
    )
    raw["champion"] = None
    raw["promotion_log"] = log
    path.write_text(json.dumps(raw, indent=2, ensure_ascii=False), encoding="utf-8")


def input_hash(result_dict: dict) -> str:
    """Stable 16-char SHA-256 prefix of the DiscoveryResult for provenance."""
    # sort_keys makes the digest independent of dict insertion order.
    canonical = json.dumps(result_dict, sort_keys=True, ensure_ascii=False)
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()[:16]
+ +Usage +----- + flyeval gate --result R.json --registry REG.json [--baseline B.json] [--judge-model P:M] + flyeval aa-band --results R1.json R2.json ... --registry REG.json + flyeval day-zero --result R.json --registry REG.json --baseline B.json --signoffs 2 + flyeval invalidate --baseline B.json --reason "..." + +The deterministic gates G1-G3 + G5 (human sign-off) decide the verdict: every +subcommand exits 0 on PROMOTE, 1 on HOLD. G4 (the --judge-model LLM-as-a-Judge, +on by default, --no-judge to skip) is non-blocking — it prints advisory signals +and never changes the verdict or the exit code. +""" + +from __future__ import annotations + +import argparse +import hashlib +import json +import os +import sys +from pathlib import Path + +from fireflyframework_agentic.evaluation import __version__ +from fireflyframework_agentic.evaluation.champion import ( + ChampionRecord, + invalidate_champion, + load_champion, + save_champion, +) +from fireflyframework_agentic.evaluation.corpus import load_corpus +from fireflyframework_agentic.evaluation.gates import g2_recall_precision, run_gates +from fireflyframework_agentic.evaluation.judge import run_judge +from fireflyframework_agentic.evaluation.judge_client import build_embedder +from fireflyframework_agentic.evaluation.matcher import matches +from fireflyframework_agentic.evaluation.registry import load_registry +from fireflyframework_agentic.evaluation.scorecard import render_scorecard, verdict as get_verdict +from fireflyframework_agentic.evaluation.stats import aa_band, left_skew_flag + + +def _load_json(path: str) -> dict: + return json.loads(Path(path).read_text(encoding="utf-8")) + + +def _lexical_missed_ids(result: dict, registry) -> list[str]: + """Scored (non-L3) real-item ids matched by no finding — the G2 lexical misses G4 recovers.""" + evidence_index = {ev["id"]: ev for ev in result.get("evidence_index", []) if ev.get("id")} + findings = result.get("findings", []) + scored = [i for i in registry.real_items 
if i.tier != "L3"] + return [i.id for i in scored if not any(matches(f, i, evidence_index) for f in findings)] + + +def _read_experiment_config(result_path: str) -> dict | None: + """Read the experiment_configuration.json recorded next to the run's output.json. + + The experiment config records how the run was generated; it is authored by the + generation step at run time. Evaluation only reads it for display and never + writes or overwrites it. Returns None when the run has no recorded config. + """ + path = Path(result_path).parent / "experiment_configuration.json" + if not path.exists(): + return None + return json.loads(path.read_text(encoding="utf-8")) + + +def _write_eval_config(result_path: str, config: dict) -> Path: + """Write evaluation_configuration.json next to the run's output.json. + + The evaluation config is authored by flyeval at gate time (registry/corpus SHAs, + recall metric, floors, judge settings), so unlike the experiment config it is + owned here and safe to (over)write each run. It mirrors the block embedded in + the scorecard, as a machine-readable artifact. + """ + path = Path(result_path).parent / "evaluation_configuration.json" + path.write_text(json.dumps(config, indent=2, ensure_ascii=False) + "\n", encoding="utf-8") + return path + + +def _eval_config(args, registry, corpus=None) -> dict: + """Capture the run's evaluation configuration for provenance. + + Uses getattr defaults so it works for both `gate` (has every flag) and + `day-zero` (lacks the gate-only flags, falling back to the lexical/no-judge + defaults, which honestly reflects how day-zero scores). 
+ """ + jm = getattr(args, "judge_model", None) + baseline = getattr(args, "baseline", None) + tau = getattr(args, "tau", 0.70) + return { + "evaluator_version": __version__, + "registry_sha256": registry.sha256(), + "corpus_sha256": corpus.sha256 if corpus else None, + "model_id": getattr(args, "model_id", None) or "unknown", + "gates": { + "G1": { + "name": "Structural & Safe", + "pii_list": getattr(args, "pii_list", None) or [], + "metrics": { + "empty_must_find": "registry has >=1 must-find item; guards the fake-100% " + "champion (EMPTY_MUST_FIND)", + "registry_sha256_pin": "loaded registry matches its file hash (GOLD_DRIFT)", + "corpus_sha256_pin": "corpus matches its hash when supplied (CORPUS_DRIFT)", + "schema_valid": "required top-level keys present in the result " + "(SCHEMA_INVALID)", + "pii_non_disclosure": "no corpus PII name appears in any finding/report text " + "(PII_LEAK)", + }, + }, + "G2": { + "name": "Recall & Precision", + "recall_metric": getattr(args, "recall_metric", "lexical"), + "recall_floor": getattr(args, "recall_floor", 0.70), + "tau": tau, + "tau_nc": getattr(args, "tau_nc", 0.85), + "embedder": getattr(args, "embedder", None), + "metrics": { + "lexical_recall": "token-overlap recall (always reported)", + "semantic_recall": "embedding-similarity recall at >= tau (needs embedder)", + "hybrid_recall": "per item, a lexical OR semantic match", + "per_tier_recall": "hit/total per tier L0-L3; an L0 miss blocks", + "nc_precision": "negative-control items wrongly emitted; an NC hit blocks", + "finding_redundancy_rate": "fraction of findings duplicating another's topic", + }, + }, + "G3": { + "name": "Grounded", + "grounding_floor": getattr(args, "grounding_floor", 0.90), + "human_spot_check_n": 5, + "corpus_verification": corpus is not None, + "metrics": { + "grounding_pct": "findings whose cited excerpt shares a topic token; blocks " + "below grounding_floor", + "evidence_verified": "cited excerpts located in the actual corpus " + "(when 
supplied)", + "evidence_fabricated": "populated excerpts not found in their cited source " + "(EVIDENCE_FABRICATED)", + "evidence_source_unknown": "locators resolving to no corpus document " + "(EVIDENCE_SOURCE_UNKNOWN)", + "excerpt_fill_rate": "evidence entries carrying a populated excerpt", + "source_coverage": "distinct corpus documents cited", + }, + }, + "G4": { + "name": "LLM Judge (advisory, non-blocking)", + "judge_model": jm, + "judge_runs": getattr(args, "judge_runs", 1) if jm else None, + "judge_concurrency": getattr(args, "judge_concurrency", 1) if jm else None, + "judge_temperature": 0.0 if jm else None, + "tau": tau if jm else None, + "metrics": { + "faithfulness": "each finding's claim entailed by its cited evidence", + "numeric_temporal_fidelity": "numbers and dates in findings match the evidence", + "citation_relevance": "cited evidence refs are on-topic (context precision)", + "nc_semantic_precision": "negative-control items semantically asserted", + "fabricated_entity": "named entities absent from the corpus", + "contradiction": "findings contradicting the evidence or each other", + "open_gap": "a consequential issue the output failed to surface", + "actionability": "proposed actions are specific and actionable", + "severity_calibration": "stated severity matches the evidence", + "answer_relevancy": "output addresses the workspace intention", + "source_coverage": "distinct corpus documents cited (deterministic)", + "excerpt_fill_rate": "evidence entries with a populated excerpt " + "(deterministic)", + }, + }, + "G5": { + "name": "No-regression / promotion", + "is_day_zero": baseline is None, + "human_signed_off": getattr(args, "human_signed_off", False), + "signoffs": getattr(args, "signoffs", 0), + "baseline": baseline, + "baseline_sha256": _file_sha256(baseline) if baseline else None, + "metrics": { + "improvements": "metrics beating the champion by more than the AA noise band", + "regressions": "metrics that regressed versus the champion", + 
"noise_band": "per-metric AA noise floor a candidate must exceed", + "guardrail_regression": "any guardrail metric that dropped", + "signoffs": "independent human sign-offs recorded", + }, + }, + }, + } + + +def _file_sha256(path: str) -> str | None: + """SHA-256 of a file's bytes, or None when it can't be read.""" + try: + return hashlib.sha256(Path(path).read_bytes()).hexdigest() + except OSError: + return None + + +# ── gate ────────────────────────────────────────────────────────────────────── + + +def cmd_gate(args: argparse.Namespace) -> int: + if getattr(args, "no_judge", False): + args.judge_model = None # explicit opt-out; G4 runs by default otherwise + result = _load_json(args.result) + registry = load_registry(args.registry) + corpus = load_corpus(args.corpus) if args.corpus else None + champion = load_champion(args.baseline) if args.baseline else None + champion_scores = champion.scores if champion else None + aa_noise = champion.aa_noise if champion else None + + embed_fn = build_embedder(args.embedder) if args.embedder else None + + if args.recall_metric in ("hybrid", "semantic") and embed_fn is None: + print( + f"ERROR: --recall-metric {args.recall_metric} requires --embedder.\n" + " Example: --embedder openai:text-embedding-3-small", + file=sys.stderr, + ) + return 2 + + gate_results = run_gates( + result, + registry, + args.registry, + pii_list=args.pii_list or [], + recall_floor=args.recall_floor, + grounding_floor=args.grounding_floor, + champion_scores=champion_scores, + aa_noise=aa_noise, + is_day_zero=(champion is None), + human_signed_off=args.human_signed_off, + signoff_count=args.signoffs, + embed_fn=embed_fn, + tau=args.tau, + recall_metric=args.recall_metric, + tau_nc=args.tau_nc, + corpus=corpus, + ) + + # G4 — on by default, non-blocking. Skipped only with --no-judge; never affects the verdict. 
def cmd_aa_band(args: argparse.Namespace) -> int:
    """Estimate the A/A noise band from the G2 recall of several reruns.

    Scores every path in ``args.results`` against the registry with G2 and
    prints the band plus the per-run scores. Returns 2 when the chosen recall
    metric needs an embedder that was not supplied, 1 when fewer than two runs
    produced a score, and 0 otherwise.
    """
    registry = load_registry(args.registry)

    needs_embedder = args.recall_metric in ("hybrid", "semantic")
    if needs_embedder and not args.embedder:
        print(
            f"ERROR: --recall-metric {args.recall_metric} requires --embedder.\n"
            " Example: --embedder openai:text-embedding-3-small",
            file=sys.stderr,
        )
        return 2

    embed_fn = build_embedder(args.embedder) if args.embedder else None
    corpus = load_corpus(args.corpus) if args.corpus else None

    scores: list[float] = []
    for result_path in args.results:
        g2 = g2_recall_precision(
            _load_json(result_path),
            registry,
            recall_metric=args.recall_metric,
            embed_fn=embed_fn,
            tau=args.tau,
            tau_nc=args.tau_nc,
            corpus=corpus,
        )
        # A run that failed G2 without producing a recall contributes nothing.
        if not g2.passed and g2.details.get("recall") is None:
            continue
        scores.append(g2.details.get("recall", 0.0))

    run_count = len(scores)
    if run_count < 2:
        print(
            f"ERROR: need >= 2 runs for aa_band; got {run_count}. "
            "Make sure the registry is non-empty and the runs are valid.",
            file=sys.stderr,
        )
        return 1

    band = aa_band(scores)
    high_var = left_skew_flag(scores)
    rounded = [round(s, 4) for s in scores]
    print(f"A/A noise band (95th-pct pairwise delta): {band:.4f}")
    print(f"Scores across reruns: {rounded}")
    if high_var:
        print("WARNING: HIGH_VARIANCE — min < median - 0.10. Investigate before using this band.")
    return 0
g2.details.get("recall", 0.0) + if g3: + scores["grounding_pct"] = g3.details.get("grounding_pct", 0.0) + + champion = ChampionRecord( + corpus=registry.corpus, + run_id=args.run_id or "day-zero", + model_id=args.model_id or "unknown", + registry_sha256=registry.sha256(), + scores=scores, + is_day_zero=True, + human_sign_offs=[f"signoff-{i + 1}" for i in range(args.signoffs)], + config=config, + corpus_sha256=corpus.sha256, + ) + save_champion( + args.baseline, + champion, + summary=f"Day-Zero champion for {registry.corpus}", + date=args.date or "unknown", + ) + print(f"\nDay-Zero champion saved to {args.baseline}") + + return 0 if v == "PROMOTE" else 1 + + +# ── invalidate ──────────────────────────────────────────────────────────────── + + +def cmd_invalidate(args: argparse.Namespace) -> int: + invalidate_champion(args.baseline, reason=args.reason, date=args.date or "unknown") + print(f"Champion invalidated in {args.baseline}. Reason: {args.reason}") + return 0 + + +# ── parser ──────────────────────────────────────────────────────────────────── + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="flyeval", + description="FlyRadar Lean Core eval: G1-G3 + G5 deterministic, G4 judge on by default", + ) + sub = parser.add_subparsers(dest="command", required=True) + + def _add_common(p: argparse.ArgumentParser) -> None: + p.add_argument("--result", required=True, help="Path to DiscoveryResult JSON") + p.add_argument("--registry", required=True, help="Path to lean-1 registry JSON") + p.add_argument( + "--corpus", + help="Path to the run's input.json corpus bundle — enables deterministic " + "evidence verification (required for day-zero; without it, gate runs " + "carry an EVIDENCE UNVERIFIED disclosure)", + ) + p.add_argument("--baseline", help="Path to baseline.json (per-corpus champion store)") + p.add_argument("--model-id", default="unknown") + p.add_argument("--run-id", default="run") + p.add_argument("--date", 
default="", help="ISO date for promotion log") + + # gate + p_gate = sub.add_parser("gate", help="Run the gates and print a scorecard") + _add_common(p_gate) + p_gate.add_argument("--recall-floor", type=float, default=0.70) + p_gate.add_argument("--grounding-floor", type=float, default=0.90) + p_gate.add_argument("--pii-list", nargs="*", default=[]) + p_gate.add_argument( + "--embedder", + default=os.environ.get("FLYEVAL_EMBEDDER"), + help="opt-in embedder spec for the semantic recall path " + '(e.g. "azure:text-embedding-3-small"); omit for pure-lexical recall. ' + "Env: FLYEVAL_EMBEDDER", + ) + p_gate.add_argument( + "--recall-metric", + choices=["lexical", "semantic", "hybrid"], + default=os.environ.get("FLYEVAL_RECALL_METRIC", "hybrid"), + help="which recall metric GATES (default hybrid; hybrid/semantic require --embedder). " + "Env: FLYEVAL_RECALL_METRIC", + ) + p_gate.add_argument( + "--tau", + type=float, + default=float(os.environ.get("FLYEVAL_TAU", "0.70")), + help="cosine similarity threshold for the semantic recall path (real items). " + "Env: FLYEVAL_TAU", + ) + p_gate.add_argument( + "--tau-nc", + type=float, + default=float(os.environ.get("FLYEVAL_TAU_NC", "0.85")), + help="cosine similarity threshold for NC item detection (higher; no source anchor). " + "Env: FLYEVAL_TAU_NC", + ) + p_gate.add_argument("--human-signed-off", action="store_true") + p_gate.add_argument("--signoffs", type=int, default=0) + p_gate.add_argument( + "--judge-model", + default=os.environ.get("FLYEVAL_JUDGE_MODEL", "anthropic:claude-sonnet-4-6"), + help="provider:model for the non-blocking G4 LLM-as-a-Judge (e.g. azure:gpt-4o). " + "Runs by default; pass --no-judge to skip G4. 
Env: FLYEVAL_JUDGE_MODEL", + ) + p_gate.add_argument( + "--no-judge", + action="store_true", + help="skip the G4 LLM-as-a-Judge (it runs by default).", + ) + p_gate.add_argument( + "--judge-runs", + type=int, + default=int(os.environ.get("FLYEVAL_JUDGE_RUNS", "1")), + help="G4 judge runs; the median of numeric scores is kept (odd recommended). " + "Env: FLYEVAL_JUDGE_RUNS", + ) + p_gate.add_argument( + "--judge-concurrency", + type=int, + default=int(os.environ.get("FLYEVAL_JUDGE_CONCURRENCY", "1")), + help="bounded fan-out for the per-item G4 [J] metrics (1 = sequential; " + ">=2 runs each metric's chat calls across a thread pool, order preserved). " + "Env: FLYEVAL_JUDGE_CONCURRENCY", + ) + p_gate.add_argument( + "--champion-result", + help="Path to the champion's output.json for the G4 comparative-review metric", + ) + p_gate.set_defaults(func=cmd_gate) + + # aa-band + p_aa = sub.add_parser("aa-band", help="Compute A/A noise band from champion reruns") + p_aa.add_argument( + "--results", + nargs="+", + required=True, + help="Paths to champion-rerun result JSON files (>= 2)", + ) + p_aa.add_argument("--registry", required=True) + p_aa.add_argument( + "--recall-metric", + choices=["lexical", "semantic", "hybrid"], + default="hybrid", + help="recall metric to use — must match the champion's metric (default hybrid; " + "hybrid/semantic require --embedder)", + ) + p_aa.add_argument( + "--embedder", + default=None, + help="embedder spec for semantic/hybrid recall (e.g. 
ollama:bge-m3)", + ) + p_aa.add_argument("--tau", type=float, default=0.70) + p_aa.add_argument("--tau-nc", type=float, default=0.85) + p_aa.add_argument( + "--corpus", + help="Path to input.json — must match the gate's corpus setting so the " + "band is computed under the same evidence filtering as the champion", + ) + p_aa.set_defaults(func=cmd_aa_band) + + # day-zero + p_dz = sub.add_parser("day-zero", help="Promote the inaugural champion (Day-Zero protocol)") + _add_common(p_dz) + p_dz.add_argument( + "--signoffs", + type=int, + default=0, + help="Number of independent human sign-offs collected (need 2)", + ) + p_dz.set_defaults(func=cmd_day_zero) + + # invalidate + p_inv = sub.add_parser("invalidate", help="Invalidate the current champion") + p_inv.add_argument("--baseline", required=True) + p_inv.add_argument("--reason", required=True) + p_inv.add_argument("--date", default="") + p_inv.set_defaults(func=cmd_invalidate) + + return parser + + +def main() -> None: + parser = build_parser() + args = parser.parse_args() + sys.exit(args.func(args)) + + +if __name__ == "__main__": + main() diff --git a/fireflyframework_agentic/evaluation/run_config_snapshot.py b/fireflyframework_agentic/evaluation/run_config_snapshot.py new file mode 100644 index 00000000..db543129 --- /dev/null +++ b/fireflyframework_agentic/evaluation/run_config_snapshot.py @@ -0,0 +1,160 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Capture the effective flyradar run configuration into experiment_configuration.json. + +Non-invasive snapshot: it records how a run was generated by reading what flyradar +already exposes as data — the request options the caller sent, the ``/api/v1/version`` +endpoint, ``RadarSettings``, and the prompt catalog — without modifying flyradar. The +snapshot is written next to the run's ``output.json`` at generation time, which is the +moment the configuration is known. + +This is the bridge: the durable fix is for flyradar to stamp the same config into +``DiscoveryResult`` itself (the one place that knows the effective values and cannot +drift). See the "flyradar improvements" issue. ``temperature`` and ``seed`` are not +exposed by ``RadarSettings`` and are recorded as ``null`` here. + +Usage: + cd flyradar_experiments + set -a && source .env && set +a + uv run python -m fireflyframework_agentic.evaluation.run_config_snapshot \ + --output-dir experiments/bbva_españa/runs/2026-06-12-sonnet-01 \ + --options request_options.json \ + --commit c107918 +""" +from __future__ import annotations + +import argparse +import json +import os +import urllib.request +from importlib.resources import files +from pathlib import Path + +try: + from flyradar.config import RadarSettings +except ImportError: # flyradar is an optional dependency of this snapshot. + RadarSettings = None + +#: Path of the flyradar version endpoint (whitelisted in the service middleware). +VERSION_PATH = "/api/v1/version" + +#: RadarSettings fields that define scoring / dedup behaviour, captured verbatim. 
+_SETTINGS_KEYS = ( + "model", + "fallback_model", + "duplicity_similarity_threshold", + "rootcause_cost_weight", + "rootcause_frequency_weight", + "rootcause_actionability_weight", +) + + +def fetch_version(base_url: str, *, timeout: float = 10.0) -> dict: + """GET the flyradar version endpoint; return ``{}`` on any failure.""" + url = base_url.rstrip("/") + VERSION_PATH + try: + with urllib.request.urlopen(url, timeout=timeout) as resp: + return json.loads(resp.read().decode("utf-8")) + except Exception: + return {} + + +def load_radar_settings() -> dict | None: + """Dump the scoring / dedup RadarSettings, or ``None`` if flyradar isn't importable.""" + if RadarSettings is None: + return None + settings = RadarSettings() + return {key: getattr(settings, key, None) for key in _SETTINGS_KEYS} + + +def load_prompt_versions() -> dict | None: + """Read each stage prompt's ``version`` from the flyradar prompt catalog, or ``None``.""" + try: + catalog = files("flyradar.resources.prompts") + except ModuleNotFoundError: + return None + versions: dict[str, str] = {} + for entry in catalog.iterdir(): + if not entry.name.endswith(".yaml"): + continue + for line in entry.read_text(encoding="utf-8").splitlines(): + if line.strip().startswith("version:"): + versions[entry.name[:-5]] = line.split(":", 1)[1].strip().strip('"') + break + return versions or None + + +def build_run_config( + options: dict, + *, + version: dict, + settings: dict | None, + prompt_versions: dict | None, + commit: str | None = None, +) -> dict: + """Assemble the experiment-configuration snapshot from its captured parts.""" + return { + "captured_by": "config-snapshot (non-invasive)", + "flyradar_version": version.get("version"), + "flyradar_commit": commit or version.get("commit"), + "options": options, + "settings": settings, + "prompt_versions": prompt_versions, + "temperature": None, + "seed": None, + "_note": ( + "Non-invasive snapshot captured at generation time. 
`options` is the request " + "the caller sent; `settings` and `prompt_versions` are read from flyradar when " + "importable at the deployed commit. `temperature` and `seed` are not exposed by " + "RadarSettings and are recorded as null. The durable fix is for flyradar to stamp " + "this config into DiscoveryResult (see the 'flyradar improvements' issue)." + ), + } + + +def write_snapshot(output_dir: str | Path, config: dict) -> Path: + """Write ``experiment_configuration.json`` into the run's output directory.""" + path = Path(output_dir) / "experiment_configuration.json" + path.write_text(json.dumps(config, indent=2, ensure_ascii=False) + "\n", encoding="utf-8") + return path + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description="Capture the flyradar run configuration.") + parser.add_argument("--output-dir", required=True, help="Run directory holding output.json.") + parser.add_argument( + "--options", required=True, help="JSON file of the DiscoveryRequest options that were sent." + ) + parser.add_argument( + "--base-url", default=None, help="flyradar base URL (default: $FLYRADAR_BASE_URL)." 
+ ) + parser.add_argument("--commit", default=None, help="Deployed flyradar git commit, if known.") + args = parser.parse_args(argv) + + base_url = args.base_url or os.environ.get("FLYRADAR_BASE_URL", "") + options = json.loads(Path(args.options).read_text(encoding="utf-8")) + config = build_run_config( + options, + version=fetch_version(base_url) if base_url else {}, + settings=load_radar_settings(), + prompt_versions=load_prompt_versions(), + commit=args.commit, + ) + path = write_snapshot(args.output_dir, config) + print(f"Wrote {path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From 4ab1d859d16d4ae92d6a6d3a4a283a236d25d29d Mon Sep 17 00:00:00 2001 From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com> Date: Thu, 18 Jun 2026 23:58:49 +0200 Subject: [PATCH 08/48] feat(lab): add retrieval metrics (hit@k, recall@k, MRR, MAP, nDCG) (#275) * feat(lab): add retrieval_metrics module with compute_retrieval_metrics and RetrieverMetrics * feat(lab): export RetrieverMetrics and compute_retrieval_metrics from lab package * feat(evaluation): import RetrieverMetrics and compute_retrieval_metrics from lab.retrieval_metrics --------- Co-authored-by: miguelgfierro --- .../evaluation/__init__.py | 2 +- fireflyframework_agentic/lab/__init__.py | 3 + .../lab/retrieval_metrics.py | 200 ++++++++++++++++++ 3 files changed, 204 insertions(+), 1 deletion(-) create mode 100644 fireflyframework_agentic/lab/retrieval_metrics.py diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py index 37093075..ad01980c 100644 --- a/fireflyframework_agentic/evaluation/__init__.py +++ b/fireflyframework_agentic/evaluation/__init__.py @@ -39,7 +39,7 @@ from fireflyframework_agentic.evaluation.judge_client import JudgeClient, OllamaEmbedder, build_embedder, cosine from fireflyframework_agentic.evaluation.matcher import anchored, matches, source_stem, tokens from fireflyframework_agentic.evaluation.registry 
import Registry, RegistryItem, load_registry, registry_sha256 -from fireflyframework_agentic.evaluation.retrieval import RetrieverMetrics, compute_retrieval_metrics +from fireflyframework_agentic.lab.retrieval_metrics import RetrieverMetrics, compute_retrieval_metrics from fireflyframework_agentic.evaluation.stats import aa_band, aggregate_grounding, left_skew_flag try: diff --git a/fireflyframework_agentic/lab/__init__.py b/fireflyframework_agentic/lab/__init__.py index 46cc08dc..8e127d8a 100644 --- a/fireflyframework_agentic/lab/__init__.py +++ b/fireflyframework_agentic/lab/__init__.py @@ -18,6 +18,7 @@ from fireflyframework_agentic.lab.comparison import ComparisonEntry, ModelComparison from fireflyframework_agentic.lab.dataset import EvalCase, EvalDataset from fireflyframework_agentic.lab.evaluator import EvalOrchestrator, EvalReport, EvalResult +from fireflyframework_agentic.lab.retrieval_metrics import RetrieverMetrics, compute_retrieval_metrics from fireflyframework_agentic.lab.session import LabSession, SessionEntry __all__ = [ @@ -31,5 +32,7 @@ "EvalResult", "LabSession", "ModelComparison", + "RetrieverMetrics", "SessionEntry", + "compute_retrieval_metrics", ] diff --git a/fireflyframework_agentic/lab/retrieval_metrics.py b/fireflyframework_agentic/lab/retrieval_metrics.py new file mode 100644 index 00000000..5f3e2373 --- /dev/null +++ b/fireflyframework_agentic/lab/retrieval_metrics.py @@ -0,0 +1,200 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
KS = (1, 5, 10)


def _dedup(retrieved: list[dict]) -> list[dict]:
    """Return one entry per source, first chunk wins, preserving rank order.

    A source split into many chunks can appear several times in the ranked
    list; counting every chunk would let a single gold source be credited more
    than once. Entries are keyed by ``source_id`` when present, otherwise by
    their ``identities`` joined with ``|``. The surviving entries keep their
    original ``rank`` values (ranks are not renumbered).
    """
    seen: set[str] = set()
    out: list[dict] = []
    for r in sorted(retrieved, key=lambda x: x["rank"]):
        key = r.get("source_id") or "|".join(r.get("identities", []))
        if key not in seen:
            seen.add(key)
            out.append(r)
    return out


def _ndcg(retrieved: list[dict], n_gold: int, k: int = 10) -> float:
    """Return nDCG@k for a single query (binary gain, 1-based ranks)."""
    dcg = sum(
        1.0 / math.log2(r["rank"] + 1)
        for r in retrieved
        if r.get("is_gold") and r["rank"] <= k
    )
    # Ideal ordering places every gold item (at most k of them) at the top.
    ideal = sum(1.0 / math.log2(i + 2) for i in range(min(n_gold, k)))
    return dcg / ideal if ideal else 0.0


def _ap(retrieved: list[dict], n_gold: int, k: int = 10) -> float:
    """Return average precision@k for a single query."""
    hits, precisions = 0, []
    for r in sorted(retrieved, key=lambda x: x["rank"]):
        if r["rank"] > k:
            break
        if r.get("is_gold"):
            hits += 1
            precisions.append(hits / r["rank"])
    return sum(precisions) / min(n_gold, k) if n_gold else 0.0


def compute_retrieval_metrics(results: list[dict]) -> dict:
    """Compute deterministic IR metrics over a list of retrieval result rows.

    Each element of *results* must be a dict with at least:

    * ``retrieved`` -- list of dicts with ``rank`` (int, 1-based), ``source_id``
      (str) or ``identities`` (list[str]), and ``is_gold`` (bool).
    * ``gold`` -- list of gold source identifiers (used to compute ``n_gold``).

    Optional keys per row:

    * ``no_answer`` (bool) / ``answer`` (str or ``None``) -- used for ``no_answer_rate``.
    * ``citations`` (list[dict]) -- each with ``is_gold`` (bool) for citation precision.
    * ``search_ms`` (float) / ``answer_ms`` (float) -- latency in milliseconds.

    Returns a flat dict with keys: ``n_queries``, ``hit@1``, ``hit@5``,
    ``hit@10``, ``recall@1``, ``recall@5``, ``recall@10``, ``precision@1``,
    ``precision@5``, ``precision@10``, ``mrr@10``, ``map@10``, ``ndcg@10``,
    ``no_answer_rate``, ``citation_precision``, ``mean_search_ms``,
    ``mean_answer_ms``. The ranking keys are omitted when *results* is empty.
    Each optional value is ``None`` when no row carries the data it needs;
    ``no_answer_rate`` is computed over the rows that have an ``answer`` or
    ``no_answer`` key.
    """
    n = len(results)
    agg = {f"{m}@{k}": 0.0 for k in KS for m in ("hit", "recall", "precision")}
    agg.update({"mrr@10": 0.0, "map@10": 0.0, "ndcg@10": 0.0})
    no_answer = 0
    answer_rows = 0
    cite_num = cite_den = 0.0
    search_ms: list[float] = []
    answer_ms: list[float] = []

    for row in results:
        retrieved = _dedup(row["retrieved"])
        # Guard the denominators below against rows with an empty gold list.
        n_gold = max(len(set(row["gold"])), 1)
        gold_ranks = [r["rank"] for r in retrieved if r.get("is_gold")]
        for k in KS:
            in_k = [g for g in gold_ranks if g <= k]
            agg[f"hit@{k}"] += 1.0 if in_k else 0.0
            agg[f"recall@{k}"] += len(in_k) / n_gold
            agg[f"precision@{k}"] += len(in_k) / k
        # MRR@10: a first gold hit beyond rank 10 scores 0, like the other @10 metrics.
        top_gold = [g for g in gold_ranks if g <= 10]
        agg["mrr@10"] += (1.0 / min(top_gold)) if top_gold else 0.0
        agg["map@10"] += _ap(retrieved, n_gold)
        agg["ndcg@10"] += _ndcg(retrieved, n_gold)

        # Only rows that carry answer data take part in the no-answer rate, so a
        # retrieval-only run reports None instead of a misleading 1.0.
        if "no_answer" in row or "answer" in row:
            answer_rows += 1
            # `answer` may be an explicit null; treat it like an empty answer.
            if row.get("no_answer") or not (row.get("answer") or "").strip():
                no_answer += 1
        cites = row.get("citations", [])
        if cites:
            cite_num += sum(1 for c in cites if c.get("is_gold"))
            cite_den += len(cites)
        if row.get("search_ms") is not None:
            search_ms.append(row["search_ms"])
        if row.get("answer_ms") is not None:
            answer_ms.append(row["answer_ms"])

    out = {k: round(v / n, 4) for k, v in agg.items()} if n else {}
    out["n_queries"] = n
    out["no_answer_rate"] = round(no_answer / answer_rows, 4) if answer_rows else None
    out["citation_precision"] = round(cite_num / cite_den, 4) if cite_den else None
    out["mean_search_ms"] = round(sum(search_ms) / len(search_ms)) if search_ms else None
    out["mean_answer_ms"] = round(sum(answer_ms) / len(answer_ms)) if answer_ms else None
    return out
class RetrieverMetrics(BaseModel):
    """Typed container for the metrics of one retrieval evaluation run.

    Every field corresponds to one key of the flat dict produced by
    :func:`compute_retrieval_metrics` (``hit@1`` becomes ``hit_at_1`` and so on).
    The four optional fields stay ``None`` when the result rows did not carry
    the data needed to compute them (e.g. no citations, no latency values).
    """

    n_queries: int = 0
    hit_at_1: float = 0.0
    hit_at_5: float = 0.0
    hit_at_10: float = 0.0
    recall_at_1: float = 0.0
    recall_at_5: float = 0.0
    recall_at_10: float = 0.0
    precision_at_1: float = 0.0
    precision_at_5: float = 0.0
    precision_at_10: float = 0.0
    mrr_at_10: float = 0.0
    map_at_10: float = 0.0
    ndcg_at_10: float = 0.0
    no_answer_rate: float | None = None
    citation_precision: float | None = None
    mean_search_ms: float | None = None
    mean_answer_ms: float | None = None

    @classmethod
    def from_results(cls, results: list[dict]) -> "RetrieverMetrics":
        """Score raw retrieval result rows and wrap the outcome in a model instance."""
        computed = compute_retrieval_metrics(results)

        values: dict = {"n_queries": computed.get("n_queries", 0)}
        # Ranking metrics fall back to 0.0 when absent (e.g. an empty result list).
        for family in ("hit", "recall", "precision"):
            for cutoff in (1, 5, 10):
                values[f"{family}_at_{cutoff}"] = computed.get(f"{family}@{cutoff}", 0.0)
        for family in ("mrr", "map", "ndcg"):
            values[f"{family}_at_10"] = computed.get(f"{family}@10", 0.0)
        # Optional metrics keep None when they could not be computed.
        for name in ("no_answer_rate", "citation_precision", "mean_search_ms", "mean_answer_ms"):
            values[name] = computed.get(name)
        return cls(**values)
insertions(+) create mode 100644 examples/flycanon_eval_example.py create mode 100644 examples/flyradar_eval_example.py diff --git a/examples/flycanon_eval_example.py b/examples/flycanon_eval_example.py new file mode 100644 index 00000000..9d8d071b --- /dev/null +++ b/examples/flycanon_eval_example.py @@ -0,0 +1,379 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""FlyCanon evaluation example — RAG retrieval benchmark with champion/challenger tracking. + +Demonstrates how to use ``fireflyframework_agentic.evaluation`` to replicate +the flycanon experiment evaluation workflow: + +1. Load a results JSONL file produced by a flycanon retrieval pipeline. +2. Compute deterministic IR metrics (Recall@k, Precision@k, MRR, nDCG, MAP). +3. Compare against a saved baseline to detect regression. +4. Print a formatted metrics table. +5. Offer to promote the new run to champion when it beats the baseline. + +The champion/challenger pattern mirrors the flycanon_experiments harness: +each run writes metrics to a file; ``approve`` promotes it by repointing +baseline.json. Here we replicate that flow using the framework's +``compute_retrieval_metrics`` / ``RetrieverMetrics`` API directly. 
+ +Usage:: + + # Score a results file (no baseline comparison) + python examples/flycanon_eval_example.py --results-file results.jsonl + + # Compare against a saved baseline + python examples/flycanon_eval_example.py \\ + --results-file results.jsonl \\ + --baseline baseline.json + + # Promote if better (write new champion to baseline.json) + python examples/flycanon_eval_example.py \\ + --results-file results.jsonl \\ + --baseline baseline.json \\ + --promote-if-better + +Exit codes: 0 = scored successfully, 1 = regression detected vs baseline. + +Results JSONL format +-------------------- +Each line is a JSON object representing one query's retrieval result:: + + { + "question": "What was Apple's revenue in Q4 2023?", + "gold": ["AAPL_10K_2023", "AAPL_10Q_Q4_2023"], + "retrieved": [ + {"rank": 1, "source_id": "AAPL_10K_2023", "is_gold": true}, + {"rank": 2, "source_id": "MSFT_10K_2023", "is_gold": false}, + {"rank": 3, "source_id": "AAPL_10Q_Q4_2023", "is_gold": true} + ], + "answer": "Apple's revenue in Q4 2023 was $89.5 billion.", + "no_answer": false, + "citations": [ + {"source_id": "AAPL_10K_2023", "is_gold": true} + ], + "search_ms": 142, + "answer_ms": 2310 + } + +The ``gold`` list contains the source IDs that are considered correct answers. +Each entry in ``retrieved`` must have a 1-based ``rank``, ``source_id`` (or +``identities`` list), and ``is_gold`` bool. + +Baseline JSON format +-------------------- +A flat JSON object with metric names as keys and float values:: + + { + "ndcg@10": 0.7234, + "mrr@10": 0.6891, + "recall@10": 0.8120, + "hit@10": 0.9100, + "map@10": 0.6543, + "n_queries": 200 + } + +This is the same format written by ``--promote-if-better``. 
+""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + +from fireflyframework_agentic.evaluation import RetrieverMetrics, compute_retrieval_metrics + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +# Metrics that form the primary quality signal for champion/challenger +# comparisons. These are listed in priority order: nDCG@10 is the primary +# ranking metric; MRR@10 measures how quickly the first gold result appears; +# Recall@10 measures overall coverage; Hit@10 measures binary success rate; +# MAP@10 measures precision across the ranked list. +PRIMARY_METRICS = ["ndcg@10", "mrr@10", "recall@10", "hit@10", "map@10"] + +# Regression threshold: a metric must drop by more than this fraction of its +# baseline value to be flagged as a regression (guards against noise). +REGRESSION_THRESHOLD = 0.01 + + +def _load_jsonl(path: str) -> list[dict]: + """Load a newline-delimited JSON file, one object per line.""" + lines = Path(path).read_text(encoding="utf-8").strip().splitlines() + return [json.loads(line) for line in lines if line.strip()] + + +def _load_baseline(path: str) -> dict | None: + """Load a baseline JSON file, returning None if it does not exist.""" + p = Path(path) + if not p.exists(): + return None + return json.loads(p.read_text(encoding="utf-8")) + + +def _save_baseline(path: str, metrics: dict) -> None: + """Write a flat metrics dict to the baseline JSON file.""" + Path(path).write_text(json.dumps(metrics, indent=2, ensure_ascii=False) + "\n", encoding="utf-8") + + +def _metrics_to_flat(m: RetrieverMetrics) -> dict: + """Convert a RetrieverMetrics model to the flat dict stored in baseline.json.""" + return { + "n_queries": m.n_queries, + "hit@1": m.hit_at_1, + "hit@5": m.hit_at_5, + "hit@10": m.hit_at_10, + "recall@1": m.recall_at_1, + "recall@5": m.recall_at_5, + 
"recall@10": m.recall_at_10, + "precision@1": m.precision_at_1, + "precision@5": m.precision_at_5, + "precision@10": m.precision_at_10, + "mrr@10": m.mrr_at_10, + "map@10": m.map_at_10, + "ndcg@10": m.ndcg_at_10, + "no_answer_rate": m.no_answer_rate, + "citation_precision": m.citation_precision, + "mean_search_ms": m.mean_search_ms, + "mean_answer_ms": m.mean_answer_ms, + } + + +def _print_metrics_table(metrics: RetrieverMetrics, baseline: dict | None) -> None: + """Print a formatted table comparing current metrics vs baseline.""" + flat = _metrics_to_flat(metrics) + + col_w = 22 + num_w = 10 + header = f"{'Metric':<{col_w}} {'Current':>{num_w}}" + if baseline: + header += f" {'Baseline':>{num_w}} {'Delta':>{num_w}}" + print(header) + print("-" * (col_w + num_w + (num_w * 2 + 2 if baseline else 0))) + + for key, value in flat.items(): + if value is None: + continue + # Format floats as 4 decimal places; ints as plain integers. + if isinstance(value, float): + cur_str = f"{value:.4f}" + else: + cur_str = str(value) + + row = f"{key:<{col_w}} {cur_str:>{num_w}}" + if baseline and key in baseline and isinstance(value, float): + base_val = baseline[key] + delta = value - base_val + delta_str = f"{delta:+.4f}" + row += f" {base_val:>{num_w}.4f} {delta_str:>{num_w}}" + print(row) + + print() + + +def _detect_regressions(flat: dict, baseline: dict) -> list[str]: + """Return the names of primary metrics that regressed vs baseline. + + A regression is flagged when the new value drops by more than + REGRESSION_THRESHOLD * baseline_value (relative threshold). This + guards against flagging noise as a regression. 
def _beats_baseline(flat: dict, baseline: dict) -> bool:
    """Return True if the new metrics strictly beat the baseline.

    'Beats' means no primary metric has regressed beyond REGRESSION_THRESHOLD
    AND at least one primary metric is strictly higher than its baseline
    value. A run that merely equals the baseline does not beat it.

    Args:
        flat: Flat metrics dict of the current run.
        baseline: Flat metrics dict of the stored champion.
    """
    if _detect_regressions(flat, baseline):
        return False
    # Check for at least one improvement.
    for key in PRIMARY_METRICS:
        new_val = flat.get(key)
        base_val = baseline.get(key)
        if new_val is not None and base_val is not None and new_val > base_val:
            return True
    return False


# ---------------------------------------------------------------------------
# Main evaluation flow
# ---------------------------------------------------------------------------


def run_evaluation(args: argparse.Namespace) -> int:
    """Run retrieval metric scoring and optional champion/challenger comparison.

    Args:
        args: Parsed CLI namespace; reads ``results_file``, ``baseline`` and
            ``promote_if_better``.

    Returns:
        Process exit code: 0 when the final verdict is PROMOTE, 1 when the
        verdict is HOLD (a primary metric regressed) or the results file
        contains no records.
    """

    # ------------------------------------------------------------------
    # Step 1 — Load results from the JSONL file.
    #
    # Each line is one query's retrieval result. The file is produced by
    # a flycanon pipeline run (runner.run_queries writes results.jsonl).
    # ------------------------------------------------------------------
    print(f"Loading results : {args.results_file}")
    results = _load_jsonl(args.results_file)
    print(f" {len(results)} query results loaded.")

    if not results:
        print("ERROR: results file is empty.", file=sys.stderr)
        return 1

    # ------------------------------------------------------------------
    # Step 2 — Compute deterministic IR metrics.
    #
    # RetrieverMetrics.from_results() wraps the computed metrics into a
    # typed model for convenient attribute access.
    #
    # Metrics are computed at cut-offs k ∈ {1, 5, 10} and include:
    #   hit@k        -- at least one gold doc in top-k (binary)
    #   recall@k     -- fraction of gold docs found in top-k
    #   precision@k  -- fraction of top-k that are gold
    #   mrr@10       -- mean reciprocal rank of first gold hit
    #   map@10       -- mean average precision
    #   ndcg@10      -- normalised discounted cumulative gain
    # ------------------------------------------------------------------
    print("\nComputing retrieval metrics ...")
    metrics = RetrieverMetrics.from_results(results)

    print(f" nDCG@10 : {metrics.ndcg_at_10:.4f}")
    print(f" MRR@10 : {metrics.mrr_at_10:.4f}")
    print(f" Recall@10 : {metrics.recall_at_10:.4f}")
    print(f" Hit@10 : {metrics.hit_at_10:.4f}")
    print(f" MAP@10 : {metrics.map_at_10:.4f}")

    # ------------------------------------------------------------------
    # Step 3 — Load the baseline (champion) for regression detection.
    # ------------------------------------------------------------------
    baseline = None
    if args.baseline:
        baseline = _load_baseline(args.baseline)
        if baseline:
            print(f"\nLoaded baseline : {args.baseline}")
        else:
            print(f"\nNo baseline found at {args.baseline} — first run, no comparison.")

    # ------------------------------------------------------------------
    # Step 4 — Print the full metrics table.
    # ------------------------------------------------------------------
    print("\n" + "=" * 56)
    print("Retrieval Metrics")
    print("=" * 56)
    _print_metrics_table(metrics, baseline)

    # ------------------------------------------------------------------
    # Step 5 — Regression check.
    #
    # Compare against the baseline on primary metrics. A regression always
    # yields a HOLD verdict and exit code 1, with or without
    # --promote-if-better. The comparison is computed once and reused below
    # so the printed verdict and the exit code cannot disagree.
    # ------------------------------------------------------------------
    flat = _metrics_to_flat(metrics)
    regressions: list[str] = []
    better = False

    if baseline:
        regressions = _detect_regressions(flat, baseline)
        if regressions:
            print(f"REGRESSION detected on: {', '.join(regressions)}")
            print(f" Threshold: {REGRESSION_THRESHOLD * 100:.0f}% relative drop on any primary metric.")
        else:
            better = _beats_baseline(flat, baseline)
            if better:
                print("Challenger BEATS baseline on at least one primary metric.")
            else:
                print("Challenger is on-par with baseline (no regression, no improvement).")

    if regressions and not args.promote_if_better:
        print("\nVerdict: HOLD — regression detected. Tune the pipeline and re-run.")
        return 1

    # ------------------------------------------------------------------
    # Step 6 — Champion promotion.
    #
    # When --promote-if-better is set, save the new metrics as the champion
    # if there is no usable baseline yet (missing or empty file) or the run
    # strictly beats the baseline. Future runs compare against this record.
    # ------------------------------------------------------------------
    if args.promote_if_better and args.baseline:
        if not baseline or better:
            _save_baseline(args.baseline, flat)
            print(f"\nChampion PROMOTED — metrics saved to {args.baseline}")
        else:
            print("\nNot promoted — challenger did not beat baseline on primary metrics.")

    if regressions:
        # Reached only with --promote-if-better: still a HOLD, still exit 1.
        print("\nVerdict: HOLD")
        return 1

    print("\nVerdict: PROMOTE")
    return 0
def main() -> None:
    """CLI entry point: parse the command line, run the evaluation, and exit with its code."""
    namespace = build_parser().parse_args()
    exit_code = run_evaluation(namespace)
    sys.exit(exit_code)
Run gates G1-G5 to produce a structured verdict: + G1 -- Structural & Safe (schema validity, PII, empty-registry guard). + G2 -- Recall & Precision (must-find recall floor, NC precision). + G3 -- Grounded (finding-to-evidence anchoring). + G4 -- LLM-as-a-Judge (advisory only; never blocks promotion). + G5 -- No-regression / promotion (champion/challenger comparison). +4. Render a human-readable scorecard and print the final verdict. +5. Promote the challenger to champion when the verdict is PROMOTE. + +Usage:: + + # Minimal: deterministic gates only (no G4 judge, no baseline) + python examples/flyradar_eval_example.py \\ + --result output.json \\ + --registry registry.json + + # With corpus verification and a champion baseline + python examples/flyradar_eval_example.py \\ + --result output.json \\ + --registry registry.json \\ + --baseline baseline.json \\ + --corpus input.json + + # With the advisory G4 LLM judge (requires API key in environment) + FLYEVAL_JUDGE_MODEL=anthropic:claude-sonnet-4-6 \\ + python examples/flyradar_eval_example.py \\ + --result output.json \\ + --registry registry.json \\ + --judge-model anthropic:claude-sonnet-4-6 + +Exit codes: 0 = PROMOTE, 1 = HOLD. + +Input file formats +------------------ +``--result`` (output.json) + A DiscoveryResult JSON produced by a flyradar pipeline run. Must contain + at minimum ``findings`` (list) and ``evidence_index`` (list). + +``--registry`` (registry.json) + A lean-1 registry JSON. Each item has ``id``, ``tier`` (L0-L3), ``title``, + ``description``, and ``nc`` (bool, True for negative controls). + +``--baseline`` (baseline.json) + A ChampionRecord JSON written by a previous PROMOTE run. When omitted the + gate runs in day-zero mode (G5 always passes and a new champion is minted). + +``--corpus`` (input.json) + The corpus bundle used during the run. When supplied, G3 verifies that cited + evidence excerpts actually appear in the corpus documents. 
+""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + +from fireflyframework_agentic.evaluation import ( + ChampionRecord, + GateResult, + build_embedder, + load_champion, + load_corpus, + load_registry, + render_scorecard, + run_gates, + run_judge, + save_champion, + verdict, + VERDICT_PROMOTE, +) +from fireflyframework_agentic.evaluation.models import EvalConfig + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _load_json(path: str) -> dict: + """Read a JSON file and return its contents as a dict.""" + return json.loads(Path(path).read_text(encoding="utf-8")) + + +def _lexical_missed_ids(result: dict, registry) -> list[str]: + """Return the IDs of registry items not matched by any finding (lexically). + + The G4 judge uses these to focus its coverage checks on items that + lexical recall missed — the places where semantic recovery matters most. + """ + from fireflyframework_agentic.evaluation.matcher import matches + + evidence_index = {ev["id"]: ev for ev in result.get("evidence_index", []) if ev.get("id")} + findings = result.get("findings", []) + # L3 items are informational-only and are never scored. + scored_items = [item for item in registry.real_items if item.tier != "L3"] + return [ + item.id + for item in scored_items + if not any(matches(f, item, evidence_index) for f in findings) + ] + + +# --------------------------------------------------------------------------- +# Main evaluation flow +# --------------------------------------------------------------------------- + + +def run_evaluation(args: argparse.Namespace) -> int: + """Run the full flyradar gate evaluation and return an exit code.""" + + # ------------------------------------------------------------------ + # Step 1 — Load inputs. 
+ # ------------------------------------------------------------------ + print(f"Loading result : {args.result}") + result = _load_json(args.result) + + print(f"Loading registry : {args.registry}") + registry = load_registry(args.registry) + print(f" {len(registry.real_items)} real items, {len(registry.nc_items)} NC items") + + # The EvalConfig captures provenance for the run record. + config = EvalConfig( + model_id=args.model_id, + corpus=registry.corpus, + run_id=args.run_id, + registry_path=args.registry, + corpus_path=args.corpus or "", + baseline_path=args.baseline or "", + judge_model=args.judge_model or "", + ) + + # Optional: corpus bundle for deterministic evidence verification (G3). + corpus = None + if args.corpus: + print(f"Loading corpus : {args.corpus}") + corpus = load_corpus(args.corpus) + + # Optional: champion record for regression detection (G5). + champion = None + champion_scores = None + aa_noise = None + if args.baseline: + print(f"Loading baseline : {args.baseline}") + champion = load_champion(args.baseline) + if champion: + champion_scores = champion.scores + aa_noise = champion.aa_noise + print(f" Champion run : {champion.run_id} ({champion.model_id})") + else: + print(" No champion found — running in day-zero mode.") + + # Optional: embedder for semantic/hybrid recall (G2). + embed_fn = None + if args.embedder: + print(f"Building embedder: {args.embedder}") + embed_fn = build_embedder(args.embedder) + + print() + + # ------------------------------------------------------------------ + # Step 2 — Run deterministic gates G1-G3 + G5. + # + # run_gates() returns a list of GateResult objects, one per gate. 
+ # Each GateResult carries: + # .gate -- "G1" | "G2" | "G3" | "G5" + # .passed -- bool + # .details -- dict with per-metric values + # .errors -- list[str] of blocking error codes + # ------------------------------------------------------------------ + print("Running gates G1-G3 + G5 ...") + gate_results: list[GateResult] = run_gates( + result, + registry, + args.registry, + pii_list=args.pii_list or [], + recall_floor=args.recall_floor, + grounding_floor=args.grounding_floor, + champion_scores=champion_scores, + aa_noise=aa_noise, + is_day_zero=(champion is None), + human_signed_off=args.human_signed_off, + signoff_count=args.signoffs, + embed_fn=embed_fn, + tau=args.tau, + recall_metric=args.recall_metric, + tau_nc=args.tau_nc, + corpus=corpus, + ) + + # Quick gate summary before the full scorecard. + for gr in gate_results: + status = "PASS" if gr.passed else "FAIL" + print(f" {gr.gate}: {status}") + + # ------------------------------------------------------------------ + # Step 3 — Run the advisory G4 LLM-as-a-Judge (optional). + # + # G4 is non-blocking: it never changes the verdict or exit code. + # It produces an AdvisoryReport with per-finding quality signals + # (faithfulness, citation relevance, fabricated entities, etc.). + # ------------------------------------------------------------------ + advisory = None + if args.judge_model: + print(f"\nRunning G4 judge ({args.judge_model}) ...") + missed_ids = _lexical_missed_ids(result, registry) + advisory = run_judge( + result, + registry, + judge_model=args.judge_model, + runs=args.judge_runs, + concurrency=args.judge_concurrency, + pipeline_model=args.model_id, + embed_fn=embed_fn, + tau=args.tau, + lexical_missed_ids=missed_ids, + ) + print(f" Judge completed ({args.judge_runs} run(s)).") + else: + print("\nG4 judge skipped (pass --judge-model to enable).") + + # ------------------------------------------------------------------ + # Step 4 — Render the scorecard. 
+ # + # render_scorecard() produces a markdown-formatted human-readable + # report that mirrors the output of `flyeval gate` in the playground. + # ------------------------------------------------------------------ + print() + scorecard = render_scorecard( + gate_results, + corpus=registry.corpus, + model_id=config.model_id, + run_id=config.run_id, + is_self_graded=True, + kappa_advisory=registry.is_kappa_advisory(), + evidence_unverified=(corpus is None), + advisory=advisory, + ) + print(scorecard) + + # ------------------------------------------------------------------ + # Step 5 — Inspect the verdict and handle promotion. + # + # verdict() returns "PROMOTE" or "HOLD" based on the gate results. + # On PROMOTE, save the challenger as the new champion so future runs + # can detect regressions against this baseline. + # ------------------------------------------------------------------ + v = verdict(gate_results) + print(f"\nFinal verdict: {v}") + + if v == VERDICT_PROMOTE and args.baseline: + # Extract the key scores from G2 and G3 to store in the champion record. + g2 = next((g for g in gate_results if g.gate == "G2"), None) + g3 = next((g for g in gate_results if g.gate == "G3"), None) + scores: dict[str, float] = {} + if g2: + scores["recall"] = g2.details.get("recall", 0.0) + if g3: + scores["grounding_pct"] = g3.details.get("grounding_pct", 0.0) + + new_champion = ChampionRecord( + corpus=registry.corpus, + run_id=config.run_id, + model_id=config.model_id, + registry_sha256=registry.sha256(), + scores=scores, + is_day_zero=(champion is None), + ) + save_champion( + args.baseline, + new_champion, + summary=f"Promoted by flyradar_eval_example.py — {config.run_id}", + ) + print(f"Champion saved to {args.baseline}") + + # Exit 0 = PROMOTE, 1 = HOLD (mirrors `flyeval gate` convention). 
def build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the FlyRadar gate evaluation example.

    Options are declared as a table and registered in table order, which is
    also the order they appear in ``--help``.
    """
    parser = argparse.ArgumentParser(
        prog="flyradar_eval_example",
        description="FlyRadar gate evaluation — replicates the flyeval gate workflow.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    option_specs = (
        # Required inputs.
        ("--result", {"required": True, "help": "Path to DiscoveryResult JSON."}),
        ("--registry", {"required": True, "help": "Path to lean-1 registry JSON."}),
        # Optional inputs.
        (
            "--baseline",
            {"help": "Path to baseline.json (champion store). When absent, runs in day-zero mode."},
        ),
        (
            "--corpus",
            {"help": "Path to input.json corpus bundle for deterministic evidence verification (G3)."},
        ),
        # Run metadata.
        ("--model-id", {"default": "unknown", "help": "Model identifier for the scorecard."}),
        ("--run-id", {"default": "example-run", "help": "Run identifier for the scorecard."}),
        # Gate thresholds.
        (
            "--recall-floor",
            {"type": float, "default": 0.70, "help": "Minimum recall required for G2 to pass."},
        ),
        (
            "--grounding-floor",
            {
                "type": float,
                "default": 0.90,
                "help": "Minimum grounding percentage required for G3 to pass.",
            },
        ),
        (
            "--recall-metric",
            {
                "choices": ["lexical", "semantic", "hybrid"],
                "default": "lexical",
                "help": "Recall metric used by G2. 'semantic' and 'hybrid' require --embedder.",
            },
        ),
        (
            "--tau",
            {
                "type": float,
                "default": 0.70,
                "help": "Cosine similarity threshold for semantic recall (real items).",
            },
        ),
        (
            "--tau-nc",
            {
                "type": float,
                "default": 0.85,
                "help": "Cosine similarity threshold for NC item detection.",
            },
        ),
        ("--pii-list", {"nargs": "*", "default": [], "help": "PII tokens to check for in findings."}),
        ("--human-signed-off", {"action": "store_true", "help": "Mark this run as human-reviewed."}),
        ("--signoffs", {"type": int, "default": 0, "help": "Number of human sign-offs collected."}),
        # G4 judge options.
        (
            "--judge-model",
            {
                "default": None,
                "help": (
                    "Provider:model string for the advisory G4 LLM judge "
                    "(e.g. 'anthropic:claude-sonnet-4-6'). Omit to skip G4."
                ),
            },
        ),
        (
            "--judge-runs",
            {
                "type": int,
                "default": 1,
                "help": "Number of judge calls to aggregate (odd number recommended for median).",
            },
        ),
        (
            "--judge-concurrency",
            {
                "type": int,
                "default": 1,
                "help": "Thread fan-out for per-item G4 metrics (1 = sequential).",
            },
        ),
        # Embedder for semantic recall.
        (
            "--embedder",
            {
                "default": None,
                "help": "Embedder spec for semantic/hybrid recall (e.g. 'ollama:bge-m3').",
            },
        ),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    return parser
def _make_champion(**overrides) -> ChampionRecord:
    """Build a ChampionRecord from fixed test values, with keyword overrides applied on top."""
    fields = {
        "corpus": "test-corpus",
        "run_id": "run-2026-01",
        "model_id": "claude-sonnet-4-5",
        "registry_sha256": "abc123",
        "scores": {"recall": 0.85, "grounding_pct": 0.92},
        "aa_noise": {"recall": 0.02},
        "is_day_zero": False,
        "human_sign_offs": ["reviewer-1"],
    }
    # Later keys win, so any override replaces the fixed value of the same name.
    return ChampionRecord(**{**fields, **overrides})
def test_save_champion_label_is_day_zero_when_flag_set(tmp_path):
    """Saving a champion flagged is_day_zero=True logs its first entry with the "day-zero" label."""
    store = tmp_path / "baseline.json"
    save_champion(store, _make_champion(is_day_zero=True))
    first_entry = json.loads(store.read_text(encoding="utf-8"))["promotion_log"][0]
    assert first_entry["label"] == "day-zero"
def test_input_hash_is_deterministic():
    """Hashing the same payload twice yields the same digest."""
    payload = {"process_graph": {"processes": []}, "findings": []}
    first = input_hash(payload)
    second = input_hash(payload)
    assert first == second
def test_verdict_promote_when_all_pass_and_g5_present():
    """Passing G1, G2, G3 and G5 results together produce a PROMOTE verdict."""
    all_passing = [GateResult(gate=label, passed=True) for label in ("G1", "G2", "G3", "G5")]
    assert verdict(all_passing) == "PROMOTE"
def test_render_scorecard_includes_all_gate_lines():
    """The rendered scorecard contains a bracketed label for every gate passed in."""
    labels = ("G1", "G2", "G3", "G5")
    rendered = render_scorecard([GateResult(gate=label, passed=True) for label in labels])
    for label in labels:
        assert f"[{label}]" in rendered
test_g5_day_zero_insufficient_signoffs(): + result = g5_no_regression( + candidate_scores={"recall": 0.85}, + champion_scores=None, + aa_noise=None, + is_day_zero=True, + human_signed_off=False, + signoff_count=1, + ) + assert result.passed is False + assert result.reason_code == "HOLD" + + +def test_g5_day_zero_sufficient_signoffs(): + result = g5_no_regression( + candidate_scores={"recall": 0.85}, + champion_scores=None, + aa_noise=None, + is_day_zero=True, + human_signed_off=False, + signoff_count=2, + ) + assert result.passed is True + assert result.details["day_zero"] is True + + +def test_g5_hold_when_no_human_signoff(): + result = g5_no_regression( + candidate_scores={"recall": 0.90}, + champion_scores={"recall": 0.80}, + aa_noise={"recall": 0.02}, + human_signed_off=False, + ) + assert result.passed is False + assert result.reason_code == "HOLD" + + +def test_g5_hold_when_regression_beyond_band(): + # Candidate recall 0.75 vs champion 0.80; delta=-0.05 < -band=-0.02. + result = g5_no_regression( + candidate_scores={"recall": 0.75}, + champion_scores={"recall": 0.80}, + aa_noise={"recall": 0.02}, + human_signed_off=True, + ) + assert result.passed is False + assert result.reason_code == "HOLD" + assert any("recall" in r for r in result.details["regressions"]) + + +def test_g5_promote_when_candidate_beats_champion(): + result = g5_no_regression( + candidate_scores={"recall": 0.90}, + champion_scores={"recall": 0.80}, + aa_noise={"recall": 0.02}, + human_signed_off=True, + ) + assert result.passed is True + assert result.details["improvements"] + + +def test_g5_promote_when_within_noise_band(): + # delta = 0.01 — positive but within band of 0.02; counts as no regression, no improvement. 
+ result = g5_no_regression( + candidate_scores={"recall": 0.81}, + champion_scores={"recall": 0.80}, + aa_noise={"recall": 0.02}, + human_signed_off=True, + ) + assert result.passed is True + assert result.details["improvements"] == [] + + +def test_g5_verdict_constants(): + assert Verdict.PROMOTE == "PROMOTE" + assert Verdict.HOLD == "HOLD" diff --git a/tests/unit/evaluation/test_matcher.py b/tests/unit/evaluation/test_matcher.py new file mode 100644 index 00000000..cc87564b --- /dev/null +++ b/tests/unit/evaluation/test_matcher.py @@ -0,0 +1,221 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Unit tests for evaluation.matcher: anchored, source_stem, tokens, matches.""" + +from __future__ import annotations + +import pytest + +from fireflyframework_agentic.evaluation.matcher import ( + anchored, + matches, + source_stem, + tokens, +) +from fireflyframework_agentic.evaluation.registry import RegistryItem + + +# ── tokens ─────────────────────────────────────────────────────────────────── + + +def test_tokens_basic(): + result = tokens("Hello World") + assert result == ["hello", "world"] + + +def test_tokens_lowercases(): + result = tokens("KYC AML PEP") + assert result == ["kyc", "aml", "pep"] + + +def test_tokens_strips_punctuation(): + result = tokens("risk-management: cost (FTE).") + assert "risk" in result + assert "management" in result + assert "cost" in result + assert "fte" in result + + +def test_tokens_empty_string(): + assert tokens("") == [] + + +def test_tokens_numbers_included(): + result = tokens("case-id CU-2026-1003") + assert "2026" in result or "cu" in result + + +def test_tokens_unicode(): + result = tokens("análisis de crédito") + assert "análisis" in result or "an" in result + + +# ── anchored ───────────────────────────────────────────────────────────────── + + +def test_anchored_overlapping_long_token(): + # "underwriting" is 12 chars — well above the 5-char floor. + assert anchored("credit underwriting risk", "underwriting process steps") is True + + +def test_anchored_no_overlap(): + # No token >= 5 chars shared between claim and evidence. + assert anchored("cat sat", "dog ran") is False + + +def test_anchored_short_tokens_ignored(): + # All tokens in both strings are < 5 chars; no overlap counts. + assert anchored("a big cat", "a big dog") is False + + +def test_anchored_mixed_lengths_match(): + # "kyc" is < 5, but "compliance" is long enough. + assert anchored("kyc compliance review", "compliance framework") is True + + +def test_anchored_custom_min_token(): + # Lower the floor so short tokens can anchor. 
+ assert anchored("kyc check", "kyc process", min_token=3) is True + + +def test_anchored_both_empty(): + assert anchored("", "") is False + + +def test_anchored_partial_token_no_match(): + # "risk" (4 chars) is below the default 5-char floor. + assert anchored("risk alert", "risk factor") is False + + +def test_anchored_returns_bool(): + result = anchored("credit underwriting", "underwriting model") + assert isinstance(result, bool) + + +# ── source_stem ─────────────────────────────────────────────────────────────── + + +def test_source_stem_bare_filename_with_extension(): + assert source_stem("SOP-002-kyc-edd.md") == "sop-002-kyc-edd" + + +def test_source_stem_directory_prefixed(): + assert source_stem("sops/SOP-002-kyc-edd.md") == "sop-002-kyc-edd" + + +def test_source_stem_deep_path_prefix(): + assert source_stem("docs/policies/SOP-002-kyc-edd.md") == "sop-002-kyc-edd" + + +def test_source_stem_lowercase(): + # Stems are always lowercased. + assert source_stem("REPORT-FINAL.pdf") == "report-final" + + +def test_source_stem_event_log_row_id(): + # src-: → process stem. + assert source_stem("src-credit-underwriting:CU-2026-1003") == "credit-underwriting" + + +def test_source_stem_event_log_row_id_preserves_hyphens(): + assert source_stem("src-kyc-onboarding:KYC-001") == "kyc-onboarding" + + +def test_source_stem_strips_fragment(): + # #page=N should be removed before stemming. + assert source_stem("docs/report.pdf#page=5") == "report" + + +def test_source_stem_strips_anchor(): + assert source_stem("sops/SOP-001.md#section-3") == "sop-001" + + +def test_source_stem_bare_no_extension(): + # No extension, no directory — stem is just the lowercase name. 
+ assert source_stem("my-document") == "my-document" + + +def test_source_stem_no_directory_no_extension_lowercase(): + assert source_stem("Signal") == "signal" + + +def test_source_stem_csv_extension(): + assert source_stem("activity-cost-fte.csv") == "activity-cost-fte" + + +# ── matches ─────────────────────────────────────────────────────────────────── + + +def _make_item(description: str, evidence: list[str], keywords: list[str] | None = None) -> RegistryItem: + """Construct a minimal RegistryItem for matching tests.""" + return RegistryItem( + id="test-item", + tier="L1", + description=description, + evidence=evidence, + scope="finding", + keywords=keywords or [], + ) + + +def _make_finding(title: str, description: str, evidence_id: str) -> dict: + return { + "title": title, + "description": description, + "evidence_refs": [{"evidence_id": evidence_id}], + } + + +def _make_evidence_index(evidence_id: str, locator: str, excerpt: str = "") -> dict: + return {evidence_id: {"id": evidence_id, "locator": locator, "excerpt": excerpt}} + + +def test_matches_true_when_source_and_topic_match(): + # Finding title shares a long token with item description and cites the same source. + item = _make_item("credit underwriting process", ["sop-kyc-credit.md"]) + finding = _make_finding("credit underwriting review", "credit underwriting risk assessment", "ev-1") + evidence_index = _make_evidence_index("ev-1", "sop-kyc-credit.md") + assert matches(finding, item, evidence_index, scope="finding") is True + + +def test_matches_false_when_source_differs(): + # Token match exists but sources don't overlap — anti-gaming guard fires. 
+ item = _make_item("credit underwriting process", ["sop-credit.md"]) + finding = _make_finding("credit underwriting review", "credit underwriting details", "ev-1") + evidence_index = _make_evidence_index("ev-1", "other-document.md") + assert matches(finding, item, evidence_index, scope="finding") is False + + +def test_matches_false_when_no_token_overlap(): + # Same source, but no shared long token between finding text and item description. + item = _make_item("regulatory capital requirement", ["sop-capital.md"]) + finding = _make_finding("kyc identity check", "client onboarding steps", "ev-1") + evidence_index = _make_evidence_index("ev-1", "sop-capital.md") + assert matches(finding, item, evidence_index, scope="finding") is False + + +def test_matches_keyword_rail_short_token(): + # "KYC" is 3 chars — below the 5-char token floor but valid as a keyword. + item = _make_item("some description about identity", ["sop-kyc.md"], keywords=["KYC"]) + finding = _make_finding("KYC onboarding", "KYC onboarding process", "ev-1") + evidence_index = _make_evidence_index("ev-1", "sop-kyc.md") + assert matches(finding, item, evidence_index, scope="finding") is True + + +def test_matches_empty_evidence_refs_returns_false(): + # Finding with no evidence refs cannot share a source with any item. + item = _make_item("credit underwriting", ["sop-credit.md"]) + finding = {"title": "credit underwriting", "description": "credit underwriting risk", "evidence_refs": []} + assert matches(finding, item, {}, scope="finding") is False diff --git a/tests/unit/evaluation/test_stats.py b/tests/unit/evaluation/test_stats.py new file mode 100644 index 00000000..9523be8c --- /dev/null +++ b/tests/unit/evaluation/test_stats.py @@ -0,0 +1,183 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for evaluation.stats: aa_band, aggregate_grounding, left_skew_flag.""" + +from __future__ import annotations + +import pytest + +from fireflyframework_agentic.evaluation.stats import ( + aa_band, + aggregate_grounding, + left_skew_flag, +) + + +# ── aa_band ────────────────────────────────────────────────────────────────── + + +def test_aa_band_two_identical_scores(): + # Two identical scores produce zero pairwise delta. + assert aa_band([0.80, 0.80]) == 0.0 + + +def test_aa_band_two_different_scores(): + # Single delta = |0.90 - 0.80| = 0.10; 95th percentile of one value is that value. + result = aa_band([0.80, 0.90]) + assert abs(result - 0.10) < 1e-9 + + +def test_aa_band_three_scores_known_deltas(): + # Scores: 0.70, 0.80, 0.90 + # Pairwise deltas: |0.70-0.80|=0.10, |0.70-0.90|=0.20, |0.80-0.90|=0.10 + # Sorted: [0.10, 0.10, 0.20] → 95th pct index = int(3 * 95 / 100) = 2 → 0.20 + result = aa_band([0.70, 0.80, 0.90]) + assert abs(result - 0.20) < 1e-9 + + +def test_aa_band_large_spread(): + # Max delta in [0.0, 1.0] is 1.0. + result = aa_band([0.0, 1.0]) + assert abs(result - 1.0) < 1e-9 + + +def test_aa_band_requires_at_least_two_scores(): + with pytest.raises(ValueError, match="aa_band requires >= 2 reruns"): + aa_band([0.80]) + + +def test_aa_band_empty_raises(): + with pytest.raises(ValueError, match="aa_band requires >= 2 reruns"): + aa_band([]) + + +def test_aa_band_custom_percentile(): + # 50th percentile of [0.10, 0.10, 0.20] at idx=1 → 0.10. 
+ result = aa_band([0.70, 0.80, 0.90], percentile=50) + assert abs(result - 0.10) < 1e-9 + + +def test_aa_band_returns_float(): + result = aa_band([0.80, 0.85, 0.90]) + assert isinstance(result, float) + + +# ── aggregate_grounding ─────────────────────────────────────────────────────── + + +def test_aggregate_grounding_single_dict(): + g = {"support_pct": 90.0, "unsupported_ids": ["ev-1"]} + result = aggregate_grounding([g]) + assert result["support_pct"] == 90.0 + assert result["unsupported_ids"] == ["ev-1"] + assert result["_aggregate_runs"] == 1 + + +def test_aggregate_grounding_mean_support_pct(): + dicts = [ + {"support_pct": 80.0, "unsupported_ids": []}, + {"support_pct": 100.0, "unsupported_ids": []}, + ] + result = aggregate_grounding(dicts) + assert result["support_pct"] == 90.0 + + +def test_aggregate_grounding_union_of_unsupported_ids(): + dicts = [ + {"support_pct": 90.0, "unsupported_ids": ["ev-1", "ev-2"]}, + {"support_pct": 85.0, "unsupported_ids": ["ev-2", "ev-3"]}, + ] + result = aggregate_grounding(dicts) + assert set(result["unsupported_ids"]) == {"ev-1", "ev-2", "ev-3"} + + +def test_aggregate_grounding_union_sorted(): + dicts = [ + {"support_pct": 90.0, "unsupported_ids": ["ev-b"]}, + {"support_pct": 90.0, "unsupported_ids": ["ev-a"]}, + ] + result = aggregate_grounding(dicts) + assert result["unsupported_ids"] == ["ev-a", "ev-b"] + + +def test_aggregate_grounding_empty_input(): + result = aggregate_grounding([]) + assert result["support_pct"] == 0.0 + assert result["unsupported_ids"] == [] + + +def test_aggregate_grounding_records_run_count(): + dicts = [ + {"support_pct": 80.0, "unsupported_ids": []}, + {"support_pct": 90.0, "unsupported_ids": []}, + {"support_pct": 100.0, "unsupported_ids": []}, + ] + result = aggregate_grounding(dicts) + assert result["_aggregate_runs"] == 3 + + +def test_aggregate_grounding_per_run_pct_recorded(): + dicts = [ + {"support_pct": 80.0, "unsupported_ids": []}, + {"support_pct": 100.0, "unsupported_ids": []}, 
+ ] + result = aggregate_grounding(dicts) + assert result["_support_pct_per_run"] == [80.0, 100.0] + + +def test_aggregate_grounding_missing_unsupported_ids_treated_as_empty(): + dicts = [ + {"support_pct": 90.0}, # no unsupported_ids key + {"support_pct": 80.0, "unsupported_ids": ["ev-1"]}, + ] + result = aggregate_grounding(dicts) + assert result["unsupported_ids"] == ["ev-1"] + + +# ── left_skew_flag ──────────────────────────────────────────────────────────── + + +def test_left_skew_flag_true_when_catastrophic_run(): + # median([0.60, 0.80, 0.80]) = 0.80; min = 0.60 < 0.80 - 0.10 = 0.70. + assert left_skew_flag([0.60, 0.80, 0.80]) is True + + +def test_left_skew_flag_false_when_min_close_to_median(): + # median = 0.80; min = 0.75; 0.75 >= 0.80 - 0.10 = 0.70 → no flag. + assert left_skew_flag([0.75, 0.80, 0.85]) is False + + +def test_left_skew_flag_false_when_all_equal(): + assert left_skew_flag([0.85, 0.85, 0.85]) is False + + +def test_left_skew_flag_boundary_just_above_threshold(): + # min = 0.71, median = 0.80; 0.71 >= 0.80 - 0.10 = 0.70 → no flag. + assert left_skew_flag([0.71, 0.80, 0.80]) is False + + +def test_left_skew_flag_single_score_always_false(): + # A single score has no meaningful distribution; function returns False. + assert left_skew_flag([0.50]) is False + + +def test_left_skew_flag_two_scores_with_large_gap(): + # median([0.50, 0.90]) = 0.70; min = 0.50 < 0.70 - 0.10 = 0.60.
+ assert left_skew_flag([0.50, 0.90]) is True + + +def test_left_skew_flag_returns_bool(): + result = left_skew_flag([0.80, 0.85, 0.90]) + assert isinstance(result, bool) diff --git a/tests/unit/lab/test_retrieval_metrics.py b/tests/unit/lab/test_retrieval_metrics.py new file mode 100644 index 00000000..a018a08b --- /dev/null +++ b/tests/unit/lab/test_retrieval_metrics.py @@ -0,0 +1,247 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for lab.retrieval_metrics: compute_retrieval_metrics and RetrieverMetrics.""" + +from __future__ import annotations + +import math + +import pytest + +from fireflyframework_agentic.lab.retrieval_metrics import ( + RetrieverMetrics, + compute_retrieval_metrics, +) + + +# ── helpers ─────────────────────────────────────────────────────────────────── + + +def _row(gold_rank: int | None = None, total: int = 5, n_gold: int = 1) -> dict: + """Build one result row with ``total`` retrieved items. + + If ``gold_rank`` is not None, the item at that rank is marked as gold. + All items get a unique ``source_id`` so dedup leaves them all. 
+ """ + retrieved = [] + for rank in range(1, total + 1): + retrieved.append({ + "rank": rank, + "source_id": f"doc-{rank}", + "is_gold": rank == gold_rank, + }) + gold_ids = [f"doc-{gold_rank}"] if gold_rank is not None else [] + return { + "retrieved": retrieved, + "gold": gold_ids * n_gold, + } + + +# ── hit@k ───────────────────────────────────────────────────────────────────── + + +def test_hit_at_1_perfect_when_gold_is_rank1(): + results = [_row(gold_rank=1)] + m = compute_retrieval_metrics(results) + assert m["hit@1"] == 1.0 + + +def test_hit_at_1_zero_when_gold_not_in_top1(): + results = [_row(gold_rank=2)] + m = compute_retrieval_metrics(results) + assert m["hit@1"] == 0.0 + + +def test_hit_at_5_one_when_gold_at_rank5(): + results = [_row(gold_rank=5)] + m = compute_retrieval_metrics(results) + assert m["hit@5"] == 1.0 + + +def test_hit_at_5_zero_when_gold_not_in_top5(): + # Ten items are retrieved but none of them is gold, so hit@5 must be 0. + results = [_row(gold_rank=None, total=10)] # no gold in retrieved + m = compute_retrieval_metrics(results) + assert m["hit@5"] == 0.0 + + +def test_hit_at_10_one_when_gold_at_rank10(): + results = [_row(gold_rank=10, total=10)] + m = compute_retrieval_metrics(results) + assert m["hit@10"] == 1.0 + + +# ── recall@k ────────────────────────────────────────────────────────────────── + + +def test_recall_at_k_increases_with_k(): + # Gold at rank 3: recall@1=0, recall@5>=recall@1.
+ results = [_row(gold_rank=3)] + m = compute_retrieval_metrics(results) + assert m["recall@1"] <= m["recall@5"] <= m["recall@10"] + + +def test_recall_at_1_full_when_single_gold_at_rank1(): + results = [_row(gold_rank=1, n_gold=1)] + m = compute_retrieval_metrics(results) + assert m["recall@1"] == 1.0 + + +def test_recall_at_1_zero_when_no_gold_in_rank1(): + results = [_row(gold_rank=5)] + m = compute_retrieval_metrics(results) + assert m["recall@1"] == 0.0 + + +# ── MRR ─────────────────────────────────────────────────────────────────────── + + +def test_mrr_is_1_when_gold_at_rank1(): + results = [_row(gold_rank=1)] + m = compute_retrieval_metrics(results) + assert m["mrr@10"] == 1.0 + + +def test_mrr_is_half_when_gold_at_rank2(): + results = [_row(gold_rank=2)] + m = compute_retrieval_metrics(results) + assert abs(m["mrr@10"] - 0.5) < 1e-9 + + +def test_mrr_is_zero_when_no_gold(): + results = [_row(gold_rank=None)] + m = compute_retrieval_metrics(results) + assert m["mrr@10"] == 0.0 + + +def test_mrr_average_across_queries(): + # Query 1: gold at rank 1 (MRR=1.0); Query 2: gold at rank 2 (MRR=0.5). 
+ results = [_row(gold_rank=1), _row(gold_rank=2)] + m = compute_retrieval_metrics(results) + assert abs(m["mrr@10"] - 0.75) < 1e-3 + + +# ── nDCG ────────────────────────────────────────────────────────────────────── + + +def test_ndcg_is_1_when_gold_at_rank1(): + results = [_row(gold_rank=1, n_gold=1)] + m = compute_retrieval_metrics(results) + assert abs(m["ndcg@10"] - 1.0) < 1e-9 + + +def test_ndcg_is_less_than_1_when_gold_not_at_rank1(): + results = [_row(gold_rank=3, n_gold=1)] + m = compute_retrieval_metrics(results) + assert m["ndcg@10"] < 1.0 + assert m["ndcg@10"] > 0.0 + + +def test_ndcg_is_zero_when_no_gold(): + results = [_row(gold_rank=None)] + m = compute_retrieval_metrics(results) + assert m["ndcg@10"] == 0.0 + + +# ── n_queries ───────────────────────────────────────────────────────────────── + + +def test_n_queries_matches_input_length(): + results = [_row(gold_rank=1), _row(gold_rank=2), _row(gold_rank=3)] + m = compute_retrieval_metrics(results) + assert m["n_queries"] == 3 + + +def test_empty_results_returns_zero_n_queries(): + m = compute_retrieval_metrics([]) + assert m["n_queries"] == 0 + + +# ── optional fields ─────────────────────────────────────────────────────────── + + +def test_no_answer_rate_is_zero_when_answer_present(): + # Rows with a non-empty answer string are counted as answered. + results = [{**_row(gold_rank=1), "answer": "some answer text"}] + m = compute_retrieval_metrics(results) + assert m["no_answer_rate"] == 0.0 + + +def test_no_answer_rate_is_one_when_no_answer_field(): + # Rows without an answer field are treated as no-answer by the implementation. 
+ results = [_row(gold_rank=1)] + m = compute_retrieval_metrics(results) + assert m["no_answer_rate"] == 1.0 + + +def test_citation_precision_is_none_when_no_citations(): + results = [_row(gold_rank=1)] + m = compute_retrieval_metrics(results) + assert m["citation_precision"] is None + + +def test_latency_fields_are_none_when_absent(): + results = [_row(gold_rank=1)] + m = compute_retrieval_metrics(results) + assert m["mean_search_ms"] is None + assert m["mean_answer_ms"] is None + + +def test_mean_search_ms_computed_when_present(): + results = [{**_row(gold_rank=1), "search_ms": 100.0, "answer_ms": 200.0}] + m = compute_retrieval_metrics(results) + assert m["mean_search_ms"] == 100 + assert m["mean_answer_ms"] == 200 + + +# ── RetrieverMetrics.from_results ───────────────────────────────────────────── + + +def test_retriever_metrics_from_results_hit_at_1(): + results = [_row(gold_rank=1)] + rm = RetrieverMetrics.from_results(results) + assert rm.hit_at_1 == 1.0 + + +def test_retriever_metrics_from_results_n_queries(): + results = [_row(gold_rank=1), _row(gold_rank=2)] + rm = RetrieverMetrics.from_results(results) + assert rm.n_queries == 2 + + +def test_retriever_metrics_from_results_mrr(): + results = [_row(gold_rank=1)] + rm = RetrieverMetrics.from_results(results) + assert rm.mrr_at_10 == 1.0 + + +def test_retriever_metrics_from_results_defaults_on_empty(): + rm = RetrieverMetrics.from_results([]) + assert rm.n_queries == 0 + assert rm.hit_at_1 == 0.0 + assert rm.mrr_at_10 == 0.0 + + +def test_retriever_metrics_is_pydantic_model(): + rm = RetrieverMetrics() + assert rm.n_queries == 0 + assert rm.hit_at_1 == 0.0 + assert rm.no_answer_rate is None + + +def test_retriever_metrics_recall_increases_with_k(): + results = [_row(gold_rank=3)] + rm = RetrieverMetrics.from_results(results) + assert rm.recall_at_1 <= rm.recall_at_5 <= rm.recall_at_10 From f79439b0abec86dac42ae96ad3e61856a39cea60 Mon Sep 17 00:00:00 2001 From: Miguel Fierro 
<3491412+miguelgfierro@users.noreply.github.com> Date: Fri, 19 Jun 2026 00:12:26 +0200 Subject: [PATCH 11/48] docs(evaluation): add evaluation package documentation (#278) * feat(evaluation): add evaluation package documentation * docs(evaluation): mention evaluation subpackage in README --------- Co-authored-by: miguelgfierro --- README.md | 7 + docs/evaluation.md | 435 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 442 insertions(+) create mode 100644 docs/evaluation.md diff --git a/README.md b/README.md index 9d005b23..904237da 100644 --- a/README.md +++ b/README.md @@ -412,6 +412,12 @@ classDiagram `EvalDataset` loads/saves test cases from JSON. `ModelComparison` runs the same prompts across multiple agents for side-by-side analysis. +- **Evaluation** — Gate-based quality gates (G1–G5), LLM-as-judge advisory scoring, + champion/challenger tracking, and deterministic retrieval metrics for assessing + agent and pipeline outputs. The `flyeval` CLI drives the full gate pipeline from + the command line. Install with `pip install "fireflyframework-agentic[evaluation]"`. + See [docs/evaluation.md](docs/evaluation.md) for the full guide. 
+ > **Optional developer tooling.** `fireflyframework_agentic.experiments` (A/B > experiments) and `fireflyframework_agentic.lab` (offline evaluation / > benchmarking) are leaf modules — nothing in the core imports them and they add @@ -817,6 +823,7 @@ Detailed guides for each module: - [Security](docs/security.md) — Prompt/output guards, at-rest encryption - [Experiments](docs/experiments.md) — A/B testing, variant comparison - [Lab](docs/lab.md) — Benchmarks, datasets, evaluators +- [Evaluation](docs/evaluation.md) — Gate pipeline, flyeval CLI, champion/challenger, retrieval metrics - Studio — moved to [fireflyframework-agentic-studio](https://github.com/fireflyframework/fireflyframework-agentic-studio) --- diff --git a/docs/evaluation.md b/docs/evaluation.md new file mode 100644 index 00000000..c2abe319 --- /dev/null +++ b/docs/evaluation.md @@ -0,0 +1,435 @@ +# Evaluation Guide + +Copyright 2026 Firefly Software Foundation. Licensed under the Apache License 2.0. + +The Evaluation subpackage provides gate-based quality gates, LLM-as-judge advisory scoring, +champion/challenger tracking, and deterministic retrieval metrics for assessing agent outputs. + +--- + +## Concepts + +### Gate pipeline + +The evaluation framework runs **five gates** in sequence. Every gate always runs — a failed +gate raises a *flag*, not a veto, so the scorecard always carries the complete picture. + +| Gate | Name | Kind | Description | +|------|------|------|-------------| +| G1 | Structural & Safe | Deterministic | Schema validity, PII non-disclosure, empty-registry guard. | +| G2 | Must-finds & Negative Controls | Deterministic | Lexical/semantic recall against the must-find registry; NC precision. | +| G3 | Evidence (Grounding) | Deterministic | Excerpt-to-corpus anchoring; fabricated-evidence detection. | +| G4 | LLM-as-a-Judge | Advisory (non-blocking) | Semantic faithfulness, entailment, gap detection — never changes the verdict. 
| +| G5 | No-regression / Promotion | Human decision | Champion/challenger comparison with A/A noise band; collects sign-offs. | + +**No gate vetoes.** Failures append to the `GateResult` flags list and scoring continues. +The scorecard carries every signal regardless of which gates fired. + +### GateResult + +`GateResult` is a dataclass returned by each gate: + +```python +@dataclass +class GateResult: + gate: str # "G1", "G2", …, "G5" + passed: bool + reason_code: str = "" # e.g. "SCHEMA_INVALID", "NC_HIT", "UNGROUNDED" + details: dict = field(default_factory=dict) +``` + +`str(gate_result)` prints `[G2] PASS` or `[G2] FLAG:NC_HIT`. + +### Verdict + +`verdict(gate_results)` returns `VERDICT_PROMOTE` or `VERDICT_HOLD`: + +- `VERDICT_PROMOTE` — all gates passed **and** G5 (the human sign-off gate) is present. +- `VERDICT_HOLD` — any gate flagged, or G5 is missing. + +The CLI exits `0` on PROMOTE and `1` on HOLD, so it composes into CI. + +### Must-find registry + +A registry (`lean-1` schema) is a JSON file listing items the discovery output is +expected to surface (`tier` L0–L3) and negative controls (NC) it must *not* assert. + +```json +{ + "schema_version": "lean-1", + "corpus": "banca-cordobesa", + "items": [ + { "id": "ao-pep-4eyes", "tier": "L0", "scope": "decision", + "description": "PEP cases require a second analyst sign-off (4-eyes)", + "keywords": ["PEP", "4-eyes"], + "evidence": ["SOP-002-kyc-edd.md"] }, + { "id": "ao-nc-realtime", "tier": "NC", "scope": "finding", + "description": "KYC-Hub synchronises in real time — factually false" } + ] +} +``` + +Tier semantics: L0 = must-find control (a single miss flags the run), L1 = high-priority, +L2 = important, L3 = nice-to-have (not counted in the recall floor). + +### Advisory judge (G4) + +G4 calls a chat LLM (or local Ollama model) for semantic checks the deterministic gates +cannot perform: faithfulness, entailment, numeric/temporal fidelity, actionability, +fabricated-entity detection, and more. 
It is: + +- **Non-blocking** — `AdvisoryReport` is carried separately and never enters `verdict()`. +- **Non-deterministic** — each metric runs `judge_runs` times (default: 3) and the + median score is reported. +- **Opt-in** — pass `--judge-model provider:model` to activate it; omit the flag to skip. + +### Champion/challenger pattern + +Champions are **per-corpus**. `ChampionRecord` persists the best-known run so that +promotion decisions are made against a stable, signed baseline rather than the last run. + +``` + ┌──────────────────────────────────────────┐ + │ run result JSON (challenger) │ + └──────────────┬───────────────────────────┘ + │ + ┌───────────────▼───────────────┐ + │ G1 · G2 · G3 (deterministic) │ + │ G4 (advisory, opt-in) │ + └───────────────┬───────────────┘ + │ flags + scores + ┌───────────────▼───────────────┐ + │ G5 — no-regression vs │ + │ champion baseline + A/A band │ + └───────────────┬───────────────┘ + │ + ┌───────────────▼───────────────┐ + │ Markdown scorecard │ + │ PROMOTE / HOLD │ + └───────────────────────────────┘ +``` + +`invalidate_champion()` marks a baseline invalid. The `EMPTY_MUST_FIND` guard in G1 +prevents a fake-100% champion being created against an empty registry. + +--- + +## Installation + +The evaluation subpackage requires `scipy` and `numpy`. Install the optional extra: + +```bash +pip install "fireflyframework-agentic[evaluation]" +``` + +The `flyeval` CLI entry-point is registered automatically by the package. Verify: + +```bash +flyeval --version +``` + +--- + +## CLI + +All subcommands exit `0` on PROMOTE and `1` on HOLD. + +### `flyeval gate` + +Run the full gate pipeline against a result JSON and print a Markdown scorecard. 
+ +```bash +flyeval gate \ + --result runs/2026-06-18/output.json \ + --registry registries/banca-cordobesa.json \ + --baseline baselines/banca-cordobesa.json \ + --judge-model anthropic:claude-3-5-haiku \ + --judge-runs 3 +``` + +Key flags: + +| Flag | Default | Description | +|------|---------|-------------| +| `--result` | required | Path to the run's `output.json`. | +| `--registry` | required | Must-find registry (lean-1 JSON). | +| `--baseline` | — | Champion baseline JSON for G5 regression check. | +| `--judge-model` | — | `provider:model` for G4 advisory judge. | +| `--judge-runs` | 3 | Number of independent judge calls (median aggregation). | +| `--no-judge` | — | Skip G4 entirely. | +| `--recall-floor` | 0.70 | Minimum G2 recall before flagging. | +| `--grounding-floor` | 0.90 | Minimum G3 grounding rate before flagging. | +| `--corpus` | — | Path to the evidence corpus bundle for G3 verification. | +| `--pii-list` | — | Path to a JSON array of names to scan for PII leaks (G1). | +| `--embedder` | — | `provider:model` for semantic recall (G2 embedding path). | +| `--model-id` | "unknown" | Identifier of the model under evaluation (for scorecard). | + +### `flyeval aa-band` + +Compute the A/A noise band from multiple repeated runs of the same model to establish +the noise floor before setting up the champion comparison. + +```bash +flyeval aa-band \ + --results runs/aa-run-1/output.json runs/aa-run-2/output.json runs/aa-run-3/output.json \ + --registry registries/banca-cordobesa.json +``` + +The command prints per-metric variance and recommended noise floors. + +### `flyeval day-zero` + +Promote the very first champion for a corpus (Day-Zero protocol). Requires at least +`--signoffs` sign-offs (default: 2) before PROMOTE is issued. 
+ +```bash +flyeval day-zero \ + --result runs/2026-06-18/output.json \ + --registry registries/banca-cordobesa.json \ + --baseline baselines/banca-cordobesa.json \ + --signoffs 2 +``` + +The command writes the new `ChampionRecord` into `--baseline` on success. + +### `flyeval invalidate` + +Mark the current champion invalid with a documented reason. Use this when the registry +changes in a way that makes the existing champion incommensurable. + +```bash +flyeval invalidate \ + --baseline baselines/banca-cordobesa.json \ + --reason "Registry expanded from 39 to 94 items (lean-1 v2)." +``` + +--- + +## Python API + +### Running gates + +```python +import json +from fireflyframework_agentic.evaluation import ( + run_gates, + render_scorecard, + verdict, + load_registry, + VERDICT_PROMOTE, +) + +result = json.loads(open("runs/2026-06-18/output.json").read()) +registry = load_registry("registries/banca-cordobesa.json") + +gate_results = run_gates(result, registry) +scorecard_md = render_scorecard( + gate_results, + corpus="banca-cordobesa", + model_id="anthropic:claude-3-5-sonnet", + run_id="2026-06-18-sonnet-01", +) +print(scorecard_md) + +v = verdict(gate_results) +print("Verdict:", v) # "PROMOTE" or "HOLD" +assert v == VERDICT_PROMOTE +``` + +### Champion management + +```python +from fireflyframework_agentic.evaluation import ( + load_champion, + save_champion, + invalidate_champion, + ChampionRecord, +) + +# Load the current champion (returns None on Day Zero). +champ = load_champion("baselines/banca-cordobesa.json") +if champ is None: + print("Day Zero — no champion yet.") +else: + print(f"Champion: {champ.run_id} | {champ.primary_metric()}={champ.primary_score():.3f}") + +# Save a new champion after a successful PROMOTE. 
+new_champ = ChampionRecord( + corpus="banca-cordobesa", + run_id="2026-06-18-sonnet-01", + model_id="anthropic:claude-3-5-sonnet", + registry_sha256=registry.sha256(), + scores={"lexical_recall": 0.857, "grounding_pct": 0.941}, + human_sign_offs=["alice", "bob"], +) +save_champion("baselines/banca-cordobesa.json", new_champ) + +# Invalidate when the registry changes materially. +invalidate_champion( + "baselines/banca-cordobesa.json", + reason="Registry expanded from 39 to 94 items.", +) +``` + +### EvalConfig + +`EvalConfig` is a Pydantic model that captures the parameters of a single evaluation run. +Use it to build reproducible, serialisable run records. + +```python +from fireflyframework_agentic.evaluation.models import EvalConfig + +cfg = EvalConfig( + model_id="anthropic:claude-3-5-sonnet", + corpus="banca-cordobesa", + run_id="2026-06-18-sonnet-01", + registry_path="registries/banca-cordobesa.json", + corpus_path="corpora/banca-cordobesa/", + baseline_path="baselines/banca-cordobesa.json", + judge_model="anthropic:claude-3-5-haiku", + judge_runs=3, +) +print(cfg.model_dump_json(indent=2)) +``` + +### Advisory judge (G4) + +```python +from fireflyframework_agentic.evaluation import run_judge, JudgeClient, build_embedder + +client = JudgeClient( + chat_fn=my_chat_fn, # callable(system: str, user: str) -> dict + embed_fn=build_embedder("ollama:bge-m3"), +) + +advisory = run_judge( + result=result, + registry=registry, + client=client, + runs=3, + missed_ids=[], # IDs the deterministic G2 missed — judge tries to recover them +) +print(advisory.scores) # dict of metric -> float +print(advisory.errors) # any metrics that failed (best-effort, never raises) +``` + +--- + +## Retrieval Metrics + +The `compute_retrieval_metrics()` function computes standard IR metrics over ranked +retrieval results. It is imported from `fireflyframework_agentic.lab.retrieval_metrics` +and re-exported by the evaluation package. 
+ +Supported metrics at cut-offs k ∈ {1, 5, 10}: + +- **Hit@k** — at least one gold document in top-k. +- **Recall@k** — fraction of gold documents in top-k. +- **Precision@k** — fraction of top-k results that are gold. +- **MRR@10** — mean reciprocal rank of the first gold hit. +- **MAP@10** — mean average precision. +- **nDCG@10** — normalised discounted cumulative gain. + +```python +from fireflyframework_agentic.evaluation import compute_retrieval_metrics, RetrieverMetrics + +# Each row is a query; each row's "retrieved" list is ranked (rank=1 is top). +rows = [ + { + "query": "KYC enhanced due diligence steps", + "gold": ["SOP-002-kyc-edd.md", "INT-002-KYC-Jaime.md"], + "retrieved": [ + {"rank": 1, "source_id": "SOP-002-kyc-edd.md", "is_gold": True}, + {"rank": 2, "source_id": "SOP-001-account-opening.md", "is_gold": False}, + {"rank": 3, "source_id": "INT-002-KYC-Jaime.md", "is_gold": True}, + ], + }, +] + +metrics: RetrieverMetrics = compute_retrieval_metrics(rows) +print(f"Recall@5: {metrics.recall_5:.3f}") +print(f"nDCG@10: {metrics.ndcg_10:.3f}") +print(f"MRR@10: {metrics.mrr_10:.3f}") +``` + +`RetrieverMetrics` also carries optional fields when the raw rows include them: +`no_answer_rate`, `citation_precision`, `mean_search_ms`, `mean_answer_ms`. 
+ +--- + +## Architecture + +```mermaid +flowchart TD + R["result JSON\n(DiscoveryResult / output.json)"] + REG["Registry JSON\n(lean-1 must-find)"] + CORP["Corpus bundle\n(raw evidence documents)"] + BASE["Baseline JSON\n(champion record)"] + + R --> G1["G1 · Structural & Safe\n(schema, PII, empty-registry)"] + REG --> G1 + R --> G2["G2 · Recall & NC Precision\n(lexical + optional semantic)"] + REG --> G2 + R --> G3["G3 · Grounding\n(excerpt anchoring, fabrication)"] + CORP --> G3 + R --> G4["G4 · LLM Judge advisory\n(faithfulness, entailment, gaps)"] + REG --> G4 + G1 --> SC["Markdown Scorecard\nrender_scorecard()"] + G2 --> SC + G3 --> SC + G4 -.advisory.-> SC + BASE --> G5["G5 · No-regression\n(A/A band, sign-offs)"] + G1 --> G5 + G2 --> G5 + G3 --> G5 + G5 --> SC + SC --> V["verdict()\nPROMOTE / HOLD"] + V --> CHAMP["save_champion()\nor invalidate_champion()"] +``` + +--- + +## Reference + +### Exports + +All symbols below are importable from `fireflyframework_agentic.evaluation`. + +| Symbol | Kind | Description | +|--------|------|-------------| +| `EvalConfig` | Pydantic model | Parameters for a single evaluation run. | +| `GateResult` | Dataclass | Result of one gate: `gate`, `passed`, `reason_code`, `details`. | +| `Verdict` | Constants class | `Verdict.PROMOTE`, `Verdict.HOLD`. | +| `VERDICT_PROMOTE` | `str` | `"PROMOTE"`. | +| `VERDICT_HOLD` | `str` | `"HOLD"`. | +| `run_gates()` | Function | Run all four deterministic gates (G1–G3, G5 shape) and return results. | +| `g2_recall_precision()` | Function | Run only G2 (recall + NC precision) and return `GateResult`. | +| `verdict()` | Function | Derive PROMOTE/HOLD from a list of `GateResult`. | +| `render_scorecard()` | Function | Render a Markdown scorecard from gate results and metadata. | +| `ChampionRecord` | Dataclass | Per-corpus champion metadata and scores. | +| `load_champion()` | Function | Load the current champion from `baseline.json`; returns `None` on Day Zero. 
| +| `save_champion()` | Function | Persist a new champion to `baseline.json`. | +| `invalidate_champion()` | Function | Mark the champion invalid with a reason string. | +| `AdvisoryReport` | Dataclass | G4 judge output: `scores`, `errors`, `raw`. | +| `run_judge()` | Function | Run the LLM-as-a-Judge advisory pass. | +| `JudgeClient` | Dataclass | Holds `chat_fn` and `embed_fn` for the judge. | +| `OllamaEmbedder` | Class | Local Ollama embedding callable (default BGE-M3). | +| `build_embedder()` | Function | Factory: `"ollama:bge-m3"` → `OllamaEmbedder`. | +| `cosine()` | Function | Cosine similarity between two numpy vectors. | +| `Registry` | Dataclass | Parsed must-find registry with real items and NC items. | +| `RegistryItem` | Dataclass | One must-find or NC item: `id`, `tier`, `scope`, `description`, …. | +| `load_registry()` | Function | Parse and validate a lean-1 registry JSON file. | +| `registry_sha256()` | Function | SHA-256 of a registry file path. | +| `load_corpus()` | Function | Load and index a corpus bundle for G3 evidence verification. | +| `corpus_sha256()` | Function | SHA-256 of a corpus directory or bundle. | +| `verify_evidence_index()` | Function | Check each `evidence_index` entry against the corpus. | +| `EMPTY` / `FABRICATED` / `SOURCE_UNKNOWN` / `VERIFIED` | `str` | Evidence verification status constants. | +| `RetrieverMetrics` | Pydantic model | IR metrics: `recall_k`, `precision_k`, `ndcg_10`, `mrr_10`, `map_10`. | +| `compute_retrieval_metrics()` | Function | Compute IR metrics from a list of ranked-retrieval result rows. | +| `anchored()` | Function | True if claim and evidence share at least one non-trivial token. | +| `matches()` | Function | Gate predicate: does a candidate match a registry item? | +| `source_stem()` | Function | Normalise a `locator` path to its file stem for dedup. | +| `tokens()` | Function | Tokenise text to a list of lowercase word strings. 
| +| `aa_band()` | Function | Compute per-metric A/A noise floor from repeated runs. | +| `aggregate_grounding()` | Function | Summarise grounding stats across a result's findings. | +| `left_skew_flag()` | Function | True when the score distribution is left-skewed (over-optimistic). | From a1d28a597ad87559dad0e26a2f266cf516553d21 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 09:24:24 +0200 Subject: [PATCH 12/48] remove examples/flyradar_eval_example.py --- examples/flyradar_eval_example.py | 406 ------------------------------ 1 file changed, 406 deletions(-) delete mode 100644 examples/flyradar_eval_example.py diff --git a/examples/flyradar_eval_example.py b/examples/flyradar_eval_example.py deleted file mode 100644 index 706528f4..00000000 --- a/examples/flyradar_eval_example.py +++ /dev/null @@ -1,406 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""FlyRadar evaluation example — gate-based process-mining quality gate. - -Demonstrates how to use ``fireflyframework_agentic.evaluation`` to replicate -the flyradar experiment quality-gate workflow: - -1. Load a must-find registry (the gold standard items the model must discover). -2. Load a DiscoveryResult produced by a flyradar pipeline run. -3. Run gates G1-G5 to produce a structured verdict: - G1 -- Structural & Safe (schema validity, PII, empty-registry guard). - G2 -- Recall & Precision (must-find recall floor, NC precision). 
- G3 -- Grounded (finding-to-evidence anchoring). - G4 -- LLM-as-a-Judge (advisory only; never blocks promotion). - G5 -- No-regression / promotion (champion/challenger comparison). -4. Render a human-readable scorecard and print the final verdict. -5. Promote the challenger to champion when the verdict is PROMOTE. - -Usage:: - - # Minimal: deterministic gates only (no G4 judge, no baseline) - python examples/flyradar_eval_example.py \\ - --result output.json \\ - --registry registry.json - - # With corpus verification and a champion baseline - python examples/flyradar_eval_example.py \\ - --result output.json \\ - --registry registry.json \\ - --baseline baseline.json \\ - --corpus input.json - - # With the advisory G4 LLM judge (requires API key in environment) - FLYEVAL_JUDGE_MODEL=anthropic:claude-sonnet-4-6 \\ - python examples/flyradar_eval_example.py \\ - --result output.json \\ - --registry registry.json \\ - --judge-model anthropic:claude-sonnet-4-6 - -Exit codes: 0 = PROMOTE, 1 = HOLD. - -Input file formats ------------------- -``--result`` (output.json) - A DiscoveryResult JSON produced by a flyradar pipeline run. Must contain - at minimum ``findings`` (list) and ``evidence_index`` (list). - -``--registry`` (registry.json) - A lean-1 registry JSON. Each item has ``id``, ``tier`` (L0-L3), ``title``, - ``description``, and ``nc`` (bool, True for negative controls). - -``--baseline`` (baseline.json) - A ChampionRecord JSON written by a previous PROMOTE run. When omitted the - gate runs in day-zero mode (G5 always passes and a new champion is minted). - -``--corpus`` (input.json) - The corpus bundle used during the run. When supplied, G3 verifies that cited - evidence excerpts actually appear in the corpus documents. 
-""" - -from __future__ import annotations - -import argparse -import json -import sys -from pathlib import Path - -from fireflyframework_agentic.evaluation import ( - ChampionRecord, - GateResult, - build_embedder, - load_champion, - load_corpus, - load_registry, - render_scorecard, - run_gates, - run_judge, - save_champion, - verdict, - VERDICT_PROMOTE, -) -from fireflyframework_agentic.evaluation.models import EvalConfig - - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - - -def _load_json(path: str) -> dict: - """Read a JSON file and return its contents as a dict.""" - return json.loads(Path(path).read_text(encoding="utf-8")) - - -def _lexical_missed_ids(result: dict, registry) -> list[str]: - """Return the IDs of registry items not matched by any finding (lexically). - - The G4 judge uses these to focus its coverage checks on items that - lexical recall missed — the places where semantic recovery matters most. - """ - from fireflyframework_agentic.evaluation.matcher import matches - - evidence_index = {ev["id"]: ev for ev in result.get("evidence_index", []) if ev.get("id")} - findings = result.get("findings", []) - # L3 items are informational-only and are never scored. - scored_items = [item for item in registry.real_items if item.tier != "L3"] - return [ - item.id - for item in scored_items - if not any(matches(f, item, evidence_index) for f in findings) - ] - - -# --------------------------------------------------------------------------- -# Main evaluation flow -# --------------------------------------------------------------------------- - - -def run_evaluation(args: argparse.Namespace) -> int: - """Run the full flyradar gate evaluation and return an exit code.""" - - # ------------------------------------------------------------------ - # Step 1 — Load inputs. 
- # ------------------------------------------------------------------ - print(f"Loading result : {args.result}") - result = _load_json(args.result) - - print(f"Loading registry : {args.registry}") - registry = load_registry(args.registry) - print(f" {len(registry.real_items)} real items, {len(registry.nc_items)} NC items") - - # The EvalConfig captures provenance for the run record. - config = EvalConfig( - model_id=args.model_id, - corpus=registry.corpus, - run_id=args.run_id, - registry_path=args.registry, - corpus_path=args.corpus or "", - baseline_path=args.baseline or "", - judge_model=args.judge_model or "", - ) - - # Optional: corpus bundle for deterministic evidence verification (G3). - corpus = None - if args.corpus: - print(f"Loading corpus : {args.corpus}") - corpus = load_corpus(args.corpus) - - # Optional: champion record for regression detection (G5). - champion = None - champion_scores = None - aa_noise = None - if args.baseline: - print(f"Loading baseline : {args.baseline}") - champion = load_champion(args.baseline) - if champion: - champion_scores = champion.scores - aa_noise = champion.aa_noise - print(f" Champion run : {champion.run_id} ({champion.model_id})") - else: - print(" No champion found — running in day-zero mode.") - - # Optional: embedder for semantic/hybrid recall (G2). - embed_fn = None - if args.embedder: - print(f"Building embedder: {args.embedder}") - embed_fn = build_embedder(args.embedder) - - print() - - # ------------------------------------------------------------------ - # Step 2 — Run deterministic gates G1-G3 + G5. - # - # run_gates() returns a list of GateResult objects, one per gate. 
- # Each GateResult carries: - # .gate -- "G1" | "G2" | "G3" | "G5" - # .passed -- bool - # .details -- dict with per-metric values - # .errors -- list[str] of blocking error codes - # ------------------------------------------------------------------ - print("Running gates G1-G3 + G5 ...") - gate_results: list[GateResult] = run_gates( - result, - registry, - args.registry, - pii_list=args.pii_list or [], - recall_floor=args.recall_floor, - grounding_floor=args.grounding_floor, - champion_scores=champion_scores, - aa_noise=aa_noise, - is_day_zero=(champion is None), - human_signed_off=args.human_signed_off, - signoff_count=args.signoffs, - embed_fn=embed_fn, - tau=args.tau, - recall_metric=args.recall_metric, - tau_nc=args.tau_nc, - corpus=corpus, - ) - - # Quick gate summary before the full scorecard. - for gr in gate_results: - status = "PASS" if gr.passed else "FAIL" - print(f" {gr.gate}: {status}") - - # ------------------------------------------------------------------ - # Step 3 — Run the advisory G4 LLM-as-a-Judge (optional). - # - # G4 is non-blocking: it never changes the verdict or exit code. - # It produces an AdvisoryReport with per-finding quality signals - # (faithfulness, citation relevance, fabricated entities, etc.). - # ------------------------------------------------------------------ - advisory = None - if args.judge_model: - print(f"\nRunning G4 judge ({args.judge_model}) ...") - missed_ids = _lexical_missed_ids(result, registry) - advisory = run_judge( - result, - registry, - judge_model=args.judge_model, - runs=args.judge_runs, - concurrency=args.judge_concurrency, - pipeline_model=args.model_id, - embed_fn=embed_fn, - tau=args.tau, - lexical_missed_ids=missed_ids, - ) - print(f" Judge completed ({args.judge_runs} run(s)).") - else: - print("\nG4 judge skipped (pass --judge-model to enable).") - - # ------------------------------------------------------------------ - # Step 4 — Render the scorecard. 
- # - # render_scorecard() produces a markdown-formatted human-readable - # report that mirrors the output of `flyeval gate` in the playground. - # ------------------------------------------------------------------ - print() - scorecard = render_scorecard( - gate_results, - corpus=registry.corpus, - model_id=config.model_id, - run_id=config.run_id, - is_self_graded=True, - kappa_advisory=registry.is_kappa_advisory(), - evidence_unverified=(corpus is None), - advisory=advisory, - ) - print(scorecard) - - # ------------------------------------------------------------------ - # Step 5 — Inspect the verdict and handle promotion. - # - # verdict() returns "PROMOTE" or "HOLD" based on the gate results. - # On PROMOTE, save the challenger as the new champion so future runs - # can detect regressions against this baseline. - # ------------------------------------------------------------------ - v = verdict(gate_results) - print(f"\nFinal verdict: {v}") - - if v == VERDICT_PROMOTE and args.baseline: - # Extract the key scores from G2 and G3 to store in the champion record. - g2 = next((g for g in gate_results if g.gate == "G2"), None) - g3 = next((g for g in gate_results if g.gate == "G3"), None) - scores: dict[str, float] = {} - if g2: - scores["recall"] = g2.details.get("recall", 0.0) - if g3: - scores["grounding_pct"] = g3.details.get("grounding_pct", 0.0) - - new_champion = ChampionRecord( - corpus=registry.corpus, - run_id=config.run_id, - model_id=config.model_id, - registry_sha256=registry.sha256(), - scores=scores, - is_day_zero=(champion is None), - ) - save_champion( - args.baseline, - new_champion, - summary=f"Promoted by flyradar_eval_example.py — {config.run_id}", - ) - print(f"Champion saved to {args.baseline}") - - # Exit 0 = PROMOTE, 1 = HOLD (mirrors `flyeval gate` convention). 
- return 0 if v == VERDICT_PROMOTE else 1 - - -# --------------------------------------------------------------------------- -# CLI -# --------------------------------------------------------------------------- - - -def build_parser() -> argparse.ArgumentParser: - p = argparse.ArgumentParser( - prog="flyradar_eval_example", - description="FlyRadar gate evaluation — replicates the flyeval gate workflow.", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - # Required inputs. - p.add_argument("--result", required=True, help="Path to DiscoveryResult JSON.") - p.add_argument("--registry", required=True, help="Path to lean-1 registry JSON.") - - # Optional inputs. - p.add_argument( - "--baseline", - help="Path to baseline.json (champion store). When absent, runs in day-zero mode.", - ) - p.add_argument( - "--corpus", - help="Path to input.json corpus bundle for deterministic evidence verification (G3).", - ) - - # Run metadata. - p.add_argument("--model-id", default="unknown", help="Model identifier for the scorecard.") - p.add_argument("--run-id", default="example-run", help="Run identifier for the scorecard.") - - # Gate thresholds. - p.add_argument( - "--recall-floor", - type=float, - default=0.70, - help="Minimum recall required for G2 to pass.", - ) - p.add_argument( - "--grounding-floor", - type=float, - default=0.90, - help="Minimum grounding percentage required for G3 to pass.", - ) - p.add_argument( - "--recall-metric", - choices=["lexical", "semantic", "hybrid"], - default="lexical", - help="Recall metric used by G2. 
'semantic' and 'hybrid' require --embedder.", - ) - p.add_argument( - "--tau", - type=float, - default=0.70, - help="Cosine similarity threshold for semantic recall (real items).", - ) - p.add_argument( - "--tau-nc", - type=float, - default=0.85, - help="Cosine similarity threshold for NC item detection.", - ) - p.add_argument("--pii-list", nargs="*", default=[], help="PII tokens to check for in findings.") - p.add_argument("--human-signed-off", action="store_true", help="Mark this run as human-reviewed.") - p.add_argument("--signoffs", type=int, default=0, help="Number of human sign-offs collected.") - - # G4 judge options. - p.add_argument( - "--judge-model", - default=None, - help=( - "Provider:model string for the advisory G4 LLM judge " - "(e.g. 'anthropic:claude-sonnet-4-6'). Omit to skip G4." - ), - ) - p.add_argument( - "--judge-runs", - type=int, - default=1, - help="Number of judge calls to aggregate (odd number recommended for median).", - ) - p.add_argument( - "--judge-concurrency", - type=int, - default=1, - help="Thread fan-out for per-item G4 metrics (1 = sequential).", - ) - - # Embedder for semantic recall. - p.add_argument( - "--embedder", - default=None, - help="Embedder spec for semantic/hybrid recall (e.g. 
'ollama:bge-m3').", - ) - - return p - - -def main() -> None: - parser = build_parser() - args = parser.parse_args() - sys.exit(run_evaluation(args)) - - -if __name__ == "__main__": - main() From 61617186f1ed103c783197784497dd841a260b43 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 09:24:27 +0200 Subject: [PATCH 13/48] ci: add --extra evaluation to typecheck and test sync steps --- .github/workflows/pr-gate.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr-gate.yml b/.github/workflows/pr-gate.yml index c0ef76d4..86e35717 100644 --- a/.github/workflows/pr-gate.yml +++ b/.github/workflows/pr-gate.yml @@ -57,7 +57,7 @@ jobs: - uses: actions/setup-python@v6 with: python-version: '3.13' - - run: uv sync --extra dev --extra binary --extra vectorstores-sqlite-vec --extra openai-embeddings + - run: uv sync --extra dev --extra binary --extra vectorstores-sqlite-vec --extra openai-embeddings --extra evaluation - run: uv run pyright test: @@ -72,7 +72,7 @@ jobs: - uses: actions/setup-python@v6 with: python-version: '3.13' - - run: uv sync --extra dev --extra binary --extra vectorstores-sqlite-vec --extra vectorstores-pgvector --extra openai-embeddings + - run: uv sync --extra dev --extra binary --extra vectorstores-sqlite-vec --extra vectorstores-pgvector --extra openai-embeddings --extra evaluation - run: uv run pytest -m "not nightly" --cov --cov-report=term-missing build: From 203134ca971377816c462b7d4c5125d9ebc9d4e0 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 09:24:32 +0200 Subject: [PATCH 14/48] fix(evaluation): resolve all ruff lint errors (import sort, SIM108, B905, N806, UP035) --- examples/flycanon_eval_example.py | 13 +-- .../evaluation/__init__.py | 21 ++++- fireflyframework_agentic/evaluation/cli.py | 42 +++++----- fireflyframework_agentic/evaluation/corpus.py | 20 ++--- fireflyframework_agentic/evaluation/gates.py | 42 +++------- fireflyframework_agentic/evaluation/judge.py 
| 79 ++++++++----------- .../evaluation/judge_client.py | 25 ++---- .../evaluation/matcher.py | 60 +++++++------- .../evaluation/registry.py | 40 +++++----- .../evaluation/run_config_snapshot.py | 9 +-- .../evaluation/scorecard.py | 44 +++-------- fireflyframework_agentic/evaluation/stats.py | 9 +-- uv.lock | 59 +++++++++++++- 13 files changed, 220 insertions(+), 243 deletions(-) diff --git a/examples/flycanon_eval_example.py b/examples/flycanon_eval_example.py index 9d8d071b..856b520b 100644 --- a/examples/flycanon_eval_example.py +++ b/examples/flycanon_eval_example.py @@ -94,8 +94,7 @@ import sys from pathlib import Path -from fireflyframework_agentic.evaluation import RetrieverMetrics, compute_retrieval_metrics - +from fireflyframework_agentic.evaluation import RetrieverMetrics # --------------------------------------------------------------------------- # Helpers @@ -171,10 +170,7 @@ def _print_metrics_table(metrics: RetrieverMetrics, baseline: dict | None) -> No if value is None: continue # Format floats as 4 decimal places; ints as plain integers. - if isinstance(value, float): - cur_str = f"{value:.4f}" - else: - cur_str = str(value) + cur_str = f"{value:.4f}" if isinstance(value, float) else str(value) row = f"{key:<{col_w}} {cur_str:>{num_w}}" if baseline and key in baseline and isinstance(value, float): @@ -353,10 +349,7 @@ def build_parser() -> argparse.ArgumentParser: p.add_argument( "--baseline", default=None, - help=( - "Path to baseline.json (champion store). When absent, scores are printed " - "without comparison." - ), + help=("Path to baseline.json (champion store). 
When absent, scores are printed without comparison."), ) p.add_argument( "--promote-if-better", diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py index ad01980c..d986d09f 100644 --- a/fireflyframework_agentic/evaluation/__init__.py +++ b/fireflyframework_agentic/evaluation/__init__.py @@ -31,16 +31,29 @@ from importlib.metadata import PackageNotFoundError, version -from fireflyframework_agentic.evaluation.corpus import EMPTY, FABRICATED, SOURCE_UNKNOWN, VERIFIED, corpus_sha256, load_corpus, verify_evidence_index +from fireflyframework_agentic.evaluation.champion import ( + ChampionRecord, + invalidate_champion, + load_champion, + save_champion, +) +from fireflyframework_agentic.evaluation.corpus import ( + EMPTY, + FABRICATED, + SOURCE_UNKNOWN, + VERIFIED, + corpus_sha256, + load_corpus, + verify_evidence_index, +) from fireflyframework_agentic.evaluation.gates import GateResult, Verdict, g2_recall_precision, run_gates -from fireflyframework_agentic.evaluation.scorecard import render_scorecard, verdict, VERDICT_PROMOTE, VERDICT_HOLD -from fireflyframework_agentic.evaluation.champion import ChampionRecord, invalidate_champion, load_champion, save_champion from fireflyframework_agentic.evaluation.judge import AdvisoryReport, run_judge from fireflyframework_agentic.evaluation.judge_client import JudgeClient, OllamaEmbedder, build_embedder, cosine from fireflyframework_agentic.evaluation.matcher import anchored, matches, source_stem, tokens from fireflyframework_agentic.evaluation.registry import Registry, RegistryItem, load_registry, registry_sha256 -from fireflyframework_agentic.lab.retrieval_metrics import RetrieverMetrics, compute_retrieval_metrics +from fireflyframework_agentic.evaluation.scorecard import VERDICT_HOLD, VERDICT_PROMOTE, render_scorecard, verdict from fireflyframework_agentic.evaluation.stats import aa_band, aggregate_grounding, left_skew_flag +from 
fireflyframework_agentic.lab.retrieval_metrics import RetrieverMetrics, compute_retrieval_metrics try: __version__ = version("fireflyframework-agentic") diff --git a/fireflyframework_agentic/evaluation/cli.py b/fireflyframework_agentic/evaluation/cli.py index 7ac868d9..80dc418a 100644 --- a/fireflyframework_agentic/evaluation/cli.py +++ b/fireflyframework_agentic/evaluation/cli.py @@ -48,7 +48,8 @@ from fireflyframework_agentic.evaluation.judge_client import build_embedder from fireflyframework_agentic.evaluation.matcher import matches from fireflyframework_agentic.evaluation.registry import load_registry -from fireflyframework_agentic.evaluation.scorecard import render_scorecard, verdict as get_verdict +from fireflyframework_agentic.evaluation.scorecard import render_scorecard +from fireflyframework_agentic.evaluation.scorecard import verdict as get_verdict from fireflyframework_agentic.evaluation.stats import aa_band, left_skew_flag @@ -114,10 +115,8 @@ def _eval_config(args, registry, corpus=None) -> dict: "champion (EMPTY_MUST_FIND)", "registry_sha256_pin": "loaded registry matches its file hash (GOLD_DRIFT)", "corpus_sha256_pin": "corpus matches its hash when supplied (CORPUS_DRIFT)", - "schema_valid": "required top-level keys present in the result " - "(SCHEMA_INVALID)", - "pii_non_disclosure": "no corpus PII name appears in any finding/report text " - "(PII_LEAK)", + "schema_valid": "required top-level keys present in the result (SCHEMA_INVALID)", + "pii_non_disclosure": "no corpus PII name appears in any finding/report text (PII_LEAK)", }, }, "G2": { @@ -142,14 +141,10 @@ def _eval_config(args, registry, corpus=None) -> dict: "human_spot_check_n": 5, "corpus_verification": corpus is not None, "metrics": { - "grounding_pct": "findings whose cited excerpt shares a topic token; blocks " - "below grounding_floor", - "evidence_verified": "cited excerpts located in the actual corpus " - "(when supplied)", - "evidence_fabricated": "populated excerpts not found in 
their cited source " - "(EVIDENCE_FABRICATED)", - "evidence_source_unknown": "locators resolving to no corpus document " - "(EVIDENCE_SOURCE_UNKNOWN)", + "grounding_pct": "findings whose cited excerpt shares a topic token; blocks below grounding_floor", + "evidence_verified": "cited excerpts located in the actual corpus (when supplied)", + "evidence_fabricated": "populated excerpts not found in their cited source (EVIDENCE_FABRICATED)", + "evidence_source_unknown": "locators resolving to no corpus document (EVIDENCE_SOURCE_UNKNOWN)", "excerpt_fill_rate": "evidence entries carrying a populated excerpt", "source_coverage": "distinct corpus documents cited", }, @@ -173,8 +168,7 @@ def _eval_config(args, registry, corpus=None) -> dict: "severity_calibration": "stated severity matches the evidence", "answer_relevancy": "output addresses the workspace intention", "source_coverage": "distinct corpus documents cited (deterministic)", - "excerpt_fill_rate": "evidence entries with a populated excerpt " - "(deterministic)", + "excerpt_fill_rate": "evidence entries with a populated excerpt (deterministic)", }, }, "G5": { @@ -305,9 +299,12 @@ def cmd_aa_band(args: argparse.Namespace) -> int: for rp in args.results: result = _load_json(rp) g2 = g2_recall_precision( - result, registry, - recall_metric=args.recall_metric, embed_fn=embed_fn, - tau=args.tau, tau_nc=args.tau_nc, + result, + registry, + recall_metric=args.recall_metric, + embed_fn=embed_fn, + tau=args.tau, + tau_nc=args.tau_nc, corpus=corpus, ) if g2.passed or g2.details.get("recall") is not None: @@ -468,15 +465,13 @@ def _add_common(p: argparse.ArgumentParser) -> None: "--tau", type=float, default=float(os.environ.get("FLYEVAL_TAU", "0.70")), - help="cosine similarity threshold for the semantic recall path (real items). " - "Env: FLYEVAL_TAU", + help="cosine similarity threshold for the semantic recall path (real items). 
Env: FLYEVAL_TAU", ) p_gate.add_argument( "--tau-nc", type=float, default=float(os.environ.get("FLYEVAL_TAU_NC", "0.85")), - help="cosine similarity threshold for NC item detection (higher; no source anchor). " - "Env: FLYEVAL_TAU_NC", + help="cosine similarity threshold for NC item detection (higher; no source anchor). Env: FLYEVAL_TAU_NC", ) p_gate.add_argument("--human-signed-off", action="store_true") p_gate.add_argument("--signoffs", type=int, default=0) @@ -495,8 +490,7 @@ def _add_common(p: argparse.ArgumentParser) -> None: "--judge-runs", type=int, default=int(os.environ.get("FLYEVAL_JUDGE_RUNS", "1")), - help="G4 judge runs; the median of numeric scores is kept (odd recommended). " - "Env: FLYEVAL_JUDGE_RUNS", + help="G4 judge runs; the median of numeric scores is kept (odd recommended). Env: FLYEVAL_JUDGE_RUNS", ) p_gate.add_argument( "--judge-concurrency", diff --git a/fireflyframework_agentic/evaluation/corpus.py b/fireflyframework_agentic/evaluation/corpus.py index 32835f2c..34926b41 100644 --- a/fireflyframework_agentic/evaluation/corpus.py +++ b/fireflyframework_agentic/evaluation/corpus.py @@ -80,7 +80,7 @@ def normalize(text: str) -> str: smart quotes, collapse whitespace, casefold.""" text = unicodedata.normalize("NFKC", text) text = text.replace("**", "").replace("*", "") - text = re.sub(r"[\"""''']", "", text) + text = re.sub(r"[\"" "''']", "", text) return re.sub(r"\s+", " ", text).strip().casefold() @@ -129,9 +129,7 @@ def load_corpus(path: str | Path) -> Corpus: def _fragment_coverage(fragment: str, source: str) -> float: """Fraction of fragment covered by matching blocks of >= _MIN_BLOCK_CHARS chars.""" - blocks = difflib.SequenceMatcher( - None, fragment, source, autojunk=False - ).get_matching_blocks() + blocks = difflib.SequenceMatcher(None, fragment, source, autojunk=False).get_matching_blocks() covered = sum(b.size for b in blocks if b.size >= _MIN_BLOCK_CHARS) return covered / len(fragment) @@ -158,11 +156,9 @@ def verify_entry(corpus: 
Corpus, entry: dict) -> str: if not excerpt: return EMPTY - fragments = [ - f.strip() - for f in _SPLICE_PATTERN.split(excerpt) - if len(f.strip()) >= _MIN_FRAGMENT_CHARS - ] or [excerpt] + fragments = [f.strip() for f in _SPLICE_PATTERN.split(excerpt) if len(f.strip()) >= _MIN_FRAGMENT_CHARS] or [ + excerpt + ] for fragment in fragments: if fragment in source: @@ -178,8 +174,4 @@ def verify_evidence_index(corpus: Corpus, result: dict) -> dict[str, str]: Returns {evidence_id: status} over all entries — referenced or not — so the gates share one verification pass. """ - return { - ev["id"]: verify_entry(corpus, ev) - for ev in result.get("evidence_index", []) - if ev.get("id") - } + return {ev["id"]: verify_entry(corpus, ev) for ev in result.get("evidence_index", []) if ev.get("id")} diff --git a/fireflyframework_agentic/evaluation/gates.py b/fireflyframework_agentic/evaluation/gates.py index 057bfea7..fc98d311 100644 --- a/fireflyframework_agentic/evaluation/gates.py +++ b/fireflyframework_agentic/evaluation/gates.py @@ -93,11 +93,7 @@ def _build_evidence_index(result: dict, corpus: Corpus | None = None) -> dict[st if corpus is None: return index statuses = verify_evidence_index(corpus, result) - return { - eid: ev - for eid, ev in index.items() - if statuses[eid] in (VERIFIED, EMPTY) - } + return {eid: ev for eid, ev in index.items() if statuses[eid] in (VERIFIED, EMPTY)} # ── G1: Structural & Safe ──────────────────────────────────────────────────── @@ -322,8 +318,10 @@ def _finding_redundancy_rate(findings: list[dict]) -> float: """Fraction of findings that are near-duplicates of another (Jaccard ≥0.6 on ≥5-char tokens).""" if len(findings) < 2: return 0.0 + def _tok(text: str) -> frozenset[str]: return frozenset(t.lower() for t in text.split() if len(t) >= 5) + token_sets = [_tok(f.get("description", "")) for f in findings] in_redundant: set[int] = set() for i in range(len(token_sets)): @@ -381,9 +379,7 @@ def g2_recall_precision( if item.tier == "NC": 
lexical[item.id] = False elif item.scope == "dependency_graph" and item.from_node: - lexical[item.id] = matcher.matches_dependency_graph_relation( - item, result, evidence_index - ) + lexical[item.id] = matcher.matches_dependency_graph_relation(item, result, evidence_index) else: lexical[item.id] = any( matches(c, item, evidence_index, scope=scope) @@ -394,14 +390,10 @@ def g2_recall_precision( if recall_metric not in ("lexical", "semantic", "hybrid"): raise ValueError(f"unknown recall_metric {recall_metric!r}") if recall_metric in ("semantic", "hybrid") and embed_fn is None: - raise ValueError( - f"recall_metric={recall_metric!r} requires an embedder; pass embed_fn" - ) + raise ValueError(f"recall_metric={recall_metric!r} requires an embedder; pass embed_fn") if embed_fn is not None: - semantic = matcher.semantic_hits( - candidates, registry.items, evidence_index, embed_fn, tau, tau_nc=tau_nc - ) + semantic = matcher.semantic_hits(candidates, registry.items, evidence_index, embed_fn, tau, tau_nc=tau_nc) # dependency_graph relation items have no embedding candidates (§5.3b uses # the endpoint matcher, not per-candidate text embeddings); mirror the # lexical result so semantic/hybrid never under-credits them. 
@@ -424,8 +416,7 @@ def g2_recall_precision( finding_count = len(findings) finding_scoped_items = [i for i in registry.real_items if i.scope == "finding"] findings_matched = sum( - 1 for f in findings - if any(matches(f, item, evidence_index, scope="finding") for item in finding_scoped_items) + 1 for f in findings if any(matches(f, item, evidence_index, scope="finding") for item in finding_scoped_items) ) _sn = { "finding_count": finding_count, @@ -493,9 +484,7 @@ def _semantic_details() -> dict: "lexical_recall": round(_weighted_recall(scored_items, lexical), 4), "semantic_recall": round(_weighted_recall(scored_items, semantic), 4), "hybrid_recall": round( - _weighted_recall( - scored_items, {iid: lexical[iid] or semantic[iid] for iid in lexical} - ), + _weighted_recall(scored_items, {iid: lexical[iid] or semantic[iid] for iid in lexical}), 4, ), "tau": tau, @@ -577,8 +566,8 @@ def g3_grounded( grounded_ids: list[str] = [] # Ungrounded split (§6.3): distinguish format issues from real faithfulness failures. - ungrounded_empty_only: list[str] = [] # every ref had an empty excerpt - ungrounded_populated: list[str] = [] # had populated excerpt(s) but none anchored + ungrounded_empty_only: list[str] = [] # every ref had an empty excerpt + ungrounded_populated: list[str] = [] # had populated excerpt(s) but none anchored # Excerpt fill: count all resolved refs and how many carry a non-empty excerpt. total_refs = 0 @@ -657,18 +646,14 @@ def g3_grounded( "Populated excerpt(s) not found in the cited corpus document — " "the run asserts evidence the source does not contain." ) - return GateResult( - gate="G3", passed=False, reason_code="EVIDENCE_FABRICATED", details=details - ) + return GateResult(gate="G3", passed=False, reason_code="EVIDENCE_FABRICATED", details=details) if unknown_source_ids: details["message"] = ( "Evidence locator(s) resolve to no corpus document — either the " "corpus bundle is incomplete or the run invented a source." 
) - return GateResult( - gate="G3", passed=False, reason_code="EVIDENCE_SOURCE_UNKNOWN", details=details - ) + return GateResult(gate="G3", passed=False, reason_code="EVIDENCE_SOURCE_UNKNOWN", details=details) if grounding_pct < grounding_floor: details["floor"] = grounding_floor @@ -746,8 +731,7 @@ def g5_no_regression( band = noise.get(metric, 0.0) if delta < -band: regressions.append( - f"{metric}: candidate={cand_val:.4f} champion={champ_val:.4f} " - f"delta={delta:+.4f} < -band={-band:.4f}" + f"{metric}: candidate={cand_val:.4f} champion={champ_val:.4f} delta={delta:+.4f} < -band={-band:.4f}" ) elif delta > band: improvements.append(f"{metric}: delta={delta:+.4f} > band={band:.4f}") diff --git a/fireflyframework_agentic/evaluation/judge.py b/fireflyframework_agentic/evaluation/judge.py index a347c8e1..80a90b04 100644 --- a/fireflyframework_agentic/evaluation/judge.py +++ b/fireflyframework_agentic/evaluation/judge.py @@ -142,10 +142,7 @@ def _map_chat(chat_fn, prompts, workers=1): results: list[dict] = [{} for _ in prompts] with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor: - futures = { - executor.submit(chat_fn, system, user): idx - for idx, (system, user) in enumerate(prompts) - } + futures = {executor.submit(chat_fn, system, user): idx for idx, (system, user) in enumerate(prompts)} for future in concurrent.futures.as_completed(futures): idx = futures[future] try: @@ -165,11 +162,7 @@ def source_coverage(result: dict) -> dict: source stems present in evidence_index but cited by no finding. 
""" evidence_index = _evidence_index(result) - all_stems = { - source_stem(ev.get("locator", "")) - for ev in result.get("evidence_index", []) - if ev.get("locator") - } + all_stems = {source_stem(ev.get("locator", "")) for ev in result.get("evidence_index", []) if ev.get("locator")} cited_stems: set[str] = set() for f in result.get("findings", []): for ref in f.get("evidence_refs", []): @@ -245,7 +238,7 @@ def semantic_recovery( cand_vecs = np.asarray(embed_fn(candidate_texts), dtype=np.float64) recovered: list[dict] = [] - for item, ivec in zip(missed_items, item_vecs): + for item, ivec in zip(missed_items, item_vecs, strict=False): best = max((cosine(ivec, cvec) for cvec in cand_vecs), default=0.0) if best >= tau: recovered.append({"id": item.id, "cosine": round(best, 4)}) @@ -307,11 +300,7 @@ def numeric_temporal_fidelity(result: dict, chat_fn, *, workers: int = 1) -> dic source}], count}. """ evidence_index = _evidence_index(result) - scored = [ - (f, excerpts) - for f in result.get("findings", []) - if (excerpts := _cited_excerpts(f, evidence_index)) - ] + scored = [(f, excerpts) for f in result.get("findings", []) if (excerpts := _cited_excerpts(f, evidence_index))] prompts = [ ( SYSTEM, @@ -326,7 +315,7 @@ def numeric_temporal_fidelity(result: dict, chat_fn, *, workers: int = 1) -> dic ] answers = _map_chat(chat_fn, prompts, workers) mismatches: list[dict] = [] - for (f, _excerpts), answer in zip(scored, answers): + for (f, _excerpts), answer in zip(scored, answers, strict=False): for m in answer.get("mismatches", []) or []: mismatches.append( { @@ -395,7 +384,7 @@ def nc_semantic_precision(result: dict, registry, chat_fn, *, workers: int = 1) ] answers = _map_chat(chat_fn, prompts, workers) asserted_ids = [ - item.id for item, a in zip(nc_items, answers) if str(a.get("asserted", "")).lower() == "yes" + item.id for item, a in zip(nc_items, answers, strict=False) if str(a.get("asserted", "")).lower() == "yes" ] return {"asserted": len(asserted_ids), "total": 
len(nc_items), "asserted_ids": asserted_ids} @@ -407,10 +396,7 @@ def fabricated_entity(result: dict, chat_fn) -> dict: excerpts + locators. """ output_text = _output_text(result) - corpus = "\n".join( - f"{ev.get('locator', '')} :: {ev.get('excerpt', '')}" - for ev in result.get("evidence_index", []) - ) + corpus = "\n".join(f"{ev.get('locator', '')} :: {ev.get('excerpt', '')}" for ev in result.get("evidence_index", [])) user = ( "List any system, organization, or metric NAMED in the OUTPUT that does NOT " "appear anywhere in the CORPUS EVIDENCE.\n" @@ -433,8 +419,7 @@ def contradiction(result: dict, chat_fn) -> dict: lines.append(f"{f.get('id', '?')}: {f.get('title', '')} — {f.get('description', '')}") user = ( "Are any two of these FINDINGS mutually contradictory? List each contradicting pair.\n" - 'Reply with ONLY {"pairs": [["", ""], ...]}. Empty list if none.\n\n' - + "\n".join(lines) + 'Reply with ONLY {"pairs": [["", ""], ...]}. Empty list if none.\n\n' + "\n".join(lines) ) pairs = chat_fn(SYSTEM, user).get("pairs", []) or [] return {"count": len(pairs), "pairs": [list(p) for p in pairs]} @@ -514,7 +499,7 @@ def severity_calibration(result: dict, chat_fn, *, workers: int = 1) -> dict: answers = _map_chat(chat_fn, prompts, workers) verdicts: dict[str, str] = {} miscalibrated = 0 - for f, a in zip(findings, answers): + for f, a in zip(findings, answers, strict=False): verdict = str(a.get("calibration", "calibrated")).lower() verdicts[f.get("id", "?")] = verdict if verdict in ("under", "over"): @@ -557,7 +542,7 @@ def surface_deduplication(result: dict, chat_fn, *, workers: int = 1) -> dict: def _toks(node: dict) -> frozenset[str]: return frozenset(node.get("name", "").lower().split()) - PER_SURFACE_CAP = 10 + per_surface_cap = 10 # candidates: (surface, node_a, node_b, parent_process_name) candidates: list[tuple[str, dict, dict, str]] = [] @@ -574,7 +559,7 @@ def _toks(node: dict) -> frozenset[str]: if jac >= 0.30: pairs.append((jac, procs[i], procs[j])) 
pairs.sort(key=lambda x: x[0], reverse=True) - for _jac, a, b in pairs[:PER_SURFACE_CAP]: + for _jac, a, b in pairs[:per_surface_cap]: candidates.append(("process", a, b, "")) # Activities and decisions: within the same parent process only @@ -595,7 +580,7 @@ def _toks(node: dict) -> frozenset[str]: if jac >= 0.30: all_pairs.append((jac, nodes[i], nodes[j], proc_name)) all_pairs.sort(key=lambda x: x[0], reverse=True) - for _jac, a, b, proc_name in all_pairs[:PER_SURFACE_CAP]: + for _jac, a, b, proc_name in all_pairs[:per_surface_cap]: candidates.append((surface_key, a, b, proc_name)) if not candidates: @@ -604,33 +589,37 @@ def _toks(node: dict) -> frozenset[str]: prompts = [] for surface, a, b, parent_proc in candidates: ctx = f"\nPARENT PROCESS: {parent_proc}\n" if parent_proc else "" - prompts.append(( - SYSTEM, - f"Are these two {surface} nodes genuinely DISTINCT process concepts, or is one a " - f"duplicate / sub-case / restatement of the other?\n" - f"{ctx}" - 'Reply with ONLY {"verdict": "DISTINCT" or "DUPLICATE", "reason": ""}.\n\n' - f"{surface.upper()} A: {a.get('name', '')} — {a.get('description', '')}\n" - f"{surface.upper()} B: {b.get('name', '')} — {b.get('description', '')}", - )) + prompts.append( + ( + SYSTEM, + f"Are these two {surface} nodes genuinely DISTINCT process concepts, or is one a " + f"duplicate / sub-case / restatement of the other?\n" + f"{ctx}" + 'Reply with ONLY {"verdict": "DISTINCT" or "DUPLICATE", "reason": ""}.\n\n' + f"{surface.upper()} A: {a.get('name', '')} — {a.get('description', '')}\n" + f"{surface.upper()} B: {b.get('name', '')} — {b.get('description', '')}", + ) + ) answers = _map_chat(chat_fn, prompts, workers) distinct = 0 redundant = 0 redundant_pairs: list[dict] = [] - for (surface, a, b, _parent), answer in zip(candidates, answers): + for (surface, a, b, _parent), answer in zip(candidates, answers, strict=False): verdict = str(answer.get("verdict", "")).upper() if verdict == "DISTINCT": distinct += 1 else: redundant 
+= 1 - redundant_pairs.append({ - "surface": surface, - "a": a.get("name", ""), - "b": b.get("name", ""), - "reason": str(answer.get("reason", "")), - }) + redundant_pairs.append( + { + "surface": surface, + "a": a.get("name", ""), + "b": b.get("name", ""), + "reason": str(answer.get("reason", "")), + } + ) total = distinct + redundant return { @@ -800,9 +789,7 @@ def _run_judge_metric(name: str, fn) -> None: "numeric_temporal_fidelity", lambda: numeric_temporal_fidelity(result, chat_fn, workers=concurrency), ) - _run_judge_metric( - "citation_relevance", lambda: citation_relevance(result, chat_fn, workers=concurrency) - ) + _run_judge_metric("citation_relevance", lambda: citation_relevance(result, chat_fn, workers=concurrency)) _run_judge_metric( "nc_semantic_precision", lambda: nc_semantic_precision(result, registry, chat_fn, workers=concurrency), diff --git a/fireflyframework_agentic/evaluation/judge_client.py b/fireflyframework_agentic/evaluation/judge_client.py index 1af17f53..e4b58dea 100644 --- a/fireflyframework_agentic/evaluation/judge_client.py +++ b/fireflyframework_agentic/evaluation/judge_client.py @@ -245,8 +245,7 @@ def _dispatch(self, system: str, user: str, max_tokens: int) -> str: if self.provider == "ollama": return self._ollama(system, user, max_tokens) raise ValueError( - f"unknown judge provider {self.provider!r} in {self.model_spec!r}; " - "use anthropic:/openai:/azure:/ollama:" + f"unknown judge provider {self.provider!r} in {self.model_spec!r}; use anthropic:/openai:/azure:/ollama:" ) def _anthropic(self, system: str, user: str, max_tokens: int) -> str: @@ -262,9 +261,7 @@ def _anthropic(self, system: str, user: str, max_tokens: int) -> str: } headers = {"x-api-key": api_key, "anthropic-version": "2023-06-01"} resp = _http_post_json("https://api.anthropic.com/v1/messages", headers, body, self.timeout) - text = next( - (b.get("text") for b in resp.get("content", []) if b.get("type") == "text"), None - ) + text = next((b.get("text") for b in 
resp.get("content", []) if b.get("type") == "text"), None) if not text: raise RuntimeError(f"judge returned no text: {resp}") return text @@ -283,9 +280,7 @@ def _openai(self, system: str, user: str, max_tokens: int) -> str: ], } headers = {"Authorization": f"Bearer {api_key}"} - resp = _http_post_json( - "https://api.openai.com/v1/chat/completions", headers, body, self.timeout - ) + resp = _http_post_json("https://api.openai.com/v1/chat/completions", headers, body, self.timeout) return _extract_openai_text(resp) def _azure(self, system: str, user: str, max_tokens: int) -> str: @@ -297,10 +292,7 @@ def _azure(self, system: str, user: str, max_tokens: int) -> str: raise RuntimeError("AZURE_OPENAI_API_KEY not set") api_version = _env("AZURE_OPENAI_API_VERSION") or "2024-06-01" # Azure deployment lives in the URL path, not the JSON body. - url = ( - f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/chat/completions" - f"?api-version={api_version}" - ) + url = f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/chat/completions?api-version={api_version}" body = { "max_tokens": max_tokens, "temperature": 0.0, @@ -373,10 +365,7 @@ def embed(self, texts: list[str]) -> np.ndarray: if not api_key: raise RuntimeError("AZURE_OPENAI_API_KEY not set") api_version = _env("AZURE_OPENAI_API_VERSION") or "2024-06-01" - url = ( - f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/embeddings" - f"?api-version={api_version}" - ) + url = f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/embeddings?api-version={api_version}" headers = {"api-key": api_key} vectors = self._embed_with_split(texts, url, headers) return np.asarray(vectors, dtype=np.float32) @@ -438,9 +427,7 @@ def build_embedder(spec: str): return OpenAIEmbedder(model or "text-embedding-3-small").embed if provider == "azure": return AzureOpenAIEmbedder(model or "text-embedding-3-small").embed - raise NotImplementedError( - f"embedder backend {provider!r} not implemented yet; add it in 
build_embedder()" - ) + raise NotImplementedError(f"embedder backend {provider!r} not implemented yet; add it in build_embedder()") def cosine(a, b) -> float: diff --git a/fireflyframework_agentic/evaluation/matcher.py b/fireflyframework_agentic/evaluation/matcher.py index b4d81f44..ccf61c96 100644 --- a/fireflyframework_agentic/evaluation/matcher.py +++ b/fireflyframework_agentic/evaluation/matcher.py @@ -113,9 +113,7 @@ def _keyword_anchored(desc: str, keywords: list[str]) -> bool: if not keywords: return False desc_lower = desc.lower() - return any( - re.search(r"\b" + re.escape(kw.lower()) + r"\b", desc_lower) for kw in keywords - ) + return any(re.search(r"\b" + re.escape(kw.lower()) + r"\b", desc_lower) for kw in keywords) def candidate_text(candidate: dict, scope: str) -> str: @@ -141,18 +139,28 @@ def candidate_text(candidate: dict, scope: str) -> str: pain = candidate.get("pain_points") or [] goals_str = " ".join(goals) if isinstance(goals, list) else str(goals) pain_str = " ".join(pain) if isinstance(pain, list) else str(pain) - return " ".join(filter(None, [ - candidate.get("name", ""), - candidate.get("role", ""), - goals_str, - pain_str, - ])) + return " ".join( + filter( + None, + [ + candidate.get("name", ""), + candidate.get("role", ""), + goals_str, + pain_str, + ], + ) + ) if scope == "informal_channel": - return " ".join(filter(None, [ - candidate.get("name", ""), - candidate.get("usage_context", ""), - candidate.get("notes", ""), - ])) + return " ".join( + filter( + None, + [ + candidate.get("name", ""), + candidate.get("usage_context", ""), + candidate.get("notes", ""), + ], + ) + ) # process, decision, system, dependency_graph (diagnostic nodes) return " ".join(filter(None, [candidate.get("name", ""), candidate.get("description", "")])) @@ -246,9 +254,7 @@ def matches_dependency_graph_relation( def _anchor(endpoint_text: str) -> set[str]: return { - a["id"] - for a in all_activities - if a.get("id") and anchored(candidate_text(a, "activity"), 
endpoint_text) + a["id"] for a in all_activities if a.get("id") and anchored(candidate_text(a, "activity"), endpoint_text) } from_ids = _anchor(item.from_node) @@ -268,9 +274,8 @@ def _node_stems(node: dict) -> set[str]: dg = result.get("dependency_graph", {}) for edge in dg.get("activity_edges", []): - if edge.get("from_node") in from_ids and edge.get("to_node") in to_ids: - if _node_stems(edge) & item_stems: - return True + if edge.get("from_node") in from_ids and edge.get("to_node") in to_ids and _node_stems(edge) & item_stems: + return True for path in dg.get("critical_paths", []): if not (_node_stems(path) & item_stems): @@ -325,19 +330,13 @@ def semantic_hits( # Flatten all candidates across scopes, preserving their scope tag for # text extraction and per-item filtering. - scoped: list[tuple[str, dict]] = [ - (scope, cand) - for scope, cands in candidates.items() - for cand in cands - ] + scoped: list[tuple[str, dict]] = [(scope, cand) for scope, cands in candidates.items() for cand in cands] if not scoped: return {item.id: False for item in items} cand_texts = [candidate_text(cand, scope) for scope, cand in scoped] - item_texts = [ - " ".join([item.description or ""] + list(item.keywords or [])).strip() for item in items - ] + item_texts = [" ".join([item.description or ""] + list(item.keywords or [])).strip() for item in items] cand_vecs = np.asarray(embed_fn(cand_texts)) item_vecs = np.asarray(embed_fn(item_texts)) @@ -359,10 +358,7 @@ def semantic_hits( if cosine(cand_vecs[k], item_vec) >= tau_nc: hit = True break - elif ( - shares_source(cand, item, evidence_index) - and cosine(cand_vecs[k], item_vec) >= tau - ): + elif shares_source(cand, item, evidence_index) and cosine(cand_vecs[k], item_vec) >= tau: hit = True break hits[item.id] = hit diff --git a/fireflyframework_agentic/evaluation/registry.py b/fireflyframework_agentic/evaluation/registry.py index 2b869ba9..87c4beb1 100644 --- a/fireflyframework_agentic/evaluation/registry.py +++ 
b/fireflyframework_agentic/evaluation/registry.py @@ -24,6 +24,7 @@ - kappa present (0.0 placeholder allowed; G2 advisory until >= 0.70) - ABANCA DILO items must target a single measured sub-population """ + from __future__ import annotations import hashlib @@ -35,8 +36,15 @@ VALID_TIERS = ("L0", "L1", "L2", "L3", "NC") VALID_SCOPES = ( - "process", "activity", "decision", "finding", "action", - "persona", "system", "informal_channel", "dependency_graph", + "process", + "activity", + "decision", + "finding", + "action", + "persona", + "system", + "informal_channel", + "dependency_graph", ) SCHEMA_VERSION = "lean-1" KAPPA_ADVISORY_THRESHOLD = 0.70 @@ -47,13 +55,13 @@ class RegistryItem: id: str tier: Literal["L0", "L1", "L2", "L3", "NC"] description: str - evidence: list[str] # source file paths (path portion of locator, no #page=N) - scope: str = "finding" # which DiscoveryResult surface to match against (§4.3) + evidence: list[str] # source file paths (path portion of locator, no #page=N) + scope: str = "finding" # which DiscoveryResult surface to match against (§4.3) keywords: list[str] = field(default_factory=list) weight: float = 1.0 - from_node: str = "" # dependency_graph relation items only - to_node: str = "" # dependency_graph relation items only - relation: str = "" # defaults to "precedes" when from/to present + from_node: str = "" # dependency_graph relation items only + to_node: str = "" # dependency_graph relation items only + relation: str = "" # defaults to "precedes" when from/to present @dataclass(frozen=True) @@ -87,10 +95,7 @@ def sha256(self) -> str: def _validate(raw: dict, path: Path) -> None: if raw.get("schema_version") != SCHEMA_VERSION: - raise ValueError( - f"{path.name}: schema_version must be '{SCHEMA_VERSION}', " - f"got {raw.get('schema_version')!r}" - ) + raise ValueError(f"{path.name}: schema_version must be '{SCHEMA_VERSION}', got {raw.get('schema_version')!r}") for fname in ("corpus", "author", "date"): if not raw.get(fname): 
raise ValueError(f"{path.name}: missing required field '{fname}'") @@ -116,20 +121,17 @@ def _validate(raw: dict, path: Path) -> None: tier = it.get("tier") if tier not in VALID_TIERS: raise ValueError( - f"{path.name}: item '{it.get('id')}' has invalid tier '{tier}'; " - f"must be one of {VALID_TIERS}" + f"{path.name}: item '{it.get('id')}' has invalid tier '{tier}'; must be one of {VALID_TIERS}" ) scope = it.get("scope", "finding") if scope not in VALID_SCOPES: raise ValueError( - f"{path.name}: item '{it.get('id')}' has invalid scope '{scope}'; " - f"must be one of {VALID_SCOPES}" + f"{path.name}: item '{it.get('id')}' has invalid scope '{scope}'; must be one of {VALID_SCOPES}" ) if scope == "dependency_graph": if not it.get("from") or not it.get("to"): raise ValueError( - f"{path.name}: dependency_graph item '{it.get('id')}' must have " - "non-empty 'from' and 'to'" + f"{path.name}: dependency_graph item '{it.get('id')}' must have non-empty 'from' and 'to'" ) else: if "from" in it or "to" in it or "relation" in it: @@ -153,13 +155,13 @@ def _validate(raw: dict, path: Path) -> None: # ABANCA DILO blend guard: items must assert a single sub-population target. # Checks for phrases that would indicate a blended numeric target is asserted. # "blend" alone is too broad (items may reference it negatively). 
- BLEND_PHRASES = ("combined distribution", "across all offices regardless of segment") + blend_phrases = ("combined distribution", "across all offices regardless of segment") for it in items: if it.get("tier") == "NC": continue desc = it.get("description", "").lower() iid = it.get("id", "") - if any(phrase in desc for phrase in BLEND_PHRASES): + if any(phrase in desc for phrase in blend_phrases): raise ValueError( f"{path.name}: item '{iid}' description targets a blended distribution; " "ABANCA DILO items must target a single measured sub-population " diff --git a/fireflyframework_agentic/evaluation/run_config_snapshot.py b/fireflyframework_agentic/evaluation/run_config_snapshot.py index db543129..c029e8e6 100644 --- a/fireflyframework_agentic/evaluation/run_config_snapshot.py +++ b/fireflyframework_agentic/evaluation/run_config_snapshot.py @@ -32,6 +32,7 @@ --options request_options.json \ --commit c107918 """ + from __future__ import annotations import argparse @@ -133,12 +134,8 @@ def write_snapshot(output_dir: str | Path, config: dict) -> Path: def main(argv: list[str] | None = None) -> int: parser = argparse.ArgumentParser(description="Capture the flyradar run configuration.") parser.add_argument("--output-dir", required=True, help="Run directory holding output.json.") - parser.add_argument( - "--options", required=True, help="JSON file of the DiscoveryRequest options that were sent." - ) - parser.add_argument( - "--base-url", default=None, help="flyradar base URL (default: $FLYRADAR_BASE_URL)." 
- ) + parser.add_argument("--options", required=True, help="JSON file of the DiscoveryRequest options that were sent.") + parser.add_argument("--base-url", default=None, help="flyradar base URL (default: $FLYRADAR_BASE_URL).") parser.add_argument("--commit", default=None, help="Deployed flyradar git commit, if known.") args = parser.parse_args(argv) diff --git a/fireflyframework_agentic/evaluation/scorecard.py b/fireflyframework_agentic/evaluation/scorecard.py index b34885e8..da3c4a87 100644 --- a/fireflyframework_agentic/evaluation/scorecard.py +++ b/fireflyframework_agentic/evaluation/scorecard.py @@ -188,13 +188,9 @@ def _render_advisory(report) -> list[str]: d = m["faithfulness"] u = d.get("unsupported_ids", []) extra = f" (unsupported: {', '.join(u)})" if u else "" - lines.append( - f"Faithfulness (entailment): {d.get('supported')}/{d.get('total')} supported{extra}" - ) + lines.append(f"Faithfulness (entailment): {d.get('supported')}/{d.get('total')} supported{extra}") if "numeric_temporal_fidelity" in m: - lines.append( - f"Numeric/temporal fidelity: {m['numeric_temporal_fidelity'].get('count', 0)} mismatch(es)" - ) + lines.append(f"Numeric/temporal fidelity: {m['numeric_temporal_fidelity'].get('count', 0)} mismatch(es)") if "citation_relevance" in m: d = m["citation_relevance"] lines.append( @@ -218,14 +214,10 @@ def _render_advisory(report) -> list[str]: lines.append(f"Contradiction detection: {m['contradiction'].get('count', 0)}") if "actionability" in m: d = m["actionability"] - lines.append( - f"Actionability: {_num(d.get('score'))} (rated {d.get('rated', 0)})" - ) + lines.append(f"Actionability: {_num(d.get('score'))} (rated {d.get('rated', 0)})") if "severity_calibration" in m: d = m["severity_calibration"] - lines.append( - f"Severity calibration: {d.get('miscalibrated', 0)}/{d.get('total', 0)} miscalibrated" - ) + lines.append(f"Severity calibration: {d.get('miscalibrated', 0)}/{d.get('total', 0)} miscalibrated") if "answer_relevancy" in m: 
lines.append(f"Answer relevancy: {_num(m['answer_relevancy'].get('score'))}") if "comparative_vs_champion" in m: @@ -236,14 +228,10 @@ def _render_advisory(report) -> list[str]: d = m["source_coverage"] o = d.get("orphaned", []) extra = f" (orphaned: {', '.join(o)})" if o else "" - lines.append( - f"Source coverage [D]: {d.get('cited')}/{d.get('total')} documents cited{extra}" - ) + lines.append(f"Source coverage [D]: {d.get('cited')}/{d.get('total')} documents cited{extra}") if "excerpt_fill_rate" in m: d = m["excerpt_fill_rate"] - lines.append( - f"Evidence-excerpt fill [D]: {d.get('populated')}/{d.get('total')} populated" - ) + lines.append(f"Evidence-excerpt fill [D]: {d.get('populated')}/{d.get('total')} populated") if "open_gap" in m: gap = (m["open_gap"].get("gap") or "").strip() if gap: @@ -259,9 +247,7 @@ def _render_advisory(report) -> list[str]: json.dumps({"metrics": report.metrics, "details": report.details}, indent=2, default=str), "```", ] - lines.append( - "> Decision support for the G5 human sign-off; advisory until LLM-as-a-Judge calibration (§10)." - ) + lines.append("> Decision support for the G5 human sign-off; advisory until LLM-as-a-Judge calibration (§10).") lines.append("") return lines @@ -284,9 +270,7 @@ def _render_analysis(gate_results: list, advisory=None) -> list[str]: matched = d.get("findings_matched_to_registry", {}).get("fraction", 0.0) tier_summary = ", ".join( - f"{t} {v['hit']}/{v['total']}" - for t, v in tiers.items() - if "hit" in v and "total" in v + f"{t} {v['hit']}/{v['total']}" for t, v in tiers.items() if "hit" in v and "total" in v ) lines.append( f"Lexical recall is **{recall:.3f}** ({tier_summary}). " @@ -300,9 +284,7 @@ def _render_analysis(gate_results: list, advisory=None) -> list[str]: "The run is covering the same ground multiple times rather than broadening coverage." ) else: - lines.append( - f"Finding redundancy is low ({redundancy:.0%}): each finding addresses a distinct topic." 
- ) + lines.append(f"Finding redundancy is low ({redundancy:.0%}): each finding addresses a distinct topic.") lines.append( "_G2 is a topic-level test. A recall of 1.000 means every required topic was " "mentioned somewhere — it does not verify that the specific claims about those " @@ -453,14 +435,10 @@ def _render_analysis(gate_results: list, advisory=None) -> list[str]: flag_names = [g.gate for g in flags] if not flags: - lines.append( - "All deterministic gates pass. The run is ready for G5 human sign-off." - ) + lines.append("All deterministic gates pass. The run is ready for G5 human sign-off.") else: flag_str = ", ".join(flag_names) - lines.append( - f"The run is at **HOLD** due to flags on: {flag_str}. " - ) + lines.append(f"The run is at **HOLD** due to flags on: {flag_str}. ") for g in flags: if g.gate == "G3" and g.reason_code == "EVIDENCE_SOURCE_UNKNOWN": lines.append( diff --git a/fireflyframework_agentic/evaluation/stats.py b/fireflyframework_agentic/evaluation/stats.py index e70c629a..c622588c 100644 --- a/fireflyframework_agentic/evaluation/stats.py +++ b/fireflyframework_agentic/evaluation/stats.py @@ -23,10 +23,11 @@ aggregation bug where the previous runner inherited run 0's grounding report unchanged instead of merging across all runs. 
""" + from __future__ import annotations import statistics -from typing import Sequence +from collections.abc import Sequence def aa_band(scores: Sequence[float], *, percentile: int = 95) -> float: @@ -49,11 +50,7 @@ def aa_band(scores: Sequence[float], *, percentile: int = 95) -> float: scores = list(scores) if len(scores) < 2: raise ValueError(f"aa_band requires >= 2 reruns; got {len(scores)}") - deltas = [ - abs(x - y) - for i, x in enumerate(scores) - for y in scores[i + 1:] - ] + deltas = [abs(x - y) for i, x in enumerate(scores) for y in scores[i + 1 :]] sorted_deltas = sorted(deltas) # Index for the requested percentile; clamp to valid range idx = max(0, min(len(sorted_deltas) - 1, int(len(sorted_deltas) * percentile / 100))) diff --git a/uv.lock b/uv.lock index 7e3b501c..93e18075 100644 --- a/uv.lock +++ b/uv.lock @@ -1209,6 +1209,10 @@ dev = [ embeddings = [ { name = "numpy" }, ] +evaluation = [ + { name = "numpy" }, + { name = "scipy" }, +] google-embeddings = [ { name = "google-generativeai" }, ] @@ -1279,6 +1283,7 @@ requires-dist = [ { name = "mistralai", marker = "extra == 'mistral-embeddings'", specifier = ">=1.0.0" }, { name = "motor", marker = "extra == 'mongodb'", specifier = ">=3.6.0" }, { name = "numpy", marker = "extra == 'embeddings'", specifier = ">=1.26.0" }, + { name = "numpy", marker = "extra == 'evaluation'", specifier = ">=1.26.0" }, { name = "numpy", marker = "extra == 'reasoning-eval'", specifier = ">=2.0.0" }, { name = "openai", marker = "extra == 'azure-embeddings'", specifier = ">=1.0.0" }, { name = "openai", marker = "extra == 'openai-embeddings'", specifier = ">=1.0.0" }, @@ -1304,13 +1309,14 @@ requires-dist = [ { name = "python-dotenv", specifier = ">=1.0.0" }, { name = "qdrant-client", marker = "extra == 'vectorstores-qdrant'", specifier = ">=1.12.0" }, { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.9.0" }, + { name = "scipy", marker = "extra == 'evaluation'", specifier = ">=1.11" }, { name = "sqlalchemy", marker 
= "extra == 'postgres'", specifier = ">=2.0.0" }, { name = "sqlite-vec", marker = "extra == 'vectorstores-sqlite-vec'", specifier = ">=0.1.6" }, { name = "testcontainers", marker = "extra == 'dev'", specifier = ">=4.10.0" }, { name = "voyageai", marker = "extra == 'voyage-embeddings'", specifier = ">=0.3.0" }, { name = "watchfiles", marker = "extra == 'watch'", specifier = ">=0.24.0" }, ] -provides-extras = ["postgres", "mongodb", "security", "embeddings", "openai-embeddings", "cohere-embeddings", "google-embeddings", "mistral-embeddings", "voyage-embeddings", "azure-embeddings", "bedrock-embeddings", "ollama-embeddings", "reasoning-eval", "vectorstores-chroma", "vectorstores-sqlite-vec", "vectorstores-pinecone", "vectorstores-qdrant", "vectorstores-pgvector", "watch", "binary", "all", "dev"] +provides-extras = ["postgres", "mongodb", "security", "embeddings", "openai-embeddings", "cohere-embeddings", "google-embeddings", "mistral-embeddings", "voyage-embeddings", "azure-embeddings", "bedrock-embeddings", "ollama-embeddings", "reasoning-eval", "vectorstores-chroma", "vectorstores-sqlite-vec", "vectorstores-pinecone", "vectorstores-qdrant", "vectorstores-pgvector", "watch", "binary", "all", "evaluation", "dev"] [[package]] name = "flatbuffers" @@ -4489,6 +4495,57 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/87/72/c6c32d2b657fa3dad1de340254e14390b1e334ce38268b7ad51abda3c8c2/s3transfer-0.17.0-py3-none-any.whl", hash = "sha256:ce3801712acf4ad3e89fb9990df97b4972e93f4b3b0004d214be5bce12814c20", size = 86811, upload-time = "2026-04-29T22:07:34.966Z" }, ] +[[package]] +name = "scipy" +version = "1.17.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7a/97/5a3609c4f8d58b039179648e62dd220f89864f56f7357f5d4f45c29eb2cc/scipy-1.17.1.tar.gz", hash = "sha256:95d8e012d8cb8816c226aef832200b1d45109ed4464303e997c5b13122b297c0", size = 30573822, upload-time = 
"2026-02-23T00:26:24.851Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/27/07ee1b57b65e92645f219b37148a7e7928b82e2b5dbeccecb4dff7c64f0b/scipy-1.17.1-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:5e3c5c011904115f88a39308379c17f91546f77c1667cea98739fe0fccea804c", size = 31590199, upload-time = "2026-02-23T00:19:17.192Z" }, + { url = "https://files.pythonhosted.org/packages/ec/ae/db19f8ab842e9b724bf5dbb7db29302a91f1e55bc4d04b1025d6d605a2c5/scipy-1.17.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:6fac755ca3d2c3edcb22f479fceaa241704111414831ddd3bc6056e18516892f", size = 28154001, upload-time = "2026-02-23T00:19:22.241Z" }, + { url = "https://files.pythonhosted.org/packages/5b/58/3ce96251560107b381cbd6e8413c483bbb1228a6b919fa8652b0d4090e7f/scipy-1.17.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:7ff200bf9d24f2e4d5dc6ee8c3ac64d739d3a89e2326ba68aaf6c4a2b838fd7d", size = 20325719, upload-time = "2026-02-23T00:19:26.329Z" }, + { url = "https://files.pythonhosted.org/packages/b2/83/15087d945e0e4d48ce2377498abf5ad171ae013232ae31d06f336e64c999/scipy-1.17.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:4b400bdc6f79fa02a4d86640310dde87a21fba0c979efff5248908c6f15fad1b", size = 22683595, upload-time = "2026-02-23T00:19:30.304Z" }, + { url = "https://files.pythonhosted.org/packages/b4/e0/e58fbde4a1a594c8be8114eb4aac1a55bcd6587047efc18a61eb1f5c0d30/scipy-1.17.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2b64ca7d4aee0102a97f3ba22124052b4bd2152522355073580bf4845e2550b6", size = 32896429, upload-time = "2026-02-23T00:19:35.536Z" }, + { url = "https://files.pythonhosted.org/packages/f5/5f/f17563f28ff03c7b6799c50d01d5d856a1d55f2676f537ca8d28c7f627cd/scipy-1.17.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:581b2264fc0aa555f3f435a5944da7504ea3a065d7029ad60e7c3d1ae09c5464", size = 35203952, upload-time = "2026-02-23T00:19:42.259Z" }, + { url = 
"https://files.pythonhosted.org/packages/8d/a5/9afd17de24f657fdfe4df9a3f1ea049b39aef7c06000c13db1530d81ccca/scipy-1.17.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:beeda3d4ae615106d7094f7e7cef6218392e4465cc95d25f900bebabfded0950", size = 34979063, upload-time = "2026-02-23T00:19:47.547Z" }, + { url = "https://files.pythonhosted.org/packages/8b/13/88b1d2384b424bf7c924f2038c1c409f8d88bb2a8d49d097861dd64a57b2/scipy-1.17.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6609bc224e9568f65064cfa72edc0f24ee6655b47575954ec6339534b2798369", size = 37598449, upload-time = "2026-02-23T00:19:53.238Z" }, + { url = "https://files.pythonhosted.org/packages/35/e5/d6d0e51fc888f692a35134336866341c08655d92614f492c6860dc45bb2c/scipy-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:37425bc9175607b0268f493d79a292c39f9d001a357bebb6b88fdfaff13f6448", size = 36510943, upload-time = "2026-02-23T00:20:50.89Z" }, + { url = "https://files.pythonhosted.org/packages/2a/fd/3be73c564e2a01e690e19cc618811540ba5354c67c8680dce3281123fb79/scipy-1.17.1-cp313-cp313-win_arm64.whl", hash = "sha256:5cf36e801231b6a2059bf354720274b7558746f3b1a4efb43fcf557ccd484a87", size = 24545621, upload-time = "2026-02-23T00:20:55.871Z" }, + { url = "https://files.pythonhosted.org/packages/6f/6b/17787db8b8114933a66f9dcc479a8272e4b4da75fe03b0c282f7b0ade8cd/scipy-1.17.1-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:d59c30000a16d8edc7e64152e30220bfbd724c9bbb08368c054e24c651314f0a", size = 31936708, upload-time = "2026-02-23T00:19:58.694Z" }, + { url = "https://files.pythonhosted.org/packages/38/2e/524405c2b6392765ab1e2b722a41d5da33dc5c7b7278184a8ad29b6cb206/scipy-1.17.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:010f4333c96c9bb1a4516269e33cb5917b08ef2166d5556ca2fd9f082a9e6ea0", size = 28570135, upload-time = "2026-02-23T00:20:03.934Z" }, + { url = 
"https://files.pythonhosted.org/packages/fd/c3/5bd7199f4ea8556c0c8e39f04ccb014ac37d1468e6cfa6a95c6b3562b76e/scipy-1.17.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:2ceb2d3e01c5f1d83c4189737a42d9cb2fc38a6eeed225e7515eef71ad301dce", size = 20741977, upload-time = "2026-02-23T00:20:07.935Z" }, + { url = "https://files.pythonhosted.org/packages/d9/b8/8ccd9b766ad14c78386599708eb745f6b44f08400a5fd0ade7cf89b6fc93/scipy-1.17.1-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:844e165636711ef41f80b4103ed234181646b98a53c8f05da12ca5ca289134f6", size = 23029601, upload-time = "2026-02-23T00:20:12.161Z" }, + { url = "https://files.pythonhosted.org/packages/6d/a0/3cb6f4d2fb3e17428ad2880333cac878909ad1a89f678527b5328b93c1d4/scipy-1.17.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:158dd96d2207e21c966063e1635b1063cd7787b627b6f07305315dd73d9c679e", size = 33019667, upload-time = "2026-02-23T00:20:17.208Z" }, + { url = "https://files.pythonhosted.org/packages/f3/c3/2d834a5ac7bf3a0c806ad1508efc02dda3c8c61472a56132d7894c312dea/scipy-1.17.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74cbb80d93260fe2ffa334efa24cb8f2f0f622a9b9febf8b483c0b865bfb3475", size = 35264159, upload-time = "2026-02-23T00:20:23.087Z" }, + { url = "https://files.pythonhosted.org/packages/4d/77/d3ed4becfdbd217c52062fafe35a72388d1bd82c2d0ba5ca19d6fcc93e11/scipy-1.17.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:dbc12c9f3d185f5c737d801da555fb74b3dcfa1a50b66a1a93e09190f41fab50", size = 35102771, upload-time = "2026-02-23T00:20:28.636Z" }, + { url = "https://files.pythonhosted.org/packages/bd/12/d19da97efde68ca1ee5538bb261d5d2c062f0c055575128f11a2730e3ac1/scipy-1.17.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:94055a11dfebe37c656e70317e1996dc197e1a15bbcc351bcdd4610e128fe1ca", size = 37665910, upload-time = "2026-02-23T00:20:34.743Z" }, + { url = 
"https://files.pythonhosted.org/packages/06/1c/1172a88d507a4baaf72c5a09bb6c018fe2ae0ab622e5830b703a46cc9e44/scipy-1.17.1-cp313-cp313t-win_amd64.whl", hash = "sha256:e30bdeaa5deed6bc27b4cc490823cd0347d7dae09119b8803ae576ea0ce52e4c", size = 36562980, upload-time = "2026-02-23T00:20:40.575Z" }, + { url = "https://files.pythonhosted.org/packages/70/b0/eb757336e5a76dfa7911f63252e3b7d1de00935d7705cf772db5b45ec238/scipy-1.17.1-cp313-cp313t-win_arm64.whl", hash = "sha256:a720477885a9d2411f94a93d16f9d89bad0f28ca23c3f8daa521e2dcc3f44d49", size = 24856543, upload-time = "2026-02-23T00:20:45.313Z" }, + { url = "https://files.pythonhosted.org/packages/cf/83/333afb452af6f0fd70414dc04f898647ee1423979ce02efa75c3b0f2c28e/scipy-1.17.1-cp314-cp314-macosx_10_14_x86_64.whl", hash = "sha256:a48a72c77a310327f6a3a920092fa2b8fd03d7deaa60f093038f22d98e096717", size = 31584510, upload-time = "2026-02-23T00:21:01.015Z" }, + { url = "https://files.pythonhosted.org/packages/ed/a6/d05a85fd51daeb2e4ea71d102f15b34fedca8e931af02594193ae4fd25f7/scipy-1.17.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:45abad819184f07240d8a696117a7aacd39787af9e0b719d00285549ed19a1e9", size = 28170131, upload-time = "2026-02-23T00:21:05.888Z" }, + { url = "https://files.pythonhosted.org/packages/db/7b/8624a203326675d7746a254083a187398090a179335b2e4a20e2ddc46e83/scipy-1.17.1-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:3fd1fcdab3ea951b610dc4cef356d416d5802991e7e32b5254828d342f7b7e0b", size = 20342032, upload-time = "2026-02-23T00:21:09.904Z" }, + { url = "https://files.pythonhosted.org/packages/c9/35/2c342897c00775d688d8ff3987aced3426858fd89d5a0e26e020b660b301/scipy-1.17.1-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:7bdf2da170b67fdf10bca777614b1c7d96ae3ca5794fd9587dce41eb2966e866", size = 22678766, upload-time = "2026-02-23T00:21:14.313Z" }, + { url = 
"https://files.pythonhosted.org/packages/ef/f2/7cdb8eb308a1a6ae1e19f945913c82c23c0c442a462a46480ce487fdc0ac/scipy-1.17.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:adb2642e060a6549c343603a3851ba76ef0b74cc8c079a9a58121c7ec9fe2350", size = 32957007, upload-time = "2026-02-23T00:21:19.663Z" }, + { url = "https://files.pythonhosted.org/packages/0b/2e/7eea398450457ecb54e18e9d10110993fa65561c4f3add5e8eccd2b9cd41/scipy-1.17.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:eee2cfda04c00a857206a4330f0c5e3e56535494e30ca445eb19ec624ae75118", size = 35221333, upload-time = "2026-02-23T00:21:25.278Z" }, + { url = "https://files.pythonhosted.org/packages/d9/77/5b8509d03b77f093a0d52e606d3c4f79e8b06d1d38c441dacb1e26cacf46/scipy-1.17.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d2650c1fb97e184d12d8ba010493ee7b322864f7d3d00d3f9bb97d9c21de4068", size = 35042066, upload-time = "2026-02-23T00:21:31.358Z" }, + { url = "https://files.pythonhosted.org/packages/f9/df/18f80fb99df40b4070328d5ae5c596f2f00fffb50167e31439e932f29e7d/scipy-1.17.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:08b900519463543aa604a06bec02461558a6e1cef8fdbb8098f77a48a83c8118", size = 37612763, upload-time = "2026-02-23T00:21:37.247Z" }, + { url = "https://files.pythonhosted.org/packages/4b/39/f0e8ea762a764a9dc52aa7dabcfad51a354819de1f0d4652b6a1122424d6/scipy-1.17.1-cp314-cp314-win_amd64.whl", hash = "sha256:3877ac408e14da24a6196de0ddcace62092bfc12a83823e92e49e40747e52c19", size = 37290984, upload-time = "2026-02-23T00:22:35.023Z" }, + { url = "https://files.pythonhosted.org/packages/7c/56/fe201e3b0f93d1a8bcf75d3379affd228a63d7e2d80ab45467a74b494947/scipy-1.17.1-cp314-cp314-win_arm64.whl", hash = "sha256:f8885db0bc2bffa59d5c1b72fad7a6a92d3e80e7257f967dd81abb553a90d293", size = 25192877, upload-time = "2026-02-23T00:22:39.798Z" }, + { url = 
"https://files.pythonhosted.org/packages/96/ad/f8c414e121f82e02d76f310f16db9899c4fcde36710329502a6b2a3c0392/scipy-1.17.1-cp314-cp314t-macosx_10_14_x86_64.whl", hash = "sha256:1cc682cea2ae55524432f3cdff9e9a3be743d52a7443d0cba9017c23c87ae2f6", size = 31949750, upload-time = "2026-02-23T00:21:42.289Z" }, + { url = "https://files.pythonhosted.org/packages/7c/b0/c741e8865d61b67c81e255f4f0a832846c064e426636cd7de84e74d209be/scipy-1.17.1-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:2040ad4d1795a0ae89bfc7e8429677f365d45aa9fd5e4587cf1ea737f927b4a1", size = 28585858, upload-time = "2026-02-23T00:21:47.706Z" }, + { url = "https://files.pythonhosted.org/packages/ed/1b/3985219c6177866628fa7c2595bfd23f193ceebbe472c98a08824b9466ff/scipy-1.17.1-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:131f5aaea57602008f9822e2115029b55d4b5f7c070287699fe45c661d051e39", size = 20757723, upload-time = "2026-02-23T00:21:52.039Z" }, + { url = "https://files.pythonhosted.org/packages/c0/19/2a04aa25050d656d6f7b9e7b685cc83d6957fb101665bfd9369ca6534563/scipy-1.17.1-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:9cdc1a2fcfd5c52cfb3045feb399f7b3ce822abdde3a193a6b9a60b3cb5854ca", size = 23043098, upload-time = "2026-02-23T00:21:56.185Z" }, + { url = "https://files.pythonhosted.org/packages/86/f1/3383beb9b5d0dbddd030335bf8a8b32d4317185efe495374f134d8be6cce/scipy-1.17.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6e3dcd57ab780c741fde8dc68619de988b966db759a3c3152e8e9142c26295ad", size = 33030397, upload-time = "2026-02-23T00:22:01.404Z" }, + { url = "https://files.pythonhosted.org/packages/41/68/8f21e8a65a5a03f25a79165ec9d2b28c00e66dc80546cf5eb803aeeff35b/scipy-1.17.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a9956e4d4f4a301ebf6cde39850333a6b6110799d470dbbb1e25326ac447f52a", size = 35281163, upload-time = "2026-02-23T00:22:07.024Z" }, + { url = 
"https://files.pythonhosted.org/packages/84/8d/c8a5e19479554007a5632ed7529e665c315ae7492b4f946b0deb39870e39/scipy-1.17.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:a4328d245944d09fd639771de275701ccadf5f781ba0ff092ad141e017eccda4", size = 35116291, upload-time = "2026-02-23T00:22:12.585Z" }, + { url = "https://files.pythonhosted.org/packages/52/52/e57eceff0e342a1f50e274264ed47497b59e6a4e3118808ee58ddda7b74a/scipy-1.17.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a77cbd07b940d326d39a1d1b37817e2ee4d79cb30e7338f3d0cddffae70fcaa2", size = 37682317, upload-time = "2026-02-23T00:22:18.513Z" }, + { url = "https://files.pythonhosted.org/packages/11/2f/b29eafe4a3fbc3d6de9662b36e028d5f039e72d345e05c250e121a230dd4/scipy-1.17.1-cp314-cp314t-win_amd64.whl", hash = "sha256:eb092099205ef62cd1782b006658db09e2fed75bffcae7cc0d44052d8aa0f484", size = 37345327, upload-time = "2026-02-23T00:22:24.442Z" }, + { url = "https://files.pythonhosted.org/packages/07/39/338d9219c4e87f3e708f18857ecd24d22a0c3094752393319553096b98af/scipy-1.17.1-cp314-cp314t-win_arm64.whl", hash = "sha256:200e1050faffacc162be6a486a984a0497866ec54149a01270adc8a59b7c7d21", size = 25489165, upload-time = "2026-02-23T00:22:29.563Z" }, +] + [[package]] name = "secretstorage" version = "3.5.0" From 9c3555d03331bb8e05361dc49865df0355171d29 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:18:11 +0200 Subject: [PATCH 15/48] chore(evaluation): delete cli.py --- fireflyframework_agentic/evaluation/cli.py | 573 --------------------- 1 file changed, 573 deletions(-) delete mode 100644 fireflyframework_agentic/evaluation/cli.py diff --git a/fireflyframework_agentic/evaluation/cli.py b/fireflyframework_agentic/evaluation/cli.py deleted file mode 100644 index 7ac868d9..00000000 --- a/fireflyframework_agentic/evaluation/cli.py +++ /dev/null @@ -1,573 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may 
not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""flyeval — FlyRadar Lean Core evaluation CLI. - -Usage ------ - flyeval gate --result R.json --registry REG.json [--baseline B.json] [--judge-model P:M] - flyeval aa-band --results R1.json R2.json ... --registry REG.json - flyeval day-zero --result R.json --registry REG.json --baseline B.json --signoffs 2 - flyeval invalidate --baseline B.json --reason "..." - -The deterministic gates G1-G3 + G5 (human sign-off) decide the verdict: every -subcommand exits 0 on PROMOTE, 1 on HOLD. G4 (the --judge-model LLM-as-a-Judge, -on by default, --no-judge to skip) is non-blocking — it prints advisory signals -and never changes the verdict or the exit code. 
-""" - -from __future__ import annotations - -import argparse -import hashlib -import json -import os -import sys -from pathlib import Path - -from fireflyframework_agentic.evaluation import __version__ -from fireflyframework_agentic.evaluation.champion import ( - ChampionRecord, - invalidate_champion, - load_champion, - save_champion, -) -from fireflyframework_agentic.evaluation.corpus import load_corpus -from fireflyframework_agentic.evaluation.gates import g2_recall_precision, run_gates -from fireflyframework_agentic.evaluation.judge import run_judge -from fireflyframework_agentic.evaluation.judge_client import build_embedder -from fireflyframework_agentic.evaluation.matcher import matches -from fireflyframework_agentic.evaluation.registry import load_registry -from fireflyframework_agentic.evaluation.scorecard import render_scorecard, verdict as get_verdict -from fireflyframework_agentic.evaluation.stats import aa_band, left_skew_flag - - -def _load_json(path: str) -> dict: - return json.loads(Path(path).read_text(encoding="utf-8")) - - -def _lexical_missed_ids(result: dict, registry) -> list[str]: - """Scored (non-L3) real-item ids matched by no finding — the G2 lexical misses G4 recovers.""" - evidence_index = {ev["id"]: ev for ev in result.get("evidence_index", []) if ev.get("id")} - findings = result.get("findings", []) - scored = [i for i in registry.real_items if i.tier != "L3"] - return [i.id for i in scored if not any(matches(f, i, evidence_index) for f in findings)] - - -def _read_experiment_config(result_path: str) -> dict | None: - """Read the experiment_configuration.json recorded next to the run's output.json. - - The experiment config records how the run was generated; it is authored by the - generation step at run time. Evaluation only reads it for display and never - writes or overwrites it. Returns None when the run has no recorded config. 
- """ - path = Path(result_path).parent / "experiment_configuration.json" - if not path.exists(): - return None - return json.loads(path.read_text(encoding="utf-8")) - - -def _write_eval_config(result_path: str, config: dict) -> Path: - """Write evaluation_configuration.json next to the run's output.json. - - The evaluation config is authored by flyeval at gate time (registry/corpus SHAs, - recall metric, floors, judge settings), so unlike the experiment config it is - owned here and safe to (over)write each run. It mirrors the block embedded in - the scorecard, as a machine-readable artifact. - """ - path = Path(result_path).parent / "evaluation_configuration.json" - path.write_text(json.dumps(config, indent=2, ensure_ascii=False) + "\n", encoding="utf-8") - return path - - -def _eval_config(args, registry, corpus=None) -> dict: - """Capture the run's evaluation configuration for provenance. - - Uses getattr defaults so it works for both `gate` (has every flag) and - `day-zero` (lacks the gate-only flags, falling back to the lexical/no-judge - defaults, which honestly reflects how day-zero scores). 
- """ - jm = getattr(args, "judge_model", None) - baseline = getattr(args, "baseline", None) - tau = getattr(args, "tau", 0.70) - return { - "evaluator_version": __version__, - "registry_sha256": registry.sha256(), - "corpus_sha256": corpus.sha256 if corpus else None, - "model_id": getattr(args, "model_id", None) or "unknown", - "gates": { - "G1": { - "name": "Structural & Safe", - "pii_list": getattr(args, "pii_list", None) or [], - "metrics": { - "empty_must_find": "registry has >=1 must-find item; guards the fake-100% " - "champion (EMPTY_MUST_FIND)", - "registry_sha256_pin": "loaded registry matches its file hash (GOLD_DRIFT)", - "corpus_sha256_pin": "corpus matches its hash when supplied (CORPUS_DRIFT)", - "schema_valid": "required top-level keys present in the result " - "(SCHEMA_INVALID)", - "pii_non_disclosure": "no corpus PII name appears in any finding/report text " - "(PII_LEAK)", - }, - }, - "G2": { - "name": "Recall & Precision", - "recall_metric": getattr(args, "recall_metric", "lexical"), - "recall_floor": getattr(args, "recall_floor", 0.70), - "tau": tau, - "tau_nc": getattr(args, "tau_nc", 0.85), - "embedder": getattr(args, "embedder", None), - "metrics": { - "lexical_recall": "token-overlap recall (always reported)", - "semantic_recall": "embedding-similarity recall at >= tau (needs embedder)", - "hybrid_recall": "per item, a lexical OR semantic match", - "per_tier_recall": "hit/total per tier L0-L3; an L0 miss blocks", - "nc_precision": "negative-control items wrongly emitted; an NC hit blocks", - "finding_redundancy_rate": "fraction of findings duplicating another's topic", - }, - }, - "G3": { - "name": "Grounded", - "grounding_floor": getattr(args, "grounding_floor", 0.90), - "human_spot_check_n": 5, - "corpus_verification": corpus is not None, - "metrics": { - "grounding_pct": "findings whose cited excerpt shares a topic token; blocks " - "below grounding_floor", - "evidence_verified": "cited excerpts located in the actual corpus " - "(when 
supplied)", - "evidence_fabricated": "populated excerpts not found in their cited source " - "(EVIDENCE_FABRICATED)", - "evidence_source_unknown": "locators resolving to no corpus document " - "(EVIDENCE_SOURCE_UNKNOWN)", - "excerpt_fill_rate": "evidence entries carrying a populated excerpt", - "source_coverage": "distinct corpus documents cited", - }, - }, - "G4": { - "name": "LLM Judge (advisory, non-blocking)", - "judge_model": jm, - "judge_runs": getattr(args, "judge_runs", 1) if jm else None, - "judge_concurrency": getattr(args, "judge_concurrency", 1) if jm else None, - "judge_temperature": 0.0 if jm else None, - "tau": tau if jm else None, - "metrics": { - "faithfulness": "each finding's claim entailed by its cited evidence", - "numeric_temporal_fidelity": "numbers and dates in findings match the evidence", - "citation_relevance": "cited evidence refs are on-topic (context precision)", - "nc_semantic_precision": "negative-control items semantically asserted", - "fabricated_entity": "named entities absent from the corpus", - "contradiction": "findings contradicting the evidence or each other", - "open_gap": "a consequential issue the output failed to surface", - "actionability": "proposed actions are specific and actionable", - "severity_calibration": "stated severity matches the evidence", - "answer_relevancy": "output addresses the workspace intention", - "source_coverage": "distinct corpus documents cited (deterministic)", - "excerpt_fill_rate": "evidence entries with a populated excerpt " - "(deterministic)", - }, - }, - "G5": { - "name": "No-regression / promotion", - "is_day_zero": baseline is None, - "human_signed_off": getattr(args, "human_signed_off", False), - "signoffs": getattr(args, "signoffs", 0), - "baseline": baseline, - "baseline_sha256": _file_sha256(baseline) if baseline else None, - "metrics": { - "improvements": "metrics beating the champion by more than the AA noise band", - "regressions": "metrics that regressed versus the champion", - 
"noise_band": "per-metric AA noise floor a candidate must exceed", - "guardrail_regression": "any guardrail metric that dropped", - "signoffs": "independent human sign-offs recorded", - }, - }, - }, - } - - -def _file_sha256(path: str) -> str | None: - """SHA-256 of a file's bytes, or None when it can't be read.""" - try: - return hashlib.sha256(Path(path).read_bytes()).hexdigest() - except OSError: - return None - - -# ── gate ────────────────────────────────────────────────────────────────────── - - -def cmd_gate(args: argparse.Namespace) -> int: - if getattr(args, "no_judge", False): - args.judge_model = None # explicit opt-out; G4 runs by default otherwise - result = _load_json(args.result) - registry = load_registry(args.registry) - corpus = load_corpus(args.corpus) if args.corpus else None - champion = load_champion(args.baseline) if args.baseline else None - champion_scores = champion.scores if champion else None - aa_noise = champion.aa_noise if champion else None - - embed_fn = build_embedder(args.embedder) if args.embedder else None - - if args.recall_metric in ("hybrid", "semantic") and embed_fn is None: - print( - f"ERROR: --recall-metric {args.recall_metric} requires --embedder.\n" - " Example: --embedder openai:text-embedding-3-small", - file=sys.stderr, - ) - return 2 - - gate_results = run_gates( - result, - registry, - args.registry, - pii_list=args.pii_list or [], - recall_floor=args.recall_floor, - grounding_floor=args.grounding_floor, - champion_scores=champion_scores, - aa_noise=aa_noise, - is_day_zero=(champion is None), - human_signed_off=args.human_signed_off, - signoff_count=args.signoffs, - embed_fn=embed_fn, - tau=args.tau, - recall_metric=args.recall_metric, - tau_nc=args.tau_nc, - corpus=corpus, - ) - - # G4 — on by default, non-blocking. Skipped only with --no-judge; never affects the verdict. 
- advisory = None - if args.judge_model: - champion_result = _load_json(args.champion_result) if args.champion_result else None - advisory = run_judge( - result, - registry, - judge_model=args.judge_model, - runs=args.judge_runs, - concurrency=args.judge_concurrency, - pipeline_model=args.model_id or "", - champion_result=champion_result, - embed_fn=embed_fn, - tau=args.tau, - lexical_missed_ids=_lexical_missed_ids(result, registry), - ) - - config = _eval_config(args, registry, corpus) - _write_eval_config(args.result, config) - experiment_config = _read_experiment_config(args.result) - scorecard = render_scorecard( - gate_results, - corpus=registry.corpus, - model_id=args.model_id or "unknown", - run_id=args.run_id or "run", - is_self_graded=True, - kappa_advisory=registry.is_kappa_advisory(), - evidence_unverified=corpus is None, - advisory=advisory, - config=config, - experiment_config=experiment_config, - ) - print(scorecard) - - v = get_verdict(gate_results) - return 0 if v == "PROMOTE" else 1 - - -# ── aa-band ─────────────────────────────────────────────────────────────────── - - -def cmd_aa_band(args: argparse.Namespace) -> int: - registry = load_registry(args.registry) - - if args.recall_metric in ("hybrid", "semantic") and not args.embedder: - print( - f"ERROR: --recall-metric {args.recall_metric} requires --embedder.\n" - " Example: --embedder openai:text-embedding-3-small", - file=sys.stderr, - ) - return 2 - - embed_fn = build_embedder(args.embedder) if args.embedder else None - corpus = load_corpus(args.corpus) if args.corpus else None - scores: list[float] = [] - - for rp in args.results: - result = _load_json(rp) - g2 = g2_recall_precision( - result, registry, - recall_metric=args.recall_metric, embed_fn=embed_fn, - tau=args.tau, tau_nc=args.tau_nc, - corpus=corpus, - ) - if g2.passed or g2.details.get("recall") is not None: - scores.append(g2.details.get("recall", 0.0)) - - if len(scores) < 2: - print( - f"ERROR: need >= 2 runs for aa_band; got 
{len(scores)}. " - "Make sure the registry is non-empty and the runs are valid.", - file=sys.stderr, - ) - return 1 - - band = aa_band(scores) - high_var = left_skew_flag(scores) - print(f"A/A noise band (95th-pct pairwise delta): {band:.4f}") - print(f"Scores across reruns: {[round(s, 4) for s in scores]}") - if high_var: - print("WARNING: HIGH_VARIANCE — min < median - 0.10. Investigate before using this band.") - return 0 - - -# ── day-zero ────────────────────────────────────────────────────────────────── - - -def cmd_day_zero(args: argparse.Namespace) -> int: - result = _load_json(args.result) - registry = load_registry(args.registry) - - if not args.corpus: - print( - "ERROR: day-zero (a promotion decision) requires --corpus for evidence\n" - "verification — a champion must not be minted on unverified evidence.\n" - " Supply the run's input bundle, e.g. --corpus experiments//input.json", - file=sys.stderr, - ) - return 2 - corpus = load_corpus(args.corpus) - - if args.signoffs < 2: - print( - f"ERROR: Day-Zero requires 2 independent human sign-offs; got {args.signoffs}.", - file=sys.stderr, - ) - return 1 - - gate_results = run_gates( - result, - registry, - args.registry, - is_day_zero=True, - human_signed_off=True, - signoff_count=args.signoffs, - corpus=corpus, - ) - - config = _eval_config(args, registry, corpus) - _write_eval_config(args.result, config) - experiment_config = _read_experiment_config(args.result) - v = get_verdict(gate_results) - scorecard = render_scorecard( - gate_results, - corpus=registry.corpus, - model_id=args.model_id or "unknown", - run_id=args.run_id or "day-zero", - is_self_graded=True, - kappa_advisory=registry.is_kappa_advisory(), - config=config, - experiment_config=experiment_config, - ) - print(scorecard) - - if v == "PROMOTE" and args.baseline: - g2 = next((g for g in gate_results if g.gate == "G2"), None) - g3 = next((g for g in gate_results if g.gate == "G3"), None) - scores = {} - if g2: - scores["recall"] = 
g2.details.get("recall", 0.0) - if g3: - scores["grounding_pct"] = g3.details.get("grounding_pct", 0.0) - - champion = ChampionRecord( - corpus=registry.corpus, - run_id=args.run_id or "day-zero", - model_id=args.model_id or "unknown", - registry_sha256=registry.sha256(), - scores=scores, - is_day_zero=True, - human_sign_offs=[f"signoff-{i + 1}" for i in range(args.signoffs)], - config=config, - corpus_sha256=corpus.sha256, - ) - save_champion( - args.baseline, - champion, - summary=f"Day-Zero champion for {registry.corpus}", - date=args.date or "unknown", - ) - print(f"\nDay-Zero champion saved to {args.baseline}") - - return 0 if v == "PROMOTE" else 1 - - -# ── invalidate ──────────────────────────────────────────────────────────────── - - -def cmd_invalidate(args: argparse.Namespace) -> int: - invalidate_champion(args.baseline, reason=args.reason, date=args.date or "unknown") - print(f"Champion invalidated in {args.baseline}. Reason: {args.reason}") - return 0 - - -# ── parser ──────────────────────────────────────────────────────────────────── - - -def build_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser( - prog="flyeval", - description="FlyRadar Lean Core eval: G1-G3 + G5 deterministic, G4 judge on by default", - ) - sub = parser.add_subparsers(dest="command", required=True) - - def _add_common(p: argparse.ArgumentParser) -> None: - p.add_argument("--result", required=True, help="Path to DiscoveryResult JSON") - p.add_argument("--registry", required=True, help="Path to lean-1 registry JSON") - p.add_argument( - "--corpus", - help="Path to the run's input.json corpus bundle — enables deterministic " - "evidence verification (required for day-zero; without it, gate runs " - "carry an EVIDENCE UNVERIFIED disclosure)", - ) - p.add_argument("--baseline", help="Path to baseline.json (per-corpus champion store)") - p.add_argument("--model-id", default="unknown") - p.add_argument("--run-id", default="run") - p.add_argument("--date", 
default="", help="ISO date for promotion log") - - # gate - p_gate = sub.add_parser("gate", help="Run the gates and print a scorecard") - _add_common(p_gate) - p_gate.add_argument("--recall-floor", type=float, default=0.70) - p_gate.add_argument("--grounding-floor", type=float, default=0.90) - p_gate.add_argument("--pii-list", nargs="*", default=[]) - p_gate.add_argument( - "--embedder", - default=os.environ.get("FLYEVAL_EMBEDDER"), - help="opt-in embedder spec for the semantic recall path " - '(e.g. "azure:text-embedding-3-small"); omit for pure-lexical recall. ' - "Env: FLYEVAL_EMBEDDER", - ) - p_gate.add_argument( - "--recall-metric", - choices=["lexical", "semantic", "hybrid"], - default=os.environ.get("FLYEVAL_RECALL_METRIC", "hybrid"), - help="which recall metric GATES (default hybrid; hybrid/semantic require --embedder). " - "Env: FLYEVAL_RECALL_METRIC", - ) - p_gate.add_argument( - "--tau", - type=float, - default=float(os.environ.get("FLYEVAL_TAU", "0.70")), - help="cosine similarity threshold for the semantic recall path (real items). " - "Env: FLYEVAL_TAU", - ) - p_gate.add_argument( - "--tau-nc", - type=float, - default=float(os.environ.get("FLYEVAL_TAU_NC", "0.85")), - help="cosine similarity threshold for NC item detection (higher; no source anchor). " - "Env: FLYEVAL_TAU_NC", - ) - p_gate.add_argument("--human-signed-off", action="store_true") - p_gate.add_argument("--signoffs", type=int, default=0) - p_gate.add_argument( - "--judge-model", - default=os.environ.get("FLYEVAL_JUDGE_MODEL", "anthropic:claude-sonnet-4-6"), - help="provider:model for the non-blocking G4 LLM-as-a-Judge (e.g. azure:gpt-4o). " - "Runs by default; pass --no-judge to skip G4. 
Env: FLYEVAL_JUDGE_MODEL", - ) - p_gate.add_argument( - "--no-judge", - action="store_true", - help="skip the G4 LLM-as-a-Judge (it runs by default).", - ) - p_gate.add_argument( - "--judge-runs", - type=int, - default=int(os.environ.get("FLYEVAL_JUDGE_RUNS", "1")), - help="G4 judge runs; the median of numeric scores is kept (odd recommended). " - "Env: FLYEVAL_JUDGE_RUNS", - ) - p_gate.add_argument( - "--judge-concurrency", - type=int, - default=int(os.environ.get("FLYEVAL_JUDGE_CONCURRENCY", "1")), - help="bounded fan-out for the per-item G4 [J] metrics (1 = sequential; " - ">=2 runs each metric's chat calls across a thread pool, order preserved). " - "Env: FLYEVAL_JUDGE_CONCURRENCY", - ) - p_gate.add_argument( - "--champion-result", - help="Path to the champion's output.json for the G4 comparative-review metric", - ) - p_gate.set_defaults(func=cmd_gate) - - # aa-band - p_aa = sub.add_parser("aa-band", help="Compute A/A noise band from champion reruns") - p_aa.add_argument( - "--results", - nargs="+", - required=True, - help="Paths to champion-rerun result JSON files (>= 2)", - ) - p_aa.add_argument("--registry", required=True) - p_aa.add_argument( - "--recall-metric", - choices=["lexical", "semantic", "hybrid"], - default="hybrid", - help="recall metric to use — must match the champion's metric (default hybrid; " - "hybrid/semantic require --embedder)", - ) - p_aa.add_argument( - "--embedder", - default=None, - help="embedder spec for semantic/hybrid recall (e.g. 
ollama:bge-m3)", - ) - p_aa.add_argument("--tau", type=float, default=0.70) - p_aa.add_argument("--tau-nc", type=float, default=0.85) - p_aa.add_argument( - "--corpus", - help="Path to input.json — must match the gate's corpus setting so the " - "band is computed under the same evidence filtering as the champion", - ) - p_aa.set_defaults(func=cmd_aa_band) - - # day-zero - p_dz = sub.add_parser("day-zero", help="Promote the inaugural champion (Day-Zero protocol)") - _add_common(p_dz) - p_dz.add_argument( - "--signoffs", - type=int, - default=0, - help="Number of independent human sign-offs collected (need 2)", - ) - p_dz.set_defaults(func=cmd_day_zero) - - # invalidate - p_inv = sub.add_parser("invalidate", help="Invalidate the current champion") - p_inv.add_argument("--baseline", required=True) - p_inv.add_argument("--reason", required=True) - p_inv.add_argument("--date", default="") - p_inv.set_defaults(func=cmd_invalidate) - - return parser - - -def main() -> None: - parser = build_parser() - args = parser.parse_args() - sys.exit(args.func(args)) - - -if __name__ == "__main__": - main() From e9fd9651a017a037330ff698f0768572d0d3f557 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:18:18 +0200 Subject: [PATCH 16/48] chore(evaluation): delete gates.py --- fireflyframework_agentic/evaluation/gates.py | 840 ------------------- 1 file changed, 840 deletions(-) delete mode 100644 fireflyframework_agentic/evaluation/gates.py diff --git a/fireflyframework_agentic/evaluation/gates.py b/fireflyframework_agentic/evaluation/gates.py deleted file mode 100644 index 057bfea7..00000000 --- a/fireflyframework_agentic/evaluation/gates.py +++ /dev/null @@ -1,840 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Four gates — every gate always runs; a failure raises a flag, not a veto. - -Gate pipeline (EVALUATION_FRAMEWORK.md §6): - G1 — Structural & Safe - G2 — Must-finds & negative controls - G3 — Evidence (grounding) - G5 — No-regression / promotion (human decision) - -Each gate is a pure function of the result dict + supporting inputs. -run_gates() always executes all four gates and returns all four results so -the scorecard carries the complete picture regardless of which flags fire. -""" - -from __future__ import annotations - -from dataclasses import dataclass, field - -from fireflyframework_agentic.evaluation import matcher -from fireflyframework_agentic.evaluation.corpus import ( - EMPTY, - FABRICATED, - SOURCE_UNKNOWN, - VERIFIED, - Corpus, - corpus_sha256, - verify_evidence_index, -) -from fireflyframework_agentic.evaluation.matcher import anchored, matches -from fireflyframework_agentic.evaluation.registry import Registry, registry_sha256 - - -@dataclass -class GateResult: - gate: str - passed: bool - reason_code: str = "" - details: dict = field(default_factory=dict) - - def __str__(self) -> str: - status = "PASS" if self.passed else f"FLAG:{self.reason_code}" - return f"[{self.gate}] {status}" - - -class Verdict: - """Promotion gate verdict constants. - - Use ``Verdict.PROMOTE`` when the challenger meets the quality bar and - is safe to become the new champion. Use ``Verdict.HOLD`` when the - challenger does not meet the bar and must be iterated on. 
- """ - - PROMOTE: str = "PROMOTE" - HOLD: str = "HOLD" - - -def render_scorecard(gate_results: list[GateResult]) -> str: - """Render a human-readable scorecard from a list of GateResult objects. - - Emits one line per gate: ``[G1] PASS`` or ``[G2] FLAG:RECALL_BELOW_FLOOR``. - The overall verdict (PROMOTE / HOLD) appears on the final line. A run - promotes only when every gate passes; any flag signals HOLD. - """ - lines = [str(r) for r in gate_results] - all_passed = all(r.passed for r in gate_results) - verdict = Verdict.PROMOTE if all_passed else Verdict.HOLD - lines.append(f"VERDICT: {verdict}") - return "\n".join(lines) - - -def _build_evidence_index(result: dict, corpus: Corpus | None = None) -> dict[str, dict]: - """Index evidence by id; with a corpus, drop entries that fail verification. - - Dropped entries (FABRICATED excerpt or SOURCE_UNKNOWN locator) cannot - contribute source stems to G2's shared-source guard or excerpts to G3's - grounding — a run cannot anchor anything on evidence it invented. EMPTY - entries are kept: an empty excerpt is a format problem, not fabrication, - and its (verified) locator stem is still a legitimate citation. - """ - index = {ev["id"]: ev for ev in result.get("evidence_index", []) if ev.get("id")} - if corpus is None: - return index - statuses = verify_evidence_index(corpus, result) - return { - eid: ev - for eid, ev in index.items() - if statuses[eid] in (VERIFIED, EMPTY) - } - - -# ── G1: Structural & Safe ──────────────────────────────────────────────────── - - -def _name_duplication_rate(nodes: list[dict]) -> float: - """Tier-1 + Tier-2 name clustering; returns 1 - clusters/count. - - Tier 1: same normalized id (lower-case) merges nodes into one cluster. - Tier 2: name token-Jaccard >= 0.6 merges nodes into one cluster. - - Report-only: no gate flag fires on any threshold. 
- """ - n = len(nodes) - if n < 2: - return 0.0 - - group = list(range(n)) - - def _root(i: int) -> int: - while group[i] != i: - group[i] = group[group[i]] - i = group[i] - return i - - seen: dict[str, int] = {} - for i, node in enumerate(nodes): - nid = node.get("id", "").lower() - if nid in seen: - group[_root(i)] = _root(seen[nid]) - else: - seen[nid] = i - - toks = [frozenset(node.get("name", "").lower().split()) for node in nodes] - for i in range(n): - for j in range(i + 1, n): - a, b = toks[i], toks[j] - union_ab = a | b - if union_ab and len(a & b) / len(union_ab) >= 0.6: - group[_root(i)] = _root(j) - - clusters = len({_root(i) for i in range(n)}) - return round(1 - clusters / n, 4) - - -def g1_structural( - result: dict, - registry: Registry, - registry_path: str, - *, - pii_list: list[str] | None = None, - corpus: Corpus | None = None, -) -> GateResult: - """G1 — Structural & Safe (hard veto). - - Checks (in order): - 1. EMPTY_MUST_FIND — must run first; kills the fake-100%-champion bug. - 2. Registry SHA-256 pin: loaded Registry matches the file on disk. - 3. Corpus SHA-256 pin (when a corpus is supplied): same drift guard for - the evidence universe (CORPUS_DRIFT). - 4. Required top-level keys present in result. - 5. PII non-disclosure: no corpus PII name in any finding/report text. 
- """ - # Guard 1: empty registry (fake-champion guard — always first) - if not registry.real_items: - return GateResult( - gate="G1", - passed=False, - reason_code="EMPTY_MUST_FIND", - details={"message": "Registry has zero real items — cannot evaluate recall."}, - ) - - # Guard 2: registry SHA-256 pin - computed_sha = registry_sha256(registry_path) - if computed_sha != registry.sha256(): - return GateResult( - gate="G1", - passed=False, - reason_code="GOLD_DRIFT", - details={ - "message": "Registry file has changed since it was loaded.", - "expected": registry.sha256(), - "actual": computed_sha, - }, - ) - - # Guard 3: corpus SHA-256 pin (CORPUS_DRIFT — the GOLD_DRIFT twin for evidence) - if corpus is not None: - current_corpus_sha = corpus_sha256(corpus.path) - if current_corpus_sha != corpus.sha256: - return GateResult( - gate="G1", - passed=False, - reason_code="CORPUS_DRIFT", - details={ - "message": "Corpus file has changed since it was loaded.", - "expected": corpus.sha256, - "actual": current_corpus_sha, - }, - ) - - # Guard 4: required result keys - required = ("process_graph", "findings", "evidence_index") - missing = [k for k in required if k not in result] - if missing: - return GateResult( - gate="G1", - passed=False, - reason_code="SCHEMA_INVALID", - details={"missing_keys": missing}, - ) - - # Guard 5: PII check - if pii_list: - free_text: list[str] = [] - for finding in result.get("findings", []): - free_text.extend([finding.get("title", ""), finding.get("description", "")]) - for report in result.get("reports", []): - free_text.append(str(report)) - combined = " ".join(free_text).lower() - hits = [name for name in pii_list if name.lower() in combined] - if hits: - return GateResult( - gate="G1", - passed=False, - reason_code="PII_LEAK", - details={ - "message": "Corpus PII names found in findings/reports.", - "matches": hits[:5], - }, - ) - - pg = result.get("process_graph", {}) - processes = pg.get("processes", []) - activities = [a for p in 
processes for a in p.get("activities", [])] - decisions = [d for p in processes for d in p.get("decisions", [])] - dg = result.get("dependency_graph", {}) - - details = { - "registry_sha256": registry.sha256(), - "real_items": len(registry.real_items), - "nc_items": len(registry.nc_items), - "map": { - "processes": { - "count": len(processes), - "duplication_rate": _name_duplication_rate(processes), - }, - "activities": { - "count": len(activities), - "duplication_rate": _name_duplication_rate(activities), - }, - "decisions": { - "count": len(decisions), - "duplication_rate": _name_duplication_rate(decisions), - }, - "personas": { - "count": len(result.get("personas", [])), - "duplication_rate": _name_duplication_rate(result.get("personas", [])), - }, - "systems": { - "count": len(result.get("systems", [])), - "duplication_rate": _name_duplication_rate(result.get("systems", [])), - }, - "informal_channels": { - "count": len(result.get("informal_channels", [])), - "duplication_rate": _name_duplication_rate(result.get("informal_channels", [])), - }, - "dependency_graph_edges": len(dg.get("activity_edges", [])), - }, - } - if corpus is not None: - details["corpus_sha256"] = corpus.sha256 - return GateResult(gate="G1", passed=True, details=details) - - -# ── G2: Recall & Precision ─────────────────────────────────────────────────── - - -def _candidates_by_scope(result: dict) -> dict[str, list[dict]]: - """Build per-scope candidate lists from a DiscoveryResult (§4.3). - - Process candidates are augmented with their children's evidence_refs because - process nodes typically carry no own refs — the source-document guard uses the - union of the process's own refs and all its activities' and decisions' refs. - - dependency_graph-scoped items are relation items (all carry from/to) and are - matched via matcher.matches_dependency_graph_relation() — not through per-candidate - iteration — so no "dependency_graph" key is included here. 
- """ - pg = result.get("process_graph", {}) - processes = pg.get("processes", []) - - def _merge_refs(proc: dict) -> dict: - children_refs = [ - ref - for child_list in (proc.get("activities", []), proc.get("decisions", [])) - for child in child_list - for ref in child.get("evidence_refs", []) - ] - return {**proc, "evidence_refs": list(proc.get("evidence_refs", [])) + children_refs} - - return { - "process": [_merge_refs(p) for p in processes], - "activity": [a for p in processes for a in p.get("activities", [])], - "decision": [d for p in processes for d in p.get("decisions", [])], - "finding": result.get("findings", []), - "action": result.get("proposed_actions", []), - "persona": result.get("personas", []), - "system": result.get("systems", []), - "informal_channel": result.get("informal_channels", []), - } - - -def _weighted_recall(scored_items: list, hits: dict[str, bool]) -> float: - """Weighted recall of a hit map over the scored (non-L3) items.""" - total_weight = sum(item.weight for item in scored_items) or 1.0 - weighted_hit = sum(item.weight for item in scored_items if hits[item.id]) - return weighted_hit / total_weight - - -def _finding_redundancy_rate(findings: list[dict]) -> float: - """Fraction of findings that are near-duplicates of another (Jaccard ≥0.6 on ≥5-char tokens).""" - if len(findings) < 2: - return 0.0 - def _tok(text: str) -> frozenset[str]: - return frozenset(t.lower() for t in text.split() if len(t) >= 5) - token_sets = [_tok(f.get("description", "")) for f in findings] - in_redundant: set[int] = set() - for i in range(len(token_sets)): - for j in range(i + 1, len(token_sets)): - a, b = token_sets[i], token_sets[j] - union = a | b - sim = len(a & b) / len(union) if union else 1.0 - if sim >= 0.6: - in_redundant.add(i) - in_redundant.add(j) - return round(len(in_redundant) / len(findings), 4) - - -def g2_recall_precision( - result: dict, - registry: Registry, - *, - recall_floor: float = 0.70, - embed_fn=None, - tau: float = 0.70, - 
tau_nc: float = 0.85, - recall_metric: str = "lexical", - corpus: Corpus | None = None, -) -> GateResult: - """G2 — Recall & Precision (hard veto). - - - L0 miss -> BLOCK (zeros the evaluation; regulatory-mandatory item absent) - - NC hit -> BLOCK (precision failure; plausible-but-false item was emitted) - - recall < floor -> BLOCK - - With a ``corpus``, evidence entries that fail verification (fabricated - excerpt or unknown source) are excluded from the evidence index before - matching, so the shared-source guard only accepts citations to real - corpus documents — a fabricated locator cannot satisfy any item. - - ``recall_metric`` ("lexical"/"semantic"/"hybrid") selects which hit map GATES. - "lexical" is matcher.matches (shared-source + topic-anchored token overlap) and - needs no embedder. "semantic"/"hybrid" add the embedding path (matcher.semantic_hits, - threshold ``tau`` for real items, ``tau_nc`` for NC items) and REQUIRE ``embed_fn`` - — passing them without one raises ValueError (use "lexical" for the offline path). - When an embedder is supplied, all three recalls (lexical/semantic/hybrid) are - reported in details regardless of which one gates. - """ - evidence_index = _build_evidence_index(result, corpus) - candidates = _candidates_by_scope(result) - findings = candidates["finding"] - - # NC items anchor via the embedding path only (§6.2): a correct finding about - # the true mirror fact shares vocabulary with the false description, so a - # token or keyword match would falsely convict it. Lexical NC is always False. - # dependency_graph relation items (those with from_node) use the endpoint - # matcher (§5.3b) instead of the per-candidate text predicate. 
- lexical: dict[str, bool] = {} - for item in registry.items: - if item.tier == "NC": - lexical[item.id] = False - elif item.scope == "dependency_graph" and item.from_node: - lexical[item.id] = matcher.matches_dependency_graph_relation( - item, result, evidence_index - ) - else: - lexical[item.id] = any( - matches(c, item, evidence_index, scope=scope) - for scope in matcher.allowed_scopes(item) - for c in candidates.get(scope, []) - ) - - if recall_metric not in ("lexical", "semantic", "hybrid"): - raise ValueError(f"unknown recall_metric {recall_metric!r}") - if recall_metric in ("semantic", "hybrid") and embed_fn is None: - raise ValueError( - f"recall_metric={recall_metric!r} requires an embedder; pass embed_fn" - ) - - if embed_fn is not None: - semantic = matcher.semantic_hits( - candidates, registry.items, evidence_index, embed_fn, tau, tau_nc=tau_nc - ) - # dependency_graph relation items have no embedding candidates (§5.3b uses - # the endpoint matcher, not per-candidate text embeddings); mirror the - # lexical result so semantic/hybrid never under-credits them. 
- for item in registry.items: - if item.scope == "dependency_graph" and item.from_node: - semantic[item.id] = lexical[item.id] - else: - semantic = None - - metric = recall_metric - - if semantic is None or metric == "lexical": - hits = lexical - elif metric == "semantic": - hits = semantic - else: # hybrid - hits = {iid: lexical[iid] or semantic[iid] for iid in lexical} - - # Signal-to-noise panel — report-only, §6.2 item 3 - finding_count = len(findings) - finding_scoped_items = [i for i in registry.real_items if i.scope == "finding"] - findings_matched = sum( - 1 for f in findings - if any(matches(f, item, evidence_index, scope="finding") for item in finding_scoped_items) - ) - _sn = { - "finding_count": finding_count, - "findings_matched_to_registry": { - "count": findings_matched, - "fraction": round(findings_matched / finding_count, 4) if finding_count else 0.0, - }, - "finding_redundancy_rate": _finding_redundancy_rate(findings), - } - if corpus is not None: - excluded = len(_build_evidence_index(result)) - len(evidence_index) - _sn["evidence_entries_excluded_unverified"] = excluded - - # L0 misses - l0_misses = [item.id for item in registry.l0_items if not hits[item.id]] - if l0_misses: - return GateResult( - gate="G2", - passed=False, - reason_code="L0_MISSING", - details={ - "l0_misses": l0_misses, - "message": "Regulatory-mandatory items not found — evaluation zeroed.", - **_sn, - }, - ) - - # NC precision - nc_hits = [item.id for item in registry.nc_items if hits[item.id]] - if nc_hits: - return GateResult( - gate="G2", - passed=False, - reason_code="NC_HIT", - details={ - "nc_hits": nc_hits, - "message": "Plausible-but-false negative control items were matched — precision failure.", - **_sn, - }, - ) - - # Weighted recall — over scored items only (L0/L1/L2). L3 is a bonus tier - # ("extra credit"): an L3 miss must not lower recall, so L3 is excluded from - # the denominator and only reported in per_tier below. 
Recall is computed over - # the GATING hit map so the gate is internally consistent with the chosen metric. - real_items = registry.real_items - scored_items = [item for item in real_items if item.tier != "L3"] - recall = _weighted_recall(scored_items, hits) - - per_tier: dict[str, dict] = {} - for tier in ("L0", "L1", "L2", "L3"): - tier_items = [i for i in real_items if i.tier == tier] - if not tier_items: - continue - per_tier[tier] = { - "hit": sum(1 for i in tier_items if hits[i.id]), - "total": len(tier_items), - } - - def _semantic_details() -> dict: - """The extra recall-breakdown keys, only emitted when an embedder is given.""" - if semantic is None: - return {} - return { - "lexical_recall": round(_weighted_recall(scored_items, lexical), 4), - "semantic_recall": round(_weighted_recall(scored_items, semantic), 4), - "hybrid_recall": round( - _weighted_recall( - scored_items, {iid: lexical[iid] or semantic[iid] for iid in lexical} - ), - 4, - ), - "tau": tau, - } - - if recall < recall_floor: - return GateResult( - gate="G2", - passed=False, - reason_code="RECALL_BELOW_FLOOR", - details={ - "recall": round(recall, 4), - "recall_metric": metric, - "floor": recall_floor, - "per_tier": per_tier, - "misses": [item.id for item in scored_items if not hits[item.id]], - **_semantic_details(), - **_sn, - }, - ) - - return GateResult( - gate="G2", - passed=True, - details={ - "recall": round(recall, 4), - "recall_metric": metric, - "floor": recall_floor, - "per_tier": per_tier, - "nc_items_checked": len(registry.nc_items), - **_semantic_details(), - **_sn, - }, - ) - - -# ── G3: Grounded ───────────────────────────────────────────────────────────── - - -def g3_grounded( - result: dict, - *, - grounding_floor: float = 0.90, - human_spot_check_n: int = 5, - corpus: Corpus | None = None, -) -> GateResult: - """G3 — Grounded (automated portion; human spot-check triggered on pass). 
- - For each finding, verifies that at least one cited evidence excerpt shares a - non-trivial token with the finding description (topic-anchoring). - - With a ``corpus``, the gate also looks in a third direction — cited -> - exists: every evidence entry is verified against the actual corpus text - (corpus.verify_entry). A populated excerpt not found in its cited source - raises EVIDENCE_FABRICATED; a locator resolving to no corpus document - raises EVIDENCE_SOURCE_UNKNOWN; and only verified excerpts can ground a - finding, so a run cannot ground itself on evidence it invented. - - Also reports excerpt fill rate and source coverage so the reviewer can tell - whether ungrounded findings are a format problem (empty excerpts) or a real - faithfulness signal (populated excerpts that do not anchor). - - Known limitation: topic-anchoring, not claim entailment. A '45 days' claim - cited to a '3 days' source passes if they share the process name (excerpt - verification confirms the quote is real, not that the claim matches it). - The human spot-check is the binding faithfulness signal until NLI/AIS lands. - """ - evidence_index = _build_evidence_index(result) - findings = result.get("findings", []) - statuses = verify_evidence_index(corpus, result) if corpus is not None else None - - if not findings: - return GateResult( - gate="G3", - passed=False, - reason_code="NO_FINDINGS", - details={"message": "Result has zero findings — cannot compute grounding."}, - ) - - grounded_ids: list[str] = [] - # Ungrounded split (§6.3): distinguish format issues from real faithfulness failures. - ungrounded_empty_only: list[str] = [] # every ref had an empty excerpt - ungrounded_populated: list[str] = [] # had populated excerpt(s) but none anchored - - # Excerpt fill: count all resolved refs and how many carry a non-empty excerpt. - total_refs = 0 - populated_refs = 0 - - # Source coverage: which source stems are cited by at least one finding. 
- cited_stems: set[str] = set() - - for finding in findings: - fid = finding.get("id", "?") - desc = finding.get("description", "") - is_grounded = False - had_populated = False - for ref in finding.get("evidence_refs", []): - ev = evidence_index.get(ref.get("evidence_id", "")) - if ev: - total_refs += 1 - excerpt = ev.get("excerpt") or "" - if excerpt: - populated_refs += 1 - had_populated = True - # Track source coverage (even for ungrounded findings). - stem = matcher.source_stem(ev.get("locator", "")) - if stem: - cited_stems.add(stem) - # Only a corpus-verified excerpt can ground a finding. - if statuses is not None and statuses.get(ev.get("id")) != VERIFIED: - continue - if anchored(desc, excerpt): - is_grounded = True - break - if is_grounded: - grounded_ids.append(fid) - elif had_populated: - ungrounded_populated.append(fid) - else: - ungrounded_empty_only.append(fid) - - grounding_pct = len(grounded_ids) / len(findings) - - # All source stems present in the evidence index (not just those cited). 
- all_stems: set[str] = set() - for ev in result.get("evidence_index", []): - stem = matcher.source_stem(ev.get("locator", "")) - if stem: - all_stems.add(stem) - orphaned = sorted(all_stems - cited_stems) - - excerpt_fill = f"{populated_refs}/{total_refs}" if total_refs else "0/0" - source_coverage = f"{len(cited_stems)}/{len(all_stems)}" if all_stems else "0/0" - - details = { - "grounding_pct": round(grounding_pct, 4), - "grounded": len(grounded_ids), - "total": len(findings), - "excerpt_fill": excerpt_fill, - "source_coverage": source_coverage, - "orphaned_sources": orphaned, - } - - fabricated_ids: list[str] = [] - unknown_source_ids: list[str] = [] - if statuses is not None: - fabricated_ids = sorted(e for e, s in statuses.items() if s == FABRICATED) - unknown_source_ids = sorted(e for e, s in statuses.items() if s == SOURCE_UNKNOWN) - details["evidence_verification"] = { - "entries": len(statuses), - "verified": sum(1 for s in statuses.values() if s == VERIFIED), - "empty_excerpt": sum(1 for s in statuses.values() if s == EMPTY), - "fabricated": fabricated_ids, - "source_unknown": unknown_source_ids, - } - - if fabricated_ids: - details["message"] = ( - "Populated excerpt(s) not found in the cited corpus document — " - "the run asserts evidence the source does not contain." - ) - return GateResult( - gate="G3", passed=False, reason_code="EVIDENCE_FABRICATED", details=details - ) - - if unknown_source_ids: - details["message"] = ( - "Evidence locator(s) resolve to no corpus document — either the " - "corpus bundle is incomplete or the run invented a source." 
- ) - return GateResult( - gate="G3", passed=False, reason_code="EVIDENCE_SOURCE_UNKNOWN", details=details - ) - - if grounding_pct < grounding_floor: - details["floor"] = grounding_floor - details["ungrounded_with_populated_excerpts"] = ungrounded_populated - details["ungrounded_with_empty_excerpts_only"] = ungrounded_empty_only - return GateResult(gate="G3", passed=False, reason_code="UNGROUNDED", details=details) - - spot_n = min(human_spot_check_n, len(findings)) - details["human_spot_check"] = ( - f"ACTION REQUIRED: manually review {spot_n} sampled findings for " - "field-consistency, citation-accuracy, and client-readiness. " - "This is the binding faithfulness signal until NLI/AIS lands." - ) - return GateResult(gate="G3", passed=True, details=details) - - -# ── G5: No-regression / promotion (human decision) ─────────────────────────── - - -def g5_no_regression( - candidate_scores: dict[str, float], - champion_scores: dict[str, float] | None, - aa_noise: dict[str, float] | None, - *, - is_day_zero: bool = False, - human_signed_off: bool = False, - signoff_count: int = 0, -) -> GateResult: - """G5 — No-regression / promotion gate (human decision). - - Day-Zero: no champion exists. Requires G1-G3 pass + 2 independent sign-offs. - Normal promotion: candidate must beat champion by > aa_noise on every metric, - no guardrail regresses, + 1 human sign-off. - - Champions are per-corpus. Do not compare across corpora. - """ - if is_day_zero or champion_scores is None: - required = 2 - if signoff_count < required: - return GateResult( - gate="G5", - passed=False, - reason_code="HOLD", - details={ - "reason": ( - f"Day-Zero requires {required} independent human sign-offs " - f"(kappa >= 0.70); got {signoff_count}." 
- ), - "action": "Collect sign-offs, then re-run with --day-zero --signoffs 2", - }, - ) - return GateResult( - gate="G5", - passed=True, - details={"day_zero": True, "signoffs": signoff_count}, - ) - - if not human_signed_off: - return GateResult( - gate="G5", - passed=False, - reason_code="HOLD", - details={"reason": "Human sign-off required for promotion."}, - ) - - noise = aa_noise or {} - regressions: list[str] = [] - improvements: list[str] = [] - - for metric, cand_val in candidate_scores.items(): - champ_val = champion_scores.get(metric) - if champ_val is None: - continue - delta = cand_val - champ_val - band = noise.get(metric, 0.0) - if delta < -band: - regressions.append( - f"{metric}: candidate={cand_val:.4f} champion={champ_val:.4f} " - f"delta={delta:+.4f} < -band={-band:.4f}" - ) - elif delta > band: - improvements.append(f"{metric}: delta={delta:+.4f} > band={band:.4f}") - - if regressions: - return GateResult( - gate="G5", - passed=False, - reason_code="HOLD", - details={ - "regressions": regressions, - "improvements": improvements, - "message": "Guardrail metric(s) regressed beyond A/A noise band.", - }, - ) - - return GateResult( - gate="G5", - passed=True, - details={"improvements": improvements, "noise_band": noise}, - ) - - -# ── Full gate pipeline ──────────────────────────────────────────────────────── - - -def run_gates( - result: dict, - registry: Registry, - registry_path: str, - *, - pii_list: list[str] | None = None, - recall_floor: float = 0.70, - grounding_floor: float = 0.90, - champion_scores: dict[str, float] | None = None, - aa_noise: dict[str, float] | None = None, - is_day_zero: bool = False, - human_signed_off: bool = False, - signoff_count: int = 0, - embed_fn=None, - tau: float = 0.70, - tau_nc: float = 0.85, - recall_metric: str = "lexical", - corpus: Corpus | None = None, -) -> list[GateResult]: - """Run all gates G1 -> G2 -> G3 -> G5; every gate always executes. 
- - A failed gate raises a flag in its GateResult but never prevents the - remaining gates from running. The scorecard therefore always carries the - complete picture: a run that misses a regulatory item *and* grounds poorly - shows both flags. See EVALUATION_FRAMEWORK.md §2 ('No gate vetoes'). - - ``corpus`` (optional) enables deterministic evidence verification: G1 pins - the corpus hash, G2 ignores unverified evidence entries, and G3 flags - fabricated excerpts and unknown sources. Without it, evidence is taken at - face value from the run's own evidence_index (disclosed on the scorecard). - - Returns all four GateResult objects. - """ - g1 = g1_structural(result, registry, registry_path, pii_list=pii_list, corpus=corpus) - - g2 = g2_recall_precision( - result, - registry, - recall_floor=recall_floor, - embed_fn=embed_fn, - tau=tau, - tau_nc=tau_nc, - recall_metric=recall_metric, - corpus=corpus, - ) - - g3 = g3_grounded(result, grounding_floor=grounding_floor, corpus=corpus) - - # G5 uses whatever scores G2/G3 produced; 0.0 when a gate flagged and did - # not emit the metric (e.g. L0_MISSING returns before computing recall). 
- candidate_scores = { - "recall": g2.details.get("recall", 0.0), - "grounding_pct": g3.details.get("grounding_pct", 0.0), - } - g5 = g5_no_regression( - candidate_scores, - champion_scores, - aa_noise, - is_day_zero=is_day_zero, - human_signed_off=human_signed_off, - signoff_count=signoff_count, - ) - - return [g1, g2, g3, g5] From 38c3f60f5109d559c6fe385c1b12eea878282f2e Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:18:23 +0200 Subject: [PATCH 17/48] chore(evaluation): delete corpus.py --- fireflyframework_agentic/evaluation/corpus.py | 185 ------------------ 1 file changed, 185 deletions(-) delete mode 100644 fireflyframework_agentic/evaluation/corpus.py diff --git a/fireflyframework_agentic/evaluation/corpus.py b/fireflyframework_agentic/evaluation/corpus.py deleted file mode 100644 index 32835f2c..00000000 --- a/fireflyframework_agentic/evaluation/corpus.py +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Corpus loading and evidence verification (EVALUATION_FRAMEWORK.md §6.3). - -The corpus is the third pinned evaluation input, next to the DiscoveryResult -and the registry: the raw document bundle (input.json) the discovery pipeline -read. It is the trusted side of every evidence anchor — the registry tells -the evaluator what *should* be found; only the corpus can tell it whether what -a run cited is *real*. 
- -verify_entry() closes the fabricated-evidence channel: a run controls every -byte of its own evidence_index, so any check computable from (result, registry) -alone can be satisfied by self-reported evidence. Checking each excerpt -against the actual corpus text is the only deterministic counter. - -Excerpt contract: excerpts are verbatim quotes from the source document. -Spliced quotes (fragments joined with '...' or '…') are supported — each -fragment is verified independently. Paraphrase belongs in the finding -description, never in an excerpt. -""" - -from __future__ import annotations - -import base64 -import difflib -import hashlib -import json -import re -import unicodedata -from dataclasses import dataclass -from pathlib import Path - -from fireflyframework_agentic.evaluation.matcher import source_stem - -# Verification statuses for one evidence_index entry. -VERIFIED = "verified" # excerpt found (verbatim or spliced) in the cited source -EMPTY = "empty" # entry carries no excerpt text — nothing to verify -SOURCE_UNKNOWN = "source_unknown" # locator resolves to no corpus document -FABRICATED = "fabricated" # populated excerpt not found in the cited source - -# A spliced excerpt is split on these joiners; fragments shorter than -# _MIN_FRAGMENT_CHARS are too generic to verify and are skipped. -_SPLICE_PATTERN = re.compile(r"\.\.\.|…| -- ") -_MIN_FRAGMENT_CHARS = 15 - -# A fragment passes fuzzily when matching blocks (>= _MIN_BLOCK_CHARS chars) -# cover at least _COVERAGE_THRESHOLD of it — tolerates punctuation/whitespace -# drift while rejecting invented text (measured ~0.10-0.32 coverage). -_COVERAGE_THRESHOLD = 0.85 -_MIN_BLOCK_CHARS = 4 - - -@dataclass -class Corpus: - """The decoded, normalized corpus: {source stem: normalized text}. - - sha256 pins the corpus file exactly like the registry pin (§4.6): the - champion record stores it, and G1 re-hashes the file at scoring time to - flag CORPUS_DRIFT. 
- """ - - texts: dict[str, str] - sha256: str - path: str - - -def normalize(text: str) -> str: - """Normalize text for excerpt matching: NFKC, strip markdown emphasis and - smart quotes, collapse whitespace, casefold.""" - text = unicodedata.normalize("NFKC", text) - text = text.replace("**", "").replace("*", "") - text = re.sub(r"[\"""''']", "", text) - return re.sub(r"\s+", " ", text).strip().casefold() - - -def corpus_sha256(path: str | Path) -> str: - """SHA-256 of the corpus file on disk (the CORPUS_DRIFT re-hash).""" - return hashlib.sha256(Path(path).read_bytes()).hexdigest() - - -def load_corpus(path: str | Path) -> Corpus: - """Load a FlyRadar input.json bundle into a stem-indexed normalized Corpus. - - Decodes every artifacts[] file and signals[] event log (base64), normalizes - the text, and keys each by the same source_stem the matcher uses — so a - locator in any convention resolves to its document. - - Raises: - ValueError: when the bundle contains no documents, or two documents - reduce to the same stem (a collision would let a fabricated - citation resolve against the wrong real file). 
- """ - path = Path(path) - raw = json.loads(path.read_text(encoding="utf-8")) - - named_contents: list[tuple[str, str]] = [] - for artifact in raw.get("artifacts", []): - named_contents.append((artifact["filename"], artifact["content_base64"])) - for signal in raw.get("signals", []): - named_contents.append((signal["name"], signal["content_base64"])) - - if not named_contents: - raise ValueError(f"corpus bundle {path} contains no artifacts or signals") - - texts: dict[str, str] = {} - for name, content_b64 in named_contents: - stem = source_stem(name) - if stem in texts: - raise ValueError( - f"corpus stem collision: two documents reduce to {stem!r} — " - "rename one; a collision would verify citations against the wrong file" - ) - decoded = base64.b64decode(content_b64).decode("utf-8", errors="replace") - texts[stem] = normalize(decoded) - - return Corpus(texts=texts, sha256=corpus_sha256(path), path=str(path)) - - -def _fragment_coverage(fragment: str, source: str) -> float: - """Fraction of fragment covered by matching blocks of >= _MIN_BLOCK_CHARS chars.""" - blocks = difflib.SequenceMatcher( - None, fragment, source, autojunk=False - ).get_matching_blocks() - covered = sum(b.size for b in blocks if b.size >= _MIN_BLOCK_CHARS) - return covered / len(fragment) - - -def verify_entry(corpus: Corpus, entry: dict) -> str: - """Verify one evidence_index entry against the corpus. - - Returns one of VERIFIED / EMPTY / SOURCE_UNKNOWN / FABRICATED: - - the locator must resolve (by source stem) to a corpus document, and - - every fragment of the excerpt must appear in that document's text, - verbatim after normalization or with matching-block coverage >= - _COVERAGE_THRESHOLD. - - The score is the minimum over fragments, so one invented fragment sinks a - spliced excerpt. 
- - """ - stem = source_stem(entry.get("locator", "")) - source = corpus.texts.get(stem) - if source is None: - return SOURCE_UNKNOWN - - excerpt = normalize(entry.get("excerpt") or "") - if not excerpt: - return EMPTY - - fragments = [ - f.strip() - for f in _SPLICE_PATTERN.split(excerpt) - if len(f.strip()) >= _MIN_FRAGMENT_CHARS - ] or [excerpt] - - for fragment in fragments: - if fragment in source: - continue - if _fragment_coverage(fragment, source) < _COVERAGE_THRESHOLD: - return FABRICATED - return VERIFIED - - -def verify_evidence_index(corpus: Corpus, result: dict) -> dict[str, str]: - """Verify every evidence_index entry of a DiscoveryResult. - - Returns {evidence_id: status} over all entries — referenced or not — so - the gates share one verification pass. - """ - return { - ev["id"]: verify_entry(corpus, ev) - for ev in result.get("evidence_index", []) - if ev.get("id") - } From f81992336b5b0932a3d65c3acfc2b8a439a27a1b Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:18:30 +0200 Subject: [PATCH 18/48] chore(evaluation): delete registry.py --- .../evaluation/registry.py | 214 ------------------ 1 file changed, 214 deletions(-) delete mode 100644 fireflyframework_agentic/evaluation/registry.py diff --git a/fireflyframework_agentic/evaluation/registry.py b/fireflyframework_agentic/evaluation/registry.py deleted file mode 100644 index 2b869ba9..00000000 --- a/fireflyframework_agentic/evaluation/registry.py +++ /dev/null @@ -1,214 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -"""lean-1 registry loader — one schema for all four corpora. - -Replaces the four mutually incompatible schemes in use today (L1-L5, -documented/observed/pain-point, critical/important, and no tiers). -Loader enforces all invariants; they are not documentation. - -Invariants (EVALUATION_FRAMEWORK.md, the must-find registry): -- schema_version == "lean-1" -- every tier is one of L0 L1 L2 L3 NC -- negative_control_count >= ceil(real_items / 10) -- kappa present (0.0 placeholder allowed; G2 advisory until >= 0.70) -- ABANCA DILO items must target a single measured sub-population -""" -from __future__ import annotations - -import hashlib -import json -import math -from dataclasses import dataclass, field -from pathlib import Path -from typing import Literal - -VALID_TIERS = ("L0", "L1", "L2", "L3", "NC") -VALID_SCOPES = ( - "process", "activity", "decision", "finding", "action", - "persona", "system", "informal_channel", "dependency_graph", -) -SCHEMA_VERSION = "lean-1" -KAPPA_ADVISORY_THRESHOLD = 0.70 - - -@dataclass(frozen=True) -class RegistryItem: - id: str - tier: Literal["L0", "L1", "L2", "L3", "NC"] - description: str - evidence: list[str] # source file paths (path portion of locator, no #page=N) - scope: str = "finding" # which DiscoveryResult surface to match against (§4.3) - keywords: list[str] = field(default_factory=list) - weight: float = 1.0 - from_node: str = "" # dependency_graph relation items only - to_node: str = "" # dependency_graph relation items only - relation: str = "" # defaults to "precedes" when from/to present - - -@dataclass(frozen=True) -class Registry: - schema_version: str - corpus: str - author: str - date: str - kappa: float - items: list[RegistryItem] - _sha256: str = field(default="", compare=False) - - @property - def real_items(self) -> list[RegistryItem]: - return [i for i in self.items if i.tier != "NC"] - - @property - def 
nc_items(self) -> list[RegistryItem]: - return [i for i in self.items if i.tier == "NC"] - - @property - def l0_items(self) -> list[RegistryItem]: - return [i for i in self.items if i.tier == "L0"] - - def is_kappa_advisory(self) -> bool: - return self.kappa < KAPPA_ADVISORY_THRESHOLD - - def sha256(self) -> str: - return self._sha256 - - -def _validate(raw: dict, path: Path) -> None: - if raw.get("schema_version") != SCHEMA_VERSION: - raise ValueError( - f"{path.name}: schema_version must be '{SCHEMA_VERSION}', " - f"got {raw.get('schema_version')!r}" - ) - for fname in ("corpus", "author", "date"): - if not raw.get(fname): - raise ValueError(f"{path.name}: missing required field '{fname}'") - if "kappa" not in raw: - raise ValueError(f"{path.name}: missing 'kappa' field (use 0.0 as placeholder)") - - items = raw.get("items", []) - - # EMPTY_MUST_FIND guard — must be first; kills fake-champion bug - if not items: - raise ValueError( - f"{path.name}: EMPTY_MUST_FIND — items list is empty; " - "cannot evaluate recall. This guard exists to prevent the " - "fake-100%-champion failure." 
- ) - - ids = [it.get("id") for it in items] - if len(ids) != len(set(ids)): - dupes = sorted({i for i in ids if ids.count(i) > 1}) - raise ValueError(f"{path.name}: duplicate item ids: {dupes}") - - for it in items: - tier = it.get("tier") - if tier not in VALID_TIERS: - raise ValueError( - f"{path.name}: item '{it.get('id')}' has invalid tier '{tier}'; " - f"must be one of {VALID_TIERS}" - ) - scope = it.get("scope", "finding") - if scope not in VALID_SCOPES: - raise ValueError( - f"{path.name}: item '{it.get('id')}' has invalid scope '{scope}'; " - f"must be one of {VALID_SCOPES}" - ) - if scope == "dependency_graph": - if not it.get("from") or not it.get("to"): - raise ValueError( - f"{path.name}: dependency_graph item '{it.get('id')}' must have " - "non-empty 'from' and 'to'" - ) - else: - if "from" in it or "to" in it or "relation" in it: - raise ValueError( - f"{path.name}: item '{it.get('id')}' has 'from'/'to'/'relation' " - f"but scope is '{scope}'; these fields are only valid on " - "dependency_graph-scoped items" - ) - - real_count = sum(1 for it in items if it.get("tier") != "NC") - nc_count = sum(1 for it in items if it.get("tier") == "NC") - required_nc = max(1, math.ceil(real_count / 10)) - if nc_count < required_nc: - raise ValueError( - f"{path.name}: NC density too low — {nc_count} NC item(s) for " - f"{real_count} real items; need >= {required_nc} (ceil(real/10)). " - "Without NC items the eval measures recall only; a verbose hallucinator " - "scores perfectly." - ) - - # ABANCA DILO blend guard: items must assert a single sub-population target. - # Checks for phrases that would indicate a blended numeric target is asserted. - # "blend" alone is too broad (items may reference it negatively). 
- BLEND_PHRASES = ("combined distribution", "across all offices regardless of segment") - for it in items: - if it.get("tier") == "NC": - continue - desc = it.get("description", "").lower() - iid = it.get("id", "") - if any(phrase in desc for phrase in BLEND_PHRASES): - raise ValueError( - f"{path.name}: item '{iid}' description targets a blended distribution; " - "ABANCA DILO items must target a single measured sub-population " - "(Empresas or PyMEs). Use segment-keyed items: " - "dilo-empresas-operativa-42pct AND dilo-pymes-operativa-29pct separately." - ) - - -def _compute_sha256(path: Path) -> str: - return hashlib.sha256(path.read_bytes()).hexdigest() - - -def load_registry(path: str | Path) -> Registry: - """Load and validate a lean-1 registry file. - - Raises ValueError with a descriptive message on any invariant violation. - The EMPTY_MUST_FIND check runs first — it is the fake-champion guard. - """ - path = Path(path) - raw = json.loads(path.read_text(encoding="utf-8")) - _validate(raw, path) - sha = _compute_sha256(path) - - items = [ - RegistryItem( - id=it["id"], - tier=it["tier"], - scope=it.get("scope", "finding"), - description=it.get("description", ""), - evidence=it.get("evidence", []), - keywords=it.get("keywords", []), - weight=float(it.get("weight", 1.0)), - from_node=it.get("from", "") if it.get("scope") == "dependency_graph" else "", - to_node=it.get("to", "") if it.get("scope") == "dependency_graph" else "", - relation=it.get("relation", "precedes") if it.get("scope") == "dependency_graph" else "", - ) - for it in raw["items"] - ] - - return Registry( - schema_version=raw["schema_version"], - corpus=raw["corpus"], - author=raw["author"], - date=raw["date"], - kappa=float(raw["kappa"] or 0.0), - items=items, - _sha256=sha, - ) - - -def registry_sha256(path: str | Path) -> str: - return _compute_sha256(Path(path)) From 3bc07861bafefab41c04e6e4697be779097b3f49 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:18:36 +0200 
Subject: [PATCH 19/48] chore(evaluation): delete matcher.py --- .../evaluation/matcher.py | 369 ------------------ 1 file changed, 369 deletions(-) delete mode 100644 fireflyframework_agentic/evaluation/matcher.py diff --git a/fireflyframework_agentic/evaluation/matcher.py b/fireflyframework_agentic/evaluation/matcher.py deleted file mode 100644 index b4d81f44..00000000 --- a/fireflyframework_agentic/evaluation/matcher.py +++ /dev/null @@ -1,369 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Single matching primitive reused across G2 (recall/precision) and G3 (grounding). - -anchored() is topic-level lexical overlap. matches() is the gate predicate. -One function, three uses — do not write three matching functions. - -Known limitation (EVALUATION_FRAMEWORK.md): anchored() is topic-anchored, not claim-verified. -A '45 days' claim cited to a '3 days' source passes if they share the process name. -Real claim entailment (NLI/AIS) is Phase 2. The G3 human spot-check is the -binding faithfulness signal until then. -""" - -from __future__ import annotations - -import re - -import numpy as np - -from fireflyframework_agentic.evaluation.judge_client import cosine - - -def tokens(text: str) -> list[str]: - return re.findall(r"\b\w+\b", text.lower()) - - -def anchored(claim: str, evidence: str, *, min_token: int = 5) -> bool: - """True if claim and evidence share at least one non-trivial token (>= min_token chars). 
- - Rejects a citation to an unrelated document. Does NOT verify the claim value — - that gap is closed by the deferred NLI/AIS check in Phase 2. - """ - a = {t for t in tokens(claim) if len(t) >= min_token} - b = {t for t in tokens(evidence) if len(t) >= min_token} - return bool(a & b) - - -def source_stem(locator: str) -> str: - """Normalize a locator/source path to a stable document stem for matching. - - Robust to the two locator conventions observed across runs: - - directory-prefixed ('sops/SOP-002-kyc-edd.md') and bare ('SOP-002-kyc-edd.md') - both reduce to 'sop-002-kyc-edd'; - - event-log row ids ('src-credit-underwriting:CU-2026-1003') reduce to the - process stem 'credit-underwriting', so they join the CSV the registry cites. - - Preserves the same-document anti-gaming property of matches(): it still keys - on which source document a finding cites — just independent of directory - prefix, file extension, and case, so one registry scores every run. - """ - s = locator.split("#")[0] # drop the locator fragment (#page=N, #anchor) - s = s.rsplit("/", 1)[-1] # basename — strip any directory prefix - if s.startswith("src-") and ":" in s: # event-log row id: src-: - return s.split(":", 1)[0][len("src-") :].lower() - if "." in s: # strip a trailing file extension - s = s.rsplit(".", 1)[0] - return s.lower() - - -def _finding_sources(finding: dict, evidence_index: dict[str, dict]) -> set[str]: - """Return the set of normalized source-document stems cited by a finding.""" - sources: set[str] = set() - for ref in finding.get("evidence_refs", []): - ev = evidence_index.get(ref.get("evidence_id", "")) - if ev: - stem = source_stem(ev.get("locator", "")) - if stem: - sources.add(stem) - return sources - - -def shares_source(finding: dict, item, evidence_index: dict[str, dict]) -> bool: - """True iff the finding cites at least one source document the item lists as evidence. 
- - Source documents are compared by normalized stem (source_stem) so one registry - scores every run regardless of locator convention. This is the anti-gaming - anchor reused by both the lexical predicate (matches) and the semantic path - (semantic_hits): a finding on a different document cannot satisfy this item. - - Spec-style NC items list their mirror source (§4.1); legacy NC items carry - evidence=[], which makes this always False for them. - - Args: - finding: dict from DiscoveryResult.findings[i] (model_dump output). - item: RegistryItem dataclass from registry.py. - evidence_index: {evidence_id: Evidence dict} built from result['evidence_index']. - """ - finding_sources = _finding_sources(finding, evidence_index) - item_sources = {source_stem(e) for e in item.evidence} - return bool(finding_sources & item_sources) - - -def _keyword_anchored(desc: str, keywords: list[str]) -> bool: - """True iff any keyword appears as a whole word in desc (case-insensitive). - - Keyword rail: exempt from the 5-char token floor so short banking terms - (KYC, PEP, AML) can anchor a match even though they are too short for the - token rail. Whole-word matching prevents false substring hits (e.g. "risk" - inside "enterprise-risk-management"). - """ - if not keywords: - return False - desc_lower = desc.lower() - return any( - re.search(r"\b" + re.escape(kw.lower()) + r"\b", desc_lower) for kw in keywords - ) - - -def candidate_text(candidate: dict, scope: str) -> str: - """Extract the searchable text from a candidate on the given scope surface (§4.3). 
- - Each scope surface uses different fields as the match text: - - finding / action : title + description - - process / decision : name + description - - activity : name + notes + regulatory_links - - persona : name + role + goals + pain_points - - system : name + description - - informal_channel : name + usage_context + notes - - dependency_graph : name + description (diagnostic nodes; relation items bypass this) - """ - if scope in ("finding", "action"): - return " ".join(filter(None, [candidate.get("title", ""), candidate.get("description", "")])) - if scope == "activity": - rl = candidate.get("regulatory_links") or [] - rl_str = " ".join(rl) if isinstance(rl, list) else str(rl or "") - return " ".join(filter(None, [candidate.get("name", ""), candidate.get("notes", ""), rl_str])) - if scope == "persona": - goals = candidate.get("goals") or [] - pain = candidate.get("pain_points") or [] - goals_str = " ".join(goals) if isinstance(goals, list) else str(goals) - pain_str = " ".join(pain) if isinstance(pain, list) else str(pain) - return " ".join(filter(None, [ - candidate.get("name", ""), - candidate.get("role", ""), - goals_str, - pain_str, - ])) - if scope == "informal_channel": - return " ".join(filter(None, [ - candidate.get("name", ""), - candidate.get("usage_context", ""), - candidate.get("notes", ""), - ])) - # process, decision, system, dependency_graph (diagnostic nodes) - return " ".join(filter(None, [candidate.get("name", ""), candidate.get("description", "")])) - - -INSIGHT_ITEM_SCOPES = ("finding", "action") -INSIGHT_MATCH_SURFACES = ("finding", "action", "activity", "decision") - - -def allowed_scopes(item) -> tuple[str, ...]: - """Candidate surfaces that may satisfy a registry item. 
- - Insight items (finding / action) may be satisfied by any insight or process-graph - *leaf* surface (activity / decision): a run often grounds the same operational fact - on a different surface than the registry's scope tag anticipates (the BBVA case — - pain points the registry tags 'finding' that the run emitted as decision/activity - nodes). shares_source is still REQUIRED on every candidate (see matches / - semantic_hits), so a candidate on the wrong document never counts — cross-scope - widens WHERE we look, never the source anchor. - - Structural items (process / activity / decision) stay on their own surface: a - structural must-find requires the run to have actually built that node, not merely - mentioned the fact in a finding (test_process_scope_miss_when_no_matching_process). - NC items are likewise scope-strict — widening a negative control's pool could only - make it easier to trip (a specificity regression), never recover a legitimate hit. - - `process` is never a match surface for an insight item: _candidates_by_scope folds - every child's evidence_refs into the process node, so its citation set is a union of - many documents and shares_source goes vacuous (hence its exclusion from - INSIGHT_MATCH_SURFACES). - """ - if item.tier == "NC": - return (item.scope,) - if item.scope in INSIGHT_ITEM_SCOPES: - return INSIGHT_MATCH_SURFACES - return (item.scope,) - - -def matches( - candidate: dict, - item, - evidence_index: dict[str, dict], - scope: str = "finding", -) -> bool: - """True iff candidate cites a shared source document AND is topic-anchored to item. - - Two-rail anchor (either rail suffices): - - Token rail: ≥1 shared token of ≥5 chars between candidate text and item description. - - Keyword rail: ≥1 item keyword appears as a whole word in the candidate text. - Exempt from the 5-char floor so short banking terms (KYC, PEP, AML) can anchor. 
- - The ``scope`` controls which fields are read as the candidate's match text (§4.3): - findings and actions use ``title + description``; processes and decisions use - ``name + description``; activities use ``name + notes + regulatory_links``. - - Anti-gaming guard: a candidate on a different document cannot satisfy this item - even if its text happens to match. Source documents are compared by - normalized stem (source_stem) so one registry scores every run regardless of - locator convention. - - Args: - candidate: dict from the DiscoveryResult surface matching ``scope``. - item: RegistryItem dataclass from registry.py. - evidence_index: {evidence_id: Evidence dict} built from result['evidence_index']. - scope: surface the candidate was drawn from (default "finding"). - """ - if not shares_source(candidate, item, evidence_index): - return False - desc = candidate_text(candidate, scope) - return _keyword_anchored(desc, list(item.keywords or [])) or anchored(desc, item.description) - - -def matches_dependency_graph_relation( - item, - result: dict, - evidence_index: dict[str, dict], -) -> bool: - """Endpoint matcher for dependency_graph relation items (§5.3b). - - Stage 1: Anchor both endpoints to activity nodes via token rail. - Stage 2: Verify a directed edge or path connects them in the asserted direction, - behind the shared-source guard on the edge's/path's evidence_refs. - - Returns False when either endpoint anchors to no activity, or when no connecting - edge/path shares a source document with the item. 
- """ - if not item.from_node or not item.to_node: - return False - - processes = result.get("process_graph", {}).get("processes", []) - all_activities = [a for p in processes for a in p.get("activities", [])] - - def _anchor(endpoint_text: str) -> set[str]: - return { - a["id"] - for a in all_activities - if a.get("id") and anchored(candidate_text(a, "activity"), endpoint_text) - } - - from_ids = _anchor(item.from_node) - to_ids = _anchor(item.to_node) - if not from_ids or not to_ids: - return False - - item_stems = {source_stem(e) for e in item.evidence} - - def _node_stems(node: dict) -> set[str]: - return { - source_stem(evidence_index[r["evidence_id"]].get("locator", "")) - for r in node.get("evidence_refs", []) - if r.get("evidence_id") in evidence_index - } - - dg = result.get("dependency_graph", {}) - - for edge in dg.get("activity_edges", []): - if edge.get("from_node") in from_ids and edge.get("to_node") in to_ids: - if _node_stems(edge) & item_stems: - return True - - for path in dg.get("critical_paths", []): - if not (_node_stems(path) & item_stems): - continue - node_ids = path.get("node_ids", []) - from_pos = [i for i, nid in enumerate(node_ids) if nid in from_ids] - to_pos = [i for i, nid in enumerate(node_ids) if nid in to_ids] - if any(fp < tp for fp in from_pos for tp in to_pos): - return True - - return False - - -def semantic_hits( - candidates: dict[str, list[dict]], - items, - evidence_index: dict[str, dict], - embed_fn, - tau: float = 0.70, - tau_nc: float = 0.85, -) -> dict[str, bool]: - """Opt-in embedding-semantic recall: {item.id: found-by-some-shared-source candidate}. - - Scope-aware: each registry item is evaluated against candidates from its own - scope surface (finding, process, activity, decision, action) using the same - per-scope field extraction as the lexical path (candidate_text). 
Passing only - the findings list (the previous behaviour) would leave process/activity/decision/ - action items with an empty candidate pool and a guaranteed False result. - - Real items (L0–L3): hit iff some scope-matching candidate shares a source - document with the item (shares_source) AND is embedding-similar (cosine >= tau). - Source anchor is preserved — a candidate on a different document cannot recover - a real item. - - NC items (tier=="NC"): hit iff some scope-matching candidate is embedding-similar - (cosine >= tau_nc). When the NC lists its mirror source (§4.1) the shared-source - guard applies; legacy NC items with evidence=[] skip the anchor, with the higher - threshold (default 0.85) compensating. - - Cost is two embed_fn calls — all scope-appropriate candidate texts once and all - item texts once — not O(n*m) per-pair embeddings. - - Args: - candidates: {scope: [candidate dicts]} from _candidates_by_scope(). - items: iterable of RegistryItem dataclasses. - evidence_index: {evidence_id: Evidence dict}. - embed_fn: callable(list[str]) -> array-like of row vectors. - tau: cosine threshold for real items (inclusive). - tau_nc: cosine threshold for NC items (inclusive; higher to compensate for no source anchor). - """ - items = list(items) - - # Flatten all candidates across scopes, preserving their scope tag for - # text extraction and per-item filtering. 
- scoped: list[tuple[str, dict]] = [ - (scope, cand) - for scope, cands in candidates.items() - for cand in cands - ] - - if not scoped: - return {item.id: False for item in items} - - cand_texts = [candidate_text(cand, scope) for scope, cand in scoped] - item_texts = [ - " ".join([item.description or ""] + list(item.keywords or [])).strip() for item in items - ] - - cand_vecs = np.asarray(embed_fn(cand_texts)) - item_vecs = np.asarray(embed_fn(item_texts)) - - hits: dict[str, bool] = {} - for i, item in enumerate(items): - item_vec = item_vecs[i] - allowed = allowed_scopes(item) - hit = False - for k, (scope, cand) in enumerate(scoped): - if scope not in allowed: - continue - if item.tier == "NC": - # Shared-source guard applies when the NC lists its mirror source - # (§4.2/§6.2); legacy evidence=[] NCs stay unanchored, with the - # higher tau_nc compensating. - if item.evidence and not shares_source(cand, item, evidence_index): - continue - if cosine(cand_vecs[k], item_vec) >= tau_nc: - hit = True - break - elif ( - shares_source(cand, item, evidence_index) - and cosine(cand_vecs[k], item_vec) >= tau - ): - hit = True - break - hits[item.id] = hit - return hits From 9c43a323da1f9929216593be23ae9366bcb67de2 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:18:41 +0200 Subject: [PATCH 20/48] chore(evaluation): delete scorecard.py --- .../evaluation/scorecard.py | 489 ------------------ 1 file changed, 489 deletions(-) delete mode 100644 fireflyframework_agentic/evaluation/scorecard.py diff --git a/fireflyframework_agentic/evaluation/scorecard.py b/fireflyframework_agentic/evaluation/scorecard.py deleted file mode 100644 index b34885e8..00000000 --- a/fireflyframework_agentic/evaluation/scorecard.py +++ /dev/null @@ -1,489 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Scorecard renderer: gate results -> Markdown report. - -Every scorecard states whether it is self-graded. Until Phase 3 independent -re-annotation lands, all Lean-Core PROMOTE verdicts are self-graded against -team-authored ground truth. See EVALUATION_FRAMEWORK.md. -""" - -from __future__ import annotations - -import json - -VERDICT_PROMOTE = "PROMOTE" -VERDICT_HOLD = "HOLD" - - -def verdict(gate_results: list) -> str: - """PROMOTE iff all gates passed and G5 is in the list; HOLD otherwise.""" - if not gate_results: - return VERDICT_HOLD - if not all(g.passed for g in gate_results): - return VERDICT_HOLD - gate_names = {g.gate for g in gate_results} - if "G5" not in gate_names: - return VERDICT_HOLD - return VERDICT_PROMOTE - - -def render_scorecard( - gate_results: list, - *, - corpus: str = "unknown", - model_id: str = "unknown", - run_id: str = "run", - is_self_graded: bool = True, - kappa_advisory: bool = False, - evidence_unverified: bool = False, - bpi2017_f1: float | None = None, - advisory=None, - config: dict | None = None, - experiment_config: dict | None = None, -) -> str: - """Render a Markdown evaluation scorecard. - - The scorecard always discloses self-graded status and advisory flags. 
- """ - v = verdict(gate_results) - lines = [ - "# FlyRadar Evaluation Scorecard", - "", - f"**Corpus**: {corpus}", - f"**Model**: {model_id}", - f"**Run**: {run_id}", - f"**Verdict**: **{v}**", - "", - ] - - if is_self_graded: - lines += [ - "> **SELF-GRADED**: All ground truth (must-find, gold, DILO, human sign-off) is", - "> authored by the FlyRadar team. This PROMOTE has no contamination-free signal", - "> until Phase 3. See EVALUATION_FRAMEWORK.md.", - "", - ] - - if kappa_advisory: - lines += [ - "> **ADVISORY**: Registry kappa < 0.70 — a second independent annotator has not", - "> verified the must-find items. Promotion is advisory for this corpus until", - "> kappa >= 0.70 from an independent re-annotation.", - "", - ] - - if evidence_unverified: - lines += [ - "> **EVIDENCE UNVERIFIED**: no corpus supplied (--corpus) — evidence locators", - "> and excerpts are taken at face value from the run's own evidence_index.", - "> Grounding certifies self-consistency, not corpus reality. Supply the run's", - "> input.json to enable deterministic excerpt verification (G3, §6.3).", - "", - ] - - if experiment_config is not None: - lines += [ - "## Experiment configuration", - "How this run was generated. Recorded fields (cost, tokens, latency, agents) are " - "read from the run's output.json; `model` is the value passed to the harness via " - "--model-id. 
Generation params (temperature, prompt/pipeline version, seed) are not " - "captured in output.json.", - "", - "```json", - json.dumps(experiment_config, indent=2, default=str), - "```", - "", - ] - - if config is not None: - lines += [ - "## Evaluation configuration", - "These are the parameters used to compute the evaluation.", - "", - "```json", - json.dumps(config, indent=2, default=str), - "```", - "", - ] - - lines += ["## Gate Results", ""] - g5_result = None - for g in gate_results: - if g.gate == "G5": - g5_result = g - continue - status = "PASS" if g.passed else f"FLAG ({g.reason_code})" - lines.append(f"### {g.gate}: {status}") - if g.details: - lines.append("```json") - lines.append(json.dumps(g.details, indent=2, default=str)) - lines.append("```") - lines.append("") - - if bpi2017_f1 is not None: - ok = bpi2017_f1 >= 0.60 - anchor_status = "PASS (>= 0.60)" if ok else "BELOW THRESHOLD (< 0.60)" - lines += [ - "## External Sanity Anchor (non-blocking)", - f"BPI-2017 variant-recovery F1: **{bpi2017_f1:.3f}** — {anchor_status}", - "_One non-self-graded signal. Non-blocking; informational only._", - "", - ] - - if advisory is not None: - lines += _render_advisory(advisory) - - if g5_result is not None: - status = "PASS" if g5_result.passed else f"FLAG ({g5_result.reason_code})" - lines.append(f"### G5: {status}") - if g5_result.details: - lines.append("```json") - lines.append(json.dumps(g5_result.details, indent=2, default=str)) - lines.append("```") - lines.append("") - - lines += _render_analysis(gate_results, advisory) - - return "\n".join(lines) - - -def _num(x) -> str: - """Format a metric leaf: None -> 'n/a', float -> 3dp, else str.""" - if x is None: - return "n/a" - if isinstance(x, float): - return f"{x:.3f}" - return str(x) - - -def _render_advisory(report) -> list[str]: - """Render the non-blocking G4 LLM-as-a-Judge section from an AdvisoryReport. - - Best-effort: only metrics present in report.metrics are shown. 
G4 never - affects the PROMOTE/HOLD verdict; this section is decision-support for the - G5 human sign-off, and is advisory until LLM-as-a-Judge calibration (§10). - """ - m = report.metrics - cal = "calibrated" if report.calibrated else "uncalibrated" - lines = [ - "## G4 — LLM-as-a-Judge (non-blocking — does NOT affect the PROMOTE/HOLD verdict)", - f"Judge: {report.judge_model} · {cal} · {report.runs}-run median", - ] - if report.same_provider_caveat: - lines.append("> same-provider as the pipeline — results may share blind spots.") - lines.append("```text") - - if "faithfulness" in m: - d = m["faithfulness"] - u = d.get("unsupported_ids", []) - extra = f" (unsupported: {', '.join(u)})" if u else "" - lines.append( - f"Faithfulness (entailment): {d.get('supported')}/{d.get('total')} supported{extra}" - ) - if "numeric_temporal_fidelity" in m: - lines.append( - f"Numeric/temporal fidelity: {m['numeric_temporal_fidelity'].get('count', 0)} mismatch(es)" - ) - if "citation_relevance" in m: - d = m["citation_relevance"] - lines.append( - f"Citation relevance (ctx-prec): {_num(d.get('precision'))} ({d.get('relevant')}/{d.get('total')})" - ) - if "semantic_recovery" in m: - d = m["semantic_recovery"] - rec = d.get("recovered", []) - rids = ", ".join(r.get("id", "") for r in rec) if rec else "none" - lines.append( - f"Semantic recovery (ctx-recall): lexical {_num(d.get('lexical_recall'))} -> {_num(d.get('recovered_recall'))} (recovered: {rids})" - ) - if "nc_semantic_precision" in m: - d = m["nc_semantic_precision"] - a = d.get("asserted_ids", []) - extra = f" ({', '.join(a)})" if a else "" - lines.append(f"NC semantic precision: {d.get('asserted', 0)} asserted{extra}") - if "fabricated_entity" in m: - lines.append(f"Fabricated-entity check: {m['fabricated_entity'].get('count', 0)}") - if "contradiction" in m: - lines.append(f"Contradiction detection: {m['contradiction'].get('count', 0)}") - if "actionability" in m: - d = m["actionability"] - lines.append( - 
f"Actionability: {_num(d.get('score'))} (rated {d.get('rated', 0)})" - ) - if "severity_calibration" in m: - d = m["severity_calibration"] - lines.append( - f"Severity calibration: {d.get('miscalibrated', 0)}/{d.get('total', 0)} miscalibrated" - ) - if "answer_relevancy" in m: - lines.append(f"Answer relevancy: {_num(m['answer_relevancy'].get('score'))}") - if "comparative_vs_champion" in m: - lines.append( - f"Comparative vs champion: more consistent -> {m['comparative_vs_champion'].get('more_consistent', 'n/a')}" - ) - if "source_coverage" in m: - d = m["source_coverage"] - o = d.get("orphaned", []) - extra = f" (orphaned: {', '.join(o)})" if o else "" - lines.append( - f"Source coverage [D]: {d.get('cited')}/{d.get('total')} documents cited{extra}" - ) - if "excerpt_fill_rate" in m: - d = m["excerpt_fill_rate"] - lines.append( - f"Evidence-excerpt fill [D]: {d.get('populated')}/{d.get('total')} populated" - ) - if "open_gap" in m: - gap = (m["open_gap"].get("gap") or "").strip() - if gap: - lines.append(f"Open gap probe: {gap}") - if report.errors: - lines.append(f"(errors: {len(report.errors)} metric(s) failed: {'; '.join(report.errors)})") - lines.append("```") - # Full detail — nothing truncated: every id, pair, verdict, and complete text. - lines += [ - "", - "**G4 — full metric detail:**", - "```json", - json.dumps({"metrics": report.metrics, "details": report.details}, indent=2, default=str), - "```", - ] - lines.append( - "> Decision support for the G5 human sign-off; advisory until LLM-as-a-Judge calibration (§10)." 
- ) - lines.append("") - return lines - - -def _render_analysis(gate_results: list, advisory=None) -> list[str]: - """Render a plain-language interpretation of all evaluation signals.""" - g2 = next((g for g in gate_results if g.gate == "G2"), None) - g3 = next((g for g in gate_results if g.gate == "G3"), None) - - lines = ["## Analysis", ""] - - # ── Topic coverage (G2) ────────────────────────────────────────────────── - lines.append("### Topic coverage (G2)") - if g2 and g2.details: - d = g2.details - recall = d.get("recall", 0.0) - tiers = d.get("per_tier", {}) - finding_count = d.get("finding_count", 0) - redundancy = d.get("finding_redundancy_rate", 0.0) - matched = d.get("findings_matched_to_registry", {}).get("fraction", 0.0) - - tier_summary = ", ".join( - f"{t} {v['hit']}/{v['total']}" - for t, v in tiers.items() - if "hit" in v and "total" in v - ) - lines.append( - f"Lexical recall is **{recall:.3f}** ({tier_summary}). " - f"The run produced {finding_count} findings, " - f"all of which map to a registry item (match rate {matched:.0%}). " - ) - if redundancy > 0.15: - lines.append( - f"Finding redundancy is **{redundancy:.0%}** — a meaningful share of " - "findings are near-duplicates of each other (Jaccard ≥ 0.6). " - "The run is covering the same ground multiple times rather than broadening coverage." - ) - else: - lines.append( - f"Finding redundancy is low ({redundancy:.0%}): each finding addresses a distinct topic." - ) - lines.append( - "_G2 is a topic-level test. A recall of 1.000 means every required topic was " - "mentioned somewhere — it does not verify that the specific claims about those " - "topics are accurate. 
Claim accuracy is G4 Faithfulness._" - ) - else: - lines.append("G2 result unavailable.") - lines.append("") - - # ── Evidence quality (G3) ──────────────────────────────────────────────── - lines.append("### Evidence quality (G3)") - if g3 and g3.details: - d = g3.details - grounding = d.get("grounding_pct", 0.0) - ev = d.get("evidence_verification", {}) - verified = ev.get("verified", 0) - entries = ev.get("entries", 0) - fabricated = ev.get("fabricated", []) - unknown = ev.get("source_unknown", []) - orphaned = d.get("orphaned_sources", []) - source_cov = d.get("source_coverage", "") - - lines.append( - f"Grounding is **{grounding:.0%}**: every finding cites at least one " - "corpus document, and all excerpts are populated. " - f"Evidence verification checked {entries} entries against the raw corpus: " - f"{verified} verified" - + (f", **{len(fabricated)} fabricated** (locators that do not exist in the corpus)" if fabricated else "") - + (f", **{len(unknown)} source-unknown** (locators that resolve to no corpus file)" if unknown else "") - + "." - ) - if unknown: - lines.append( - f"The source-unknown locator(s) are: `{'`, `'.join(unknown)}`. " - "This is most likely a corpus bundle gap rather than a hallucinated source — " - "verify that all expected files are included in `input.json`." - ) - if orphaned: - lines.append( - f"**{len(orphaned)} corpus documents were never cited** by this run " - f"({', '.join(orphaned)}). These are blind spots: the run extracted nothing " - "from these sources, so any findings they contain are silently missed." - ) - if source_cov: - cited, total = (int(x) for x in source_cov.split("/")) - if cited < total: - lines.append( - f"Overall source coverage is {cited}/{total} — " - f"{total - cited} corpus file(s) left entirely uncited." 
- ) - else: - lines.append("G3 result unavailable.") - lines.append("") - - # ── Claim accuracy (G4) ────────────────────────────────────────────────── - if advisory is not None: - m = advisory.metrics - lines.append("### Claim accuracy (G4 — advisory)") - - faith = m.get("faithfulness", {}) - supported = faith.get("supported", 0) - total_f = faith.get("total", 0) - if total_f: - faith_pct = supported / total_f - lines.append( - f"**Faithfulness: {supported}/{total_f} findings ({faith_pct:.0%}) are entailed by their cited evidence.** " - ) - if faith_pct < 0.5: - lines.append( - "This is a critical signal: the majority of findings contain claims " - "that the judge cannot verify from the cited sources. " - "The run is presenting inferences, extrapolations, or hallucinated details " - "as if they were directly evidenced. " - "Each unsupported finding should be reviewed against its cited document before use." - ) - elif faith_pct < 0.8: - lines.append( - "A significant minority of findings contain claims not traceable to cited sources. " - "These may be reasonable inferences, but they should be flagged for human verification." - ) - else: - lines.append("Most findings are directly supported by their cited evidence.") - - ntf = m.get("numeric_temporal_fidelity", {}) - mismatch_count = ntf.get("count", 0) - if mismatch_count: - lines.append( - f"**Numeric/temporal fidelity: {mismatch_count} mismatches detected.** " - "Specific figures — FTE costs, durations, timestamps, percentages, case IDs — " - "appear in findings but cannot be traced to the cited evidence. " - "These numbers should be treated as estimates or fabrications until verified " - "against the source documents." 
- ) - - fab = m.get("fabricated_entity", {}) - fab_count = fab.get("count", 0) - fab_entities = fab.get("entities", []) - if fab_count: - lines.append( - f"**Fabricated entities: {fab_count}** — the following names/identifiers appear " - f"in the output but are absent from the corpus: " - f"{', '.join(f'`{e}`' for e in fab_entities)}. " - "These should be removed or verified before sharing the output." - ) - - sev = m.get("severity_calibration", {}) - misc = sev.get("miscalibrated", 0) - total_s = sev.get("total", 0) - verdicts = sev.get("verdicts", {}) - over_count = sum(1 for v in verdicts.values() if v == "over") - under_count = sum(1 for v in verdicts.values() if v == "under") - if misc and total_s: - direction = "" - if over_count > under_count: - direction = f" (predominantly over-rated: {over_count} findings rated too high)" - elif under_count > over_count: - direction = f" (predominantly under-rated: {under_count} findings rated too low)" - lines.append( - f"**Severity calibration: {misc}/{total_s} findings miscalibrated{direction}.** " - "Over-rated findings inflate perceived urgency and can cause the client to " - "prioritise the wrong items." - ) - - act = m.get("actionability", {}) - act_score = act.get("score") - if act_score is not None: - if act_score < 0.6: - lines.append( - f"**Actionability score: {act_score:.3f}** — proposed actions are below the " - "0.6 threshold for concrete, quantified recommendations. " - "Actions tend to be generic rather than specific enough to assign and execute." 
- ) - else: - lines.append(f"Actionability score: {act_score:.3f} — actions are sufficiently concrete.") - - og = m.get("open_gap", {}) - gap_text = (og.get("gap") or "").strip() - if gap_text: - lines.append(f"**Most important missed finding:** {gap_text}") - - lines.append("") - - # ── Bottom line ────────────────────────────────────────────────────────── - lines.append("### Bottom line") - g5 = next((g for g in gate_results if g.gate == "G5"), None) - g5_reason = (g5.details or {}).get("reason", "") if g5 else "" - flags = [g for g in gate_results if not g.passed] - flag_names = [g.gate for g in flags] - - if not flags: - lines.append( - "All deterministic gates pass. The run is ready for G5 human sign-off." - ) - else: - flag_str = ", ".join(flag_names) - lines.append( - f"The run is at **HOLD** due to flags on: {flag_str}. " - ) - for g in flags: - if g.gate == "G3" and g.reason_code == "EVIDENCE_SOURCE_UNKNOWN": - lines.append( - "- **G3**: One evidence locator points to a file not in the corpus bundle. " - "Regenerate `input.json` to include all corpus sources, then re-run." - ) - elif g.gate == "G5": - lines.append(f"- **G5**: {g5_reason}") - - if advisory is not None: - m = advisory.metrics - faith = m.get("faithfulness", {}) - supported = faith.get("supported", 0) - total_f = faith.get("total", 1) - ntf_count = m.get("numeric_temporal_fidelity", {}).get("count", 0) - fab_count = m.get("fabricated_entity", {}).get("count", 0) - lines.append( - f"\nG4 advisory signals (non-blocking but important for the G5 reviewer): " - f"faithfulness {supported}/{total_f}, " - f"{ntf_count} numeric mismatches, " - f"{fab_count} fabricated entities. " - "The G5 reviewer should focus on the unsupported findings and verify figures " - "against the source documents before certifying the output." 
- ) - lines.append("") - return lines From a3673b5c5a441192b99ba7ecd40ab2d5d4bdc57a Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:18:45 +0200 Subject: [PATCH 21/48] chore(evaluation): delete run_config_snapshot.py --- .../evaluation/run_config_snapshot.py | 160 ------------------ 1 file changed, 160 deletions(-) delete mode 100644 fireflyframework_agentic/evaluation/run_config_snapshot.py diff --git a/fireflyframework_agentic/evaluation/run_config_snapshot.py b/fireflyframework_agentic/evaluation/run_config_snapshot.py deleted file mode 100644 index db543129..00000000 --- a/fireflyframework_agentic/evaluation/run_config_snapshot.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Capture the effective flyradar run configuration into experiment_configuration.json. - -Non-invasive snapshot: it records how a run was generated by reading what flyradar -already exposes as data — the request options the caller sent, the ``/api/v1/version`` -endpoint, ``RadarSettings``, and the prompt catalog — without modifying flyradar. The -snapshot is written next to the run's ``output.json`` at generation time, which is the -moment the configuration is known. - -This is the bridge: the durable fix is for flyradar to stamp the same config into -``DiscoveryResult`` itself (the one place that knows the effective values and cannot -drift). See the "flyradar improvements" issue. 
``temperature`` and ``seed`` are not -exposed by ``RadarSettings`` and are recorded as ``null`` here. - -Usage: - cd flyradar_experiments - set -a && source .env && set +a - uv run python -m fireflyframework_agentic.evaluation.run_config_snapshot \ - --output-dir experiments/bbva_españa/runs/2026-06-12-sonnet-01 \ - --options request_options.json \ - --commit c107918 -""" -from __future__ import annotations - -import argparse -import json -import os -import urllib.request -from importlib.resources import files -from pathlib import Path - -try: - from flyradar.config import RadarSettings -except ImportError: # flyradar is an optional dependency of this snapshot. - RadarSettings = None - -#: Path of the flyradar version endpoint (whitelisted in the service middleware). -VERSION_PATH = "/api/v1/version" - -#: RadarSettings fields that define scoring / dedup behaviour, captured verbatim. -_SETTINGS_KEYS = ( - "model", - "fallback_model", - "duplicity_similarity_threshold", - "rootcause_cost_weight", - "rootcause_frequency_weight", - "rootcause_actionability_weight", -) - - -def fetch_version(base_url: str, *, timeout: float = 10.0) -> dict: - """GET the flyradar version endpoint; return ``{}`` on any failure.""" - url = base_url.rstrip("/") + VERSION_PATH - try: - with urllib.request.urlopen(url, timeout=timeout) as resp: - return json.loads(resp.read().decode("utf-8")) - except Exception: - return {} - - -def load_radar_settings() -> dict | None: - """Dump the scoring / dedup RadarSettings, or ``None`` if flyradar isn't importable.""" - if RadarSettings is None: - return None - settings = RadarSettings() - return {key: getattr(settings, key, None) for key in _SETTINGS_KEYS} - - -def load_prompt_versions() -> dict | None: - """Read each stage prompt's ``version`` from the flyradar prompt catalog, or ``None``.""" - try: - catalog = files("flyradar.resources.prompts") - except ModuleNotFoundError: - return None - versions: dict[str, str] = {} - for entry in 
catalog.iterdir(): - if not entry.name.endswith(".yaml"): - continue - for line in entry.read_text(encoding="utf-8").splitlines(): - if line.strip().startswith("version:"): - versions[entry.name[:-5]] = line.split(":", 1)[1].strip().strip('"') - break - return versions or None - - -def build_run_config( - options: dict, - *, - version: dict, - settings: dict | None, - prompt_versions: dict | None, - commit: str | None = None, -) -> dict: - """Assemble the experiment-configuration snapshot from its captured parts.""" - return { - "captured_by": "config-snapshot (non-invasive)", - "flyradar_version": version.get("version"), - "flyradar_commit": commit or version.get("commit"), - "options": options, - "settings": settings, - "prompt_versions": prompt_versions, - "temperature": None, - "seed": None, - "_note": ( - "Non-invasive snapshot captured at generation time. `options` is the request " - "the caller sent; `settings` and `prompt_versions` are read from flyradar when " - "importable at the deployed commit. `temperature` and `seed` are not exposed by " - "RadarSettings and are recorded as null. The durable fix is for flyradar to stamp " - "this config into DiscoveryResult (see the 'flyradar improvements' issue)." - ), - } - - -def write_snapshot(output_dir: str | Path, config: dict) -> Path: - """Write ``experiment_configuration.json`` into the run's output directory.""" - path = Path(output_dir) / "experiment_configuration.json" - path.write_text(json.dumps(config, indent=2, ensure_ascii=False) + "\n", encoding="utf-8") - return path - - -def main(argv: list[str] | None = None) -> int: - parser = argparse.ArgumentParser(description="Capture the flyradar run configuration.") - parser.add_argument("--output-dir", required=True, help="Run directory holding output.json.") - parser.add_argument( - "--options", required=True, help="JSON file of the DiscoveryRequest options that were sent." 
- ) - parser.add_argument( - "--base-url", default=None, help="flyradar base URL (default: $FLYRADAR_BASE_URL)." - ) - parser.add_argument("--commit", default=None, help="Deployed flyradar git commit, if known.") - args = parser.parse_args(argv) - - base_url = args.base_url or os.environ.get("FLYRADAR_BASE_URL", "") - options = json.loads(Path(args.options).read_text(encoding="utf-8")) - config = build_run_config( - options, - version=fetch_version(base_url) if base_url else {}, - settings=load_radar_settings(), - prompt_versions=load_prompt_versions(), - commit=args.commit, - ) - path = write_snapshot(args.output_dir, config) - print(f"Wrote {path}") - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) From a51115e8e933c3fe6acecd75ea3610995251644c Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:18:50 +0200 Subject: [PATCH 22/48] chore(evaluation): delete models.py --- fireflyframework_agentic/evaluation/models.py | 70 ------------------- 1 file changed, 70 deletions(-) delete mode 100644 fireflyframework_agentic/evaluation/models.py diff --git a/fireflyframework_agentic/evaluation/models.py b/fireflyframework_agentic/evaluation/models.py deleted file mode 100644 index a98cdf20..00000000 --- a/fireflyframework_agentic/evaluation/models.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Shared config and model classes for the evaluation framework. 
- -EvalConfig captures the parameters of a single evaluation run: which model -is being tested, which corpus it runs against, and where the supporting -artefacts (registry, baseline, judge config) live. - -GateVerdict constants define the two possible outcomes of the promotion gate: -PROMOTE (the challenger beats or ties the champion and is safe to deploy) -or HOLD (the challenger does not meet the bar and must be iterated on). -""" - -from __future__ import annotations - -from typing import Any - -from pydantic import BaseModel - - -class EvalConfig(BaseModel): - """Configuration for a single evaluation run. - - Parameters: - model_id: Identifier of the model under evaluation. - corpus: Name of the evaluation corpus (e.g. "ms_marco_mini", "finance_bench"). - run_id: Unique identifier for this run (e.g. a timestamp or git SHA). - registry_path: Path to the must-find / golden registry JSON file. - corpus_path: Path to the corpus directory or bundle. - baseline_path: Path to a baseline results file for regression comparison. - judge_model: Model identifier used for the LLM-as-judge advisory pass. - judge_runs: Number of independent judge calls to aggregate (majority vote). - embed_model: Model identifier used for embedding-based retrieval metrics. - metadata: Arbitrary key/value pairs for run bookkeeping. - """ - - model_id: str - corpus: str - run_id: str - registry_path: str = "" - corpus_path: str = "" - baseline_path: str = "" - judge_model: str = "" - judge_runs: int = 3 - embed_model: str = "" - metadata: dict[str, Any] = {} - - -class GateVerdict: - """Promotion gate verdict constants. - - Use ``GateVerdict.PROMOTE`` when the challenger meets the quality bar and - is safe to become the new champion. Use ``GateVerdict.HOLD`` when the - challenger does not meet the bar and must be iterated on. 
- """ - - PROMOTE: str = "PROMOTE" - HOLD: str = "HOLD" From 5074d14eb91506d4d9367808d0d3196775571760 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:18:56 +0200 Subject: [PATCH 23/48] chore(evaluation): delete stats.py --- fireflyframework_agentic/evaluation/stats.py | 110 ------------------- 1 file changed, 110 deletions(-) delete mode 100644 fireflyframework_agentic/evaluation/stats.py diff --git a/fireflyframework_agentic/evaluation/stats.py b/fireflyframework_agentic/evaluation/stats.py deleted file mode 100644 index e70c629a..00000000 --- a/fireflyframework_agentic/evaluation/stats.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Statistics helpers: A/A noise band + fixed aggregate_grounding. - -The A/A band replaces McNemar, Wilcoxon, BCa bootstrap, Cliff's delta, Holm -correction, and MCID power analysis. Four self-authored corpora with ~30-70 -non-independent items each cannot power those tests; gating on unpowered tests -is false precision. See EVALUATION_FRAMEWORK.md (regression statistics). - -This module also provides the fixed aggregate_grounding() that closes a prior -aggregation bug where the previous runner inherited run 0's grounding report -unchanged instead of merging across all runs. 
-""" -from __future__ import annotations - -import statistics -from typing import Sequence - - -def aa_band(scores: Sequence[float], *, percentile: int = 95) -> float: - """95th-percentile pairwise delta from champion reruns — the noise floor. - - Rerun the champion ~10 times; the 95th-percentile of all pairwise absolute - differences is the A/A noise floor. A candidate must beat the champion by - more than this number on EVERY seed to count as a real improvement. - - This single number replaces MCID, power analysis, McNemar, Wilcoxon, - bootstrap CIs, and Holm correction. See EVALUATION_FRAMEWORK.md (the A/A noise band). - - Args: - scores: Per-run primary metric scores from champion reruns (>= 2 required). - percentile: Which percentile (default 95). - - Returns: - Noise floor as a float in the same units as the input scores. - """ - scores = list(scores) - if len(scores) < 2: - raise ValueError(f"aa_band requires >= 2 reruns; got {len(scores)}") - deltas = [ - abs(x - y) - for i, x in enumerate(scores) - for y in scores[i + 1:] - ] - sorted_deltas = sorted(deltas) - # Index for the requested percentile; clamp to valid range - idx = max(0, min(len(sorted_deltas) - 1, int(len(sorted_deltas) * percentile / 100))) - return sorted_deltas[idx] - - -def aggregate_grounding(grounding_dicts: list[dict]) -> dict: - """Merge per-run grounding reports into a conservative aggregate. - - Fixes a prior aggregation bug where the previous runner inherited run 0's grounding - report unchanged. Correct behaviour: - - support_pct: mean across runs - - unsupported_ids: UNION across all runs (anything flagged in any run stays flagged) - - Args: - grounding_dicts: List of grounding report dicts, one per evaluation run. - Each must have 'support_pct' (float 0-100) and optionally - 'unsupported_ids' (list[str]). - - Returns: - Merged grounding dict. 
- """ - if not grounding_dicts: - return {"support_pct": 0.0, "unsupported_ids": []} - - support_pcts = [float(g.get("support_pct", 0.0)) for g in grounding_dicts] - mean_pct = statistics.mean(support_pcts) - - unsupported: set[str] = set() - for g in grounding_dicts: - unsupported.update(g.get("unsupported_ids", [])) - - first = grounding_dicts[0] - return { - **first, - "support_pct": round(mean_pct, 2), - "unsupported_ids": sorted(unsupported), - "_aggregate_runs": len(grounding_dicts), - "_support_pct_per_run": [round(p, 2) for p in support_pcts], - } - - -def left_skew_flag(scores: Sequence[float]) -> bool: - """True if min < median - 0.10 (HIGH_VARIANCE sentinel). - - A single catastrophic run cannot hide inside a decent mean. - True => HIGH_VARIANCE; block the run until investigated. - See EVALUATION_FRAMEWORK.md (anti-flakiness). - """ - scores = list(scores) - if len(scores) < 2: - return False - med = statistics.median(scores) - return min(scores) < med - 0.10 From 8716be93d143f846920a6ad85820c082eccb6ccf Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:19:01 +0200 Subject: [PATCH 24/48] chore(evaluation): delete champion.py --- .../evaluation/champion.py | 169 ------------------ 1 file changed, 169 deletions(-) delete mode 100644 fireflyframework_agentic/evaluation/champion.py diff --git a/fireflyframework_agentic/evaluation/champion.py b/fireflyframework_agentic/evaluation/champion.py deleted file mode 100644 index 239429eb..00000000 --- a/fireflyframework_agentic/evaluation/champion.py +++ /dev/null @@ -1,169 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Per-corpus champion management. - -Champions are per-corpus — mode 2A (conformance) and mode 2B (extraction) -metrics live in incommensurable spaces. There is no global champion. -See EVALUATION_FRAMEWORK.md (per-corpus champions). - -The historical fake-100% incident: banca-cordobesa/baseline.json was populated -with a champion scored against an EMPTY must-find registry. The EMPTY_MUST_FIND -guard in G1 prevents a recurrence; the invalidate_champion() function provides -the corrective action when it does happen. -""" - -from __future__ import annotations - -import hashlib -import json -from dataclasses import dataclass, field -from pathlib import Path - - -@dataclass -class ChampionRecord: - """Per-corpus champion, stored as 'champion' key in baseline.json.""" - - corpus: str - run_id: str - model_id: str - registry_sha256: str - scores: dict # {metric_name: float} - aa_noise: dict = field(default_factory=dict) # {metric_name: noise_floor} - is_day_zero: bool = False - human_sign_offs: list[str] = field(default_factory=list) - config: dict = field(default_factory=dict) # evaluation config snapshot - corpus_sha256: str = "" # pin of the evidence corpus the champion was verified against - - def primary_metric(self) -> str: - return next(iter(self.scores)) if self.scores else "" - - def primary_score(self) -> float: - return float(self.scores.get(self.primary_metric(), 0.0)) - - -def load_champion(baseline_path: str | Path) -> ChampionRecord | None: - """Load the current per-corpus champion from baseline.json. 
- - Returns None when: - - The file does not exist (normal Day-Zero state). - - The file exists but 'champion' is null (post-invalidation state). - """ - path = Path(baseline_path) - if not path.exists(): - return None - raw = json.loads(path.read_text(encoding="utf-8")) - champ_raw = raw.get("champion") - if champ_raw is None: - return None - return ChampionRecord( - corpus=champ_raw["corpus"], - run_id=champ_raw["run_id"], - model_id=champ_raw["model_id"], - registry_sha256=champ_raw["registry_sha256"], - scores=champ_raw.get("scores", {}), - aa_noise=champ_raw.get("aa_noise", {}), - is_day_zero=champ_raw.get("is_day_zero", False), - human_sign_offs=champ_raw.get("human_sign_offs", []), - config=champ_raw.get("config", {}), - corpus_sha256=champ_raw.get("corpus_sha256", ""), - ) - - -def save_champion( - baseline_path: str | Path, - champion: ChampionRecord, - *, - summary: str = "", - date: str = "", -) -> None: - """Persist a new champion and append a promotion log entry. - - Reads the existing file if it exists (to preserve the log), then writes - the new champion. The promotion log is append-only. 
- """ - path = Path(baseline_path) - if path.exists(): - raw = json.loads(path.read_text(encoding="utf-8")) - log = raw.get("promotion_log", []) - prev_run = raw.get("champion", {}) - prev_run_id = prev_run.get("run_id") if isinstance(prev_run, dict) else None - else: - log = [] - prev_run_id = None - - log.append( - { - "date": date or "unknown", - "from": prev_run_id, - "to": champion.run_id, - "label": "day-zero" if champion.is_day_zero else "promotion", - "summary": summary, - } - ) - - payload = { - "champion": { - "corpus": champion.corpus, - "run_id": champion.run_id, - "model_id": champion.model_id, - "registry_sha256": champion.registry_sha256, - "scores": champion.scores, - "aa_noise": champion.aa_noise, - "is_day_zero": champion.is_day_zero, - "human_sign_offs": champion.human_sign_offs, - "config": champion.config, - "corpus_sha256": champion.corpus_sha256, - }, - "promotion_log": log, - } - path.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8") - - -def invalidate_champion( - baseline_path: str | Path, - *, - reason: str, - date: str = "", -) -> None: - """Null out the current champion and record the invalidation reason. - - Used when a champion was locked in against an empty or tampered registry - (the banca-cordobesa fake-100% incident). 
- """ - path = Path(baseline_path) - if not path.exists(): - return - raw = json.loads(path.read_text(encoding="utf-8")) - log = raw.get("promotion_log", []) - prev_run = raw.get("champion", {}) - prev_run_id = prev_run.get("run_id") if isinstance(prev_run, dict) else None - log.append( - { - "date": date or "unknown", - "from": prev_run_id, - "to": None, - "label": "INVALIDATED", - "summary": reason, - } - ) - raw["champion"] = None - raw["promotion_log"] = log - path.write_text(json.dumps(raw, indent=2, ensure_ascii=False), encoding="utf-8") - - -def input_hash(result_dict: dict) -> str: - """Stable 16-char SHA-256 prefix of the DiscoveryResult for provenance.""" - canonical = json.dumps(result_dict, sort_keys=True, ensure_ascii=False) - return hashlib.sha256(canonical.encode("utf-8")).hexdigest()[:16] From 5c8fe8e4450013f47754803e99e19cac3a4cb1bd Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:19:07 +0200 Subject: [PATCH 25/48] chore(evaluation): delete test_champion.py --- tests/unit/evaluation/test_champion.py | 199 ------------------------- 1 file changed, 199 deletions(-) delete mode 100644 tests/unit/evaluation/test_champion.py diff --git a/tests/unit/evaluation/test_champion.py b/tests/unit/evaluation/test_champion.py deleted file mode 100644 index 948a9639..00000000 --- a/tests/unit/evaluation/test_champion.py +++ /dev/null @@ -1,199 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Unit tests for evaluation.champion: ChampionRecord, load/save/invalidate_champion, input_hash.""" - -from __future__ import annotations - -import json - -import pytest - -from fireflyframework_agentic.evaluation.champion import ( - ChampionRecord, - input_hash, - invalidate_champion, - load_champion, - save_champion, -) - - -def _make_champion(**overrides) -> ChampionRecord: - defaults = dict( - corpus="test-corpus", - run_id="run-2026-01", - model_id="claude-sonnet-4-5", - registry_sha256="abc123", - scores={"recall": 0.85, "grounding_pct": 0.92}, - aa_noise={"recall": 0.02}, - is_day_zero=False, - human_sign_offs=["reviewer-1"], - ) - defaults.update(overrides) - return ChampionRecord(**defaults) - - -# ── load_champion ───────────────────────────────────────────────────────────── - - -def test_load_champion_nonexistent_file_returns_none(tmp_path): - result = load_champion(tmp_path / "baseline.json") - assert result is None - - -def test_load_champion_file_with_null_champion_returns_none(tmp_path): - baseline = tmp_path / "baseline.json" - baseline.write_text(json.dumps({"champion": None, "promotion_log": []}), encoding="utf-8") - assert load_champion(baseline) is None - - -# ── save_champion / load_champion round-trip ────────────────────────────────── - - -def test_save_then_load_round_trips_all_fields(tmp_path): - baseline = tmp_path / "baseline.json" - champ = _make_champion() - save_champion(baseline, champ, summary="initial champion", date="2026-01-01") - - loaded = load_champion(baseline) - assert loaded is not None - assert loaded.corpus == champ.corpus - assert loaded.run_id == champ.run_id - assert loaded.model_id == champ.model_id - assert loaded.registry_sha256 == champ.registry_sha256 - assert loaded.scores == champ.scores - assert loaded.aa_noise == champ.aa_noise - assert loaded.is_day_zero == champ.is_day_zero - assert loaded.human_sign_offs == champ.human_sign_offs - - -def test_save_champion_appends_promotion_log_entry(tmp_path): - 
baseline = tmp_path / "baseline.json" - champ = _make_champion() - save_champion(baseline, champ, summary="first", date="2026-01-01") - - champ2 = _make_champion(run_id="run-2026-02", scores={"recall": 0.90}) - save_champion(baseline, champ2, summary="second", date="2026-02-01") - - raw = json.loads(baseline.read_text(encoding="utf-8")) - log = raw["promotion_log"] - assert len(log) == 2 - assert log[0]["to"] == "run-2026-01" - assert log[1]["to"] == "run-2026-02" - assert log[1]["from"] == "run-2026-01" - - -def test_save_champion_creates_file_when_missing(tmp_path): - baseline = tmp_path / "baseline.json" - assert not baseline.exists() - save_champion(baseline, _make_champion()) - assert baseline.exists() - - -def test_save_champion_day_zero_flag_preserved(tmp_path): - baseline = tmp_path / "baseline.json" - champ = _make_champion(is_day_zero=True) - save_champion(baseline, champ) - loaded = load_champion(baseline) - assert loaded.is_day_zero is True - - -def test_save_champion_label_is_day_zero_when_flag_set(tmp_path): - baseline = tmp_path / "baseline.json" - champ = _make_champion(is_day_zero=True) - save_champion(baseline, champ) - raw = json.loads(baseline.read_text(encoding="utf-8")) - assert raw["promotion_log"][0]["label"] == "day-zero" - - -def test_save_champion_label_is_promotion_when_flag_not_set(tmp_path): - baseline = tmp_path / "baseline.json" - save_champion(baseline, _make_champion(is_day_zero=False)) - raw = json.loads(baseline.read_text(encoding="utf-8")) - assert raw["promotion_log"][0]["label"] == "promotion" - - -# ── invalidate_champion ─────────────────────────────────────────────────────── - - -def test_invalidate_champion_sets_champion_to_null(tmp_path): - baseline = tmp_path / "baseline.json" - save_champion(baseline, _make_champion()) - invalidate_champion(baseline, reason="EMPTY_MUST_FIND fake champion", date="2026-03-01") - - loaded = load_champion(baseline) - assert loaded is None - - raw = 
json.loads(baseline.read_text(encoding="utf-8")) - assert raw["champion"] is None - - -def test_invalidate_champion_appends_invalidation_log(tmp_path): - baseline = tmp_path / "baseline.json" - save_champion(baseline, _make_champion(), date="2026-01-01") - invalidate_champion(baseline, reason="fake champion", date="2026-03-01") - - raw = json.loads(baseline.read_text(encoding="utf-8")) - log = raw["promotion_log"] - assert log[-1]["label"] == "INVALIDATED" - assert "fake champion" in log[-1]["summary"] - assert log[-1]["to"] is None - - -def test_invalidate_champion_noop_when_file_missing(tmp_path): - # Should not raise when file does not exist. - invalidate_champion(tmp_path / "no-file.json", reason="test") - - -# ── ChampionRecord helpers ──────────────────────────────────────────────────── - - -def test_primary_metric_returns_first_key(): - champ = _make_champion(scores={"recall": 0.85, "grounding_pct": 0.92}) - assert champ.primary_metric() == "recall" - - -def test_primary_score_returns_first_value(): - champ = _make_champion(scores={"recall": 0.85, "grounding_pct": 0.92}) - assert champ.primary_score() == 0.85 - - -def test_primary_metric_empty_scores(): - champ = _make_champion(scores={}) - assert champ.primary_metric() == "" - assert champ.primary_score() == 0.0 - - -# ── input_hash ──────────────────────────────────────────────────────────────── - - -def test_input_hash_is_16_chars(): - result = input_hash({"key": "value"}) - assert len(result) == 16 - - -def test_input_hash_is_deterministic(): - data = {"process_graph": {"processes": []}, "findings": []} - h1 = input_hash(data) - h2 = input_hash(data) - assert h1 == h2 - - -def test_input_hash_differs_for_different_inputs(): - assert input_hash({"a": 1}) != input_hash({"a": 2}) - - -def test_input_hash_key_order_independent(): - # sort_keys=True in input_hash should make {"a":1, "b":2} == {"b":2, "a":1}. 
- assert input_hash({"a": 1, "b": 2}) == input_hash({"b": 2, "a": 1}) From fdc02771d8b3352a1031fc7ba5b3e1646b32f041 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:19:13 +0200 Subject: [PATCH 26/48] chore(evaluation): delete test_gates.py --- tests/unit/evaluation/test_gates.py | 219 ---------------------------- 1 file changed, 219 deletions(-) delete mode 100644 tests/unit/evaluation/test_gates.py diff --git a/tests/unit/evaluation/test_gates.py b/tests/unit/evaluation/test_gates.py deleted file mode 100644 index 2edc3b99..00000000 --- a/tests/unit/evaluation/test_gates.py +++ /dev/null @@ -1,219 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Unit tests for evaluation.gates: GateResult, verdict, render_scorecard, g5_no_regression.""" - -from __future__ import annotations - -from fireflyframework_agentic.evaluation.gates import ( - GateResult, - Verdict, - g5_no_regression, - render_scorecard, -) -from fireflyframework_agentic.evaluation.scorecard import verdict - - -# ── GateResult ──────────────────────────────────────────────────────────────── - - -def test_gate_result_str_pass(): - gr = GateResult(gate="G1", passed=True) - assert str(gr) == "[G1] PASS" - - -def test_gate_result_str_flag(): - gr = GateResult(gate="G2", passed=False, reason_code="RECALL_BELOW_FLOOR") - assert str(gr) == "[G2] FLAG:RECALL_BELOW_FLOOR" - - -def test_gate_result_flag_without_reason_code(): - gr = GateResult(gate="G3", passed=False, reason_code="") - assert str(gr) == "[G3] FLAG:" - - -def test_gate_result_passed_true(): - gr = GateResult(gate="G5", passed=True, details={"note": "ok"}) - assert gr.passed is True - assert gr.details["note"] == "ok" - - -def test_gate_result_default_details_is_empty_dict(): - gr = GateResult(gate="G1", passed=True) - assert gr.details == {} - - -# ── verdict ─────────────────────────────────────────────────────────────────── - - -def test_verdict_promote_when_all_pass_and_g5_present(): - gates = [ - GateResult(gate="G1", passed=True), - GateResult(gate="G2", passed=True), - GateResult(gate="G3", passed=True), - GateResult(gate="G5", passed=True), - ] - assert verdict(gates) == "PROMOTE" - - -def test_verdict_hold_when_any_gate_fails(): - gates = [ - GateResult(gate="G1", passed=True), - GateResult(gate="G2", passed=False, reason_code="RECALL_BELOW_FLOOR"), - GateResult(gate="G3", passed=True), - GateResult(gate="G5", passed=True), - ] - assert verdict(gates) == "HOLD" - - -def test_verdict_hold_when_g5_missing(): - # All G1/G2/G3 pass but G5 is absent — no promotion without sign-off. 
- gates = [ - GateResult(gate="G1", passed=True), - GateResult(gate="G2", passed=True), - GateResult(gate="G3", passed=True), - ] - assert verdict(gates) == "HOLD" - - -def test_verdict_hold_on_empty_list(): - assert verdict([]) == "HOLD" - - -def test_verdict_hold_when_g5_fails(): - gates = [ - GateResult(gate="G1", passed=True), - GateResult(gate="G2", passed=True), - GateResult(gate="G3", passed=True), - GateResult(gate="G5", passed=False, reason_code="HOLD"), - ] - assert verdict(gates) == "HOLD" - - -# ── render_scorecard (from gates module) ────────────────────────────────────── - - -def test_render_scorecard_contains_verdict_line(): - gates = [ - GateResult(gate="G1", passed=True), - GateResult(gate="G2", passed=True), - GateResult(gate="G3", passed=True), - GateResult(gate="G5", passed=True), - ] - output = render_scorecard(gates) - assert "VERDICT: PROMOTE" in output - - -def test_render_scorecard_hold_when_flag(): - gates = [ - GateResult(gate="G1", passed=False, reason_code="SCHEMA_INVALID"), - GateResult(gate="G2", passed=True), - GateResult(gate="G3", passed=True), - GateResult(gate="G5", passed=True), - ] - output = render_scorecard(gates) - assert "VERDICT: HOLD" in output - - -def test_render_scorecard_includes_all_gate_lines(): - gates = [ - GateResult(gate="G1", passed=True), - GateResult(gate="G2", passed=True), - GateResult(gate="G3", passed=True), - GateResult(gate="G5", passed=True), - ] - output = render_scorecard(gates) - for gate_label in ("[G1]", "[G2]", "[G3]", "[G5]"): - assert gate_label in output - - -# ── g5_no_regression ────────────────────────────────────────────────────────── - - -def test_g5_day_zero_insufficient_signoffs(): - result = g5_no_regression( - candidate_scores={"recall": 0.85}, - champion_scores=None, - aa_noise=None, - is_day_zero=True, - human_signed_off=False, - signoff_count=1, - ) - assert result.passed is False - assert result.reason_code == "HOLD" - - -def test_g5_day_zero_sufficient_signoffs(): - result = 
g5_no_regression( - candidate_scores={"recall": 0.85}, - champion_scores=None, - aa_noise=None, - is_day_zero=True, - human_signed_off=False, - signoff_count=2, - ) - assert result.passed is True - assert result.details["day_zero"] is True - - -def test_g5_hold_when_no_human_signoff(): - result = g5_no_regression( - candidate_scores={"recall": 0.90}, - champion_scores={"recall": 0.80}, - aa_noise={"recall": 0.02}, - human_signed_off=False, - ) - assert result.passed is False - assert result.reason_code == "HOLD" - - -def test_g5_hold_when_regression_beyond_band(): - # Candidate recall 0.75 vs champion 0.80; delta=-0.05 < -band=-0.02. - result = g5_no_regression( - candidate_scores={"recall": 0.75}, - champion_scores={"recall": 0.80}, - aa_noise={"recall": 0.02}, - human_signed_off=True, - ) - assert result.passed is False - assert result.reason_code == "HOLD" - assert any("recall" in r for r in result.details["regressions"]) - - -def test_g5_promote_when_candidate_beats_champion(): - result = g5_no_regression( - candidate_scores={"recall": 0.90}, - champion_scores={"recall": 0.80}, - aa_noise={"recall": 0.02}, - human_signed_off=True, - ) - assert result.passed is True - assert result.details["improvements"] - - -def test_g5_promote_when_within_noise_band(): - # delta = 0.01 — positive but within band of 0.02; counts as no regression, no improvement. 
- result = g5_no_regression( - candidate_scores={"recall": 0.81}, - champion_scores={"recall": 0.80}, - aa_noise={"recall": 0.02}, - human_signed_off=True, - ) - assert result.passed is True - assert result.details["improvements"] == [] - - -def test_g5_verdict_constants(): - assert Verdict.PROMOTE == "PROMOTE" - assert Verdict.HOLD == "HOLD" From 0732f8582e9e40818e9a4d05da2dff00220652b4 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:19:17 +0200 Subject: [PATCH 27/48] chore(evaluation): delete test_matcher.py --- tests/unit/evaluation/test_matcher.py | 221 -------------------------- 1 file changed, 221 deletions(-) delete mode 100644 tests/unit/evaluation/test_matcher.py diff --git a/tests/unit/evaluation/test_matcher.py b/tests/unit/evaluation/test_matcher.py deleted file mode 100644 index cc87564b..00000000 --- a/tests/unit/evaluation/test_matcher.py +++ /dev/null @@ -1,221 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Unit tests for evaluation.matcher: anchored, source_stem, tokens, matches.""" - -from __future__ import annotations - -import pytest - -from fireflyframework_agentic.evaluation.matcher import ( - anchored, - matches, - source_stem, - tokens, -) -from fireflyframework_agentic.evaluation.registry import RegistryItem - - -# ── tokens ─────────────────────────────────────────────────────────────────── - - -def test_tokens_basic(): - result = tokens("Hello World") - assert result == ["hello", "world"] - - -def test_tokens_lowercases(): - result = tokens("KYC AML PEP") - assert result == ["kyc", "aml", "pep"] - - -def test_tokens_strips_punctuation(): - result = tokens("risk-management: cost (FTE).") - assert "risk" in result - assert "management" in result - assert "cost" in result - assert "fte" in result - - -def test_tokens_empty_string(): - assert tokens("") == [] - - -def test_tokens_numbers_included(): - result = tokens("case-id CU-2026-1003") - assert "2026" in result or "cu" in result - - -def test_tokens_unicode(): - result = tokens("análisis de crédito") - assert "análisis" in result or "an" in result - - -# ── anchored ───────────────────────────────────────────────────────────────── - - -def test_anchored_overlapping_long_token(): - # "underwriting" is 12 chars — well above the 5-char floor. - assert anchored("credit underwriting risk", "underwriting process steps") is True - - -def test_anchored_no_overlap(): - # No token >= 5 chars shared between claim and evidence. - assert anchored("cat sat", "dog ran") is False - - -def test_anchored_short_tokens_ignored(): - # All tokens in both strings are < 5 chars; no overlap counts. - assert anchored("a big cat", "a big dog") is False - - -def test_anchored_mixed_lengths_match(): - # "kyc" is < 5, but "compliance" is long enough. - assert anchored("kyc compliance review", "compliance framework") is True - - -def test_anchored_custom_min_token(): - # Lower the floor so short tokens can anchor. 
- assert anchored("kyc check", "kyc process", min_token=3) is True - - -def test_anchored_both_empty(): - assert anchored("", "") is False - - -def test_anchored_partial_token_no_match(): - # "risk" (4 chars) is below the default 5-char floor. - assert anchored("risk alert", "risk factor") is False - - -def test_anchored_returns_bool(): - result = anchored("credit underwriting", "underwriting model") - assert isinstance(result, bool) - - -# ── source_stem ─────────────────────────────────────────────────────────────── - - -def test_source_stem_bare_filename_with_extension(): - assert source_stem("SOP-002-kyc-edd.md") == "sop-002-kyc-edd" - - -def test_source_stem_directory_prefixed(): - assert source_stem("sops/SOP-002-kyc-edd.md") == "sop-002-kyc-edd" - - -def test_source_stem_deep_path_prefix(): - assert source_stem("docs/policies/SOP-002-kyc-edd.md") == "sop-002-kyc-edd" - - -def test_source_stem_lowercase(): - # Stems are always lowercased. - assert source_stem("REPORT-FINAL.pdf") == "report-final" - - -def test_source_stem_event_log_row_id(): - # src-: → process stem. - assert source_stem("src-credit-underwriting:CU-2026-1003") == "credit-underwriting" - - -def test_source_stem_event_log_row_id_preserves_hyphens(): - assert source_stem("src-kyc-onboarding:KYC-001") == "kyc-onboarding" - - -def test_source_stem_strips_fragment(): - # #page=N should be removed before stemming. - assert source_stem("docs/report.pdf#page=5") == "report" - - -def test_source_stem_strips_anchor(): - assert source_stem("sops/SOP-001.md#section-3") == "sop-001" - - -def test_source_stem_bare_no_extension(): - # No extension, no directory — stem is just the lowercase name. 
- assert source_stem("my-document") == "my-document" - - -def test_source_stem_no_directory_no_extension_lowercase(): - assert source_stem("Signal") == "signal" - - -def test_source_stem_csv_extension(): - assert source_stem("activity-cost-fte.csv") == "activity-cost-fte" - - -# ── matches ─────────────────────────────────────────────────────────────────── - - -def _make_item(description: str, evidence: list[str], keywords: list[str] | None = None) -> RegistryItem: - """Construct a minimal RegistryItem for matching tests.""" - return RegistryItem( - id="test-item", - tier="L1", - description=description, - evidence=evidence, - scope="finding", - keywords=keywords or [], - ) - - -def _make_finding(title: str, description: str, evidence_id: str) -> dict: - return { - "title": title, - "description": description, - "evidence_refs": [{"evidence_id": evidence_id}], - } - - -def _make_evidence_index(evidence_id: str, locator: str, excerpt: str = "") -> dict: - return {evidence_id: {"id": evidence_id, "locator": locator, "excerpt": excerpt}} - - -def test_matches_true_when_source_and_topic_match(): - # Finding title shares a long token with item description and cites the same source. - item = _make_item("credit underwriting process", ["sop-kyc-credit.md"]) - finding = _make_finding("credit underwriting review", "credit underwriting risk assessment", "ev-1") - evidence_index = _make_evidence_index("ev-1", "sop-kyc-credit.md") - assert matches(finding, item, evidence_index, scope="finding") is True - - -def test_matches_false_when_source_differs(): - # Token match exists but sources don't overlap — anti-gaming guard fires. 
- item = _make_item("credit underwriting process", ["sop-credit.md"]) - finding = _make_finding("credit underwriting review", "credit underwriting details", "ev-1") - evidence_index = _make_evidence_index("ev-1", "other-document.md") - assert matches(finding, item, evidence_index, scope="finding") is False - - -def test_matches_false_when_no_token_overlap(): - # Same source, but no shared long token between finding text and item description. - item = _make_item("regulatory capital requirement", ["sop-capital.md"]) - finding = _make_finding("kyc identity check", "client onboarding steps", "ev-1") - evidence_index = _make_evidence_index("ev-1", "sop-capital.md") - assert matches(finding, item, evidence_index, scope="finding") is False - - -def test_matches_keyword_rail_short_token(): - # "KYC" is 3 chars — below the 5-char token floor but valid as a keyword. - item = _make_item("some description about identity", ["sop-kyc.md"], keywords=["KYC"]) - finding = _make_finding("KYC onboarding", "KYC onboarding process", "ev-1") - evidence_index = _make_evidence_index("ev-1", "sop-kyc.md") - assert matches(finding, item, evidence_index, scope="finding") is True - - -def test_matches_empty_evidence_refs_returns_false(): - # Finding with no evidence refs cannot share a source with any item. 
- item = _make_item("credit underwriting", ["sop-credit.md"]) - finding = {"title": "credit underwriting", "description": "credit underwriting risk", "evidence_refs": []} - assert matches(finding, item, {}, scope="finding") is False From f769ef1c40d28067040faa8a5f662038d4765eb0 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:19:23 +0200 Subject: [PATCH 28/48] chore(evaluation): delete test_stats.py --- tests/unit/evaluation/test_stats.py | 183 ---------------------------- 1 file changed, 183 deletions(-) delete mode 100644 tests/unit/evaluation/test_stats.py diff --git a/tests/unit/evaluation/test_stats.py b/tests/unit/evaluation/test_stats.py deleted file mode 100644 index 9523be8c..00000000 --- a/tests/unit/evaluation/test_stats.py +++ /dev/null @@ -1,183 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Unit tests for evaluation.stats: aa_band, aggregate_grounding, left_skew_flag.""" - -from __future__ import annotations - -import pytest - -from fireflyframework_agentic.evaluation.stats import ( - aa_band, - aggregate_grounding, - left_skew_flag, -) - - -# ── aa_band ────────────────────────────────────────────────────────────────── - - -def test_aa_band_two_identical_scores(): - # Two identical scores produce zero pairwise delta. 
- assert aa_band([0.80, 0.80]) == 0.0 - - -def test_aa_band_two_different_scores(): - # Single delta = |0.90 - 0.80| = 0.10; 95th percentile of one value is that value. - result = aa_band([0.80, 0.90]) - assert abs(result - 0.10) < 1e-9 - - -def test_aa_band_three_scores_known_deltas(): - # Scores: 0.70, 0.80, 0.90 - # Pairwise deltas: |0.70-0.80|=0.10, |0.70-0.90|=0.20, |0.80-0.90|=0.10 - # Sorted: [0.10, 0.10, 0.20] → 95th pct index = int(3 * 95 / 100) = 2 → 0.20 - result = aa_band([0.70, 0.80, 0.90]) - assert abs(result - 0.20) < 1e-9 - - -def test_aa_band_large_spread(): - # Max delta in [0.0, 1.0] is 1.0. - result = aa_band([0.0, 1.0]) - assert abs(result - 1.0) < 1e-9 - - -def test_aa_band_requires_at_least_two_scores(): - with pytest.raises(ValueError, match="aa_band requires >= 2 reruns"): - aa_band([0.80]) - - -def test_aa_band_empty_raises(): - with pytest.raises(ValueError, match="aa_band requires >= 2 reruns"): - aa_band([]) - - -def test_aa_band_custom_percentile(): - # 50th percentile of [0.10, 0.10, 0.20] at idx=1 → 0.10. 
- result = aa_band([0.70, 0.80, 0.90], percentile=50) - assert abs(result - 0.10) < 1e-9 - - -def test_aa_band_returns_float(): - result = aa_band([0.80, 0.85, 0.90]) - assert isinstance(result, float) - - -# ── aggregate_grounding ─────────────────────────────────────────────────────── - - -def test_aggregate_grounding_single_dict(): - g = {"support_pct": 90.0, "unsupported_ids": ["ev-1"]} - result = aggregate_grounding([g]) - assert result["support_pct"] == 90.0 - assert result["unsupported_ids"] == ["ev-1"] - assert result["_aggregate_runs"] == 1 - - -def test_aggregate_grounding_mean_support_pct(): - dicts = [ - {"support_pct": 80.0, "unsupported_ids": []}, - {"support_pct": 100.0, "unsupported_ids": []}, - ] - result = aggregate_grounding(dicts) - assert result["support_pct"] == 90.0 - - -def test_aggregate_grounding_union_of_unsupported_ids(): - dicts = [ - {"support_pct": 90.0, "unsupported_ids": ["ev-1", "ev-2"]}, - {"support_pct": 85.0, "unsupported_ids": ["ev-2", "ev-3"]}, - ] - result = aggregate_grounding(dicts) - assert set(result["unsupported_ids"]) == {"ev-1", "ev-2", "ev-3"} - - -def test_aggregate_grounding_union_sorted(): - dicts = [ - {"support_pct": 90.0, "unsupported_ids": ["ev-b"]}, - {"support_pct": 90.0, "unsupported_ids": ["ev-a"]}, - ] - result = aggregate_grounding(dicts) - assert result["unsupported_ids"] == ["ev-a", "ev-b"] - - -def test_aggregate_grounding_empty_input(): - result = aggregate_grounding([]) - assert result["support_pct"] == 0.0 - assert result["unsupported_ids"] == [] - - -def test_aggregate_grounding_records_run_count(): - dicts = [ - {"support_pct": 80.0, "unsupported_ids": []}, - {"support_pct": 90.0, "unsupported_ids": []}, - {"support_pct": 100.0, "unsupported_ids": []}, - ] - result = aggregate_grounding(dicts) - assert result["_aggregate_runs"] == 3 - - -def test_aggregate_grounding_per_run_pct_recorded(): - dicts = [ - {"support_pct": 80.0, "unsupported_ids": []}, - {"support_pct": 100.0, "unsupported_ids": []}, 
- ] - result = aggregate_grounding(dicts) - assert result["_support_pct_per_run"] == [80.0, 100.0] - - -def test_aggregate_grounding_missing_unsupported_ids_treated_as_empty(): - dicts = [ - {"support_pct": 90.0}, # no unsupported_ids key - {"support_pct": 80.0, "unsupported_ids": ["ev-1"]}, - ] - result = aggregate_grounding(dicts) - assert result["unsupported_ids"] == ["ev-1"] - - -# ── left_skew_flag ──────────────────────────────────────────────────────────── - - -def test_left_skew_flag_true_when_catastrophic_run(): - # median([0.80, 0.80, 0.80]) = 0.80; min = 0.60 < 0.80 - 0.10 = 0.70. - assert left_skew_flag([0.60, 0.80, 0.80]) is True - - -def test_left_skew_flag_false_when_min_close_to_median(): - # median = 0.80; min = 0.75; 0.75 >= 0.80 - 0.10 = 0.70 → no flag. - assert left_skew_flag([0.75, 0.80, 0.85]) is False - - -def test_left_skew_flag_false_when_all_equal(): - assert left_skew_flag([0.85, 0.85, 0.85]) is False - - -def test_left_skew_flag_boundary_just_above_threshold(): - # min = 0.71, median = 0.80; 0.71 >= 0.80 - 0.10 = 0.70 → no flag. - assert left_skew_flag([0.71, 0.80, 0.80]) is False - - -def test_left_skew_flag_single_score_always_false(): - # A single score has no meaningful distribution; function returns False. - assert left_skew_flag([0.50]) is False - - -def test_left_skew_flag_two_scores_with_large_gap(): - # median([0.50, 0.90]) = 0.70; min = 0.50 < 0.70 - 0.10 = 0.60. 
- assert left_skew_flag([0.50, 0.90]) is True - - -def test_left_skew_flag_returns_bool(): - result = left_skew_flag([0.80, 0.85, 0.90]) - assert isinstance(result, bool) From 251605211cff33f0e89fcbe26f8aefdbfab0fa72 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:20:17 +0200 Subject: [PATCH 29/48] feat(evaluation): rewrite judge_client.py as async (httpx.AsyncClient) --- .../evaluation/judge_client.py | 382 +++++------------- 1 file changed, 91 insertions(+), 291 deletions(-) diff --git a/fireflyframework_agentic/evaluation/judge_client.py b/fireflyframework_agentic/evaluation/judge_client.py index 1af17f53..7f050d16 100644 --- a/fireflyframework_agentic/evaluation/judge_client.py +++ b/fireflyframework_agentic/evaluation/judge_client.py @@ -1,60 +1,24 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +"""Async LLM scoring client for judge metrics. -"""Provider-agnostic LLM-as-a-Judge client for the G4 advisory gate. - -Zero new dependencies: stdlib (urllib.request, json, os, time, re) + numpy. -The client is a thin POST wrapper over four chat providers (Anthropic, OpenAI, -Azure OpenAI, Ollama) plus an Ollama embedder. It is deliberately tolerant: -chat_json extracts the FIRST JSON object from the model text (models wrap JSON -in prose / code fences), and retries transient HTTP errors with backoff. - -This module is import-safe: importing it touches NO network and reads NO API -key. 
Keys are read lazily, per-call, only when a real request is made — so the -judge tests can import and inject stubs without any secret present. - -Provider/model spec format: ":", e.g. "anthropic:claude-sonnet-4-6", -"openai:gpt-4o", "azure:gpt-4o", "ollama:llama3". A bare model with no prefix is -treated as provider "unknown" (see parse_model / same_provider). +Thin httpx-based wrapper over Anthropic / OpenAI / Azure OpenAI / Ollama. +Reads API keys lazily (per-call) from env so importing never requires secrets. +Provider/model spec: ":", e.g. "anthropic:claude-sonnet-4-6". """ from __future__ import annotations +import asyncio import json import os import re -import time -import urllib.error -import urllib.request -import numpy as np +import httpx -# Transient HTTP status codes worth retrying (rate limit + 5xx). _RETRY_STATUS = (429, 500, 502, 503, 504) - -# Hard cap on a honoured Retry-After sleep (a hostile header should not stall us). _MAX_RETRY_AFTER = 30.0 -def _env(name, default=None): - """Read an env var, stripping surrounding whitespace; empty-after-strip -> default. - - Defensive against a ``.env`` value that arrives with a trailing ``\\r`` / - whitespace (CRLF), which would otherwise corrupt a request URL or header. - An unset OR blank value falls back to ``default`` so the existing - missing-key -> RuntimeError behaviour is preserved. - """ +def _env(name: str, default: str | None = None) -> str | None: value = os.environ.get(name) if value is None: return default @@ -62,30 +26,8 @@ def _env(name, default=None): return value if value else default -def _retry_delay(exc: urllib.error.HTTPError, attempt: int) -> float: - """Seconds to sleep before retrying an HTTPError. - - On 429 honour the ``Retry-After`` header (capped at 30s) when it is present - and numeric; otherwise fall back to exponential backoff (2 ** attempt). 
- """ - if exc.code == 429: - headers = getattr(exc, "headers", None) - retry_after = headers.get("retry-after") if headers is not None else None - if retry_after is not None: - try: - return min(float(retry_after), _MAX_RETRY_AFTER) - except (TypeError, ValueError): - pass - return 2.0**attempt - - def parse_model(spec: str) -> tuple[str, str]: - """Split a "provider:model" spec into (provider, model). - - A bare spec with no ':' is returned as provider "unknown" with the whole - string as the model, e.g. "claude-sonnet-4-6" -> ("unknown", "claude-sonnet-4-6"). - The provider is lower-cased; the model keeps its original case. - """ + """Split "provider:model" -> (provider, model). Bare spec -> ("unknown", spec).""" spec = (spec or "").strip() if ":" not in spec: return "unknown", spec @@ -94,28 +36,16 @@ def parse_model(spec: str) -> tuple[str, str]: def same_provider(pipeline_model: str, judge_model: str) -> bool: - """True iff both specs name the SAME known provider prefix. - - A missing or "unknown" provider on either side -> not-same (False). This is - the same-provider caveat signal: when the judge and the pipeline share a - provider the judged metrics are advisory (no cross-provider isolation). - """ - p_provider, _ = parse_model(pipeline_model) - j_provider, _ = parse_model(judge_model) - if p_provider == "unknown" or j_provider == "unknown": + """True iff both specs share the same known provider prefix.""" + p, _ = parse_model(pipeline_model) + j, _ = parse_model(judge_model) + if p == "unknown" or j == "unknown": return False - return p_provider == j_provider + return p == j def _first_json_object(text: str) -> dict: - """Extract and parse the FIRST balanced JSON object embedded in text. - - Models wrap JSON in prose, preambles, or ```json code fences. This scans - for the first '{' and walks the string tracking brace depth (string-aware, - so braces inside quoted values do not confuse the matcher) to find its - matching '}'. 
Falls back to a greedy regex span if no balanced object is - found. Raises ValueError when nothing parses. - """ + """Extract the first balanced JSON object from text (handles prose/code-fence wrapping).""" if not text: raise ValueError("empty model response") @@ -165,38 +95,12 @@ def _first_json_object(text: str) -> dict: raise ValueError("no JSON object found in model response") -def _http_post_json(url: str, headers: dict, body: dict, timeout: int) -> dict: - """POST a JSON body and return the parsed JSON response (single attempt).""" - data = json.dumps(body).encode("utf-8") - req_headers = {"content-type": "application/json", **headers} - req = urllib.request.Request(url, data=data, headers=req_headers, method="POST") - with urllib.request.urlopen(req, timeout=timeout) as resp: - return json.loads(resp.read().decode("utf-8")) - - -def _extract_openai_text(resp: dict) -> str: - """Pull the assistant text from an OpenAI/Azure chat-completions response. - - Guards an empty ``choices`` list and a null ``message.content`` and raises a - descriptive RuntimeError (not a KeyError) when no text is present, so the - judge layer records a clean dropped-vote reason instead of a stack trace. - """ - choices = resp.get("choices") or [] - if choices: - text = (choices[0].get("message") or {}).get("content") - if text: - return text - raise RuntimeError(f"judge returned no text: {resp}") - - class JudgeClient: - """Minimal multi-provider chat client returning parsed JSON dicts. + """Async multi-provider chat client returning parsed JSON dicts. Dispatch is by the provider prefix of the model spec. temperature is pinned - to 0.0 for deterministic verdicts. Transient HTTP errors (429/5xx) and URL - errors are retried up to max_retries: a 429 honours the ``Retry-After`` - header (capped at 30s) when present, otherwise backoff is exponential - (2 ** attempt seconds). + to 0.0 for deterministic verdicts. 
Transient HTTP errors (429/5xx) and network + errors are retried up to max_retries with backoff. The API key / endpoint env vars are read lazily inside chat_json, so constructing a JudgeClient never requires a secret. @@ -208,48 +112,49 @@ def __init__(self, model: str, timeout: int = 120, max_retries: int = 3) -> None self.timeout = timeout self.max_retries = max_retries - def chat_json(self, system: str, user: str, max_tokens: int = 1024) -> dict: + async def chat_json(self, system: str, user: str, max_tokens: int = 1024) -> dict: """Send (system, user) to the provider and parse the first JSON object. Raises on exhausted retries / unknown provider / unparseable output. - The judge module wraps every call in try/except, so a raise here becomes - a dropped vote rather than a crash. """ last_exc: Exception | None = None for attempt in range(self.max_retries): try: - text = self._dispatch(system, user, max_tokens) - return _first_json_object(text) - except urllib.error.HTTPError as exc: + if self.provider == "anthropic": + return await self._anthropic(system, user, max_tokens) + if self.provider == "openai": + return await self._openai(system, user, max_tokens) + if self.provider == "azure": + return await self._azure(system, user, max_tokens) + if self.provider == "ollama": + return await self._ollama(system, user, max_tokens) + raise ValueError( + f"unknown judge provider {self.provider!r} in {self.model_spec!r}; " + "use anthropic:/openai:/azure:/ollama:" + ) + except httpx.HTTPStatusError as exc: last_exc = exc - if exc.code not in _RETRY_STATUS or attempt == self.max_retries - 1: + if exc.response.status_code not in _RETRY_STATUS or attempt == self.max_retries - 1: raise - time.sleep(_retry_delay(exc, attempt)) - except (urllib.error.URLError, TimeoutError, ConnectionError) as exc: + retry_after_header = exc.response.headers.get("retry-after") + if retry_after_header is not None: + try: + delay = min(float(retry_after_header), _MAX_RETRY_AFTER) + except 
(TypeError, ValueError): + delay = 2.0**attempt + else: + delay = 2.0**attempt + await asyncio.sleep(delay) + except httpx.RequestError as exc: last_exc = exc if attempt == self.max_retries - 1: raise - time.sleep(2**attempt) + await asyncio.sleep(2.0) if last_exc is not None: raise last_exc raise RuntimeError("chat_json exhausted retries without a response") - def _dispatch(self, system: str, user: str, max_tokens: int) -> str: - """Route to the per-provider call and return the raw model text.""" - if self.provider == "anthropic": - return self._anthropic(system, user, max_tokens) - if self.provider == "openai": - return self._openai(system, user, max_tokens) - if self.provider == "azure": - return self._azure(system, user, max_tokens) - if self.provider == "ollama": - return self._ollama(system, user, max_tokens) - raise ValueError( - f"unknown judge provider {self.provider!r} in {self.model_spec!r}; " - "use anthropic:/openai:/azure:/ollama:" - ) - - def _anthropic(self, system: str, user: str, max_tokens: int) -> str: + async def _anthropic(self, system: str, user: str, max_tokens: int) -> dict: api_key = _env("ANTHROPIC_API_KEY") if not api_key: raise RuntimeError("ANTHROPIC_API_KEY not set") @@ -260,16 +165,21 @@ def _anthropic(self, system: str, user: str, max_tokens: int) -> str: "system": system, "messages": [{"role": "user", "content": user}], } - headers = {"x-api-key": api_key, "anthropic-version": "2023-06-01"} - resp = _http_post_json("https://api.anthropic.com/v1/messages", headers, body, self.timeout) - text = next( - (b.get("text") for b in resp.get("content", []) if b.get("type") == "text"), None - ) + headers = { + "x-api-key": api_key, + "anthropic-version": "2023-06-01", + "content-type": "application/json", + } + async with httpx.AsyncClient(timeout=self.timeout) as client: + resp = await client.post("https://api.anthropic.com/v1/messages", json=body, headers=headers) + resp.raise_for_status() + data = resp.json() + text = next((b.get("text") 
for b in data.get("content", []) if b.get("type") == "text"), None) if not text: - raise RuntimeError(f"judge returned no text: {resp}") - return text + raise RuntimeError(f"judge returned no text: {data}") + return _first_json_object(text) - def _openai(self, system: str, user: str, max_tokens: int) -> str: + async def _openai(self, system: str, user: str, max_tokens: int) -> dict: api_key = _env("OPENAI_API_KEY") if not api_key: raise RuntimeError("OPENAI_API_KEY not set") @@ -282,25 +192,27 @@ def _openai(self, system: str, user: str, max_tokens: int) -> str: {"role": "user", "content": user}, ], } - headers = {"Authorization": f"Bearer {api_key}"} - resp = _http_post_json( - "https://api.openai.com/v1/chat/completions", headers, body, self.timeout - ) - return _extract_openai_text(resp) + headers = {"Authorization": f"Bearer {api_key}", "content-type": "application/json"} + async with httpx.AsyncClient(timeout=self.timeout) as client: + resp = await client.post("https://api.openai.com/v1/chat/completions", json=body, headers=headers) + resp.raise_for_status() + data = resp.json() + choices = data.get("choices") or [] + if choices: + text = (choices[0].get("message") or {}).get("content") + if text: + return _first_json_object(text) + raise RuntimeError(f"judge returned no text: {data}") - def _azure(self, system: str, user: str, max_tokens: int) -> str: + async def _azure(self, system: str, user: str, max_tokens: int) -> dict: endpoint = _env("AZURE_OPENAI_ENDPOINT") api_key = _env("AZURE_OPENAI_API_KEY") if not endpoint: raise RuntimeError("AZURE_OPENAI_ENDPOINT not set") if not api_key: raise RuntimeError("AZURE_OPENAI_API_KEY not set") - api_version = _env("AZURE_OPENAI_API_VERSION") or "2024-06-01" - # Azure deployment lives in the URL path, not the JSON body. 
- url = ( - f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/chat/completions" - f"?api-version={api_version}" - ) + api_version = _env("AZURE_OPENAI_API_VERSION") or "2024-02-01" + url = f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/chat/completions?api-version={api_version}" body = { "max_tokens": max_tokens, "temperature": 0.0, @@ -309,146 +221,34 @@ def _azure(self, system: str, user: str, max_tokens: int) -> str: {"role": "user", "content": user}, ], } - headers = {"api-key": api_key} - resp = _http_post_json(url, headers, body, self.timeout) - return _extract_openai_text(resp) + headers = {"api-key": api_key, "content-type": "application/json"} + async with httpx.AsyncClient(timeout=self.timeout) as client: + resp = await client.post(url, json=body, headers=headers) + resp.raise_for_status() + data = resp.json() + choices = data.get("choices") or [] + if choices: + text = (choices[0].get("message") or {}).get("content") + if text: + return _first_json_object(text) + raise RuntimeError(f"judge returned no text: {data}") - def _ollama(self, system: str, user: str, max_tokens: int) -> str: + async def _ollama(self, system: str, user: str, max_tokens: int) -> dict: # noqa: ARG002 host = _env("OLLAMA_HOST") or "http://localhost:11434" body = { "model": self.model, "stream": False, - "options": {"temperature": 0.0, "num_predict": max_tokens}, + "options": {"temperature": 0.0}, "messages": [ {"role": "system", "content": system}, {"role": "user", "content": user}, ], } - resp = _http_post_json(f"{host.rstrip('/')}/api/chat", {}, body, self.timeout) - text = (resp.get("message") or {}).get("content") + async with httpx.AsyncClient(timeout=self.timeout) as client: + resp = await client.post(f"{host.rstrip('/')}/api/chat", json=body) + resp.raise_for_status() + data = resp.json() + text = (data.get("message") or {}).get("content") if not text: - raise RuntimeError(f"judge returned no text: {resp}") - return text - - -class OpenAIEmbedder: - """OpenAI 
embeddings client over /v1/embeddings. - - Reads OPENAI_API_KEY from the environment. Default model: text-embedding-3-small. - """ - - def __init__(self, model: str = "text-embedding-3-small", timeout: int = 60) -> None: - self.model = model - self.timeout = timeout - - def embed(self, texts: list[str]) -> np.ndarray: - api_key = _env("OPENAI_API_KEY") - if not api_key: - raise RuntimeError("OPENAI_API_KEY not set") - headers = {"Authorization": f"Bearer {api_key}"} - body = {"model": self.model, "input": texts} - resp = _http_post_json("https://api.openai.com/v1/embeddings", headers, body, self.timeout) - data = resp.get("data", []) - vectors = [item["embedding"] for item in sorted(data, key=lambda x: x["index"])] - return np.asarray(vectors, dtype=np.float32) - - -class AzureOpenAIEmbedder: - """Azure OpenAI embeddings client. - - Reads AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_KEY, and optionally - AZURE_OPENAI_API_VERSION from the environment. The model name is the - deployment name. Default model: text-embedding-3-small. 
- """ - - def __init__(self, model: str = "text-embedding-3-small", timeout: int = 60) -> None: - self.model = model - self.timeout = timeout - - def embed(self, texts: list[str]) -> np.ndarray: - endpoint = _env("AZURE_OPENAI_ENDPOINT") - api_key = _env("AZURE_OPENAI_API_KEY") - if not endpoint: - raise RuntimeError("AZURE_OPENAI_ENDPOINT not set") - if not api_key: - raise RuntimeError("AZURE_OPENAI_API_KEY not set") - api_version = _env("AZURE_OPENAI_API_VERSION") or "2024-06-01" - url = ( - f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/embeddings" - f"?api-version={api_version}" - ) - headers = {"api-key": api_key} - vectors = self._embed_with_split(texts, url, headers) - return np.asarray(vectors, dtype=np.float32) - - def _embed_with_split(self, texts: list[str], url: str, headers: dict) -> list[list[float]]: - """Send texts in one request; on HTTP 400 split in half and retry each half.""" - try: - resp = _http_post_json(url, headers, {"input": texts}, self.timeout) - data = resp.get("data", []) - return [item["embedding"] for item in sorted(data, key=lambda x: x["index"])] - except urllib.error.HTTPError as exc: - if exc.code == 400 and len(texts) > 1: - mid = len(texts) // 2 - left = self._embed_with_split(texts[:mid], url, headers) - right = self._embed_with_split(texts[mid:], url, headers) - return left + right - raise - - -class OllamaEmbedder: - """Local Ollama embedding client (default model bge-m3) over /api/embeddings. - - Posts one prompt per call (the stable single-prompt form) and stacks the - returned vectors into a 2-D numpy array. Constructing it touches no network; - the host is resolved from $OLLAMA_HOST at call time. 
- """ - - def __init__(self, model: str = "bge-m3", host: str | None = None, timeout: int = 60) -> None: - self.model = model - self.host = (host or _env("OLLAMA_HOST") or "http://localhost:11434").rstrip("/") - self.timeout = timeout - - def embed(self, texts: list[str]) -> np.ndarray: - """Embed a list of strings -> float32 ndarray of shape (len(texts), dim).""" - vectors: list[list[float]] = [] - for text in texts: - body = {"model": self.model, "prompt": text} - resp = _http_post_json(f"{self.host}/api/embeddings", {}, body, self.timeout) - vectors.append(resp["embedding"]) - return np.asarray(vectors, dtype=np.float32) - - -def build_embedder(spec: str): - """Return an ``embed_fn(list[str]) -> np.ndarray`` for an embedder spec. - - Dispatch is on the provider prefix of a ":" spec: - - "ollama" / "ollama:" -> OllamaEmbedder(model or "bge-m3").embed. - - a bare "" with no ':' -> treated as an Ollama model. - - any other provider -> NotImplementedError (the extension point). - - Add a new backend by adding a branch here. 
- """ - if (spec or "").strip() == "ollama": # bare provider, no model -> default model - return OllamaEmbedder("bge-m3").embed - provider, model = parse_model(spec) - if provider in ("unknown", "ollama"): # bare "" or "ollama:" - return OllamaEmbedder(model or "bge-m3").embed - if provider == "openai": - return OpenAIEmbedder(model or "text-embedding-3-small").embed - if provider == "azure": - return AzureOpenAIEmbedder(model or "text-embedding-3-small").embed - raise NotImplementedError( - f"embedder backend {provider!r} not implemented yet; add it in build_embedder()" - ) - - -def cosine(a, b) -> float: - """Cosine similarity between two 1-D vectors; 0.0 if either is the zero vector.""" - a = np.asarray(a, dtype=np.float64).ravel() - b = np.asarray(b, dtype=np.float64).ravel() - na = float(np.linalg.norm(a)) - nb = float(np.linalg.norm(b)) - if na == 0.0 or nb == 0.0: - return 0.0 - return float(np.dot(a, b) / (na * nb)) + raise RuntimeError(f"judge returned no text: {data}") + return _first_json_object(text) From 5609ab67d43af810c0bf29b8f7b9f7100d50de15 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:24:09 +0200 Subject: [PATCH 30/48] =?UTF-8?q?feat(evaluation):=20rewrite=20judge.py=20?= =?UTF-8?q?=E2=80=94=20async=20metrics=20+=20EvalContext=20+=20flycanon=20?= =?UTF-8?q?+=20RAGAS?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fireflyframework_agentic/evaluation/judge.py | 769 ++++++++++--------- 1 file changed, 415 insertions(+), 354 deletions(-) diff --git a/fireflyframework_agentic/evaluation/judge.py b/fireflyframework_agentic/evaluation/judge.py index a347c8e1..9f24dc26 100644 --- a/fireflyframework_agentic/evaluation/judge.py +++ b/fireflyframework_agentic/evaluation/judge.py @@ -1,61 +1,48 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""G4 — LLM-as-a-Judge: an opt-in, NON-BLOCKING, NON-DETERMINISTIC advisory gate. - -G4 NEVER affects the PROMOTE/HOLD verdict and NEVER raises into the caller. -run_judge() wraps every metric in try/except; a failing metric appends to -report.errors and the run continues (best-effort). The result is an -AdvisoryReport, NOT a GateResult — it is carried separately so it can never -enter verdict() or the Skipped tuple (see scorecard / verdict_unaffected_note). - -Three families of metric (matching the flyradar contracts): -- [D] DETERMINISTIC — pure python, no LLM, printed even when the judge is off: - source_coverage, excerpt_fill_rate. -- [E] EMBEDDING — needs an embed_fn (local Ollama BGE by default): - semantic_recovery (context recall). -- [J] JUDGE — needs a chat_fn(system, user) -> dict; each [J] metric instructs - the model to reply with ONLY JSON: faithfulness, numeric_temporal_fidelity, - citation_relevance, nc_semantic_precision, fabricated_entity, contradiction, - open_gap, actionability, severity_calibration, answer_relevancy, - comparative_vs_champion. - -Aggregation follows the flycanon custom-judge design: run each [J] metric `runs` -times and take the MEDIAN of its numeric scores (robust to an outlier vote). - -Zero new dependencies: stdlib (json, statistics) + numpy. All imports at top. -calibrated is ALWAYS False for now (LLM-as-a-Judge calibration is §14, future work). +"""Evaluation judge — async metrics for flyradar and flycanon pipelines. 
+ +Every metric: async def metric_name(item: dict, ctx: EvalContext) -> dict | float | None + +Flyradar item keys: findings, evidence_index, process_graph, proposed_actions, + workspace, reports, lexical_missed_ids, nc_items, champion +Flycanon item keys: question, answer, reference, contexts """ from __future__ import annotations -import concurrent.futures +import asyncio +import math +import os import statistics +from collections.abc import Awaitable, Callable from dataclasses import dataclass, field -import numpy as np +from pydantic import BaseModel, ConfigDict -from fireflyframework_agentic.evaluation.judge_client import ( - JudgeClient, - OllamaEmbedder, - cosine, - same_provider, -) -from fireflyframework_agentic.evaluation.matcher import source_stem +from fireflyframework_agentic.embeddings.providers.ollama import OllamaEmbedder +from fireflyframework_agentic.embeddings.similarity import cosine_similarity +from fireflyframework_agentic.evaluation.judge_client import JudgeClient, same_provider + +Metric = Callable[["dict", "EvalContext"], Awaitable["dict | float | None"]] SYSTEM = "You are a meticulous evaluator of a process-mining discovery report. Return ONLY a JSON object." +SYSTEM_RAG = "You are an evaluator of a RAG system's answers. Return ONLY a JSON object." + +RUBRIC = ( + "Score the ANSWER on two metrics:\n" + "- contains_answer (0.0-1.0): Does the answer contain the correct information from the REFERENCE?\n" + "- addresses_question (0.0-1.0): Does the answer directly address what the QUESTION is asking?\n" + 'Reply with ONLY {"contains_answer": , "addresses_question": }.' 
+) + + +class EvalContext(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + + client: JudgeClient + embedder: OllamaEmbedder | None = None + runs: int = 3 + @dataclass class AdvisoryReport: @@ -68,7 +55,7 @@ class AdvisoryReport: judge_model: str same_provider_caveat: bool - calibrated: bool # ALWAYS False for now (§14) + calibrated: bool # ALWAYS False for now runs: int metrics: dict = field(default_factory=dict) details: dict = field(default_factory=dict) @@ -78,8 +65,8 @@ class AdvisoryReport: # ── shared accessors ─────────────────────────────────────────────────────────── -def _evidence_index(result: dict) -> dict[str, dict]: - return {ev.get("id"): ev for ev in result.get("evidence_index", []) if ev.get("id")} +def _evidence_index(item: dict) -> dict[str, dict]: + return {ev.get("id"): ev for ev in item.get("evidence_index", []) if ev.get("id")} def _cited_excerpts(finding: dict, evidence_index: dict[str, dict]) -> list[str]: @@ -94,143 +81,116 @@ def _cited_excerpts(finding: dict, evidence_index: dict[str, dict]) -> list[str] return out -def _output_text(result: dict) -> str: +def _output_text(item: dict) -> str: """All free text the model emitted: finding titles+descriptions + reports.""" parts: list[str] = [] - for f in result.get("findings", []): + for f in item.get("findings", []): parts.append(f.get("title", "")) parts.append(f.get("description", "")) - for r in result.get("reports", []): + for r in item.get("reports", []): parts.append(str(r)) return "\n".join(p for p in parts if p) -def _workspace_intention(result: dict) -> str: - ws = result.get("workspace") or {} +def _workspace_intention(item: dict) -> str: + ws = item.get("workspace") or {} return f"{ws.get('name', '')}\n{ws.get('description', '')}".strip() def _coerce_float(value, default=None): - """Coerce a model-returned number/numeric-string to float; total (never raises). 
- - Returns ``default`` (None) on junk so one malformed vote drops that single - vote instead of discarding the whole metric. - """ + """Coerce a model-returned number/numeric-string to float; total (never raises).""" try: return float(value) except (TypeError, ValueError): return default -def _map_chat(chat_fn, prompts, workers=1): - """Run a list of (system, user) chat prompts, returning ordered result dicts. +def _source_stem(locator: str) -> str: + """Return the part before the first '#', or the full string if no '#'.""" + idx = locator.find("#") + return locator[:idx] if idx != -1 else locator - ``workers <= 1`` calls ``chat_fn`` SEQUENTIALLY — byte-for-byte identical to - the in-line loops it replaces, INCLUDING letting a raise propagate (so - run_judge's per-metric try/except still drops that whole metric, the - behaviour the suite locks in). - ``workers >= 2`` fans the calls out across a ThreadPoolExecutor while - PRESERVING input order in the returned list. Concurrency cannot let one - raising future poison the batch, so in that path a raising call's slot - becomes ``{}`` — the metric's aggregation degrades for that one vote but - never raises (the same best-effort contract as run_judge). 
- """ - prompts = list(prompts) - if workers <= 1: - return [chat_fn(system, user) for system, user in prompts] - - results: list[dict] = [{} for _ in prompts] - with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor: - futures = { - executor.submit(chat_fn, system, user): idx - for idx, (system, user) in enumerate(prompts) - } - for future in concurrent.futures.as_completed(futures): - idx = futures[future] - try: - results[idx] = future.result() - except Exception: # best-effort: a dropped vote, never a raise - results[idx] = {} - return results +async def _gather_chat(chat_fn, prompts: list[tuple[str, str]]) -> list[dict]: + """Run a list of (system, user) prompts concurrently, returning ordered results.""" + results = await asyncio.gather(*[chat_fn(s, u) for s, u in prompts], return_exceptions=True) + return [r if isinstance(r, dict) else {} for r in results] # ── [D] DETERMINISTIC — no LLM, always available ──────────────────────────────── -def source_coverage(result: dict) -> dict: +async def source_coverage(item: dict, ctx: EvalContext) -> dict: # noqa: ARG001 """Distinct source documents cited by >=1 finding vs all source documents. Returns {cited, total, orphaned} where orphaned is the sorted list of source stems present in evidence_index but cited by no finding. 
""" - evidence_index = _evidence_index(result) - all_stems = { - source_stem(ev.get("locator", "")) - for ev in result.get("evidence_index", []) - if ev.get("locator") - } + ev_idx = _evidence_index(item) + all_stems = {_source_stem(ev.get("locator", "")) for ev in item.get("evidence_index", []) if ev.get("locator")} cited_stems: set[str] = set() - for f in result.get("findings", []): + for f in item.get("findings", []): for ref in f.get("evidence_refs", []): - ev = evidence_index.get(ref.get("evidence_id", "")) + ev = ev_idx.get(ref.get("evidence_id", "")) if ev and ev.get("locator"): - cited_stems.add(source_stem(ev["locator"])) + cited_stems.add(_source_stem(ev["locator"])) cited_stems &= all_stems orphaned = sorted(all_stems - cited_stems) return {"cited": len(cited_stems), "total": len(all_stems), "orphaned": orphaned} -def excerpt_fill_rate(result: dict) -> dict: +async def excerpt_fill_rate(item: dict, ctx: EvalContext) -> dict: # noqa: ARG001 """Fraction of evidence_index entries with a non-empty excerpt. - Returns {populated, total}. This is the signal behind older runs' low G3 - grounding: empty excerpts cannot ground anything. + Returns {populated, total}. """ - entries = result.get("evidence_index", []) + entries = item.get("evidence_index", []) populated = sum(1 for ev in entries if (ev.get("excerpt") or "").strip()) return {"populated": populated, "total": len(entries)} -# ── [E] EMBEDDING — needs embed_fn ─────────────────────────────────────────────── - +# ── [E] EMBEDDING — needs embedder ─────────────────────────────────────────────── -def semantic_recovery( - result: dict, - registry, - lexical_missed_ids: list[str], - embed_fn, - tau: float = 0.70, -) -> dict: - """Context-recall: recover G2 lexical misses by embedding similarity. - For each registry item flagged a LEXICAL MISS by G2, embed its - description+keywords and take the max cosine against the embeddings of every - finding description (and their cited excerpts). 
If max cosine >= tau the - item is counted semantically present (recovered). +async def semantic_recovery(item: dict, ctx: EvalContext, tau: float = 0.70) -> dict | None: + """Context-recall: recover lexical misses by embedding similarity. - recovered_recall = (lexical_hits + recovered) / scored_denominator, where - the scored denominator is the count of non-NC items scored by G2 (real - items, matching G2's recall denominator family). Returns the lexical recall, - the recovered recall, the recovered item list (with cosine), and tau. + Reads item["lexical_missed_ids"] (list of str). + Returns None if ctx.embedder is None. """ + if ctx.embedder is None: + return None + + lexical_missed_ids: list[str] = item.get("lexical_missed_ids", []) missed = set(lexical_missed_ids or []) - real_items = registry.real_items - scored_items = [i for i in real_items if i.tier != "L3"] - denom = len(scored_items) or 1 - lexical_hits = sum(1 for i in scored_items if i.id not in missed) - # Candidate texts the findings actually surfaced. - evidence_index = _evidence_index(result) + # Build the scored items from nc_items (non-NC = real items for recall) + # In the new EvalContext model, nc_items is a list of {"id": ..., "description": ...} + # We treat all item findings as the candidate surface; nc_items stay separate. + # Recompute as: all items scored = those not in nc_items ids. + # If there's no registry concept, we use findings as the denominator proxy. + # But keep the logic simple: just score the missed items against finding descriptions. + ev_idx = _evidence_index(item) candidate_texts: list[str] = [] - for f in result.get("findings", []): + for f in item.get("findings", []): desc = f.get("description", "") if desc: candidate_texts.append(desc) - candidate_texts.extend(_cited_excerpts(f, evidence_index)) + candidate_texts.extend(_cited_excerpts(f, ev_idx)) + + # missed_items: we only know their IDs; we need descriptions to embed. 
+ # In the new design, if no descriptions available, return minimal result. + all_findings = item.get("findings", []) + denom = max(len(all_findings), 1) + lexical_hits = sum(1 for f in all_findings if f.get("id") not in missed) + + missed_descs: list[tuple[str, str]] = [ + (f.get("id", ""), f.get("description", "")) + for f in all_findings + if f.get("id") in missed and f.get("description") + ] - missed_items = [i for i in scored_items if i.id in missed] - if not missed_items or not candidate_texts: + if not missed_descs or not candidate_texts: recovered_recall = lexical_hits / denom return { "lexical_recall": round(lexical_hits / denom, 4), @@ -240,15 +200,15 @@ def semantic_recovery( "scored_denominator": denom, } - item_texts = [f"{i.description} {' '.join(i.keywords)}".strip() for i in missed_items] - item_vecs = np.asarray(embed_fn(item_texts), dtype=np.float64) - cand_vecs = np.asarray(embed_fn(candidate_texts), dtype=np.float64) + item_texts = [desc for _fid, desc in missed_descs] + item_vecs = await ctx.embedder._embed_batch(item_texts) + cand_vecs = await ctx.embedder._embed_batch(candidate_texts) recovered: list[dict] = [] - for item, ivec in zip(missed_items, item_vecs): - best = max((cosine(ivec, cvec) for cvec in cand_vecs), default=0.0) + for (fid, _desc), ivec in zip(missed_descs, item_vecs, strict=False): + best = max((cosine_similarity(ivec, cvec) for cvec in cand_vecs), default=0.0) if best >= tau: - recovered.append({"id": item.id, "cosine": round(best, 4)}) + recovered.append({"id": fid, "cosine": round(best, 4)}) recovered_recall = (lexical_hits + len(recovered)) / denom return { @@ -263,16 +223,14 @@ def semantic_recovery( # ── [J] JUDGE — needs chat_fn(system, user) -> dict ────────────────────────────── -def faithfulness(result: dict, chat_fn, *, workers: int = 1) -> dict: +async def faithfulness(item: dict, ctx: EvalContext) -> dict: """Entailment: does each finding's cited evidence SUPPORT its claim? 
- Per (finding, cited-excerpts) pair, ask SUPPORTED / NOT_SUPPORTED. Returns - {supported, total, unsupported_ids}. Findings with no cited evidence are - counted as not-supported (nothing to entail against). + Returns {supported, total, unsupported_ids}. """ - evidence_index = _evidence_index(result) - findings = result.get("findings", []) - cited = [(f, _cited_excerpts(f, evidence_index)) for f in findings] + ev_idx = _evidence_index(item) + findings = item.get("findings", []) + cited = [(f, _cited_excerpts(f, ev_idx)) for f in findings] prompts = [ ( SYSTEM, @@ -284,7 +242,7 @@ def faithfulness(result: dict, chat_fn, *, workers: int = 1) -> dict: for f, excerpts in cited if excerpts ] - answers = iter(_map_chat(chat_fn, prompts, workers)) + answers = iter(await _gather_chat(ctx.client.chat_json, prompts)) supported = 0 unsupported_ids: list[str] = [] for f, excerpts in cited: @@ -300,18 +258,13 @@ def faithfulness(result: dict, chat_fn, *, workers: int = 1) -> dict: return {"supported": supported, "total": len(findings), "unsupported_ids": unsupported_ids} -def numeric_temporal_fidelity(result: dict, chat_fn, *, workers: int = 1) -> dict: +async def numeric_temporal_fidelity(item: dict, ctx: EvalContext) -> dict: """Flag numbers/dates asserted in a finding that do NOT match its evidence. - Closes the 45-days-vs-3-days gap. Returns {mismatches: [{finding_id, value, - source}], count}. + Returns {mismatches: [{finding_id, value, source}], count}. 
""" - evidence_index = _evidence_index(result) - scored = [ - (f, excerpts) - for f in result.get("findings", []) - if (excerpts := _cited_excerpts(f, evidence_index)) - ] + ev_idx = _evidence_index(item) + scored = [(f, excerpts) for f in item.get("findings", []) if (excerpts := _cited_excerpts(f, ev_idx))] prompts = [ ( SYSTEM, @@ -324,9 +277,9 @@ def numeric_temporal_fidelity(result: dict, chat_fn, *, workers: int = 1) -> dic ) for f, excerpts in scored ] - answers = _map_chat(chat_fn, prompts, workers) + answers = await _gather_chat(ctx.client.chat_json, prompts) mismatches: list[dict] = [] - for (f, _excerpts), answer in zip(scored, answers): + for (f, _excerpts), answer in zip(scored, answers, strict=False): for m in answer.get("mismatches", []) or []: mismatches.append( { @@ -338,20 +291,17 @@ def numeric_temporal_fidelity(result: dict, chat_fn, *, workers: int = 1) -> dic return {"mismatches": mismatches, "count": len(mismatches)} -def citation_relevance(result: dict, chat_fn, *, workers: int = 1) -> dict: +async def citation_relevance(item: dict, ctx: EvalContext) -> dict: """Context precision: fraction of cited passages actually relevant to the claim. - Per evidence_ref, ask yes/no relevance. precision = relevant / total_refs. - Returns {precision, relevant, total}; when total == 0 (no cited passages with - excerpts) precision is None — the kept ``total`` lets a reader tell "perfect" - apart from "nothing to score". + Returns {precision, relevant, total}. 
""" - evidence_index = _evidence_index(result) + ev_idx = _evidence_index(item) prompts: list[tuple[str, str]] = [] - for f in result.get("findings", []): + for f in item.get("findings", []): desc = f.get("description", "") for ref in f.get("evidence_refs", []): - ev = evidence_index.get(ref.get("evidence_id", "")) + ev = ev_idx.get(ref.get("evidence_id", "")) if not ev: continue excerpt = ev.get("excerpt") or "" @@ -366,7 +316,7 @@ def citation_relevance(result: dict, chat_fn, *, workers: int = 1) -> dict: f"CITED PASSAGE: {excerpt}", ) ) - answers = _map_chat(chat_fn, prompts, workers) + answers = await _gather_chat(ctx.client.chat_json, prompts) total = len(prompts) relevant = sum(1 for a in answers if str(a.get("relevant", "")).lower() == "yes") if not total: @@ -374,43 +324,40 @@ def citation_relevance(result: dict, chat_fn, *, workers: int = 1) -> dict: return {"precision": round(relevant / total, 4), "relevant": relevant, "total": total} -def nc_semantic_precision(result: dict, registry, chat_fn, *, workers: int = 1) -> dict: +async def nc_semantic_precision(item: dict, ctx: EvalContext) -> dict: """Count negative-control falsehoods the output asserts or endorses. - Per NC item, ask yes/no whether the output endorses the false statement. - Returns {asserted, total, asserted_ids}. The semantic counterpart to the - inert deterministic NC check. + Reads item["nc_items"] as list of {"id": ..., "description": ...} dicts. + Returns {asserted, total, asserted_ids}. 
""" - output_text = _output_text(result) - nc_items = registry.nc_items + output_text = _output_text(item) + nc_items: list[dict] = item.get("nc_items", []) prompts = [ ( SYSTEM, "Does the OUTPUT assert or endorse the following FALSE statement?\n" 'Reply with ONLY {"asserted": "yes" or "no"}.\n\n' - f"FALSE STATEMENT: {item.description}\n" + f"FALSE STATEMENT: {nc.get('description', '')}\n" f"OUTPUT:\n{output_text}", ) - for item in nc_items + for nc in nc_items ] - answers = _map_chat(chat_fn, prompts, workers) + answers = await _gather_chat(ctx.client.chat_json, prompts) asserted_ids = [ - item.id for item, a in zip(nc_items, answers) if str(a.get("asserted", "")).lower() == "yes" + nc.get("id", "?") + for nc, a in zip(nc_items, answers, strict=False) + if str(a.get("asserted", "")).lower() == "yes" ] return {"asserted": len(asserted_ids), "total": len(nc_items), "asserted_ids": asserted_ids} -def fabricated_entity(result: dict, chat_fn) -> dict: +async def fabricated_entity(item: dict, ctx: EvalContext) -> dict: """Count systems/orgs/metrics named in the output but absent from the corpus. - Returns {count, entities}. The corpus universe is the set of evidence - excerpts + locators. + Returns {count, entities}. 
""" - output_text = _output_text(result) - corpus = "\n".join( - f"{ev.get('locator', '')} :: {ev.get('excerpt', '')}" - for ev in result.get("evidence_index", []) - ) + output_text = _output_text(item) + corpus = "\n".join(f"{ev.get('locator', '')} :: {ev.get('excerpt', '')}" for ev in item.get("evidence_index", [])) user = ( "List any system, organization, or metric NAMED in the OUTPUT that does NOT " "appear anywhere in the CORPUS EVIDENCE.\n" @@ -418,54 +365,54 @@ def fabricated_entity(result: dict, chat_fn) -> dict: f"OUTPUT:\n{output_text}\n\n" f"CORPUS EVIDENCE:\n{corpus}" ) - entities = chat_fn(SYSTEM, user).get("fabricated", []) or [] + answer = await ctx.client.chat_json(SYSTEM, user) + entities = answer.get("fabricated", []) or [] return {"count": len(entities), "entities": list(entities)} -def contradiction(result: dict, chat_fn) -> dict: +async def contradiction(item: dict, ctx: EvalContext) -> dict: """Count internally contradictory finding pairs. - Returns {count, pairs}. pairs is the list of contradicting finding-id pairs - the judge reports. + Returns {count, pairs}. """ lines = [] - for f in result.get("findings", []): + for f in item.get("findings", []): lines.append(f"{f.get('id', '?')}: {f.get('title', '')} — {f.get('description', '')}") user = ( "Are any two of these FINDINGS mutually contradictory? List each contradicting pair.\n" - 'Reply with ONLY {"pairs": [["", ""], ...]}. Empty list if none.\n\n' - + "\n".join(lines) + 'Reply with ONLY {"pairs": [["", ""], ...]}. Empty list if none.\n\n' + "\n".join(lines) ) - pairs = chat_fn(SYSTEM, user).get("pairs", []) or [] + answer = await ctx.client.chat_json(SYSTEM, user) + pairs = answer.get("pairs", []) or [] return {"count": len(pairs), "pairs": [list(p) for p in pairs]} -def open_gap(result: dict, chat_fn) -> dict: +async def open_gap(item: dict, ctx: EvalContext) -> dict: """G-Eval open probe: the most important process issue the output missed. 
Returns {gap} — a free-text advisory narrative (no score). """ - pg = result.get("process_graph") or {} + pg = item.get("process_graph") or {} pg_summary = f"process_graph has {len(pg.get('processes', []))} processes" user = ( "Given this corpus scope and output, what important process issue did the " "output FAIL to surface?\n" 'Reply with ONLY {"gap": ""}.\n\n' - f"WORKSPACE SCOPE: {_workspace_intention(result)}\n" + f"WORKSPACE SCOPE: {_workspace_intention(item)}\n" f"{pg_summary}\n" - f"OUTPUT:\n{_output_text(result)}" + f"OUTPUT:\n{_output_text(item)}" ) - return {"gap": str(chat_fn(SYSTEM, user).get("gap", ""))} + answer = await ctx.client.chat_json(SYSTEM, user) + return {"gap": str(answer.get("gap", ""))} -def actionability(result: dict, chat_fn, *, workers: int = 1) -> dict: +async def actionability(item: dict, ctx: EvalContext) -> dict: """Average 0-1 rating of whether proposed actions are specific+quantified+linked. - Returns {score, rated}. Each action is rated against whether it is specific, - quantified, and linked to a finding. + Returns {score, rated}. 
""" - actions = result.get("proposed_actions", []) or [] - finding_ids = {f.get("id") for f in result.get("findings", [])} + actions = item.get("proposed_actions", []) or [] + finding_ids = {f.get("id") for f in item.get("findings", [])} prompts = [ ( SYSTEM, @@ -482,24 +429,24 @@ def actionability(result: dict, chat_fn, *, workers: int = 1) -> dict: ) for a in actions ] - answers = _map_chat(chat_fn, prompts, workers) + answers = await _gather_chat(ctx.client.chat_json, prompts) scores: list[float] = [] for a in answers: value = _coerce_float(a.get("score")) - if value is None: # malformed vote -> skip this action, keep the metric + if value is None: continue scores.append(value) score = round(sum(scores) / len(scores), 4) if scores else None return {"score": score, "rated": len(scores)} -def severity_calibration(result: dict, chat_fn, *, workers: int = 1) -> dict: +async def severity_calibration(item: dict, ctx: EvalContext) -> dict: """Per-finding judgment of whether stated severity matches the evidence. Returns {miscalibrated, total, verdicts: {finding_id: under|over|calibrated}}. 
""" - evidence_index = _evidence_index(result) - findings = result.get("findings", []) + ev_idx = _evidence_index(item) + findings = item.get("findings", []) prompts = [ ( SYSTEM, @@ -507,14 +454,14 @@ def severity_calibration(result: dict, chat_fn, *, workers: int = 1) -> dict: 'Reply with ONLY {"calibration": "under" or "over" or "calibrated"}.\n\n' f"STATED SEVERITY: {f.get('severity', '')} SCORE: {f.get('score', '')}\n" f"FINDING: {f.get('description', '')}\n" - f"CITED EVIDENCE: {' || '.join(_cited_excerpts(f, evidence_index))}", + f"CITED EVIDENCE: {' || '.join(_cited_excerpts(f, ev_idx))}", ) for f in findings ] - answers = _map_chat(chat_fn, prompts, workers) + answers = await _gather_chat(ctx.client.chat_json, prompts) verdicts: dict[str, str] = {} miscalibrated = 0 - for f, a in zip(findings, answers): + for f, a in zip(findings, answers, strict=False): verdict = str(a.get("calibration", "calibrated")).lower() verdicts[f.get("id", "?")] = verdict if verdict in ("under", "over"): @@ -522,7 +469,7 @@ def severity_calibration(result: dict, chat_fn, *, workers: int = 1) -> dict: return {"miscalibrated": miscalibrated, "total": len(findings), "verdicts": verdicts} -def answer_relevancy(result: dict, chat_fn) -> dict: +async def answer_relevancy(item: dict, ctx: EvalContext) -> dict: """RAGAS-style: does the output address the stated workspace intention? Returns {score} in [0,1], or {"score": None} when the vote fails to coerce. 
@@ -530,38 +477,27 @@ def answer_relevancy(result: dict, chat_fn) -> dict: user = ( "Does the OUTPUT address the stated WORKSPACE INTENTION (on-topic, responsive)?\n" 'Reply with ONLY {"score": }.\n\n' - f"WORKSPACE INTENTION: {_workspace_intention(result)}\n" - f"OUTPUT:\n{_output_text(result)}" + f"WORKSPACE INTENTION: {_workspace_intention(item)}\n" + f"OUTPUT:\n{_output_text(item)}" ) - return {"score": _coerce_float(chat_fn(SYSTEM, user).get("score"))} + answer = await ctx.client.chat_json(SYSTEM, user) + return {"score": _coerce_float(answer.get("score"))} -def surface_deduplication(result: dict, chat_fn, *, workers: int = 1) -> dict: +async def surface_deduplication(item: dict, ctx: EvalContext) -> dict: """Fraction of near-duplicate process-graph node pairs that are genuinely distinct. - Scoping rules: - - Processes: all pairs compared (cross-process is valid at this level). - - Activities and decisions: ONLY within the same parent process. The same - activity name appearing in two different processes is a legitimate repetition - (e.g. "Approve Request" in both Loan and Credit-Card flows), not a duplicate. - - For each surface, the top-10 most name-similar pairs (token-Jaccard >= 0.30) - are selected. For activities/decisions the parent process name is included in - the judge prompt so it can reason about intra-process context. 30 pairs total. - Returns {distinct, redundant, total, distinct_rate, redundant_pairs}. 
""" - pg = result.get("process_graph", {}) + pg = item.get("process_graph", {}) procs = pg.get("processes", []) def _toks(node: dict) -> frozenset[str]: return frozenset(node.get("name", "").lower().split()) - PER_SURFACE_CAP = 10 - # candidates: (surface, node_a, node_b, parent_process_name) + per_surface_cap = 10 candidates: list[tuple[str, dict, dict, str]] = [] - # Processes: compare all pairs if len(procs) >= 2: pairs: list[tuple[float, dict, dict]] = [] for i in range(len(procs)): @@ -574,10 +510,9 @@ def _toks(node: dict) -> frozenset[str]: if jac >= 0.30: pairs.append((jac, procs[i], procs[j])) pairs.sort(key=lambda x: x[0], reverse=True) - for _jac, a, b in pairs[:PER_SURFACE_CAP]: + for _jac, a, b in pairs[:per_surface_cap]: candidates.append(("process", a, b, "")) - # Activities and decisions: within the same parent process only for surface_key, attr in (("activity", "activities"), ("decision", "decisions")): all_pairs: list[tuple[float, dict, dict, str]] = [] for proc in procs: @@ -595,7 +530,7 @@ def _toks(node: dict) -> frozenset[str]: if jac >= 0.30: all_pairs.append((jac, nodes[i], nodes[j], proc_name)) all_pairs.sort(key=lambda x: x[0], reverse=True) - for _jac, a, b, proc_name in all_pairs[:PER_SURFACE_CAP]: + for _jac, a, b, proc_name in all_pairs[:per_surface_cap]: candidates.append((surface_key, a, b, proc_name)) if not candidates: @@ -603,34 +538,38 @@ def _toks(node: dict) -> frozenset[str]: prompts = [] for surface, a, b, parent_proc in candidates: - ctx = f"\nPARENT PROCESS: {parent_proc}\n" if parent_proc else "" - prompts.append(( - SYSTEM, - f"Are these two {surface} nodes genuinely DISTINCT process concepts, or is one a " - f"duplicate / sub-case / restatement of the other?\n" - f"{ctx}" - 'Reply with ONLY {"verdict": "DISTINCT" or "DUPLICATE", "reason": ""}.\n\n' - f"{surface.upper()} A: {a.get('name', '')} — {a.get('description', '')}\n" - f"{surface.upper()} B: {b.get('name', '')} — {b.get('description', '')}", - )) + ctx_line = 
f"\nPARENT PROCESS: {parent_proc}\n" if parent_proc else "" + prompts.append( + ( + SYSTEM, + f"Are these two {surface} nodes genuinely DISTINCT process concepts, or is one a " + f"duplicate / sub-case / restatement of the other?\n" + f"{ctx_line}" + 'Reply with ONLY {"verdict": "DISTINCT" or "DUPLICATE", "reason": ""}.\n\n' + f"{surface.upper()} A: {a.get('name', '')} — {a.get('description', '')}\n" + f"{surface.upper()} B: {b.get('name', '')} — {b.get('description', '')}", + ) + ) - answers = _map_chat(chat_fn, prompts, workers) + answers = await _gather_chat(ctx.client.chat_json, prompts) distinct = 0 redundant = 0 redundant_pairs: list[dict] = [] - for (surface, a, b, _parent), answer in zip(candidates, answers): + for (surface, a, b, _parent), answer in zip(candidates, answers, strict=False): verdict = str(answer.get("verdict", "")).upper() if verdict == "DISTINCT": distinct += 1 else: redundant += 1 - redundant_pairs.append({ - "surface": surface, - "a": a.get("name", ""), - "b": b.get("name", ""), - "reason": str(answer.get("reason", "")), - }) + redundant_pairs.append( + { + "surface": surface, + "a": a.get("name", ""), + "b": b.get("name", ""), + "reason": str(answer.get("reason", "")), + } + ) total = distinct + redundant return { @@ -642,13 +581,15 @@ def _toks(node: dict) -> frozenset[str]: } -def comparative_vs_champion(result: dict, champion_result: dict, chat_fn) -> dict: +async def comparative_vs_champion(item: dict, ctx: EvalContext) -> dict | None: """Pairwise MT-Bench-style review of candidate vs champion (advisory only). - Returns {candidate, champion, more_consistent} where candidate/champion are - 1-5 ratings on Coverage/Quality/Evidence/Actionability/Regression. Never - feeds G5. + Returns None if item["champion"] is not present. + Returns {candidate, champion, more_consistent}. 
""" + champion = item.get("champion") + if champion is None: + return None user = ( "Score the CANDIDATE and the CHAMPION outputs on five axes (1-5 each): " "Coverage, Quality, Evidence, Actionability, Regression. Then say which is " @@ -657,10 +598,10 @@ def comparative_vs_champion(result: dict, champion_result: dict, chat_fn) -> dic '{"candidate": {"coverage": x, "quality": x, "evidence": x, "actionability": x, "regression": x}, ' '"champion": {"coverage": x, "quality": x, "evidence": x, "actionability": x, "regression": x}, ' '"more_consistent": "candidate" or "champion"}.\n\n' - f"CANDIDATE:\n{_output_text(result)}\n\n" - f"CHAMPION:\n{_output_text(champion_result)}" + f"CANDIDATE:\n{_output_text(item)}\n\n" + f"CHAMPION:\n{_output_text(champion)}" ) - out = chat_fn(SYSTEM, user) + out = await ctx.client.chat_json(SYSTEM, user) return { "candidate": out.get("candidate", {}), "champion": out.get("champion", {}), @@ -668,18 +609,175 @@ def comparative_vs_champion(result: dict, champion_result: dict, chat_fn) -> dic } -# ── median-of-N for [J] metrics ────────────────────────────────────────────────── +# ── flycanon custom metrics ─────────────────────────────────────────────────────── -def _numeric_leaves(d: dict) -> dict[tuple, float]: - """Flatten a metric dict to {path: float} over its FLOAT score-leaves only. +async def _rag_score_once(item: dict, ctx: EvalContext) -> dict | None: + """Single RAG scoring call: returns {"contains_answer": float, "addresses_question": float}.""" + question = item.get("question", "") + reference = item.get("reference", "") + answer = item.get("answer", "") + if not question or not answer: + return None + user = f"QUESTION: {question}\nREFERENCE: {reference}\nANSWER: {answer}\n\n{RUBRIC}" + result = await ctx.client.chat_json(SYSTEM_RAG, user) + return result + + +async def contains_answer(item: dict, ctx: EvalContext) -> float | None: + """Flycanon: does the answer contain the correct information from the reference? 
- Median applies to continuous scores only. A leaf counts as numeric-for-median - only when its value is a ``float``; ``bool`` and ``int`` leaves (counts, - denominators, 1-5 axes, and other bookkeeping) are deliberately skipped and - taken from the first run unchanged — this avoids fractional counts (rated=0.5) - and count/len(list) disagreement under runs>1 with an even N. + Runs ctx.runs times and returns the median score. + Returns None if the item lacks question/answer. + """ + scores: list[float] = [] + for _ in range(max(1, ctx.runs)): + result = await _rag_score_once(item, ctx) + if result is None: + return None + val = _coerce_float(result.get("contains_answer")) + if val is not None: + scores.append(val) + if not scores: + return None + return round(statistics.median(scores), 4) + + +async def addresses_question(item: dict, ctx: EvalContext) -> float | None: + """Flycanon: does the answer directly address what the question is asking? + + Runs ctx.runs times and returns the median score. + Returns None if the item lacks question/answer. """ + scores: list[float] = [] + for _ in range(max(1, ctx.runs)): + result = await _rag_score_once(item, ctx) + if result is None: + return None + val = _coerce_float(result.get("addresses_question")) + if val is not None: + scores.append(val) + if not scores: + return None + return round(statistics.median(scores), 4) + + +# ── RAGAS metrics ───────────────────────────────────────────────────────────────── +# ragas/langchain imports are inline inside _sync() since ragas is optional. 
+ + +def _make_ragas_sample(item: dict): + """Build a RAGAS SingleTurnSample from an item dict (ragas import inline).""" + from ragas import SingleTurnSample # type: ignore[import] # noqa: PLC0415 + + return SingleTurnSample( + user_input=item.get("question", ""), + response=item.get("answer", ""), + reference=item.get("reference", ""), + retrieved_contexts=item.get("contexts", []), + ) + + +def _make_ragas_llm(ctx: EvalContext): + """Build a LangChain LLM wrapper for RAGAS (langchain import inline).""" + provider, model = ctx.client.provider, ctx.client.model + if provider == "anthropic": + from langchain_anthropic import ChatAnthropic # type: ignore[import] # noqa: PLC0415 + + api_key = os.environ.get("ANTHROPIC_API_KEY", "") + return ChatAnthropic(model=model, api_key=api_key, temperature=0.0) + if provider in ("openai", "azure"): + from langchain_openai import ChatOpenAI # type: ignore[import] # noqa: PLC0415 + + api_key = os.environ.get("OPENAI_API_KEY", "") + return ChatOpenAI(model=model, api_key=api_key, temperature=0.0) + if provider == "ollama": + from langchain_ollama import ChatOllama # type: ignore[import] # noqa: PLC0415 + + return ChatOllama(model=model, temperature=0.0) + raise ValueError(f"RAGAS: unsupported provider {provider!r}") + + +def _make_ragas_embeddings(ctx: EvalContext): + """Build LangChain embeddings for RAGAS (langchain import inline).""" + if ctx.embedder is not None: + from langchain_ollama import OllamaEmbeddings # type: ignore[import] # noqa: PLC0415 + + return OllamaEmbeddings(model=ctx.embedder._model) + from langchain_anthropic import AnthropicEmbeddings # type: ignore[import] # noqa: PLC0415 + + return AnthropicEmbeddings() + + +async def _ragas_score(metric_name: str, item: dict, ctx: EvalContext) -> float | None: + """Run a single named RAGAS metric and return its float score (or None).""" + + def _sync(): + from ragas import evaluate # type: ignore[import] # noqa: PLC0415 + from ragas.dataset_schema import EvaluationDataset 
# type: ignore[import] # noqa: PLC0415 + from ragas.metrics import ( # type: ignore[import] # noqa: PLC0415 + AnswerCorrectness, + AnswerRelevancy, + ContextPrecision, + ContextRecall, + Faithfulness, + ) + + _metrics_map = { + "answer_correctness": AnswerCorrectness, + "answer_relevancy_ragas": AnswerRelevancy, + "ragas_faithfulness": Faithfulness, + "context_recall": ContextRecall, + "context_precision": ContextPrecision, + } + metric_cls = _metrics_map.get(metric_name) + if metric_cls is None: + return None + + llm = _make_ragas_llm(ctx) + embeddings = _make_ragas_embeddings(ctx) + metric = metric_cls(llm=llm, embeddings=embeddings) + sample = _make_ragas_sample(item) + dataset = EvaluationDataset(samples=[sample]) + result = evaluate(dataset=dataset, metrics=[metric]) + df = result.to_pandas() + col = df.columns[df.columns.str.contains(metric_name.replace("_ragas", ""), case=False)] + if col.empty: + return None + val = df[col[0]].iloc[0] + if val is None or (isinstance(val, float) and math.isnan(val)): + return None + return round(float(val), 4) + + loop = asyncio.get_event_loop() + return await loop.run_in_executor(None, _sync) + + +async def answer_correctness(item: dict, ctx: EvalContext) -> float | None: + """RAGAS answer correctness (semantic F1 against reference).""" + return await _ragas_score("answer_correctness", item, ctx) + + +async def ragas_faithfulness(item: dict, ctx: EvalContext) -> float | None: + """RAGAS faithfulness (answer grounded in retrieved contexts).""" + return await _ragas_score("ragas_faithfulness", item, ctx) + + +async def context_recall(item: dict, ctx: EvalContext) -> float | None: + """RAGAS context recall (reference coverage by retrieved contexts).""" + return await _ragas_score("context_recall", item, ctx) + + +async def context_precision(item: dict, ctx: EvalContext) -> float | None: + """RAGAS context precision (retrieved contexts relevant to the question).""" + return await _ragas_score("context_precision", item, ctx) + + 
+# ── median-of-N helpers ────────────────────────────────────────────────────────── + + +def _numeric_leaves(d: dict) -> dict[tuple, float]: + """Flatten a metric dict to {path: float} over its FLOAT score-leaves only.""" out: dict[tuple, float] = {} def walk(node, path: tuple) -> None: @@ -701,11 +799,7 @@ def _set_leaf(d: dict, path: tuple, value: float) -> None: def _median_runs(samples: list[dict]) -> dict: - """Median across N metric-dicts: FLOAT score-leaves -> per-key median; rest = first. - - Only continuous float scores are medianed; integer bookkeeping (counts, - denominators, 1-5 axes) and all non-numeric fields are taken from the first run. - """ + """Median across N metric-dicts: FLOAT score-leaves -> per-key median; rest = first.""" samples = [s for s in samples if isinstance(s, dict)] if not samples: return {} @@ -728,102 +822,69 @@ def _median_runs(samples: list[dict]) -> dict: # ── orchestrator ───────────────────────────────────────────────────────────────── -def run_judge( - result: dict, - registry, +async def run_judge( + item: dict, + ctx: EvalContext, *, - judge_model: str, - runs: int = 1, - concurrency: int = 1, pipeline_model: str = "", - champion_result: dict | None = None, - chat_fn=None, - embed_fn=None, - tau: float = 0.70, - lexical_missed_ids: list[str] | None = None, ) -> AdvisoryReport: - """Run the G4 advisory gate, best-effort. NEVER raises; NEVER affects verdict. - - If chat_fn / embed_fn are None, real ones are built from JudgeClient / - OllamaEmbedder (tests inject stubs instead). Each [J] metric runs `runs` - times and the median of its numeric scores is kept. Every metric is wrapped - in try/except: a failure appends to report.errors and the run continues. - - ``concurrency`` (opt-in, default 1) bounds the per-item [J] metrics' internal - fan-out: 1 keeps the sequential per-item loops; >=2 runs each metric's items - across a thread pool (order preserved). 
The median-of-N ``runs`` loop stays - sequential and the single-call metrics are unaffected. The result is - byte-for-byte identical at concurrency=1. + """Run all metrics concurrently and return an AdvisoryReport. - Returns an AdvisoryReport (a plain dict carrier) with calibrated=False and - same_provider_caveat = same_provider(pipeline_model, judge_model). + Best-effort: never raises. Failing metrics append to report.errors. """ - if chat_fn is None: - client = JudgeClient(judge_model) - chat_fn = client.chat_json - if embed_fn is None: - embed_fn = OllamaEmbedder().embed - report = AdvisoryReport( - judge_model=judge_model, - same_provider_caveat=same_provider(pipeline_model, judge_model), + judge_model=ctx.client.model_spec, + same_provider_caveat=same_provider(pipeline_model, ctx.client.model_spec), calibrated=False, - runs=runs, + runs=ctx.runs, ) - def _run_det(name: str, fn) -> None: - try: - report.metrics[name] = fn() - except Exception as exc: # best-effort: never raise - report.errors.append(f"{name}: {type(exc).__name__}: {exc}") + # [D] deterministic (no LLM) + det_metrics: list[tuple[str, Metric]] = [ + ("source_coverage", source_coverage), + ("excerpt_fill_rate", excerpt_fill_rate), + ] + # [E] embedding + emb_metrics: list[tuple[str, Metric]] = [ + ("semantic_recovery", semantic_recovery), + ] + # [J] judge metrics (median-of-runs handled externally for single-call ones) + judge_metrics: list[tuple[str, Metric]] = [ + ("faithfulness", faithfulness), + ("numeric_temporal_fidelity", numeric_temporal_fidelity), + ("citation_relevance", citation_relevance), + ("nc_semantic_precision", nc_semantic_precision), + ("fabricated_entity", fabricated_entity), + ("contradiction", contradiction), + ("open_gap", open_gap), + ("actionability", actionability), + ("severity_calibration", severity_calibration), + ("answer_relevancy", answer_relevancy), + ("surface_deduplication", surface_deduplication), + ("comparative_vs_champion", comparative_vs_champion), + ] + # 
flycanon custom + flycanon_metrics: list[tuple[str, Metric]] = [ + ("contains_answer", contains_answer), + ("addresses_question", addresses_question), + ] + # RAGAS + ragas_metrics: list[tuple[str, Metric]] = [ + ("answer_correctness", answer_correctness), + ("ragas_faithfulness", ragas_faithfulness), + ("context_recall", context_recall), + ("context_precision", context_precision), + ] - def _run_judge_metric(name: str, fn) -> None: + all_metrics = det_metrics + emb_metrics + judge_metrics + flycanon_metrics + ragas_metrics + + async def _run_one(name: str, fn: Metric) -> None: try: - samples = [fn() for _ in range(max(1, runs))] - report.metrics[name] = _median_runs(samples) - except Exception as exc: # best-effort: never raise + result = await fn(item, ctx) + if result is not None: + report.metrics[name] = result + except Exception as exc: report.errors.append(f"{name}: {type(exc).__name__}: {exc}") - # [D] deterministic — always computed, no LLM. - _run_det("source_coverage", lambda: source_coverage(result)) - _run_det("excerpt_fill_rate", lambda: excerpt_fill_rate(result)) - - # [E] embedding — context recall. - _run_det( - "semantic_recovery", - lambda: semantic_recovery(result, registry, lexical_missed_ids or [], embed_fn, tau=tau), - ) - - # [J] judge — median-of-N. Per-item metrics fan out at workers=concurrency. 
- _run_judge_metric("faithfulness", lambda: faithfulness(result, chat_fn, workers=concurrency)) - _run_judge_metric( - "numeric_temporal_fidelity", - lambda: numeric_temporal_fidelity(result, chat_fn, workers=concurrency), - ) - _run_judge_metric( - "citation_relevance", lambda: citation_relevance(result, chat_fn, workers=concurrency) - ) - _run_judge_metric( - "nc_semantic_precision", - lambda: nc_semantic_precision(result, registry, chat_fn, workers=concurrency), - ) - _run_judge_metric("fabricated_entity", lambda: fabricated_entity(result, chat_fn)) - _run_judge_metric("contradiction", lambda: contradiction(result, chat_fn)) - _run_judge_metric("open_gap", lambda: open_gap(result, chat_fn)) - _run_judge_metric("actionability", lambda: actionability(result, chat_fn, workers=concurrency)) - _run_judge_metric( - "severity_calibration", - lambda: severity_calibration(result, chat_fn, workers=concurrency), - ) - _run_judge_metric("answer_relevancy", lambda: answer_relevancy(result, chat_fn)) - _run_judge_metric( - "surface_deduplication", - lambda: surface_deduplication(result, chat_fn, workers=concurrency), - ) - if champion_result is not None: - _run_judge_metric( - "comparative_vs_champion", - lambda: comparative_vs_champion(result, champion_result, chat_fn), - ) - + await asyncio.gather(*[_run_one(name, fn) for name, fn in all_metrics]) return report From 7799185bf69777b8f680fe04667129e42fddddf1 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:24:58 +0200 Subject: [PATCH 31/48] feat(evaluation): slim __init__.py to 3-file exports --- .../evaluation/__init__.py | 179 +++++++++--------- 1 file changed, 90 insertions(+), 89 deletions(-) diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py index ad01980c..c2005e7a 100644 --- a/fireflyframework_agentic/evaluation/__init__.py +++ b/fireflyframework_agentic/evaluation/__init__.py @@ -1,89 +1,90 @@ -# Copyright 2026 Firefly Software 
Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Evaluation subpackage -- gate-based quality gates, LLM-as-judge advisory, champion/challenger tracking, and retrieval metrics. - -Gate pipeline (flags, not vetoes): - G1 -- Structural & Safe (schema + PII + empty-registry guard) - G2 -- Must-finds & negative controls (recall + NC precision) - G3 -- Evidence (grounding / token-anchoring) - G4 -- LLM-as-a-Judge (advisory, opt-in, never decides promotion) - G5 -- No-regression / promotion (champion/challenger comparison) - -Retrieval metrics: - Precision@k, Recall@k, MRR, NDCG -- computed over ranked retrieval results. - -Champion tracking: - Persists the best-known run record so that promotion decisions can be made - against a stable baseline rather than the most recent run. 
-""" - -from importlib.metadata import PackageNotFoundError, version - -from fireflyframework_agentic.evaluation.corpus import EMPTY, FABRICATED, SOURCE_UNKNOWN, VERIFIED, corpus_sha256, load_corpus, verify_evidence_index -from fireflyframework_agentic.evaluation.gates import GateResult, Verdict, g2_recall_precision, run_gates -from fireflyframework_agentic.evaluation.scorecard import render_scorecard, verdict, VERDICT_PROMOTE, VERDICT_HOLD -from fireflyframework_agentic.evaluation.champion import ChampionRecord, invalidate_champion, load_champion, save_champion -from fireflyframework_agentic.evaluation.judge import AdvisoryReport, run_judge -from fireflyframework_agentic.evaluation.judge_client import JudgeClient, OllamaEmbedder, build_embedder, cosine -from fireflyframework_agentic.evaluation.matcher import anchored, matches, source_stem, tokens -from fireflyframework_agentic.evaluation.registry import Registry, RegistryItem, load_registry, registry_sha256 -from fireflyframework_agentic.lab.retrieval_metrics import RetrieverMetrics, compute_retrieval_metrics -from fireflyframework_agentic.evaluation.stats import aa_band, aggregate_grounding, left_skew_flag - -try: - __version__ = version("fireflyframework-agentic") -except PackageNotFoundError: - __version__ = "0.0.0+dev" - -__all__ = [ - "EMPTY", - "FABRICATED", - "SOURCE_UNKNOWN", - "VERIFIED", - "corpus_sha256", - "load_corpus", - "verify_evidence_index", - "GateResult", - "Verdict", - "g2_recall_precision", - "run_gates", - "render_scorecard", - "verdict", - "VERDICT_PROMOTE", - "VERDICT_HOLD", - "ChampionRecord", - "load_champion", - "save_champion", - "invalidate_champion", - "AdvisoryReport", - "run_judge", - "JudgeClient", - "OllamaEmbedder", - "build_embedder", - "cosine", - "Registry", - "RegistryItem", - "load_registry", - "registry_sha256", - "RetrieverMetrics", - "compute_retrieval_metrics", - "anchored", - "matches", - "source_stem", - "tokens", - "aa_band", - "aggregate_grounding", - 
"left_skew_flag", -] +from fireflyframework_agentic.evaluation.judge import ( + AdvisoryReport as AdvisoryReport, +) +from fireflyframework_agentic.evaluation.judge import ( + EvalContext as EvalContext, +) +from fireflyframework_agentic.evaluation.judge import ( + Metric as Metric, +) +from fireflyframework_agentic.evaluation.judge import ( + actionability as actionability, +) +from fireflyframework_agentic.evaluation.judge import ( + addresses_question as addresses_question, +) +from fireflyframework_agentic.evaluation.judge import ( + answer_correctness as answer_correctness, +) +from fireflyframework_agentic.evaluation.judge import ( + answer_relevancy as answer_relevancy, +) +from fireflyframework_agentic.evaluation.judge import ( + citation_relevance as citation_relevance, +) +from fireflyframework_agentic.evaluation.judge import ( + comparative_vs_champion as comparative_vs_champion, +) +from fireflyframework_agentic.evaluation.judge import ( + contains_answer as contains_answer, +) +from fireflyframework_agentic.evaluation.judge import ( + context_precision as context_precision, +) +from fireflyframework_agentic.evaluation.judge import ( + context_recall as context_recall, +) +from fireflyframework_agentic.evaluation.judge import ( + contradiction as contradiction, +) +from fireflyframework_agentic.evaluation.judge import ( + excerpt_fill_rate as excerpt_fill_rate, +) +from fireflyframework_agentic.evaluation.judge import ( + fabricated_entity as fabricated_entity, +) +from fireflyframework_agentic.evaluation.judge import ( + faithfulness as faithfulness, +) +from fireflyframework_agentic.evaluation.judge import ( + nc_semantic_precision as nc_semantic_precision, +) +from fireflyframework_agentic.evaluation.judge import ( + numeric_temporal_fidelity as numeric_temporal_fidelity, +) +from fireflyframework_agentic.evaluation.judge import ( + open_gap as open_gap, +) +from fireflyframework_agentic.evaluation.judge import ( + ragas_faithfulness as 
ragas_faithfulness, +) +from fireflyframework_agentic.evaluation.judge import ( + run_judge as run_judge, +) +from fireflyframework_agentic.evaluation.judge import ( + semantic_recovery as semantic_recovery, +) +from fireflyframework_agentic.evaluation.judge import ( + severity_calibration as severity_calibration, +) +from fireflyframework_agentic.evaluation.judge import ( + source_coverage as source_coverage, +) +from fireflyframework_agentic.evaluation.judge import ( + surface_deduplication as surface_deduplication, +) +from fireflyframework_agentic.evaluation.judge_client import ( + JudgeClient as JudgeClient, +) +from fireflyframework_agentic.evaluation.judge_client import ( + parse_model as parse_model, +) +from fireflyframework_agentic.evaluation.judge_client import ( + same_provider as same_provider, +) +from fireflyframework_agentic.lab.retrieval_metrics import ( + RetrieverMetrics as RetrieverMetrics, +) +from fireflyframework_agentic.lab.retrieval_metrics import ( + compute_retrieval_metrics as compute_retrieval_metrics, +) From 9526f43315f56324aba3173b75ceebd87d9c3d71 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:25:15 +0200 Subject: [PATCH 32/48] =?UTF-8?q?chore(evaluation):=20update=20pyproject.t?= =?UTF-8?q?oml=20=E2=80=94=20drop=20scipy,=20add=20ragas=20deps,=20remove?= =?UTF-8?q?=20flyeval=20entrypoint?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index bb74201f..72a04fad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -120,8 +120,10 @@ all = [ 
"fireflyframework-agentic[postgres,mongodb,security,embeddings,openai-embeddings,cohere-embeddings,google-embeddings,mistral-embeddings,voyage-embeddings,azure-embeddings,bedrock-embeddings,ollama-embeddings,vectorstores-chroma,vectorstores-pinecone,vectorstores-qdrant,vectorstores-pgvector,vectorstores-sqlite-vec,watch,binary]", ] evaluation = [ - "scipy>=1.11", "numpy>=1.26.0", + "ragas>=0.2", + "langchain-anthropic>=0.3", + "langchain-ollama>=0.3", ] dev = [ "pytest>=8.3.0", @@ -136,9 +138,6 @@ dev = [ "pre-commit>=3.8.0", ] -[project.scripts] -flyeval = "fireflyframework_agentic.evaluation.cli:main" - [project.urls] Homepage = "https://fireflyframework.org/" Documentation = "https://github.com/fireflyframework/fireflyframework-agentic/tree/main/docs" From d56755228af64f4f5d0d24e5edbf5426853e6929 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:26:17 +0200 Subject: [PATCH 33/48] test(evaluation): add unit tests for judge.py metrics --- tests/unit/evaluation/test_judge.py | 248 ++++++++++++++++++++++++++++ 1 file changed, 248 insertions(+) create mode 100644 tests/unit/evaluation/test_judge.py diff --git a/tests/unit/evaluation/test_judge.py b/tests/unit/evaluation/test_judge.py new file mode 100644 index 00000000..7f27c125 --- /dev/null +++ b/tests/unit/evaluation/test_judge.py @@ -0,0 +1,248 @@ +from unittest.mock import MagicMock + +import pytest + +from fireflyframework_agentic.evaluation.judge import ( + EvalContext, + addresses_question, + contains_answer, + excerpt_fill_rate, + faithfulness, + source_coverage, +) +from fireflyframework_agentic.evaluation.judge_client import JudgeClient + + +def make_ctx(responses: list[dict]) -> EvalContext: + client = MagicMock(spec=JudgeClient) + client.model_spec = "anthropic:claude-sonnet-4-6" + client.provider = "anthropic" + client.model = "claude-sonnet-4-6" + call_iter = iter(responses) + + async def mock_chat_json(system, user, max_tokens=1024): + return next(call_iter) + + client.chat_json = 
mock_chat_json + return EvalContext(client=client, runs=1) + + +# ── contains_answer ────────────────────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_contains_answer_present(): + ctx = make_ctx([{"contains_answer": 1.0, "addresses_question": 1.0}]) + item = {"question": "Q", "reference": "R", "answer": "A"} + score = await contains_answer(item, ctx) + assert score == 1.0 + + +@pytest.mark.asyncio +async def test_contains_answer_absent(): + ctx = make_ctx([{"contains_answer": 0.0, "addresses_question": 0.5}]) + item = {"question": "Q", "reference": "R", "answer": "wrong"} + score = await contains_answer(item, ctx) + assert score == 0.0 + + +@pytest.mark.asyncio +async def test_contains_answer_partial(): + ctx = make_ctx([{"contains_answer": 0.5, "addresses_question": 0.8}]) + item = {"question": "Q", "reference": "R", "answer": "partial"} + score = await contains_answer(item, ctx) + assert score == 0.5 + + +@pytest.mark.asyncio +async def test_contains_answer_missing_question_returns_none(): + ctx = make_ctx([]) + item = {"reference": "R", "answer": "A"} + score = await contains_answer(item, ctx) + assert score is None + + +# ── addresses_question ─────────────────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_addresses_question_yes(): + ctx = make_ctx([{"contains_answer": 0.5, "addresses_question": 1.0}]) + item = {"question": "Q", "reference": "R", "answer": "A"} + score = await addresses_question(item, ctx) + assert score == 1.0 + + +@pytest.mark.asyncio +async def test_addresses_question_no(): + ctx = make_ctx([{"contains_answer": 0.0, "addresses_question": 0.0}]) + item = {"question": "Q", "reference": "R", "answer": "irrelevant"} + score = await addresses_question(item, ctx) + assert score == 0.0 + + +@pytest.mark.asyncio +async def test_addresses_question_missing_answer_returns_none(): + ctx = make_ctx([]) + item = {"question": "Q", "reference": "R"} + score = await 
addresses_question(item, ctx) + assert score is None + + +# ── faithfulness ───────────────────────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_faithfulness_all_supported(): + # One finding with cited evidence, judge says SUPPORTED. + ctx = make_ctx([{"verdict": "SUPPORTED", "reason": "matches"}]) + item = { + "findings": [ + { + "id": "F1", + "description": "The process takes 3 days.", + "evidence_refs": [{"evidence_id": "E1"}], + } + ], + "evidence_index": [{"id": "E1", "locator": "doc.pdf#1", "excerpt": "The process takes 3 days as documented."}], + } + result = await faithfulness(item, ctx) + assert result["supported"] == 1 + assert result["total"] == 1 + assert result["unsupported_ids"] == [] + + +@pytest.mark.asyncio +async def test_faithfulness_not_supported(): + ctx = make_ctx([{"verdict": "NOT_SUPPORTED", "reason": "contradicts"}]) + item = { + "findings": [ + { + "id": "F1", + "description": "The process takes 45 days.", + "evidence_refs": [{"evidence_id": "E1"}], + } + ], + "evidence_index": [{"id": "E1", "locator": "doc.pdf#1", "excerpt": "The process takes 3 days."}], + } + result = await faithfulness(item, ctx) + assert result["supported"] == 0 + assert result["total"] == 1 + assert "F1" in result["unsupported_ids"] + + +@pytest.mark.asyncio +async def test_faithfulness_no_cited_evidence(): + # Finding with no evidence_refs -> counted as unsupported without LLM call. 
+ ctx = make_ctx([]) + item = { + "findings": [{"id": "F1", "description": "Something.", "evidence_refs": []}], + "evidence_index": [], + } + result = await faithfulness(item, ctx) + assert result["supported"] == 0 + assert result["total"] == 1 + assert "F1" in result["unsupported_ids"] + + +# ── source_coverage ─────────────────────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_source_coverage_all_cited(): + ctx = make_ctx([]) + item = { + "findings": [ + { + "id": "F1", + "description": "X", + "evidence_refs": [{"evidence_id": "E1"}], + } + ], + "evidence_index": [{"id": "E1", "locator": "doc.pdf#section1", "excerpt": "text"}], + } + result = await source_coverage(item, ctx) + assert result["cited"] == 1 + assert result["total"] == 1 + assert result["orphaned"] == [] + + +@pytest.mark.asyncio +async def test_source_coverage_orphaned(): + ctx = make_ctx([]) + item = { + "findings": [{"id": "F1", "description": "X", "evidence_refs": []}], + "evidence_index": [ + {"id": "E1", "locator": "doc1.pdf#p1", "excerpt": "text"}, + {"id": "E2", "locator": "doc2.pdf#p2", "excerpt": "text2"}, + ], + } + result = await source_coverage(item, ctx) + assert result["cited"] == 0 + assert result["total"] == 2 + assert len(result["orphaned"]) == 2 + + +@pytest.mark.asyncio +async def test_source_coverage_stem_dedup(): + # Two evidence items from the same file (different fragments) -> 1 source stem. + ctx = make_ctx([]) + item = { + "findings": [ + { + "id": "F1", + "description": "X", + "evidence_refs": [{"evidence_id": "E1"}], + } + ], + "evidence_index": [ + {"id": "E1", "locator": "doc.pdf#section1", "excerpt": "text1"}, + {"id": "E2", "locator": "doc.pdf#section2", "excerpt": "text2"}, + ], + } + result = await source_coverage(item, ctx) + # Both E1 and E2 share "doc.pdf" stem -> 1 total stem. + assert result["total"] == 1 + # E1 is cited -> that stem is covered. 
+ assert result["cited"] == 1 + + +# ── excerpt_fill_rate ────────────────────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_excerpt_fill_rate_full(): + ctx = make_ctx([]) + item = { + "evidence_index": [ + {"id": "E1", "excerpt": "has content"}, + {"id": "E2", "excerpt": "also has content"}, + ] + } + result = await excerpt_fill_rate(item, ctx) + assert result["populated"] == 2 + assert result["total"] == 2 + + +@pytest.mark.asyncio +async def test_excerpt_fill_rate_partial(): + ctx = make_ctx([]) + item = { + "evidence_index": [ + {"id": "E1", "excerpt": "has content"}, + {"id": "E2", "excerpt": ""}, + {"id": "E3", "excerpt": " "}, + ] + } + result = await excerpt_fill_rate(item, ctx) + assert result["populated"] == 1 + assert result["total"] == 3 + + +@pytest.mark.asyncio +async def test_excerpt_fill_rate_empty(): + ctx = make_ctx([]) + item = {"evidence_index": []} + result = await excerpt_fill_rate(item, ctx) + assert result["populated"] == 0 + assert result["total"] == 0 From 564697405a176b36b418283bee9bc1bf18a4c918 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:52:05 +0200 Subject: [PATCH 34/48] fix(lab): type-annotate out dict, remove quoted return type in retrieval_metrics --- fireflyframework_agentic/lab/retrieval_metrics.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fireflyframework_agentic/lab/retrieval_metrics.py b/fireflyframework_agentic/lab/retrieval_metrics.py index 5f3e2373..ee129eec 100644 --- a/fireflyframework_agentic/lab/retrieval_metrics.py +++ b/fireflyframework_agentic/lab/retrieval_metrics.py @@ -66,11 +66,7 @@ def _dedup(retrieved: list[dict]) -> list[dict]: def _ndcg(retrieved: list[dict], n_gold: int, k: int = 10) -> float: """Return nDCG@k for a single query.""" - dcg = sum( - 1.0 / math.log2(r["rank"] + 1) - for r in retrieved - if r.get("is_gold") and r["rank"] <= k - ) + dcg = sum(1.0 / math.log2(r["rank"] + 1) for r in retrieved if 
r.get("is_gold") and r["rank"] <= k) ideal = sum(1.0 / math.log2(i + 2) for i in range(min(n_gold, k))) return dcg / ideal if ideal else 0.0 @@ -140,7 +136,7 @@ def compute_retrieval_metrics(results: list[dict]) -> dict: if row.get("answer_ms") is not None: answer_ms.append(row["answer_ms"]) - out = {k: round(v / n, 4) for k, v in agg.items()} if n else {} + out: dict[str, object] = {k: round(v / n, 4) for k, v in agg.items()} if n else {} out["n_queries"] = n out["no_answer_rate"] = round(no_answer / n, 4) if n else None out["citation_precision"] = round(cite_num / cite_den, 4) if cite_den else None @@ -176,7 +172,7 @@ class RetrieverMetrics(BaseModel): mean_answer_ms: float | None = None @classmethod - def from_results(cls, results: list[dict]) -> "RetrieverMetrics": + def from_results(cls, results: list[dict]) -> RetrieverMetrics: """Compute metrics from raw retrieval result rows and return a model instance.""" m = compute_retrieval_metrics(results) return cls( From 582d1c044609fc0544bb74ab93bf65051dbc59e5 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:52:11 +0200 Subject: [PATCH 35/48] fix(lab): remove unused import math, fix import sort in test_retrieval_metrics --- tests/unit/lab/test_retrieval_metrics.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/tests/unit/lab/test_retrieval_metrics.py b/tests/unit/lab/test_retrieval_metrics.py index a018a08b..1053c550 100644 --- a/tests/unit/lab/test_retrieval_metrics.py +++ b/tests/unit/lab/test_retrieval_metrics.py @@ -16,16 +16,11 @@ from __future__ import annotations -import math - -import pytest - from fireflyframework_agentic.lab.retrieval_metrics import ( RetrieverMetrics, compute_retrieval_metrics, ) - # ── helpers ─────────────────────────────────────────────────────────────────── @@ -37,11 +32,13 @@ def _row(gold_rank: int | None = None, total: int = 5, n_gold: int = 1) -> dict: """ retrieved = [] for rank in range(1, total + 1): - 
retrieved.append({ - "rank": rank, - "source_id": f"doc-{rank}", - "is_gold": rank == gold_rank, - }) + retrieved.append( + { + "rank": rank, + "source_id": f"doc-{rank}", + "is_gold": rank == gold_rank, + } + ) gold_ids = [f"doc-{gold_rank}"] if gold_rank is not None else [] return { "retrieved": retrieved, From 3e62b1f92697903909544350ab26d2eb69800f36 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:52:11 +0200 Subject: [PATCH 36/48] fix(evaluation): add type: ignore for pyright errors on RAGAS/langchain calls in judge.py --- fireflyframework_agentic/evaluation/judge.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fireflyframework_agentic/evaluation/judge.py b/fireflyframework_agentic/evaluation/judge.py index 9f24dc26..d5bcad66 100644 --- a/fireflyframework_agentic/evaluation/judge.py +++ b/fireflyframework_agentic/evaluation/judge.py @@ -685,12 +685,12 @@ def _make_ragas_llm(ctx: EvalContext): from langchain_anthropic import ChatAnthropic # type: ignore[import] # noqa: PLC0415 api_key = os.environ.get("ANTHROPIC_API_KEY", "") - return ChatAnthropic(model=model, api_key=api_key, temperature=0.0) + return ChatAnthropic(model=model, api_key=api_key, temperature=0.0) # type: ignore[call-arg,arg-type] if provider in ("openai", "azure"): from langchain_openai import ChatOpenAI # type: ignore[import] # noqa: PLC0415 api_key = os.environ.get("OPENAI_API_KEY", "") - return ChatOpenAI(model=model, api_key=api_key, temperature=0.0) + return ChatOpenAI(model=model, api_key=api_key, temperature=0.0) # type: ignore[call-arg,arg-type] if provider == "ollama": from langchain_ollama import ChatOllama # type: ignore[import] # noqa: PLC0415 @@ -740,7 +740,7 @@ def _sync(): sample = _make_ragas_sample(item) dataset = EvaluationDataset(samples=[sample]) result = evaluate(dataset=dataset, metrics=[metric]) - df = result.to_pandas() + df = result.to_pandas() # type: ignore[attr-defined] col = 
df.columns[df.columns.str.contains(metric_name.replace("_ragas", ""), case=False)] if col.empty: return None From 3679dbca4b2fea88cf9339b9c7aac279b0891def Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 14:58:02 +0200 Subject: [PATCH 37/48] refactor(evaluation): move retrieval_metrics.py from lab/ to evaluation/ --- fireflyframework_agentic/{lab => evaluation}/retrieval_metrics.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename fireflyframework_agentic/{lab => evaluation}/retrieval_metrics.py (100%) diff --git a/fireflyframework_agentic/lab/retrieval_metrics.py b/fireflyframework_agentic/evaluation/retrieval_metrics.py similarity index 100% rename from fireflyframework_agentic/lab/retrieval_metrics.py rename to fireflyframework_agentic/evaluation/retrieval_metrics.py From 6bce3748a7988907c3e39235cf96fda0b07b38ff Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 14:58:19 +0200 Subject: [PATCH 38/48] =?UTF-8?q?refactor(evaluation):=20update=20imports?= =?UTF-8?q?=20=E2=80=94=20retrieval=5Fmetrics=20now=20in=20evaluation/?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fireflyframework_agentic/evaluation/__init__.py | 4 ++-- fireflyframework_agentic/lab/__init__.py | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py index c2005e7a..c68f5a19 100644 --- a/fireflyframework_agentic/evaluation/__init__.py +++ b/fireflyframework_agentic/evaluation/__init__.py @@ -82,9 +82,9 @@ from fireflyframework_agentic.evaluation.judge_client import ( same_provider as same_provider, ) -from fireflyframework_agentic.lab.retrieval_metrics import ( +from fireflyframework_agentic.evaluation.retrieval_metrics import ( RetrieverMetrics as RetrieverMetrics, ) -from fireflyframework_agentic.lab.retrieval_metrics import ( +from fireflyframework_agentic.evaluation.retrieval_metrics 
import ( compute_retrieval_metrics as compute_retrieval_metrics, ) diff --git a/fireflyframework_agentic/lab/__init__.py b/fireflyframework_agentic/lab/__init__.py index 8e127d8a..46cc08dc 100644 --- a/fireflyframework_agentic/lab/__init__.py +++ b/fireflyframework_agentic/lab/__init__.py @@ -18,7 +18,6 @@ from fireflyframework_agentic.lab.comparison import ComparisonEntry, ModelComparison from fireflyframework_agentic.lab.dataset import EvalCase, EvalDataset from fireflyframework_agentic.lab.evaluator import EvalOrchestrator, EvalReport, EvalResult -from fireflyframework_agentic.lab.retrieval_metrics import RetrieverMetrics, compute_retrieval_metrics from fireflyframework_agentic.lab.session import LabSession, SessionEntry __all__ = [ @@ -32,7 +31,5 @@ "EvalResult", "LabSession", "ModelComparison", - "RetrieverMetrics", "SessionEntry", - "compute_retrieval_metrics", ] From 9229c4348656c3e1e992780be6dc4fcdb06cea2f Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 14:58:46 +0200 Subject: [PATCH 39/48] refactor(evaluation): move test_retrieval_metrics.py to tests/unit/evaluation/ --- tests/unit/{lab => evaluation}/test_retrieval_metrics.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/unit/{lab => evaluation}/test_retrieval_metrics.py (100%) diff --git a/tests/unit/lab/test_retrieval_metrics.py b/tests/unit/evaluation/test_retrieval_metrics.py similarity index 100% rename from tests/unit/lab/test_retrieval_metrics.py rename to tests/unit/evaluation/test_retrieval_metrics.py From 6cdd3db11edda4e42c57791e632a2b594e8510cc Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 15:00:19 +0200 Subject: [PATCH 40/48] refactor(evaluation): replace RetrieverMetrics class with plain functions --- .../evaluation/retrieval_metrics.py | 270 +++++++++--------- 1 file changed, 140 insertions(+), 130 deletions(-) diff --git a/fireflyframework_agentic/evaluation/retrieval_metrics.py 
b/fireflyframework_agentic/evaluation/retrieval_metrics.py index ee129eec..5a318a2a 100644 --- a/fireflyframework_agentic/evaluation/retrieval_metrics.py +++ b/fireflyframework_agentic/evaluation/retrieval_metrics.py @@ -14,46 +14,48 @@ """Deterministic IR evaluation metrics for ranked retrieval results (no LLM, no network). -Industry-standard information-retrieval metrics computed over a ranked list of -retrieved chunks vs the gold set each result carries (``gold`` + per-hit -``is_gold``). Metrics are reported at cut-offs k ∈ {1, 5, 10}: - -* **Hit@k** -- at least one gold document appears in the top-k results. -* **Recall@k** -- fraction of gold documents found in top-k. -* **Precision@k** -- fraction of top-k results that are gold. -* **MRR@10** -- mean reciprocal rank of the first gold hit (up to k=10). -* **MAP@10** -- mean average precision (up to k=10). -* **nDCG@10** -- normalised discounted cumulative gain (up to k=10). - -Optional fields (populated when the raw result rows contain them): - -* ``no_answer_rate`` -- fraction of rows where the model produced no answer. -* ``citation_precision`` -- precision of in-answer citations vs gold set. -* ``mean_search_ms`` / ``mean_answer_ms`` -- mean retrieval and generation latencies. - -Ported from ``flycanon_experiments/scripts/deterministic_eval.py``. +Each metric is a plain function that takes a list of result rows and returns a +float — the same design as scikit-learn or MS MARCO evaluation scripts. 
+ +Result row schema (dict):: + + { + "retrieved": [{"rank": int, "source_id": str, "is_gold": bool}, ...], + "gold": [str, ...], # gold source identifiers + # optional: + "no_answer": bool, # model refused / produced no answer + "answer": str, # used for no_answer detection when no_answer absent + "citations": [{"is_gold": bool}, ...], + "search_ms": float, + "answer_ms": float, + } + +Individual metrics (recommended for composability):: + + hit_at_k(results, k) -> float + recall_at_k(results, k) -> float + precision_at_k(results, k) -> float + mrr(results, k=10) -> float + map_score(results, k=10) -> float + ndcg(results, k=10) -> float + no_answer_rate(results) -> float | None + citation_precision(results) -> float | None + mean_latency_ms(results, field) -> float | None + +Convenience aggregate (all metrics in one call):: + + compute_retrieval_metrics(results) -> dict """ from __future__ import annotations import math -from pydantic import BaseModel - KS = (1, 5, 10) def _dedup(retrieved: list[dict]) -> list[dict]: - """Return one entry per source, first chunk wins, preserving rank order. - - flycanon splits each ingested document into many chunks; a single gold - filing can therefore appear multiple times in the ranked list. Without - deduplication nDCG/MAP/Recall count every chunk separately, inflating - scores past 1.0 when a good embedding model retrieves several chunks from - the same filing. Taking only the first (highest-ranked) chunk per - source_id makes the list item-unique, matching the recommenders-library - contract that all IR formulae assume. 
- """ + """Return one entry per source, first chunk wins, preserving rank order.""" seen: set[str] = set() out: list[dict] = [] for r in sorted(retrieved, key=lambda x: x["rank"]): @@ -64,15 +66,13 @@ def _dedup(retrieved: list[dict]) -> list[dict]: return out -def _ndcg(retrieved: list[dict], n_gold: int, k: int = 10) -> float: - """Return nDCG@k for a single query.""" +def _ndcg_single(retrieved: list[dict], n_gold: int, k: int = 10) -> float: dcg = sum(1.0 / math.log2(r["rank"] + 1) for r in retrieved if r.get("is_gold") and r["rank"] <= k) ideal = sum(1.0 / math.log2(i + 2) for i in range(min(n_gold, k))) return dcg / ideal if ideal else 0.0 -def _ap(retrieved: list[dict], n_gold: int, k: int = 10) -> float: - """Return average precision@k for a single query.""" +def _ap_single(retrieved: list[dict], n_gold: int, k: int = 10) -> float: hits, precisions = 0, [] for r in sorted(retrieved, key=lambda x: x["rank"]): if r["rank"] > k: @@ -83,114 +83,124 @@ def _ap(retrieved: list[dict], n_gold: int, k: int = 10) -> float: return sum(precisions) / min(n_gold, k) if n_gold else 0.0 -def compute_retrieval_metrics(results: list[dict]) -> dict: - """Compute deterministic IR metrics over a list of retrieval result rows. 
+def hit_at_k(results: list[dict], k: int) -> float: + """Fraction of queries where at least one gold document appears in top-k.""" + if not results: + return 0.0 + hits = 0 + for row in results: + retrieved = _dedup(row["retrieved"]) + gold_ranks = [r["rank"] for r in retrieved if r.get("is_gold")] + if any(g <= k for g in gold_ranks): + hits += 1 + return round(hits / len(results), 4) + - Each element of *results* must be a dict with at least: +def recall_at_k(results: list[dict], k: int) -> float: + """Mean fraction of gold documents found in top-k.""" + if not results: + return 0.0 + total = 0.0 + for row in results: + retrieved = _dedup(row["retrieved"]) + n_gold = max(len(set(row["gold"])), 1) + gold_ranks = [r["rank"] for r in retrieved if r.get("is_gold")] + total += len([g for g in gold_ranks if g <= k]) / n_gold + return round(total / len(results), 4) - * ``retrieved`` -- list of dicts with ``rank`` (int, 1-based), ``source_id`` - (str) or ``identities`` (list[str]), and ``is_gold`` (bool). - * ``gold`` -- list of gold source identifiers (used to compute ``n_gold``). - Optional keys per row: +def precision_at_k(results: list[dict], k: int) -> float: + """Mean fraction of top-k results that are gold.""" + if not results: + return 0.0 + total = 0.0 + for row in results: + retrieved = _dedup(row["retrieved"]) + gold_ranks = [r["rank"] for r in retrieved if r.get("is_gold")] + total += len([g for g in gold_ranks if g <= k]) / k + return round(total / len(results), 4) - * ``no_answer`` (bool) / ``answer`` (str) -- used for ``no_answer_rate``. - * ``citations`` (list[dict]) -- each with ``is_gold`` (bool) for citation precision. - * ``search_ms`` (float) / ``answer_ms`` (float) -- latency in milliseconds. 
- Returns a flat dict with keys: ``n_queries``, ``hit@1``, ``hit@5``, - ``hit@10``, ``recall@1``, ``recall@5``, ``recall@10``, ``precision@1``, - ``precision@5``, ``precision@10``, ``mrr@10``, ``map@10``, ``ndcg@10``, - ``no_answer_rate``, ``citation_precision``, ``mean_search_ms``, - ``mean_answer_ms``. - """ - n = len(results) - agg = {f"{m}@{k}": 0.0 for k in KS for m in ("hit", "recall", "precision")} - agg.update({"mrr@10": 0.0, "map@10": 0.0, "ndcg@10": 0.0}) - no_answer = 0 - cite_num = cite_den = 0.0 - search_ms: list[float] = [] - answer_ms: list[float] = [] +def mrr(results: list[dict], k: int = 10) -> float: + """Mean reciprocal rank of the first gold hit (up to k).""" + if not results: + return 0.0 + total = 0.0 + for row in results: + retrieved = _dedup(row["retrieved"]) + gold_ranks = sorted(r["rank"] for r in retrieved if r.get("is_gold") and r["rank"] <= k) + total += 1.0 / gold_ranks[0] if gold_ranks else 0.0 + return round(total / len(results), 4) + +def map_score(results: list[dict], k: int = 10) -> float: + """Mean average precision at k.""" + if not results: + return 0.0 + total = 0.0 for row in results: retrieved = _dedup(row["retrieved"]) n_gold = max(len(set(row["gold"])), 1) - gold_ranks = [r["rank"] for r in retrieved if r.get("is_gold")] - for k in KS: - in_k = [g for g in gold_ranks if g <= k] - agg[f"hit@{k}"] += 1.0 if in_k else 0.0 - agg[f"recall@{k}"] += len(in_k) / n_gold - agg[f"precision@{k}"] += len(in_k) / k - agg["mrr@10"] += (1.0 / min(gold_ranks)) if gold_ranks else 0.0 - agg["map@10"] += _ap(retrieved, n_gold) - agg["ndcg@10"] += _ndcg(retrieved, n_gold) - - if row.get("no_answer") or not row.get("answer", "").strip(): - no_answer += 1 + total += _ap_single(retrieved, n_gold, k) + return round(total / len(results), 4) + + +def ndcg(results: list[dict], k: int = 10) -> float: + """Mean normalised discounted cumulative gain at k.""" + if not results: + return 0.0 + total = 0.0 + for row in results: + retrieved = 
_dedup(row["retrieved"]) + n_gold = max(len(set(row["gold"])), 1) + total += _ndcg_single(retrieved, n_gold, k) + return round(total / len(results), 4) + + +def no_answer_rate(results: list[dict]) -> float | None: + """Fraction of queries where the model produced no answer. None if no results.""" + if not results: + return None + count = sum( + 1 for row in results if row.get("no_answer") or not row.get("answer", "").strip() + ) + return round(count / len(results), 4) + + +def citation_precision(results: list[dict]) -> float | None: + """Precision of in-answer citations vs gold set. None if no citations present.""" + num = den = 0.0 + for row in results: cites = row.get("citations", []) if cites: - cite_num += sum(1 for c in cites if c.get("is_gold")) - cite_den += len(cites) - if row.get("search_ms") is not None: - search_ms.append(row["search_ms"]) - if row.get("answer_ms") is not None: - answer_ms.append(row["answer_ms"]) - - out: dict[str, object] = {k: round(v / n, 4) for k, v in agg.items()} if n else {} - out["n_queries"] = n - out["no_answer_rate"] = round(no_answer / n, 4) if n else None - out["citation_precision"] = round(cite_num / cite_den, 4) if cite_den else None - out["mean_search_ms"] = round(sum(search_ms) / len(search_ms)) if search_ms else None - out["mean_answer_ms"] = round(sum(answer_ms) / len(answer_ms)) if answer_ms else None - return out + num += sum(1 for c in cites if c.get("is_gold")) + den += len(cites) + return round(num / den, 4) if den else None -class RetrieverMetrics(BaseModel): - """Structured IR metrics for a retrieval evaluation run. +def mean_latency_ms(results: list[dict], field: str) -> float | None: + """Mean latency in ms for the given field (``search_ms`` or ``answer_ms``). None if absent.""" + values = [row[field] for row in results if row.get(field) is not None] + return round(sum(values) / len(values)) if values else None - Fields mirror the flat dict returned by :func:`compute_retrieval_metrics`. 
- Optional fields are ``None`` when the raw result rows lack the required data - (e.g. no latency timestamps, no citations). - """ - n_queries: int = 0 - hit_at_1: float = 0.0 - hit_at_5: float = 0.0 - hit_at_10: float = 0.0 - recall_at_1: float = 0.0 - recall_at_5: float = 0.0 - recall_at_10: float = 0.0 - precision_at_1: float = 0.0 - precision_at_5: float = 0.0 - precision_at_10: float = 0.0 - mrr_at_10: float = 0.0 - map_at_10: float = 0.0 - ndcg_at_10: float = 0.0 - no_answer_rate: float | None = None - citation_precision: float | None = None - mean_search_ms: float | None = None - mean_answer_ms: float | None = None - - @classmethod - def from_results(cls, results: list[dict]) -> RetrieverMetrics: - """Compute metrics from raw retrieval result rows and return a model instance.""" - m = compute_retrieval_metrics(results) - return cls( - n_queries=m.get("n_queries", 0), - hit_at_1=m.get("hit@1", 0.0), - hit_at_5=m.get("hit@5", 0.0), - hit_at_10=m.get("hit@10", 0.0), - recall_at_1=m.get("recall@1", 0.0), - recall_at_5=m.get("recall@5", 0.0), - recall_at_10=m.get("recall@10", 0.0), - precision_at_1=m.get("precision@1", 0.0), - precision_at_5=m.get("precision@5", 0.0), - precision_at_10=m.get("precision@10", 0.0), - mrr_at_10=m.get("mrr@10", 0.0), - map_at_10=m.get("map@10", 0.0), - ndcg_at_10=m.get("ndcg@10", 0.0), - no_answer_rate=m.get("no_answer_rate"), - citation_precision=m.get("citation_precision"), - mean_search_ms=m.get("mean_search_ms"), - mean_answer_ms=m.get("mean_answer_ms"), - ) +def compute_retrieval_metrics(results: list[dict]) -> dict: + """Compute all IR metrics over a list of retrieval result rows and return a flat dict. + + Convenience wrapper that calls each individual metric function. Prefer the + individual functions (``hit_at_k``, ``recall_at_k``, etc.) when you only + need a subset. 
+ """ + out: dict[str, object] = {"n_queries": len(results)} + for k in KS: + out[f"hit@{k}"] = hit_at_k(results, k) + out[f"recall@{k}"] = recall_at_k(results, k) + out[f"precision@{k}"] = precision_at_k(results, k) + out["mrr@10"] = mrr(results) + out["map@10"] = map_score(results) + out["ndcg@10"] = ndcg(results) + out["no_answer_rate"] = no_answer_rate(results) + out["citation_precision"] = citation_precision(results) + out["mean_search_ms"] = mean_latency_ms(results, "search_ms") + out["mean_answer_ms"] = mean_latency_ms(results, "answer_ms") + return out From 3a3c35fbb775340ddc3c05f5265849610a90bac2 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 15:00:34 +0200 Subject: [PATCH 41/48] =?UTF-8?q?refactor(evaluation):=20update=20=5F=5Fin?= =?UTF-8?q?it=5F=5F.py=20exports=20=E2=80=94=20replace=20RetrieverMetrics?= =?UTF-8?q?=20with=20individual=20functions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../evaluation/__init__.py | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py index c68f5a19..9f31ee7b 100644 --- a/fireflyframework_agentic/evaluation/__init__.py +++ b/fireflyframework_agentic/evaluation/__init__.py @@ -83,8 +83,32 @@ same_provider as same_provider, ) from fireflyframework_agentic.evaluation.retrieval_metrics import ( - RetrieverMetrics as RetrieverMetrics, + citation_precision as citation_precision, ) from fireflyframework_agentic.evaluation.retrieval_metrics import ( compute_retrieval_metrics as compute_retrieval_metrics, ) +from fireflyframework_agentic.evaluation.retrieval_metrics import ( + hit_at_k as hit_at_k, +) +from fireflyframework_agentic.evaluation.retrieval_metrics import ( + map_score as map_score, +) +from fireflyframework_agentic.evaluation.retrieval_metrics import ( + mean_latency_ms as mean_latency_ms, +) +from 
fireflyframework_agentic.evaluation.retrieval_metrics import ( + mrr as mrr, +) +from fireflyframework_agentic.evaluation.retrieval_metrics import ( + ndcg as ndcg, +) +from fireflyframework_agentic.evaluation.retrieval_metrics import ( + no_answer_rate as no_answer_rate, +) +from fireflyframework_agentic.evaluation.retrieval_metrics import ( + precision_at_k as precision_at_k, +) +from fireflyframework_agentic.evaluation.retrieval_metrics import ( + recall_at_k as recall_at_k, +) From 26bfe3b0b56362039eb00c2d0859858ed52e542d Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 15:01:27 +0200 Subject: [PATCH 42/48] test(evaluation): rewrite test_retrieval_metrics for individual metric functions --- .../unit/evaluation/test_retrieval_metrics.py | 254 ++++++++---------- 1 file changed, 107 insertions(+), 147 deletions(-) diff --git a/tests/unit/evaluation/test_retrieval_metrics.py b/tests/unit/evaluation/test_retrieval_metrics.py index 1053c550..38fc07fe 100644 --- a/tests/unit/evaluation/test_retrieval_metrics.py +++ b/tests/unit/evaluation/test_retrieval_metrics.py @@ -12,233 +12,193 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Unit tests for lab.retrieval_metrics: compute_retrieval_metrics and RetrieverMetrics.""" +"""Unit tests for evaluation.retrieval_metrics.""" from __future__ import annotations -from fireflyframework_agentic.lab.retrieval_metrics import ( - RetrieverMetrics, +from fireflyframework_agentic.evaluation.retrieval_metrics import ( + citation_precision, compute_retrieval_metrics, + hit_at_k, + map_score, + mean_latency_ms, + mrr, + ndcg, + no_answer_rate, + precision_at_k, + recall_at_k, ) -# ── helpers ─────────────────────────────────────────────────────────────────── - def _row(gold_rank: int | None = None, total: int = 5, n_gold: int = 1) -> dict: - """Build one result row with ``total`` retrieved items. 
- - If ``gold_rank`` is not None, the item at that rank is marked as gold. - All items get a unique ``source_id`` so dedup leaves them all. - """ retrieved = [] for rank in range(1, total + 1): - retrieved.append( - { - "rank": rank, - "source_id": f"doc-{rank}", - "is_gold": rank == gold_rank, - } - ) + retrieved.append({"rank": rank, "source_id": f"doc-{rank}", "is_gold": rank == gold_rank}) gold_ids = [f"doc-{gold_rank}"] if gold_rank is not None else [] - return { - "retrieved": retrieved, - "gold": gold_ids * n_gold, - } + return {"retrieved": retrieved, "gold": gold_ids * n_gold} -# ── hit@k ───────────────────────────────────────────────────────────────────── +# ── hit_at_k ────────────────────────────────────────────────────────────────── -def test_hit_at_1_perfect_when_gold_is_rank1(): - results = [_row(gold_rank=1)] - m = compute_retrieval_metrics(results) - assert m["hit@1"] == 1.0 +def test_hit_at_k_gold_at_rank1(): + assert hit_at_k([_row(gold_rank=1)], k=1) == 1.0 -def test_hit_at_1_zero_when_gold_not_in_top1(): - results = [_row(gold_rank=2)] - m = compute_retrieval_metrics(results) - assert m["hit@1"] == 0.0 +def test_hit_at_k_miss_at_rank1(): + assert hit_at_k([_row(gold_rank=2)], k=1) == 0.0 + + +def test_hit_at_k_gold_at_rank5(): + assert hit_at_k([_row(gold_rank=5)], k=5) == 1.0 -def test_hit_at_5_one_when_gold_at_rank5(): - results = [_row(gold_rank=5)] - m = compute_retrieval_metrics(results) - assert m["hit@5"] == 1.0 +def test_hit_at_k_gold_at_rank10(): + assert hit_at_k([_row(gold_rank=10, total=10)], k=10) == 1.0 -def test_hit_at_5_zero_when_gold_not_in_top5(): - # Gold is at rank 10 — outside top-5 window with only 5 items, make 10. 
- results = [_row(gold_rank=None, total=10)] # no gold in retrieved - m = compute_retrieval_metrics(results) - assert m["hit@5"] == 0.0 +def test_hit_at_k_empty(): + assert hit_at_k([], k=5) == 0.0 -def test_hit_at_10_one_when_gold_at_rank10(): - results = [_row(gold_rank=10, total=10)] - m = compute_retrieval_metrics(results) - assert m["hit@10"] == 1.0 +# ── recall_at_k ─────────────────────────────────────────────────────────────── -# ── recall@k ────────────────────────────────────────────────────────────────── +def test_recall_at_k_full_when_gold_at_rank1(): + assert recall_at_k([_row(gold_rank=1, n_gold=1)], k=1) == 1.0 + + +def test_recall_at_k_zero_when_gold_outside_k(): + assert recall_at_k([_row(gold_rank=5)], k=1) == 0.0 def test_recall_at_k_increases_with_k(): - # Gold at rank 3: recall@1=0, recall@5>=recall@1. - results = [_row(gold_rank=3)] - m = compute_retrieval_metrics(results) - assert m["recall@1"] <= m["recall@5"] <= m["recall@10"] + rows = [_row(gold_rank=3)] + assert recall_at_k(rows, k=1) <= recall_at_k(rows, k=5) <= recall_at_k(rows, k=10) + + +# ── precision_at_k ──────────────────────────────────────────────────────────── -def test_recall_at_1_full_when_single_gold_at_rank1(): - results = [_row(gold_rank=1, n_gold=1)] - m = compute_retrieval_metrics(results) - assert m["recall@1"] == 1.0 +def test_precision_at_k_gold_at_rank1(): + assert precision_at_k([_row(gold_rank=1)], k=1) == 1.0 -def test_recall_at_1_zero_when_no_gold_in_rank1(): - results = [_row(gold_rank=5)] - m = compute_retrieval_metrics(results) - assert m["recall@1"] == 0.0 +def test_precision_at_k_decreases_when_k_larger(): + rows = [_row(gold_rank=1)] + assert precision_at_k(rows, k=5) < precision_at_k(rows, k=1) -# ── MRR ─────────────────────────────────────────────────────────────────────── +# ── mrr ─────────────────────────────────────────────────────────────────────── -def test_mrr_is_1_when_gold_at_rank1(): - results = [_row(gold_rank=1)] - m = 
compute_retrieval_metrics(results) - assert m["mrr@10"] == 1.0 +def test_mrr_gold_at_rank1(): + assert mrr([_row(gold_rank=1)]) == 1.0 -def test_mrr_is_half_when_gold_at_rank2(): - results = [_row(gold_rank=2)] - m = compute_retrieval_metrics(results) - assert abs(m["mrr@10"] - 0.5) < 1e-9 +def test_mrr_gold_at_rank2(): + assert abs(mrr([_row(gold_rank=2)]) - 0.5) < 1e-9 -def test_mrr_is_zero_when_no_gold(): - results = [_row(gold_rank=None)] - m = compute_retrieval_metrics(results) - assert m["mrr@10"] == 0.0 +def test_mrr_no_gold(): + assert mrr([_row(gold_rank=None)]) == 0.0 def test_mrr_average_across_queries(): - # Query 1: gold at rank 1 (MRR=1.0); Query 2: gold at rank 2 (MRR=0.5). - results = [_row(gold_rank=1), _row(gold_rank=2)] - m = compute_retrieval_metrics(results) - assert abs(m["mrr@10"] - 0.75) < 1e-3 + rows = [_row(gold_rank=1), _row(gold_rank=2)] + assert abs(mrr(rows) - 0.75) < 1e-3 -# ── nDCG ────────────────────────────────────────────────────────────────────── +# ── ndcg ────────────────────────────────────────────────────────────────────── -def test_ndcg_is_1_when_gold_at_rank1(): - results = [_row(gold_rank=1, n_gold=1)] - m = compute_retrieval_metrics(results) - assert abs(m["ndcg@10"] - 1.0) < 1e-9 +def test_ndcg_gold_at_rank1(): + assert abs(ndcg([_row(gold_rank=1, n_gold=1)]) - 1.0) < 1e-9 -def test_ndcg_is_less_than_1_when_gold_not_at_rank1(): - results = [_row(gold_rank=3, n_gold=1)] - m = compute_retrieval_metrics(results) - assert m["ndcg@10"] < 1.0 - assert m["ndcg@10"] > 0.0 +def test_ndcg_less_than_1_when_not_at_rank1(): + score = ndcg([_row(gold_rank=3, n_gold=1)]) + assert 0.0 < score < 1.0 -def test_ndcg_is_zero_when_no_gold(): - results = [_row(gold_rank=None)] - m = compute_retrieval_metrics(results) - assert m["ndcg@10"] == 0.0 +def test_ndcg_zero_when_no_gold(): + assert ndcg([_row(gold_rank=None)]) == 0.0 -# ── n_queries ───────────────────────────────────────────────────────────────── +# ── map_score 
───────────────────────────────────────────────────────────────── -def test_n_queries_matches_input_length(): - results = [_row(gold_rank=1), _row(gold_rank=2), _row(gold_rank=3)] - m = compute_retrieval_metrics(results) - assert m["n_queries"] == 3 +def test_map_score_perfect_when_gold_at_rank1(): + assert map_score([_row(gold_rank=1, n_gold=1)]) == 1.0 -def test_empty_results_returns_zero_n_queries(): - m = compute_retrieval_metrics([]) - assert m["n_queries"] == 0 +def test_map_score_zero_when_no_gold(): + assert map_score([_row(gold_rank=None)]) == 0.0 + + +# ── no_answer_rate ──────────────────────────────────────────────────────────── + +def test_no_answer_rate_zero_when_answer_present(): + rows = [{**_row(gold_rank=1), "answer": "some answer"}] + assert no_answer_rate(rows) == 0.0 -# ── optional fields ─────────────────────────────────────────────────────────── +def test_no_answer_rate_one_when_no_answer_field(): + assert no_answer_rate([_row(gold_rank=1)]) == 1.0 -def test_no_answer_rate_is_zero_when_answer_present(): - # Rows with a non-empty answer string are counted as answered. - results = [{**_row(gold_rank=1), "answer": "some answer text"}] - m = compute_retrieval_metrics(results) - assert m["no_answer_rate"] == 0.0 +def test_no_answer_rate_none_when_empty(): + assert no_answer_rate([]) is None -def test_no_answer_rate_is_one_when_no_answer_field(): - # Rows without an answer field are treated as no-answer by the implementation. 
- results = [_row(gold_rank=1)] - m = compute_retrieval_metrics(results) - assert m["no_answer_rate"] == 1.0 +# ── citation_precision ──────────────────────────────────────────────────────── -def test_citation_precision_is_none_when_no_citations(): - results = [_row(gold_rank=1)] - m = compute_retrieval_metrics(results) - assert m["citation_precision"] is None +def test_citation_precision_none_when_no_citations(): + assert citation_precision([_row(gold_rank=1)]) is None -def test_latency_fields_are_none_when_absent(): - results = [_row(gold_rank=1)] - m = compute_retrieval_metrics(results) - assert m["mean_search_ms"] is None - assert m["mean_answer_ms"] is None +def test_citation_precision_1_when_all_gold(): + rows = [{**_row(gold_rank=1), "citations": [{"is_gold": True}, {"is_gold": True}]}] + assert citation_precision(rows) == 1.0 -def test_mean_search_ms_computed_when_present(): - results = [{**_row(gold_rank=1), "search_ms": 100.0, "answer_ms": 200.0}] - m = compute_retrieval_metrics(results) - assert m["mean_search_ms"] == 100 - assert m["mean_answer_ms"] == 200 +def test_citation_precision_half_when_half_gold(): + rows = [{**_row(gold_rank=1), "citations": [{"is_gold": True}, {"is_gold": False}]}] + assert citation_precision(rows) == 0.5 -# ── RetrieverMetrics.from_results ───────────────────────────────────────────── +# ── mean_latency_ms ─────────────────────────────────────────────────────────── -def test_retriever_metrics_from_results_hit_at_1(): - results = [_row(gold_rank=1)] - rm = RetrieverMetrics.from_results(results) - assert rm.hit_at_1 == 1.0 +def test_mean_latency_none_when_field_absent(): + assert mean_latency_ms([_row(gold_rank=1)], "search_ms") is None -def test_retriever_metrics_from_results_n_queries(): - results = [_row(gold_rank=1), _row(gold_rank=2)] - rm = RetrieverMetrics.from_results(results) - assert rm.n_queries == 2 +def test_mean_latency_computed_when_present(): + rows = [{**_row(gold_rank=1), "search_ms": 100.0, "answer_ms": 
200.0}] + assert mean_latency_ms(rows, "search_ms") == 100 + assert mean_latency_ms(rows, "answer_ms") == 200 -def test_retriever_metrics_from_results_mrr(): - results = [_row(gold_rank=1)] - rm = RetrieverMetrics.from_results(results) - assert rm.mrr_at_10 == 1.0 +# ── compute_retrieval_metrics (aggregate) ───────────────────────────────────── -def test_retriever_metrics_from_results_defaults_on_empty(): - rm = RetrieverMetrics.from_results([]) - assert rm.n_queries == 0 - assert rm.hit_at_1 == 0.0 - assert rm.mrr_at_10 == 0.0 +def test_compute_retrieval_metrics_n_queries(): + assert compute_retrieval_metrics([_row(1), _row(2), _row(3)])["n_queries"] == 3 -def test_retriever_metrics_is_pydantic_model(): - rm = RetrieverMetrics() - assert rm.n_queries == 0 - assert rm.hit_at_1 == 0.0 - assert rm.no_answer_rate is None + +def test_compute_retrieval_metrics_empty(): + m = compute_retrieval_metrics([]) + assert m["n_queries"] == 0 + assert m["hit@1"] == 0.0 -def test_retriever_metrics_recall_increases_with_k(): - results = [_row(gold_rank=3)] - rm = RetrieverMetrics.from_results(results) - assert rm.recall_at_1 <= rm.recall_at_5 <= rm.recall_at_10 +def test_compute_retrieval_metrics_matches_individual_functions(): + rows = [_row(gold_rank=1), _row(gold_rank=2)] + m = compute_retrieval_metrics(rows) + assert m["hit@1"] == hit_at_k(rows, 1) + assert m["recall@5"] == recall_at_k(rows, 5) + assert m["mrr@10"] == mrr(rows) + assert m["ndcg@10"] == ndcg(rows) From feadcbdc28a70cc9bd0cf38b3793268108f845f3 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 15:09:06 +0200 Subject: [PATCH 43/48] Remove compute_retrieval_metrics() and KS constant from retrieval_metrics --- .../evaluation/retrieval_metrics.py | 29 +------------------ 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/fireflyframework_agentic/evaluation/retrieval_metrics.py b/fireflyframework_agentic/evaluation/retrieval_metrics.py index 5a318a2a..df42ab24 100644 --- 
a/fireflyframework_agentic/evaluation/retrieval_metrics.py +++ b/fireflyframework_agentic/evaluation/retrieval_metrics.py @@ -30,7 +30,7 @@ "answer_ms": float, } -Individual metrics (recommended for composability):: +Individual metrics:: hit_at_k(results, k) -> float recall_at_k(results, k) -> float @@ -41,19 +41,12 @@ no_answer_rate(results) -> float | None citation_precision(results) -> float | None mean_latency_ms(results, field) -> float | None - -Convenience aggregate (all metrics in one call):: - - compute_retrieval_metrics(results) -> dict """ from __future__ import annotations import math -KS = (1, 5, 10) - - def _dedup(retrieved: list[dict]) -> list[dict]: """Return one entry per source, first chunk wins, preserving rank order.""" seen: set[str] = set() @@ -184,23 +177,3 @@ def mean_latency_ms(results: list[dict], field: str) -> float | None: return round(sum(values) / len(values)) if values else None -def compute_retrieval_metrics(results: list[dict]) -> dict: - """Compute all IR metrics over a list of retrieval result rows and return a flat dict. - - Convenience wrapper that calls each individual metric function. Prefer the - individual functions (``hit_at_k``, ``recall_at_k``, etc.) when you only - need a subset. 
- """ - out: dict[str, object] = {"n_queries": len(results)} - for k in KS: - out[f"hit@{k}"] = hit_at_k(results, k) - out[f"recall@{k}"] = recall_at_k(results, k) - out[f"precision@{k}"] = precision_at_k(results, k) - out["mrr@10"] = mrr(results) - out["map@10"] = map_score(results) - out["ndcg@10"] = ndcg(results) - out["no_answer_rate"] = no_answer_rate(results) - out["citation_precision"] = citation_precision(results) - out["mean_search_ms"] = mean_latency_ms(results, "search_ms") - out["mean_answer_ms"] = mean_latency_ms(results, "answer_ms") - return out From d54814fa98f85f42c8ef20be5f6f74db3b111f81 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 15:09:12 +0200 Subject: [PATCH 44/48] Remove compute_retrieval_metrics export from evaluation __init__ --- fireflyframework_agentic/evaluation/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py index 9f31ee7b..35dd32f7 100644 --- a/fireflyframework_agentic/evaluation/__init__.py +++ b/fireflyframework_agentic/evaluation/__init__.py @@ -85,9 +85,6 @@ from fireflyframework_agentic.evaluation.retrieval_metrics import ( citation_precision as citation_precision, ) -from fireflyframework_agentic.evaluation.retrieval_metrics import ( - compute_retrieval_metrics as compute_retrieval_metrics, -) from fireflyframework_agentic.evaluation.retrieval_metrics import ( hit_at_k as hit_at_k, ) From 08536982e27522f6c3ade60db4c6e2716942e46e Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 15:09:35 +0200 Subject: [PATCH 45/48] Remove test_compute_retrieval_metrics_* tests --- .../unit/evaluation/test_retrieval_metrics.py | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/tests/unit/evaluation/test_retrieval_metrics.py b/tests/unit/evaluation/test_retrieval_metrics.py index 38fc07fe..ef38467f 100644 --- a/tests/unit/evaluation/test_retrieval_metrics.py +++ 
b/tests/unit/evaluation/test_retrieval_metrics.py @@ -18,7 +18,6 @@ from fireflyframework_agentic.evaluation.retrieval_metrics import ( citation_precision, - compute_retrieval_metrics, hit_at_k, map_score, mean_latency_ms, @@ -182,23 +181,3 @@ def test_mean_latency_computed_when_present(): assert mean_latency_ms(rows, "answer_ms") == 200 -# ── compute_retrieval_metrics (aggregate) ───────────────────────────────────── - - -def test_compute_retrieval_metrics_n_queries(): - assert compute_retrieval_metrics([_row(1), _row(2), _row(3)])["n_queries"] == 3 - - -def test_compute_retrieval_metrics_empty(): - m = compute_retrieval_metrics([]) - assert m["n_queries"] == 0 - assert m["hit@1"] == 0.0 - - -def test_compute_retrieval_metrics_matches_individual_functions(): - rows = [_row(gold_rank=1), _row(gold_rank=2)] - m = compute_retrieval_metrics(rows) - assert m["hit@1"] == hit_at_k(rows, 1) - assert m["recall@5"] == recall_at_k(rows, 5) - assert m["mrr@10"] == mrr(rows) - assert m["ndcg@10"] == ndcg(rows) From a7b1b91843b8c1c848872375d0b01618ceb84143 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 15:10:32 +0200 Subject: [PATCH 46/48] Update flycanon_eval_example to use plain metric functions instead of RetrieverMetrics --- examples/flycanon_eval_example.py | 74 ++++++++++++++++--------------- 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/examples/flycanon_eval_example.py b/examples/flycanon_eval_example.py index 856b520b..30e66bd1 100644 --- a/examples/flycanon_eval_example.py +++ b/examples/flycanon_eval_example.py @@ -26,7 +26,7 @@ The champion/challenger pattern mirrors the flycanon_experiments harness: each run writes metrics to a file; ``approve`` promotes it by repointing baseline.json. Here we replicate that flow using the framework's -``compute_retrieval_metrics`` / ``RetrieverMetrics`` API directly. +individual retrieval metric functions directly. 
Usage:: @@ -94,7 +94,17 @@ import sys from pathlib import Path -from fireflyframework_agentic.evaluation import RetrieverMetrics +from fireflyframework_agentic.evaluation import ( + citation_precision, + hit_at_k, + map_score, + mean_latency_ms, + mrr, + ndcg, + no_answer_rate, + precision_at_k, + recall_at_k, +) # --------------------------------------------------------------------------- # Helpers @@ -131,32 +141,31 @@ def _save_baseline(path: str, metrics: dict) -> None: Path(path).write_text(json.dumps(metrics, indent=2, ensure_ascii=False) + "\n", encoding="utf-8") -def _metrics_to_flat(m: RetrieverMetrics) -> dict: - """Convert a RetrieverMetrics model to the flat dict stored in baseline.json.""" +def _compute_metrics(results: list[dict]) -> dict: + """Compute all IR metrics and return a flat dict.""" return { - "n_queries": m.n_queries, - "hit@1": m.hit_at_1, - "hit@5": m.hit_at_5, - "hit@10": m.hit_at_10, - "recall@1": m.recall_at_1, - "recall@5": m.recall_at_5, - "recall@10": m.recall_at_10, - "precision@1": m.precision_at_1, - "precision@5": m.precision_at_5, - "precision@10": m.precision_at_10, - "mrr@10": m.mrr_at_10, - "map@10": m.map_at_10, - "ndcg@10": m.ndcg_at_10, - "no_answer_rate": m.no_answer_rate, - "citation_precision": m.citation_precision, - "mean_search_ms": m.mean_search_ms, - "mean_answer_ms": m.mean_answer_ms, + "n_queries": len(results), + "hit@1": hit_at_k(results, 1), + "hit@5": hit_at_k(results, 5), + "hit@10": hit_at_k(results, 10), + "recall@1": recall_at_k(results, 1), + "recall@5": recall_at_k(results, 5), + "recall@10": recall_at_k(results, 10), + "precision@1": precision_at_k(results, 1), + "precision@5": precision_at_k(results, 5), + "precision@10": precision_at_k(results, 10), + "mrr@10": mrr(results), + "map@10": map_score(results), + "ndcg@10": ndcg(results), + "no_answer_rate": no_answer_rate(results), + "citation_precision": citation_precision(results), + "mean_search_ms": mean_latency_ms(results, "search_ms"), + 
"mean_answer_ms": mean_latency_ms(results, "answer_ms"), } -def _print_metrics_table(metrics: RetrieverMetrics, baseline: dict | None) -> None: +def _print_metrics_table(flat: dict, baseline: dict | None) -> None: """Print a formatted table comparing current metrics vs baseline.""" - flat = _metrics_to_flat(metrics) col_w = 22 num_w = 10 @@ -244,10 +253,6 @@ def run_evaluation(args: argparse.Namespace) -> int: # ------------------------------------------------------------------ # Step 2 — Compute deterministic IR metrics. # - # compute_retrieval_metrics() returns a flat dict of standard IR metrics. - # RetrieverMetrics.from_results() wraps that into a typed Pydantic model - # for convenient attribute access. - # # Metrics are computed at cut-offs k ∈ {1, 5, 10} and include: # hit@k -- at least one gold doc in top-k (binary) # recall@k -- fraction of gold docs found in top-k @@ -257,13 +262,13 @@ def run_evaluation(args: argparse.Namespace) -> int: # ndcg@10 -- normalised discounted cumulative gain # ------------------------------------------------------------------ print("\nComputing retrieval metrics ...") - metrics = RetrieverMetrics.from_results(results) + flat = _compute_metrics(results) - print(f" nDCG@10 : {metrics.ndcg_at_10:.4f}") - print(f" MRR@10 : {metrics.mrr_at_10:.4f}") - print(f" Recall@10 : {metrics.recall_at_10:.4f}") - print(f" Hit@10 : {metrics.hit_at_10:.4f}") - print(f" MAP@10 : {metrics.map_at_10:.4f}") + print(f" nDCG@10 : {flat['ndcg@10']:.4f}") + print(f" MRR@10 : {flat['mrr@10']:.4f}") + print(f" Recall@10 : {flat['recall@10']:.4f}") + print(f" Hit@10 : {flat['hit@10']:.4f}") + print(f" MAP@10 : {flat['map@10']:.4f}") # ------------------------------------------------------------------ # Step 3 — Load the baseline (champion) for regression detection. 
@@ -282,7 +287,7 @@ def run_evaluation(args: argparse.Namespace) -> int: print("\n" + "=" * 56) print("Retrieval Metrics") print("=" * 56) - _print_metrics_table(metrics, baseline) + _print_metrics_table(flat, baseline) # ------------------------------------------------------------------ # Step 5 — Regression check. @@ -291,7 +296,6 @@ def run_evaluation(args: argparse.Namespace) -> int: # promotion (exit code 1) unless --promote-if-better is set and the # run actually improved overall. # ------------------------------------------------------------------ - flat = _metrics_to_flat(metrics) if baseline: regressions = _detect_regressions(flat, baseline) From 0c911b3d5d0e02d8c47b829d63dedd133b0ed8f5 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 15:12:02 +0200 Subject: [PATCH 47/48] Apply ruff format to retrieval_metrics.py --- fireflyframework_agentic/evaluation/retrieval_metrics.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fireflyframework_agentic/evaluation/retrieval_metrics.py b/fireflyframework_agentic/evaluation/retrieval_metrics.py index df42ab24..7c9c5cfe 100644 --- a/fireflyframework_agentic/evaluation/retrieval_metrics.py +++ b/fireflyframework_agentic/evaluation/retrieval_metrics.py @@ -47,6 +47,7 @@ import math + def _dedup(retrieved: list[dict]) -> list[dict]: """Return one entry per source, first chunk wins, preserving rank order.""" seen: set[str] = set() @@ -154,9 +155,7 @@ def no_answer_rate(results: list[dict]) -> float | None: """Fraction of queries where the model produced no answer. 
None if no results.""" if not results: return None - count = sum( - 1 for row in results if row.get("no_answer") or not row.get("answer", "").strip() - ) + count = sum(1 for row in results if row.get("no_answer") or not row.get("answer", "").strip()) return round(count / len(results), 4) @@ -175,5 +174,3 @@ def mean_latency_ms(results: list[dict], field: str) -> float | None: """Mean latency in ms for the given field (``search_ms`` or ``answer_ms``). None if absent.""" values = [row[field] for row in results if row.get(field) is not None] return round(sum(values) / len(values)) if values else None - - From ef16882e83038856c182c67ad0818446c135ea2f Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 15:15:10 +0200 Subject: [PATCH 48/48] Apply ruff format to test_retrieval_metrics.py --- tests/unit/evaluation/test_retrieval_metrics.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/unit/evaluation/test_retrieval_metrics.py b/tests/unit/evaluation/test_retrieval_metrics.py index ef38467f..fa453e2d 100644 --- a/tests/unit/evaluation/test_retrieval_metrics.py +++ b/tests/unit/evaluation/test_retrieval_metrics.py @@ -179,5 +179,3 @@ def test_mean_latency_computed_when_present(): rows = [{**_row(gold_rank=1), "search_ms": 100.0, "answer_ms": 200.0}] assert mean_latency_ms(rows, "search_ms") == 100 assert mean_latency_ms(rows, "answer_ms") == 200 - -