From a1d28a597ad87559dad0e26a2f266cf516553d21 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 09:24:24 +0200 Subject: [PATCH 1/3] remove examples/flyradar_eval_example.py --- examples/flyradar_eval_example.py | 406 ------------------------------ 1 file changed, 406 deletions(-) delete mode 100644 examples/flyradar_eval_example.py diff --git a/examples/flyradar_eval_example.py b/examples/flyradar_eval_example.py deleted file mode 100644 index 706528f4..00000000 --- a/examples/flyradar_eval_example.py +++ /dev/null @@ -1,406 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""FlyRadar evaluation example — gate-based process-mining quality gate. - -Demonstrates how to use ``fireflyframework_agentic.evaluation`` to replicate -the flyradar experiment quality-gate workflow: - -1. Load a must-find registry (the gold standard items the model must discover). -2. Load a DiscoveryResult produced by a flyradar pipeline run. -3. Run gates G1-G5 to produce a structured verdict: - G1 -- Structural & Safe (schema validity, PII, empty-registry guard). - G2 -- Recall & Precision (must-find recall floor, NC precision). - G3 -- Grounded (finding-to-evidence anchoring). - G4 -- LLM-as-a-Judge (advisory only; never blocks promotion). - G5 -- No-regression / promotion (champion/challenger comparison). -4. Render a human-readable scorecard and print the final verdict. -5. Promote the challenger to champion when the verdict is PROMOTE. - -Usage:: - - # Minimal: deterministic gates only (no G4 judge, no baseline) - python examples/flyradar_eval_example.py \\ - --result output.json \\ - --registry registry.json - - # With corpus verification and a champion baseline - python examples/flyradar_eval_example.py \\ - --result output.json \\ - --registry registry.json \\ - --baseline baseline.json \\ - --corpus input.json - - # With the advisory G4 LLM judge (requires API key in environment) - FLYEVAL_JUDGE_MODEL=anthropic:claude-sonnet-4-6 \\ - python examples/flyradar_eval_example.py \\ - --result output.json \\ - --registry registry.json \\ - --judge-model anthropic:claude-sonnet-4-6 - -Exit codes: 0 = PROMOTE, 1 = HOLD. - -Input file formats ------------------- -``--result`` (output.json) - A DiscoveryResult JSON produced by a flyradar pipeline run. Must contain - at minimum ``findings`` (list) and ``evidence_index`` (list). - -``--registry`` (registry.json) - A lean-1 registry JSON. Each item has ``id``, ``tier`` (L0-L3), ``title``, - ``description``, and ``nc`` (bool, True for negative controls). - -``--baseline`` (baseline.json) - A ChampionRecord JSON written by a previous PROMOTE run. When omitted the - gate runs in day-zero mode (G5 always passes and a new champion is minted). - -``--corpus`` (input.json) - The corpus bundle used during the run. When supplied, G3 verifies that cited - evidence excerpts actually appear in the corpus documents. -""" - -from __future__ import annotations - -import argparse -import json -import sys -from pathlib import Path - -from fireflyframework_agentic.evaluation import ( - ChampionRecord, - GateResult, - build_embedder, - load_champion, - load_corpus, - load_registry, - render_scorecard, - run_gates, - run_judge, - save_champion, - verdict, - VERDICT_PROMOTE, -) -from fireflyframework_agentic.evaluation.models import EvalConfig - - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - - -def _load_json(path: str) -> dict: - """Read a JSON file and return its contents as a dict.""" - return json.loads(Path(path).read_text(encoding="utf-8")) - - -def _lexical_missed_ids(result: dict, registry) -> list[str]: - """Return the IDs of registry items not matched by any finding (lexically). - - The G4 judge uses these to focus its coverage checks on items that - lexical recall missed — the places where semantic recovery matters most. - """ - from fireflyframework_agentic.evaluation.matcher import matches - - evidence_index = {ev["id"]: ev for ev in result.get("evidence_index", []) if ev.get("id")} - findings = result.get("findings", []) - # L3 items are informational-only and are never scored. - scored_items = [item for item in registry.real_items if item.tier != "L3"] - return [ - item.id - for item in scored_items - if not any(matches(f, item, evidence_index) for f in findings) - ] - - -# --------------------------------------------------------------------------- -# Main evaluation flow -# --------------------------------------------------------------------------- - - -def run_evaluation(args: argparse.Namespace) -> int: - """Run the full flyradar gate evaluation and return an exit code.""" - - # ------------------------------------------------------------------ - # Step 1 — Load inputs. - # ------------------------------------------------------------------ - print(f"Loading result : {args.result}") - result = _load_json(args.result) - - print(f"Loading registry : {args.registry}") - registry = load_registry(args.registry) - print(f" {len(registry.real_items)} real items, {len(registry.nc_items)} NC items") - - # The EvalConfig captures provenance for the run record. - config = EvalConfig( - model_id=args.model_id, - corpus=registry.corpus, - run_id=args.run_id, - registry_path=args.registry, - corpus_path=args.corpus or "", - baseline_path=args.baseline or "", - judge_model=args.judge_model or "", - ) - - # Optional: corpus bundle for deterministic evidence verification (G3). - corpus = None - if args.corpus: - print(f"Loading corpus : {args.corpus}") - corpus = load_corpus(args.corpus) - - # Optional: champion record for regression detection (G5). - champion = None - champion_scores = None - aa_noise = None - if args.baseline: - print(f"Loading baseline : {args.baseline}") - champion = load_champion(args.baseline) - if champion: - champion_scores = champion.scores - aa_noise = champion.aa_noise - print(f" Champion run : {champion.run_id} ({champion.model_id})") - else: - print(" No champion found — running in day-zero mode.") - - # Optional: embedder for semantic/hybrid recall (G2). - embed_fn = None - if args.embedder: - print(f"Building embedder: {args.embedder}") - embed_fn = build_embedder(args.embedder) - - print() - - # ------------------------------------------------------------------ - # Step 2 — Run deterministic gates G1-G3 + G5. - # - # run_gates() returns a list of GateResult objects, one per gate. - # Each GateResult carries: - # .gate -- "G1" | "G2" | "G3" | "G5" - # .passed -- bool - # .details -- dict with per-metric values - # .errors -- list[str] of blocking error codes - # ------------------------------------------------------------------ - print("Running gates G1-G3 + G5 ...") - gate_results: list[GateResult] = run_gates( - result, - registry, - args.registry, - pii_list=args.pii_list or [], - recall_floor=args.recall_floor, - grounding_floor=args.grounding_floor, - champion_scores=champion_scores, - aa_noise=aa_noise, - is_day_zero=(champion is None), - human_signed_off=args.human_signed_off, - signoff_count=args.signoffs, - embed_fn=embed_fn, - tau=args.tau, - recall_metric=args.recall_metric, - tau_nc=args.tau_nc, - corpus=corpus, - ) - - # Quick gate summary before the full scorecard. - for gr in gate_results: - status = "PASS" if gr.passed else "FAIL" - print(f" {gr.gate}: {status}") - - # ------------------------------------------------------------------ - # Step 3 — Run the advisory G4 LLM-as-a-Judge (optional). - # - # G4 is non-blocking: it never changes the verdict or exit code. - # It produces an AdvisoryReport with per-finding quality signals - # (faithfulness, citation relevance, fabricated entities, etc.). - # ------------------------------------------------------------------ - advisory = None - if args.judge_model: - print(f"\nRunning G4 judge ({args.judge_model}) ...") - missed_ids = _lexical_missed_ids(result, registry) - advisory = run_judge( - result, - registry, - judge_model=args.judge_model, - runs=args.judge_runs, - concurrency=args.judge_concurrency, - pipeline_model=args.model_id, - embed_fn=embed_fn, - tau=args.tau, - lexical_missed_ids=missed_ids, - ) - print(f" Judge completed ({args.judge_runs} run(s)).") - else: - print("\nG4 judge skipped (pass --judge-model to enable).") - - # ------------------------------------------------------------------ - # Step 4 — Render the scorecard. - # - # render_scorecard() produces a markdown-formatted human-readable - # report that mirrors the output of `flyeval gate` in the playground. - # ------------------------------------------------------------------ - print() - scorecard = render_scorecard( - gate_results, - corpus=registry.corpus, - model_id=config.model_id, - run_id=config.run_id, - is_self_graded=True, - kappa_advisory=registry.is_kappa_advisory(), - evidence_unverified=(corpus is None), - advisory=advisory, - ) - print(scorecard) - - # ------------------------------------------------------------------ - # Step 5 — Inspect the verdict and handle promotion. - # - # verdict() returns "PROMOTE" or "HOLD" based on the gate results. - # On PROMOTE, save the challenger as the new champion so future runs - # can detect regressions against this baseline. - # ------------------------------------------------------------------ - v = verdict(gate_results) - print(f"\nFinal verdict: {v}") - - if v == VERDICT_PROMOTE and args.baseline: - # Extract the key scores from G2 and G3 to store in the champion record. - g2 = next((g for g in gate_results if g.gate == "G2"), None) - g3 = next((g for g in gate_results if g.gate == "G3"), None) - scores: dict[str, float] = {} - if g2: - scores["recall"] = g2.details.get("recall", 0.0) - if g3: - scores["grounding_pct"] = g3.details.get("grounding_pct", 0.0) - - new_champion = ChampionRecord( - corpus=registry.corpus, - run_id=config.run_id, - model_id=config.model_id, - registry_sha256=registry.sha256(), - scores=scores, - is_day_zero=(champion is None), - ) - save_champion( - args.baseline, - new_champion, - summary=f"Promoted by flyradar_eval_example.py — {config.run_id}", - ) - print(f"Champion saved to {args.baseline}") - - # Exit 0 = PROMOTE, 1 = HOLD (mirrors `flyeval gate` convention). - return 0 if v == VERDICT_PROMOTE else 1 - - -# --------------------------------------------------------------------------- -# CLI -# --------------------------------------------------------------------------- - - -def build_parser() -> argparse.ArgumentParser: - p = argparse.ArgumentParser( - prog="flyradar_eval_example", - description="FlyRadar gate evaluation — replicates the flyeval gate workflow.", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - # Required inputs. - p.add_argument("--result", required=True, help="Path to DiscoveryResult JSON.") - p.add_argument("--registry", required=True, help="Path to lean-1 registry JSON.") - - # Optional inputs. - p.add_argument( - "--baseline", - help="Path to baseline.json (champion store). When absent, runs in day-zero mode.", - ) - p.add_argument( - "--corpus", - help="Path to input.json corpus bundle for deterministic evidence verification (G3).", - ) - - # Run metadata. - p.add_argument("--model-id", default="unknown", help="Model identifier for the scorecard.") - p.add_argument("--run-id", default="example-run", help="Run identifier for the scorecard.") - - # Gate thresholds. - p.add_argument( - "--recall-floor", - type=float, - default=0.70, - help="Minimum recall required for G2 to pass.", - ) - p.add_argument( - "--grounding-floor", - type=float, - default=0.90, - help="Minimum grounding percentage required for G3 to pass.", - ) - p.add_argument( - "--recall-metric", - choices=["lexical", "semantic", "hybrid"], - default="lexical", - help="Recall metric used by G2. 'semantic' and 'hybrid' require --embedder.", - ) - p.add_argument( - "--tau", - type=float, - default=0.70, - help="Cosine similarity threshold for semantic recall (real items).", - ) - p.add_argument( - "--tau-nc", - type=float, - default=0.85, - help="Cosine similarity threshold for NC item detection.", - ) - p.add_argument("--pii-list", nargs="*", default=[], help="PII tokens to check for in findings.") - p.add_argument("--human-signed-off", action="store_true", help="Mark this run as human-reviewed.") - p.add_argument("--signoffs", type=int, default=0, help="Number of human sign-offs collected.") - - # G4 judge options. - p.add_argument( - "--judge-model", - default=None, - help=( - "Provider:model string for the advisory G4 LLM judge " - "(e.g. 'anthropic:claude-sonnet-4-6'). Omit to skip G4." - ), - ) - p.add_argument( - "--judge-runs", - type=int, - default=1, - help="Number of judge calls to aggregate (odd number recommended for median).", - ) - p.add_argument( - "--judge-concurrency", - type=int, - default=1, - help="Thread fan-out for per-item G4 metrics (1 = sequential).", - ) - - # Embedder for semantic recall. - p.add_argument( - "--embedder", - default=None, - help="Embedder spec for semantic/hybrid recall (e.g. 'ollama:bge-m3').", - ) - - return p - - -def main() -> None: - parser = build_parser() - args = parser.parse_args() - sys.exit(run_evaluation(args)) - - -if __name__ == "__main__": - main() From 61617186f1ed103c783197784497dd841a260b43 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 09:24:27 +0200 Subject: [PATCH 2/3] ci: add --extra evaluation to typecheck and test sync steps --- .github/workflows/pr-gate.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr-gate.yml b/.github/workflows/pr-gate.yml index c0ef76d4..86e35717 100644 --- a/.github/workflows/pr-gate.yml +++ b/.github/workflows/pr-gate.yml @@ -57,7 +57,7 @@ jobs: - uses: actions/setup-python@v6 with: python-version: '3.13' - - run: uv sync --extra dev --extra binary --extra vectorstores-sqlite-vec --extra openai-embeddings + - run: uv sync --extra dev --extra binary --extra vectorstores-sqlite-vec --extra openai-embeddings --extra evaluation - run: uv run pyright test: @@ -72,7 +72,7 @@ jobs: - uses: actions/setup-python@v6 with: python-version: '3.13' - - run: uv sync --extra dev --extra binary --extra vectorstores-sqlite-vec --extra vectorstores-pgvector --extra openai-embeddings + - run: uv sync --extra dev --extra binary --extra vectorstores-sqlite-vec --extra vectorstores-pgvector --extra openai-embeddings --extra evaluation - run: uv run pytest -m "not nightly" --cov --cov-report=term-missing build: From 203134ca971377816c462b7d4c5125d9ebc9d4e0 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 09:24:32 +0200 Subject: [PATCH 3/3] fix(evaluation): resolve all ruff lint errors (import sort, SIM108, B905, N806, UP035) --- examples/flycanon_eval_example.py | 13 +-- .../evaluation/__init__.py | 21 ++++- fireflyframework_agentic/evaluation/cli.py | 42 +++++----- fireflyframework_agentic/evaluation/corpus.py | 20 ++--- fireflyframework_agentic/evaluation/gates.py | 42 +++------- fireflyframework_agentic/evaluation/judge.py | 79 ++++++++----------- .../evaluation/judge_client.py | 25 ++---- .../evaluation/matcher.py | 60 +++++++------- .../evaluation/registry.py | 40 +++++----- .../evaluation/run_config_snapshot.py | 9 +-- .../evaluation/scorecard.py | 44 +++-------- fireflyframework_agentic/evaluation/stats.py | 9 +-- uv.lock | 59 +++++++++++++- 13 files changed, 220 insertions(+), 243 deletions(-) diff --git a/examples/flycanon_eval_example.py b/examples/flycanon_eval_example.py index 9d8d071b..856b520b 100644 --- a/examples/flycanon_eval_example.py +++ b/examples/flycanon_eval_example.py @@ -94,8 +94,7 @@ import sys from pathlib import Path -from fireflyframework_agentic.evaluation import RetrieverMetrics, compute_retrieval_metrics - +from fireflyframework_agentic.evaluation import RetrieverMetrics # --------------------------------------------------------------------------- # Helpers @@ -171,10 +170,7 @@ def _print_metrics_table(metrics: RetrieverMetrics, baseline: dict | None) -> No if value is None: continue # Format floats as 4 decimal places; ints as plain integers. - if isinstance(value, float): - cur_str = f"{value:.4f}" - else: - cur_str = str(value) + cur_str = f"{value:.4f}" if isinstance(value, float) else str(value) row = f"{key:<{col_w}} {cur_str:>{num_w}}" if baseline and key in baseline and isinstance(value, float): @@ -353,10 +349,7 @@ def build_parser() -> argparse.ArgumentParser: p.add_argument( "--baseline", default=None, - help=( - "Path to baseline.json (champion store). When absent, scores are printed " - "without comparison." - ), + help=("Path to baseline.json (champion store). When absent, scores are printed without comparison."), ) p.add_argument( "--promote-if-better", diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py index ad01980c..d986d09f 100644 --- a/fireflyframework_agentic/evaluation/__init__.py +++ b/fireflyframework_agentic/evaluation/__init__.py @@ -31,16 +31,29 @@ from importlib.metadata import PackageNotFoundError, version -from fireflyframework_agentic.evaluation.corpus import EMPTY, FABRICATED, SOURCE_UNKNOWN, VERIFIED, corpus_sha256, load_corpus, verify_evidence_index +from fireflyframework_agentic.evaluation.champion import ( + ChampionRecord, + invalidate_champion, + load_champion, + save_champion, +) +from fireflyframework_agentic.evaluation.corpus import ( + EMPTY, + FABRICATED, + SOURCE_UNKNOWN, + VERIFIED, + corpus_sha256, + load_corpus, + verify_evidence_index, +) from fireflyframework_agentic.evaluation.gates import GateResult, Verdict, g2_recall_precision, run_gates -from fireflyframework_agentic.evaluation.scorecard import render_scorecard, verdict, VERDICT_PROMOTE, VERDICT_HOLD -from fireflyframework_agentic.evaluation.champion import ChampionRecord, invalidate_champion, load_champion, save_champion from fireflyframework_agentic.evaluation.judge import AdvisoryReport, run_judge from fireflyframework_agentic.evaluation.judge_client import JudgeClient, OllamaEmbedder, build_embedder, cosine from fireflyframework_agentic.evaluation.matcher import anchored, matches, source_stem, tokens from fireflyframework_agentic.evaluation.registry import Registry, RegistryItem, load_registry, registry_sha256 -from fireflyframework_agentic.lab.retrieval_metrics import RetrieverMetrics, compute_retrieval_metrics +from fireflyframework_agentic.evaluation.scorecard import VERDICT_HOLD, VERDICT_PROMOTE, render_scorecard, verdict from fireflyframework_agentic.evaluation.stats import aa_band, aggregate_grounding, left_skew_flag +from fireflyframework_agentic.lab.retrieval_metrics import RetrieverMetrics, compute_retrieval_metrics try: __version__ = version("fireflyframework-agentic") diff --git a/fireflyframework_agentic/evaluation/cli.py b/fireflyframework_agentic/evaluation/cli.py index 7ac868d9..80dc418a 100644 --- a/fireflyframework_agentic/evaluation/cli.py +++ b/fireflyframework_agentic/evaluation/cli.py @@ -48,7 +48,8 @@ from fireflyframework_agentic.evaluation.judge_client import build_embedder from fireflyframework_agentic.evaluation.matcher import matches from fireflyframework_agentic.evaluation.registry import load_registry -from fireflyframework_agentic.evaluation.scorecard import render_scorecard, verdict as get_verdict +from fireflyframework_agentic.evaluation.scorecard import render_scorecard +from fireflyframework_agentic.evaluation.scorecard import verdict as get_verdict from fireflyframework_agentic.evaluation.stats import aa_band, left_skew_flag @@ -114,10 +115,8 @@ def _eval_config(args, registry, corpus=None) -> dict: "champion (EMPTY_MUST_FIND)", "registry_sha256_pin": "loaded registry matches its file hash (GOLD_DRIFT)", "corpus_sha256_pin": "corpus matches its hash when supplied (CORPUS_DRIFT)", - "schema_valid": "required top-level keys present in the result " - "(SCHEMA_INVALID)", - "pii_non_disclosure": "no corpus PII name appears in any finding/report text " - "(PII_LEAK)", + "schema_valid": "required top-level keys present in the result (SCHEMA_INVALID)", + "pii_non_disclosure": "no corpus PII name appears in any finding/report text (PII_LEAK)", }, }, "G2": { @@ -142,14 +141,10 @@ def _eval_config(args, registry, corpus=None) -> dict: "human_spot_check_n": 5, "corpus_verification": corpus is not None, "metrics": { - "grounding_pct": "findings whose cited excerpt shares a topic token; blocks " - "below grounding_floor", - "evidence_verified": "cited excerpts located in the actual corpus " - "(when supplied)", - "evidence_fabricated": "populated excerpts not found in their cited source " - "(EVIDENCE_FABRICATED)", - "evidence_source_unknown": "locators resolving to no corpus document " - "(EVIDENCE_SOURCE_UNKNOWN)", + "grounding_pct": "findings whose cited excerpt shares a topic token; blocks below grounding_floor", + "evidence_verified": "cited excerpts located in the actual corpus (when supplied)", + "evidence_fabricated": "populated excerpts not found in their cited source (EVIDENCE_FABRICATED)", + "evidence_source_unknown": "locators resolving to no corpus document (EVIDENCE_SOURCE_UNKNOWN)", "excerpt_fill_rate": "evidence entries carrying a populated excerpt", "source_coverage": "distinct corpus documents cited", }, @@ -173,8 +168,7 @@ def _eval_config(args, registry, corpus=None) -> dict: "severity_calibration": "stated severity matches the evidence", "answer_relevancy": "output addresses the workspace intention", "source_coverage": "distinct corpus documents cited (deterministic)", - "excerpt_fill_rate": "evidence entries with a populated excerpt " - "(deterministic)", + "excerpt_fill_rate": "evidence entries with a populated excerpt (deterministic)", }, }, "G5": { @@ -305,9 +299,12 @@ def cmd_aa_band(args: argparse.Namespace) -> int: for rp in args.results: result = _load_json(rp) g2 = g2_recall_precision( - result, registry, - recall_metric=args.recall_metric, embed_fn=embed_fn, - tau=args.tau, tau_nc=args.tau_nc, + result, + registry, + recall_metric=args.recall_metric, + embed_fn=embed_fn, + tau=args.tau, + tau_nc=args.tau_nc, corpus=corpus, ) if g2.passed or g2.details.get("recall") is not None: @@ -468,15 +465,13 @@ def _add_common(p: argparse.ArgumentParser) -> None: "--tau", type=float, default=float(os.environ.get("FLYEVAL_TAU", "0.70")), - help="cosine similarity threshold for the semantic recall path (real items). " - "Env: FLYEVAL_TAU", + help="cosine similarity threshold for the semantic recall path (real items). Env: FLYEVAL_TAU", ) p_gate.add_argument( "--tau-nc", type=float, default=float(os.environ.get("FLYEVAL_TAU_NC", "0.85")), - help="cosine similarity threshold for NC item detection (higher; no source anchor). " - "Env: FLYEVAL_TAU_NC", + help="cosine similarity threshold for NC item detection (higher; no source anchor). Env: FLYEVAL_TAU_NC", ) p_gate.add_argument("--human-signed-off", action="store_true") p_gate.add_argument("--signoffs", type=int, default=0) @@ -495,8 +490,7 @@ def _add_common(p: argparse.ArgumentParser) -> None: "--judge-runs", type=int, default=int(os.environ.get("FLYEVAL_JUDGE_RUNS", "1")), - help="G4 judge runs; the median of numeric scores is kept (odd recommended). " - "Env: FLYEVAL_JUDGE_RUNS", + help="G4 judge runs; the median of numeric scores is kept (odd recommended). Env: FLYEVAL_JUDGE_RUNS", ) p_gate.add_argument( "--judge-concurrency", diff --git a/fireflyframework_agentic/evaluation/corpus.py b/fireflyframework_agentic/evaluation/corpus.py index 32835f2c..34926b41 100644 --- a/fireflyframework_agentic/evaluation/corpus.py +++ b/fireflyframework_agentic/evaluation/corpus.py @@ -80,7 +80,7 @@ def normalize(text: str) -> str: smart quotes, collapse whitespace, casefold.""" text = unicodedata.normalize("NFKC", text) text = text.replace("**", "").replace("*", "") - text = re.sub(r"[\"""''']", "", text) + text = re.sub(r"[\"" "''']", "", text) return re.sub(r"\s+", " ", text).strip().casefold() @@ -129,9 +129,7 @@ def load_corpus(path: str | Path) -> Corpus: def _fragment_coverage(fragment: str, source: str) -> float: """Fraction of fragment covered by matching blocks of >= _MIN_BLOCK_CHARS chars.""" - blocks = difflib.SequenceMatcher( - None, fragment, source, autojunk=False - ).get_matching_blocks() + blocks = difflib.SequenceMatcher(None, fragment, source, autojunk=False).get_matching_blocks() covered = sum(b.size for b in blocks if b.size >= _MIN_BLOCK_CHARS) return covered / len(fragment) @@ -158,11 +156,9 @@ def verify_entry(corpus: Corpus, entry: dict) -> str: if not excerpt: return EMPTY - fragments = [ - f.strip() - for f in _SPLICE_PATTERN.split(excerpt) - if len(f.strip()) >= _MIN_FRAGMENT_CHARS - ] or [excerpt] + fragments = [f.strip() for f in _SPLICE_PATTERN.split(excerpt) if len(f.strip()) >= _MIN_FRAGMENT_CHARS] or [ + excerpt + ] for fragment in fragments: if fragment in source: @@ -178,8 +174,4 @@ def verify_evidence_index(corpus: Corpus, result: dict) -> dict[str, str]: Returns {evidence_id: status} over all entries — referenced or not — so the gates share one verification pass. """ - return { - ev["id"]: verify_entry(corpus, ev) - for ev in result.get("evidence_index", []) - if ev.get("id") - } + return {ev["id"]: verify_entry(corpus, ev) for ev in result.get("evidence_index", []) if ev.get("id")} diff --git a/fireflyframework_agentic/evaluation/gates.py b/fireflyframework_agentic/evaluation/gates.py index 057bfea7..fc98d311 100644 --- a/fireflyframework_agentic/evaluation/gates.py +++ b/fireflyframework_agentic/evaluation/gates.py @@ -93,11 +93,7 @@ def _build_evidence_index(result: dict, corpus: Corpus | None = None) -> dict[st if corpus is None: return index statuses = verify_evidence_index(corpus, result) - return { - eid: ev - for eid, ev in index.items() - if statuses[eid] in (VERIFIED, EMPTY) - } + return {eid: ev for eid, ev in index.items() if statuses[eid] in (VERIFIED, EMPTY)} # ── G1: Structural & Safe ──────────────────────────────────────────────────── @@ -322,8 +318,10 @@ def _finding_redundancy_rate(findings: list[dict]) -> float: """Fraction of findings that are near-duplicates of another (Jaccard ≥0.6 on ≥5-char tokens).""" if len(findings) < 2: return 0.0 + def _tok(text: str) -> frozenset[str]: return frozenset(t.lower() for t in text.split() if len(t) >= 5) + token_sets = [_tok(f.get("description", "")) for f in findings] in_redundant: set[int] = set() for i in range(len(token_sets)): @@ -381,9 +379,7 @@ def g2_recall_precision( if item.tier == "NC": lexical[item.id] = False elif item.scope == "dependency_graph" and item.from_node: - lexical[item.id] = matcher.matches_dependency_graph_relation( - item, result, evidence_index - ) + lexical[item.id] = matcher.matches_dependency_graph_relation(item, result, evidence_index) else: lexical[item.id] = any( matches(c, item, evidence_index, scope=scope) @@ -394,14 +390,10 @@ def g2_recall_precision( if recall_metric not in ("lexical", "semantic", "hybrid"): raise ValueError(f"unknown recall_metric {recall_metric!r}") if recall_metric in ("semantic", "hybrid") and embed_fn is None: - raise ValueError( - f"recall_metric={recall_metric!r} requires an embedder; pass embed_fn" - ) + raise ValueError(f"recall_metric={recall_metric!r} requires an embedder; pass embed_fn") if embed_fn is not None: - semantic = matcher.semantic_hits( - candidates, registry.items, evidence_index, embed_fn, tau, tau_nc=tau_nc - ) + semantic = matcher.semantic_hits(candidates, registry.items, evidence_index, embed_fn, tau, tau_nc=tau_nc) # dependency_graph relation items have no embedding candidates (§5.3b uses # the endpoint matcher, not per-candidate text embeddings); mirror the # lexical result so semantic/hybrid never under-credits them. @@ -424,8 +416,7 @@ def g2_recall_precision( finding_count = len(findings) finding_scoped_items = [i for i in registry.real_items if i.scope == "finding"] findings_matched = sum( - 1 for f in findings - if any(matches(f, item, evidence_index, scope="finding") for item in finding_scoped_items) + 1 for f in findings if any(matches(f, item, evidence_index, scope="finding") for item in finding_scoped_items) ) _sn = { "finding_count": finding_count, @@ -493,9 +484,7 @@ def _semantic_details() -> dict: "lexical_recall": round(_weighted_recall(scored_items, lexical), 4), "semantic_recall": round(_weighted_recall(scored_items, semantic), 4), "hybrid_recall": round( - _weighted_recall( - scored_items, {iid: lexical[iid] or semantic[iid] for iid in lexical} - ), + _weighted_recall(scored_items, {iid: lexical[iid] or semantic[iid] for iid in lexical}), 4, ), "tau": tau, @@ -577,8 +566,8 @@ def g3_grounded( grounded_ids: list[str] = [] # Ungrounded split (§6.3): distinguish format issues from real faithfulness failures. - ungrounded_empty_only: list[str] = [] # every ref had an empty excerpt - ungrounded_populated: list[str] = [] # had populated excerpt(s) but none anchored + ungrounded_empty_only: list[str] = [] # every ref had an empty excerpt + ungrounded_populated: list[str] = [] # had populated excerpt(s) but none anchored # Excerpt fill: count all resolved refs and how many carry a non-empty excerpt. total_refs = 0 @@ -657,18 +646,14 @@ def g3_grounded( "Populated excerpt(s) not found in the cited corpus document — " "the run asserts evidence the source does not contain." ) - return GateResult( - gate="G3", passed=False, reason_code="EVIDENCE_FABRICATED", details=details - ) + return GateResult(gate="G3", passed=False, reason_code="EVIDENCE_FABRICATED", details=details) if unknown_source_ids: details["message"] = ( "Evidence locator(s) resolve to no corpus document — either the " "corpus bundle is incomplete or the run invented a source." ) - return GateResult( - gate="G3", passed=False, reason_code="EVIDENCE_SOURCE_UNKNOWN", details=details - ) + return GateResult(gate="G3", passed=False, reason_code="EVIDENCE_SOURCE_UNKNOWN", details=details) if grounding_pct < grounding_floor: details["floor"] = grounding_floor @@ -746,8 +731,7 @@ def g5_no_regression( band = noise.get(metric, 0.0) if delta < -band: regressions.append( - f"{metric}: candidate={cand_val:.4f} champion={champ_val:.4f} " - f"delta={delta:+.4f} < -band={-band:.4f}" + f"{metric}: candidate={cand_val:.4f} champion={champ_val:.4f} delta={delta:+.4f} < -band={-band:.4f}" ) elif delta > band: improvements.append(f"{metric}: delta={delta:+.4f} > band={band:.4f}") diff --git a/fireflyframework_agentic/evaluation/judge.py b/fireflyframework_agentic/evaluation/judge.py index a347c8e1..80a90b04 100644 --- a/fireflyframework_agentic/evaluation/judge.py +++ b/fireflyframework_agentic/evaluation/judge.py @@ -142,10 +142,7 @@ def _map_chat(chat_fn, prompts, workers=1): results: list[dict] = [{} for _ in prompts] with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor: - futures = { - executor.submit(chat_fn, system, user): idx - for idx, (system, user) in enumerate(prompts) - } + futures = {executor.submit(chat_fn, system, user): idx for idx, (system, user) in enumerate(prompts)} for future in concurrent.futures.as_completed(futures): idx = futures[future] try: @@ -165,11 +162,7 @@ def source_coverage(result: dict) -> dict: source stems present in evidence_index but cited by no finding. """ evidence_index = _evidence_index(result) - all_stems = { - source_stem(ev.get("locator", "")) - for ev in result.get("evidence_index", []) - if ev.get("locator") - } + all_stems = {source_stem(ev.get("locator", "")) for ev in result.get("evidence_index", []) if ev.get("locator")} cited_stems: set[str] = set() for f in result.get("findings", []): for ref in f.get("evidence_refs", []): @@ -245,7 +238,7 @@ def semantic_recovery( cand_vecs = np.asarray(embed_fn(candidate_texts), dtype=np.float64) recovered: list[dict] = [] - for item, ivec in zip(missed_items, item_vecs): + for item, ivec in zip(missed_items, item_vecs, strict=False): best = max((cosine(ivec, cvec) for cvec in cand_vecs), default=0.0) if best >= tau: recovered.append({"id": item.id, "cosine": round(best, 4)}) @@ -307,11 +300,7 @@ def numeric_temporal_fidelity(result: dict, chat_fn, *, workers: int = 1) -> dic source}], count}. """ evidence_index = _evidence_index(result) - scored = [ - (f, excerpts) - for f in result.get("findings", []) - if (excerpts := _cited_excerpts(f, evidence_index)) - ] + scored = [(f, excerpts) for f in result.get("findings", []) if (excerpts := _cited_excerpts(f, evidence_index))] prompts = [ ( SYSTEM, @@ -326,7 +315,7 @@ def numeric_temporal_fidelity(result: dict, chat_fn, *, workers: int = 1) -> dic ] answers = _map_chat(chat_fn, prompts, workers) mismatches: list[dict] = [] - for (f, _excerpts), answer in zip(scored, answers): + for (f, _excerpts), answer in zip(scored, answers, strict=False): for m in answer.get("mismatches", []) or []: mismatches.append( { @@ -395,7 +384,7 @@ def nc_semantic_precision(result: dict, registry, chat_fn, *, workers: int = 1) ] answers = _map_chat(chat_fn, prompts, workers) asserted_ids = [ - item.id for item, a in zip(nc_items, answers) if str(a.get("asserted", "")).lower() == "yes" + item.id for item, a in zip(nc_items, answers, strict=False) if str(a.get("asserted", "")).lower() == "yes" ] return {"asserted": len(asserted_ids), "total": len(nc_items), "asserted_ids": asserted_ids} @@ -407,10 +396,7 @@ def fabricated_entity(result: dict, chat_fn) -> dict: excerpts + locators. """ output_text = _output_text(result) - corpus = "\n".join( - f"{ev.get('locator', '')} :: {ev.get('excerpt', '')}" - for ev in result.get("evidence_index", []) - ) + corpus = "\n".join(f"{ev.get('locator', '')} :: {ev.get('excerpt', '')}" for ev in result.get("evidence_index", [])) user = ( "List any system, organization, or metric NAMED in the OUTPUT that does NOT " "appear anywhere in the CORPUS EVIDENCE.\n" @@ -433,8 +419,7 @@ def contradiction(result: dict, chat_fn) -> dict: lines.append(f"{f.get('id', '?')}: {f.get('title', '')} — {f.get('description', '')}") user = ( "Are any two of these FINDINGS mutually contradictory? List each contradicting pair.\n" - 'Reply with ONLY {"pairs": [["", ""], ...]}. Empty list if none.\n\n' - + "\n".join(lines) + 'Reply with ONLY {"pairs": [["", ""], ...]}. Empty list if none.\n\n' + "\n".join(lines) ) pairs = chat_fn(SYSTEM, user).get("pairs", []) or [] return {"count": len(pairs), "pairs": [list(p) for p in pairs]} @@ -514,7 +499,7 @@ def severity_calibration(result: dict, chat_fn, *, workers: int = 1) -> dict: answers = _map_chat(chat_fn, prompts, workers) verdicts: dict[str, str] = {} miscalibrated = 0 - for f, a in zip(findings, answers): + for f, a in zip(findings, answers, strict=False): verdict = str(a.get("calibration", "calibrated")).lower() verdicts[f.get("id", "?")] = verdict if verdict in ("under", "over"): @@ -557,7 +542,7 @@ def surface_deduplication(result: dict, chat_fn, *, workers: int = 1) -> dict: def _toks(node: dict) -> frozenset[str]: return frozenset(node.get("name", "").lower().split()) - PER_SURFACE_CAP = 10 + per_surface_cap = 10 # candidates: (surface, node_a, node_b, parent_process_name) candidates: list[tuple[str, dict, dict, str]] = [] @@ -574,7 +559,7 @@ def _toks(node: dict) -> frozenset[str]: if jac >= 0.30: pairs.append((jac, procs[i], procs[j])) pairs.sort(key=lambda x: x[0], reverse=True) - for _jac, a, b in pairs[:PER_SURFACE_CAP]: + for _jac, a, b in pairs[:per_surface_cap]: candidates.append(("process", a, b, "")) # Activities and decisions: within the same parent process only @@ -595,7 +580,7 @@ def _toks(node: dict) -> frozenset[str]: if jac >= 0.30: all_pairs.append((jac, nodes[i], nodes[j], proc_name)) all_pairs.sort(key=lambda x: x[0], reverse=True) - for _jac, a, b, proc_name in all_pairs[:PER_SURFACE_CAP]: + for _jac, a, b, proc_name in all_pairs[:per_surface_cap]: candidates.append((surface_key, a, b, proc_name)) if not candidates: @@ -604,33 +589,37 @@ def _toks(node: dict) -> frozenset[str]: prompts = [] for surface, a, b, parent_proc in candidates: ctx = f"\nPARENT PROCESS: {parent_proc}\n" if parent_proc else "" - prompts.append(( - SYSTEM, - f"Are these two {surface} nodes genuinely DISTINCT process concepts, or is one a " - f"duplicate / sub-case / restatement of the other?\n" - f"{ctx}" - 'Reply with ONLY {"verdict": "DISTINCT" or "DUPLICATE", "reason": ""}.\n\n' - f"{surface.upper()} A: {a.get('name', '')} — {a.get('description', '')}\n" - f"{surface.upper()} B: {b.get('name', '')} — {b.get('description', '')}", - )) + prompts.append( + ( + SYSTEM, + f"Are these two {surface} nodes genuinely DISTINCT process concepts, or is one a " + f"duplicate / sub-case / restatement of the other?\n" + f"{ctx}" + 'Reply with ONLY {"verdict": "DISTINCT" or "DUPLICATE", "reason": ""}.\n\n' + f"{surface.upper()} A: {a.get('name', '')} — {a.get('description', '')}\n" + f"{surface.upper()} B: {b.get('name', '')} — {b.get('description', '')}", + ) + ) answers = _map_chat(chat_fn, prompts, workers) distinct = 0 redundant = 0 redundant_pairs: list[dict] = [] - for (surface, a, b, _parent), answer in zip(candidates, answers): + for (surface, a, b, _parent), answer in zip(candidates, answers, strict=False): verdict = str(answer.get("verdict", "")).upper() if verdict == "DISTINCT": distinct += 1 else: redundant += 1 - redundant_pairs.append({ - "surface": surface, - "a": a.get("name", ""), - "b": b.get("name", ""), - "reason": str(answer.get("reason", "")), - }) + redundant_pairs.append( + { + "surface": surface, + "a": a.get("name", ""), + "b": b.get("name", ""), + "reason": str(answer.get("reason", "")), + } + ) total = distinct + redundant return { @@ -800,9 +789,7 @@ def _run_judge_metric(name: str, fn) -> None: "numeric_temporal_fidelity", lambda: numeric_temporal_fidelity(result, chat_fn, workers=concurrency), ) - _run_judge_metric( - "citation_relevance", lambda: citation_relevance(result, chat_fn, workers=concurrency) - ) + _run_judge_metric("citation_relevance", lambda: citation_relevance(result, chat_fn, workers=concurrency)) _run_judge_metric( "nc_semantic_precision", lambda: nc_semantic_precision(result, registry, chat_fn, workers=concurrency), diff --git a/fireflyframework_agentic/evaluation/judge_client.py b/fireflyframework_agentic/evaluation/judge_client.py index 1af17f53..e4b58dea 100644 --- a/fireflyframework_agentic/evaluation/judge_client.py +++ b/fireflyframework_agentic/evaluation/judge_client.py @@ -245,8 +245,7 @@ def _dispatch(self, system: str, user: str, max_tokens: int) -> str: if self.provider == "ollama": return self._ollama(system, user, max_tokens) raise ValueError( - f"unknown judge provider {self.provider!r} in {self.model_spec!r}; " - "use anthropic:/openai:/azure:/ollama:" + f"unknown judge provider {self.provider!r} in {self.model_spec!r}; use anthropic:/openai:/azure:/ollama:" ) def _anthropic(self, system: str, user: str, max_tokens: int) -> str: @@ -262,9 +261,7 @@ def _anthropic(self, system: str, user: str, max_tokens: int) -> str: } headers = {"x-api-key": api_key, "anthropic-version": "2023-06-01"} resp = _http_post_json("https://api.anthropic.com/v1/messages", headers, body, self.timeout) - text = next( - (b.get("text") for b in resp.get("content", []) if b.get("type") == "text"), None - ) + text = next((b.get("text") for b in resp.get("content", []) if b.get("type") == "text"), None) if not text: raise RuntimeError(f"judge returned no text: {resp}") return text @@ -283,9 +280,7 @@ def _openai(self, system: str, user: str, max_tokens: int) -> str: ], } headers = {"Authorization": f"Bearer {api_key}"} - resp = _http_post_json( - "https://api.openai.com/v1/chat/completions", headers, body, self.timeout - ) + resp = _http_post_json("https://api.openai.com/v1/chat/completions", headers, body, self.timeout) return _extract_openai_text(resp) def _azure(self, system: str, user: str, max_tokens: int) -> str: @@ -297,10 +292,7 @@ def _azure(self, system: str, user: str, max_tokens: int) -> str: raise RuntimeError("AZURE_OPENAI_API_KEY not set") api_version = _env("AZURE_OPENAI_API_VERSION") or "2024-06-01" # Azure deployment lives in the URL path, not the JSON body. - url = ( - f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/chat/completions" - f"?api-version={api_version}" - ) + url = f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/chat/completions?api-version={api_version}" body = { "max_tokens": max_tokens, "temperature": 0.0, @@ -373,10 +365,7 @@ def embed(self, texts: list[str]) -> np.ndarray: if not api_key: raise RuntimeError("AZURE_OPENAI_API_KEY not set") api_version = _env("AZURE_OPENAI_API_VERSION") or "2024-06-01" - url = ( - f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/embeddings" - f"?api-version={api_version}" - ) + url = f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/embeddings?api-version={api_version}" headers = {"api-key": api_key} vectors = self._embed_with_split(texts, url, headers) return np.asarray(vectors, dtype=np.float32) @@ -438,9 +427,7 @@ def build_embedder(spec: str): return OpenAIEmbedder(model or "text-embedding-3-small").embed if provider == "azure": return AzureOpenAIEmbedder(model or "text-embedding-3-small").embed - raise NotImplementedError( - f"embedder backend {provider!r} not implemented yet; add it in build_embedder()" - ) + raise NotImplementedError(f"embedder backend {provider!r} not implemented yet; add it in build_embedder()") def cosine(a, b) -> float: diff --git a/fireflyframework_agentic/evaluation/matcher.py b/fireflyframework_agentic/evaluation/matcher.py index b4d81f44..ccf61c96 100644 --- a/fireflyframework_agentic/evaluation/matcher.py +++ b/fireflyframework_agentic/evaluation/matcher.py @@ -113,9 +113,7 @@ def _keyword_anchored(desc: str, keywords: list[str]) -> bool: if not keywords: return False desc_lower = desc.lower() - return any( - re.search(r"\b" + re.escape(kw.lower()) + r"\b", desc_lower) for kw in keywords - ) + return any(re.search(r"\b" + re.escape(kw.lower()) + r"\b", desc_lower) for kw in keywords) def candidate_text(candidate: dict, scope: str) -> str: @@ -141,18 +139,28 @@ def candidate_text(candidate: dict, scope: str) -> str: pain = candidate.get("pain_points") or [] goals_str = " ".join(goals) if isinstance(goals, list) else str(goals) pain_str = " ".join(pain) if isinstance(pain, list) else str(pain) - return " ".join(filter(None, [ - candidate.get("name", ""), - candidate.get("role", ""), - goals_str, - pain_str, - ])) + return " ".join( + filter( + None, + [ + candidate.get("name", ""), + candidate.get("role", ""), + goals_str, + pain_str, + ], + ) + ) if scope == "informal_channel": - return " ".join(filter(None, [ - candidate.get("name", ""), - candidate.get("usage_context", ""), - candidate.get("notes", ""), - ])) + return " ".join( + filter( + None, + [ + candidate.get("name", ""), + candidate.get("usage_context", ""), + candidate.get("notes", ""), + ], + ) + ) # process, decision, system, dependency_graph (diagnostic nodes) return " ".join(filter(None, [candidate.get("name", ""), candidate.get("description", "")])) @@ -246,9 +254,7 @@ def matches_dependency_graph_relation( def _anchor(endpoint_text: str) -> set[str]: return { - a["id"] - for a in all_activities - if a.get("id") and anchored(candidate_text(a, "activity"), endpoint_text) + a["id"] for a in all_activities if a.get("id") and anchored(candidate_text(a, "activity"), endpoint_text) } from_ids = _anchor(item.from_node) @@ -268,9 +274,8 @@ def _node_stems(node: dict) -> set[str]: dg = result.get("dependency_graph", {}) for edge in dg.get("activity_edges", []): - if edge.get("from_node") in from_ids and edge.get("to_node") in to_ids: - if _node_stems(edge) & item_stems: - return True + if edge.get("from_node") in from_ids and edge.get("to_node") in to_ids and _node_stems(edge) & item_stems: + return True for path in dg.get("critical_paths", []): if not (_node_stems(path) & item_stems): @@ -325,19 +330,13 @@ def semantic_hits( # Flatten all candidates across scopes, preserving their scope tag for # text extraction and per-item filtering. - scoped: list[tuple[str, dict]] = [ - (scope, cand) - for scope, cands in candidates.items() - for cand in cands - ] + scoped: list[tuple[str, dict]] = [(scope, cand) for scope, cands in candidates.items() for cand in cands] if not scoped: return {item.id: False for item in items} cand_texts = [candidate_text(cand, scope) for scope, cand in scoped] - item_texts = [ - " ".join([item.description or ""] + list(item.keywords or [])).strip() for item in items - ] + item_texts = [" ".join([item.description or ""] + list(item.keywords or [])).strip() for item in items] cand_vecs = np.asarray(embed_fn(cand_texts)) item_vecs = np.asarray(embed_fn(item_texts)) @@ -359,10 +358,7 @@ def semantic_hits( if cosine(cand_vecs[k], item_vec) >= tau_nc: hit = True break - elif ( - shares_source(cand, item, evidence_index) - and cosine(cand_vecs[k], item_vec) >= tau - ): + elif shares_source(cand, item, evidence_index) and cosine(cand_vecs[k], item_vec) >= tau: hit = True break hits[item.id] = hit diff --git a/fireflyframework_agentic/evaluation/registry.py b/fireflyframework_agentic/evaluation/registry.py index 2b869ba9..87c4beb1 100644 --- a/fireflyframework_agentic/evaluation/registry.py +++ b/fireflyframework_agentic/evaluation/registry.py @@ -24,6 +24,7 @@ - kappa present (0.0 placeholder allowed; G2 advisory until >= 0.70) - ABANCA DILO items must target a single measured sub-population """ + from __future__ import annotations import hashlib @@ -35,8 +36,15 @@ VALID_TIERS = ("L0", "L1", "L2", "L3", "NC") VALID_SCOPES = ( - "process", "activity", "decision", "finding", "action", - "persona", "system", "informal_channel", "dependency_graph", + "process", + "activity", + "decision", + "finding", + "action", + "persona", + "system", + "informal_channel", + "dependency_graph", ) SCHEMA_VERSION = "lean-1" KAPPA_ADVISORY_THRESHOLD = 0.70 @@ -47,13 +55,13 @@ class RegistryItem: id: str tier: Literal["L0", "L1", "L2", "L3", "NC"] description: str - evidence: list[str] # source file paths (path portion of locator, no #page=N) - scope: str = "finding" # which DiscoveryResult surface to match against (§4.3) + evidence: list[str] # source file paths (path portion of locator, no #page=N) + scope: str = "finding" # which DiscoveryResult surface to match against (§4.3) keywords: list[str] = field(default_factory=list) weight: float = 1.0 - from_node: str = "" # dependency_graph relation items only - to_node: str = "" # dependency_graph relation items only - relation: str = "" # defaults to "precedes" when from/to present + from_node: str = "" # dependency_graph relation items only + to_node: str = "" # dependency_graph relation items only + relation: str = "" # defaults to "precedes" when from/to present @dataclass(frozen=True) @@ -87,10 +95,7 @@ def sha256(self) -> str: def _validate(raw: dict, path: Path) -> None: if raw.get("schema_version") != SCHEMA_VERSION: - raise ValueError( - f"{path.name}: schema_version must be '{SCHEMA_VERSION}', " - f"got {raw.get('schema_version')!r}" - ) + raise ValueError(f"{path.name}: schema_version must be '{SCHEMA_VERSION}', got {raw.get('schema_version')!r}") for fname in ("corpus", "author", "date"): if not raw.get(fname): raise ValueError(f"{path.name}: missing required field '{fname}'") @@ -116,20 +121,17 @@ def _validate(raw: dict, path: Path) -> None: tier = it.get("tier") if tier not in VALID_TIERS: raise ValueError( - f"{path.name}: item '{it.get('id')}' has invalid tier '{tier}'; " - f"must be one of {VALID_TIERS}" + f"{path.name}: item '{it.get('id')}' has invalid tier '{tier}'; must be one of {VALID_TIERS}" ) scope = it.get("scope", "finding") if scope not in VALID_SCOPES: raise ValueError( - f"{path.name}: item '{it.get('id')}' has invalid scope '{scope}'; " - f"must be one of {VALID_SCOPES}" + f"{path.name}: item '{it.get('id')}' has invalid scope '{scope}'; must be one of {VALID_SCOPES}" ) if scope == "dependency_graph": if not it.get("from") or not it.get("to"): raise ValueError( - f"{path.name}: dependency_graph item '{it.get('id')}' must have " - "non-empty 'from' and 'to'" + f"{path.name}: dependency_graph item '{it.get('id')}' must have non-empty 'from' and 'to'" ) else: if "from" in it or "to" in it or "relation" in it: @@ -153,13 +155,13 @@ def _validate(raw: dict, path: Path) -> None: # ABANCA DILO blend guard: items must assert a single sub-population target. # Checks for phrases that would indicate a blended numeric target is asserted. # "blend" alone is too broad (items may reference it negatively). - BLEND_PHRASES = ("combined distribution", "across all offices regardless of segment") + blend_phrases = ("combined distribution", "across all offices regardless of segment") for it in items: if it.get("tier") == "NC": continue desc = it.get("description", "").lower() iid = it.get("id", "") - if any(phrase in desc for phrase in BLEND_PHRASES): + if any(phrase in desc for phrase in blend_phrases): raise ValueError( f"{path.name}: item '{iid}' description targets a blended distribution; " "ABANCA DILO items must target a single measured sub-population " diff --git a/fireflyframework_agentic/evaluation/run_config_snapshot.py b/fireflyframework_agentic/evaluation/run_config_snapshot.py index db543129..c029e8e6 100644 --- a/fireflyframework_agentic/evaluation/run_config_snapshot.py +++ b/fireflyframework_agentic/evaluation/run_config_snapshot.py @@ -32,6 +32,7 @@ --options request_options.json \ --commit c107918 """ + from __future__ import annotations import argparse @@ -133,12 +134,8 @@ def write_snapshot(output_dir: str | Path, config: dict) -> Path: def main(argv: list[str] | None = None) -> int: parser = argparse.ArgumentParser(description="Capture the flyradar run configuration.") parser.add_argument("--output-dir", required=True, help="Run directory holding output.json.") - parser.add_argument( - "--options", required=True, help="JSON file of the DiscoveryRequest options that were sent." - ) - parser.add_argument( - "--base-url", default=None, help="flyradar base URL (default: $FLYRADAR_BASE_URL)." - ) + parser.add_argument("--options", required=True, help="JSON file of the DiscoveryRequest options that were sent.") + parser.add_argument("--base-url", default=None, help="flyradar base URL (default: $FLYRADAR_BASE_URL).") parser.add_argument("--commit", default=None, help="Deployed flyradar git commit, if known.") args = parser.parse_args(argv) diff --git a/fireflyframework_agentic/evaluation/scorecard.py b/fireflyframework_agentic/evaluation/scorecard.py index b34885e8..da3c4a87 100644 --- a/fireflyframework_agentic/evaluation/scorecard.py +++ b/fireflyframework_agentic/evaluation/scorecard.py @@ -188,13 +188,9 @@ def _render_advisory(report) -> list[str]: d = m["faithfulness"] u = d.get("unsupported_ids", []) extra = f" (unsupported: {', '.join(u)})" if u else "" - lines.append( - f"Faithfulness (entailment): {d.get('supported')}/{d.get('total')} supported{extra}" - ) + lines.append(f"Faithfulness (entailment): {d.get('supported')}/{d.get('total')} supported{extra}") if "numeric_temporal_fidelity" in m: - lines.append( - f"Numeric/temporal fidelity: {m['numeric_temporal_fidelity'].get('count', 0)} mismatch(es)" - ) + lines.append(f"Numeric/temporal fidelity: {m['numeric_temporal_fidelity'].get('count', 0)} mismatch(es)") if "citation_relevance" in m: d = m["citation_relevance"] lines.append( @@ -218,14 +214,10 @@ def _render_advisory(report) -> list[str]: lines.append(f"Contradiction detection: {m['contradiction'].get('count', 0)}") if "actionability" in m: d = m["actionability"] - lines.append( - f"Actionability: {_num(d.get('score'))} (rated {d.get('rated', 0)})" - ) + lines.append(f"Actionability: {_num(d.get('score'))} (rated {d.get('rated', 0)})") if "severity_calibration" in m: d = m["severity_calibration"] - lines.append( - f"Severity calibration: {d.get('miscalibrated', 0)}/{d.get('total', 0)} miscalibrated" - ) + lines.append(f"Severity calibration: {d.get('miscalibrated', 0)}/{d.get('total', 0)} miscalibrated") if "answer_relevancy" in m: lines.append(f"Answer relevancy: {_num(m['answer_relevancy'].get('score'))}") if "comparative_vs_champion" in m: @@ -236,14 +228,10 @@ def _render_advisory(report) -> list[str]: d = m["source_coverage"] o = d.get("orphaned", []) extra = f" (orphaned: {', '.join(o)})" if o else "" - lines.append( - f"Source coverage [D]: {d.get('cited')}/{d.get('total')} documents cited{extra}" - ) + lines.append(f"Source coverage [D]: {d.get('cited')}/{d.get('total')} documents cited{extra}") if "excerpt_fill_rate" in m: d = m["excerpt_fill_rate"] - lines.append( - f"Evidence-excerpt fill [D]: {d.get('populated')}/{d.get('total')} populated" - ) + lines.append(f"Evidence-excerpt fill [D]: {d.get('populated')}/{d.get('total')} populated") if "open_gap" in m: gap = (m["open_gap"].get("gap") or "").strip() if gap: @@ -259,9 +247,7 @@ def _render_advisory(report) -> list[str]: json.dumps({"metrics": report.metrics, "details": report.details}, indent=2, default=str), "```", ] - lines.append( - "> Decision support for the G5 human sign-off; advisory until LLM-as-a-Judge calibration (§10)." - ) + lines.append("> Decision support for the G5 human sign-off; advisory until LLM-as-a-Judge calibration (§10).") lines.append("") return lines @@ -284,9 +270,7 @@ def _render_analysis(gate_results: list, advisory=None) -> list[str]: matched = d.get("findings_matched_to_registry", {}).get("fraction", 0.0) tier_summary = ", ".join( - f"{t} {v['hit']}/{v['total']}" - for t, v in tiers.items() - if "hit" in v and "total" in v + f"{t} {v['hit']}/{v['total']}" for t, v in tiers.items() if "hit" in v and "total" in v ) lines.append( f"Lexical recall is **{recall:.3f}** ({tier_summary}). " @@ -300,9 +284,7 @@ def _render_analysis(gate_results: list, advisory=None) -> list[str]: "The run is covering the same ground multiple times rather than broadening coverage." ) else: - lines.append( - f"Finding redundancy is low ({redundancy:.0%}): each finding addresses a distinct topic." - ) + lines.append(f"Finding redundancy is low ({redundancy:.0%}): each finding addresses a distinct topic.") lines.append( "_G2 is a topic-level test. A recall of 1.000 means every required topic was " "mentioned somewhere — it does not verify that the specific claims about those " @@ -453,14 +435,10 @@ def _render_analysis(gate_results: list, advisory=None) -> list[str]: flag_names = [g.gate for g in flags] if not flags: - lines.append( - "All deterministic gates pass. The run is ready for G5 human sign-off." - ) + lines.append("All deterministic gates pass. The run is ready for G5 human sign-off.") else: flag_str = ", ".join(flag_names) - lines.append( - f"The run is at **HOLD** due to flags on: {flag_str}. " - ) + lines.append(f"The run is at **HOLD** due to flags on: {flag_str}. ") for g in flags: if g.gate == "G3" and g.reason_code == "EVIDENCE_SOURCE_UNKNOWN": lines.append( diff --git a/fireflyframework_agentic/evaluation/stats.py b/fireflyframework_agentic/evaluation/stats.py index e70c629a..c622588c 100644 --- a/fireflyframework_agentic/evaluation/stats.py +++ b/fireflyframework_agentic/evaluation/stats.py @@ -23,10 +23,11 @@ aggregation bug where the previous runner inherited run 0's grounding report unchanged instead of merging across all runs. """ + from __future__ import annotations import statistics -from typing import Sequence +from collections.abc import Sequence def aa_band(scores: Sequence[float], *, percentile: int = 95) -> float: @@ -49,11 +50,7 @@ def aa_band(scores: Sequence[float], *, percentile: int = 95) -> float: scores = list(scores) if len(scores) < 2: raise ValueError(f"aa_band requires >= 2 reruns; got {len(scores)}") - deltas = [ - abs(x - y) - for i, x in enumerate(scores) - for y in scores[i + 1:] - ] + deltas = [abs(x - y) for i, x in enumerate(scores) for y in scores[i + 1 :]] sorted_deltas = sorted(deltas) # Index for the requested percentile; clamp to valid range idx = max(0, min(len(sorted_deltas) - 1, int(len(sorted_deltas) * percentile / 100))) diff --git a/uv.lock b/uv.lock index 7e3b501c..93e18075 100644 --- a/uv.lock +++ b/uv.lock @@ -1209,6 +1209,10 @@ dev = [ embeddings = [ { name = "numpy" }, ] +evaluation = [ + { name = "numpy" }, + { name = "scipy" }, +] google-embeddings = [ { name = "google-generativeai" }, ] @@ -1279,6 +1283,7 @@ requires-dist = [ { name = "mistralai", marker = "extra == 'mistral-embeddings'", specifier = ">=1.0.0" }, { name = "motor", marker = "extra == 'mongodb'", specifier = ">=3.6.0" }, { name = "numpy", marker = "extra == 'embeddings'", specifier = ">=1.26.0" }, + { name = "numpy", marker = "extra == 'evaluation'", specifier = ">=1.26.0" }, { name = "numpy", marker = "extra == 'reasoning-eval'", specifier = ">=2.0.0" }, { name = "openai", marker = "extra == 'azure-embeddings'", specifier = ">=1.0.0" }, { name = "openai", marker = "extra == 'openai-embeddings'", specifier = ">=1.0.0" }, @@ -1304,13 +1309,14 @@ requires-dist = [ { name = "python-dotenv", specifier = ">=1.0.0" }, { name = "qdrant-client", marker = "extra == 'vectorstores-qdrant'", specifier = ">=1.12.0" }, { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.9.0" }, + { name = "scipy", marker = "extra == 'evaluation'", specifier = ">=1.11" }, { name = "sqlalchemy", marker = "extra == 'postgres'", specifier = ">=2.0.0" }, { name = "sqlite-vec", marker = "extra == 'vectorstores-sqlite-vec'", specifier = ">=0.1.6" }, { name = "testcontainers", marker = "extra == 'dev'", specifier = ">=4.10.0" }, { name = "voyageai", marker = "extra == 'voyage-embeddings'", specifier = ">=0.3.0" }, { name = "watchfiles", marker = "extra == 'watch'", specifier = ">=0.24.0" }, ] -provides-extras = ["postgres", "mongodb", "security", "embeddings", "openai-embeddings", "cohere-embeddings", "google-embeddings", "mistral-embeddings", "voyage-embeddings", "azure-embeddings", "bedrock-embeddings", "ollama-embeddings", "reasoning-eval", "vectorstores-chroma", "vectorstores-sqlite-vec", "vectorstores-pinecone", "vectorstores-qdrant", "vectorstores-pgvector", "watch", "binary", "all", "dev"] +provides-extras = ["postgres", "mongodb", "security", "embeddings", "openai-embeddings", "cohere-embeddings", "google-embeddings", "mistral-embeddings", "voyage-embeddings", "azure-embeddings", "bedrock-embeddings", "ollama-embeddings", "reasoning-eval", "vectorstores-chroma", "vectorstores-sqlite-vec", "vectorstores-pinecone", "vectorstores-qdrant", "vectorstores-pgvector", "watch", "binary", "all", "evaluation", "dev"] [[package]] name = "flatbuffers" @@ -4489,6 +4495,57 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/87/72/c6c32d2b657fa3dad1de340254e14390b1e334ce38268b7ad51abda3c8c2/s3transfer-0.17.0-py3-none-any.whl", hash = "sha256:ce3801712acf4ad3e89fb9990df97b4972e93f4b3b0004d214be5bce12814c20", size = 86811, upload-time = "2026-04-29T22:07:34.966Z" }, ] +[[package]] +name = "scipy" +version = "1.17.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7a/97/5a3609c4f8d58b039179648e62dd220f89864f56f7357f5d4f45c29eb2cc/scipy-1.17.1.tar.gz", hash = "sha256:95d8e012d8cb8816c226aef832200b1d45109ed4464303e997c5b13122b297c0", size = 30573822, upload-time = "2026-02-23T00:26:24.851Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/27/07ee1b57b65e92645f219b37148a7e7928b82e2b5dbeccecb4dff7c64f0b/scipy-1.17.1-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:5e3c5c011904115f88a39308379c17f91546f77c1667cea98739fe0fccea804c", size = 31590199, upload-time = "2026-02-23T00:19:17.192Z" }, + { url = "https://files.pythonhosted.org/packages/ec/ae/db19f8ab842e9b724bf5dbb7db29302a91f1e55bc4d04b1025d6d605a2c5/scipy-1.17.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:6fac755ca3d2c3edcb22f479fceaa241704111414831ddd3bc6056e18516892f", size = 28154001, upload-time = "2026-02-23T00:19:22.241Z" }, + { url = "https://files.pythonhosted.org/packages/5b/58/3ce96251560107b381cbd6e8413c483bbb1228a6b919fa8652b0d4090e7f/scipy-1.17.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:7ff200bf9d24f2e4d5dc6ee8c3ac64d739d3a89e2326ba68aaf6c4a2b838fd7d", size = 20325719, upload-time = "2026-02-23T00:19:26.329Z" }, + { url = "https://files.pythonhosted.org/packages/b2/83/15087d945e0e4d48ce2377498abf5ad171ae013232ae31d06f336e64c999/scipy-1.17.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:4b400bdc6f79fa02a4d86640310dde87a21fba0c979efff5248908c6f15fad1b", size = 22683595, upload-time = "2026-02-23T00:19:30.304Z" }, + { url = "https://files.pythonhosted.org/packages/b4/e0/e58fbde4a1a594c8be8114eb4aac1a55bcd6587047efc18a61eb1f5c0d30/scipy-1.17.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2b64ca7d4aee0102a97f3ba22124052b4bd2152522355073580bf4845e2550b6", size = 32896429, upload-time = "2026-02-23T00:19:35.536Z" }, + { url = "https://files.pythonhosted.org/packages/f5/5f/f17563f28ff03c7b6799c50d01d5d856a1d55f2676f537ca8d28c7f627cd/scipy-1.17.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:581b2264fc0aa555f3f435a5944da7504ea3a065d7029ad60e7c3d1ae09c5464", size = 35203952, upload-time = "2026-02-23T00:19:42.259Z" }, + { url = "https://files.pythonhosted.org/packages/8d/a5/9afd17de24f657fdfe4df9a3f1ea049b39aef7c06000c13db1530d81ccca/scipy-1.17.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:beeda3d4ae615106d7094f7e7cef6218392e4465cc95d25f900bebabfded0950", size = 34979063, upload-time = "2026-02-23T00:19:47.547Z" }, + { url = "https://files.pythonhosted.org/packages/8b/13/88b1d2384b424bf7c924f2038c1c409f8d88bb2a8d49d097861dd64a57b2/scipy-1.17.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6609bc224e9568f65064cfa72edc0f24ee6655b47575954ec6339534b2798369", size = 37598449, upload-time = "2026-02-23T00:19:53.238Z" }, + { url = "https://files.pythonhosted.org/packages/35/e5/d6d0e51fc888f692a35134336866341c08655d92614f492c6860dc45bb2c/scipy-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:37425bc9175607b0268f493d79a292c39f9d001a357bebb6b88fdfaff13f6448", size = 36510943, upload-time = "2026-02-23T00:20:50.89Z" }, + { url = "https://files.pythonhosted.org/packages/2a/fd/3be73c564e2a01e690e19cc618811540ba5354c67c8680dce3281123fb79/scipy-1.17.1-cp313-cp313-win_arm64.whl", hash = "sha256:5cf36e801231b6a2059bf354720274b7558746f3b1a4efb43fcf557ccd484a87", size = 24545621, upload-time = "2026-02-23T00:20:55.871Z" }, + { url = "https://files.pythonhosted.org/packages/6f/6b/17787db8b8114933a66f9dcc479a8272e4b4da75fe03b0c282f7b0ade8cd/scipy-1.17.1-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:d59c30000a16d8edc7e64152e30220bfbd724c9bbb08368c054e24c651314f0a", size = 31936708, upload-time = "2026-02-23T00:19:58.694Z" }, + { url = "https://files.pythonhosted.org/packages/38/2e/524405c2b6392765ab1e2b722a41d5da33dc5c7b7278184a8ad29b6cb206/scipy-1.17.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:010f4333c96c9bb1a4516269e33cb5917b08ef2166d5556ca2fd9f082a9e6ea0", size = 28570135, upload-time = "2026-02-23T00:20:03.934Z" }, + { url = "https://files.pythonhosted.org/packages/fd/c3/5bd7199f4ea8556c0c8e39f04ccb014ac37d1468e6cfa6a95c6b3562b76e/scipy-1.17.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:2ceb2d3e01c5f1d83c4189737a42d9cb2fc38a6eeed225e7515eef71ad301dce", size = 20741977, upload-time = "2026-02-23T00:20:07.935Z" }, + { url = "https://files.pythonhosted.org/packages/d9/b8/8ccd9b766ad14c78386599708eb745f6b44f08400a5fd0ade7cf89b6fc93/scipy-1.17.1-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:844e165636711ef41f80b4103ed234181646b98a53c8f05da12ca5ca289134f6", size = 23029601, upload-time = "2026-02-23T00:20:12.161Z" }, + { url = "https://files.pythonhosted.org/packages/6d/a0/3cb6f4d2fb3e17428ad2880333cac878909ad1a89f678527b5328b93c1d4/scipy-1.17.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:158dd96d2207e21c966063e1635b1063cd7787b627b6f07305315dd73d9c679e", size = 33019667, upload-time = "2026-02-23T00:20:17.208Z" }, + { url = "https://files.pythonhosted.org/packages/f3/c3/2d834a5ac7bf3a0c806ad1508efc02dda3c8c61472a56132d7894c312dea/scipy-1.17.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74cbb80d93260fe2ffa334efa24cb8f2f0f622a9b9febf8b483c0b865bfb3475", size = 35264159, upload-time = "2026-02-23T00:20:23.087Z" }, + { url = "https://files.pythonhosted.org/packages/4d/77/d3ed4becfdbd217c52062fafe35a72388d1bd82c2d0ba5ca19d6fcc93e11/scipy-1.17.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:dbc12c9f3d185f5c737d801da555fb74b3dcfa1a50b66a1a93e09190f41fab50", size = 35102771, upload-time = "2026-02-23T00:20:28.636Z" }, + { url = "https://files.pythonhosted.org/packages/bd/12/d19da97efde68ca1ee5538bb261d5d2c062f0c055575128f11a2730e3ac1/scipy-1.17.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:94055a11dfebe37c656e70317e1996dc197e1a15bbcc351bcdd4610e128fe1ca", size = 37665910, upload-time = "2026-02-23T00:20:34.743Z" }, + { url = "https://files.pythonhosted.org/packages/06/1c/1172a88d507a4baaf72c5a09bb6c018fe2ae0ab622e5830b703a46cc9e44/scipy-1.17.1-cp313-cp313t-win_amd64.whl", hash = "sha256:e30bdeaa5deed6bc27b4cc490823cd0347d7dae09119b8803ae576ea0ce52e4c", size = 36562980, upload-time = "2026-02-23T00:20:40.575Z" }, + { url = "https://files.pythonhosted.org/packages/70/b0/eb757336e5a76dfa7911f63252e3b7d1de00935d7705cf772db5b45ec238/scipy-1.17.1-cp313-cp313t-win_arm64.whl", hash = "sha256:a720477885a9d2411f94a93d16f9d89bad0f28ca23c3f8daa521e2dcc3f44d49", size = 24856543, upload-time = "2026-02-23T00:20:45.313Z" }, + { url = "https://files.pythonhosted.org/packages/cf/83/333afb452af6f0fd70414dc04f898647ee1423979ce02efa75c3b0f2c28e/scipy-1.17.1-cp314-cp314-macosx_10_14_x86_64.whl", hash = "sha256:a48a72c77a310327f6a3a920092fa2b8fd03d7deaa60f093038f22d98e096717", size = 31584510, upload-time = "2026-02-23T00:21:01.015Z" }, + { url = "https://files.pythonhosted.org/packages/ed/a6/d05a85fd51daeb2e4ea71d102f15b34fedca8e931af02594193ae4fd25f7/scipy-1.17.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:45abad819184f07240d8a696117a7aacd39787af9e0b719d00285549ed19a1e9", size = 28170131, upload-time = "2026-02-23T00:21:05.888Z" }, + { url = "https://files.pythonhosted.org/packages/db/7b/8624a203326675d7746a254083a187398090a179335b2e4a20e2ddc46e83/scipy-1.17.1-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:3fd1fcdab3ea951b610dc4cef356d416d5802991e7e32b5254828d342f7b7e0b", size = 20342032, upload-time = "2026-02-23T00:21:09.904Z" }, + { url = "https://files.pythonhosted.org/packages/c9/35/2c342897c00775d688d8ff3987aced3426858fd89d5a0e26e020b660b301/scipy-1.17.1-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:7bdf2da170b67fdf10bca777614b1c7d96ae3ca5794fd9587dce41eb2966e866", size = 22678766, upload-time = "2026-02-23T00:21:14.313Z" }, + { url = "https://files.pythonhosted.org/packages/ef/f2/7cdb8eb308a1a6ae1e19f945913c82c23c0c442a462a46480ce487fdc0ac/scipy-1.17.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:adb2642e060a6549c343603a3851ba76ef0b74cc8c079a9a58121c7ec9fe2350", size = 32957007, upload-time = "2026-02-23T00:21:19.663Z" }, + { url = "https://files.pythonhosted.org/packages/0b/2e/7eea398450457ecb54e18e9d10110993fa65561c4f3add5e8eccd2b9cd41/scipy-1.17.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:eee2cfda04c00a857206a4330f0c5e3e56535494e30ca445eb19ec624ae75118", size = 35221333, upload-time = "2026-02-23T00:21:25.278Z" }, + { url = "https://files.pythonhosted.org/packages/d9/77/5b8509d03b77f093a0d52e606d3c4f79e8b06d1d38c441dacb1e26cacf46/scipy-1.17.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d2650c1fb97e184d12d8ba010493ee7b322864f7d3d00d3f9bb97d9c21de4068", size = 35042066, upload-time = "2026-02-23T00:21:31.358Z" }, + { url = "https://files.pythonhosted.org/packages/f9/df/18f80fb99df40b4070328d5ae5c596f2f00fffb50167e31439e932f29e7d/scipy-1.17.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:08b900519463543aa604a06bec02461558a6e1cef8fdbb8098f77a48a83c8118", size = 37612763, upload-time = "2026-02-23T00:21:37.247Z" }, + { url = "https://files.pythonhosted.org/packages/4b/39/f0e8ea762a764a9dc52aa7dabcfad51a354819de1f0d4652b6a1122424d6/scipy-1.17.1-cp314-cp314-win_amd64.whl", hash = "sha256:3877ac408e14da24a6196de0ddcace62092bfc12a83823e92e49e40747e52c19", size = 37290984, upload-time = "2026-02-23T00:22:35.023Z" }, + { url = "https://files.pythonhosted.org/packages/7c/56/fe201e3b0f93d1a8bcf75d3379affd228a63d7e2d80ab45467a74b494947/scipy-1.17.1-cp314-cp314-win_arm64.whl", hash = "sha256:f8885db0bc2bffa59d5c1b72fad7a6a92d3e80e7257f967dd81abb553a90d293", size = 25192877, upload-time = "2026-02-23T00:22:39.798Z" }, + { url = "https://files.pythonhosted.org/packages/96/ad/f8c414e121f82e02d76f310f16db9899c4fcde36710329502a6b2a3c0392/scipy-1.17.1-cp314-cp314t-macosx_10_14_x86_64.whl", hash = "sha256:1cc682cea2ae55524432f3cdff9e9a3be743d52a7443d0cba9017c23c87ae2f6", size = 31949750, upload-time = "2026-02-23T00:21:42.289Z" }, + { url = "https://files.pythonhosted.org/packages/7c/b0/c741e8865d61b67c81e255f4f0a832846c064e426636cd7de84e74d209be/scipy-1.17.1-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:2040ad4d1795a0ae89bfc7e8429677f365d45aa9fd5e4587cf1ea737f927b4a1", size = 28585858, upload-time = "2026-02-23T00:21:47.706Z" }, + { url = "https://files.pythonhosted.org/packages/ed/1b/3985219c6177866628fa7c2595bfd23f193ceebbe472c98a08824b9466ff/scipy-1.17.1-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:131f5aaea57602008f9822e2115029b55d4b5f7c070287699fe45c661d051e39", size = 20757723, upload-time = "2026-02-23T00:21:52.039Z" }, + { url = "https://files.pythonhosted.org/packages/c0/19/2a04aa25050d656d6f7b9e7b685cc83d6957fb101665bfd9369ca6534563/scipy-1.17.1-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:9cdc1a2fcfd5c52cfb3045feb399f7b3ce822abdde3a193a6b9a60b3cb5854ca", size = 23043098, upload-time = "2026-02-23T00:21:56.185Z" }, + { url = "https://files.pythonhosted.org/packages/86/f1/3383beb9b5d0dbddd030335bf8a8b32d4317185efe495374f134d8be6cce/scipy-1.17.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6e3dcd57ab780c741fde8dc68619de988b966db759a3c3152e8e9142c26295ad", size = 33030397, upload-time = "2026-02-23T00:22:01.404Z" }, + { url = "https://files.pythonhosted.org/packages/41/68/8f21e8a65a5a03f25a79165ec9d2b28c00e66dc80546cf5eb803aeeff35b/scipy-1.17.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a9956e4d4f4a301ebf6cde39850333a6b6110799d470dbbb1e25326ac447f52a", size = 35281163, upload-time = "2026-02-23T00:22:07.024Z" }, + { url = "https://files.pythonhosted.org/packages/84/8d/c8a5e19479554007a5632ed7529e665c315ae7492b4f946b0deb39870e39/scipy-1.17.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:a4328d245944d09fd639771de275701ccadf5f781ba0ff092ad141e017eccda4", size = 35116291, upload-time = "2026-02-23T00:22:12.585Z" }, + { url = "https://files.pythonhosted.org/packages/52/52/e57eceff0e342a1f50e274264ed47497b59e6a4e3118808ee58ddda7b74a/scipy-1.17.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a77cbd07b940d326d39a1d1b37817e2ee4d79cb30e7338f3d0cddffae70fcaa2", size = 37682317, upload-time = "2026-02-23T00:22:18.513Z" }, + { url = "https://files.pythonhosted.org/packages/11/2f/b29eafe4a3fbc3d6de9662b36e028d5f039e72d345e05c250e121a230dd4/scipy-1.17.1-cp314-cp314t-win_amd64.whl", hash = "sha256:eb092099205ef62cd1782b006658db09e2fed75bffcae7cc0d44052d8aa0f484", size = 37345327, upload-time = "2026-02-23T00:22:24.442Z" }, + { url = "https://files.pythonhosted.org/packages/07/39/338d9219c4e87f3e708f18857ecd24d22a0c3094752393319553096b98af/scipy-1.17.1-cp314-cp314t-win_arm64.whl", hash = "sha256:200e1050faffacc162be6a486a984a0497866ec54149a01270adc8a59b7c7d21", size = 25489165, upload-time = "2026-02-23T00:22:29.563Z" }, +] + [[package]] name = "secretstorage" version = "3.5.0"