From 94e38d0b07c3cd57c979eac6f9a190eb61c3c690 Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Fri, 29 May 2026 09:53:55 -0500 Subject: [PATCH 1/3] test(experiments): add token-savings experiment runner for learn/recall Standalone (non-pytest) script that runs utterance 1 to seed guidelines and utterance 2 with vs. without recall in the claude-sandbox, comparing token usage from --output-format json plus per-turn usage from saved transcripts. Supports --shared-seed to reuse one seeded workspace across N measure runs. Lives under experiments/ so ad-hoc measurement work is separated from the test suite. --- experiments/token_savings.py | 443 +++++++++++++++++++++++++++++++++++ 1 file changed, 443 insertions(+) create mode 100644 experiments/token_savings.py diff --git a/experiments/token_savings.py b/experiments/token_savings.py new file mode 100644 index 00000000..b7e688b6 --- /dev/null +++ b/experiments/token_savings.py @@ -0,0 +1,443 @@ +"""Experiment: measure token savings from recalled guidelines. + +Adapted from test_claude_sandbox_learn_recall.py. Not a pytest test — runs as +a script and prints a comparison table. + +Design: + - Seed run: utterance 1 ("where was the photo @sample.jpg taken. use exif + metadata") on a fresh demo/workspace copy. Produces .evolve/entities/. + - With-guidelines run: utterance 2 ("what focal length...") on the same + workspace. Recall hook injects the guideline. + - Without-guidelines run: utterance 2 on a NEW fresh workspace copy with no + .evolve/. Recall has nothing to find. + +Repeat N times per condition. Reports headline tokens from claude +--output-format json and per-turn usage parsed from the saved transcript. 
+ +Usage: + python experiments/token_savings.py [--runs 3] +""" + +from __future__ import annotations + +import argparse +import json +import os +import shlex +import shutil +import statistics +import subprocess +import sys +import time +from datetime import datetime, timezone +from pathlib import Path + +# Constants mirrored from test_claude_sandbox_learn_recall.py — kept inline +# so this script doesn't pull in pytest just to import them. +SANDBOX_IMAGE = "claude-sandbox" +REPO_ROOT = Path(__file__).resolve().parents[1] +SESSION_TIMEOUT_SECONDS = 600 +FORWARDED_ENV_VARS = ( + "ANTHROPIC_API_KEY", + "ANTHROPIC_AUTH_TOKEN", + "ANTHROPIC_BASE_URL", + "CLAUDE_MODEL", + "CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS", + "CLAUDE_CODE_SKIP_BEDROCK_AUTH", +) + + +UTTERANCE_LEARN = "where was the photo @sample.jpg taken. use exif metadata" +UTTERANCE_MEASURE = "what focal length was used to take the photo @sample.jpg. use exif metadata" + + +def _check_prerequisites() -> None: + if shutil.which("docker") is None: + sys.exit("ERROR: docker not installed") + if subprocess.run(["docker", "info"], capture_output=True).returncode != 0: + sys.exit("ERROR: docker daemon not running") + if subprocess.run(["docker", "image", "inspect", SANDBOX_IMAGE], capture_output=True).returncode != 0: + sys.exit(f"ERROR: sandbox image {SANDBOX_IMAGE!r} not built — run `just sandbox-build claude`") + if not (os.environ.get("ANTHROPIC_API_KEY") or os.environ.get("ANTHROPIC_AUTH_TOKEN")): + sys.exit("ERROR: ANTHROPIC_API_KEY (or ANTHROPIC_AUTH_TOKEN) not set") + + +def _fresh_workspace(tmp_root: Path, label: str) -> Path: + src = REPO_ROOT / "demo" / "workspace" + dst = tmp_root / label + if dst.exists(): + shutil.rmtree(dst) + shutil.copytree(src, dst, ignore=shutil.ignore_patterns(".evolve", "backup", "sandbox-backup")) + return dst + + +def _run_sandbox_prompt_json(workspace: Path, prompt: str) -> tuple[subprocess.CompletedProcess, dict | None]: + """Run a prompt with --output-format json and return 
(proc, parsed_json).""" + plugins = REPO_ROOT / "platform-integrations" / "claude" / "plugins" + command = ( + "claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions " + "--output-format json -p " + shlex.quote(prompt) + ) + cmd = ["docker", "run", "--rm"] + for var in FORWARDED_ENV_VARS: + if os.environ.get(var): + cmd += ["-e", var] + cmd += [ + "-e", "EVOLVE_DEBUG=1", + "-v", f"{workspace}:/workspace", + "-v", f"{plugins}:/plugins", + SANDBOX_IMAGE, + "bash", "-c", command, + ] + proc = subprocess.run(cmd, capture_output=True, text=True, timeout=SESSION_TIMEOUT_SECONDS) + parsed: dict | None = None + if proc.returncode == 0 and proc.stdout.strip(): + try: + parsed = json.loads(proc.stdout) + except json.JSONDecodeError: + # Output may be a stream of multiple JSON objects on rare occasions; + # fall back to last well-formed line. + for line in reversed(proc.stdout.splitlines()): + line = line.strip() + if line.startswith("{") and line.endswith("}"): + try: + parsed = json.loads(line) + break + except json.JSONDecodeError: + continue + return proc, parsed + + +def _per_turn_usage(transcript_path: Path) -> list[dict]: + """Pull the usage block from each assistant message in the transcript.""" + turns: list[dict] = [] + for line in transcript_path.read_text().splitlines(): + line = line.strip() + if not line: + continue + try: + record = json.loads(line) + except json.JSONDecodeError: + continue + message = record.get("message") + if not isinstance(message, dict): + continue + usage = message.get("usage") + if not isinstance(usage, dict): + continue + turns.append({ + "type": record.get("type"), + "role": message.get("role"), + "input_tokens": usage.get("input_tokens"), + "output_tokens": usage.get("output_tokens"), + "cache_creation_input_tokens": usage.get("cache_creation_input_tokens"), + "cache_read_input_tokens": usage.get("cache_read_input_tokens"), + }) + return turns + + +def _newest_transcript(workspace: Path, exclude: set[Path]) -> Path 
| None: + trajectories_dir = workspace / ".evolve" / "trajectories" + if not trajectories_dir.is_dir(): + return None + candidates = [p for p in trajectories_dir.glob("*.jsonl") if p not in exclude] + if not candidates: + return None + return max(candidates, key=lambda p: p.stat().st_mtime) + + +def _list_entities(workspace: Path) -> list[str]: + entities_dir = workspace / ".evolve" / "entities" + if not entities_dir.is_dir(): + return [] + return sorted(str(p.relative_to(entities_dir)) for p in entities_dir.rglob("*.md")) + + +def _extract_usage(parsed: dict | None) -> dict: + """Pull the headline usage block out of `claude --output-format json`.""" + if not parsed: + return {} + usage = parsed.get("usage") or {} + # claude reports cumulative usage in `usage`. Some versions also provide + # input_tokens/output_tokens at the top level; prefer the explicit block. + return { + "input_tokens": usage.get("input_tokens"), + "output_tokens": usage.get("output_tokens"), + "cache_creation_input_tokens": usage.get("cache_creation_input_tokens"), + "cache_read_input_tokens": usage.get("cache_read_input_tokens"), + "total_tokens": ( + (usage.get("input_tokens") or 0) + + (usage.get("output_tokens") or 0) + + (usage.get("cache_creation_input_tokens") or 0) + + (usage.get("cache_read_input_tokens") or 0) + ), + "duration_ms": parsed.get("duration_ms"), + "num_turns": parsed.get("num_turns"), + } + + +def _do_with_guidelines_run(tmp_root: Path, idx: int) -> dict: + label = f"with_guidelines_{idx}" + workspace = _fresh_workspace(tmp_root, label) + + print(f" [{label}] seeding (utterance 1)...", flush=True) + t0 = time.time() + seed_proc, _seed_parsed = _run_sandbox_prompt_json(workspace, UTTERANCE_LEARN) + print(f" [{label}] seed done in {time.time()-t0:.0f}s rc={seed_proc.returncode}", flush=True) + if seed_proc.returncode != 0: + return {"label": label, "error": "seed_failed", "stderr": seed_proc.stderr[-1000:]} + + seed_transcripts = set((workspace / ".evolve" / 
"trajectories").glob("*.jsonl")) if (workspace / ".evolve" / "trajectories").is_dir() else set() + entities = _list_entities(workspace) + if not entities: + return {"label": label, "error": "no_guideline_learned", "stdout": seed_proc.stdout[-1000:]} + + print(f" [{label}] measure (utterance 2) — {len(entities)} guideline(s) recallable...", flush=True) + t1 = time.time() + proc, parsed = _run_sandbox_prompt_json(workspace, UTTERANCE_MEASURE) + print(f" [{label}] measure done in {time.time()-t1:.0f}s rc={proc.returncode}", flush=True) + if proc.returncode != 0: + return {"label": label, "error": "measure_failed", "stderr": proc.stderr[-1000:]} + + transcript = _newest_transcript(workspace, exclude=seed_transcripts) + return { + "label": label, + "condition": "with_guidelines", + "headline_usage": _extract_usage(parsed), + "raw_json": parsed, + "per_turn": _per_turn_usage(transcript) if transcript else [], + "transcript_path": str(transcript) if transcript else None, + "entities_seeded": entities, + } + + +def _do_shared_seed_runs(tmp_root: Path, n_runs: int) -> list[dict]: + """Seed once, then measure n_runs times against the same workspace. + + Recall is driven by `.evolve/entities/`, which doesn't change across the + measure runs, so all measure runs see the same recallable guidelines. + `.evolve/trajectories/` and `.evolve/audit.log` accumulate as they would + in normal day-to-day use of the same project. 
+ """ + label_root = "with_guidelines_shared" + workspace = _fresh_workspace(tmp_root, label_root) + + print(f" [{label_root}] seeding (utterance 1)...", flush=True) + t0 = time.time() + seed_proc, _seed_parsed = _run_sandbox_prompt_json(workspace, UTTERANCE_LEARN) + print(f" [{label_root}] seed done in {time.time()-t0:.0f}s rc={seed_proc.returncode}", flush=True) + if seed_proc.returncode != 0: + return [{"label": label_root, "error": "seed_failed", "stderr": seed_proc.stderr[-1000:]}] + + entities = _list_entities(workspace) + if not entities: + return [{"label": label_root, "error": "no_guideline_learned", "stdout": seed_proc.stdout[-1000:]}] + + results: list[dict] = [] + for i in range(1, n_runs + 1): + label = f"{label_root}_{i}" + # Snapshot trajectories present BEFORE this measure run, so we can find + # the new transcript afterward. + trajectories_dir = workspace / ".evolve" / "trajectories" + prior_transcripts = set(trajectories_dir.glob("*.jsonl")) if trajectories_dir.is_dir() else set() + + print(f" [{label}] measure (utterance 2) — {len(entities)} guideline(s) recallable...", flush=True) + t1 = time.time() + proc, parsed = _run_sandbox_prompt_json(workspace, UTTERANCE_MEASURE) + print(f" [{label}] measure done in {time.time()-t1:.0f}s rc={proc.returncode}", flush=True) + if proc.returncode != 0: + results.append({"label": label, "error": "measure_failed", "stderr": proc.stderr[-1000:]}) + continue + + transcript = _newest_transcript(workspace, exclude=prior_transcripts) + results.append({ + "label": label, + "condition": "with_guidelines", + "headline_usage": _extract_usage(parsed), + "raw_json": parsed, + "per_turn": _per_turn_usage(transcript) if transcript else [], + "transcript_path": str(transcript) if transcript else None, + "entities_seeded": entities, + }) + return results + + +def _do_without_guidelines_run(tmp_root: Path, idx: int) -> dict: + label = f"without_guidelines_{idx}" + workspace = _fresh_workspace(tmp_root, label) + print(f" 
[{label}] measure (utterance 2) — no .evolve/ ...", flush=True) + t0 = time.time() + proc, parsed = _run_sandbox_prompt_json(workspace, UTTERANCE_MEASURE) + print(f" [{label}] done in {time.time()-t0:.0f}s rc={proc.returncode}", flush=True) + if proc.returncode != 0: + return {"label": label, "error": "measure_failed", "stderr": proc.stderr[-1000:]} + + transcript = _newest_transcript(workspace, exclude=set()) + return { + "label": label, + "condition": "without_guidelines", + "headline_usage": _extract_usage(parsed), + "raw_json": parsed, + "per_turn": _per_turn_usage(transcript) if transcript else [], + "transcript_path": str(transcript) if transcript else None, + } + + +def _summarize(runs: list[dict], key: str) -> dict: + values = [r["headline_usage"].get(key) for r in runs if "headline_usage" in r] + values = [v for v in values if isinstance(v, (int, float))] + if not values: + return {"n": 0} + return { + "n": len(values), + "mean": statistics.mean(values), + "min": min(values), + "max": max(values), + "stdev": statistics.stdev(values) if len(values) > 1 else 0.0, + } + + +def _format_table(with_runs: list[dict], without_runs: list[dict]) -> str: + keys = [ + ("total_tokens", "total"), + ("input_tokens", "input"), + ("output_tokens", "output"), + ("cache_creation_input_tokens", "cache_create"), + ("cache_read_input_tokens", "cache_read"), + ("duration_ms", "duration_ms"), + ("num_turns", "num_turns"), + ] + lines = [] + lines.append(f"| metric | without_guidelines (n={len(without_runs)}) | with_guidelines (n={len(with_runs)}) | savings |") + lines.append("| --- | --- | --- | --- |") + for key, label in keys: + wo = _summarize(without_runs, key) + w = _summarize(with_runs, key) + if not wo.get("n") or not w.get("n"): + lines.append(f"| {label} | n/a | n/a | n/a |") + continue + wo_str = f"{wo['mean']:.0f} (range {wo['min']:.0f}–{wo['max']:.0f})" + w_str = f"{w['mean']:.0f} (range {w['min']:.0f}–{w['max']:.0f})" + delta = wo["mean"] - w["mean"] + pct = (delta / 
wo["mean"] * 100.0) if wo["mean"] else 0.0 + lines.append(f"| {label} | {wo_str} | {w_str} | {delta:+.0f} ({pct:+.1f}%) |") + return "\n".join(lines) + + +def _format_per_turn(run: dict) -> str: + if not run.get("per_turn"): + return "_(no transcript)_" + rows = ["| # | role | input | output | cache_create | cache_read |", "| --- | --- | --- | --- | --- | --- |"] + for i, turn in enumerate(run["per_turn"], 1): + rows.append( + f"| {i} | {turn.get('role','?')} | " + f"{turn.get('input_tokens') or '-'} | " + f"{turn.get('output_tokens') or '-'} | " + f"{turn.get('cache_creation_input_tokens') or '-'} | " + f"{turn.get('cache_read_input_tokens') or '-'} |" + ) + return "\n".join(rows) + + +def _write_report(results_dir: Path, with_runs: list[dict], without_runs: list[dict], seeding_mode: str = "per-run") -> Path: + report = [] + report.append("# Token-savings experiment\n") + report.append(f"_Generated {datetime.now(timezone.utc).isoformat()}_\n") + report.append("**Utterance 1 (seed):** " + UTTERANCE_LEARN) + report.append("\n**Utterance 2 (measured):** " + UTTERANCE_MEASURE) + report.append(f"\n**Seeding mode:** {seeding_mode}\n") + report.append("## Summary\n") + report.append(_format_table(with_runs, without_runs)) + report.append("") + + sample_with = next((r for r in with_runs if "headline_usage" in r), None) + sample_without = next((r for r in without_runs if "headline_usage" in r), None) + if sample_without: + report.append("\n## Per-turn (representative without_guidelines run)\n") + report.append(_format_per_turn(sample_without)) + if sample_with: + report.append("\n## Per-turn (representative with_guidelines run)\n") + report.append(_format_per_turn(sample_with)) + if sample_with.get("entities_seeded"): + report.append("\n**Guidelines recallable in this run:** " + ", ".join(sample_with["entities_seeded"])) + + errors = [r for r in (with_runs + without_runs) if r.get("error")] + if errors: + report.append("\n## Errors\n") + for r in errors: + 
report.append(f"- {r['label']}: {r['error']}") + + path = results_dir / "report.md" + path.write_text("\n".join(report) + "\n") + return path + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("--runs", type=int, default=3, help="runs per condition (default 3)") + parser.add_argument( + "--shared-seed", + action="store_true", + help="run utterance 1 once and reuse the seeded workspace for all with-guidelines measure runs (default: fresh seed per run)", + ) + parser.add_argument( + "--keep-workspaces", + action="store_true", + help="don't delete the per-run workspaces (useful for debugging)", + ) + args = parser.parse_args() + + _check_prerequisites() + + timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") + results_dir = REPO_ROOT / "experiments" / "results" / f"token_savings_{timestamp}" + results_dir.mkdir(parents=True, exist_ok=True) + workspace_root = results_dir / "workspaces" + workspace_root.mkdir(exist_ok=True) + + seeding_mode = "shared (one seed, N measure runs)" if args.shared_seed else "per-run (fresh seed for each measure run)" + print(f"Results dir: {results_dir}") + print(f"Runs per condition: {args.runs}") + print(f"Seeding mode: {seeding_mode}") + + with_runs: list[dict] = [] + without_runs: list[dict] = [] + + if args.shared_seed: + print(f"\n=== with-guidelines (shared seed, {args.runs} measure runs) ===") + with_runs.extend(_do_shared_seed_runs(workspace_root, args.runs)) + for i in range(1, args.runs + 1): + print(f"\n=== without-guidelines run {i}/{args.runs} ===") + without_runs.append(_do_without_guidelines_run(workspace_root, i)) + else: + for i in range(1, args.runs + 1): + print(f"\n=== with-guidelines run {i}/{args.runs} ===") + with_runs.append(_do_with_guidelines_run(workspace_root, i)) + print(f"\n=== without-guidelines run {i}/{args.runs} ===") + without_runs.append(_do_without_guidelines_run(workspace_root, i)) + + raw_path = results_dir / "raw.json" + raw_path.write_text(json.dumps( + 
{"with_guidelines": with_runs, "without_guidelines": without_runs, "seeding_mode": seeding_mode}, + indent=2, default=str, + )) + report_path = _write_report(results_dir, with_runs, without_runs, seeding_mode=seeding_mode) + + print("\n" + "=" * 60) + print(_format_table(with_runs, without_runs)) + print("=" * 60) + print(f"\nReport: {report_path}") + print(f"Raw: {raw_path}") + + if not args.keep_workspaces: + shutil.rmtree(workspace_root, ignore_errors=True) + + errors = [r for r in (with_runs + without_runs) if r.get("error")] + if errors: + print(f"\n{len(errors)} run(s) had errors — see report.md") + return 1 + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 7259fe88f418d4dd6afdf72ef1ac5200e6c83b3d Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Fri, 29 May 2026 09:55:09 -0500 Subject: [PATCH 2/3] docs(experiments): add README for the experiments dir Documents what experiments/ is for (ad-hoc measurement, not the test suite), how to run token_savings.py, what the results layout looks like, and the rough wall-clock budget. --- experiments/README.md | 50 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 experiments/README.md diff --git a/experiments/README.md b/experiments/README.md new file mode 100644 index 00000000..26aa9604 --- /dev/null +++ b/experiments/README.md @@ -0,0 +1,50 @@ +# Experiments + +Ad-hoc measurement scripts. Not part of the test suite — these aren't run in +CI, don't assert anything, and exist to produce numbers and writeups. + +If a script here graduates into a regression check, move it under `tests/`. + +## Scripts + +### `token_savings.py` + +Measures the token / wall-clock / step gap on utterance 2 when guidelines from +utterance 1 are recallable vs. not. Adapted from +`tests/e2e/test_claude_sandbox_learn_recall.py` but standalone — runs as a +script, prints a comparison table, writes results to `results/`. 
+ +**Requires:** Docker, the `claude-sandbox` image (`just sandbox-build claude`), +and `ANTHROPIC_API_KEY` (or `ANTHROPIC_AUTH_TOKEN`) in the environment. + +**Run:** + +```bash +# 3 runs per condition, fresh seed for every with-guidelines run +python3 experiments/token_savings.py --runs 3 + +# 5 measure runs against a single shared seed (cheaper, lower variance) +python3 experiments/token_savings.py --runs 5 --shared-seed + +# Keep the per-run workspaces afterwards (transcripts on disk for inspection) +python3 experiments/token_savings.py --runs 5 --shared-seed --keep-workspaces +``` + +**Output** lands in `experiments/results/token_savings_<timestamp>/`: + +- `report.md` — auto-generated comparison table + per-turn breakdown for one + representative run per condition. +- `raw.json` — full `usage` payload from every run, plus per-turn usage parsed + from each saved transcript. +- `summary.md` — hand-written writeup (when present) with sample tool-call + traces and the contents of the recalled guidelines. +- `workspaces/` — only with `--keep-workspaces`. ~1–2 MB per run. + +**Wall-clock budget:** roughly 25–35 min for `--runs 5`. The script prints +per-run progress so you can see where it is. + +## Results layout + +`experiments/results/token_savings_<timestamp>/` per run. The timestamp is the +UTC start time, so directory order = chronological order. Old result dirs are +kept as-is — don't rename. 
From 5f71a050afdc91ed437e8ec4ae63dbb7e65a6fe8 Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Fri, 29 May 2026 10:31:57 -0500 Subject: [PATCH 3/3] fix(experiments): apply ruff format to token_savings.py Fixes failing CI check: check-formatting (3.12) --- experiments/token_savings.py | 83 ++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 36 deletions(-) diff --git a/experiments/token_savings.py b/experiments/token_savings.py index b7e688b6..c3230391 100644 --- a/experiments/token_savings.py +++ b/experiments/token_savings.py @@ -74,20 +74,22 @@ def _fresh_workspace(tmp_root: Path, label: str) -> Path: def _run_sandbox_prompt_json(workspace: Path, prompt: str) -> tuple[subprocess.CompletedProcess, dict | None]: """Run a prompt with --output-format json and return (proc, parsed_json).""" plugins = REPO_ROOT / "platform-integrations" / "claude" / "plugins" - command = ( - "claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions " - "--output-format json -p " + shlex.quote(prompt) - ) + command = "claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions --output-format json -p " + shlex.quote(prompt) cmd = ["docker", "run", "--rm"] for var in FORWARDED_ENV_VARS: if os.environ.get(var): cmd += ["-e", var] cmd += [ - "-e", "EVOLVE_DEBUG=1", - "-v", f"{workspace}:/workspace", - "-v", f"{plugins}:/plugins", + "-e", + "EVOLVE_DEBUG=1", + "-v", + f"{workspace}:/workspace", + "-v", + f"{plugins}:/plugins", SANDBOX_IMAGE, - "bash", "-c", command, + "bash", + "-c", + command, ] proc = subprocess.run(cmd, capture_output=True, text=True, timeout=SESSION_TIMEOUT_SECONDS) parsed: dict | None = None @@ -125,14 +127,16 @@ def _per_turn_usage(transcript_path: Path) -> list[dict]: usage = message.get("usage") if not isinstance(usage, dict): continue - turns.append({ - "type": record.get("type"), - "role": message.get("role"), - "input_tokens": usage.get("input_tokens"), - "output_tokens": usage.get("output_tokens"), - 
"cache_creation_input_tokens": usage.get("cache_creation_input_tokens"), - "cache_read_input_tokens": usage.get("cache_read_input_tokens"), - }) + turns.append( + { + "type": record.get("type"), + "role": message.get("role"), + "input_tokens": usage.get("input_tokens"), + "output_tokens": usage.get("output_tokens"), + "cache_creation_input_tokens": usage.get("cache_creation_input_tokens"), + "cache_read_input_tokens": usage.get("cache_read_input_tokens"), + } + ) return turns @@ -183,11 +187,13 @@ def _do_with_guidelines_run(tmp_root: Path, idx: int) -> dict: print(f" [{label}] seeding (utterance 1)...", flush=True) t0 = time.time() seed_proc, _seed_parsed = _run_sandbox_prompt_json(workspace, UTTERANCE_LEARN) - print(f" [{label}] seed done in {time.time()-t0:.0f}s rc={seed_proc.returncode}", flush=True) + print(f" [{label}] seed done in {time.time() - t0:.0f}s rc={seed_proc.returncode}", flush=True) if seed_proc.returncode != 0: return {"label": label, "error": "seed_failed", "stderr": seed_proc.stderr[-1000:]} - seed_transcripts = set((workspace / ".evolve" / "trajectories").glob("*.jsonl")) if (workspace / ".evolve" / "trajectories").is_dir() else set() + seed_transcripts = ( + set((workspace / ".evolve" / "trajectories").glob("*.jsonl")) if (workspace / ".evolve" / "trajectories").is_dir() else set() + ) entities = _list_entities(workspace) if not entities: return {"label": label, "error": "no_guideline_learned", "stdout": seed_proc.stdout[-1000:]} @@ -195,7 +201,7 @@ def _do_with_guidelines_run(tmp_root: Path, idx: int) -> dict: print(f" [{label}] measure (utterance 2) — {len(entities)} guideline(s) recallable...", flush=True) t1 = time.time() proc, parsed = _run_sandbox_prompt_json(workspace, UTTERANCE_MEASURE) - print(f" [{label}] measure done in {time.time()-t1:.0f}s rc={proc.returncode}", flush=True) + print(f" [{label}] measure done in {time.time() - t1:.0f}s rc={proc.returncode}", flush=True) if proc.returncode != 0: return {"label": label, "error": 
"measure_failed", "stderr": proc.stderr[-1000:]} @@ -225,7 +231,7 @@ def _do_shared_seed_runs(tmp_root: Path, n_runs: int) -> list[dict]: print(f" [{label_root}] seeding (utterance 1)...", flush=True) t0 = time.time() seed_proc, _seed_parsed = _run_sandbox_prompt_json(workspace, UTTERANCE_LEARN) - print(f" [{label_root}] seed done in {time.time()-t0:.0f}s rc={seed_proc.returncode}", flush=True) + print(f" [{label_root}] seed done in {time.time() - t0:.0f}s rc={seed_proc.returncode}", flush=True) if seed_proc.returncode != 0: return [{"label": label_root, "error": "seed_failed", "stderr": seed_proc.stderr[-1000:]}] @@ -244,21 +250,23 @@ def _do_shared_seed_runs(tmp_root: Path, n_runs: int) -> list[dict]: print(f" [{label}] measure (utterance 2) — {len(entities)} guideline(s) recallable...", flush=True) t1 = time.time() proc, parsed = _run_sandbox_prompt_json(workspace, UTTERANCE_MEASURE) - print(f" [{label}] measure done in {time.time()-t1:.0f}s rc={proc.returncode}", flush=True) + print(f" [{label}] measure done in {time.time() - t1:.0f}s rc={proc.returncode}", flush=True) if proc.returncode != 0: results.append({"label": label, "error": "measure_failed", "stderr": proc.stderr[-1000:]}) continue transcript = _newest_transcript(workspace, exclude=prior_transcripts) - results.append({ - "label": label, - "condition": "with_guidelines", - "headline_usage": _extract_usage(parsed), - "raw_json": parsed, - "per_turn": _per_turn_usage(transcript) if transcript else [], - "transcript_path": str(transcript) if transcript else None, - "entities_seeded": entities, - }) + results.append( + { + "label": label, + "condition": "with_guidelines", + "headline_usage": _extract_usage(parsed), + "raw_json": parsed, + "per_turn": _per_turn_usage(transcript) if transcript else [], + "transcript_path": str(transcript) if transcript else None, + "entities_seeded": entities, + } + ) return results @@ -268,7 +276,7 @@ def _do_without_guidelines_run(tmp_root: Path, idx: int) -> dict: 
print(f" [{label}] measure (utterance 2) — no .evolve/ ...", flush=True) t0 = time.time() proc, parsed = _run_sandbox_prompt_json(workspace, UTTERANCE_MEASURE) - print(f" [{label}] done in {time.time()-t0:.0f}s rc={proc.returncode}", flush=True) + print(f" [{label}] done in {time.time() - t0:.0f}s rc={proc.returncode}", flush=True) if proc.returncode != 0: return {"label": label, "error": "measure_failed", "stderr": proc.stderr[-1000:]} @@ -330,7 +338,7 @@ def _format_per_turn(run: dict) -> str: rows = ["| # | role | input | output | cache_create | cache_read |", "| --- | --- | --- | --- | --- | --- |"] for i, turn in enumerate(run["per_turn"], 1): rows.append( - f"| {i} | {turn.get('role','?')} | " + f"| {i} | {turn.get('role', '?')} | " f"{turn.get('input_tokens') or '-'} | " f"{turn.get('output_tokens') or '-'} | " f"{turn.get('cache_creation_input_tokens') or '-'} | " @@ -417,10 +425,13 @@ def main() -> int: without_runs.append(_do_without_guidelines_run(workspace_root, i)) raw_path = results_dir / "raw.json" - raw_path.write_text(json.dumps( - {"with_guidelines": with_runs, "without_guidelines": without_runs, "seeding_mode": seeding_mode}, - indent=2, default=str, - )) + raw_path.write_text( + json.dumps( + {"with_guidelines": with_runs, "without_guidelines": without_runs, "seeding_mode": seeding_mode}, + indent=2, + default=str, + ) + ) report_path = _write_report(results_dir, with_runs, without_runs, seeding_mode=seeding_mode) print("\n" + "=" * 60)