diff --git a/.gitignore b/.gitignore index 6fd486a..b34d6e4 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,6 @@ node_modules # Package build artifacts *.egg-info/ *.backup + +# Claude Code local session state +.claude/scheduled_tasks.lock diff --git a/CLAUDE.md b/CLAUDE.md index 323c627..c6964a2 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -66,6 +66,33 @@ uv run ruff format The service processes compiler output through a pipeline: input validation → smart assembly filtering → Claude API call → response with metrics. See `claude_explain.md` for detailed architecture documentation. +## Anthropic API gotchas + +- **`max_tokens` includes thinking tokens.** When a prompt YAML sets `model.thinking: {type: adaptive}` (or + `{type: enabled, budget_tokens: N}`), thinking counts against `max_tokens`. The production value `1536` silently + starves the visible text output on complex cases when thinking is on. `Prompt.__init__` now refuses to load a + thinking-enabled config with `max_tokens < 4096`; 4096 is the floor (8192 worked well in past experiments). +- **The reviewer model rejects `temperature`.** Opus 4.7 deprecated the parameter, so `prompt_testing/reviewer.py` + omits it. The Sonnet explainer still accepts `temperature`. If you swap the reviewer to a model that requires + it, restore the param. +- **Reviewer thinking is on by default.** `prompt-test run --review` and `prompt-test review` default to + `--reviewer-thinking adaptive` / `--thinking adaptive`. It catches factual errors the no-think reviewer misses + but adds ~70% to review cost. Pass `off` to compare runs or save money on large batches. Note that the reviewer calls the API with a fixed `max_tokens` of 2048 and is not covered by the `Prompt.__init__` floor; a starved review is reported as `correct: None` (a review failure), not as a factual error. +- **Production explainer thinking is intentionally off.** Adaptive thinking on Sonnet 4.6 measurably improves + factual accuracy (e.g. eliminates the recurring `imul eax, edi, edi` invention) but adds ~11s end-to-end + latency, which is too much for the interactive endpoint. Don't enable it in `app/prompt.yaml` without an + explicit latency/quality decision. 
+- **Multi-block responses.** When thinking is enabled, the API returns thinking blocks before the text block. + `app/explain.py`, `prompt_testing/runner.py`, and `prompt_testing/reviewer.py` all pick the last text block via + `getattr(c, "type", None) == "text"`. Preserve that pattern for any new code that consumes responses. The API may also return + `redacted_thinking` blocks (encrypted reasoning when safety filters trip); the same filter excludes them + correctly, but be aware that "no text block" can mean either max_tokens starvation *or* a redacted-thinking-only + response — the error message is the same. +- **Empty responses are not 500s.** When the model returns no text block, `app/explain.py` returns + `ExplainResponse(status="error")` with `usage` populated and emits `ClaudeExplainEmptyResponse`. The cache + layer skips storing error responses so retries hit the API. Don't change this to raise — the structured error + is what the CE frontend can render. + ## Code Style Guidelines - Prefer using modern Python 3.13+ type syntax. Good: `a: list[str] | None`. 
Bad: `a: Optional[List[str]]` diff --git a/app/cache.py b/app/cache.py index f7dd0fc..173bb59 100644 --- a/app/cache.py +++ b/app/cache.py @@ -142,6 +142,7 @@ def generate_cache_key(request: ExplainRequest, prompt: Prompt) -> str: "model": prompt_data["model"], "max_tokens": prompt_data["max_tokens"], "temperature": prompt_data["temperature"], + "thinking": prompt_data.get("thinking"), "system": prompt_data["system"], "messages": prompt_data["messages"], # Include a hash of the prompt config to invalidate cache when prompts change diff --git a/app/explain.py b/app/explain.py index c352bcb..d973075 100644 --- a/app/explain.py +++ b/app/explain.py @@ -1,4 +1,5 @@ import logging +from typing import Any from anthropic import AsyncAnthropic @@ -59,8 +60,10 @@ async def process_request( # Cache miss or no cache - proceed with Anthropic API call response = await _call_anthropic_api(body, client, prompt, metrics_provider) - # Cache the response (if cache provider is available) - if cache_provider is not None: + # Cache the response (if cache provider is available). Don't cache + # error responses — they consume real tokens but produce no useful + # content, and we want a retry to hit the API rather than the cache. 
+ if cache_provider is not None and response.status == "success": await cache_response(body, prompt, response, cache_provider) metrics_provider.put_metric("ClaudeExplainCacheMiss", 1) @@ -90,24 +93,61 @@ async def _call_anthropic_api( LOGGER.debug("=== END PROMPT DEBUG ===") # Call Claude API - LOGGER.info("Using Anthropic client with model: %s", {prompt_data["model"]}) - - message = await client.messages.create( - model=prompt_data["model"], - max_tokens=prompt_data["max_tokens"], - temperature=prompt_data["temperature"], - system=prompt_data["system"], - messages=prompt_data["messages"], - ) - - # Get explanation and strip leading/trailing whitespace - explanation = message.content[0].text.strip() + LOGGER.info("Using Anthropic client with model: %s", prompt_data["model"]) + + api_kwargs: dict[str, Any] = { + "model": prompt_data["model"], + "max_tokens": prompt_data["max_tokens"], + "system": prompt_data["system"], + "messages": prompt_data["messages"], + } + if prompt_data.get("thinking"): + # Extended thinking: API requires temperature to be unset. + api_kwargs["thinking"] = prompt_data["thinking"] + else: + api_kwargs["temperature"] = prompt_data["temperature"] + + message = await client.messages.create(**api_kwargs) # Extract usage information input_tokens = message.usage.input_tokens output_tokens = message.usage.output_tokens total_tokens = input_tokens + output_tokens + # Pick the last text block — when thinking is enabled the response + # contains thinking blocks before the final text block. + text_blocks = [c for c in message.content if getattr(c, "type", None) == "text"] + explanation = text_blocks[-1].text.strip() if text_blocks else "" + if not explanation: + # Can happen if extended thinking exhausts max_tokens before any + # text block is emitted. Surface the failure to the caller with + # token usage populated, and emit a metric so this is visible on + # dashboards rather than buried in a generic 500. 
+ message_text = ( + f"Claude returned no text content " + f"(stop_reason={message.stop_reason}, in={input_tokens}, out={output_tokens}). " + f"If thinking is enabled, max_tokens may be too low." + ) + LOGGER.warning(message_text) + metrics_provider.set_property("language", body.language) + metrics_provider.set_property("compiler", body.compiler) + metrics_provider.set_property("instructionSet", body.instructionSet or "unknown") + metrics_provider.set_property("cached", "false") + metrics_provider.put_metric("ClaudeExplainRequest", 1) + metrics_provider.put_metric("ClaudeExplainEmptyResponse", 1) + metrics_provider.put_metric("ClaudeExplainInputTokens", input_tokens) + metrics_provider.put_metric("ClaudeExplainOutputTokens", output_tokens) + return ExplainResponse( + status="error", + message=message_text, + model=prompt_data["model"], + usage=TokenUsage( + inputTokens=input_tokens, + outputTokens=output_tokens, + totalTokens=total_tokens, + ), + ) + # Calculate costs based on model cost_per_input_token, cost_per_output_token = get_model_cost(prompt_data["model"]) input_cost = input_tokens * cost_per_input_token diff --git a/app/prompt.py b/app/prompt.py index 7f7cdb4..eb0f044 100644 --- a/app/prompt.py +++ b/app/prompt.py @@ -15,6 +15,11 @@ # Constants from explain.py that are needed for data preparation MAX_ASSEMBLY_LINES = 300 # Maximum number of assembly lines to process +# Minimum max_tokens that's safe to pair with extended thinking. Below this, +# adaptive thinking can consume the whole budget on complex inputs and leave +# nothing for the visible response. +MIN_MAX_TOKENS_WITH_THINKING = 4096 + class Prompt: """Manages prompt templates and generates messages for Claude API.""" @@ -36,6 +41,18 @@ def __init__(self, config: dict[str, Any] | Path): self.model = self.config["model"]["name"] self.max_tokens = self.config["model"]["max_tokens"] self.temperature = self.config["model"].get("temperature", 0.0) + # Optional extended-thinking config, e.g. 
{"type": "adaptive"} or + # {"type": "enabled", "budget_tokens": 2000}. When set, callers + # should drop `temperature` (the API requires it to be unset/1). + self.thinking = self.config["model"].get("thinking") + if self.thinking and self.max_tokens < MIN_MAX_TOKENS_WITH_THINKING: + # Adaptive thinking happily consumes the entire token budget on + # complex inputs and leaves nothing for the visible text block. + # Fail loudly here rather than silently returning empty responses. + raise ValueError( + f"max_tokens={self.max_tokens} is too low for thinking; " + f"use at least {MIN_MAX_TOKENS_WITH_THINKING} when model.thinking is set." + ) # Extract prompt templates self.system_prompt_template = self.config["system_prompt"] @@ -275,6 +292,7 @@ def generate_messages(self, request: ExplainRequest) -> dict[str, Any]: "model": self.model, "max_tokens": self.max_tokens, "temperature": self.temperature, + "thinking": self.thinking, "system": system_prompt, "messages": messages, "structured_data": structured_data, # Include for reference diff --git a/app/test_explain.py b/app/test_explain.py index da3125d..753d245 100644 --- a/app/test_explain.py +++ b/app/test_explain.py @@ -57,6 +57,7 @@ def mock_anthropic_client(): mock_client = MagicMock() mock_message = MagicMock() mock_content = MagicMock() + mock_content.type = "text" mock_content.text = "This assembly code implements a simple square function..." mock_message.content = [mock_content] @@ -149,6 +150,93 @@ async def test_process_request_success(self, sample_request, mock_anthropic_clie assert structured_data["compiler"] == "g++" assert structured_data["sourceCode"] == "int square(int x) {\n return x * x;\n}" + @pytest.mark.asyncio + async def test_picks_last_text_block_with_thinking(self, sample_request, noop_metrics): + """When thinking is enabled, the response has thinking blocks before the + final text block. 
The handler should select the text block, not the + thinking block.""" + thinking_block = MagicMock() + thinking_block.type = "thinking" + thinking_block.thinking = "Let me trace through the imul..." + + text_block = MagicMock() + text_block.type = "text" + text_block.text = "Final explanation here." + + mock_message = MagicMock() + mock_message.content = [thinking_block, text_block] + mock_message.usage = MagicMock(input_tokens=100, output_tokens=50) + mock_message.stop_reason = "end_turn" + + mock_client = MagicMock() + mock_client.messages.create = AsyncMock(return_value=mock_message) + + test_prompt = Prompt(Path("app/prompt.yaml")) + response = await process_request(sample_request, mock_client, test_prompt, noop_metrics) + + assert response.status == "success" + assert response.explanation == "Final explanation here." + + @pytest.mark.asyncio + async def test_returns_error_when_no_text_block(self, sample_request, noop_metrics): + """A response with no text block (e.g. thinking exhausted max_tokens) + should return status='error' with token usage populated, not raise. + Production must surface a structured error rather than a generic 500.""" + thinking_block = MagicMock() + thinking_block.type = "thinking" + thinking_block.thinking = "..." + + mock_message = MagicMock() + mock_message.content = [thinking_block] + mock_message.usage = MagicMock(input_tokens=100, output_tokens=50) + mock_message.stop_reason = "max_tokens" + + mock_client = MagicMock() + mock_client.messages.create = AsyncMock(return_value=mock_message) + + test_prompt = Prompt(Path("app/prompt.yaml")) + response = await process_request(sample_request, mock_client, test_prompt, noop_metrics) + + assert response.status == "error" + assert response.explanation is None + assert "no text content" in response.message + # Real tokens were spent — surface them so callers can see the cost. 
+ assert response.usage is not None + assert response.usage.inputTokens == 100 + assert response.usage.outputTokens == 50 + + +class TestPromptValidation: + """Validation rules enforced at Prompt construction.""" + + def test_thinking_requires_min_max_tokens(self): + """A prompt YAML that enables thinking must also bump max_tokens.""" + with pytest.raises(ValueError, match="too low for thinking"): + Prompt( + { + "model": {"name": "test", "max_tokens": 1536, "thinking": {"type": "adaptive"}}, + "system_prompt": "", + "user_prompt": "", + "assistant_prefill": "", + "audience_levels": {}, + "explanation_types": {}, + } + ) + + def test_thinking_with_sufficient_max_tokens_is_ok(self): + """At/above the floor (4096), thinking-enabled prompts load fine.""" + prompt = Prompt( + { + "model": {"name": "test", "max_tokens": 4096, "thinking": {"type": "adaptive"}}, + "system_prompt": "", + "user_prompt": "", + "assistant_prefill": "", + "audience_levels": {}, + "explanation_types": {}, + } + ) + assert prompt.thinking == {"type": "adaptive"} + class TestSelectImportantAssembly: """Test the select_important_assembly function.""" diff --git a/prompt_testing/cli.py b/prompt_testing/cli.py index e5d3af0..452e050 100644 --- a/prompt_testing/cli.py +++ b/prompt_testing/cli.py @@ -16,6 +16,7 @@ import sys from collections import defaultdict from pathlib import Path +from typing import Any import click from dotenv import load_dotenv @@ -45,8 +46,14 @@ def cli(ctx, project_root): @click.option("--max-concurrent", type=int, default=5) @click.option("--review", is_flag=True, help="Also run Opus correctness review on results") @click.option("--review-model", default="claude-opus-4-7", help="Model for correctness review") +@click.option( + "--reviewer-thinking", + type=click.Choice(["off", "adaptive"]), + default="adaptive", + help="Extended thinking on the reviewer. 
Default 'adaptive' improves rigor at ~70% extra reviewer cost.", +) @click.pass_context -def run(ctx, prompt, cases, categories, output, max_concurrent, review, review_model): +def run(ctx, prompt, cases, categories, output, max_concurrent, review, review_model, reviewer_thinking): """Run test cases and save results for review.""" tester = PromptTester(ctx.obj["project_root"], max_concurrent=max_concurrent) results = tester.run( @@ -56,7 +63,8 @@ def run(ctx, prompt, cases, categories, output, max_concurrent, review, review_m ) if review: - results = asyncio.run(_run_reviews(ctx.obj["project_root"], results, review_model)) + thinking = {"type": "adaptive"} if reviewer_thinking == "adaptive" else None + results = asyncio.run(_run_reviews(ctx.obj["project_root"], results, review_model, thinking)) tester.save(results, output) @@ -195,11 +203,11 @@ def compilers(ctx, language, search, limit): # noqa: ARG001 click.echo(f"... and {len(results) - limit} more") -async def _run_reviews(project_root: Path, results: dict, model: str) -> dict: +async def _run_reviews(project_root: Path, results: dict, model: str, thinking: dict[str, Any] | None = None) -> dict: """Run correctness reviews on all successful results.""" from prompt_testing.reviewer import CorrectnessReviewer - reviewer = CorrectnessReviewer(model=model) + reviewer = CorrectnessReviewer(model=model, thinking=thinking) test_dir = project_root / "prompt_testing" / "test_cases" all_cases = load_all_test_cases(str(test_dir)) cases_by_id = {c["id"]: c for c in all_cases} @@ -209,6 +217,7 @@ async def _run_reviews(project_root: Path, results: dict, model: str) -> dict: review_cost = 0.0 errors_found = 0 + review_failures = 0 cost_per_input_token, cost_per_output_token = get_model_cost(model) for i, result in enumerate(successful, 1): @@ -219,10 +228,19 @@ async def _run_reviews(project_root: Path, results: dict, model: str) -> dict: review = await reviewer.review_test_result(case, result["explanation"]) result["review"] 
= review - status = "✓" if review.get("correct") else "✗" - n_issues = len(review.get("issues", [])) - if not review.get("correct"): + # `correct`: True = passed, False = real factual error, + # None = reviewer infrastructure failure (parse/empty response). + # Distinguish so suite metrics don't conflate the two. + correct = review.get("correct") + if correct is True: + status = "✓" + elif correct is False: + status = "✗" errors_found += 1 + else: + status = "?" + review_failures += 1 + n_issues = len(review.get("issues", [])) cost = ( review.get("reviewer_input_tokens", 0) * cost_per_input_token + review.get("reviewer_output_tokens", 0) * cost_per_output_token @@ -234,26 +252,33 @@ async def _run_reviews(project_root: Path, results: dict, model: str) -> dict: results["review_cost_usd"] = round(review_cost, 6) results["total_cost_usd"] = round(results["total_cost_usd"] + review_cost, 6) results["errors_found"] = errors_found + results["review_failures"] = review_failures return results -def _print_review_summary(results: dict) -> None: +def _print_review_summary(results: dict[str, Any]) -> None: """Print a summary of correctness reviews.""" reviewed = [r for r in results["results"] if r.get("review")] - correct = sum(1 for r in reviewed if r["review"].get("correct")) - incorrect = len(reviewed) - correct + passed = sum(1 for r in reviewed if r["review"].get("correct") is True) + failed = sum(1 for r in reviewed if r["review"].get("correct") is False) + review_failures = sum(1 for r in reviewed if r["review"].get("correct") is None) - click.echo(f"\nCorrectness: {correct}/{len(reviewed)} passed") - if incorrect: - click.echo(f"\n⚠ {incorrect} case(s) with issues:") + click.echo(f"\nCorrectness: {passed}/{len(reviewed)} passed") + if failed: + click.echo(f"\n⚠ {failed} case(s) with issues:") for r in reviewed: review = r["review"] - if not review.get("correct"): + if review.get("correct") is False: click.echo(f"\n {r['case_id']}:") for issue in review.get("issues", 
[]): sev = "🔴" if issue["severity"] == "error" else "🟡" click.echo(f" {sev} {issue['claim']}") click.echo(f" → {issue['correction']}") + if review_failures: + click.echo(f"\n⚠ {review_failures} review(s) failed to run (likely max_tokens starvation):") + for r in reviewed: + if r["review"].get("correct") is None: + click.echo(f" {r['case_id']}: {r['review'].get('summary', '?')}") click.echo(f"\nReview cost: ${results.get('review_cost_usd', 0):.4f} ({results.get('review_model', '?')})") @@ -261,14 +286,21 @@ def _print_review_summary(results: dict) -> None: @cli.command() @click.argument("results_file") @click.option("--model", default="claude-opus-4-7", help="Reviewer model") +@click.option( + "--thinking", + type=click.Choice(["off", "adaptive"]), + default="adaptive", + help="Extended thinking on the reviewer (default 'adaptive' for tighter rigor).", +) @click.pass_context -def review(ctx, results_file, model): +def review(ctx, results_file, model, thinking): """Run Opus correctness review on existing results.""" results_dir = ctx.obj["project_root"] / "prompt_testing" / "results" path = results_dir / results_file if not Path(results_file).is_absolute() else Path(results_file) + thinking_cfg = {"type": "adaptive"} if thinking == "adaptive" else None results = json.loads(path.read_text()) - results = asyncio.run(_run_reviews(ctx.obj["project_root"], results, model)) + results = asyncio.run(_run_reviews(ctx.obj["project_root"], results, model, thinking_cfg)) # Save updated results path.write_text(json.dumps(results, indent=2)) diff --git a/prompt_testing/reviewer.py b/prompt_testing/reviewer.py index a329317..243dae7 100644 --- a/prompt_testing/reviewer.py +++ b/prompt_testing/reviewer.py @@ -66,8 +66,17 @@ class CorrectnessReviewer: """Reviews explanations for factual correctness using a powerful model.""" - def __init__(self, model: str = "claude-opus-4-7"): + def __init__(self, model: str = "claude-opus-4-7", thinking: dict[str, Any] | None = None): + 
"""Initialise the reviewer. + + Args: + model: Claude model id to use for review. + thinking: Optional extended-thinking config, e.g. + ``{"type": "adaptive"}`` or + ``{"type": "enabled", "budget_tokens": 2000}``. + """ self.model = model + self.thinking = thinking self.client = AsyncAnthropic() async def review( @@ -96,31 +105,50 @@ async def review( ) # Opus 4.7+ rejects `temperature`; rely on the model's own default. - msg = await self.client.messages.create( - model=self.model, - max_tokens=2048, - system=REVIEW_SYSTEM_PROMPT, - messages=[{"role": "user", "content": user_prompt}], - ) - - text = msg.content[0].text.strip() - - # Parse JSON response - try: - result = json.loads(text) - except json.JSONDecodeError: - # Try to extract JSON from markdown fencing - if "```" in text: - json_part = text.split("```")[1] - if json_part.startswith("json"): - json_part = json_part[4:] - result = json.loads(json_part.strip()) - else: - result = { - "correct": None, - "issues": [], - "summary": f"Failed to parse reviewer response: {text[:200]}", - } + api_kwargs: dict[str, Any] = { + "model": self.model, + "max_tokens": 2048, + "system": REVIEW_SYSTEM_PROMPT, + "messages": [{"role": "user", "content": user_prompt}], + } + if self.thinking: + api_kwargs["thinking"] = self.thinking + msg = await self.client.messages.create(**api_kwargs) + + # When thinking is enabled the response contains thinking blocks + # before the final text block; pick the last text block. + text_blocks = [c for c in msg.content if getattr(c, "type", None) == "text"] + text = text_blocks[-1].text.strip() if text_blocks else "" + + if not text: + # Likely thinking exhausted max_tokens before any text block. + # Surface stop_reason/usage so this is diagnosable rather than a + # generic JSON parse failure. 
+ result: dict[str, Any] = { + "correct": None, + "issues": [], + "summary": ( + f"Reviewer returned no text (stop_reason={msg.stop_reason}, " + f"in={msg.usage.input_tokens}, out={msg.usage.output_tokens})." + ), + } + else: + # Parse JSON response + try: + result = json.loads(text) + except json.JSONDecodeError: + # Try to extract JSON from markdown fencing + if "```" in text: + json_part = text.split("```")[1] + if json_part.startswith("json"): + json_part = json_part[4:] + result = json.loads(json_part.strip()) + else: + result = { + "correct": None, + "issues": [], + "summary": f"Failed to parse reviewer response: {text[:200]}", + } result["reviewer_model"] = self.model result["reviewer_input_tokens"] = msg.usage.input_tokens diff --git a/prompt_testing/runner.py b/prompt_testing/runner.py index 15ea143..e678447 100644 --- a/prompt_testing/runner.py +++ b/prompt_testing/runner.py @@ -15,6 +15,7 @@ from app.explain_api import AssemblyItem, ExplainRequest from app.explanation_types import AudienceLevel, ExplanationType +from app.model_costs import get_model_cost from app.prompt import Prompt from prompt_testing.file_utils import load_all_test_cases @@ -69,20 +70,45 @@ async def _run_one(self, test_case: dict[str, Any], prompt: Prompt) -> dict[str, request = self._to_request(test_case) prompt_data = prompt.generate_messages(request) + api_kwargs: dict[str, Any] = { + "model": prompt_data["model"], + "max_tokens": prompt_data["max_tokens"], + "system": prompt_data["system"], + "messages": prompt_data["messages"], + } + if prompt_data.get("thinking"): + # Extended thinking: temperature must be 1 / unset. 
+ api_kwargs["thinking"] = prompt_data["thinking"] + else: + api_kwargs["temperature"] = prompt_data["temperature"] + start = time.time() try: - msg = await self.async_client.messages.create( - model=prompt_data["model"], - max_tokens=prompt_data["max_tokens"], - temperature=prompt_data["temperature"], - system=prompt_data["system"], - messages=prompt_data["messages"], - ) + msg = await self.async_client.messages.create(**api_kwargs) elapsed_ms = int((time.time() - start) * 1000) + text_blocks = [c for c in msg.content if getattr(c, "type", None) == "text"] + explanation = text_blocks[-1].text.strip() if text_blocks else "" + if not explanation: + # Treat empty output as a failure so suite metrics aren't + # skewed. Common cause: thinking exhausting max_tokens + # before any text block is emitted. Tokens were still + # spent — capture them so cost reporting stays accurate. + return { + "case_id": case_id, + "success": False, + "error": ( + f"empty response (stop_reason={msg.stop_reason}, " + f"in={msg.usage.input_tokens}, out={msg.usage.output_tokens})" + ), + "model": prompt_data["model"], + "input_tokens": msg.usage.input_tokens, + "output_tokens": msg.usage.output_tokens, + "elapsed_ms": elapsed_ms, + } return { "case_id": case_id, "success": True, - "explanation": msg.content[-1].text.strip(), + "explanation": explanation, "model": prompt_data["model"], "input_tokens": msg.usage.input_tokens, "output_tokens": msg.usage.output_tokens, @@ -124,9 +150,14 @@ async def run_async( print(f" [{i}/{len(cases)}] {status} {result['case_id']} ({tokens})") successful = [r for r in results if r["success"]] + # Cost includes failures that consumed tokens (e.g. thinking exhausted + # max_tokens before any text was emitted) — those aren't free. + # Use the prompt's actual model rather than hardcoded rates so the + # number stays correct across explainer-model experiments. 
+ cost_per_input_token, cost_per_output_token = get_model_cost(prompt.model) total_cost = sum( - r["input_tokens"] * 3 / 1e6 + r["output_tokens"] * 15 / 1e6 # Sonnet pricing - for r in successful + r.get("input_tokens", 0) * cost_per_input_token + r.get("output_tokens", 0) * cost_per_output_token + for r in results ) return {