From 15df9ee26451a284662dc3ee5759494795aca70a Mon Sep 17 00:00:00 2001 From: mattgodbolt-molty Date: Wed, 6 May 2026 16:19:56 -0500 Subject: [PATCH 1/7] Add extended-thinking support; default reviewer to adaptive thinking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Plumbs Anthropic's extended thinking through the prompt-testing pipeline and the production explain handler. The production prompt is unchanged (no `thinking` field set), so behaviour and cost are identical until a config opts in. Findings that motivated this change (21-case suite, full results in PR description): - Sonnet 4.6 explainer with adaptive thinking: errors flagged 5 -> 1. Eliminates the recurring `imul eax, edi, edi` invention (2/5 -> 0/5 in focused 5-run testing). Cost +~$0.013/req, latency +11s. Held back from production: too much wall-clock for an interactive endpoint. - Opus 4.7 reviewer with adaptive thinking: catches different errors and finds real issues the no-thinking reviewer misses (e.g. correctly flags `factorial_beginner_assembly` mis-naming TCO). ~70% extra reviewer cost, no user-facing latency since reviews are offline. Worth defaulting on for eval rigor. Implementation: - `Prompt` reads an optional `model.thinking` block (e.g. `{type: adaptive}` or `{type: enabled, budget_tokens: 2000}`) and surfaces it via `generate_messages()`. - `app/explain.py` and `prompt_testing/runner.py` both now construct api_kwargs conditionally: with thinking, drop `temperature` (the API rejects it); pick the last text block from the response (thinking blocks come first when enabled). - `CorrectnessReviewer` accepts an optional `thinking` config and applies the same multi-block extraction. - `prompt-test run` and `prompt-test review` gain `--reviewer-thinking / --thinking` flags. Default is `adaptive` because the rigor win on eval is worth the extra cost; pass `off` to compare or save money. Important gotcha: with adaptive thinking, `max_tokens` must be large enough to leave room for the visible response after the thinking budget. Production prompt's 1536 starves the text output on complex cases — bump to >=4096 if a future variant enables thinking. Co-Authored-By: Claude Opus 4.7 (1M context) --- app/explain.py | 28 ++++++++++++++++++---------- app/prompt.py | 5 +++++ app/test_explain.py | 1 + prompt_testing/cli.py | 26 ++++++++++++++++++++------ prompt_testing/reviewer.py | 33 ++++++++++++++++++++++++--------- prompt_testing/runner.py | 24 ++++++++++++++++-------- 6 files changed, 84 insertions(+), 33 deletions(-) diff --git a/app/explain.py b/app/explain.py index c352bcb..2b96716 100644 --- a/app/explain.py +++ b/app/explain.py @@ -92,16 +92,24 @@ async def _call_anthropic_api( # Call Claude API LOGGER.info("Using Anthropic client with model: %s", {prompt_data["model"]}) - message = await client.messages.create( - model=prompt_data["model"], - max_tokens=prompt_data["max_tokens"], - temperature=prompt_data["temperature"], - system=prompt_data["system"], - messages=prompt_data["messages"], - ) - - # Get explanation and strip leading/trailing whitespace - explanation = message.content[0].text.strip() + api_kwargs: dict = { + "model": prompt_data["model"], + "max_tokens": prompt_data["max_tokens"], + "system": prompt_data["system"], + "messages": prompt_data["messages"], + } + if prompt_data.get("thinking"): + # Extended thinking: API requires temperature to be unset. + api_kwargs["thinking"] = prompt_data["thinking"] + else: + api_kwargs["temperature"] = prompt_data["temperature"] + + message = await client.messages.create(**api_kwargs) + + # Pick the last text block — when thinking is enabled the response + # contains thinking blocks before the final text block. + text_blocks = [c for c in message.content if getattr(c, "type", None) == "text"] + explanation = text_blocks[-1].text.strip() if text_blocks else "" # Extract usage information input_tokens = message.usage.input_tokens diff --git a/app/prompt.py b/app/prompt.py index 7f7cdb4..55c80e2 100644 --- a/app/prompt.py +++ b/app/prompt.py @@ -36,6 +36,10 @@ def __init__(self, config: dict[str, Any] | Path): self.model = self.config["model"]["name"] self.max_tokens = self.config["model"]["max_tokens"] self.temperature = self.config["model"].get("temperature", 0.0) + # Optional extended-thinking config, e.g. {"type": "adaptive"} or + # {"type": "enabled", "budget_tokens": 2000}. When set, callers + # should drop `temperature` (the API requires it to be unset/1). + self.thinking = self.config["model"].get("thinking") # Extract prompt templates self.system_prompt_template = self.config["system_prompt"] @@ -275,6 +279,7 @@ def generate_messages(self, request: ExplainRequest) -> dict[str, Any]: "model": self.model, "max_tokens": self.max_tokens, "temperature": self.temperature, + "thinking": self.thinking, "system": system_prompt, "messages": messages, "structured_data": structured_data, # Include for reference diff --git a/app/test_explain.py b/app/test_explain.py index da3125d..675a069 100644 --- a/app/test_explain.py +++ b/app/test_explain.py @@ -57,6 +57,7 @@ def mock_anthropic_client(): mock_client = MagicMock() mock_message = MagicMock() mock_content = MagicMock() + mock_content.type = "text" mock_content.text = "This assembly code implements a simple square function..." mock_message.content = [mock_content] diff --git a/prompt_testing/cli.py b/prompt_testing/cli.py index e5d3af0..de5b1c0 100644 --- a/prompt_testing/cli.py +++ b/prompt_testing/cli.py @@ -45,8 +45,14 @@ def cli(ctx, project_root): @click.option("--max-concurrent", type=int, default=5) @click.option("--review", is_flag=True, help="Also run Opus correctness review on results") @click.option("--review-model", default="claude-opus-4-7", help="Model for correctness review") +@click.option( + "--reviewer-thinking", + type=click.Choice(["off", "adaptive"]), + default="adaptive", + help="Extended thinking on the reviewer. Default 'adaptive' improves rigor at ~70% extra reviewer cost.", +) @click.pass_context -def run(ctx, prompt, cases, categories, output, max_concurrent, review, review_model): +def run(ctx, prompt, cases, categories, output, max_concurrent, review, review_model, reviewer_thinking): """Run test cases and save results for review.""" tester = PromptTester(ctx.obj["project_root"], max_concurrent=max_concurrent) results = tester.run( @@ -56,7 +62,8 @@ def run(ctx, prompt, cases, categories, output, max_concurrent, review, review_m ) if review: - results = asyncio.run(_run_reviews(ctx.obj["project_root"], results, review_model)) + thinking = {"type": "adaptive"} if reviewer_thinking == "adaptive" else None + results = asyncio.run(_run_reviews(ctx.obj["project_root"], results, review_model, thinking)) tester.save(results, output) @@ -195,11 +202,11 @@ def compilers(ctx, language, search, limit): # noqa: ARG001 click.echo(f"... and {len(results) - limit} more") -async def _run_reviews(project_root: Path, results: dict, model: str) -> dict: +async def _run_reviews(project_root: Path, results: dict, model: str, thinking: dict | None = None) -> dict: """Run correctness reviews on all successful results.""" from prompt_testing.reviewer import CorrectnessReviewer - reviewer = CorrectnessReviewer(model=model) + reviewer = CorrectnessReviewer(model=model, thinking=thinking) test_dir = project_root / "prompt_testing" / "test_cases" all_cases = load_all_test_cases(str(test_dir)) cases_by_id = {c["id"]: c for c in all_cases} @@ -261,14 +268,21 @@ def _print_review_summary(results: dict) -> None: @cli.command() @click.argument("results_file") @click.option("--model", default="claude-opus-4-7", help="Reviewer model") +@click.option( + "--thinking", + type=click.Choice(["off", "adaptive"]), + default="adaptive", + help="Extended thinking on the reviewer (default 'adaptive' for tighter rigor).", +) @click.pass_context -def review(ctx, results_file, model): +def review(ctx, results_file, model, thinking): """Run Opus correctness review on existing results.""" results_dir = ctx.obj["project_root"] / "prompt_testing" / "results" path = results_dir / results_file if not Path(results_file).is_absolute() else Path(results_file) + thinking_cfg = {"type": "adaptive"} if thinking == "adaptive" else None results = json.loads(path.read_text()) - results = asyncio.run(_run_reviews(ctx.obj["project_root"], results, model)) + results = asyncio.run(_run_reviews(ctx.obj["project_root"], results, model, thinking_cfg)) # Save updated results path.write_text(json.dumps(results, indent=2)) diff --git a/prompt_testing/reviewer.py b/prompt_testing/reviewer.py index a329317..13093d2 100644 --- a/prompt_testing/reviewer.py +++ b/prompt_testing/reviewer.py @@ -66,8 +66,17 @@ class CorrectnessReviewer: """Reviews explanations for factual correctness using a powerful model.""" - def __init__(self, model: str = "claude-opus-4-7"): + def __init__(self, model: str = "claude-opus-4-7", thinking: dict[str, Any] | None = None): + """Initialise the reviewer. + + Args: + model: Claude model id to use for review. + thinking: Optional extended-thinking config, e.g. + ``{"type": "adaptive"}`` or + ``{"type": "enabled", "budget_tokens": 2000}``. + """ self.model = model + self.thinking = thinking self.client = AsyncAnthropic() async def review( @@ -96,14 +105,20 @@ async def review( ) # Opus 4.7+ rejects `temperature`; rely on the model's own default. - msg = await self.client.messages.create( - model=self.model, - max_tokens=2048, - system=REVIEW_SYSTEM_PROMPT, - messages=[{"role": "user", "content": user_prompt}], - ) - - text = msg.content[0].text.strip() + api_kwargs: dict[str, Any] = { + "model": self.model, + "max_tokens": 2048, + "system": REVIEW_SYSTEM_PROMPT, + "messages": [{"role": "user", "content": user_prompt}], + } + if self.thinking: + api_kwargs["thinking"] = self.thinking + msg = await self.client.messages.create(**api_kwargs) + + # When thinking is enabled the response contains thinking blocks + # before the final text block; pick the last text block. + text_blocks = [c for c in msg.content if getattr(c, "type", None) == "text"] + text = text_blocks[-1].text.strip() if text_blocks else "" # Parse JSON response try: diff --git a/prompt_testing/runner.py b/prompt_testing/runner.py index 15ea143..492fca9 100644 --- a/prompt_testing/runner.py +++ b/prompt_testing/runner.py @@ -69,20 +69,28 @@ async def _run_one(self, test_case: dict[str, Any], prompt: Prompt) -> dict[str, request = self._to_request(test_case) prompt_data = prompt.generate_messages(request) + api_kwargs: dict[str, Any] = { + "model": prompt_data["model"], + "max_tokens": prompt_data["max_tokens"], + "system": prompt_data["system"], + "messages": prompt_data["messages"], + } + if prompt_data.get("thinking"): + # Extended thinking: temperature must be 1 / unset. + api_kwargs["thinking"] = prompt_data["thinking"] + else: + api_kwargs["temperature"] = prompt_data["temperature"] + start = time.time() try: - msg = await self.async_client.messages.create( - model=prompt_data["model"], - max_tokens=prompt_data["max_tokens"], - temperature=prompt_data["temperature"], - system=prompt_data["system"], - messages=prompt_data["messages"], - ) + msg = await self.async_client.messages.create(**api_kwargs) elapsed_ms = int((time.time() - start) * 1000) + text_blocks = [c for c in msg.content if getattr(c, "type", None) == "text"] + explanation = text_blocks[-1].text.strip() if text_blocks else "" return { "case_id": case_id, "success": True, - "explanation": msg.content[-1].text.strip(), + "explanation": explanation, "model": prompt_data["model"], "input_tokens": msg.usage.input_tokens, "output_tokens": msg.usage.output_tokens, From 7739322d0d3e8ac2629b0d09a48f408b47d7de2a Mon Sep 17 00:00:00 2001 From: mattgodbolt-molty Date: Wed, 6 May 2026 16:27:37 -0500 Subject: [PATCH 2/7] Document Anthropic API gotchas in CLAUDE.md Adds a short section covering the non-obvious things this PR's session turned up that would burn time to rediscover: max_tokens-includes- thinking, the reviewer's temperature deprecation, the new default reviewer thinking, the deliberate hold on production explainer thinking, and the multi-block response pattern. Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index 323c627..7f78162 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -66,6 +66,26 @@ uv run ruff format The service processes compiler output through a pipeline: input validation → smart assembly filtering → Claude API call → response with metrics. See `claude_explain.md` for detailed architecture documentation. +## Anthropic API gotchas + +- **`max_tokens` includes thinking tokens.** When a prompt YAML sets `model.thinking: {type: adaptive}` (or + `{type: enabled, budget_tokens: N}`), thinking counts against `max_tokens`. The production value `1536` silently + starves the visible text output on complex cases when thinking is on — bump to ≥4096 (8192 worked in past + experiments) before enabling thinking on any explainer prompt. +- **The reviewer model rejects `temperature`.** Opus 4.7 deprecated the parameter, so `prompt_testing/reviewer.py` + omits it. The Sonnet explainer still accepts `temperature`. If you swap the reviewer to a model that requires + it, restore the param. +- **Reviewer thinking is on by default.** `prompt-test run --review` and `prompt-test review` default to + `--reviewer-thinking adaptive` / `--thinking adaptive`. It catches factual errors the no-think reviewer misses + but adds ~70% to review cost. Pass `off` to compare runs or save money on large batches. +- **Production explainer thinking is intentionally off.** Adaptive thinking on Sonnet 4.6 measurably improves + factual accuracy (e.g. eliminates the recurring `imul eax, edi, edi` invention) but adds ~11s end-to-end + latency, which is too much for the interactive endpoint. Don't enable it in `app/prompt.yaml` without an + explicit latency/quality decision. +- **Multi-block responses.** When thinking is enabled the API returns thinking blocks before the text block. + `app/explain.py` and `prompt_testing/runner.py` both pick the last text block via `getattr(c, "type", None) == + "text"`. Preserve that pattern for any new code that consumes responses. + ## Code Style Guidelines - Prefer using modern Python 3.13+ type syntax. Good: `a: list[str] | None`. Bad: `a: Optional[List[str]]` From acdb10db63fe26c31accf5eea69c07e76caf14a9 Mon Sep 17 00:00:00 2001 From: mattgodbolt-molty Date: Wed, 6 May 2026 16:35:03 -0500 Subject: [PATCH 3/7] Address Copilot review on PR #17 - app/explain.py: drop the accidental set literal in the model log so it prints the model id rather than `{'claude-...'}`. - app/explain.py: raise RuntimeError when the response contains no text block (e.g. extended thinking exhausting max_tokens), so we don't cache or serve an empty success response. - prompt_testing/runner.py: same empty-output guard, returning success=False with stop_reason and token counts so suite metrics reflect reality instead of silently absorbing empty responses. - app/test_explain.py: add coverage for multi-block responses (thinking + text picks the text block) and the empty-block case (raises RuntimeError). Co-Authored-By: Claude Opus 4.7 (1M context) --- app/explain.py | 9 +++++++- app/test_explain.py | 47 ++++++++++++++++++++++++++++++++++++++++ prompt_testing/runner.py | 12 ++++++++++ 3 files changed, 67 insertions(+), 1 deletion(-) diff --git a/app/explain.py b/app/explain.py index 2b96716..8e443cf 100644 --- a/app/explain.py +++ b/app/explain.py @@ -90,7 +90,7 @@ async def _call_anthropic_api( LOGGER.debug("=== END PROMPT DEBUG ===") # Call Claude API - LOGGER.info("Using Anthropic client with model: %s", {prompt_data["model"]}) + LOGGER.info("Using Anthropic client with model: %s", prompt_data["model"]) api_kwargs: dict = { "model": prompt_data["model"], @@ -110,6 +110,13 @@ async def _call_anthropic_api( # contains thinking blocks before the final text block. text_blocks = [c for c in message.content if getattr(c, "type", None) == "text"] explanation = text_blocks[-1].text.strip() if text_blocks else "" + if not explanation: + # Can happen if extended thinking exhausts max_tokens before any + # text block is emitted. Don't cache or return an empty response. + raise RuntimeError( + f"Claude returned no text content (stop_reason={message.stop_reason}). " + f"If thinking is enabled, max_tokens may be too low." + ) # Extract usage information input_tokens = message.usage.input_tokens diff --git a/app/test_explain.py b/app/test_explain.py index 675a069..de10d4a 100644 --- a/app/test_explain.py +++ b/app/test_explain.py @@ -150,6 +150,53 @@ async def test_process_request_success(self, sample_request, mock_anthropic_clie assert structured_data["compiler"] == "g++" assert structured_data["sourceCode"] == "int square(int x) {\n return x * x;\n}" + @pytest.mark.asyncio + async def test_picks_last_text_block_with_thinking(self, sample_request, noop_metrics): + """When thinking is enabled, the response has thinking blocks before the + final text block. The handler should select the text block, not the + thinking block.""" + thinking_block = MagicMock() + thinking_block.type = "thinking" + thinking_block.thinking = "Let me trace through the imul..." + + text_block = MagicMock() + text_block.type = "text" + text_block.text = "Final explanation here." + + mock_message = MagicMock() + mock_message.content = [thinking_block, text_block] + mock_message.usage = MagicMock(input_tokens=100, output_tokens=50) + mock_message.stop_reason = "end_turn" + + mock_client = MagicMock() + mock_client.messages.create = AsyncMock(return_value=mock_message) + + test_prompt = Prompt(Path("app/prompt.yaml")) + response = await process_request(sample_request, mock_client, test_prompt, noop_metrics) + + assert response.status == "success" + assert response.explanation == "Final explanation here." + + @pytest.mark.asyncio + async def test_raises_when_no_text_block(self, sample_request, noop_metrics): + """A response with no text block (e.g. thinking exhausted max_tokens) + should raise rather than silently caching an empty explanation.""" + thinking_block = MagicMock() + thinking_block.type = "thinking" + thinking_block.thinking = "..." + + mock_message = MagicMock() + mock_message.content = [thinking_block] + mock_message.usage = MagicMock(input_tokens=100, output_tokens=50) + mock_message.stop_reason = "max_tokens" + + mock_client = MagicMock() + mock_client.messages.create = AsyncMock(return_value=mock_message) + + test_prompt = Prompt(Path("app/prompt.yaml")) + with pytest.raises(RuntimeError, match="no text content"): + await process_request(sample_request, mock_client, test_prompt, noop_metrics) + class TestSelectImportantAssembly: """Test the select_important_assembly function.""" diff --git a/prompt_testing/runner.py b/prompt_testing/runner.py index 492fca9..75a6850 100644 --- a/prompt_testing/runner.py +++ b/prompt_testing/runner.py @@ -87,6 +87,18 @@ async def _run_one(self, test_case: dict[str, Any], prompt: Prompt) -> dict[str, elapsed_ms = int((time.time() - start) * 1000) text_blocks = [c for c in msg.content if getattr(c, "type", None) == "text"] explanation = text_blocks[-1].text.strip() if text_blocks else "" + if not explanation: + # Treat empty output as a failure so suite metrics aren't + # skewed. Common cause: thinking exhausting max_tokens + # before any text block is emitted. + return { + "case_id": case_id, + "success": False, + "error": ( + f"empty response (stop_reason={msg.stop_reason}, " + f"in={msg.usage.input_tokens}, out={msg.usage.output_tokens})" + ), + } return { "case_id": case_id, "success": True, From 01d80c89e6ff7b8e0dadf78813df1f39b58c85e3 Mon Sep 17 00:00:00 2001 From: mattgodbolt-molty Date: Wed, 6 May 2026 16:59:21 -0500 Subject: [PATCH 4/7] Address local code review on PR #17 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Findings beyond Copilot's pass: - app/explain.py: empty-text responses no longer raise (which surfaced to users as a generic 500 with no payload). They now return ExplainResponse(status="error") with usage populated, emit a ClaudeExplainEmptyResponse metric, and are not cached so retries hit the API. - prompt_testing/runner.py: empty-response failures now capture input_tokens/output_tokens/elapsed_ms/model and contribute to total_cost_usd. Previously the spent tokens vanished from suite cost. - app/cache.py: cache_data explicitly includes the thinking config. Currently captured via prompt_version's hash, but the documented cache contract is the explicit field list — listing thinking keeps that contract honest. - app/prompt.py: Prompt.__init__ refuses thinking-enabled configs with max_tokens < 4096. Below this, thinking can consume the whole budget and starve the visible text. Fail at load time rather than silently producing empty responses. - prompt_testing/cli.py: distinguish review_failures (correct=None, reviewer infrastructure failed) from errors_found (correct=False, real factual error). Previously conflated under errors_found, which hid the difference between "model said something wrong" and "we couldn't review the response at all." - prompt_testing/cli.py: tighten bare dict annotations to dict[str, Any] for consistency. - CLAUDE.md: note redacted_thinking blocks (rare, encrypted reasoning when safety filters trip) and document the structured-error contract for empty responses. Tests: 94 -> 96 passing (added two for the new Prompt validation, plus updated the empty-text test to assert the structured error response shape). Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 13 ++++++++--- app/cache.py | 5 ++++ app/explain.py | 47 ++++++++++++++++++++++++++++--------- app/prompt.py | 13 +++++++++++ app/test_explain.py | 48 ++++++++++++++++++++++++++++++++++---- prompt_testing/cli.py | 40 ++++++++++++++++++++++--------- prompt_testing/reviewer.py | 45 ++++++++++++++++++++++------------- prompt_testing/runner.py | 13 ++++++++--- 8 files changed, 176 insertions(+), 48 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 7f78162..c6964a2 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -70,8 +70,8 @@ call → response with metrics. See `claude_explain.md` for detailed architectur - **`max_tokens` includes thinking tokens.** When a prompt YAML sets `model.thinking: {type: adaptive}` (or `{type: enabled, budget_tokens: N}`), thinking counts against `max_tokens`. The production value `1536` silently - starves the visible text output on complex cases when thinking is on — bump to ≥4096 (8192 worked in past - experiments) before enabling thinking on any explainer prompt. + starves the visible text output on complex cases when thinking is on. `Prompt.__init__` now refuses to load a + thinking-enabled config with `max_tokens < 4096`; ≥4096 (8192 worked in past experiments) is the floor. - **The reviewer model rejects `temperature`.** Opus 4.7 deprecated the parameter, so `prompt_testing/reviewer.py` omits it. The Sonnet explainer still accepts `temperature`. If you swap the reviewer to a model that requires it, restore the param. @@ -84,7 +84,14 @@ call → response with metrics. See `claude_explain.md` for detailed architectur explicit latency/quality decision. - **Multi-block responses.** When thinking is enabled the API returns thinking blocks before the text block. `app/explain.py` and `prompt_testing/runner.py` both pick the last text block via `getattr(c, "type", None) == - "text"`. Preserve that pattern for any new code that consumes responses. + "text"`. Preserve that pattern for any new code that consumes responses. The API may also return + `redacted_thinking` blocks (encrypted reasoning when safety filters trip); the same filter excludes them + correctly, but be aware "no text block" can mean either max_tokens starvation *or* a redacted-thinking-only + response — the error message is the same. +- **Empty responses are not 500s.** When the model returns no text block, `app/explain.py` returns + `ExplainResponse(status="error")` with `usage` populated and emits `ClaudeExplainEmptyResponse`. The cache + layer skips storing error responses so retries hit the API. Don't change this to raise — the structured error + is what the CE frontend can render. ## Code Style Guidelines diff --git a/app/cache.py b/app/cache.py index f7dd0fc..b612c11 100644 --- a/app/cache.py +++ b/app/cache.py @@ -142,6 +142,11 @@ def generate_cache_key(request: ExplainRequest, prompt: Prompt) -> str: "model": prompt_data["model"], "max_tokens": prompt_data["max_tokens"], "temperature": prompt_data["temperature"], + # Thinking config affects the API response so it must affect the cache + # key. Currently captured via `prompt_version` too, but listing it + # explicitly keeps the contract clear if thinking ever gets set + # outside the YAML (e.g. per-request override). + "thinking": prompt_data.get("thinking"), "system": prompt_data["system"], "messages": prompt_data["messages"], # Include a hash of the prompt config to invalidate cache when prompts change diff --git a/app/explain.py b/app/explain.py index 8e443cf..d973075 100644 --- a/app/explain.py +++ b/app/explain.py @@ -1,4 +1,5 @@ import logging +from typing import Any from anthropic import AsyncAnthropic @@ -59,8 +60,10 @@ async def process_request( # Cache miss or no cache - proceed with Anthropic API call response = await _call_anthropic_api(body, client, prompt, metrics_provider) - # Cache the response (if cache provider is available) - if cache_provider is not None: + # Cache the response (if cache provider is available). Don't cache + # error responses — they consume real tokens but produce no useful + # content, and we want a retry to hit the API rather than the cache. + if cache_provider is not None and response.status == "success": await cache_response(body, prompt, response, cache_provider) metrics_provider.put_metric("ClaudeExplainCacheMiss", 1) @@ -92,7 +95,7 @@ async def _call_anthropic_api( # Call Claude API LOGGER.info("Using Anthropic client with model: %s", prompt_data["model"]) - api_kwargs: dict = { + api_kwargs: dict[str, Any] = { "model": prompt_data["model"], "max_tokens": prompt_data["max_tokens"], "system": prompt_data["system"], @@ -106,22 +109,44 @@ async def _call_anthropic_api( message = await client.messages.create(**api_kwargs) + # Extract usage information + input_tokens = message.usage.input_tokens + output_tokens = message.usage.output_tokens + total_tokens = input_tokens + output_tokens + # Pick the last text block — when thinking is enabled the response # contains thinking blocks before the final text block. text_blocks = [c for c in message.content if getattr(c, "type", None) == "text"] explanation = text_blocks[-1].text.strip() if text_blocks else "" if not explanation: # Can happen if extended thinking exhausts max_tokens before any - # text block is emitted. Don't cache or return an empty response. - raise RuntimeError( - f"Claude returned no text content (stop_reason={message.stop_reason}). " + # text block is emitted. Surface the failure to the caller with + # token usage populated, and emit a metric so this is visible on + # dashboards rather than buried in a generic 500. + message_text = ( + f"Claude returned no text content " + f"(stop_reason={message.stop_reason}, in={input_tokens}, out={output_tokens}). " f"If thinking is enabled, max_tokens may be too low." ) - - # Extract usage information - input_tokens = message.usage.input_tokens - output_tokens = message.usage.output_tokens - total_tokens = input_tokens + output_tokens + LOGGER.warning(message_text) + metrics_provider.set_property("language", body.language) + metrics_provider.set_property("compiler", body.compiler) + metrics_provider.set_property("instructionSet", body.instructionSet or "unknown") + metrics_provider.set_property("cached", "false") + metrics_provider.put_metric("ClaudeExplainRequest", 1) + metrics_provider.put_metric("ClaudeExplainEmptyResponse", 1) + metrics_provider.put_metric("ClaudeExplainInputTokens", input_tokens) + metrics_provider.put_metric("ClaudeExplainOutputTokens", output_tokens) + return ExplainResponse( + status="error", + message=message_text, + model=prompt_data["model"], + usage=TokenUsage( + inputTokens=input_tokens, + outputTokens=output_tokens, + totalTokens=total_tokens, + ), + ) # Calculate costs based on model cost_per_input_token, cost_per_output_token = get_model_cost(prompt_data["model"]) diff --git a/app/prompt.py b/app/prompt.py index 55c80e2..eb0f044 100644 --- a/app/prompt.py +++ b/app/prompt.py @@ -15,6 +15,11 @@ # Constants from explain.py that are needed for data preparation MAX_ASSEMBLY_LINES = 300 # Maximum number of assembly lines to process +# Minimum max_tokens that's safe to pair with extended thinking. Below this, +# adaptive thinking can consume the whole budget on complex inputs and leave +# nothing for the visible response. +MIN_MAX_TOKENS_WITH_THINKING = 4096 + class Prompt: """Manages prompt templates and generates messages for Claude API.""" @@ -40,6 +45,14 @@ def __init__(self, config: dict[str, Any] | Path): # {"type": "enabled", "budget_tokens": 2000}. When set, callers # should drop `temperature` (the API requires it to be unset/1). self.thinking = self.config["model"].get("thinking") + if self.thinking and self.max_tokens < MIN_MAX_TOKENS_WITH_THINKING: + # Adaptive thinking happily consumes the entire token budget on + # complex inputs and leaves nothing for the visible text block. + # Fail loudly here rather than silently returning empty responses. + raise ValueError( + f"max_tokens={self.max_tokens} is too low for thinking; " + f"use at least {MIN_MAX_TOKENS_WITH_THINKING} when model.thinking is set." + ) # Extract prompt templates self.system_prompt_template = self.config["system_prompt"] diff --git a/app/test_explain.py b/app/test_explain.py index de10d4a..753d245 100644 --- a/app/test_explain.py +++ b/app/test_explain.py @@ -178,9 +178,10 @@ async def test_picks_last_text_block_with_thinking(self, sample_request, noop_me assert response.explanation == "Final explanation here." @pytest.mark.asyncio - async def test_raises_when_no_text_block(self, sample_request, noop_metrics): + async def test_returns_error_when_no_text_block(self, sample_request, noop_metrics): """A response with no text block (e.g. thinking exhausted max_tokens) - should raise rather than silently caching an empty explanation.""" + should return status='error' with token usage populated, not raise. + Production must surface a structured error rather than a generic 500.""" thinking_block = MagicMock() thinking_block.type = "thinking" thinking_block.thinking = "..." @@ -194,8 +195,47 @@ async def test_raises_when_no_text_block(self, sample_request, noop_metrics): mock_client.messages.create = AsyncMock(return_value=mock_message) test_prompt = Prompt(Path("app/prompt.yaml")) - with pytest.raises(RuntimeError, match="no text content"): - await process_request(sample_request, mock_client, test_prompt, noop_metrics) + response = await process_request(sample_request, mock_client, test_prompt, noop_metrics) + + assert response.status == "error" + assert response.explanation is None + assert "no text content" in response.message + # Real tokens were spent — surface them so callers can see the cost. + assert response.usage is not None + assert response.usage.inputTokens == 100 + assert response.usage.outputTokens == 50 + + +class TestPromptValidation: + """Validation rules enforced at Prompt construction.""" + + def test_thinking_requires_min_max_tokens(self): + """A prompt YAML that enables thinking must also bump max_tokens.""" + with pytest.raises(ValueError, match="too low for thinking"): + Prompt( + { + "model": {"name": "test", "max_tokens": 1536, "thinking": {"type": "adaptive"}}, + "system_prompt": "", + "user_prompt": "", + "assistant_prefill": "", + "audience_levels": {}, + "explanation_types": {}, + } + ) + + def test_thinking_with_sufficient_max_tokens_is_ok(self): + """At/above the floor (4096), thinking-enabled prompts load fine.""" + prompt = Prompt( + { + "model": {"name": "test", "max_tokens": 4096, "thinking": {"type": "adaptive"}}, + "system_prompt": "", + "user_prompt": "", + "assistant_prefill": "", + "audience_levels": {}, + "explanation_types": {}, + } + ) + assert prompt.thinking == {"type": "adaptive"} class TestSelectImportantAssembly: diff --git a/prompt_testing/cli.py b/prompt_testing/cli.py index de5b1c0..452e050 100644 --- a/prompt_testing/cli.py +++ b/prompt_testing/cli.py @@ -16,6 +16,7 @@ import sys from collections import defaultdict from pathlib import Path +from typing import Any import click from dotenv import load_dotenv @@ -202,7 +203,7 @@ def compilers(ctx, language, search, limit): # noqa: ARG001 click.echo(f"... and {len(results) - limit} more") -async def _run_reviews(project_root: Path, results: dict, model: str, thinking: dict | None = None) -> dict: +async def _run_reviews(project_root: Path, results: dict, model: str, thinking: dict[str, Any] | None = None) -> dict: """Run correctness reviews on all successful results.""" from prompt_testing.reviewer import CorrectnessReviewer @@ -216,6 +217,7 @@ async def _run_reviews(project_root: Path, results: dict, model: str, thinking: review_cost = 0.0 errors_found = 0 + review_failures = 0 cost_per_input_token, cost_per_output_token = get_model_cost(model) for i, result in enumerate(successful, 1): @@ -226,10 +228,19 @@ async def _run_reviews(project_root: Path, results: dict, model: str, thinking: review = await reviewer.review_test_result(case, result["explanation"]) result["review"] = review - status = "✓" if review.get("correct") else "✗" - n_issues = len(review.get("issues", [])) - if not review.get("correct"): + # `correct`: True = passed, False = real factual error, + # None = reviewer infrastructure failure (parse/empty response). + # Distinguish so suite metrics don't conflate the two. + correct = review.get("correct") + if correct is True: + status = "✓" + elif correct is False: + status = "✗" errors_found += 1 + else: + status = "?" + review_failures += 1 + n_issues = len(review.get("issues", [])) cost = ( review.get("reviewer_input_tokens", 0) * cost_per_input_token + review.get("reviewer_output_tokens", 0) * cost_per_output_token @@ -241,26 +252,33 @@ async def _run_reviews(project_root: Path, results: dict, model: str, thinking: results["review_cost_usd"] = round(review_cost, 6) results["total_cost_usd"] = round(results["total_cost_usd"] + review_cost, 6) results["errors_found"] = errors_found + results["review_failures"] = review_failures return results -def _print_review_summary(results: dict) -> None: +def _print_review_summary(results: dict[str, Any]) -> None: """Print a summary of correctness reviews.""" reviewed = [r for r in results["results"] if r.get("review")] - correct = sum(1 for r in reviewed if r["review"].get("correct")) - incorrect = len(reviewed) - correct + passed = sum(1 for r in reviewed if r["review"].get("correct") is True) + failed = sum(1 for r in reviewed if r["review"].get("correct") is False) + review_failures = sum(1 for r in reviewed if r["review"].get("correct") is None) - click.echo(f"\nCorrectness: {correct}/{len(reviewed)} passed") - if incorrect: - click.echo(f"\n⚠ {incorrect} case(s) with issues:") + click.echo(f"\nCorrectness: {passed}/{len(reviewed)} passed") + if failed: + click.echo(f"\n⚠ {failed} case(s) with issues:") for r in reviewed: review = r["review"] - if not review.get("correct"): + if review.get("correct") is False: click.echo(f"\n {r['case_id']}:") for issue in review.get("issues", []): sev = "🔴" if issue["severity"] == "error" else "🟡" click.echo(f" {sev} {issue['claim']}") click.echo(f" → {issue['correction']}") + if review_failures: + click.echo(f"\n⚠ {review_failures} review(s) failed to run (likely max_tokens starvation):") + for r in reviewed: + if r["review"].get("correct") is None: + click.echo(f" {r['case_id']}: {r['review'].get('summary', '?')}") click.echo(f"\nReview cost: ${results.get('review_cost_usd', 0):.4f} ({results.get('review_model', '?')})") diff --git a/prompt_testing/reviewer.py b/prompt_testing/reviewer.py index 13093d2..243dae7 100644 --- a/prompt_testing/reviewer.py +++ b/prompt_testing/reviewer.py @@ -120,22 +120,35 @@ async def review( text_blocks = [c for c in msg.content if getattr(c, "type", None) == "text"] text = text_blocks[-1].text.strip() if text_blocks else "" - # Parse JSON response - try: - result = json.loads(text) - except json.JSONDecodeError: - # Try to extract JSON from markdown fencing - if "```" in text: - json_part = text.split("```")[1] - if json_part.startswith("json"): - json_part = json_part[4:] - result = json.loads(json_part.strip()) - else: - result = { - "correct": None, - "issues": [], - "summary": f"Failed to parse reviewer response: {text[:200]}", - } + if not text: + # Likely thinking exhausted max_tokens before any text block. + # Surface stop_reason/usage so this is diagnosable rather than a + # generic JSON parse failure. + result: dict[str, Any] = { + "correct": None, + "issues": [], + "summary": ( + f"Reviewer returned no text (stop_reason={msg.stop_reason}, " + f"in={msg.usage.input_tokens}, out={msg.usage.output_tokens})." + ), + } + else: + # Parse JSON response + try: + result = json.loads(text) + except json.JSONDecodeError: + # Try to extract JSON from markdown fencing + if "```" in text: + json_part = text.split("```")[1] + if json_part.startswith("json"): + json_part = json_part[4:] + result = json.loads(json_part.strip()) + else: + result = { + "correct": None, + "issues": [], + "summary": f"Failed to parse reviewer response: {text[:200]}", + } result["reviewer_model"] = self.model result["reviewer_input_tokens"] = msg.usage.input_tokens diff --git a/prompt_testing/runner.py b/prompt_testing/runner.py index 75a6850..cb874bb 100644 --- a/prompt_testing/runner.py +++ b/prompt_testing/runner.py @@ -90,7 +90,8 @@ async def _run_one(self, test_case: dict[str, Any], prompt: Prompt) -> dict[str, if not explanation: # Treat empty output as a failure so suite metrics aren't # skewed. Common cause: thinking exhausting max_tokens - # before any text block is emitted. + # before any text block is emitted. Tokens were still + # spent — capture them so cost reporting stays accurate. return { "case_id": case_id, "success": False, @@ -98,6 +99,10 @@ async def _run_one(self, test_case: dict[str, Any], prompt: Prompt) -> dict[str, f"empty response (stop_reason={msg.stop_reason}, " f"in={msg.usage.input_tokens}, out={msg.usage.output_tokens})" ), + "model": prompt_data["model"], + "input_tokens": msg.usage.input_tokens, + "output_tokens": msg.usage.output_tokens, + "elapsed_ms": elapsed_ms, } return { "case_id": case_id, @@ -144,9 +149,11 @@ async def run_async( print(f" [{i}/{len(cases)}] {status} {result['case_id']} ({tokens})") successful = [r for r in results if r["success"]] + # Cost includes failures that consumed tokens (e.g. thinking exhausted + # max_tokens before any text was emitted) — those aren't free. total_cost = sum( - r["input_tokens"] * 3 / 1e6 + r["output_tokens"] * 15 / 1e6 # Sonnet pricing - for r in successful + r.get("input_tokens", 0) * 3 / 1e6 + r.get("output_tokens", 0) * 15 / 1e6 # Sonnet pricing + for r in results ) return { From cbb0f714533c09602cbb1af734b1e484ae735d21 Mon Sep 17 00:00:00 2001 From: mattgodbolt-molty Date: Wed, 6 May 2026 17:26:38 -0500 Subject: [PATCH 5/7] Ignore Claude Code's local scheduled_tasks.lock This file is per-user session state and shouldn't be tracked. Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 6fd486a..b34d6e4 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,6 @@ node_modules # Package build artifacts *.egg-info/ *.backup + +# Claude Code local session state +.claude/scheduled_tasks.lock From 49df42b0d4e16104ccb80344a794656821c7fa5b Mon Sep 17 00:00:00 2001 From: mattgodbolt-molty Date: Wed, 6 May 2026 17:45:04 -0500 Subject: [PATCH 6/7] Address Copilot follow-up review on PR #17 - app/cache.py: only include `thinking` in cache_data when it's set. Always serialising the field (even as null) would silently invalidate every existing S3 cache entry on deploy, costing a wave of fresh API calls for cases where nothing about the prompt has changed. - prompt_testing/runner.py: compute total_cost via get_model_cost(prompt.model) rather than hardcoded Sonnet rates. The runner can load arbitrary prompt YAMLs, so hardcoding misreports cost whenever the prompt model isn't Sonnet. Co-Authored-By: Claude Opus 4.7 (1M context) --- app/cache.py | 10 +++++----- prompt_testing/runner.py | 6 +++++- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/app/cache.py b/app/cache.py index b612c11..6ba2cec 100644 --- a/app/cache.py +++ b/app/cache.py @@ -142,16 +142,16 @@ def generate_cache_key(request: ExplainRequest, prompt: Prompt) -> str: "model": prompt_data["model"], "max_tokens": prompt_data["max_tokens"], "temperature": prompt_data["temperature"], - # Thinking config affects the API response so it must affect the cache - # key. Currently captured via `prompt_version` too, but listing it - # explicitly keeps the contract clear if thinking ever gets set - # outside the YAML (e.g. per-request override). - "thinking": prompt_data.get("thinking"), "system": prompt_data["system"], "messages": prompt_data["messages"], # Include a hash of the prompt config to invalidate cache when prompts change "prompt_version": _get_prompt_config_hash(prompt.config), } + # Thinking config affects the API response so it must affect the cache + # key. Only include when set to avoid invalidating the existing S3 cache + # for production prompts that don't use thinking. + if prompt_data.get("thinking") is not None: + cache_data["thinking"] = prompt_data["thinking"] # Convert to JSON with consistent ordering for deterministic hashing cache_json = json.dumps(cache_data, sort_keys=True, ensure_ascii=True, separators=(",", ":")) diff --git a/prompt_testing/runner.py b/prompt_testing/runner.py index cb874bb..e678447 100644 --- a/prompt_testing/runner.py +++ b/prompt_testing/runner.py @@ -15,6 +15,7 @@ from app.explain_api import AssemblyItem, ExplainRequest from app.explanation_types import AudienceLevel, ExplanationType +from app.model_costs import get_model_cost from app.prompt import Prompt from prompt_testing.file_utils import load_all_test_cases @@ -151,8 +152,11 @@ async def run_async( successful = [r for r in results if r["success"]] # Cost includes failures that consumed tokens (e.g. thinking exhausted # max_tokens before any text was emitted) — those aren't free. + # Use the prompt's actual model rather than hardcoded rates so the + # number stays correct across explainer-model experiments. + cost_per_input_token, cost_per_output_token = get_model_cost(prompt.model) total_cost = sum( - r.get("input_tokens", 0) * 3 / 1e6 + r.get("output_tokens", 0) * 15 / 1e6 # Sonnet pricing + r.get("input_tokens", 0) * cost_per_input_token + r.get("output_tokens", 0) * cost_per_output_token for r in results ) From df76c7a89cb79b880478f201e4d94ec33c61ee23 Mon Sep 17 00:00:00 2001 From: mattgodbolt-molty Date: Wed, 6 May 2026 17:45:23 -0500 Subject: [PATCH 7/7] =?UTF-8?q?Revert=20cache=20key=20conditional=20?= =?UTF-8?q?=E2=80=94=20unconditional=20thinking=20is=20simpler?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per maintainer preference, accept the one-time S3 cache flush in exchange for unconditionally including `thinking` in cache_data. Keeps the cache-key contract aligned with the docstring (explicit field list). Co-Authored-By: Claude Opus 4.7 (1M context) --- app/cache.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/app/cache.py b/app/cache.py index 6ba2cec..173bb59 100644 --- a/app/cache.py +++ b/app/cache.py @@ -142,16 +142,12 @@ def generate_cache_key(request: ExplainRequest, prompt: Prompt) -> str: "model": prompt_data["model"], "max_tokens": prompt_data["max_tokens"], "temperature": prompt_data["temperature"], + "thinking": prompt_data.get("thinking"), "system": prompt_data["system"], "messages": prompt_data["messages"], # Include a hash of the prompt config to invalidate cache when prompts change "prompt_version": _get_prompt_config_hash(prompt.config), } - # Thinking config affects the API response so it must affect the cache - # key. Only include when set to avoid invalidating the existing S3 cache - # for production prompts that don't use thinking. - if prompt_data.get("thinking") is not None: - cache_data["thinking"] = prompt_data["thinking"] # Convert to JSON with consistent ordering for deterministic hashing cache_json = json.dumps(cache_data, sort_keys=True, ensure_ascii=True, separators=(",", ":"))