From 2cc14022d0aedac8bea5eac327fd2fbc4f0cefe7 Mon Sep 17 00:00:00 2001 From: mattgodbolt-molty Date: Wed, 6 May 2026 18:07:24 -0500 Subject: [PATCH 1/3] Expose extended thinking as a per-request flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `useThinking: bool = false` to ExplainRequest. When true the explainer enables adaptive extended thinking and bumps max_tokens to the documented floor; otherwise behaviour is identical to today. Why opt-in rather than default-on: thinking measurably improves correctness (errors 5 -> 0 across the medium-sized cases in our 21-case suite, eliminates the recurring imul-eax-edi-edi invention) but pushes the largest queries past the 30s API Gateway v2 / Lambda timeout — about 3/16 of our reasonably-sized cases would 504. Per- request opt-in lets the CE frontend choose where the quality win is worth the latency, with no risk of timing out users who don't ask. Implementation: - ExplainRequest gains a `useThinking` field (camelCase to match the rest of the API; default false). - New `Prompt.generate_messages_for_request()` is the single entry point that applies per-request overrides on top of the YAML-loaded defaults. Currently the only override is thinking, but it's the natural seam for future per-request knobs. - Both `_call_anthropic_api` and `generate_cache_key` route through the new method, so cache keys split on `useThinking` and the API call always sees the same payload that was hashed. - CLAUDE.md updated to document the opt-in pattern, the 30s timeout ceiling, and the longer-term architectural fixes if we ever want default-on (Lambda Function URL response streaming or async pattern). - README documents the new request field. Tests: 96 -> 98 passing. New tests cover (a) `useThinking=True` overrides prompt config and bumps max_tokens, (b) cache keys split on the flag. Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 11 +++++++---- README.md | 3 ++- app/cache.py | 6 ++++-- app/explain.py | 9 ++++++--- app/explain_api.py | 9 +++++++++ app/prompt.py | 17 +++++++++++++++++ app/test_explain.py | 32 ++++++++++++++++++++++++++++++++ 7 files changed, 77 insertions(+), 10 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index c6964a2..2d8993b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -78,10 +78,13 @@ call → response with metrics. See `claude_explain.md` for detailed architectur - **Reviewer thinking is on by default.** `prompt-test run --review` and `prompt-test review` default to `--reviewer-thinking adaptive` / `--thinking adaptive`. It catches factual errors the no-think reviewer misses but adds ~70% to review cost. Pass `off` to compare runs or save money on large batches. -- **Production explainer thinking is intentionally off.** Adaptive thinking on Sonnet 4.6 measurably improves - factual accuracy (e.g. eliminates the recurring `imul eax, edi, edi` invention) but adds ~11s end-to-end - latency, which is too much for the interactive endpoint. Don't enable it in `app/prompt.yaml` without an - explicit latency/quality decision. +- **Production explainer thinking is opt-in per request.** Adaptive thinking on Sonnet 4.6 measurably improves + factual accuracy (e.g. eliminates the recurring `imul eax, edi, edi` invention) but adds ~10s+ latency on + small/medium queries and can push large queries past the **30s Lambda + API Gateway v2 timeout** entirely (no + raising that — HTTP API has a 30s ceiling). Callers opt in by sending `useThinking: true` on the request; the + default (no field, or `false`) preserves current latency. Cache keys split on the flag, so on/off requests + cache independently. If we ever want default-on, we need either a smaller fixed thinking budget *or* an async + response architecture (Lambda Function URL with response streaming, SQS poll, etc.). - **Multi-block responses.** When thinking is enabled the API returns thinking blocks before the text block. `app/explain.py` and `prompt_testing/runner.py` both pick the last text block via `getattr(c, "type", None) == "text"`. Preserve that pattern for any new code that consumes responses. The API may also return diff --git a/README.md b/README.md index 2d27ac5..8a540b2 100644 --- a/README.md +++ b/README.md @@ -124,7 +124,8 @@ The API uses **camelCase** for all field names to maintain consistency with Java "labels": [] } ], - "bypassCache": false // Optional: set to true to skip cache reads + "bypassCache": false, // Optional: set to true to skip cache reads + "useThinking": false // Optional: enable extended thinking for higher accuracy at cost of latency } ``` diff --git a/app/cache.py b/app/cache.py index 173bb59..0702951 100644 --- a/app/cache.py +++ b/app/cache.py @@ -134,8 +134,10 @@ def generate_cache_key(request: ExplainRequest, prompt: Prompt) -> str: Returns: A SHA-256 hash string to use as cache key """ - # Generate the full message data that would be sent to Anthropic - prompt_data = prompt.generate_messages(request) + # Generate the full message data that would be sent to Anthropic, + # including any per-request overrides (e.g. useThinking) so cache hits + # split correctly between thinking-on and thinking-off requests. + prompt_data = prompt.generate_messages_for_request(request) # Create a deterministic representation of all cache-affecting data cache_data = { diff --git a/app/explain.py b/app/explain.py index d973075..0f6ee0b 100644 --- a/app/explain.py +++ b/app/explain.py @@ -80,8 +80,7 @@ async def _call_anthropic_api( This is the original process_request logic, extracted for clarity. """ - # Generate messages using the Prompt instance - prompt_data = prompt.generate_messages(body) + prompt_data = prompt.generate_messages_for_request(body) # Debug logging for prompts LOGGER.debug(f"=== PROMPT DEBUG FOR {body.explanation.value.upper()} (audience: {body.audience.value}) ===") @@ -93,7 +92,11 @@ async def _call_anthropic_api( LOGGER.debug("=== END PROMPT DEBUG ===") # Call Claude API - LOGGER.info("Using Anthropic client with model: %s", prompt_data["model"]) + LOGGER.info( + "Using Anthropic client with model: %s (thinking=%s)", + prompt_data["model"], + bool(prompt_data.get("thinking")), + ) api_kwargs: dict[str, Any] = { "model": prompt_data["model"], diff --git a/app/explain_api.py b/app/explain_api.py index 5cb08f0..954d1c6 100644 --- a/app/explain_api.py +++ b/app/explain_api.py @@ -57,6 +57,15 @@ class ExplainRequest(BaseModel): default=ExplanationType.ASSEMBLY, description="Type of explanation to generate" ) bypassCache: bool = Field(default=False, description="If true, skip reading from cache but still write to cache") + useThinking: bool = Field( + default=False, + description=( + "If true, enable adaptive extended thinking on the explainer. " + "Improves correctness on complex inputs at the cost of significant " + "extra latency (often +10s, sometimes much more on large inputs). " + "May exceed the Lambda 30s timeout on the largest queries." + ), + ) @property def instruction_set_with_default(self) -> str: diff --git a/app/prompt.py b/app/prompt.py index eb0f044..e1c82df 100644 --- a/app/prompt.py +++ b/app/prompt.py @@ -235,6 +235,23 @@ def prepare_structured_data(self, request: ExplainRequest) -> dict[str, Any]: return structured_data + def generate_messages_for_request(self, request: ExplainRequest) -> dict[str, Any]: + """Generate API payload, applying per-request overrides. + + This is the entry point used by both the explain handler and the + cache-key generator so they stay in sync. Currently the only + override is ``useThinking``: when set, enable adaptive extended + thinking and bump ``max_tokens`` to satisfy the floor. + """ + prompt_data = self.generate_messages(request) + if request.useThinking: + prompt_data = { + **prompt_data, + "thinking": {"type": "adaptive"}, + "max_tokens": max(prompt_data["max_tokens"], MIN_MAX_TOKENS_WITH_THINKING), + } + return prompt_data + def generate_messages(self, request: ExplainRequest) -> dict[str, Any]: """Generate the complete message structure for Claude API. diff --git a/app/test_explain.py b/app/test_explain.py index 753d245..e484239 100644 --- a/app/test_explain.py +++ b/app/test_explain.py @@ -177,6 +177,38 @@ async def test_picks_last_text_block_with_thinking(self, sample_request, noop_me assert response.status == "success" assert response.explanation == "Final explanation here." + @pytest.mark.asyncio + async def test_use_thinking_overrides_prompt(self, sample_request, mock_anthropic_client, noop_metrics): + """Setting `useThinking=True` on the request must enable adaptive + thinking and bump max_tokens to satisfy the floor, even when the + loaded prompt has no thinking config.""" + sample_request.useThinking = True + + test_prompt = Prompt(Path("app/prompt.yaml")) + await process_request(sample_request, mock_anthropic_client, test_prompt, noop_metrics) + + _args, kwargs = mock_anthropic_client.messages.create.call_args + assert kwargs.get("thinking") == {"type": "adaptive"} + # When thinking is set, temperature must NOT be passed (API rejects it). + assert "temperature" not in kwargs + # max_tokens bumped to at least the documented floor. + assert kwargs["max_tokens"] >= 4096 + + @pytest.mark.asyncio + async def test_use_thinking_changes_cache_key(self, sample_request): + """Cache keys must split on `useThinking` so a thinking-enabled + request doesn't collide with a thinking-off cached response.""" + from app.cache import generate_cache_key + + test_prompt = Prompt(Path("app/prompt.yaml")) + + sample_request.useThinking = False + key_off = generate_cache_key(sample_request, test_prompt) + sample_request.useThinking = True + key_on = generate_cache_key(sample_request, test_prompt) + + assert key_off != key_on + @pytest.mark.asyncio async def test_returns_error_when_no_text_block(self, sample_request, noop_metrics): """A response with no text block (e.g. thinking exhausted max_tokens) From b447d9b7f9ef32b95ed5289879df0c63448668d7 Mon Sep 17 00:00:00 2001 From: mattgodbolt-molty Date: Wed, 6 May 2026 18:15:29 -0500 Subject: [PATCH 2/3] Address subagent review on PR #18 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Findings from a follow-up review: - Rename `Prompt.generate_messages_for_request` -> `build_api_payload`. The old name read as "generate messages, given a request" which is what `generate_messages` already does. The actual purpose is to apply per-request runtime overrides on top of the YAML defaults. - Always return a fresh dict from `build_api_payload` so callers can mutate without affecting prompt internals or each other. The previous shape returned the original dict in the no-override path and a shallow copy in the override path — a foot-gun in waiting. - Move `test_use_thinking_changes_cache_key` from `test_explain.py` to `test_cache.py` next to its sibling `test_generate_cache_key_bypass_cache_ignored`. Both tests answer "what affects the cache key" and belong together. - Add `test_generate_cache_key_use_thinking_default_matches_omitted` to lock in the invariant the PR depends on: a request with `useThinking` absent (Pydantic default) must produce the same key as one with `useThinking=False` explicit. Without this, a future refactor could silently invalidate the existing S3 cache. - Use `MIN_MAX_TOKENS_WITH_THINKING` constant in the test rather than hard-coding `4096`, so a future bump to the floor doesn't silently miss the test. Tests: 98 -> 99 passing. Co-Authored-By: Claude Opus 4.7 (1M context) --- app/cache.py | 7 +++---- app/explain.py | 2 +- app/prompt.py | 16 ++++++++-------- app/test_cache.py | 30 ++++++++++++++++++++++++++++++ app/test_explain.py | 19 ++----------------- 5 files changed, 44 insertions(+), 30 deletions(-) diff --git a/app/cache.py b/app/cache.py index 0702951..c9c00c8 100644 --- a/app/cache.py +++ b/app/cache.py @@ -134,10 +134,9 @@ def generate_cache_key(request: ExplainRequest, prompt: Prompt) -> str: Returns: A SHA-256 hash string to use as cache key """ - # Generate the full message data that would be sent to Anthropic, - # including any per-request overrides (e.g. useThinking) so cache hits - # split correctly between thinking-on and thinking-off requests. - prompt_data = prompt.generate_messages_for_request(request) + # Use the same payload-building path the API call uses so cache hits + # split correctly on per-request overrides (e.g. useThinking). + prompt_data = prompt.build_api_payload(request) # Create a deterministic representation of all cache-affecting data cache_data = { diff --git a/app/explain.py b/app/explain.py index 0f6ee0b..6db1c65 100644 --- a/app/explain.py +++ b/app/explain.py @@ -80,7 +80,7 @@ async def _call_anthropic_api( This is the original process_request logic, extracted for clarity. """ - prompt_data = prompt.generate_messages_for_request(body) + prompt_data = prompt.build_api_payload(body) # Debug logging for prompts LOGGER.debug(f"=== PROMPT DEBUG FOR {body.explanation.value.upper()} (audience: {body.audience.value}) ===") diff --git a/app/prompt.py b/app/prompt.py index e1c82df..bbbae78 100644 --- a/app/prompt.py +++ b/app/prompt.py @@ -235,21 +235,21 @@ def prepare_structured_data(self, request: ExplainRequest) -> dict[str, Any]: return structured_data - def generate_messages_for_request(self, request: ExplainRequest) -> dict[str, Any]: - """Generate API payload, applying per-request overrides. + def build_api_payload(self, request: ExplainRequest) -> dict[str, Any]: + """Build the API call payload, applying per-request runtime overrides. This is the entry point used by both the explain handler and the cache-key generator so they stay in sync. Currently the only override is ``useThinking``: when set, enable adaptive extended thinking and bump ``max_tokens`` to satisfy the floor. + + Always returns a fresh dict so callers can mutate without affecting + prompt internals or each other. """ - prompt_data = self.generate_messages(request) + prompt_data = dict(self.generate_messages(request)) if request.useThinking: - prompt_data = { - **prompt_data, - "thinking": {"type": "adaptive"}, - "max_tokens": max(prompt_data["max_tokens"], MIN_MAX_TOKENS_WITH_THINKING), - } + prompt_data["thinking"] = {"type": "adaptive"} + prompt_data["max_tokens"] = max(prompt_data["max_tokens"], MIN_MAX_TOKENS_WITH_THINKING) return prompt_data def generate_messages(self, request: ExplainRequest) -> dict[str, Any]: diff --git a/app/test_cache.py b/app/test_cache.py index ad5a531..2b11b92 100644 --- a/app/test_cache.py +++ b/app/test_cache.py @@ -265,6 +265,36 @@ def test_generate_cache_key_bypass_cache_ignored(self, test_prompt): key2 = generate_cache_key(request2, test_prompt) assert key1 == key2 # bypass_cache shouldn't affect cache key + def test_generate_cache_key_use_thinking_splits(self, test_prompt): + """useThinking changes the API call payload, so it must change the + cache key. Confirms cache hits don't bleed across thinking-on/off.""" + base_kwargs = { + "language": "c++", + "compiler": "g++", + "code": "int test() { return 0; }", + "asm": [AssemblyItem(text="test:", source=None)], + } + request_off = ExplainRequest(**base_kwargs, useThinking=False) + request_on = ExplainRequest(**base_kwargs, useThinking=True) + + assert generate_cache_key(request_off, test_prompt) != generate_cache_key(request_on, test_prompt) + + def test_generate_cache_key_use_thinking_default_matches_omitted(self, test_prompt): + """A request with useThinking absent (Pydantic default) must produce + the same key as one with useThinking=False explicit. This is the + invariant that lets us add the field without invalidating the + existing S3 cache.""" + base_kwargs = { + "language": "c++", + "compiler": "g++", + "code": "int test() { return 0; }", + "asm": [AssemblyItem(text="test:", source=None)], + } + request_default = ExplainRequest(**base_kwargs) # useThinking absent + request_explicit = ExplainRequest(**base_kwargs, useThinking=False) + + assert generate_cache_key(request_default, test_prompt) == generate_cache_key(request_explicit, test_prompt) + class TestCacheHighLevelFunctions: """Test the high-level cache functions.""" diff --git a/app/test_explain.py b/app/test_explain.py index e484239..b0d6150 100644 --- a/app/test_explain.py +++ b/app/test_explain.py @@ -11,7 +11,7 @@ SourceMapping, ) from app.metrics import NoopMetricsProvider -from app.prompt import MAX_ASSEMBLY_LINES, Prompt +from app.prompt import MAX_ASSEMBLY_LINES, MIN_MAX_TOKENS_WITH_THINKING, Prompt @pytest.fixture @@ -192,22 +192,7 @@ async def test_use_thinking_overrides_prompt(self, sample_request, mock_anthropi # When thinking is set, temperature must NOT be passed (API rejects it). assert "temperature" not in kwargs # max_tokens bumped to at least the documented floor. - assert kwargs["max_tokens"] >= 4096 - - @pytest.mark.asyncio - async def test_use_thinking_changes_cache_key(self, sample_request): - """Cache keys must split on `useThinking` so a thinking-enabled - request doesn't collide with a thinking-off cached response.""" - from app.cache import generate_cache_key - - test_prompt = Prompt(Path("app/prompt.yaml")) - - sample_request.useThinking = False - key_off = generate_cache_key(sample_request, test_prompt) - sample_request.useThinking = True - key_on = generate_cache_key(sample_request, test_prompt) - - assert key_off != key_on + assert kwargs["max_tokens"] >= MIN_MAX_TOKENS_WITH_THINKING @pytest.mark.asyncio async def test_returns_error_when_no_text_block(self, sample_request, noop_metrics): From 98154cf2b267dadc08a9725c0587b1875cae337b Mon Sep 17 00:00:00 2001 From: mattgodbolt-molty Date: Wed, 6 May 2026 18:42:59 -0500 Subject: [PATCH 3/3] Make build_api_payload the single source of truth for API kwargs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per Copilot review: previously `build_api_payload()` returned a dict that included both `thinking` and `temperature` when useThinking was true, and `_call_anthropic_api()` had to re-derive the actual API kwargs (dropping `temperature` because the API rejects it alongside thinking). The cache key meanwhile hashed the un-derived dict — so the cache key included `temperature` for requests that wouldn't actually send it. Today this happened to be deterministic and internally consistent, but it's drift-in-waiting: any future per-request override with exclusivity rules would face the same trap. Refactor so `build_api_payload()` returns exactly the kwargs `messages.create` will receive, including the thinking-vs-temperature exclusivity. Then both call sites become trivial: - `_call_anthropic_api`: `await client.messages.create(**payload)` - `generate_cache_key`: `{**payload, "prompt_version": ...}` Net diff: -7 lines, fewer code paths to keep aligned. Cache key shape changes (no `temperature` field when thinking is on, no null `thinking` field when off), which invalidates existing S3 cache entries — acceptable per maintainer's stated tolerance for cache flushes when the alternative is uglier code. Co-Authored-By: Claude Opus 4.7 (1M context) --- app/cache.py | 16 ++++------------ app/explain.py | 20 ++++---------------- app/prompt.py | 37 +++++++++++++++++++++++++------------ 3 files changed, 33 insertions(+), 40 deletions(-) diff --git a/app/cache.py b/app/cache.py index c9c00c8..867b270 100644 --- a/app/cache.py +++ b/app/cache.py @@ -134,19 +134,11 @@ def generate_cache_key(request: ExplainRequest, prompt: Prompt) -> str: Returns: A SHA-256 hash string to use as cache key """ - # Use the same payload-building path the API call uses so cache hits - # split correctly on per-request overrides (e.g. useThinking). - prompt_data = prompt.build_api_payload(request) - - # Create a deterministic representation of all cache-affecting data + # The payload IS the cache-affecting data — the same dict that gets + # spread into messages.create. Add prompt_version on top so changes + # to the YAML invalidate cached responses. cache_data = { - "model": prompt_data["model"], - "max_tokens": prompt_data["max_tokens"], - "temperature": prompt_data["temperature"], - "thinking": prompt_data.get("thinking"), - "system": prompt_data["system"], - "messages": prompt_data["messages"], - # Include a hash of the prompt config to invalidate cache when prompts change + **prompt.build_api_payload(request), "prompt_version": _get_prompt_config_hash(prompt.config), } diff --git a/app/explain.py b/app/explain.py index 6db1c65..bcead24 100644 --- a/app/explain.py +++ b/app/explain.py @@ -1,5 +1,4 @@ import logging -from typing import Any from anthropic import AsyncAnthropic @@ -91,26 +90,15 @@ async def _call_anthropic_api( LOGGER.debug(message) LOGGER.debug("=== END PROMPT DEBUG ===") - # Call Claude API + # Call Claude API. `prompt_data` is already exactly the kwargs + # `messages.create` expects — `build_api_payload` resolved the + # thinking-vs-temperature exclusivity for us. LOGGER.info( "Using Anthropic client with model: %s (thinking=%s)", prompt_data["model"], bool(prompt_data.get("thinking")), ) - - api_kwargs: dict[str, Any] = { - "model": prompt_data["model"], - "max_tokens": prompt_data["max_tokens"], - "system": prompt_data["system"], - "messages": prompt_data["messages"], - } - if prompt_data.get("thinking"): - # Extended thinking: API requires temperature to be unset. - api_kwargs["thinking"] = prompt_data["thinking"] - else: - api_kwargs["temperature"] = prompt_data["temperature"] - - message = await client.messages.create(**api_kwargs) + message = await client.messages.create(**prompt_data) # Extract usage information input_tokens = message.usage.input_tokens diff --git a/app/prompt.py b/app/prompt.py index bbbae78..87b5428 100644 --- a/app/prompt.py +++ b/app/prompt.py @@ -236,21 +236,34 @@ def prepare_structured_data(self, request: ExplainRequest) -> dict[str, Any]: return structured_data def build_api_payload(self, request: ExplainRequest) -> dict[str, Any]: - """Build the API call payload, applying per-request runtime overrides. + """Return the exact kwargs to pass to ``client.messages.create``. - This is the entry point used by both the explain handler and the - cache-key generator so they stay in sync. Currently the only - override is ``useThinking``: when set, enable adaptive extended - thinking and bump ``max_tokens`` to satisfy the floor. + Single source of truth: the API call splats this dict directly, + and the cache-key generator hashes it. Per-request runtime + overrides (currently just ``useThinking``) are resolved here, and + mutually-exclusive fields like ``thinking`` vs ``temperature`` + are decided here too — callers don't need to re-derive anything. - Always returns a fresh dict so callers can mutate without affecting - prompt internals or each other. + Always returns a fresh dict so callers can mutate freely. """ - prompt_data = dict(self.generate_messages(request)) - if request.useThinking: - prompt_data["thinking"] = {"type": "adaptive"} - prompt_data["max_tokens"] = max(prompt_data["max_tokens"], MIN_MAX_TOKENS_WITH_THINKING) - return prompt_data + base = self.generate_messages(request) + payload: dict[str, Any] = { + "model": base["model"], + "max_tokens": base["max_tokens"], + "system": base["system"], + "messages": base["messages"], + } + # Resolve thinking config: per-request override wins over the YAML. + thinking = {"type": "adaptive"} if request.useThinking else base.get("thinking") + if thinking is not None: + payload["thinking"] = thinking + # Thinking and temperature are mutually exclusive (the API + # rejects temperature when thinking is set). Floor max_tokens + # so adaptive thinking can't starve the visible text. + payload["max_tokens"] = max(payload["max_tokens"], MIN_MAX_TOKENS_WITH_THINKING) + else: + payload["temperature"] = base["temperature"] + return payload def generate_messages(self, request: ExplainRequest) -> dict[str, Any]: """Generate the complete message structure for Claude API.