diff --git a/CLAUDE.md b/CLAUDE.md index c6964a2..2d8993b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -78,10 +78,13 @@ call → response with metrics. See `claude_explain.md` for detailed architectur - **Reviewer thinking is on by default.** `prompt-test run --review` and `prompt-test review` default to `--reviewer-thinking adaptive` / `--thinking adaptive`. It catches factual errors the no-think reviewer misses but adds ~70% to review cost. Pass `off` to compare runs or save money on large batches. -- **Production explainer thinking is intentionally off.** Adaptive thinking on Sonnet 4.6 measurably improves - factual accuracy (e.g. eliminates the recurring `imul eax, edi, edi` invention) but adds ~11s end-to-end - latency, which is too much for the interactive endpoint. Don't enable it in `app/prompt.yaml` without an - explicit latency/quality decision. +- **Production explainer thinking is opt-in per request.** Adaptive thinking on Sonnet 4.6 measurably improves + factual accuracy (e.g. eliminates the recurring `imul eax, edi, edi` invention) but adds ~10s+ latency on + small/medium queries and can push large queries past the **30s Lambda + API Gateway v2 timeout** entirely (no + raising that — HTTP API has a 30s ceiling). Callers opt in by sending `useThinking: true` on the request; the + default (no field, or `false`) preserves current latency. Cache keys split on the flag, so on/off requests + cache independently. If we ever want default-on, we need either a smaller fixed thinking budget *or* an async + response architecture (Lambda Function URL with response streaming, SQS poll, etc.). - **Multi-block responses.** When thinking is enabled the API returns thinking blocks before the text block. `app/explain.py` and `prompt_testing/runner.py` both pick the last text block via `getattr(c, "type", None) == "text"`. Preserve that pattern for any new code that consumes responses. The API may also return diff --git a/README.md b/README.md index 2d27ac5..8a540b2 100644 --- a/README.md +++ b/README.md @@ -124,7 +124,8 @@ The API uses **camelCase** for all field names to maintain consistency with Java "labels": [] } ], - "bypassCache": false // Optional: set to true to skip cache reads + "bypassCache": false, // Optional: set to true to skip cache reads + "useThinking": false // Optional: enable extended thinking for higher accuracy at cost of latency } ``` diff --git a/app/cache.py b/app/cache.py index 173bb59..867b270 100644 --- a/app/cache.py +++ b/app/cache.py @@ -134,18 +134,11 @@ def generate_cache_key(request: ExplainRequest, prompt: Prompt) -> str: Returns: A SHA-256 hash string to use as cache key """ - # Generate the full message data that would be sent to Anthropic - prompt_data = prompt.generate_messages(request) - - # Create a deterministic representation of all cache-affecting data + # The payload IS the cache-affecting data — the same dict that gets + # spread into messages.create. Add prompt_version on top so changes + # to the YAML invalidate cached responses. cache_data = { - "model": prompt_data["model"], - "max_tokens": prompt_data["max_tokens"], - "temperature": prompt_data["temperature"], - "thinking": prompt_data.get("thinking"), - "system": prompt_data["system"], - "messages": prompt_data["messages"], - # Include a hash of the prompt config to invalidate cache when prompts change + **prompt.build_api_payload(request), "prompt_version": _get_prompt_config_hash(prompt.config), } diff --git a/app/explain.py b/app/explain.py index d973075..bcead24 100644 --- a/app/explain.py +++ b/app/explain.py @@ -1,5 +1,4 @@ import logging -from typing import Any from anthropic import AsyncAnthropic @@ -80,8 +79,7 @@ async def _call_anthropic_api( This is the original process_request logic, extracted for clarity. """ - # Generate messages using the Prompt instance - prompt_data = prompt.generate_messages(body) + prompt_data = prompt.build_api_payload(body) # Debug logging for prompts LOGGER.debug(f"=== PROMPT DEBUG FOR {body.explanation.value.upper()} (audience: {body.audience.value}) ===") @@ -92,22 +90,15 @@ async def _call_anthropic_api( LOGGER.debug(message) LOGGER.debug("=== END PROMPT DEBUG ===") - # Call Claude API - LOGGER.info("Using Anthropic client with model: %s", prompt_data["model"]) - - api_kwargs: dict[str, Any] = { - "model": prompt_data["model"], - "max_tokens": prompt_data["max_tokens"], - "system": prompt_data["system"], - "messages": prompt_data["messages"], - } - if prompt_data.get("thinking"): - # Extended thinking: API requires temperature to be unset. - api_kwargs["thinking"] = prompt_data["thinking"] - else: - api_kwargs["temperature"] = prompt_data["temperature"] - - message = await client.messages.create(**api_kwargs) + # Call Claude API. `prompt_data` is already exactly the kwargs + # `messages.create` expects — `build_api_payload` resolved the + # thinking-vs-temperature exclusivity for us. + LOGGER.info( + "Using Anthropic client with model: %s (thinking=%s)", + prompt_data["model"], + bool(prompt_data.get("thinking")), + ) + message = await client.messages.create(**prompt_data) # Extract usage information input_tokens = message.usage.input_tokens diff --git a/app/explain_api.py b/app/explain_api.py index 5cb08f0..954d1c6 100644 --- a/app/explain_api.py +++ b/app/explain_api.py @@ -57,6 +57,15 @@ class ExplainRequest(BaseModel): default=ExplanationType.ASSEMBLY, description="Type of explanation to generate" ) bypassCache: bool = Field(default=False, description="If true, skip reading from cache but still write to cache") + useThinking: bool = Field( + default=False, + description=( + "If true, enable adaptive extended thinking on the explainer. " + "Improves correctness on complex inputs at the cost of significant " + "extra latency (often +10s, sometimes much more on large inputs). " + "May exceed the Lambda 30s timeout on the largest queries." + ), + ) @property def instruction_set_with_default(self) -> str: diff --git a/app/prompt.py b/app/prompt.py index eb0f044..87b5428 100644 --- a/app/prompt.py +++ b/app/prompt.py @@ -235,6 +235,36 @@ def prepare_structured_data(self, request: ExplainRequest) -> dict[str, Any]: return structured_data + def build_api_payload(self, request: ExplainRequest) -> dict[str, Any]: + """Return the exact kwargs to pass to ``client.messages.create``. + + Single source of truth: the API call splats this dict directly, + and the cache-key generator hashes it. Per-request runtime + overrides (currently just ``useThinking``) are resolved here, and + mutually-exclusive fields like ``thinking`` vs ``temperature`` + are decided here too — callers don't need to re-derive anything. + + Always returns a fresh dict so callers can mutate freely. + """ + base = self.generate_messages(request) + payload: dict[str, Any] = { + "model": base["model"], + "max_tokens": base["max_tokens"], + "system": base["system"], + "messages": base["messages"], + } + # Resolve thinking config: per-request override wins over the YAML. + thinking = {"type": "adaptive"} if request.useThinking else base.get("thinking") + if thinking is not None: + payload["thinking"] = thinking + # Thinking and temperature are mutually exclusive (the API + # rejects temperature when thinking is set). Floor max_tokens + # so adaptive thinking can't starve the visible text. + payload["max_tokens"] = max(payload["max_tokens"], MIN_MAX_TOKENS_WITH_THINKING) + else: + payload["temperature"] = base["temperature"] + return payload + def generate_messages(self, request: ExplainRequest) -> dict[str, Any]: """Generate the complete message structure for Claude API. diff --git a/app/test_cache.py b/app/test_cache.py index ad5a531..2b11b92 100644 --- a/app/test_cache.py +++ b/app/test_cache.py @@ -265,6 +265,36 @@ def test_generate_cache_key_bypass_cache_ignored(self, test_prompt): key2 = generate_cache_key(request2, test_prompt) assert key1 == key2 # bypass_cache shouldn't affect cache key + def test_generate_cache_key_use_thinking_splits(self, test_prompt): + """useThinking changes the API call payload, so it must change the + cache key. Confirms cache hits don't bleed across thinking-on/off.""" + base_kwargs = { + "language": "c++", + "compiler": "g++", + "code": "int test() { return 0; }", + "asm": [AssemblyItem(text="test:", source=None)], + } + request_off = ExplainRequest(**base_kwargs, useThinking=False) + request_on = ExplainRequest(**base_kwargs, useThinking=True) + + assert generate_cache_key(request_off, test_prompt) != generate_cache_key(request_on, test_prompt) + + def test_generate_cache_key_use_thinking_default_matches_omitted(self, test_prompt): + """A request with useThinking absent (Pydantic default) must produce + the same key as one with useThinking=False explicit. This is the + invariant that lets us add the field without invalidating the + existing S3 cache.""" + base_kwargs = { + "language": "c++", + "compiler": "g++", + "code": "int test() { return 0; }", + "asm": [AssemblyItem(text="test:", source=None)], + } + request_default = ExplainRequest(**base_kwargs) # useThinking absent + request_explicit = ExplainRequest(**base_kwargs, useThinking=False) + + assert generate_cache_key(request_default, test_prompt) == generate_cache_key(request_explicit, test_prompt) + class TestCacheHighLevelFunctions: """Test the high-level cache functions.""" diff --git a/app/test_explain.py b/app/test_explain.py index 753d245..b0d6150 100644 --- a/app/test_explain.py +++ b/app/test_explain.py @@ -11,7 +11,7 @@ SourceMapping, ) from app.metrics import NoopMetricsProvider -from app.prompt import MAX_ASSEMBLY_LINES, Prompt +from app.prompt import MAX_ASSEMBLY_LINES, MIN_MAX_TOKENS_WITH_THINKING, Prompt @pytest.fixture @@ -177,6 +177,23 @@ async def test_picks_last_text_block_with_thinking(self, sample_request, noop_me assert response.status == "success" assert response.explanation == "Final explanation here." + @pytest.mark.asyncio + async def test_use_thinking_overrides_prompt(self, sample_request, mock_anthropic_client, noop_metrics): + """Setting `useThinking=True` on the request must enable adaptive + thinking and bump max_tokens to satisfy the floor, even when the + loaded prompt has no thinking config.""" + sample_request.useThinking = True + + test_prompt = Prompt(Path("app/prompt.yaml")) + await process_request(sample_request, mock_anthropic_client, test_prompt, noop_metrics) + + _args, kwargs = mock_anthropic_client.messages.create.call_args + assert kwargs.get("thinking") == {"type": "adaptive"} + # When thinking is set, temperature must NOT be passed (API rejects it). + assert "temperature" not in kwargs + # max_tokens bumped to at least the documented floor. + assert kwargs["max_tokens"] >= MIN_MAX_TOKENS_WITH_THINKING + @pytest.mark.asyncio async def test_returns_error_when_no_text_block(self, sample_request, noop_metrics): """A response with no text block (e.g. thinking exhausted max_tokens)