From 2cc14022d0aedac8bea5eac327fd2fbc4f0cefe7 Mon Sep 17 00:00:00 2001
From: mattgodbolt-molty <mattgodbolt-molty@users.noreply.github.com>
Date: Wed, 6 May 2026 18:07:24 -0500
Subject: [PATCH 1/3] Expose extended thinking as a per-request flag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds `useThinking: bool = false` to ExplainRequest. When true the
explainer enables adaptive extended thinking and bumps max_tokens to
the documented floor; otherwise behaviour is identical to today.

Why opt-in rather than default-on: thinking measurably improves
correctness (errors 5 -> 0 across the medium-sized cases in our
21-case suite, eliminates the recurring imul-eax-edi-edi invention)
but pushes the largest queries past the 30s API Gateway v2 / Lambda
timeout — about 3/16 of our reasonably-sized cases would 504. Per-
request opt-in lets the CE frontend choose where the quality win is
worth the latency, with no risk of timing out users who don't ask.

Implementation:

- ExplainRequest gains a `useThinking` field (camelCase to match the
  rest of the API; default false).
- New `Prompt.generate_messages_for_request()` is the single entry
  point that applies per-request overrides on top of the YAML-loaded
  defaults. Currently the only override is thinking, but it's the
  natural seam for future per-request knobs.
- Both `_call_anthropic_api` and `generate_cache_key` route through
  the new method, so cache keys split on `useThinking` and the API
  call always sees the same payload that was hashed.
- CLAUDE.md updated to document the opt-in pattern, the 30s timeout
  ceiling, and the longer-term architectural fixes if we ever want
  default-on (Lambda Function URL response streaming or async pattern).
- README documents the new request field.

Tests: 96 -> 98 passing. New tests cover (a) `useThinking=True`
overrides prompt config and bumps max_tokens, (b) cache keys split on
the flag.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CLAUDE.md           | 11 +++++++----
 README.md           |  3 ++-
 app/cache.py        |  6 ++++--
 app/explain.py      |  9 ++++++---
 app/explain_api.py  |  9 +++++++++
 app/prompt.py       | 17 +++++++++++++++++
 app/test_explain.py | 32 ++++++++++++++++++++++++++++++++
 7 files changed, 77 insertions(+), 10 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index c6964a2..2d8993b 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -78,10 +78,13 @@ call → response with metrics. See `claude_explain.md` for detailed architectur
 - **Reviewer thinking is on by default.** `prompt-test run --review` and `prompt-test review` default to
   `--reviewer-thinking adaptive` / `--thinking adaptive`. It catches factual errors the no-think reviewer misses
   but adds ~70% to review cost. Pass `off` to compare runs or save money on large batches.
-- **Production explainer thinking is intentionally off.** Adaptive thinking on Sonnet 4.6 measurably improves
-  factual accuracy (e.g. eliminates the recurring `imul eax, edi, edi` invention) but adds ~11s end-to-end
-  latency, which is too much for the interactive endpoint. Don't enable it in `app/prompt.yaml` without an
-  explicit latency/quality decision.
+- **Production explainer thinking is opt-in per request.** Adaptive thinking on Sonnet 4.6 measurably improves
+  factual accuracy (e.g. eliminates the recurring `imul eax, edi, edi` invention) but adds ~10s+ latency on
+  small/medium queries and can push large queries past the **30s Lambda + API Gateway v2 timeout** entirely (no
+  raising that — HTTP API has a 30s ceiling). Callers opt in by sending `useThinking: true` on the request; the
+  default (no field, or `false`) preserves current latency. Cache keys split on the flag, so on/off requests
+  cache independently. If we ever want default-on, we need either a smaller fixed thinking budget *or* an async
+  response architecture (Lambda Function URL with response streaming, SQS poll, etc.).
 - **Multi-block responses.** When thinking is enabled the API returns thinking blocks before the text block.
   `app/explain.py` and `prompt_testing/runner.py` both pick the last text block via `getattr(c, "type", None) ==
   "text"`. Preserve that pattern for any new code that consumes responses. The API may also return
diff --git a/README.md b/README.md
index 2d27ac5..8a540b2 100644
--- a/README.md
+++ b/README.md
@@ -124,7 +124,8 @@ The API uses **camelCase** for all field names to maintain consistency with Java
       "labels": []
     }
   ],
-  "bypassCache": false  // Optional: set to true to skip cache reads
+  "bypassCache": false,  // Optional: set to true to skip cache reads
+  "useThinking": false   // Optional: enable extended thinking for higher accuracy at cost of latency
 }
 ```
 
diff --git a/app/cache.py b/app/cache.py
index 173bb59..0702951 100644
--- a/app/cache.py
+++ b/app/cache.py
@@ -134,8 +134,10 @@ def generate_cache_key(request: ExplainRequest, prompt: Prompt) -> str:
     Returns:
         A SHA-256 hash string to use as cache key
     """
-    # Generate the full message data that would be sent to Anthropic
-    prompt_data = prompt.generate_messages(request)
+    # Generate the full message data that would be sent to Anthropic,
+    # including any per-request overrides (e.g. useThinking) so cache hits
+    # split correctly between thinking-on and thinking-off requests.
+    prompt_data = prompt.generate_messages_for_request(request)
 
     # Create a deterministic representation of all cache-affecting data
     cache_data = {
diff --git a/app/explain.py b/app/explain.py
index d973075..0f6ee0b 100644
--- a/app/explain.py
+++ b/app/explain.py
@@ -80,8 +80,7 @@ async def _call_anthropic_api(
 
     This is the original process_request logic, extracted for clarity.
     """
-    # Generate messages using the Prompt instance
-    prompt_data = prompt.generate_messages(body)
+    prompt_data = prompt.generate_messages_for_request(body)
 
     # Debug logging for prompts
     LOGGER.debug(f"=== PROMPT DEBUG FOR {body.explanation.value.upper()} (audience: {body.audience.value}) ===")
@@ -93,7 +92,11 @@ async def _call_anthropic_api(
     LOGGER.debug("=== END PROMPT DEBUG ===")
 
     # Call Claude API
-    LOGGER.info("Using Anthropic client with model: %s", prompt_data["model"])
+    LOGGER.info(
+        "Using Anthropic client with model: %s (thinking=%s)",
+        prompt_data["model"],
+        bool(prompt_data.get("thinking")),
+    )
 
     api_kwargs: dict[str, Any] = {
         "model": prompt_data["model"],
diff --git a/app/explain_api.py b/app/explain_api.py
index 5cb08f0..954d1c6 100644
--- a/app/explain_api.py
+++ b/app/explain_api.py
@@ -57,6 +57,15 @@ class ExplainRequest(BaseModel):
         default=ExplanationType.ASSEMBLY, description="Type of explanation to generate"
     )
     bypassCache: bool = Field(default=False, description="If true, skip reading from cache but still write to cache")
+    useThinking: bool = Field(
+        default=False,
+        description=(
+            "If true, enable adaptive extended thinking on the explainer. "
+            "Improves correctness on complex inputs at the cost of significant "
+            "extra latency (often +10s, sometimes much more on large inputs). "
+            "May exceed the Lambda 30s timeout on the largest queries."
+        ),
+    )
 
     @property
     def instruction_set_with_default(self) -> str:
diff --git a/app/prompt.py b/app/prompt.py
index eb0f044..e1c82df 100644
--- a/app/prompt.py
+++ b/app/prompt.py
@@ -235,6 +235,23 @@ def prepare_structured_data(self, request: ExplainRequest) -> dict[str, Any]:
 
         return structured_data
 
+    def generate_messages_for_request(self, request: ExplainRequest) -> dict[str, Any]:
+        """Generate API payload, applying per-request overrides.
+
+        This is the entry point used by both the explain handler and the
+        cache-key generator so they stay in sync. Currently the only
+        override is ``useThinking``: when set, enable adaptive extended
+        thinking and bump ``max_tokens`` to satisfy the floor.
+        """
+        prompt_data = self.generate_messages(request)
+        if request.useThinking:
+            prompt_data = {
+                **prompt_data,
+                "thinking": {"type": "adaptive"},
+                "max_tokens": max(prompt_data["max_tokens"], MIN_MAX_TOKENS_WITH_THINKING),
+            }
+        return prompt_data
+
     def generate_messages(self, request: ExplainRequest) -> dict[str, Any]:
         """Generate the complete message structure for Claude API.
 
diff --git a/app/test_explain.py b/app/test_explain.py
index 753d245..e484239 100644
--- a/app/test_explain.py
+++ b/app/test_explain.py
@@ -177,6 +177,38 @@ async def test_picks_last_text_block_with_thinking(self, sample_request, noop_me
         assert response.status == "success"
         assert response.explanation == "Final explanation here."
 
+    @pytest.mark.asyncio
+    async def test_use_thinking_overrides_prompt(self, sample_request, mock_anthropic_client, noop_metrics):
+        """Setting `useThinking=True` on the request must enable adaptive
+        thinking and bump max_tokens to satisfy the floor, even when the
+        loaded prompt has no thinking config."""
+        sample_request.useThinking = True
+
+        test_prompt = Prompt(Path("app/prompt.yaml"))
+        await process_request(sample_request, mock_anthropic_client, test_prompt, noop_metrics)
+
+        _args, kwargs = mock_anthropic_client.messages.create.call_args
+        assert kwargs.get("thinking") == {"type": "adaptive"}
+        # When thinking is set, temperature must NOT be passed (API rejects it).
+        assert "temperature" not in kwargs
+        # max_tokens bumped to at least the documented floor.
+        assert kwargs["max_tokens"] >= 4096
+
+    @pytest.mark.asyncio
+    async def test_use_thinking_changes_cache_key(self, sample_request):
+        """Cache keys must split on `useThinking` so a thinking-enabled
+        request doesn't collide with a thinking-off cached response."""
+        from app.cache import generate_cache_key
+
+        test_prompt = Prompt(Path("app/prompt.yaml"))
+
+        sample_request.useThinking = False
+        key_off = generate_cache_key(sample_request, test_prompt)
+        sample_request.useThinking = True
+        key_on = generate_cache_key(sample_request, test_prompt)
+
+        assert key_off != key_on
+
     @pytest.mark.asyncio
     async def test_returns_error_when_no_text_block(self, sample_request, noop_metrics):
         """A response with no text block (e.g. thinking exhausted max_tokens)

From b447d9b7f9ef32b95ed5289879df0c63448668d7 Mon Sep 17 00:00:00 2001
From: mattgodbolt-molty <mattgodbolt-molty@users.noreply.github.com>
Date: Wed, 6 May 2026 18:15:29 -0500
Subject: [PATCH 2/3] Address subagent review on PR #18
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Findings from a follow-up review:

- Rename `Prompt.generate_messages_for_request` -> `build_api_payload`.
  The old name read as "generate messages, given a request" which is
  what `generate_messages` already does. The actual purpose is to
  apply per-request runtime overrides on top of the YAML defaults.
- Always return a fresh dict from `build_api_payload` so callers can
  mutate without affecting prompt internals or each other. The
  previous shape returned the original dict in the no-override path
  and a shallow copy in the override path — a foot-gun in waiting.
- Move `test_use_thinking_changes_cache_key` from `test_explain.py`
  to `test_cache.py` next to its sibling
  `test_generate_cache_key_bypass_cache_ignored`. Both tests answer
  "what affects the cache key" and belong together.
- Add `test_generate_cache_key_use_thinking_default_matches_omitted`
  to lock in the invariant the PR depends on: a request with
  `useThinking` absent (Pydantic default) must produce the same key
  as one with `useThinking=False` explicit. Without this, a future
  refactor could silently invalidate the existing S3 cache.
- Use `MIN_MAX_TOKENS_WITH_THINKING` constant in the test rather
  than hard-coding `4096`, so a future bump to the floor doesn't
  silently miss the test.

Tests: 98 -> 99 passing.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/cache.py        |  7 +++----
 app/explain.py      |  2 +-
 app/prompt.py       | 16 ++++++++--------
 app/test_cache.py   | 30 ++++++++++++++++++++++++++++++
 app/test_explain.py | 19 ++-----------------
 5 files changed, 44 insertions(+), 30 deletions(-)

diff --git a/app/cache.py b/app/cache.py
index 0702951..c9c00c8 100644
--- a/app/cache.py
+++ b/app/cache.py
@@ -134,10 +134,9 @@ def generate_cache_key(request: ExplainRequest, prompt: Prompt) -> str:
     Returns:
         A SHA-256 hash string to use as cache key
     """
-    # Generate the full message data that would be sent to Anthropic,
-    # including any per-request overrides (e.g. useThinking) so cache hits
-    # split correctly between thinking-on and thinking-off requests.
-    prompt_data = prompt.generate_messages_for_request(request)
+    # Use the same payload-building path the API call uses so cache hits
+    # split correctly on per-request overrides (e.g. useThinking).
+    prompt_data = prompt.build_api_payload(request)
 
     # Create a deterministic representation of all cache-affecting data
     cache_data = {
diff --git a/app/explain.py b/app/explain.py
index 0f6ee0b..6db1c65 100644
--- a/app/explain.py
+++ b/app/explain.py
@@ -80,7 +80,7 @@ async def _call_anthropic_api(
 
     This is the original process_request logic, extracted for clarity.
     """
-    prompt_data = prompt.generate_messages_for_request(body)
+    prompt_data = prompt.build_api_payload(body)
 
     # Debug logging for prompts
     LOGGER.debug(f"=== PROMPT DEBUG FOR {body.explanation.value.upper()} (audience: {body.audience.value}) ===")
diff --git a/app/prompt.py b/app/prompt.py
index e1c82df..bbbae78 100644
--- a/app/prompt.py
+++ b/app/prompt.py
@@ -235,21 +235,21 @@ def prepare_structured_data(self, request: ExplainRequest) -> dict[str, Any]:
 
         return structured_data
 
-    def generate_messages_for_request(self, request: ExplainRequest) -> dict[str, Any]:
-        """Generate API payload, applying per-request overrides.
+    def build_api_payload(self, request: ExplainRequest) -> dict[str, Any]:
+        """Build the API call payload, applying per-request runtime overrides.
 
         This is the entry point used by both the explain handler and the
         cache-key generator so they stay in sync. Currently the only
         override is ``useThinking``: when set, enable adaptive extended
         thinking and bump ``max_tokens`` to satisfy the floor.
+
+        Always returns a fresh dict so callers can mutate without affecting
+        prompt internals or each other.
         """
-        prompt_data = self.generate_messages(request)
+        prompt_data = dict(self.generate_messages(request))
         if request.useThinking:
-            prompt_data = {
-                **prompt_data,
-                "thinking": {"type": "adaptive"},
-                "max_tokens": max(prompt_data["max_tokens"], MIN_MAX_TOKENS_WITH_THINKING),
-            }
+            prompt_data["thinking"] = {"type": "adaptive"}
+            prompt_data["max_tokens"] = max(prompt_data["max_tokens"], MIN_MAX_TOKENS_WITH_THINKING)
         return prompt_data
 
     def generate_messages(self, request: ExplainRequest) -> dict[str, Any]:
diff --git a/app/test_cache.py b/app/test_cache.py
index ad5a531..2b11b92 100644
--- a/app/test_cache.py
+++ b/app/test_cache.py
@@ -265,6 +265,36 @@ def test_generate_cache_key_bypass_cache_ignored(self, test_prompt):
         key2 = generate_cache_key(request2, test_prompt)
         assert key1 == key2  # bypass_cache shouldn't affect cache key
 
+    def test_generate_cache_key_use_thinking_splits(self, test_prompt):
+        """useThinking changes the API call payload, so it must change the
+        cache key. Confirms cache hits don't bleed across thinking-on/off."""
+        base_kwargs = {
+            "language": "c++",
+            "compiler": "g++",
+            "code": "int test() { return 0; }",
+            "asm": [AssemblyItem(text="test:", source=None)],
+        }
+        request_off = ExplainRequest(**base_kwargs, useThinking=False)
+        request_on = ExplainRequest(**base_kwargs, useThinking=True)
+
+        assert generate_cache_key(request_off, test_prompt) != generate_cache_key(request_on, test_prompt)
+
+    def test_generate_cache_key_use_thinking_default_matches_omitted(self, test_prompt):
+        """A request with useThinking absent (Pydantic default) must produce
+        the same key as one with useThinking=False explicit. This is the
+        invariant that lets us add the field without invalidating the
+        existing S3 cache."""
+        base_kwargs = {
+            "language": "c++",
+            "compiler": "g++",
+            "code": "int test() { return 0; }",
+            "asm": [AssemblyItem(text="test:", source=None)],
+        }
+        request_default = ExplainRequest(**base_kwargs)  # useThinking absent
+        request_explicit = ExplainRequest(**base_kwargs, useThinking=False)
+
+        assert generate_cache_key(request_default, test_prompt) == generate_cache_key(request_explicit, test_prompt)
+
 
 class TestCacheHighLevelFunctions:
     """Test the high-level cache functions."""
diff --git a/app/test_explain.py b/app/test_explain.py
index e484239..b0d6150 100644
--- a/app/test_explain.py
+++ b/app/test_explain.py
@@ -11,7 +11,7 @@
     SourceMapping,
 )
 from app.metrics import NoopMetricsProvider
-from app.prompt import MAX_ASSEMBLY_LINES, Prompt
+from app.prompt import MAX_ASSEMBLY_LINES, MIN_MAX_TOKENS_WITH_THINKING, Prompt
 
 
 @pytest.fixture
@@ -192,22 +192,7 @@ async def test_use_thinking_overrides_prompt(self, sample_request, mock_anthropi
         # When thinking is set, temperature must NOT be passed (API rejects it).
         assert "temperature" not in kwargs
         # max_tokens bumped to at least the documented floor.
-        assert kwargs["max_tokens"] >= 4096
-
-    @pytest.mark.asyncio
-    async def test_use_thinking_changes_cache_key(self, sample_request):
-        """Cache keys must split on `useThinking` so a thinking-enabled
-        request doesn't collide with a thinking-off cached response."""
-        from app.cache import generate_cache_key
-
-        test_prompt = Prompt(Path("app/prompt.yaml"))
-
-        sample_request.useThinking = False
-        key_off = generate_cache_key(sample_request, test_prompt)
-        sample_request.useThinking = True
-        key_on = generate_cache_key(sample_request, test_prompt)
-
-        assert key_off != key_on
+        assert kwargs["max_tokens"] >= MIN_MAX_TOKENS_WITH_THINKING
 
     @pytest.mark.asyncio
     async def test_returns_error_when_no_text_block(self, sample_request, noop_metrics):

From 98154cf2b267dadc08a9725c0587b1875cae337b Mon Sep 17 00:00:00 2001
From: mattgodbolt-molty <mattgodbolt-molty@users.noreply.github.com>
Date: Wed, 6 May 2026 18:42:59 -0500
Subject: [PATCH 3/3] Make build_api_payload the single source of truth for API
 kwargs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per Copilot review: previously `build_api_payload()` returned a dict
that included both `thinking` and `temperature` when useThinking was
true, and `_call_anthropic_api()` had to re-derive the actual API
kwargs (dropping `temperature` because the API rejects it alongside
thinking). The cache key meanwhile hashed the un-derived dict — so
the cache key included `temperature` for requests that wouldn't
actually send it. Today this happened to be deterministic and
internally consistent, but it's drift-in-waiting: any future
per-request override with exclusivity rules would face the same trap.

Refactor so `build_api_payload()` returns exactly the kwargs
`messages.create` will receive, including the thinking-vs-temperature
exclusivity. Then both call sites become trivial:
- `_call_anthropic_api`: `await client.messages.create(**payload)`
- `generate_cache_key`: `{**payload, "prompt_version": ...}`

Net diff: -7 lines, fewer code paths to keep aligned.

Cache key shape changes (no `temperature` field when thinking is on,
no null `thinking` field when off), which invalidates existing S3
cache entries — acceptable per maintainer's stated tolerance for
cache flushes when the alternative is uglier code.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/cache.py   | 16 ++++------------
 app/explain.py | 20 ++++----------------
 app/prompt.py  | 37 +++++++++++++++++++++++++------------
 3 files changed, 33 insertions(+), 40 deletions(-)

diff --git a/app/cache.py b/app/cache.py
index c9c00c8..867b270 100644
--- a/app/cache.py
+++ b/app/cache.py
@@ -134,19 +134,11 @@ def generate_cache_key(request: ExplainRequest, prompt: Prompt) -> str:
     Returns:
         A SHA-256 hash string to use as cache key
     """
-    # Use the same payload-building path the API call uses so cache hits
-    # split correctly on per-request overrides (e.g. useThinking).
-    prompt_data = prompt.build_api_payload(request)
-
-    # Create a deterministic representation of all cache-affecting data
+    # The payload IS the cache-affecting data — the same dict that gets
+    # spread into messages.create. Add prompt_version on top so changes
+    # to the YAML invalidate cached responses.
     cache_data = {
-        "model": prompt_data["model"],
-        "max_tokens": prompt_data["max_tokens"],
-        "temperature": prompt_data["temperature"],
-        "thinking": prompt_data.get("thinking"),
-        "system": prompt_data["system"],
-        "messages": prompt_data["messages"],
-        # Include a hash of the prompt config to invalidate cache when prompts change
+        **prompt.build_api_payload(request),
         "prompt_version": _get_prompt_config_hash(prompt.config),
     }
 
diff --git a/app/explain.py b/app/explain.py
index 6db1c65..bcead24 100644
--- a/app/explain.py
+++ b/app/explain.py
@@ -1,5 +1,4 @@
 import logging
-from typing import Any
 
 from anthropic import AsyncAnthropic
 
@@ -91,26 +90,15 @@ async def _call_anthropic_api(
         LOGGER.debug(message)
     LOGGER.debug("=== END PROMPT DEBUG ===")
 
-    # Call Claude API
+    # Call Claude API. `prompt_data` is already exactly the kwargs
+    # `messages.create` expects — `build_api_payload` resolved the
+    # thinking-vs-temperature exclusivity for us.
     LOGGER.info(
         "Using Anthropic client with model: %s (thinking=%s)",
         prompt_data["model"],
         bool(prompt_data.get("thinking")),
     )
-
-    api_kwargs: dict[str, Any] = {
-        "model": prompt_data["model"],
-        "max_tokens": prompt_data["max_tokens"],
-        "system": prompt_data["system"],
-        "messages": prompt_data["messages"],
-    }
-    if prompt_data.get("thinking"):
-        # Extended thinking: API requires temperature to be unset.
-        api_kwargs["thinking"] = prompt_data["thinking"]
-    else:
-        api_kwargs["temperature"] = prompt_data["temperature"]
-
-    message = await client.messages.create(**api_kwargs)
+    message = await client.messages.create(**prompt_data)
 
     # Extract usage information
     input_tokens = message.usage.input_tokens
diff --git a/app/prompt.py b/app/prompt.py
index bbbae78..87b5428 100644
--- a/app/prompt.py
+++ b/app/prompt.py
@@ -236,21 +236,34 @@ def prepare_structured_data(self, request: ExplainRequest) -> dict[str, Any]:
         return structured_data
 
     def build_api_payload(self, request: ExplainRequest) -> dict[str, Any]:
-        """Build the API call payload, applying per-request runtime overrides.
+        """Return the exact kwargs to pass to ``client.messages.create``.
 
-        This is the entry point used by both the explain handler and the
-        cache-key generator so they stay in sync. Currently the only
-        override is ``useThinking``: when set, enable adaptive extended
-        thinking and bump ``max_tokens`` to satisfy the floor.
+        Single source of truth: the API call splats this dict directly,
+        and the cache-key generator hashes it. Per-request runtime
+        overrides (currently just ``useThinking``) are resolved here, and
+        mutually-exclusive fields like ``thinking`` vs ``temperature``
+        are decided here too — callers don't need to re-derive anything.
 
-        Always returns a fresh dict so callers can mutate without affecting
-        prompt internals or each other.
+        Always returns a fresh dict so callers can mutate freely.
         """
-        prompt_data = dict(self.generate_messages(request))
-        if request.useThinking:
-            prompt_data["thinking"] = {"type": "adaptive"}
-            prompt_data["max_tokens"] = max(prompt_data["max_tokens"], MIN_MAX_TOKENS_WITH_THINKING)
-        return prompt_data
+        base = self.generate_messages(request)
+        payload: dict[str, Any] = {
+            "model": base["model"],
+            "max_tokens": base["max_tokens"],
+            "system": base["system"],
+            "messages": base["messages"],
+        }
+        # Resolve thinking config: per-request override wins over the YAML.
+        thinking = {"type": "adaptive"} if request.useThinking else base.get("thinking")
+        if thinking is not None:
+            payload["thinking"] = thinking
+            # Thinking and temperature are mutually exclusive (the API
+            # rejects temperature when thinking is set). Floor max_tokens
+            # so adaptive thinking can't starve the visible text.
+            payload["max_tokens"] = max(payload["max_tokens"], MIN_MAX_TOKENS_WITH_THINKING)
+        else:
+            payload["temperature"] = base["temperature"]
+        return payload
 
     def generate_messages(self, request: ExplainRequest) -> dict[str, Any]:
         """Generate the complete message structure for Claude API.