diff --git a/.gitignore b/.gitignore
index 6fd486a..b34d6e4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,3 +22,6 @@ node_modules
 # Package build artifacts
 *.egg-info/
 *.backup
+
+# Claude Code local session state
+.claude/scheduled_tasks.lock
diff --git a/CLAUDE.md b/CLAUDE.md
index 323c627..c6964a2 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -66,6 +66,33 @@ uv run ruff format
 The service processes compiler output through a pipeline: input validation → smart assembly filtering → Claude API
 call → response with metrics. See `claude_explain.md` for detailed architecture documentation.
 
+## Anthropic API gotchas
+
+- **`max_tokens` includes thinking tokens.** When a prompt YAML sets `model.thinking: {type: adaptive}` (or
+  `{type: enabled, budget_tokens: N}`), thinking counts against `max_tokens`. The production value `1536` silently
+  starves the visible text output on complex cases when thinking is on. `Prompt.__init__` refuses to load a
+  thinking-enabled config with `max_tokens < 4096`; 4096 is the enforced floor, and 8192 worked in past experiments.
+- **The reviewer model rejects `temperature`.** Opus 4.7 deprecated the parameter, so `prompt_testing/reviewer.py`
+  omits it. The Sonnet explainer still accepts `temperature`. If you swap the reviewer to a model that requires
+  it, restore the param.
+- **Reviewer thinking is on by default.** `prompt-test run --review` and `prompt-test review` default to
+  `--reviewer-thinking adaptive` / `--thinking adaptive`. It catches factual errors the no-think reviewer misses
+  but adds ~70% to review cost. Pass `off` to compare runs or save money on large batches.
+- **Production explainer thinking is intentionally off.** Adaptive thinking on Sonnet 4.6 measurably improves
+  factual accuracy (e.g. eliminates the recurring `imul eax, edi, edi` invention) but adds ~11s end-to-end
+  latency, which is too much for the interactive endpoint. Don't enable it in `app/prompt.yaml` without an
+  explicit latency/quality decision.
+- **Multi-block responses.** When thinking is enabled the API returns thinking blocks before the text block.
+  `app/explain.py`, `prompt_testing/runner.py` and `prompt_testing/reviewer.py` all pick the last text block via
+  `getattr(c, "type", None) == "text"`. Preserve that pattern for any new code that consumes responses. The API
+  may also return `redacted_thinking` blocks (encrypted reasoning when safety filters trip); the same filter
+  excludes them correctly, but be aware "no text block" can mean either max_tokens starvation *or* a
+  redacted-thinking-only response — the error message is the same.
+- **Empty responses are not 500s.** When the model returns no text block, `app/explain.py` returns
+  `ExplainResponse(status="error")` with `usage` populated and emits `ClaudeExplainEmptyResponse`. The cache
+  layer skips storing error responses so retries hit the API. Don't change this to raise — the structured error
+  is what the CE frontend can render.
+
 ## Code Style Guidelines
 
 - Prefer using modern Python 3.13+ type syntax. Good: `a: list[str] | None`. Bad: `a: Optional[List[str]]`
diff --git a/app/cache.py b/app/cache.py
index f7dd0fc..173bb59 100644
--- a/app/cache.py
+++ b/app/cache.py
@@ -142,6 +142,7 @@ def generate_cache_key(request: ExplainRequest, prompt: Prompt) -> str:
         "model": prompt_data["model"],
         "max_tokens": prompt_data["max_tokens"],
         "temperature": prompt_data["temperature"],
+        "thinking": prompt_data.get("thinking"),
         "system": prompt_data["system"],
         "messages": prompt_data["messages"],
         # Include a hash of the prompt config to invalidate cache when prompts change
diff --git a/app/explain.py b/app/explain.py
index c352bcb..d973075 100644
--- a/app/explain.py
+++ b/app/explain.py
@@ -1,4 +1,5 @@
 import logging
+from typing import Any
 
 from anthropic import AsyncAnthropic
 
@@ -59,8 +60,10 @@ async def process_request(
     # Cache miss or no cache - proceed with Anthropic API call
     response = await _call_anthropic_api(body, client, prompt, metrics_provider)
 
-    # Cache the response (if cache provider is available)
-    if cache_provider is not None:
+    # Cache the response (if cache provider is available). Don't cache
+    # error responses — they consume real tokens but produce no useful
+    # content, and we want a retry to hit the API rather than the cache.
+    if cache_provider is not None and response.status == "success":
         await cache_response(body, prompt, response, cache_provider)
         metrics_provider.put_metric("ClaudeExplainCacheMiss", 1)
 
@@ -90,24 +93,61 @@ async def _call_anthropic_api(
     LOGGER.debug("=== END PROMPT DEBUG ===")
 
     # Call Claude API
-    LOGGER.info("Using Anthropic client with model: %s", {prompt_data["model"]})
-
-    message = await client.messages.create(
-        model=prompt_data["model"],
-        max_tokens=prompt_data["max_tokens"],
-        temperature=prompt_data["temperature"],
-        system=prompt_data["system"],
-        messages=prompt_data["messages"],
-    )
-
-    # Get explanation and strip leading/trailing whitespace
-    explanation = message.content[0].text.strip()
+    LOGGER.info("Using Anthropic client with model: %s", prompt_data["model"])
+
+    api_kwargs: dict[str, Any] = {
+        "model": prompt_data["model"],
+        "max_tokens": prompt_data["max_tokens"],
+        "system": prompt_data["system"],
+        "messages": prompt_data["messages"],
+    }
+    if prompt_data.get("thinking"):
+        # Extended thinking: API requires temperature to be unset.
+        api_kwargs["thinking"] = prompt_data["thinking"]
+    else:
+        api_kwargs["temperature"] = prompt_data["temperature"]
+
+    message = await client.messages.create(**api_kwargs)
 
     # Extract usage information
     input_tokens = message.usage.input_tokens
     output_tokens = message.usage.output_tokens
     total_tokens = input_tokens + output_tokens
 
+    # Pick the last text block — when thinking is enabled the response
+    # contains thinking blocks before the final text block.
+    text_blocks = [c for c in message.content if getattr(c, "type", None) == "text"]
+    explanation = text_blocks[-1].text.strip() if text_blocks else ""
+    if not explanation:
+        # Can happen if extended thinking exhausts max_tokens before any
+        # text block is emitted. Surface the failure to the caller with
+        # token usage populated, and emit a metric so this is visible on
+        # dashboards rather than buried in a generic 500.
+        message_text = (
+            f"Claude returned no text content "
+            f"(stop_reason={message.stop_reason}, in={input_tokens}, out={output_tokens}). "
+            f"If thinking is enabled, max_tokens may be too low."
+        )
+        LOGGER.warning(message_text)
+        metrics_provider.set_property("language", body.language)
+        metrics_provider.set_property("compiler", body.compiler)
+        metrics_provider.set_property("instructionSet", body.instructionSet or "unknown")
+        metrics_provider.set_property("cached", "false")
+        metrics_provider.put_metric("ClaudeExplainRequest", 1)
+        metrics_provider.put_metric("ClaudeExplainEmptyResponse", 1)
+        metrics_provider.put_metric("ClaudeExplainInputTokens", input_tokens)
+        metrics_provider.put_metric("ClaudeExplainOutputTokens", output_tokens)
+        return ExplainResponse(
+            status="error",
+            message=message_text,
+            model=prompt_data["model"],
+            usage=TokenUsage(
+                inputTokens=input_tokens,
+                outputTokens=output_tokens,
+                totalTokens=total_tokens,
+            ),
+        )
+
     # Calculate costs based on model
     cost_per_input_token, cost_per_output_token = get_model_cost(prompt_data["model"])
     input_cost = input_tokens * cost_per_input_token
diff --git a/app/prompt.py b/app/prompt.py
index 7f7cdb4..eb0f044 100644
--- a/app/prompt.py
+++ b/app/prompt.py
@@ -15,6 +15,11 @@
 # Constants from explain.py that are needed for data preparation
 MAX_ASSEMBLY_LINES = 300  # Maximum number of assembly lines to process
 
+# Minimum max_tokens that's safe to pair with extended thinking. Below this,
+# adaptive thinking can consume the whole budget on complex inputs and leave
+# nothing for the visible response.
+MIN_MAX_TOKENS_WITH_THINKING = 4096
+
 
 class Prompt:
     """Manages prompt templates and generates messages for Claude API."""
@@ -36,6 +41,18 @@ def __init__(self, config: dict[str, Any] | Path):
         self.model = self.config["model"]["name"]
         self.max_tokens = self.config["model"]["max_tokens"]
         self.temperature = self.config["model"].get("temperature", 0.0)
+        # Optional extended-thinking config, e.g. {"type": "adaptive"} or
+        # {"type": "enabled", "budget_tokens": 2000}. When set, callers
+        # should drop `temperature` (the API requires it to be unset/1).
+        # {"type": "disabled"} is normalised to None, i.e. same as an absent key.
+        thinking = self.config["model"].get("thinking")
+        self.thinking = None if isinstance(thinking, dict) and thinking.get("type") == "disabled" else thinking
+        if self.thinking and self.max_tokens < MIN_MAX_TOKENS_WITH_THINKING:
+            # Thinking can consume the whole budget and leave no visible text.
+            raise ValueError(
+                f"max_tokens={self.max_tokens} is too low for thinking; "
+                f"use at least {MIN_MAX_TOKENS_WITH_THINKING} when model.thinking is set."
+            )
 
         # Extract prompt templates
         self.system_prompt_template = self.config["system_prompt"]
@@ -275,6 +292,7 @@ def generate_messages(self, request: ExplainRequest) -> dict[str, Any]:
             "model": self.model,
             "max_tokens": self.max_tokens,
             "temperature": self.temperature,
+            "thinking": self.thinking,
             "system": system_prompt,
             "messages": messages,
             "structured_data": structured_data,  # Include for reference
diff --git a/app/test_explain.py b/app/test_explain.py
index da3125d..753d245 100644
--- a/app/test_explain.py
+++ b/app/test_explain.py
@@ -57,6 +57,7 @@ def mock_anthropic_client():
     mock_client = MagicMock()
     mock_message = MagicMock()
     mock_content = MagicMock()
+    mock_content.type = "text"
     mock_content.text = "This assembly code implements a simple square function..."
     mock_message.content = [mock_content]
 
@@ -149,6 +150,93 @@ async def test_process_request_success(self, sample_request, mock_anthropic_clie
         assert structured_data["compiler"] == "g++"
         assert structured_data["sourceCode"] == "int square(int x) {\n  return x * x;\n}"
 
+    @pytest.mark.asyncio
+    async def test_picks_last_text_block_with_thinking(self, sample_request, noop_metrics):
+        """When thinking is enabled, the response has thinking blocks before the
+        final text block. The handler should select the text block, not the
+        thinking block."""
+        thinking_block = MagicMock()
+        thinking_block.type = "thinking"
+        thinking_block.thinking = "Let me trace through the imul..."
+
+        text_block = MagicMock()
+        text_block.type = "text"
+        text_block.text = "Final explanation here."
+
+        mock_message = MagicMock()
+        mock_message.content = [thinking_block, text_block]
+        mock_message.usage = MagicMock(input_tokens=100, output_tokens=50)
+        mock_message.stop_reason = "end_turn"
+
+        mock_client = MagicMock()
+        mock_client.messages.create = AsyncMock(return_value=mock_message)
+
+        test_prompt = Prompt(Path("app/prompt.yaml"))
+        response = await process_request(sample_request, mock_client, test_prompt, noop_metrics)
+
+        assert response.status == "success"
+        assert response.explanation == "Final explanation here."
+
+    @pytest.mark.asyncio
+    async def test_returns_error_when_no_text_block(self, sample_request, noop_metrics):
+        """A response with no text block (e.g. thinking exhausted max_tokens)
+        should return status='error' with token usage populated, not raise.
+        Production must surface a structured error rather than a generic 500."""
+        thinking_block = MagicMock()
+        thinking_block.type = "thinking"
+        thinking_block.thinking = "..."
+
+        mock_message = MagicMock()
+        mock_message.content = [thinking_block]
+        mock_message.usage = MagicMock(input_tokens=100, output_tokens=50)
+        mock_message.stop_reason = "max_tokens"
+
+        mock_client = MagicMock()
+        mock_client.messages.create = AsyncMock(return_value=mock_message)
+
+        test_prompt = Prompt(Path("app/prompt.yaml"))
+        response = await process_request(sample_request, mock_client, test_prompt, noop_metrics)
+
+        assert response.status == "error"
+        assert response.explanation is None
+        assert "no text content" in response.message
+        # Real tokens were spent — surface them so callers can see the cost.
+        assert response.usage is not None
+        assert response.usage.inputTokens == 100
+        assert response.usage.outputTokens == 50
+
+
+class TestPromptValidation:
+    """Validation rules enforced at Prompt construction."""
+
+    def test_thinking_requires_min_max_tokens(self):
+        """A prompt YAML that enables thinking must also bump max_tokens."""
+        with pytest.raises(ValueError, match="too low for thinking"):
+            Prompt(
+                {
+                    "model": {"name": "test", "max_tokens": 1536, "thinking": {"type": "adaptive"}},
+                    "system_prompt": "",
+                    "user_prompt": "",
+                    "assistant_prefill": "",
+                    "audience_levels": {},
+                    "explanation_types": {},
+                }
+            )
+
+    def test_thinking_with_sufficient_max_tokens_is_ok(self):
+        """At/above the floor (4096), thinking-enabled prompts load fine."""
+        prompt = Prompt(
+            {
+                "model": {"name": "test", "max_tokens": 4096, "thinking": {"type": "adaptive"}},
+                "system_prompt": "",
+                "user_prompt": "",
+                "assistant_prefill": "",
+                "audience_levels": {},
+                "explanation_types": {},
+            }
+        )
+        assert prompt.thinking == {"type": "adaptive"}
+
 
 class TestSelectImportantAssembly:
     """Test the select_important_assembly function."""
diff --git a/prompt_testing/cli.py b/prompt_testing/cli.py
index e5d3af0..452e050 100644
--- a/prompt_testing/cli.py
+++ b/prompt_testing/cli.py
@@ -16,6 +16,7 @@
 import sys
 from collections import defaultdict
 from pathlib import Path
+from typing import Any
 
 import click
 from dotenv import load_dotenv
@@ -45,8 +46,14 @@ def cli(ctx, project_root):
 @click.option("--max-concurrent", type=int, default=5)
 @click.option("--review", is_flag=True, help="Also run Opus correctness review on results")
 @click.option("--review-model", default="claude-opus-4-7", help="Model for correctness review")
+@click.option(
+    "--reviewer-thinking",
+    type=click.Choice(["off", "adaptive"]),
+    default="adaptive",
+    help="Extended thinking on the reviewer. Default 'adaptive' improves rigor at ~70% extra reviewer cost.",
+)
 @click.pass_context
-def run(ctx, prompt, cases, categories, output, max_concurrent, review, review_model):
+def run(ctx, prompt, cases, categories, output, max_concurrent, review, review_model, reviewer_thinking):
     """Run test cases and save results for review."""
     tester = PromptTester(ctx.obj["project_root"], max_concurrent=max_concurrent)
     results = tester.run(
@@ -56,7 +63,8 @@ def run(ctx, prompt, cases, categories, output, max_concurrent, review, review_m
     )
 
     if review:
-        results = asyncio.run(_run_reviews(ctx.obj["project_root"], results, review_model))
+        thinking = {"type": "adaptive"} if reviewer_thinking == "adaptive" else None
+        results = asyncio.run(_run_reviews(ctx.obj["project_root"], results, review_model, thinking))
 
     tester.save(results, output)
 
@@ -195,11 +203,11 @@ def compilers(ctx, language, search, limit):  # noqa: ARG001
             click.echo(f"... and {len(results) - limit} more")
 
 
-async def _run_reviews(project_root: Path, results: dict, model: str) -> dict:
+async def _run_reviews(project_root: Path, results: dict, model: str, thinking: dict[str, Any] | None = None) -> dict:
     """Run correctness reviews on all successful results."""
     from prompt_testing.reviewer import CorrectnessReviewer
 
-    reviewer = CorrectnessReviewer(model=model)
+    reviewer = CorrectnessReviewer(model=model, thinking=thinking)
     test_dir = project_root / "prompt_testing" / "test_cases"
     all_cases = load_all_test_cases(str(test_dir))
     cases_by_id = {c["id"]: c for c in all_cases}
@@ -209,6 +217,7 @@ async def _run_reviews(project_root: Path, results: dict, model: str) -> dict:
 
     review_cost = 0.0
     errors_found = 0
+    review_failures = 0
     cost_per_input_token, cost_per_output_token = get_model_cost(model)
 
     for i, result in enumerate(successful, 1):
@@ -219,10 +228,19 @@ async def _run_reviews(project_root: Path, results: dict, model: str) -> dict:
         review = await reviewer.review_test_result(case, result["explanation"])
         result["review"] = review
 
-        status = "✓" if review.get("correct") else "✗"
-        n_issues = len(review.get("issues", []))
-        if not review.get("correct"):
+        # `correct`: True = passed, False = real factual error,
+        # None = reviewer infrastructure failure (parse/empty response).
+        # Distinguish so suite metrics don't conflate the two.
+        correct = review.get("correct")
+        if correct is True:
+            status = "✓"
+        elif correct is False:
+            status = "✗"
             errors_found += 1
+        else:
+            status = "?"
+            review_failures += 1
+        n_issues = len(review.get("issues", []))
         cost = (
             review.get("reviewer_input_tokens", 0) * cost_per_input_token
             + review.get("reviewer_output_tokens", 0) * cost_per_output_token
@@ -234,26 +252,33 @@ async def _run_reviews(project_root: Path, results: dict, model: str) -> dict:
     results["review_cost_usd"] = round(review_cost, 6)
     results["total_cost_usd"] = round(results["total_cost_usd"] + review_cost, 6)
     results["errors_found"] = errors_found
+    results["review_failures"] = review_failures
     return results
 
 
-def _print_review_summary(results: dict) -> None:
+def _print_review_summary(results: dict[str, Any]) -> None:
     """Print a summary of correctness reviews."""
     reviewed = [r for r in results["results"] if r.get("review")]
-    correct = sum(1 for r in reviewed if r["review"].get("correct"))
-    incorrect = len(reviewed) - correct
+    passed = sum(1 for r in reviewed if r["review"].get("correct") is True)
+    failed = sum(1 for r in reviewed if r["review"].get("correct") is False)
+    review_failures = sum(1 for r in reviewed if r["review"].get("correct") is None)
 
-    click.echo(f"\nCorrectness: {correct}/{len(reviewed)} passed")
-    if incorrect:
-        click.echo(f"\n⚠ {incorrect} case(s) with issues:")
+    click.echo(f"\nCorrectness: {passed}/{len(reviewed)} passed")
+    if failed:
+        click.echo(f"\n⚠ {failed} case(s) with issues:")
         for r in reviewed:
             review = r["review"]
-            if not review.get("correct"):
+            if review.get("correct") is False:
                 click.echo(f"\n  {r['case_id']}:")
                 for issue in review.get("issues", []):
                     sev = "🔴" if issue["severity"] == "error" else "🟡"
                     click.echo(f"    {sev} {issue['claim']}")
                     click.echo(f"       → {issue['correction']}")
+    if review_failures:
+        click.echo(f"\n⚠ {review_failures} review(s) failed to run (likely max_tokens starvation):")
+        for r in reviewed:
+            if r["review"].get("correct") is None:
+                click.echo(f"  {r['case_id']}: {r['review'].get('summary', '?')}")
 
     click.echo(f"\nReview cost: ${results.get('review_cost_usd', 0):.4f} ({results.get('review_model', '?')})")
 
@@ -261,14 +286,21 @@ def _print_review_summary(results: dict) -> None:
 @cli.command()
 @click.argument("results_file")
 @click.option("--model", default="claude-opus-4-7", help="Reviewer model")
+@click.option(
+    "--thinking",
+    type=click.Choice(["off", "adaptive"]),
+    default="adaptive",
+    help="Extended thinking on the reviewer (default 'adaptive' for tighter rigor).",
+)
 @click.pass_context
-def review(ctx, results_file, model):
+def review(ctx, results_file, model, thinking):
     """Run Opus correctness review on existing results."""
     results_dir = ctx.obj["project_root"] / "prompt_testing" / "results"
     path = results_dir / results_file if not Path(results_file).is_absolute() else Path(results_file)
 
+    thinking_cfg = {"type": "adaptive"} if thinking == "adaptive" else None
     results = json.loads(path.read_text())
-    results = asyncio.run(_run_reviews(ctx.obj["project_root"], results, model))
+    results = asyncio.run(_run_reviews(ctx.obj["project_root"], results, model, thinking_cfg))
 
     # Save updated results
     path.write_text(json.dumps(results, indent=2))
diff --git a/prompt_testing/reviewer.py b/prompt_testing/reviewer.py
index a329317..243dae7 100644
--- a/prompt_testing/reviewer.py
+++ b/prompt_testing/reviewer.py
@@ -66,8 +66,17 @@
 class CorrectnessReviewer:
     """Reviews explanations for factual correctness using a powerful model."""
 
-    def __init__(self, model: str = "claude-opus-4-7"):
+    def __init__(self, model: str = "claude-opus-4-7", thinking: dict[str, Any] | None = None):
+        """Initialise the reviewer.
+
+        Args:
+            model: Claude model id to use for review.
+            thinking: Optional extended-thinking config, e.g.
+                ``{"type": "adaptive"}`` or
+                ``{"type": "enabled", "budget_tokens": 2000}``.
+        """
         self.model = model
+        self.thinking = thinking
         self.client = AsyncAnthropic()
 
     async def review(
@@ -96,31 +105,50 @@ async def review(
         )
 
         # Opus 4.7+ rejects `temperature`; rely on the model's own default.
-        msg = await self.client.messages.create(
-            model=self.model,
-            max_tokens=2048,
-            system=REVIEW_SYSTEM_PROMPT,
-            messages=[{"role": "user", "content": user_prompt}],
-        )
-
-        text = msg.content[0].text.strip()
-
-        # Parse JSON response
-        try:
-            result = json.loads(text)
-        except json.JSONDecodeError:
-            # Try to extract JSON from markdown fencing
-            if "```" in text:
-                json_part = text.split("```")[1]
-                if json_part.startswith("json"):
-                    json_part = json_part[4:]
-                result = json.loads(json_part.strip())
-            else:
-                result = {
-                    "correct": None,
-                    "issues": [],
-                    "summary": f"Failed to parse reviewer response: {text[:200]}",
-                }
+        api_kwargs: dict[str, Any] = {
+            "model": self.model,
+            # Thinking tokens count against max_tokens; keep room for the JSON verdict.
+            "max_tokens": 8192 if self.thinking else 2048,
+            "system": REVIEW_SYSTEM_PROMPT,
+            "messages": [{"role": "user", "content": user_prompt}],
+        }
+        if self.thinking:
+            api_kwargs["thinking"] = self.thinking
+        msg = await self.client.messages.create(**api_kwargs)
+
+        # When thinking is enabled the response contains thinking blocks
+        # before the final text block; pick the last text block.
+        text_blocks = [c for c in msg.content if getattr(c, "type", None) == "text"]
+        text = text_blocks[-1].text.strip() if text_blocks else ""
+
+        if not text:
+            # Likely thinking exhausted max_tokens before any text block.
+            # Surface stop_reason/usage so this is diagnosable rather than a
+            # generic JSON parse failure.
+            result: dict[str, Any] = {
+                "correct": None,
+                "issues": [],
+                "summary": (
+                    f"Reviewer returned no text (stop_reason={msg.stop_reason}, "
+                    f"in={msg.usage.input_tokens}, out={msg.usage.output_tokens})."
+                ),
+            }
+        else:
+            # Parse JSON response
+            try:
+                result = json.loads(text)
+            except json.JSONDecodeError:
+                # Fall back to JSON inside markdown fencing; if that is also
+                # malformed, record a review failure instead of raising.
+                fenced = text.split("```")[1].removeprefix("json") if "```" in text else ""
+                try:
+                    result = json.loads(fenced.strip())
+                except json.JSONDecodeError:
+                    result = {
+                        "correct": None,
+                        "issues": [],
+                        "summary": f"Failed to parse reviewer response: {text[:200]}",
+                    }
 
         result["reviewer_model"] = self.model
         result["reviewer_input_tokens"] = msg.usage.input_tokens
diff --git a/prompt_testing/runner.py b/prompt_testing/runner.py
index 15ea143..e678447 100644
--- a/prompt_testing/runner.py
+++ b/prompt_testing/runner.py
@@ -15,6 +15,7 @@
 
 from app.explain_api import AssemblyItem, ExplainRequest
 from app.explanation_types import AudienceLevel, ExplanationType
+from app.model_costs import get_model_cost
 from app.prompt import Prompt
 from prompt_testing.file_utils import load_all_test_cases
 
@@ -69,20 +70,45 @@ async def _run_one(self, test_case: dict[str, Any], prompt: Prompt) -> dict[str,
             request = self._to_request(test_case)
             prompt_data = prompt.generate_messages(request)
 
+            api_kwargs: dict[str, Any] = {
+                "model": prompt_data["model"],
+                "max_tokens": prompt_data["max_tokens"],
+                "system": prompt_data["system"],
+                "messages": prompt_data["messages"],
+            }
+            if prompt_data.get("thinking"):
+                # Extended thinking: temperature must be 1 / unset.
+                api_kwargs["thinking"] = prompt_data["thinking"]
+            else:
+                api_kwargs["temperature"] = prompt_data["temperature"]
+
             start = time.time()
             try:
-                msg = await self.async_client.messages.create(
-                    model=prompt_data["model"],
-                    max_tokens=prompt_data["max_tokens"],
-                    temperature=prompt_data["temperature"],
-                    system=prompt_data["system"],
-                    messages=prompt_data["messages"],
-                )
+                msg = await self.async_client.messages.create(**api_kwargs)
                 elapsed_ms = int((time.time() - start) * 1000)
+                text_blocks = [c for c in msg.content if getattr(c, "type", None) == "text"]
+                explanation = text_blocks[-1].text.strip() if text_blocks else ""
+                if not explanation:
+                    # Treat empty output as a failure so suite metrics aren't
+                    # skewed. Common cause: thinking exhausting max_tokens
+                    # before any text block is emitted. Tokens were still
+                    # spent — capture them so cost reporting stays accurate.
+                    return {
+                        "case_id": case_id,
+                        "success": False,
+                        "error": (
+                            f"empty response (stop_reason={msg.stop_reason}, "
+                            f"in={msg.usage.input_tokens}, out={msg.usage.output_tokens})"
+                        ),
+                        "model": prompt_data["model"],
+                        "input_tokens": msg.usage.input_tokens,
+                        "output_tokens": msg.usage.output_tokens,
+                        "elapsed_ms": elapsed_ms,
+                    }
                 return {
                     "case_id": case_id,
                     "success": True,
-                    "explanation": msg.content[-1].text.strip(),
+                    "explanation": explanation,
                     "model": prompt_data["model"],
                     "input_tokens": msg.usage.input_tokens,
                     "output_tokens": msg.usage.output_tokens,
@@ -124,9 +150,14 @@ async def run_async(
             print(f"  [{i}/{len(cases)}] {status} {result['case_id']} ({tokens})")
 
         successful = [r for r in results if r["success"]]
+        # Cost includes failures that consumed tokens (e.g. thinking exhausted
+        # max_tokens before any text was emitted) — those aren't free.
+        # Use the prompt's actual model rather than hardcoded rates so the
+        # number stays correct across explainer-model experiments.
+        cost_per_input_token, cost_per_output_token = get_model_cost(prompt.model)
         total_cost = sum(
-            r["input_tokens"] * 3 / 1e6 + r["output_tokens"] * 15 / 1e6  # Sonnet pricing
-            for r in successful
+            r.get("input_tokens", 0) * cost_per_input_token + r.get("output_tokens", 0) * cost_per_output_token
+            for r in results
         )
 
         return {