From 15df9ee26451a284662dc3ee5759494795aca70a Mon Sep 17 00:00:00 2001
From: mattgodbolt-molty <mattgodbolt-molty@users.noreply.github.com>
Date: Wed, 6 May 2026 16:19:56 -0500
Subject: [PATCH 1/7] Add extended-thinking support; default reviewer to
 adaptive thinking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Plumbs Anthropic's extended thinking through the prompt-testing pipeline
and the production explain handler. The production prompt is unchanged
(no `thinking` field set), so behaviour and cost are identical until a
config opts in.

Findings that motivated this change (21-case suite, full results in PR
description):

- Sonnet 4.6 explainer with adaptive thinking: errors flagged 5 -> 1.
  Eliminates the recurring `imul eax, edi, edi` invention (2/5 -> 0/5
  in focused 5-run testing). Cost +~$0.013/req, latency +11s. Held back
  from production: too much wall-clock for an interactive endpoint.
- Opus 4.7 reviewer with adaptive thinking: catches different errors
  and finds real issues the no-thinking reviewer misses (e.g. correctly
  flags `factorial_beginner_assembly` mis-naming TCO). ~70% extra
  reviewer cost, no user-facing latency since reviews are offline.
  Worth defaulting on for eval rigor.

Implementation:

- `Prompt` reads an optional `model.thinking` block (e.g. `{type:
  adaptive}` or `{type: enabled, budget_tokens: 2000}`) and surfaces it
  via `generate_messages()`.
- `app/explain.py` and `prompt_testing/runner.py` both now construct
  api_kwargs conditionally: with thinking, drop `temperature` (the API
  rejects it); pick the last text block from the response (thinking
  blocks come first when enabled).
- `CorrectnessReviewer` accepts an optional `thinking` config and
  applies the same multi-block extraction.
- `prompt-test run` and `prompt-test review` gain `--reviewer-thinking
  / --thinking` flags. Default is `adaptive` because the rigor win on
  eval is worth the extra cost; pass `off` to compare or save money.

Important gotcha: with adaptive thinking, `max_tokens` must be large
enough to leave room for the visible response after the thinking
budget. Production prompt's 1536 starves the text output on complex
cases — bump to >=4096 if a future variant enables thinking.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/explain.py             | 28 ++++++++++++++++++----------
 app/prompt.py              |  5 +++++
 app/test_explain.py        |  1 +
 prompt_testing/cli.py      | 26 ++++++++++++++++++++------
 prompt_testing/reviewer.py | 33 ++++++++++++++++++++++++---------
 prompt_testing/runner.py   | 24 ++++++++++++++++--------
 6 files changed, 84 insertions(+), 33 deletions(-)

diff --git a/app/explain.py b/app/explain.py
index c352bcb..2b96716 100644
--- a/app/explain.py
+++ b/app/explain.py
@@ -92,16 +92,24 @@ async def _call_anthropic_api(
     # Call Claude API
     LOGGER.info("Using Anthropic client with model: %s", {prompt_data["model"]})
 
-    message = await client.messages.create(
-        model=prompt_data["model"],
-        max_tokens=prompt_data["max_tokens"],
-        temperature=prompt_data["temperature"],
-        system=prompt_data["system"],
-        messages=prompt_data["messages"],
-    )
-
-    # Get explanation and strip leading/trailing whitespace
-    explanation = message.content[0].text.strip()
+    api_kwargs: dict = {
+        "model": prompt_data["model"],
+        "max_tokens": prompt_data["max_tokens"],
+        "system": prompt_data["system"],
+        "messages": prompt_data["messages"],
+    }
+    if prompt_data.get("thinking"):
+        # Extended thinking: API requires temperature to be unset.
+        api_kwargs["thinking"] = prompt_data["thinking"]
+    else:
+        api_kwargs["temperature"] = prompt_data["temperature"]
+
+    message = await client.messages.create(**api_kwargs)
+
+    # Pick the last text block — when thinking is enabled the response
+    # contains thinking blocks before the final text block.
+    text_blocks = [c for c in message.content if getattr(c, "type", None) == "text"]
+    explanation = text_blocks[-1].text.strip() if text_blocks else ""
 
     # Extract usage information
     input_tokens = message.usage.input_tokens
diff --git a/app/prompt.py b/app/prompt.py
index 7f7cdb4..55c80e2 100644
--- a/app/prompt.py
+++ b/app/prompt.py
@@ -36,6 +36,10 @@ def __init__(self, config: dict[str, Any] | Path):
         self.model = self.config["model"]["name"]
         self.max_tokens = self.config["model"]["max_tokens"]
         self.temperature = self.config["model"].get("temperature", 0.0)
+        # Optional extended-thinking config, e.g. {"type": "adaptive"} or
+        # {"type": "enabled", "budget_tokens": 2000}. When set, callers
+        # should drop `temperature` (the API requires it to be unset/1).
+        self.thinking = self.config["model"].get("thinking")
 
         # Extract prompt templates
         self.system_prompt_template = self.config["system_prompt"]
@@ -275,6 +279,7 @@ def generate_messages(self, request: ExplainRequest) -> dict[str, Any]:
             "model": self.model,
             "max_tokens": self.max_tokens,
             "temperature": self.temperature,
+            "thinking": self.thinking,
             "system": system_prompt,
             "messages": messages,
             "structured_data": structured_data,  # Include for reference
diff --git a/app/test_explain.py b/app/test_explain.py
index da3125d..675a069 100644
--- a/app/test_explain.py
+++ b/app/test_explain.py
@@ -57,6 +57,7 @@ def mock_anthropic_client():
     mock_client = MagicMock()
     mock_message = MagicMock()
     mock_content = MagicMock()
+    mock_content.type = "text"
     mock_content.text = "This assembly code implements a simple square function..."
     mock_message.content = [mock_content]
 
diff --git a/prompt_testing/cli.py b/prompt_testing/cli.py
index e5d3af0..de5b1c0 100644
--- a/prompt_testing/cli.py
+++ b/prompt_testing/cli.py
@@ -45,8 +45,14 @@ def cli(ctx, project_root):
 @click.option("--max-concurrent", type=int, default=5)
 @click.option("--review", is_flag=True, help="Also run Opus correctness review on results")
 @click.option("--review-model", default="claude-opus-4-7", help="Model for correctness review")
+@click.option(
+    "--reviewer-thinking",
+    type=click.Choice(["off", "adaptive"]),
+    default="adaptive",
+    help="Extended thinking on the reviewer. Default 'adaptive' improves rigor at ~70% extra reviewer cost.",
+)
 @click.pass_context
-def run(ctx, prompt, cases, categories, output, max_concurrent, review, review_model):
+def run(ctx, prompt, cases, categories, output, max_concurrent, review, review_model, reviewer_thinking):
     """Run test cases and save results for review."""
     tester = PromptTester(ctx.obj["project_root"], max_concurrent=max_concurrent)
     results = tester.run(
@@ -56,7 +62,8 @@ def run(ctx, prompt, cases, categories, output, max_concurrent, review, review_m
     )
 
     if review:
-        results = asyncio.run(_run_reviews(ctx.obj["project_root"], results, review_model))
+        thinking = {"type": "adaptive"} if reviewer_thinking == "adaptive" else None
+        results = asyncio.run(_run_reviews(ctx.obj["project_root"], results, review_model, thinking))
 
     tester.save(results, output)
 
@@ -195,11 +202,11 @@ def compilers(ctx, language, search, limit):  # noqa: ARG001
             click.echo(f"... and {len(results) - limit} more")
 
 
-async def _run_reviews(project_root: Path, results: dict, model: str) -> dict:
+async def _run_reviews(project_root: Path, results: dict, model: str, thinking: dict | None = None) -> dict:
     """Run correctness reviews on all successful results."""
     from prompt_testing.reviewer import CorrectnessReviewer
 
-    reviewer = CorrectnessReviewer(model=model)
+    reviewer = CorrectnessReviewer(model=model, thinking=thinking)
     test_dir = project_root / "prompt_testing" / "test_cases"
     all_cases = load_all_test_cases(str(test_dir))
     cases_by_id = {c["id"]: c for c in all_cases}
@@ -261,14 +268,21 @@ def _print_review_summary(results: dict) -> None:
 @cli.command()
 @click.argument("results_file")
 @click.option("--model", default="claude-opus-4-7", help="Reviewer model")
+@click.option(
+    "--thinking",
+    type=click.Choice(["off", "adaptive"]),
+    default="adaptive",
+    help="Extended thinking on the reviewer (default 'adaptive' for tighter rigor).",
+)
 @click.pass_context
-def review(ctx, results_file, model):
+def review(ctx, results_file, model, thinking):
     """Run Opus correctness review on existing results."""
     results_dir = ctx.obj["project_root"] / "prompt_testing" / "results"
     path = results_dir / results_file if not Path(results_file).is_absolute() else Path(results_file)
 
+    thinking_cfg = {"type": "adaptive"} if thinking == "adaptive" else None
     results = json.loads(path.read_text())
-    results = asyncio.run(_run_reviews(ctx.obj["project_root"], results, model))
+    results = asyncio.run(_run_reviews(ctx.obj["project_root"], results, model, thinking_cfg))
 
     # Save updated results
     path.write_text(json.dumps(results, indent=2))
diff --git a/prompt_testing/reviewer.py b/prompt_testing/reviewer.py
index a329317..13093d2 100644
--- a/prompt_testing/reviewer.py
+++ b/prompt_testing/reviewer.py
@@ -66,8 +66,17 @@
 class CorrectnessReviewer:
     """Reviews explanations for factual correctness using a powerful model."""
 
-    def __init__(self, model: str = "claude-opus-4-7"):
+    def __init__(self, model: str = "claude-opus-4-7", thinking: dict[str, Any] | None = None):
+        """Initialise the reviewer.
+
+        Args:
+            model: Claude model id to use for review.
+            thinking: Optional extended-thinking config, e.g.
+                ``{"type": "adaptive"}`` or
+                ``{"type": "enabled", "budget_tokens": 2000}``.
+        """
         self.model = model
+        self.thinking = thinking
         self.client = AsyncAnthropic()
 
     async def review(
@@ -96,14 +105,20 @@ async def review(
         )
 
         # Opus 4.7+ rejects `temperature`; rely on the model's own default.
-        msg = await self.client.messages.create(
-            model=self.model,
-            max_tokens=2048,
-            system=REVIEW_SYSTEM_PROMPT,
-            messages=[{"role": "user", "content": user_prompt}],
-        )
-
-        text = msg.content[0].text.strip()
+        api_kwargs: dict[str, Any] = {
+            "model": self.model,
+            "max_tokens": 2048,
+            "system": REVIEW_SYSTEM_PROMPT,
+            "messages": [{"role": "user", "content": user_prompt}],
+        }
+        if self.thinking:
+            api_kwargs["thinking"] = self.thinking
+        msg = await self.client.messages.create(**api_kwargs)
+
+        # When thinking is enabled the response contains thinking blocks
+        # before the final text block; pick the last text block.
+        text_blocks = [c for c in msg.content if getattr(c, "type", None) == "text"]
+        text = text_blocks[-1].text.strip() if text_blocks else ""
 
         # Parse JSON response
         try:
diff --git a/prompt_testing/runner.py b/prompt_testing/runner.py
index 15ea143..492fca9 100644
--- a/prompt_testing/runner.py
+++ b/prompt_testing/runner.py
@@ -69,20 +69,28 @@ async def _run_one(self, test_case: dict[str, Any], prompt: Prompt) -> dict[str,
             request = self._to_request(test_case)
             prompt_data = prompt.generate_messages(request)
 
+            api_kwargs: dict[str, Any] = {
+                "model": prompt_data["model"],
+                "max_tokens": prompt_data["max_tokens"],
+                "system": prompt_data["system"],
+                "messages": prompt_data["messages"],
+            }
+            if prompt_data.get("thinking"):
+                # Extended thinking: temperature must be 1 / unset.
+                api_kwargs["thinking"] = prompt_data["thinking"]
+            else:
+                api_kwargs["temperature"] = prompt_data["temperature"]
+
             start = time.time()
             try:
-                msg = await self.async_client.messages.create(
-                    model=prompt_data["model"],
-                    max_tokens=prompt_data["max_tokens"],
-                    temperature=prompt_data["temperature"],
-                    system=prompt_data["system"],
-                    messages=prompt_data["messages"],
-                )
+                msg = await self.async_client.messages.create(**api_kwargs)
                 elapsed_ms = int((time.time() - start) * 1000)
+                text_blocks = [c for c in msg.content if getattr(c, "type", None) == "text"]
+                explanation = text_blocks[-1].text.strip() if text_blocks else ""
                 return {
                     "case_id": case_id,
                     "success": True,
-                    "explanation": msg.content[-1].text.strip(),
+                    "explanation": explanation,
                     "model": prompt_data["model"],
                     "input_tokens": msg.usage.input_tokens,
                     "output_tokens": msg.usage.output_tokens,

From 7739322d0d3e8ac2629b0d09a48f408b47d7de2a Mon Sep 17 00:00:00 2001
From: mattgodbolt-molty <mattgodbolt-molty@users.noreply.github.com>
Date: Wed, 6 May 2026 16:27:37 -0500
Subject: [PATCH 2/7] Document Anthropic API gotchas in CLAUDE.md

Adds a short section covering the non-obvious things this PR's session
turned up that would burn time to rediscover: max_tokens-includes-
thinking, the reviewer's temperature deprecation, the new default
reviewer thinking, the deliberate hold on production explainer
thinking, and the multi-block response pattern.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CLAUDE.md | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/CLAUDE.md b/CLAUDE.md
index 323c627..7f78162 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -66,6 +66,26 @@ uv run ruff format
 The service processes compiler output through a pipeline: input validation → smart assembly filtering → Claude API
 call → response with metrics. See `claude_explain.md` for detailed architecture documentation.
 
+## Anthropic API gotchas
+
+- **`max_tokens` includes thinking tokens.** When a prompt YAML sets `model.thinking: {type: adaptive}` (or
+  `{type: enabled, budget_tokens: N}`), thinking counts against `max_tokens`. The production value `1536` silently
+  starves the visible text output on complex cases when thinking is on — bump to ≥4096 (8192 worked in past
+  experiments) before enabling thinking on any explainer prompt.
+- **The reviewer model rejects `temperature`.** Opus 4.7 deprecated the parameter, so `prompt_testing/reviewer.py`
+  omits it. The Sonnet explainer still accepts `temperature`. If you swap the reviewer to a model that requires
+  it, restore the param.
+- **Reviewer thinking is on by default.** `prompt-test run --review` and `prompt-test review` default to
+  `--reviewer-thinking adaptive` / `--thinking adaptive`. It catches factual errors the no-think reviewer misses
+  but adds ~70% to review cost. Pass `off` to compare runs or save money on large batches.
+- **Production explainer thinking is intentionally off.** Adaptive thinking on Sonnet 4.6 measurably improves
+  factual accuracy (e.g. eliminates the recurring `imul eax, edi, edi` invention) but adds ~11s end-to-end
+  latency, which is too much for the interactive endpoint. Don't enable it in `app/prompt.yaml` without an
+  explicit latency/quality decision.
+- **Multi-block responses.** When thinking is enabled the API returns thinking blocks before the text block.
+  `app/explain.py` and `prompt_testing/runner.py` both pick the last text block via `getattr(c, "type", None) ==
+  "text"`. Preserve that pattern for any new code that consumes responses.
+
 ## Code Style Guidelines
 
 - Prefer using modern Python 3.13+ type syntax. Good: `a: list[str] | None`. Bad: `a: Optional[List[str]]`

From acdb10db63fe26c31accf5eea69c07e76caf14a9 Mon Sep 17 00:00:00 2001
From: mattgodbolt-molty <mattgodbolt-molty@users.noreply.github.com>
Date: Wed, 6 May 2026 16:35:03 -0500
Subject: [PATCH 3/7] Address Copilot review on PR #17

- app/explain.py: drop the accidental set literal in the model log so it
  prints the model id rather than `{'claude-...'}`.
- app/explain.py: raise RuntimeError when the response contains no text
  block (e.g. extended thinking exhausting max_tokens), so we don't
  cache or serve an empty success response.
- prompt_testing/runner.py: same empty-output guard, returning
  success=False with stop_reason and token counts so suite metrics
  reflect reality instead of silently absorbing empty responses.
- app/test_explain.py: add coverage for multi-block responses
  (thinking + text picks the text block) and the empty-block case
  (raises RuntimeError).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/explain.py           |  9 +++++++-
 app/test_explain.py      | 47 ++++++++++++++++++++++++++++++++++++++++
 prompt_testing/runner.py | 12 ++++++++++
 3 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/app/explain.py b/app/explain.py
index 2b96716..8e443cf 100644
--- a/app/explain.py
+++ b/app/explain.py
@@ -90,7 +90,7 @@ async def _call_anthropic_api(
     LOGGER.debug("=== END PROMPT DEBUG ===")
 
     # Call Claude API
-    LOGGER.info("Using Anthropic client with model: %s", {prompt_data["model"]})
+    LOGGER.info("Using Anthropic client with model: %s", prompt_data["model"])
 
     api_kwargs: dict = {
         "model": prompt_data["model"],
@@ -110,6 +110,13 @@ async def _call_anthropic_api(
     # contains thinking blocks before the final text block.
     text_blocks = [c for c in message.content if getattr(c, "type", None) == "text"]
     explanation = text_blocks[-1].text.strip() if text_blocks else ""
+    if not explanation:
+        # Can happen if extended thinking exhausts max_tokens before any
+        # text block is emitted. Don't cache or return an empty response.
+        raise RuntimeError(
+            f"Claude returned no text content (stop_reason={message.stop_reason}). "
+            f"If thinking is enabled, max_tokens may be too low."
+        )
 
     # Extract usage information
     input_tokens = message.usage.input_tokens
diff --git a/app/test_explain.py b/app/test_explain.py
index 675a069..de10d4a 100644
--- a/app/test_explain.py
+++ b/app/test_explain.py
@@ -150,6 +150,53 @@ async def test_process_request_success(self, sample_request, mock_anthropic_clie
         assert structured_data["compiler"] == "g++"
         assert structured_data["sourceCode"] == "int square(int x) {\n  return x * x;\n}"
 
+    @pytest.mark.asyncio
+    async def test_picks_last_text_block_with_thinking(self, sample_request, noop_metrics):
+        """When thinking is enabled, the response has thinking blocks before the
+        final text block. The handler should select the text block, not the
+        thinking block."""
+        thinking_block = MagicMock()
+        thinking_block.type = "thinking"
+        thinking_block.thinking = "Let me trace through the imul..."
+
+        text_block = MagicMock()
+        text_block.type = "text"
+        text_block.text = "Final explanation here."
+
+        mock_message = MagicMock()
+        mock_message.content = [thinking_block, text_block]
+        mock_message.usage = MagicMock(input_tokens=100, output_tokens=50)
+        mock_message.stop_reason = "end_turn"
+
+        mock_client = MagicMock()
+        mock_client.messages.create = AsyncMock(return_value=mock_message)
+
+        test_prompt = Prompt(Path("app/prompt.yaml"))
+        response = await process_request(sample_request, mock_client, test_prompt, noop_metrics)
+
+        assert response.status == "success"
+        assert response.explanation == "Final explanation here."
+
+    @pytest.mark.asyncio
+    async def test_raises_when_no_text_block(self, sample_request, noop_metrics):
+        """A response with no text block (e.g. thinking exhausted max_tokens)
+        should raise rather than silently caching an empty explanation."""
+        thinking_block = MagicMock()
+        thinking_block.type = "thinking"
+        thinking_block.thinking = "..."
+
+        mock_message = MagicMock()
+        mock_message.content = [thinking_block]
+        mock_message.usage = MagicMock(input_tokens=100, output_tokens=50)
+        mock_message.stop_reason = "max_tokens"
+
+        mock_client = MagicMock()
+        mock_client.messages.create = AsyncMock(return_value=mock_message)
+
+        test_prompt = Prompt(Path("app/prompt.yaml"))
+        with pytest.raises(RuntimeError, match="no text content"):
+            await process_request(sample_request, mock_client, test_prompt, noop_metrics)
+
 
 class TestSelectImportantAssembly:
     """Test the select_important_assembly function."""
diff --git a/prompt_testing/runner.py b/prompt_testing/runner.py
index 492fca9..75a6850 100644
--- a/prompt_testing/runner.py
+++ b/prompt_testing/runner.py
@@ -87,6 +87,18 @@ async def _run_one(self, test_case: dict[str, Any], prompt: Prompt) -> dict[str,
                 elapsed_ms = int((time.time() - start) * 1000)
                 text_blocks = [c for c in msg.content if getattr(c, "type", None) == "text"]
                 explanation = text_blocks[-1].text.strip() if text_blocks else ""
+                if not explanation:
+                    # Treat empty output as a failure so suite metrics aren't
+                    # skewed. Common cause: thinking exhausting max_tokens
+                    # before any text block is emitted.
+                    return {
+                        "case_id": case_id,
+                        "success": False,
+                        "error": (
+                            f"empty response (stop_reason={msg.stop_reason}, "
+                            f"in={msg.usage.input_tokens}, out={msg.usage.output_tokens})"
+                        ),
+                    }
                 return {
                     "case_id": case_id,
                     "success": True,

From 01d80c89e6ff7b8e0dadf78813df1f39b58c85e3 Mon Sep 17 00:00:00 2001
From: mattgodbolt-molty <mattgodbolt-molty@users.noreply.github.com>
Date: Wed, 6 May 2026 16:59:21 -0500
Subject: [PATCH 4/7] Address local code review on PR #17
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Findings beyond Copilot's pass:

- app/explain.py: empty-text responses no longer raise (which surfaced
  to users as a generic 500 with no payload). They now return
  ExplainResponse(status="error") with usage populated, emit a
  ClaudeExplainEmptyResponse metric, and are not cached so retries hit
  the API.
- prompt_testing/runner.py: empty-response failures now capture
  input_tokens/output_tokens/elapsed_ms/model and contribute to
  total_cost_usd. Previously the spent tokens vanished from suite cost.
- app/cache.py: cache_data explicitly includes the thinking config.
  Currently captured via prompt_version's hash, but the documented
  cache contract is the explicit field list — listing thinking keeps
  that contract honest.
- app/prompt.py: Prompt.__init__ refuses thinking-enabled configs with
  max_tokens < 4096. Below this, thinking can consume the whole budget
  and starve the visible text. Fail at load time rather than silently
  producing empty responses.
- prompt_testing/cli.py: distinguish review_failures (correct=None,
  reviewer infrastructure failed) from errors_found (correct=False,
  real factual error). Previously conflated under errors_found, which
  hid the difference between "model said something wrong" and "we
  couldn't review the response at all."
- prompt_testing/cli.py: tighten bare dict annotations to
  dict[str, Any] for consistency.
- CLAUDE.md: note redacted_thinking blocks (rare, encrypted reasoning
  when safety filters trip) and document the structured-error contract
  for empty responses.

Tests: 94 -> 96 passing (added two for the new Prompt validation, plus
updated the empty-text test to assert the structured error response
shape).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CLAUDE.md                  | 13 ++++++++---
 app/cache.py               |  5 ++++
 app/explain.py             | 47 ++++++++++++++++++++++++++++---------
 app/prompt.py              | 13 +++++++++++
 app/test_explain.py        | 48 ++++++++++++++++++++++++++++++++++----
 prompt_testing/cli.py      | 40 ++++++++++++++++++++++---------
 prompt_testing/reviewer.py | 45 ++++++++++++++++++++++-------------
 prompt_testing/runner.py   | 13 ++++++++---
 8 files changed, 176 insertions(+), 48 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 7f78162..c6964a2 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -70,8 +70,8 @@ call → response with metrics. See `claude_explain.md` for detailed architectur
 
 - **`max_tokens` includes thinking tokens.** When a prompt YAML sets `model.thinking: {type: adaptive}` (or
   `{type: enabled, budget_tokens: N}`), thinking counts against `max_tokens`. The production value `1536` silently
-  starves the visible text output on complex cases when thinking is on — bump to ≥4096 (8192 worked in past
-  experiments) before enabling thinking on any explainer prompt.
+  starves the visible text output on complex cases when thinking is on. `Prompt.__init__` now refuses to load a
+  thinking-enabled config with `max_tokens < 4096`; ≥4096 (8192 worked in past experiments) is the floor.
 - **The reviewer model rejects `temperature`.** Opus 4.7 deprecated the parameter, so `prompt_testing/reviewer.py`
   omits it. The Sonnet explainer still accepts `temperature`. If you swap the reviewer to a model that requires
   it, restore the param.
@@ -84,7 +84,14 @@ call → response with metrics. See `claude_explain.md` for detailed architectur
   explicit latency/quality decision.
 - **Multi-block responses.** When thinking is enabled the API returns thinking blocks before the text block.
   `app/explain.py` and `prompt_testing/runner.py` both pick the last text block via `getattr(c, "type", None) ==
-  "text"`. Preserve that pattern for any new code that consumes responses.
+  "text"`. Preserve that pattern for any new code that consumes responses. The API may also return
+  `redacted_thinking` blocks (encrypted reasoning when safety filters trip); the same filter excludes them
+  correctly, but be aware "no text block" can mean either max_tokens starvation *or* a redacted-thinking-only
+  response — the error message is the same.
+- **Empty responses are not 500s.** When the model returns no text block, `app/explain.py` returns
+  `ExplainResponse(status="error")` with `usage` populated and emits `ClaudeExplainEmptyResponse`. The cache
+  layer skips storing error responses so retries hit the API. Don't change this to raise — the structured error
+  is what the CE frontend can render.
 
 ## Code Style Guidelines
 
diff --git a/app/cache.py b/app/cache.py
index f7dd0fc..b612c11 100644
--- a/app/cache.py
+++ b/app/cache.py
@@ -142,6 +142,11 @@ def generate_cache_key(request: ExplainRequest, prompt: Prompt) -> str:
         "model": prompt_data["model"],
         "max_tokens": prompt_data["max_tokens"],
         "temperature": prompt_data["temperature"],
+        # Thinking config affects the API response so it must affect the cache
+        # key. Currently captured via `prompt_version` too, but listing it
+        # explicitly keeps the contract clear if thinking ever gets set
+        # outside the YAML (e.g. per-request override).
+        "thinking": prompt_data.get("thinking"),
         "system": prompt_data["system"],
         "messages": prompt_data["messages"],
         # Include a hash of the prompt config to invalidate cache when prompts change
diff --git a/app/explain.py b/app/explain.py
index 8e443cf..d973075 100644
--- a/app/explain.py
+++ b/app/explain.py
@@ -1,4 +1,5 @@
 import logging
+from typing import Any
 
 from anthropic import AsyncAnthropic
 
@@ -59,8 +60,10 @@ async def process_request(
     # Cache miss or no cache - proceed with Anthropic API call
     response = await _call_anthropic_api(body, client, prompt, metrics_provider)
 
-    # Cache the response (if cache provider is available)
-    if cache_provider is not None:
+    # Cache the response (if cache provider is available). Don't cache
+    # error responses — they consume real tokens but produce no useful
+    # content, and we want a retry to hit the API rather than the cache.
+    if cache_provider is not None and response.status == "success":
         await cache_response(body, prompt, response, cache_provider)
         metrics_provider.put_metric("ClaudeExplainCacheMiss", 1)
 
@@ -92,7 +95,7 @@ async def _call_anthropic_api(
     # Call Claude API
     LOGGER.info("Using Anthropic client with model: %s", prompt_data["model"])
 
-    api_kwargs: dict = {
+    api_kwargs: dict[str, Any] = {
         "model": prompt_data["model"],
         "max_tokens": prompt_data["max_tokens"],
         "system": prompt_data["system"],
@@ -106,22 +109,44 @@ async def _call_anthropic_api(
 
     message = await client.messages.create(**api_kwargs)
 
+    # Extract usage information
+    input_tokens = message.usage.input_tokens
+    output_tokens = message.usage.output_tokens
+    total_tokens = input_tokens + output_tokens
+
     # Pick the last text block — when thinking is enabled the response
     # contains thinking blocks before the final text block.
     text_blocks = [c for c in message.content if getattr(c, "type", None) == "text"]
     explanation = text_blocks[-1].text.strip() if text_blocks else ""
     if not explanation:
         # Can happen if extended thinking exhausts max_tokens before any
-        # text block is emitted. Don't cache or return an empty response.
-        raise RuntimeError(
-            f"Claude returned no text content (stop_reason={message.stop_reason}). "
+        # text block is emitted. Surface the failure to the caller with
+        # token usage populated, and emit a metric so this is visible on
+        # dashboards rather than buried in a generic 500.
+        message_text = (
+            f"Claude returned no text content "
+            f"(stop_reason={message.stop_reason}, in={input_tokens}, out={output_tokens}). "
             f"If thinking is enabled, max_tokens may be too low."
         )
-
-    # Extract usage information
-    input_tokens = message.usage.input_tokens
-    output_tokens = message.usage.output_tokens
-    total_tokens = input_tokens + output_tokens
+        LOGGER.warning(message_text)
+        metrics_provider.set_property("language", body.language)
+        metrics_provider.set_property("compiler", body.compiler)
+        metrics_provider.set_property("instructionSet", body.instructionSet or "unknown")
+        metrics_provider.set_property("cached", "false")
+        metrics_provider.put_metric("ClaudeExplainRequest", 1)
+        metrics_provider.put_metric("ClaudeExplainEmptyResponse", 1)
+        metrics_provider.put_metric("ClaudeExplainInputTokens", input_tokens)
+        metrics_provider.put_metric("ClaudeExplainOutputTokens", output_tokens)
+        return ExplainResponse(
+            status="error",
+            message=message_text,
+            model=prompt_data["model"],
+            usage=TokenUsage(
+                inputTokens=input_tokens,
+                outputTokens=output_tokens,
+                totalTokens=total_tokens,
+            ),
+        )
 
     # Calculate costs based on model
     cost_per_input_token, cost_per_output_token = get_model_cost(prompt_data["model"])
diff --git a/app/prompt.py b/app/prompt.py
index 55c80e2..eb0f044 100644
--- a/app/prompt.py
+++ b/app/prompt.py
@@ -15,6 +15,11 @@
 # Constants from explain.py that are needed for data preparation
 MAX_ASSEMBLY_LINES = 300  # Maximum number of assembly lines to process
 
+# Minimum max_tokens that's safe to pair with extended thinking. Below this,
+# adaptive thinking can consume the whole budget on complex inputs and leave
+# nothing for the visible response.
+MIN_MAX_TOKENS_WITH_THINKING = 4096
+
 
 class Prompt:
     """Manages prompt templates and generates messages for Claude API."""
@@ -40,6 +45,14 @@ def __init__(self, config: dict[str, Any] | Path):
         # {"type": "enabled", "budget_tokens": 2000}. When set, callers
         # should drop `temperature` (the API requires it to be unset/1).
         self.thinking = self.config["model"].get("thinking")
+        if self.thinking and self.max_tokens < MIN_MAX_TOKENS_WITH_THINKING:
+            # Adaptive thinking happily consumes the entire token budget on
+            # complex inputs and leaves nothing for the visible text block.
+            # Fail loudly here rather than silently returning empty responses.
+            raise ValueError(
+                f"max_tokens={self.max_tokens} is too low for thinking; "
+                f"use at least {MIN_MAX_TOKENS_WITH_THINKING} when model.thinking is set."
+            )
 
         # Extract prompt templates
         self.system_prompt_template = self.config["system_prompt"]
diff --git a/app/test_explain.py b/app/test_explain.py
index de10d4a..753d245 100644
--- a/app/test_explain.py
+++ b/app/test_explain.py
@@ -178,9 +178,10 @@ async def test_picks_last_text_block_with_thinking(self, sample_request, noop_me
         assert response.explanation == "Final explanation here."
 
     @pytest.mark.asyncio
-    async def test_raises_when_no_text_block(self, sample_request, noop_metrics):
+    async def test_returns_error_when_no_text_block(self, sample_request, noop_metrics):
         """A response with no text block (e.g. thinking exhausted max_tokens)
-        should raise rather than silently caching an empty explanation."""
+        should return status='error' with token usage populated, not raise.
+        Production must surface a structured error rather than a generic 500."""
         thinking_block = MagicMock()
         thinking_block.type = "thinking"
         thinking_block.thinking = "..."
@@ -194,8 +195,47 @@ async def test_raises_when_no_text_block(self, sample_request, noop_metrics):
         mock_client.messages.create = AsyncMock(return_value=mock_message)
 
         test_prompt = Prompt(Path("app/prompt.yaml"))
-        with pytest.raises(RuntimeError, match="no text content"):
-            await process_request(sample_request, mock_client, test_prompt, noop_metrics)
+        response = await process_request(sample_request, mock_client, test_prompt, noop_metrics)
+
+        assert response.status == "error"
+        assert response.explanation is None
+        assert "no text content" in response.message
+        # Real tokens were spent — surface them so callers can see the cost.
+        assert response.usage is not None
+        assert response.usage.inputTokens == 100
+        assert response.usage.outputTokens == 50
+
+
+class TestPromptValidation:
+    """Validation rules enforced at Prompt construction."""
+
+    def test_thinking_requires_min_max_tokens(self):
+        """A prompt YAML that enables thinking must also bump max_tokens."""
+        with pytest.raises(ValueError, match="too low for thinking"):
+            Prompt(
+                {
+                    "model": {"name": "test", "max_tokens": 1536, "thinking": {"type": "adaptive"}},
+                    "system_prompt": "",
+                    "user_prompt": "",
+                    "assistant_prefill": "",
+                    "audience_levels": {},
+                    "explanation_types": {},
+                }
+            )
+
+    def test_thinking_with_sufficient_max_tokens_is_ok(self):
+        """At/above the floor (4096), thinking-enabled prompts load fine."""
+        prompt = Prompt(
+            {
+                "model": {"name": "test", "max_tokens": 4096, "thinking": {"type": "adaptive"}},
+                "system_prompt": "",
+                "user_prompt": "",
+                "assistant_prefill": "",
+                "audience_levels": {},
+                "explanation_types": {},
+            }
+        )
+        assert prompt.thinking == {"type": "adaptive"}
 
 
 class TestSelectImportantAssembly:
diff --git a/prompt_testing/cli.py b/prompt_testing/cli.py
index de5b1c0..452e050 100644
--- a/prompt_testing/cli.py
+++ b/prompt_testing/cli.py
@@ -16,6 +16,7 @@
 import sys
 from collections import defaultdict
 from pathlib import Path
+from typing import Any
 
 import click
 from dotenv import load_dotenv
@@ -202,7 +203,7 @@ def compilers(ctx, language, search, limit):  # noqa: ARG001
             click.echo(f"... and {len(results) - limit} more")
 
 
-async def _run_reviews(project_root: Path, results: dict, model: str, thinking: dict | None = None) -> dict:
+async def _run_reviews(project_root: Path, results: dict, model: str, thinking: dict[str, Any] | None = None) -> dict:
     """Run correctness reviews on all successful results."""
     from prompt_testing.reviewer import CorrectnessReviewer
 
@@ -216,6 +217,7 @@ async def _run_reviews(project_root: Path, results: dict, model: str, thinking:
 
     review_cost = 0.0
     errors_found = 0
+    review_failures = 0
     cost_per_input_token, cost_per_output_token = get_model_cost(model)
 
     for i, result in enumerate(successful, 1):
@@ -226,10 +228,19 @@ async def _run_reviews(project_root: Path, results: dict, model: str, thinking:
         review = await reviewer.review_test_result(case, result["explanation"])
         result["review"] = review
 
-        status = "✓" if review.get("correct") else "✗"
-        n_issues = len(review.get("issues", []))
-        if not review.get("correct"):
+        # `correct`: True = passed, False = real factual error,
+        # None = reviewer infrastructure failure (parse/empty response).
+        # Distinguish so suite metrics don't conflate the two.
+        correct = review.get("correct")
+        if correct is True:
+            status = "✓"
+        elif correct is False:
+            status = "✗"
             errors_found += 1
+        else:
+            status = "?"
+            review_failures += 1
+        n_issues = len(review.get("issues", []))
         cost = (
             review.get("reviewer_input_tokens", 0) * cost_per_input_token
             + review.get("reviewer_output_tokens", 0) * cost_per_output_token
@@ -241,26 +252,33 @@ async def _run_reviews(project_root: Path, results: dict, model: str, thinking:
     results["review_cost_usd"] = round(review_cost, 6)
     results["total_cost_usd"] = round(results["total_cost_usd"] + review_cost, 6)
     results["errors_found"] = errors_found
+    results["review_failures"] = review_failures
     return results
 
 
-def _print_review_summary(results: dict) -> None:
+def _print_review_summary(results: dict[str, Any]) -> None:
     """Print a summary of correctness reviews."""
     reviewed = [r for r in results["results"] if r.get("review")]
-    correct = sum(1 for r in reviewed if r["review"].get("correct"))
-    incorrect = len(reviewed) - correct
+    passed = sum(1 for r in reviewed if r["review"].get("correct") is True)
+    failed = sum(1 for r in reviewed if r["review"].get("correct") is False)
+    review_failures = sum(1 for r in reviewed if r["review"].get("correct") is None)
 
-    click.echo(f"\nCorrectness: {correct}/{len(reviewed)} passed")
-    if incorrect:
-        click.echo(f"\n⚠ {incorrect} case(s) with issues:")
+    click.echo(f"\nCorrectness: {passed}/{len(reviewed)} passed")
+    if failed:
+        click.echo(f"\n⚠ {failed} case(s) with issues:")
         for r in reviewed:
             review = r["review"]
-            if not review.get("correct"):
+            if review.get("correct") is False:
                 click.echo(f"\n  {r['case_id']}:")
                 for issue in review.get("issues", []):
                     sev = "🔴" if issue["severity"] == "error" else "🟡"
                     click.echo(f"    {sev} {issue['claim']}")
                     click.echo(f"       → {issue['correction']}")
+    if review_failures:
+        click.echo(f"\n⚠ {review_failures} review(s) failed to run (likely max_tokens starvation):")
+        for r in reviewed:
+            if r["review"].get("correct") is None:
+                click.echo(f"  {r['case_id']}: {r['review'].get('summary', '?')}")
 
     click.echo(f"\nReview cost: ${results.get('review_cost_usd', 0):.4f} ({results.get('review_model', '?')})")
 
diff --git a/prompt_testing/reviewer.py b/prompt_testing/reviewer.py
index 13093d2..243dae7 100644
--- a/prompt_testing/reviewer.py
+++ b/prompt_testing/reviewer.py
@@ -120,22 +120,35 @@ async def review(
         text_blocks = [c for c in msg.content if getattr(c, "type", None) == "text"]
         text = text_blocks[-1].text.strip() if text_blocks else ""
 
-        # Parse JSON response
-        try:
-            result = json.loads(text)
-        except json.JSONDecodeError:
-            # Try to extract JSON from markdown fencing
-            if "```" in text:
-                json_part = text.split("```")[1]
-                if json_part.startswith("json"):
-                    json_part = json_part[4:]
-                result = json.loads(json_part.strip())
-            else:
-                result = {
-                    "correct": None,
-                    "issues": [],
-                    "summary": f"Failed to parse reviewer response: {text[:200]}",
-                }
+        if not text:
+            # Likely thinking exhausted max_tokens before any text block.
+            # Surface stop_reason/usage so this is diagnosable rather than a
+            # generic JSON parse failure.
+            result: dict[str, Any] = {
+                "correct": None,
+                "issues": [],
+                "summary": (
+                    f"Reviewer returned no text (stop_reason={msg.stop_reason}, "
+                    f"in={msg.usage.input_tokens}, out={msg.usage.output_tokens})."
+                ),
+            }
+        else:
+            # Parse JSON response
+            try:
+                result = json.loads(text)
+            except json.JSONDecodeError:
+                # Try to extract JSON from markdown fencing
+                if "```" in text:
+                    json_part = text.split("```")[1]
+                    if json_part.startswith("json"):
+                        json_part = json_part[4:]
+                    result = json.loads(json_part.strip())
+                else:
+                    result = {
+                        "correct": None,
+                        "issues": [],
+                        "summary": f"Failed to parse reviewer response: {text[:200]}",
+                    }
 
         result["reviewer_model"] = self.model
         result["reviewer_input_tokens"] = msg.usage.input_tokens
diff --git a/prompt_testing/runner.py b/prompt_testing/runner.py
index 75a6850..cb874bb 100644
--- a/prompt_testing/runner.py
+++ b/prompt_testing/runner.py
@@ -90,7 +90,8 @@ async def _run_one(self, test_case: dict[str, Any], prompt: Prompt) -> dict[str,
                 if not explanation:
                     # Treat empty output as a failure so suite metrics aren't
                     # skewed. Common cause: thinking exhausting max_tokens
-                    # before any text block is emitted.
+                    # before any text block is emitted. Tokens were still
+                    # spent — capture them so cost reporting stays accurate.
                     return {
                         "case_id": case_id,
                         "success": False,
@@ -98,6 +99,10 @@ async def _run_one(self, test_case: dict[str, Any], prompt: Prompt) -> dict[str,
                             f"empty response (stop_reason={msg.stop_reason}, "
                             f"in={msg.usage.input_tokens}, out={msg.usage.output_tokens})"
                         ),
+                        "model": prompt_data["model"],
+                        "input_tokens": msg.usage.input_tokens,
+                        "output_tokens": msg.usage.output_tokens,
+                        "elapsed_ms": elapsed_ms,
                     }
                 return {
                     "case_id": case_id,
@@ -144,9 +149,11 @@ async def run_async(
             print(f"  [{i}/{len(cases)}] {status} {result['case_id']} ({tokens})")
 
         successful = [r for r in results if r["success"]]
+        # Cost includes failures that consumed tokens (e.g. thinking exhausted
+        # max_tokens before any text was emitted) — those aren't free.
         total_cost = sum(
-            r["input_tokens"] * 3 / 1e6 + r["output_tokens"] * 15 / 1e6  # Sonnet pricing
-            for r in successful
+            r.get("input_tokens", 0) * 3 / 1e6 + r.get("output_tokens", 0) * 15 / 1e6  # Sonnet pricing
+            for r in results
         )
 
         return {

From cbb0f714533c09602cbb1af734b1e484ae735d21 Mon Sep 17 00:00:00 2001
From: mattgodbolt-molty <mattgodbolt-molty@users.noreply.github.com>
Date: Wed, 6 May 2026 17:26:38 -0500
Subject: [PATCH 5/7] Ignore Claude Code's local scheduled_tasks.lock

This file is per-user session state and shouldn't be tracked.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index 6fd486a..b34d6e4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,3 +22,6 @@ node_modules
 # Package build artifacts
 *.egg-info/
 *.backup
+
+# Claude Code local session state
+.claude/scheduled_tasks.lock

From 49df42b0d4e16104ccb80344a794656821c7fa5b Mon Sep 17 00:00:00 2001
From: mattgodbolt-molty <mattgodbolt-molty@users.noreply.github.com>
Date: Wed, 6 May 2026 17:45:04 -0500
Subject: [PATCH 6/7] Address Copilot follow-up review on PR #17

- app/cache.py: only include `thinking` in cache_data when it's set.
  Always serialising the field (even as null) would silently invalidate
  every existing S3 cache entry on deploy, costing a wave of fresh API
  calls for cases where nothing about the prompt has changed.
- prompt_testing/runner.py: compute total_cost via get_model_cost(prompt.model)
  rather than hardcoded Sonnet rates. The runner can load arbitrary
  prompt YAMLs, so hardcoding misreports cost whenever the prompt
  model isn't Sonnet.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/cache.py             | 10 +++++-----
 prompt_testing/runner.py |  6 +++++-
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/app/cache.py b/app/cache.py
index b612c11..6ba2cec 100644
--- a/app/cache.py
+++ b/app/cache.py
@@ -142,16 +142,16 @@ def generate_cache_key(request: ExplainRequest, prompt: Prompt) -> str:
         "model": prompt_data["model"],
         "max_tokens": prompt_data["max_tokens"],
         "temperature": prompt_data["temperature"],
-        # Thinking config affects the API response so it must affect the cache
-        # key. Currently captured via `prompt_version` too, but listing it
-        # explicitly keeps the contract clear if thinking ever gets set
-        # outside the YAML (e.g. per-request override).
-        "thinking": prompt_data.get("thinking"),
         "system": prompt_data["system"],
         "messages": prompt_data["messages"],
         # Include a hash of the prompt config to invalidate cache when prompts change
         "prompt_version": _get_prompt_config_hash(prompt.config),
     }
+    # Thinking config affects the API response so it must affect the cache
+    # key. Only include when set to avoid invalidating the existing S3 cache
+    # for production prompts that don't use thinking.
+    if prompt_data.get("thinking") is not None:
+        cache_data["thinking"] = prompt_data["thinking"]
 
     # Convert to JSON with consistent ordering for deterministic hashing
     cache_json = json.dumps(cache_data, sort_keys=True, ensure_ascii=True, separators=(",", ":"))
diff --git a/prompt_testing/runner.py b/prompt_testing/runner.py
index cb874bb..e678447 100644
--- a/prompt_testing/runner.py
+++ b/prompt_testing/runner.py
@@ -15,6 +15,7 @@
 
 from app.explain_api import AssemblyItem, ExplainRequest
 from app.explanation_types import AudienceLevel, ExplanationType
+from app.model_costs import get_model_cost
 from app.prompt import Prompt
 from prompt_testing.file_utils import load_all_test_cases
 
@@ -151,8 +152,11 @@ async def run_async(
         successful = [r for r in results if r["success"]]
         # Cost includes failures that consumed tokens (e.g. thinking exhausted
         # max_tokens before any text was emitted) — those aren't free.
+        # Use the prompt's actual model rather than hardcoded rates so the
+        # number stays correct across explainer-model experiments.
+        cost_per_input_token, cost_per_output_token = get_model_cost(prompt.model)
         total_cost = sum(
-            r.get("input_tokens", 0) * 3 / 1e6 + r.get("output_tokens", 0) * 15 / 1e6  # Sonnet pricing
+            r.get("input_tokens", 0) * cost_per_input_token + r.get("output_tokens", 0) * cost_per_output_token
             for r in results
         )
 

From df76c7a89cb79b880478f201e4d94ec33c61ee23 Mon Sep 17 00:00:00 2001
From: mattgodbolt-molty <mattgodbolt-molty@users.noreply.github.com>
Date: Wed, 6 May 2026 17:45:23 -0500
Subject: [PATCH 7/7] =?UTF-8?q?Revert=20cache=20key=20conditional=20?=
 =?UTF-8?q?=E2=80=94=20unconditional=20thinking=20is=20simpler?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per maintainer preference, accept the one-time S3 cache flush in
exchange for unconditionally including `thinking` in cache_data. Keeps
the cache-key contract aligned with the docstring (explicit field
list).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/cache.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/app/cache.py b/app/cache.py
index 6ba2cec..173bb59 100644
--- a/app/cache.py
+++ b/app/cache.py
@@ -142,16 +142,12 @@ def generate_cache_key(request: ExplainRequest, prompt: Prompt) -> str:
         "model": prompt_data["model"],
         "max_tokens": prompt_data["max_tokens"],
         "temperature": prompt_data["temperature"],
+        "thinking": prompt_data.get("thinking"),
         "system": prompt_data["system"],
         "messages": prompt_data["messages"],
         # Include a hash of the prompt config to invalidate cache when prompts change
         "prompt_version": _get_prompt_config_hash(prompt.config),
     }
-    # Thinking config affects the API response so it must affect the cache
-    # key. Only include when set to avoid invalidating the existing S3 cache
-    # for production prompts that don't use thinking.
-    if prompt_data.get("thinking") is not None:
-        cache_data["thinking"] = prompt_data["thinking"]
 
     # Convert to JSON with consistent ordering for deterministic hashing
     cache_json = json.dumps(cache_data, sort_keys=True, ensure_ascii=True, separators=(",", ":"))