diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..77c0edf --- /dev/null +++ b/.env.example @@ -0,0 +1,14 @@ +# LLM provider API keys for benchmark comparisons (npm run bench:llm) +# Copy to .env and uncomment the providers you want to test. + +# OpenAI (default model: gpt-4.1-mini) +# OPENAI_API_KEY=sk-... +# OPENAI_MODEL=gpt-4.1-mini + +# Anthropic (default model: claude-haiku-4-5-20251001) +# ANTHROPIC_API_KEY=sk-ant-... +# ANTHROPIC_MODEL=claude-haiku-4-5-20251001 + +# Ollama (auto-detected when running locally — no env vars required) +# OLLAMA_HOST=http://localhost:11434 +# OLLAMA_MODEL=llama3.2 diff --git a/CLAUDE.md b/CLAUDE.md index 1c2a457..1131aa9 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -13,6 +13,7 @@ npm run lint # ESLint check npm run format # Prettier write npm run format:check # Prettier check npm run bench # Run benchmark suite +npm run bench:save # Run, save baseline, regenerate docs/benchmark-results.md ``` Run a single test file: diff --git a/README.md b/README.md index 11a8981..9e00710 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ const { messages: originals } = uncompress(compressed, verbatim); No API keys. No network calls. Runs synchronously by default. Under 2ms for typical conversations. -The classifier is content-aware, not domain-specific. It preserves structured data (code, JSON, SQL, tables, citations, formulas) and compresses surrounding prose — making it useful anywhere dense reference material is mixed with natural language: LLM conversations, legal briefs, medical records, technical documentation, support logs. +The classifier is content-aware, not domain-specific. It preserves structured data (code, JSON, SQL, tables, citations, formulas) and compresses surrounding prose — optimized for LLM conversations and technical documentation. 
## Key findings diff --git a/bench/baseline.ts b/bench/baseline.ts new file mode 100644 index 0000000..f59b29c --- /dev/null +++ b/bench/baseline.ts @@ -0,0 +1,923 @@ +import { readFileSync, writeFileSync, mkdirSync, readdirSync, existsSync } from 'node:fs'; +import { join } from 'node:path'; + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +export interface BasicResult { + ratio: number; + tokenRatio: number; + compressed: number; + preserved: number; +} + +export interface TokenBudgetResult { + tokenCount: number; + fits: boolean; + recencyWindow: number | undefined; + compressed: number; + preserved: number; + deduped: number; +} + +export interface DedupResult { + rw0Base: number; + rw0Dup: number; + rw4Base: number; + rw4Dup: number; + deduped: number; +} + +export interface FuzzyDedupResult { + exact: number; + fuzzy: number; + ratio: number; +} + +export interface BundleSizeResult { + bytes: number; + gzipBytes: number; +} + +export interface BenchmarkResults { + basic: Record; + tokenBudget: Record; + dedup: Record; + fuzzyDedup: Record; + bundleSize: Record; +} + +export interface Baseline { + version: string; + generated: string; + results: BenchmarkResults; +} + +// --------------------------------------------------------------------------- +// LLM benchmark types +// --------------------------------------------------------------------------- + +export interface LlmMethodResult { + ratio: number; + tokenRatio: number; + compressed: number; + preserved: number; + roundTrip: 'PASS' | 'FAIL'; + timeMs: number; + /** ratio / deterministic ratio — values < 1.0 mean LLM expanded instead of compressing */ + vsDet?: number; +} + +export interface LlmScenarioResult { + methods: Record; +} + +export interface LlmTokenBudgetResult { + budget: number; + method: string; + tokenCount: number; + fits: boolean; + ratio: number; + recencyWindow: 
number | undefined; + roundTrip: 'PASS' | 'FAIL'; + timeMs: number; +} + +export interface LlmBenchmarkResult { + provider: string; + model: string; + generated: string; + scenarios: Record; + tokenBudget?: Record; +} + +// --------------------------------------------------------------------------- +// Save / Load +// --------------------------------------------------------------------------- + +export function saveBaseline( + baselinesDir: string, + version: string, + results: BenchmarkResults, +): void { + const baseline: Baseline = { + version, + generated: new Date().toISOString(), + results, + }; + mkdirSync(baselinesDir, { recursive: true }); + const json = JSON.stringify(baseline, null, 2) + '\n'; + // Active baseline at root + writeFileSync(join(baselinesDir, 'current.json'), json); + // Versioned snapshot in history/ + const historyDir = join(baselinesDir, 'history'); + mkdirSync(historyDir, { recursive: true }); + writeFileSync(join(historyDir, `v${version}.json`), json); +} + +export function loadBaseline(path: string): Baseline { + return JSON.parse(readFileSync(path, 'utf-8')); +} + +export function loadCurrentBaseline(baselinesDir: string): Baseline | null { + const path = join(baselinesDir, 'current.json'); + if (!existsSync(path)) return null; + return loadBaseline(path); +} + +// --------------------------------------------------------------------------- +// LLM result persistence +// --------------------------------------------------------------------------- + +export function saveLlmResult(baselinesDir: string, result: LlmBenchmarkResult): void { + const llmDir = join(baselinesDir, 'llm'); + mkdirSync(llmDir, { recursive: true }); + const filename = `${result.provider}-${result.model.replace(/[/:]/g, '-')}.json`; + writeFileSync(join(llmDir, filename), JSON.stringify(result, null, 2) + '\n'); +} + +export function loadAllLlmResults(baselinesDir: string): LlmBenchmarkResult[] { + const llmDir = join(baselinesDir, 'llm'); + if (!existsSync(llmDir)) 
return []; + + const results: LlmBenchmarkResult[] = []; + for (const f of readdirSync(llmDir) + .filter((f) => f.endsWith('.json')) + .sort()) { + try { + results.push(JSON.parse(readFileSync(join(llmDir, f), 'utf-8'))); + } catch { + console.warn(` Warning: skipping malformed LLM result file: ${f}`); + } + } + return results; +} + +// --------------------------------------------------------------------------- +// Compare +// --------------------------------------------------------------------------- + +export interface Regression { + benchmark: string; + scenario: string; + metric: string; + expected: number | boolean; + actual: number | boolean; + delta?: string; +} + +function checkNum( + regressions: Regression[], + bench: string, + scenario: string, + metric: string, + expected: number, + actual: number, + tolerance: number, +): void { + const denom = Math.max(Math.abs(expected), 1); + const pctDiff = Math.abs(actual - expected) / denom; + if (pctDiff > tolerance) { + const sign = actual > expected ? 
'+' : ''; + regressions.push({ + benchmark: bench, + scenario, + metric, + expected, + actual, + delta: `${sign}${(((actual - expected) / denom) * 100).toFixed(1)}%`, + }); + } +} + +function checkBool( + regressions: Regression[], + bench: string, + scenario: string, + metric: string, + expected: boolean, + actual: boolean, +): void { + if (expected !== actual) { + regressions.push({ benchmark: bench, scenario, metric, expected, actual }); + } +} + +function missing(regressions: Regression[], bench: string, scenario: string): void { + regressions.push({ + benchmark: bench, + scenario, + metric: '(missing)', + expected: true, + actual: false, + }); +} + +export function compareResults( + baseline: BenchmarkResults, + current: BenchmarkResults, + tolerance: number = 0, +): Regression[] { + const regressions: Regression[] = []; + + // Basic + for (const [name, exp] of Object.entries(baseline.basic)) { + const act = current.basic[name]; + if (!act) { + missing(regressions, 'basic', name); + continue; + } + checkNum(regressions, 'basic', name, 'ratio', exp.ratio, act.ratio, tolerance); + checkNum(regressions, 'basic', name, 'tokenRatio', exp.tokenRatio, act.tokenRatio, tolerance); + checkNum(regressions, 'basic', name, 'compressed', exp.compressed, act.compressed, tolerance); + checkNum(regressions, 'basic', name, 'preserved', exp.preserved, act.preserved, tolerance); + } + + // Token budget + for (const [name, exp] of Object.entries(baseline.tokenBudget)) { + const act = current.tokenBudget[name]; + if (!act) { + missing(regressions, 'tokenBudget', name); + continue; + } + checkNum( + regressions, + 'tokenBudget', + name, + 'tokenCount', + exp.tokenCount, + act.tokenCount, + tolerance, + ); + checkBool(regressions, 'tokenBudget', name, 'fits', exp.fits, act.fits); + if (exp.recencyWindow != null && act.recencyWindow != null) { + checkNum( + regressions, + 'tokenBudget', + name, + 'recencyWindow', + exp.recencyWindow, + act.recencyWindow, + tolerance, + ); + } + 
checkNum( + regressions, + 'tokenBudget', + name, + 'compressed', + exp.compressed, + act.compressed, + tolerance, + ); + checkNum( + regressions, + 'tokenBudget', + name, + 'preserved', + exp.preserved, + act.preserved, + tolerance, + ); + checkNum(regressions, 'tokenBudget', name, 'deduped', exp.deduped, act.deduped, tolerance); + } + + // Dedup + for (const [name, exp] of Object.entries(baseline.dedup)) { + const act = current.dedup[name]; + if (!act) { + missing(regressions, 'dedup', name); + continue; + } + checkNum(regressions, 'dedup', name, 'rw0Base', exp.rw0Base, act.rw0Base, tolerance); + checkNum(regressions, 'dedup', name, 'rw0Dup', exp.rw0Dup, act.rw0Dup, tolerance); + checkNum(regressions, 'dedup', name, 'rw4Base', exp.rw4Base, act.rw4Base, tolerance); + checkNum(regressions, 'dedup', name, 'rw4Dup', exp.rw4Dup, act.rw4Dup, tolerance); + checkNum(regressions, 'dedup', name, 'deduped', exp.deduped, act.deduped, tolerance); + } + + // Fuzzy dedup + for (const [name, exp] of Object.entries(baseline.fuzzyDedup)) { + const act = current.fuzzyDedup[name]; + if (!act) { + missing(regressions, 'fuzzyDedup', name); + continue; + } + checkNum(regressions, 'fuzzyDedup', name, 'exact', exp.exact, act.exact, tolerance); + checkNum(regressions, 'fuzzyDedup', name, 'fuzzy', exp.fuzzy, act.fuzzy, tolerance); + checkNum(regressions, 'fuzzyDedup', name, 'ratio', exp.ratio, act.ratio, tolerance); + } + + // Bundle size + for (const [name, exp] of Object.entries(baseline.bundleSize ?? 
{})) { + const act = current.bundleSize?.[name]; + if (!act) { + missing(regressions, 'bundleSize', name); + continue; + } + checkNum(regressions, 'bundleSize', name, 'bytes', exp.bytes, act.bytes, tolerance); + checkNum(regressions, 'bundleSize', name, 'gzipBytes', exp.gzipBytes, act.gzipBytes, tolerance); + } + + return regressions; +} + +// --------------------------------------------------------------------------- +// Report +// --------------------------------------------------------------------------- + +export function formatRegressions(regressions: Regression[]): string { + if (regressions.length === 0) return 'No regressions detected.'; + + const lines: string[] = [`${regressions.length} regression(s) detected:`, '']; + + for (const r of regressions) { + const delta = r.delta ? ` (${r.delta})` : ''; + lines.push( + ` [${r.benchmark}] ${r.scenario} → ${r.metric}: expected ${r.expected}, got ${r.actual}${delta}`, + ); + } + + return lines.join('\n'); +} + +// --------------------------------------------------------------------------- +// Doc generation +// --------------------------------------------------------------------------- + +function semverSort(a: string, b: string): number { + const pa = a + .replace(/^v|\.json$/g, '') + .split('.') + .map(Number); + const pb = b + .replace(/^v|\.json$/g, '') + .split('.') + .map(Number); + for (let i = 0; i < 3; i++) { + if ((pa[i] ?? 0) !== (pb[i] ?? 0)) return (pa[i] ?? 0) - (pb[i] ?? 0); + } + return 0; +} + +function loadAllBaselines(baselinesDir: string): Baseline[] { + const historyDir = join(baselinesDir, 'history'); + if (!existsSync(historyDir)) return []; + + const files = readdirSync(historyDir) + .filter((f) => f.startsWith('v') && f.endsWith('.json')) + .sort(semverSort); + + return files.map((f) => loadBaseline(join(historyDir, f))); +} + +function fix(n: number, d: number = 2): string { + return n.toFixed(d); +} + +/** Shorten scenario names for chart x-axis labels. 
*/ +const SHORT_NAMES: Record = { + 'Coding assistant': 'Coding', + 'Long Q&A': 'Long Q&A', + 'Tool-heavy': 'Tool-heavy', + 'Short conversation': 'Short', + 'Deep conversation': 'Deep', + 'Technical explanation': 'Technical', + 'Structured content': 'Structured', + 'Agentic coding session': 'Agentic', +}; + +function shortName(name: string): string { + return SHORT_NAMES[name] ?? name; +} + +function formatTime(ms: number): string { + return ms < 1000 ? `${Math.round(ms)}ms` : `${(ms / 1000).toFixed(1)}s`; +} + +// --------------------------------------------------------------------------- +// Visual helpers +// --------------------------------------------------------------------------- + +function formatBytes(bytes: number): string { + if (bytes < 1024) return `${bytes} B`; + return `${(bytes / 1024).toFixed(1)} KB`; +} + +function badges( + basic: Record, + bundleSize?: Record, +): string[] { + const entries = Object.values(basic); + const ratios = entries.map((v) => v.ratio); + const avgR = (ratios.reduce((a, b) => a + b, 0) / ratios.length).toFixed(2); + const bestR = Math.max(...ratios).toFixed(2); + const allPass = 'all_PASS'; + + const badge = (label: string, value: string, color: string) => + `![${label}](https://img.shields.io/badge/${encodeURIComponent(label).replace(/-/g, '--')}-${encodeURIComponent(value).replace(/-/g, '--')}-${color})`; + + const badgeList = [ + badge('avg ratio', `${avgR}x`, 'blue'), + badge('best', `${bestR}x`, 'blue'), + badge('scenarios', `${entries.length}`, 'blue'), + badge('round-trip', allPass, 'brightgreen'), + ]; + + const totalGzip = bundleSize?.total?.gzipBytes; + if (totalGzip != null) { + badgeList.push(badge('gzip', formatBytes(totalGzip), 'blue')); + } + + return [badgeList.join(' ')]; +} + +// --------------------------------------------------------------------------- +// Mermaid chart helpers +// --------------------------------------------------------------------------- + +function compressionChart(basic: Record): 
string[] { + const entries = Object.entries(basic); + const labels = entries.map(([n]) => `"${shortName(n)}"`).join(', '); + const values = entries.map(([, v]) => fix(v.ratio)).join(', '); + + return [ + '```mermaid', + 'xychart-beta', + ' title "Compression Ratio by Scenario"', + ` x-axis [${labels}]`, + ' y-axis "Char Ratio"', + ` bar [${values}]`, + '```', + ]; +} + +function dedupChart(dedup: Record): string[] { + // Only include scenarios where dedup actually changes the ratio + const entries = Object.entries(dedup).filter(([, v]) => v.rw0Base !== v.rw0Dup || v.deduped > 0); + if (entries.length === 0) return []; + + const labels = entries.map(([n]) => `"${shortName(n)}"`).join(', '); + const base = entries.map(([, v]) => fix(v.rw0Base)).join(', '); + const exact = entries.map(([, v]) => fix(v.rw0Dup)).join(', '); + + return [ + '```mermaid', + 'xychart-beta', + ' title "Deduplication Impact (recencyWindow=0)"', + ` x-axis [${labels}]`, + ' y-axis "Char Ratio"', + ` bar [${base}]`, + ` bar [${exact}]`, + '```', + '', + '*First bar: no dedup · Second bar: with dedup*', + ]; +} + +function asciiBar(value: number, max: number, width: number): string { + const filled = Math.round((value / max) * width); + return '\u2588'.repeat(filled) + '\u2591'.repeat(width - filled); +} + +function llmComparisonCharts( + basic: Record, + llmResults: LlmBenchmarkResult[], +): string[] { + const lines: string[] = []; + const barWidth = 30; + + for (const llm of llmResults) { + const sharedScenarios = Object.keys(basic).filter((s) => s in llm.scenarios); + if (sharedScenarios.length === 0) continue; + + // Collect data and find max for scaling + const rows: { name: string; detR: number; llmR: number }[] = []; + for (const s of sharedScenarios) { + const detR = basic[s].ratio; + const methods = Object.values(llm.scenarios[s].methods).filter((m) => m.vsDet != null); + const llmR = methods.length > 0 ? 
Math.max(...methods.map((m) => m.ratio)) : detR; + rows.push({ name: s, detR, llmR }); + } + const maxR = Math.max(...rows.flatMap((r) => [r.detR, r.llmR])); + const nameWidth = Math.max(...rows.map((r) => r.name.length)); + + lines.push('```'); + lines.push(`Deterministic vs ${llm.provider}/${llm.model}`); + lines.push(''); + for (const r of rows) { + const label = r.name.padEnd(nameWidth); + const detBar = asciiBar(r.detR, maxR, barWidth); + const llmBar = asciiBar(r.llmR, maxR, barWidth); + const winner = r.llmR > r.detR + 0.01 ? ' \u2605' : ''; + lines.push(`${label} Det ${detBar} ${fix(r.detR)}x`); + lines.push(`${' '.repeat(nameWidth)} LLM ${llmBar} ${fix(r.llmR)}x${winner}`); + lines.push(''); + } + lines.push('\u2605 = LLM wins'); + lines.push('```'); + lines.push(''); + } + + return lines; +} + +// --------------------------------------------------------------------------- +// Section generators +// --------------------------------------------------------------------------- + +function generateCompressionSection(b: Baseline): string[] { + const lines: string[] = []; + const r = b.results; + const basicEntries = Object.entries(r.basic); + const ratios = basicEntries.map(([, v]) => v.ratio); + const minR = Math.min(...ratios); + const maxR = Math.max(...ratios); + const avgR = ratios.reduce((a, b) => a + b, 0) / ratios.length; + + lines.push('## Compression by Scenario'); + lines.push(''); + lines.push( + `> **${basicEntries.length} scenarios** · **${fix(avgR)}x** avg ratio · ` + + `**${fix(minR)}x** – **${fix(maxR)}x** range · all round-trips PASS`, + ); + lines.push(''); + lines.push(...compressionChart(r.basic)); + lines.push(''); + lines.push( + '| Scenario | Ratio | Reduction | Token Ratio | Messages | Compressed | Preserved |', + ); + lines.push('| --- | ---: | ---: | ---: | ---: | ---: | ---: |'); + for (const [name, v] of basicEntries) { + const reduction = Math.round((1 - 1 / v.ratio) * 100); + const messages = v.compressed + v.preserved; + 
lines.push( + `| ${name} | ${fix(v.ratio)} | ${reduction}% | ${fix(v.tokenRatio)} | ${messages} | ${v.compressed} | ${v.preserved} |`, + ); + } + return lines; +} + +function generateDedupSection(r: BenchmarkResults): string[] { + const lines: string[] = []; + lines.push('## Deduplication Impact'); + lines.push(''); + + const chart = dedupChart(r.dedup); + if (chart.length > 0) { + lines.push(...chart); + lines.push(''); + } + + lines.push( + '| Scenario | No Dedup (rw=0) | Dedup (rw=0) | No Dedup (rw=4) | Dedup (rw=4) | Deduped |', + ); + lines.push('| --- | ---: | ---: | ---: | ---: | ---: |'); + for (const [name, v] of Object.entries(r.dedup)) { + lines.push( + `| ${name} | ${fix(v.rw0Base)} | ${fix(v.rw0Dup)} | ${fix(v.rw4Base)} | ${fix(v.rw4Dup)} | ${v.deduped} |`, + ); + } + lines.push(''); + + // Fuzzy dedup detail + const hasFuzzy = Object.values(r.fuzzyDedup).some((v) => v.fuzzy > 0); + if (hasFuzzy) { + lines.push('### Fuzzy Dedup'); + lines.push(''); + } + lines.push('| Scenario | Exact Deduped | Fuzzy Deduped | Ratio | vs Base |'); + lines.push('| --- | ---: | ---: | ---: | ---: |'); + for (const [name, v] of Object.entries(r.fuzzyDedup)) { + const baseRatio = r.basic[name]?.ratio ?? v.ratio; + const improvement = + v.ratio > baseRatio + 0.01 ? `+${Math.round(((v.ratio - baseRatio) / baseRatio) * 100)}%` : '-'; + lines.push(`| ${name} | ${v.exact} | ${v.fuzzy} | ${fix(v.ratio)} | ${improvement} |`); + } + return lines; +} + +function generateTokenBudgetSection(r: BenchmarkResults): string[] { + const lines: string[] = []; + const entries = Object.entries(r.tokenBudget); + const allFit = entries.every(([, v]) => v.fits); + const fitCount = entries.filter(([, v]) => v.fits).length; + + lines.push('## Token Budget'); + lines.push(''); + lines.push(`Target: **2000 tokens** · ${allFit ? 
'all fit' : `${fitCount}/${entries.length} fit`}`); + lines.push(''); + lines.push( + '| Scenario | Dedup | Tokens | Fits | recencyWindow | Compressed | Preserved | Deduped |', + ); + lines.push('| --- | --- | ---: | --- | ---: | ---: | ---: | ---: |'); + for (const [key, v] of entries) { + const [name, dedupStr] = key.split('|'); + const dedup = dedupStr === 'dedup=true' ? 'yes' : 'no'; + const fitIcon = v.fits ? 'yes' : 'no'; + lines.push( + `| ${name} | ${dedup} | ${v.tokenCount} | ${fitIcon} | ${v.recencyWindow ?? '-'} | ${v.compressed} | ${v.preserved} | ${v.deduped} |`, + ); + } + return lines; +} + +function generateBundleSizeSection(bundleSize: Record): string[] { + const entries = Object.entries(bundleSize); + if (entries.length === 0) return []; + + const lines: string[] = []; + lines.push('## Bundle Size'); + lines.push(''); + lines.push('> Zero-dependency ESM library — tracked per-file to catch regressions.'); + lines.push(''); + lines.push('| File | Size | Gzip |'); + lines.push('| --- | ---: | ---: |'); + for (const [name, v] of entries) { + const label = name === 'total' ? '**total**' : name; + lines.push(`| ${label} | ${formatBytes(v.bytes)} | ${formatBytes(v.gzipBytes)} |`); + } + return lines; +} + +function generateLlmSection( + baselinesDir: string, + basic: Record, +): string[] { + const llmResults = loadAllLlmResults(baselinesDir); + if (llmResults.length === 0) return []; + + const lines: string[] = []; + lines.push('## LLM vs Deterministic'); + lines.push(''); + lines.push( + '> Results are **non-deterministic** — LLM outputs vary between runs. 
' + + 'Saved as reference data, not used for regression testing.', + ); + lines.push(''); + + // Per-provider comparison charts (ASCII horizontal bars in code blocks) + const charts = llmComparisonCharts(basic, llmResults); + if (charts.length > 0) { + lines.push(...charts); + } + + // Cross-provider summary table + if (llmResults.length > 0) { + lines.push('### Provider Summary'); + lines.push(''); + lines.push( + '| Provider | Model | Avg Ratio | Avg vsDet | Round-trip | Budget Fits | Avg Time |', + ); + lines.push('| --- | --- | ---: | ---: | --- | --- | ---: |'); + for (const llm of llmResults) { + const ratioValues: number[] = []; + const vsDetValues: number[] = []; + const timeValues: number[] = []; + let passCount = 0; + let totalCount = 0; + for (const sr of Object.values(llm.scenarios)) { + for (const mr of Object.values(sr.methods)) { + ratioValues.push(mr.ratio); + if (mr.vsDet != null) vsDetValues.push(mr.vsDet); + timeValues.push(mr.timeMs); + totalCount++; + if (mr.roundTrip === 'PASS') passCount++; + } + } + const avgRatio = ratioValues.length > 0 + ? ratioValues.reduce((a, b) => a + b, 0) / ratioValues.length + : 0; + const avgVsDet = vsDetValues.length > 0 + ? vsDetValues.reduce((a, b) => a + b, 0) / vsDetValues.length + : 0; + const avgTime = timeValues.length > 0 + ? timeValues.reduce((a, b) => a + b, 0) / timeValues.length + : 0; + const rt = passCount === totalCount ? 
'all PASS' : `${passCount}/${totalCount}`; + + // Token budget summary + let budgetFits = '-'; + if (llm.tokenBudget) { + const allEntries = Object.values(llm.tokenBudget).flat(); + if (allEntries.length > 0) { + const fitCount = allEntries.filter((e) => e.fits).length; + budgetFits = `${fitCount}/${allEntries.length}`; + } + } + + lines.push( + `| ${llm.provider} | ${llm.model} | ${fix(avgRatio)}x | ${fix(avgVsDet)} | ${rt} | ${budgetFits} | ${formatTime(avgTime)} |`, + ); + } + lines.push(''); + } + + // Key finding callout + const wins: string[] = []; + const losses: string[] = []; + for (const llm of llmResults) { + for (const [scenario, sr] of Object.entries(llm.scenarios)) { + for (const mr of Object.values(sr.methods)) { + if (mr.vsDet != null && mr.vsDet > 1.0) wins.push(scenario); + if (mr.vsDet != null && mr.vsDet < 0.9) losses.push(scenario); + } + } + } + const uniqueWins = [...new Set(wins)]; + const uniqueLosses = [...new Set(losses)]; + if (uniqueWins.length > 0 || uniqueLosses.length > 0) { + lines.push('> **Key findings:**'); + if (uniqueWins.length > 0) { + lines.push(`> LLM wins on prose-heavy scenarios: ${uniqueWins.join(', ')}`); + } + if (uniqueLosses.length > 0) { + lines.push( + `> Deterministic wins on structured/technical content: ${uniqueLosses.join(', ')}`, + ); + } + lines.push(''); + } + + // Per-provider detail tables (collapsible) + for (const llm of llmResults) { + lines.push(`### ${llm.provider} (${llm.model})`); + lines.push(''); + lines.push(`*Generated: ${llm.generated.split('T')[0]}*`); + lines.push(''); + lines.push('
'); + lines.push(`Scenario details`); + lines.push(''); + lines.push( + '| Scenario | Method | Char Ratio | Token Ratio | vsDet | Compressed | Preserved | Round-trip | Time |', + ); + lines.push('| --- | --- | ---: | ---: | ---: | ---: | ---: | --- | ---: |'); + + for (const [scenario, sr] of Object.entries(llm.scenarios)) { + let first = true; + for (const [method, mr] of Object.entries(sr.methods)) { + const label = first ? scenario : ''; + const vsDet = mr.vsDet != null ? fix(mr.vsDet) : '-'; + lines.push( + `| ${label} | ${method} | ${fix(mr.ratio)} | ${fix(mr.tokenRatio)} | ${vsDet} | ${mr.compressed} | ${mr.preserved} | ${mr.roundTrip} | ${formatTime(mr.timeMs)} |`, + ); + first = false; + } + } + + // Token budget table (if present) + if (llm.tokenBudget && Object.keys(llm.tokenBudget).length > 0) { + lines.push(''); + lines.push('#### Token Budget (target: 2000 tokens)'); + lines.push(''); + lines.push( + '| Scenario | Method | Tokens | Fits | recencyWindow | Ratio | Round-trip | Time |', + ); + lines.push('| --- | --- | ---: | --- | ---: | ---: | --- | ---: |'); + + for (const [scenario, entries] of Object.entries(llm.tokenBudget)) { + let first = true; + for (const entry of entries) { + const label = first ? scenario : ''; + lines.push( + `| ${label} | ${entry.method} | ${entry.tokenCount} | ${entry.fits} | ${entry.recencyWindow ?? '-'} | ${fix(entry.ratio)} | ${entry.roundTrip} | ${formatTime(entry.timeMs)} |`, + ); + first = false; + } + } + } + + lines.push(''); + lines.push('
'); + lines.push(''); + } + + return lines; +} + +// --------------------------------------------------------------------------- +// Main doc generator +// --------------------------------------------------------------------------- + +export function generateBenchmarkDocs(baselinesDir: string, outputPath: string): void { + const baselines = loadAllBaselines(baselinesDir); + if (baselines.length === 0) return; + + const latest = baselines[baselines.length - 1]; + const lines: string[] = []; + + // --- Header --- + lines.push('# Benchmark Results'); + lines.push(''); + lines.push('[Back to README](../README.md) | [All docs](README.md) | [Handbook](benchmarks.md)'); + lines.push(''); + lines.push('*Auto-generated by `npm run bench:save`. Do not edit manually.*'); + lines.push(''); + lines.push(`**v${latest.version}** · Generated: ${latest.generated.split('T')[0]}`); + lines.push(''); + lines.push(...badges(latest.results.basic, latest.results.bundleSize)); + lines.push(''); + + // --- Summary --- + const basicEntries = Object.entries(latest.results.basic); + const ratios = basicEntries.map(([, v]) => v.ratio); + const avgR = ratios.reduce((a, b) => a + b, 0) / ratios.length; + lines.push('## Summary'); + lines.push(''); + lines.push(`| Metric | Value |`); + lines.push(`| --- | --- |`); + lines.push(`| Scenarios | ${basicEntries.length} |`); + lines.push(`| Average compression | ${fix(avgR)}x |`); + lines.push(`| Best compression | ${fix(Math.max(...ratios))}x |`); + lines.push(`| Round-trip integrity | all PASS |`); + lines.push(''); + + // --- Pie chart: message outcome distribution --- + const totalPreserved = basicEntries.reduce((s, [, v]) => s + v.preserved, 0); + const totalCompressed = basicEntries.reduce((s, [, v]) => s + v.compressed, 0); + lines.push('```mermaid'); + lines.push('pie title "Message Outcomes"'); + lines.push(` "Preserved" : ${totalPreserved}`); + lines.push(` "Compressed" : ${totalCompressed}`); + lines.push('```'); + lines.push(''); + + // --- 
Compression --- + lines.push(...generateCompressionSection(latest)); + lines.push(''); + + // --- Dedup --- + lines.push(...generateDedupSection(latest.results)); + lines.push(''); + + // --- Token budget --- + lines.push(...generateTokenBudgetSection(latest.results)); + lines.push(''); + + // --- Bundle size --- + const bundleSizeSection = generateBundleSizeSection(latest.results.bundleSize ?? {}); + if (bundleSizeSection.length > 0) { + lines.push(...bundleSizeSection); + lines.push(''); + } + + // --- LLM (conditional) --- + const llmSection = generateLlmSection(baselinesDir, latest.results.basic); + if (llmSection.length > 0) { + lines.push(...llmSection); + } + + // --- Version history (conditional) --- + if (baselines.length > 1) { + lines.push('## Version History'); + lines.push(''); + lines.push('| Version | Date | Avg Char Ratio | Avg Token Ratio | Scenarios |'); + lines.push('| --- | --- | ---: | ---: | ---: |'); + for (const b of [...baselines].reverse()) { + const entries = Object.values(b.results.basic); + const avgChr = entries.reduce((s, v) => s + v.ratio, 0) / entries.length; + const avgTkr = entries.reduce((s, v) => s + v.tokenRatio, 0) / entries.length; + const date = b.generated.split('T')[0]; + lines.push( + `| ${b.version} | ${date} | ${fix(avgChr)} | ${fix(avgTkr)} | ${entries.length} |`, + ); + } + lines.push(''); + + // Per-version detail (older versions) + const olderVersions = baselines.slice(0, -1).reverse(); + for (const b of olderVersions) { + const r = b.results; + const oldEntries = Object.entries(r.basic); + const oldRatios = oldEntries.map(([, v]) => v.ratio); + const oldAvg = oldRatios.reduce((a, b) => a + b, 0) / oldRatios.length; + + lines.push(`
`); + lines.push(`v${b.version} (${b.generated.split('T')[0]}) — ${fix(oldAvg)}x avg`); + lines.push(''); + lines.push('| Scenario | Char Ratio | Token Ratio | Compressed | Preserved |'); + lines.push('| --- | ---: | ---: | ---: | ---: |'); + for (const [name, v] of oldEntries) { + lines.push( + `| ${name} | ${fix(v.ratio)} | ${fix(v.tokenRatio)} | ${v.compressed} | ${v.preserved} |`, + ); + } + lines.push(''); + lines.push('
'); + lines.push(''); + } + } + + // --- Methodology --- + lines.push('## Methodology'); + lines.push(''); + lines.push('- All deterministic results use the same input → same output guarantee'); + lines.push('- Metrics: compression ratio, token ratio, message counts, dedup counts'); + lines.push('- Timing is excluded from baselines (hardware-dependent)'); + lines.push('- LLM benchmarks are saved as reference data, not used for regression testing'); + lines.push('- Round-trip integrity is verified for every scenario (compress then uncompress)'); + lines.push(''); + + writeFileSync(outputPath, lines.join('\n')); +} diff --git a/bench/baselines/current.json b/bench/baselines/current.json new file mode 100644 index 0000000..77bfa0d --- /dev/null +++ b/bench/baselines/current.json @@ -0,0 +1,224 @@ +{ + "version": "1.0.0", + "generated": "2026-02-26T05:31:42.406Z", + "results": { + "basic": { + "Coding assistant": { + "ratio": 1.6812907904278462, + "tokenRatio": 1.6729559748427674, + "compressed": 5, + "preserved": 8 + }, + "Long Q&A": { + "ratio": 6.158536585365853, + "tokenRatio": 6.114164904862579, + "compressed": 4, + "preserved": 6 + }, + "Tool-heavy": { + "ratio": 1.2991563919532771, + "tokenRatio": 1.2946428571428572, + "compressed": 2, + "preserved": 16 + }, + "Short conversation": { + "ratio": 1, + "tokenRatio": 1, + "compressed": 0, + "preserved": 7 + }, + "Deep conversation": { + "ratio": 2.124913733609386, + "tokenRatio": 2.1241305510968433, + "compressed": 50, + "preserved": 1 + }, + "Technical explanation": { + "ratio": 1, + "tokenRatio": 1, + "compressed": 0, + "preserved": 11 + }, + "Structured content": { + "ratio": 1.9338990620812864, + "tokenRatio": 1.9241486068111455, + "compressed": 2, + "preserved": 10 + }, + "Agentic coding session": { + "ratio": 1.428351309707242, + "tokenRatio": 1.4258962011771001, + "compressed": 2, + "preserved": 31 + } + }, + "tokenBudget": { + "Deep conversation|dedup=false": { + "tokenCount": 3738, + "fits": false, + 
"recencyWindow": 0, + "compressed": 50, + "preserved": 1, + "deduped": 0 + }, + "Deep conversation|dedup=true": { + "tokenCount": 3738, + "fits": false, + "recencyWindow": 0, + "compressed": 50, + "preserved": 1, + "deduped": 0 + }, + "Agentic coding session|dedup=false": { + "tokenCount": 2345, + "fits": false, + "recencyWindow": 0, + "compressed": 4, + "preserved": 33, + "deduped": 0 + }, + "Agentic coding session|dedup=true": { + "tokenCount": 1957, + "fits": true, + "recencyWindow": 9, + "compressed": 1, + "preserved": 32, + "deduped": 4 + } + }, + "dedup": { + "Coding assistant": { + "rw0Base": 1.6812907904278462, + "rw0Dup": 1.6812907904278462, + "rw4Base": 1.5104234527687297, + "rw4Dup": 1.5104234527687297, + "deduped": 0 + }, + "Long Q&A": { + "rw0Base": 5.139949109414759, + "rw0Dup": 6.158536585365853, + "rw4Base": 1.9024298361273309, + "rw4Dup": 2.0264847512038524, + "deduped": 1 + }, + "Tool-heavy": { + "rw0Base": 1.2991563919532771, + "rw0Dup": 1.2991563919532771, + "rw4Base": 1.2991563919532771, + "rw4Dup": 1.2991563919532771, + "deduped": 0 + }, + "Short conversation": { + "rw0Base": 1, + "rw0Dup": 1, + "rw4Base": 1, + "rw4Dup": 1, + "deduped": 0 + }, + "Deep conversation": { + "rw0Base": 2.124913733609386, + "rw0Dup": 2.124913733609386, + "rw4Base": 1.9527165104643789, + "rw4Dup": 1.9527165104643789, + "deduped": 0 + }, + "Technical explanation": { + "rw0Base": 1, + "rw0Dup": 1, + "rw4Base": 1, + "rw4Dup": 1, + "deduped": 0 + }, + "Structured content": { + "rw0Base": 1.9338990620812864, + "rw0Dup": 1.9338990620812864, + "rw4Base": 1.373730964467005, + "rw4Dup": 1.373730964467005, + "deduped": 0 + }, + "Agentic coding session": { + "rw0Base": 1.1374233128834357, + "rw0Dup": 1.428351309707242, + "rw4Base": 1.1374233128834357, + "rw4Dup": 1.428351309707242, + "deduped": 4 + } + }, + "fuzzyDedup": { + "Coding assistant": { + "exact": 0, + "fuzzy": 0, + "ratio": 1.6812907904278462 + }, + "Long Q&A": { + "exact": 1, + "fuzzy": 0, + "ratio": 
6.158536585365853 + }, + "Tool-heavy": { + "exact": 0, + "fuzzy": 0, + "ratio": 1.2991563919532771 + }, + "Short conversation": { + "exact": 0, + "fuzzy": 0, + "ratio": 1 + }, + "Deep conversation": { + "exact": 0, + "fuzzy": 0, + "ratio": 2.124913733609386 + }, + "Technical explanation": { + "exact": 0, + "fuzzy": 0, + "ratio": 1 + }, + "Structured content": { + "exact": 0, + "fuzzy": 0, + "ratio": 1.9338990620812864 + }, + "Agentic coding session": { + "exact": 4, + "fuzzy": 2, + "ratio": 2.229973538609574 + } + }, + "bundleSize": { + "classify.js": { + "bytes": 7724, + "gzipBytes": 3250 + }, + "compress.js": { + "bytes": 33941, + "gzipBytes": 8721 + }, + "dedup.js": { + "bytes": 10260, + "gzipBytes": 2864 + }, + "expand.js": { + "bytes": 2795, + "gzipBytes": 934 + }, + "index.js": { + "bytes": 225, + "gzipBytes": 159 + }, + "summarizer.js": { + "bytes": 2542, + "gzipBytes": 993 + }, + "types.js": { + "bytes": 11, + "gzipBytes": 31 + }, + "total": { + "bytes": 57498, + "gzipBytes": 16952 + } + } + } +} diff --git a/bench/baselines/history/v1.0.0.json b/bench/baselines/history/v1.0.0.json new file mode 100644 index 0000000..77bfa0d --- /dev/null +++ b/bench/baselines/history/v1.0.0.json @@ -0,0 +1,224 @@ +{ + "version": "1.0.0", + "generated": "2026-02-26T05:31:42.406Z", + "results": { + "basic": { + "Coding assistant": { + "ratio": 1.6812907904278462, + "tokenRatio": 1.6729559748427674, + "compressed": 5, + "preserved": 8 + }, + "Long Q&A": { + "ratio": 6.158536585365853, + "tokenRatio": 6.114164904862579, + "compressed": 4, + "preserved": 6 + }, + "Tool-heavy": { + "ratio": 1.2991563919532771, + "tokenRatio": 1.2946428571428572, + "compressed": 2, + "preserved": 16 + }, + "Short conversation": { + "ratio": 1, + "tokenRatio": 1, + "compressed": 0, + "preserved": 7 + }, + "Deep conversation": { + "ratio": 2.124913733609386, + "tokenRatio": 2.1241305510968433, + "compressed": 50, + "preserved": 1 + }, + "Technical explanation": { + "ratio": 1, + "tokenRatio": 1, + 
"compressed": 0, + "preserved": 11 + }, + "Structured content": { + "ratio": 1.9338990620812864, + "tokenRatio": 1.9241486068111455, + "compressed": 2, + "preserved": 10 + }, + "Agentic coding session": { + "ratio": 1.428351309707242, + "tokenRatio": 1.4258962011771001, + "compressed": 2, + "preserved": 31 + } + }, + "tokenBudget": { + "Deep conversation|dedup=false": { + "tokenCount": 3738, + "fits": false, + "recencyWindow": 0, + "compressed": 50, + "preserved": 1, + "deduped": 0 + }, + "Deep conversation|dedup=true": { + "tokenCount": 3738, + "fits": false, + "recencyWindow": 0, + "compressed": 50, + "preserved": 1, + "deduped": 0 + }, + "Agentic coding session|dedup=false": { + "tokenCount": 2345, + "fits": false, + "recencyWindow": 0, + "compressed": 4, + "preserved": 33, + "deduped": 0 + }, + "Agentic coding session|dedup=true": { + "tokenCount": 1957, + "fits": true, + "recencyWindow": 9, + "compressed": 1, + "preserved": 32, + "deduped": 4 + } + }, + "dedup": { + "Coding assistant": { + "rw0Base": 1.6812907904278462, + "rw0Dup": 1.6812907904278462, + "rw4Base": 1.5104234527687297, + "rw4Dup": 1.5104234527687297, + "deduped": 0 + }, + "Long Q&A": { + "rw0Base": 5.139949109414759, + "rw0Dup": 6.158536585365853, + "rw4Base": 1.9024298361273309, + "rw4Dup": 2.0264847512038524, + "deduped": 1 + }, + "Tool-heavy": { + "rw0Base": 1.2991563919532771, + "rw0Dup": 1.2991563919532771, + "rw4Base": 1.2991563919532771, + "rw4Dup": 1.2991563919532771, + "deduped": 0 + }, + "Short conversation": { + "rw0Base": 1, + "rw0Dup": 1, + "rw4Base": 1, + "rw4Dup": 1, + "deduped": 0 + }, + "Deep conversation": { + "rw0Base": 2.124913733609386, + "rw0Dup": 2.124913733609386, + "rw4Base": 1.9527165104643789, + "rw4Dup": 1.9527165104643789, + "deduped": 0 + }, + "Technical explanation": { + "rw0Base": 1, + "rw0Dup": 1, + "rw4Base": 1, + "rw4Dup": 1, + "deduped": 0 + }, + "Structured content": { + "rw0Base": 1.9338990620812864, + "rw0Dup": 1.9338990620812864, + "rw4Base": 
1.373730964467005, + "rw4Dup": 1.373730964467005, + "deduped": 0 + }, + "Agentic coding session": { + "rw0Base": 1.1374233128834357, + "rw0Dup": 1.428351309707242, + "rw4Base": 1.1374233128834357, + "rw4Dup": 1.428351309707242, + "deduped": 4 + } + }, + "fuzzyDedup": { + "Coding assistant": { + "exact": 0, + "fuzzy": 0, + "ratio": 1.6812907904278462 + }, + "Long Q&A": { + "exact": 1, + "fuzzy": 0, + "ratio": 6.158536585365853 + }, + "Tool-heavy": { + "exact": 0, + "fuzzy": 0, + "ratio": 1.2991563919532771 + }, + "Short conversation": { + "exact": 0, + "fuzzy": 0, + "ratio": 1 + }, + "Deep conversation": { + "exact": 0, + "fuzzy": 0, + "ratio": 2.124913733609386 + }, + "Technical explanation": { + "exact": 0, + "fuzzy": 0, + "ratio": 1 + }, + "Structured content": { + "exact": 0, + "fuzzy": 0, + "ratio": 1.9338990620812864 + }, + "Agentic coding session": { + "exact": 4, + "fuzzy": 2, + "ratio": 2.229973538609574 + } + }, + "bundleSize": { + "classify.js": { + "bytes": 7724, + "gzipBytes": 3250 + }, + "compress.js": { + "bytes": 33941, + "gzipBytes": 8721 + }, + "dedup.js": { + "bytes": 10260, + "gzipBytes": 2864 + }, + "expand.js": { + "bytes": 2795, + "gzipBytes": 934 + }, + "index.js": { + "bytes": 225, + "gzipBytes": 159 + }, + "summarizer.js": { + "bytes": 2542, + "gzipBytes": 993 + }, + "types.js": { + "bytes": 11, + "gzipBytes": 31 + }, + "total": { + "bytes": 57498, + "gzipBytes": 16952 + } + } + } +} diff --git a/bench/baselines/llm/ollama-llama3.2.json b/bench/baselines/llm/ollama-llama3.2.json new file mode 100644 index 0000000..a0f393b --- /dev/null +++ b/bench/baselines/llm/ollama-llama3.2.json @@ -0,0 +1,263 @@ +{ + "provider": "ollama", + "model": "llama3.2", + "generated": "2026-02-25T12:21:05.747Z", + "scenarios": { + "Coding assistant": { + "methods": { + "deterministic": { + "ratio": 1.6812907904278462, + "tokenRatio": 1.6729559748427674, + "compressed": 5, + "preserved": 8, + "roundTrip": "PASS", + "timeMs": 0.25966599996900186 + }, + 
"llm-basic": { + "ratio": 1.4847902657700929, + "tokenRatio": 1.4810690423162582, + "compressed": 5, + "preserved": 8, + "roundTrip": "PASS", + "timeMs": 5869.715916000016, + "vsDet": 0.883125200128082 + }, + "llm-escalate": { + "ratio": 1.5518741633199464, + "tokenRatio": 1.5501165501165501, + "compressed": 5, + "preserved": 8, + "roundTrip": "PASS", + "timeMs": 3001.2509999999893, + "vsDet": 0.9230254350736279 + } + } + }, + "Long Q&A": { + "methods": { + "deterministic": { + "ratio": 6.158536585365853, + "tokenRatio": 6.114164904862579, + "compressed": 4, + "preserved": 6, + "roundTrip": "PASS", + "timeMs": 0.73641700000735 + }, + "llm-basic": { + "ratio": 4.308873720136519, + "tokenRatio": 4.2844444444444445, + "compressed": 4, + "preserved": 6, + "roundTrip": "PASS", + "timeMs": 4080.273957999947, + "vsDet": 0.6996587030716723 + }, + "llm-escalate": { + "ratio": 4.486894713460684, + "tokenRatio": 4.456086286594761, + "compressed": 4, + "preserved": 6, + "roundTrip": "PASS", + "timeMs": 3666.4759170000325, + "vsDet": 0.7285650821856953 + } + } + }, + "Tool-heavy": { + "methods": { + "deterministic": { + "ratio": 1.2991563919532771, + "tokenRatio": 1.2946428571428572, + "compressed": 2, + "preserved": 16, + "roundTrip": "PASS", + "timeMs": 1.655417000001762 + }, + "llm-basic": { + "ratio": 1.1153203342618385, + "tokenRatio": 1.1132437619961613, + "compressed": 2, + "preserved": 16, + "roundTrip": "PASS", + "timeMs": 2252.8222499999683, + "vsDet": 0.8584958217270195 + }, + "llm-escalate": { + "ratio": 1.2816901408450705, + "tokenRatio": 1.277533039647577, + "compressed": 2, + "preserved": 16, + "roundTrip": "PASS", + "timeMs": 2796.051916999975, + "vsDet": 0.9865556978233034 + } + } + }, + "Deep conversation": { + "methods": { + "deterministic": { + "ratio": 2.124913733609386, + "tokenRatio": 2.1241305510968433, + "compressed": 50, + "preserved": 1, + "roundTrip": "PASS", + "timeMs": 2.8401660000090487 + }, + "llm-basic": { + "ratio": 3.123774095366926, + 
"tokenRatio": 3.1088488645262333, + "compressed": 50, + "preserved": 1, + "roundTrip": "PASS", + "timeMs": 22697.48683300003, + "vsDet": 1.470071017923571 + }, + "llm-escalate": { + "ratio": 3.2790202342918, + "tokenRatio": 3.255432554325543, + "compressed": 50, + "preserved": 1, + "roundTrip": "PASS", + "timeMs": 23293.247875, + "vsDet": 1.5431309904153354 + } + } + }, + "Technical explanation": { + "methods": { + "deterministic": { + "ratio": 1, + "tokenRatio": 1, + "compressed": 0, + "preserved": 11, + "roundTrip": "PASS", + "timeMs": 0.6284590000286698 + }, + "llm-basic": { + "ratio": 1, + "tokenRatio": 1, + "compressed": 0, + "preserved": 11, + "roundTrip": "PASS", + "timeMs": 3207.201915999991, + "vsDet": 1 + }, + "llm-escalate": { + "ratio": 1.0009776232891592, + "tokenRatio": 1.0007587253414265, + "compressed": 2, + "preserved": 9, + "roundTrip": "PASS", + "timeMs": 784.6597920000786, + "vsDet": 1.0009776232891592 + } + } + }, + "Structured content": { + "methods": { + "deterministic": { + "ratio": 1.9338990620812864, + "tokenRatio": 1.9241486068111455, + "compressed": 2, + "preserved": 10, + "roundTrip": "PASS", + "timeMs": 0.48375000001396984 + }, + "llm-basic": { + "ratio": 1.4554621848739495, + "tokenRatio": 1.4521028037383177, + "compressed": 2, + "preserved": 10, + "roundTrip": "PASS", + "timeMs": 3480.8887089999625, + "vsDet": 0.7526050420168067 + }, + "llm-escalate": { + "ratio": 1.3816209317166561, + "tokenRatio": 1.3795782463928967, + "compressed": 2, + "preserved": 10, + "roundTrip": "PASS", + "timeMs": 3686.4468750000233, + "vsDet": 0.7144224633056797 + } + } + }, + "Agentic coding session": { + "methods": { + "deterministic": { + "ratio": 1.428351309707242, + "tokenRatio": 1.4258962011771001, + "compressed": 2, + "preserved": 31, + "roundTrip": "PASS", + "timeMs": 0.749125000089407 + }, + "llm-basic": { + "ratio": 1.3462097008422886, + "tokenRatio": 1.34460141271443, + "compressed": 2, + "preserved": 31, + "roundTrip": "PASS", + "timeMs": 
3328.690416999976, + "vsDet": 0.9424920127795526 + }, + "llm-escalate": { + "ratio": 1.3975576662143827, + "tokenRatio": 1.3952879581151831, + "compressed": 2, + "preserved": 31, + "roundTrip": "PASS", + "timeMs": 5422.445708999992, + "vsDet": 0.978441127694859 + } + } + } + }, + "tokenBudget": { + "Deep conversation": [ + { + "budget": 2000, + "method": "deterministic", + "tokenCount": 3738, + "fits": false, + "ratio": 2.124913733609386, + "recencyWindow": 0, + "roundTrip": "PASS", + "timeMs": 12.129625000059605 + }, + { + "budget": 2000, + "method": "llm-escalate", + "tokenCount": 2593, + "fits": false, + "ratio": 3.0834538778235228, + "recencyWindow": 0, + "roundTrip": "PASS", + "timeMs": 131976.87870800006 + } + ], + "Agentic coding session": [ + { + "budget": 2000, + "method": "deterministic", + "tokenCount": 1957, + "fits": true, + "ratio": 1.3638369869059879, + "recencyWindow": 9, + "roundTrip": "PASS", + "timeMs": 1.8957079999381676 + }, + { + "budget": 2000, + "method": "llm-escalate", + "tokenCount": 2003, + "fits": false, + "ratio": 1.331896551724138, + "recencyWindow": 9, + "roundTrip": "PASS", + "timeMs": 4096.28350000002 + } + ] + } +} diff --git a/bench/baselines/llm/openai-gpt-4.1-mini.json b/bench/baselines/llm/openai-gpt-4.1-mini.json new file mode 100644 index 0000000..27b75c4 --- /dev/null +++ b/bench/baselines/llm/openai-gpt-4.1-mini.json @@ -0,0 +1,263 @@ +{ + "provider": "openai", + "model": "gpt-4.1-mini", + "generated": "2026-02-25T12:28:55.113Z", + "scenarios": { + "Coding assistant": { + "methods": { + "deterministic": { + "ratio": 1.6812907904278462, + "tokenRatio": 1.6729559748427674, + "compressed": 5, + "preserved": 8, + "roundTrip": "PASS", + "timeMs": 0.25587500000006 + }, + "llm-basic": { + "ratio": 1.6414159292035397, + "tokenRatio": 1.633906633906634, + "compressed": 5, + "preserved": 8, + "roundTrip": "PASS", + "timeMs": 5578.285459, + "vsDet": 0.976283185840708 + }, + "llm-escalate": { + "ratio": 1.631597466572836, + 
"tokenRatio": 1.625916870415648, + "compressed": 5, + "preserved": 8, + "roundTrip": "PASS", + "timeMs": 6046.540790999999, + "vsDet": 0.9704433497536946 + } + } + }, + "Long Q&A": { + "methods": { + "deterministic": { + "ratio": 6.158536585365853, + "tokenRatio": 6.114164904862579, + "compressed": 4, + "preserved": 6, + "roundTrip": "PASS", + "timeMs": 0.9947919999995065 + }, + "llm-basic": { + "ratio": 5.372340425531915, + "tokenRatio": 5.3259668508287294, + "compressed": 4, + "preserved": 6, + "roundTrip": "PASS", + "timeMs": 5892.603500000001, + "vsDet": 0.8723404255319149 + }, + "llm-escalate": { + "ratio": 5.346744309158285, + "tokenRatio": 5.3064220183486235, + "compressed": 4, + "preserved": 6, + "roundTrip": "PASS", + "timeMs": 6988.136834000001, + "vsDet": 0.868184224457385 + } + } + }, + "Tool-heavy": { + "methods": { + "deterministic": { + "ratio": 1.2991563919532771, + "tokenRatio": 1.2946428571428572, + "compressed": 2, + "preserved": 16, + "roundTrip": "PASS", + "timeMs": 0.2992500000000291 + }, + "llm-basic": { + "ratio": 1.105466593042518, + "tokenRatio": 1.1047619047619048, + "compressed": 2, + "preserved": 16, + "roundTrip": "PASS", + "timeMs": 3497.0059580000016, + "vsDet": 0.8509110988404197 + }, + "llm-escalate": { + "ratio": 1.1159420289855073, + "tokenRatio": 1.1153846153846154, + "compressed": 2, + "preserved": 16, + "roundTrip": "PASS", + "timeMs": 5327.759166, + "vsDet": 0.858974358974359 + } + } + }, + "Deep conversation": { + "methods": { + "deterministic": { + "ratio": 2.124913733609386, + "tokenRatio": 2.1241305510968433, + "compressed": 50, + "preserved": 1, + "roundTrip": "PASS", + "timeMs": 2.7148750000051223 + }, + "llm-basic": { + "ratio": 2.3424344885883346, + "tokenRatio": 2.3346074683916496, + "compressed": 50, + "preserved": 1, + "roundTrip": "PASS", + "timeMs": 50365.301625, + "vsDet": 1.1023668639053252 + }, + "llm-escalate": { + "ratio": 2.3674498077744555, + "tokenRatio": 2.359583952451709, + "compressed": 50, + 
"preserved": 1, + "roundTrip": "PASS", + "timeMs": 50784.971292, + "vsDet": 1.114139256727894 + } + } + }, + "Technical explanation": { + "methods": { + "deterministic": { + "ratio": 1, + "tokenRatio": 1, + "compressed": 0, + "preserved": 11, + "roundTrip": "PASS", + "timeMs": 0.6729170000180602 + }, + "llm-basic": { + "ratio": 1.0014127363616605, + "tokenRatio": 1.0015186028853456, + "compressed": 1, + "preserved": 10, + "roundTrip": "PASS", + "timeMs": 2551.7554579999996, + "vsDet": 1.0014127363616605 + }, + "llm-escalate": { + "ratio": 1.0014127363616605, + "tokenRatio": 1.0015186028853456, + "compressed": 1, + "preserved": 10, + "roundTrip": "PASS", + "timeMs": 3298.924624999985, + "vsDet": 1.0014127363616605 + } + } + }, + "Structured content": { + "methods": { + "deterministic": { + "ratio": 1.9338990620812864, + "tokenRatio": 1.9241486068111455, + "compressed": 2, + "preserved": 10, + "roundTrip": "PASS", + "timeMs": 0.3844159999862313 + }, + "llm-basic": { + "ratio": 1.2315130830489192, + "tokenRatio": 1.2294757665677547, + "compressed": 2, + "preserved": 10, + "roundTrip": "PASS", + "timeMs": 10207.897041999997, + "vsDet": 0.6368031854379976 + }, + "llm-escalate": { + "ratio": 1.2886904761904763, + "tokenRatio": 1.2867494824016563, + "compressed": 2, + "preserved": 10, + "roundTrip": "PASS", + "timeMs": 4813.861583999998, + "vsDet": 0.6663690476190476 + } + } + }, + "Agentic coding session": { + "methods": { + "deterministic": { + "ratio": 1.428351309707242, + "tokenRatio": 1.4258962011771001, + "compressed": 2, + "preserved": 31, + "roundTrip": "PASS", + "timeMs": 0.6770829999877606 + }, + "llm-basic": { + "ratio": 1.428351309707242, + "tokenRatio": 1.4258962011771001, + "compressed": 2, + "preserved": 31, + "roundTrip": "PASS", + "timeMs": 5799.787291999994, + "vsDet": 1 + }, + "llm-escalate": { + "ratio": 1.3244749249892842, + "tokenRatio": 1.3232373386295928, + "compressed": 1, + "preserved": 32, + "roundTrip": "PASS", + "timeMs": 9487.380791999982, + 
"vsDet": 0.9272753250464352 + } + } + } + }, + "tokenBudget": { + "Deep conversation": [ + { + "budget": 2000, + "method": "deterministic", + "tokenCount": 3738, + "fits": false, + "ratio": 2.124913733609386, + "recencyWindow": 0, + "roundTrip": "PASS", + "timeMs": 10.060708000004524 + }, + { + "budget": 2000, + "method": "llm-escalate", + "tokenCount": 3391, + "fits": false, + "ratio": 2.3493853327681222, + "recencyWindow": 0, + "roundTrip": "PASS", + "timeMs": 280464.86720800004 + } + ], + "Agentic coding session": [ + { + "budget": 2000, + "method": "deterministic", + "tokenCount": 1957, + "fits": true, + "ratio": 1.3638369869059879, + "recencyWindow": 9, + "roundTrip": "PASS", + "timeMs": 1.9349999999976717 + }, + { + "budget": 2000, + "method": "llm-escalate", + "tokenCount": 1915, + "fits": true, + "ratio": 1.3935658448586892, + "recencyWindow": 3, + "roundTrip": "PASS", + "timeMs": 28052.867749999976 + } + ] + } +} diff --git a/bench/llm.ts b/bench/llm.ts index 68c7197..e4615ef 100644 --- a/bench/llm.ts +++ b/bench/llm.ts @@ -6,7 +6,7 @@ * * Supported providers: * - OpenAI: OPENAI_API_KEY (model override: OPENAI_MODEL, default gpt-4.1-mini) - * - Ollama: OLLAMA_MODEL or OLLAMA_HOST (default host http://localhost:11434, model llama3.2) + * - Ollama: Auto-detected on localhost:11434, or OLLAMA_MODEL/OLLAMA_HOST (model default llama3.2) * - Anthropic: ANTHROPIC_API_KEY (model override: ANTHROPIC_MODEL, default claude-haiku-4-5-20251001) * * SDKs are dynamically imported — missing packages print a skip message @@ -47,31 +47,59 @@ export async function detectProviders(): Promise { } } - // --- Ollama (OpenAI-compatible API) --- - if (process.env.OLLAMA_MODEL || process.env.OLLAMA_HOST) { - try { - const { default: OpenAI } = await import('openai'); - const host = process.env.OLLAMA_HOST ?? 'http://localhost:11434'; - const model = process.env.OLLAMA_MODEL ?? 
'llama3.2'; - const client = new OpenAI({ baseURL: `${host}/v1`, apiKey: 'ollama' }); + // --- Ollama (auto-detected or via env vars) --- + { + const host = process.env.OLLAMA_HOST ?? 'http://localhost:11434'; + const model = process.env.OLLAMA_MODEL ?? 'llama3.2'; + const hasEnv = !!(process.env.OLLAMA_MODEL || process.env.OLLAMA_HOST); - providers.push({ - name: 'ollama', - model, - callLlm: async (prompt: string): Promise => { - const r = await client.chat.completions.create({ - model, - messages: [{ role: 'user', content: prompt }], - max_tokens: 400, - temperature: 0.3, - }); - return r.choices[0]?.message?.content ?? ''; - }, - }); - } catch (err) { - console.log( - ` OpenAI SDK not installed (needed for Ollama), skipping (${(err as Error).message})`, - ); + // Auto-detect: probe the Ollama API with a short timeout + let ollamaAvailable = hasEnv; + if (!hasEnv) { + try { + const res = await fetch(`${host}/api/tags`, { + signal: AbortSignal.timeout(2000), + }); + if (res.ok) { + const data = (await res.json()) as { models?: { name: string }[] }; + const models = data.models ?? []; + const hasModel = models.some((m) => m.name === model || m.name === `${model}:latest`); + if (hasModel) { + ollamaAvailable = true; + } else if (models.length > 0) { + console.log( + ` Ollama running but model "${model}" not found (available: ${models.map((m) => m.name).join(', ')})`, + ); + } + } + } catch { + // Not running — skip silently + } + } + + if (ollamaAvailable) { + try { + const { default: OpenAI } = await import('openai'); + const client = new OpenAI({ baseURL: `${host}/v1`, apiKey: 'ollama' }); + + providers.push({ + name: 'ollama', + model, + callLlm: async (prompt: string): Promise => { + const r = await client.chat.completions.create({ + model, + messages: [{ role: 'user', content: prompt }], + max_tokens: 400, + temperature: 0.3, + }); + return r.choices[0]?.message?.content ?? 
''; + }, + }); + } catch (err) { + console.log( + ` Ollama detected but openai SDK not installed — run \`npm install openai\` (${(err as Error).message})`, + ); + } } } @@ -92,7 +120,7 @@ export async function detectProviders(): Promise { messages: [{ role: 'user', content: prompt }], }); const block = msg.content[0]; - return block.type === 'text' ? block.text : ''; + return block?.type === 'text' ? block.text : ''; }, }); } catch (err) { diff --git a/bench/run.ts b/bench/run.ts index 3b69ff7..f275d74 100644 --- a/bench/run.ts +++ b/bench/run.ts @@ -3,9 +3,40 @@ import { uncompress } from '../src/expand.js'; import { createSummarizer, createEscalatingSummarizer } from '../src/summarizer.js'; import type { CompressResult, Message } from '../src/types.js'; import { readFileSync, readdirSync, statSync, existsSync } from 'node:fs'; -import { join } from 'node:path'; +import { join, resolve } from 'node:path'; import { homedir } from 'node:os'; +import { execSync } from 'node:child_process'; +import { gzipSync } from 'node:zlib'; import { detectProviders } from './llm.js'; +import type { + LlmBenchmarkResult, + LlmMethodResult, + LlmTokenBudgetResult, + BenchmarkResults, +} from './baseline.js'; +import { + saveLlmResult, + saveBaseline, + loadCurrentBaseline, + compareResults, + formatRegressions, + generateBenchmarkDocs, +} from './baseline.js'; + +// --------------------------------------------------------------------------- +// Auto-load .env (no dependency, won't override existing vars) +// --------------------------------------------------------------------------- + +const envPath = resolve(import.meta.dirname, '..', '.env'); +if (existsSync(envPath)) { + for (const line of readFileSync(envPath, 'utf-8').split('\n')) { + const match = line.match(/^\s*(?:export\s+)?([^#=]+?)\s*=\s*(.*?)\s*$/); + if (!match || process.env[match[1]]) continue; + // Strip wrapping quotes (single or double) + const val = match[2].replace(/^(['"])(.*)\1$/, '$2'); + 
process.env[match[1]] = val; + } +} // --------------------------------------------------------------------------- // Helpers @@ -39,6 +70,7 @@ function buildScenarios(): Scenario[] { toolHeavy(), shortConversation(), deepConversation(), + technicalExplanation(), structuredContent(), agenticCodingSession(), ]; @@ -262,6 +294,160 @@ function deepConversation(): Scenario { return { name: 'Deep conversation', messages }; } +function technicalExplanation(): Scenario { + // Prose-heavy architectural discussion — no code fences, no structured data. + // Tests whether LLM summarization outperforms deterministic on pure technical prose. + return { + name: 'Technical explanation', + messages: [ + msg('system', 'You are a principal engineer explaining system design to a new team lead.'), + msg( + 'user', + 'Can you walk me through how our event-driven architecture works and why we chose it over a traditional request-response model?', + ), + msg( + 'assistant', + 'The event-driven architecture we adopted grew out of a series of scaling problems we ' + + 'encountered roughly two years ago when the monolith started buckling under peak traffic. ' + + 'The core idea is that services communicate through immutable events published to a central ' + + 'log rather than making synchronous calls to each other. When the order service completes a ' + + 'checkout, it publishes an OrderPlaced event. The inventory service, the notification ' + + 'service, and the analytics pipeline each consume that event independently and at their own ' + + 'pace. None of them need to know about the others, and none of them can block the order ' + + 'service from finishing its work.\n\n' + + 'The biggest advantage is temporal decoupling. Under the old model, a single slow downstream ' + + 'dependency could cascade latency across the entire checkout flow. 
A payment verification ' + + 'that normally took fifty milliseconds would occasionally spike to several seconds, and ' + + 'because every call was synchronous, the user sat waiting while six services completed in ' + + 'sequence. With events, the order service does its own work, publishes, and returns. ' + + 'Downstream processing happens asynchronously, so the user experience stays fast even when ' + + 'one consumer is struggling.\n\n' + + 'The trade-off is eventual consistency. In a synchronous model, you get immediate confirmation ' + + 'that every step completed. In an event-driven model, you have to design for the possibility ' + + 'that the inventory adjustment or the email notification has not happened yet when the user ' + + 'sees their confirmation page. We handle this through careful UX design that sets appropriate ' + + 'expectations and through idempotent consumers that can safely replay events if something ' + + 'goes wrong during processing.', + ), + msg( + 'user', + 'That makes sense. What about the event schema evolution problem? How do we handle ' + + 'changes to event formats without breaking all the consumers?', + ), + msg( + 'assistant', + 'Schema evolution is one of the hardest problems in event-driven systems and we spent ' + + 'considerable time getting our approach right. We settled on a strategy that combines ' + + 'explicit versioning with backward-compatible evolution rules. Every event type carries a ' + + 'schema version number in its header. Consumers declare which versions they understand, and ' + + 'the platform routes accordingly.\n\n' + + 'For non-breaking changes like adding a new optional field, we increment the minor version. ' + + 'Consumers that do not understand the new field simply ignore it. This follows the robustness ' + + 'principle of being conservative in what you send and liberal in what you accept. 
The vast ' + + 'majority of our schema changes fall into this category, and they require zero coordination ' + + 'between teams.\n\n' + + 'For breaking changes like removing a field or changing a field type, we increment the major ' + + 'version and run both the old and new versions in parallel during a migration window. The ' + + 'producing service publishes to both versions, and consuming teams migrate on their own ' + + 'schedule within the agreed window. Once all consumers have migrated, we sunset the old ' + + 'version. This approach adds operational complexity but gives teams the autonomy to migrate ' + + 'at their own pace without coordinated deployment ceremonies.\n\n' + + 'We also maintain a central schema registry that validates events at publish time. If a ' + + 'service tries to publish an event that does not conform to its declared schema, the publish ' + + 'fails fast with a clear error. This catches drift between the code and the schema before ' + + 'it reaches consumers and prevents subtle data corruption that would be much harder to ' + + 'diagnose after the fact.', + ), + msg( + 'user', + 'How do we handle ordering guarantees? I noticed some of our consumers seem to ' + + 'process events out of order.', + ), + msg( + 'assistant', + 'Ordering is subtle in distributed event systems and the right answer depends on what ' + + 'level of ordering your consumer actually needs. Our event platform guarantees ordering ' + + 'within a partition, but not across partitions. Events for the same aggregate, like all ' + + 'events for a specific order, go to the same partition based on the aggregate identifier, ' + + 'so they arrive in the order they were published. Events for different aggregates may arrive ' + + 'in any order because they live on different partitions processed by different consumer ' + + 'instances.\n\n' + + 'Most consumers only care about ordering within an aggregate and they get it for free from ' + + 'the partitioning scheme. 
The inventory service needs to see OrderPlaced before OrderCancelled ' + + 'for the same order, and it does because both events share the same order identifier as their ' + + 'partition key. But it does not care whether order A events arrive before or after order B ' + + 'events because those are independent aggregates.\n\n' + + 'For the rare cases where a consumer needs cross-aggregate ordering, we use a sequence ' + + 'number stamped by the producer at publish time. The consumer tracks the last processed ' + + 'sequence number and detects gaps. When it encounters a gap, it buffers the event and waits ' + + 'briefly for the missing one to arrive. If the missing event does not appear within the ' + + 'timeout window, the consumer raises an alert and proceeds with compensating logic. This ' + + 'approach adds latency in the gap case but preserves correctness.\n\n' + + 'The consumers you noticed processing out of order are likely reading from a topic that ' + + 'mixes events from multiple aggregates. Check whether the partition key is set correctly ' + + 'on the producer side. A common mistake is using a random or missing partition key, which ' + + 'causes round-robin assignment and destroys any ordering guarantee.', + ), + msg( + 'user', + 'What about failure handling? If a consumer crashes mid-processing, do we lose events?', + ), + msg( + 'assistant', + 'No, the platform guarantees at-least-once delivery. When a consumer reads an event, it ' + + 'does not acknowledge the read until processing completes successfully. If the consumer ' + + 'crashes before acknowledging, the event remains available for redelivery. The next time ' + + 'the consumer starts up, or when another instance in the consumer group takes over the ' + + 'partition, it picks up from the last acknowledged position.\n\n' + + 'The flip side of at-least-once delivery is that consumers must be idempotent. 
They will ' + + 'occasionally see the same event twice, particularly after a crash or during a rebalance. ' + + 'We enforce idempotency through a combination of techniques depending on the consumer. ' + + 'Some use a deduplication table that records processed event identifiers and skips any event ' + + 'it has seen before. Others design their state mutations to be naturally idempotent, for ' + + 'example using upserts instead of inserts or applying absolute values instead of deltas.\n\n' + + 'Dead letter queues catch events that fail processing repeatedly. After a configurable number ' + + 'of retries with exponential backoff, the platform moves the event to a dead letter topic ' + + 'and advances the consumer offset so it does not block subsequent events. The operations team ' + + 'monitors the dead letter queue and investigates failures. Once the root cause is fixed, they ' + + 'can replay events from the dead letter queue back into the main topic.\n\n' + + 'We also run a reconciliation process that periodically compares the state derived from ' + + 'events against the source of truth in the originating service. This catches any silent data ' + + 'loss or corruption that might slip through the cracks. The reconciler runs daily during off-' + + 'peak hours and flags discrepancies for manual review.', + ), + msg( + 'user', + 'One more thing — how do we test all of this? Integration testing with events seems ' + + 'really hard.', + ), + msg( + 'assistant', + 'Testing event-driven systems requires a layered approach because no single testing ' + + 'strategy covers all the failure modes. At the unit level, each consumer is tested with ' + + 'synthetic events that exercise the happy path, edge cases, and error paths. These tests ' + + 'run in milliseconds and give fast feedback on business logic correctness.\n\n' + + 'At the integration level, we run each service against a local instance of the event ' + + 'platform. 
The test publishes events, waits for the consumer to process them, and verifies ' + + 'the resulting state. These tests are slower but catch serialization issues, schema ' + + 'mismatches, and configuration problems that unit tests miss. We keep the integration test ' + + 'suite focused on the boundaries: publishing, consuming, and acknowledging. Internal ' + + 'business logic is covered at the unit level.\n\n' + + 'At the system level, we maintain a staging environment that mirrors production topology. ' + + 'Every deployment goes through staging first, where we run end-to-end scenarios that ' + + 'exercise the full event flow from producer through all consumers. These tests use realistic ' + + 'data volumes and introduce controlled failures like consumer crashes and network partitions ' + + 'to verify that the retry and dead-letter mechanisms work correctly.\n\n' + + 'Contract testing bridges the gap between producers and consumers without requiring a ' + + 'shared integration environment. Each consumer publishes a contract describing the events ' + + 'it expects, and the producer runs those contracts as part of its build. If a producer ' + + 'change would break a consumer contract, the build fails before the change reaches any ' + + 'shared environment. This is particularly valuable in our setup where different teams own ' + + "different services and may not be aware of each other's dependencies.", + ), + ], + }; +} + function structuredContent(): Scenario { // Pure prose about auth (~1500 chars): no code, URLs, SQL, API keys, JSON, paths, etc. const authProse = @@ -618,9 +804,25 @@ interface Result { } async function run(): Promise { + const args = process.argv.slice(2); + const flagSave = args.includes('--save'); + const flagCheck = args.includes('--check'); + const flagLlm = args.includes('--llm'); + const toleranceIdx = args.indexOf('--tolerance'); + const tolerance = toleranceIdx >= 0 ? 
Number(args[toleranceIdx + 1]) / 100 : 0; + const scenarios = buildScenarios(); const results: Result[] = []; + // Structured results for baseline save/check + const benchResults: BenchmarkResults = { + basic: {}, + tokenBudget: {}, + dedup: {}, + fuzzyDedup: {}, + bundleSize: {}, + }; + for (const scenario of scenarios) { const t0 = performance.now(); @@ -648,6 +850,13 @@ async function run(): Promise { roundTrip, timeMs: (t1 - t0).toFixed(2), }); + + benchResults.basic[scenario.name] = { + ratio: cr.compression.ratio, + tokenRatio: cr.compression.token_ratio, + compressed: cr.compression.messages_compressed, + preserved: cr.compression.messages_preserved, + }; } // Print table @@ -777,6 +986,16 @@ async function run(): Promise { ((t1 - t0).toFixed(2) + 'ms').padStart(cols.time), ].join(' '), ); + + const tbKey = `${scenario.name}|dedup=${dedup}`; + benchResults.tokenBudget[tbKey] = { + tokenCount: cr.tokenCount ?? 0, + fits: cr.fits ?? false, + recencyWindow: cr.recencyWindow, + compressed: cr.compression.messages_compressed, + preserved: cr.compression.messages_preserved, + deduped: cr.compression.messages_deduped ?? 0, + }; } } @@ -840,6 +1059,14 @@ async function run(): Promise { rt2.padStart(cols.rt), ].join(' '), ); + + benchResults.dedup[scenario.name] = { + rw0Base: baseRw0.compression.ratio, + rw0Dup: dedupRw0.compression.ratio, + rw4Base: baseRw4.compression.ratio, + rw4Dup: dedupRw4.compression.ratio, + deduped, + }; } console.log(dedupSep); @@ -898,6 +1125,12 @@ async function run(): Promise { ((t1 - t0).toFixed(2) + 'ms').padStart(cols.time), ].join(' '), ); + + benchResults.fuzzyDedup[scenario.name] = { + exact: cr.compression.messages_deduped ?? 0, + fuzzy: cr.compression.messages_fuzzy_deduped ?? 
0, + ratio: cr.compression.ratio, + }; } console.log(fuzzySep); @@ -907,20 +1140,116 @@ async function run(): Promise { process.exit(1); } + // --------------------------------------------------------------------------- + // Bundle size + // --------------------------------------------------------------------------- + + console.log(); + console.log('Bundle Size'); + + execSync('npm run build', { stdio: 'pipe', cwd: resolve(import.meta.dirname, '..') }); + + const distDir = resolve(import.meta.dirname, '..', 'dist'); + const distFiles = readdirSync(distDir, { recursive: true }) + .map(String) + .filter((f) => f.endsWith('.js')) + .sort(); + + let totalBytes = 0; + let totalGzip = 0; + + const bsHeader = [ + 'File'.padEnd(30), + 'Size'.padStart(10), + 'Gzip'.padStart(10), + ].join(' '); + const bsSep = '-'.repeat(bsHeader.length); + + console.log(bsSep); + console.log(bsHeader); + console.log(bsSep); + + for (const file of distFiles) { + const fullPath = join(distDir, file); + const bytes = statSync(fullPath).size; + const gzipBytes = gzipSync(readFileSync(fullPath)).length; + totalBytes += bytes; + totalGzip += gzipBytes; + + benchResults.bundleSize[file] = { bytes, gzipBytes }; + + const fmtBytes = bytes < 1024 ? `${bytes} B` : `${(bytes / 1024).toFixed(1)} KB`; + const fmtGzip = gzipBytes < 1024 ? `${gzipBytes} B` : `${(gzipBytes / 1024).toFixed(1)} KB`; + console.log( + [file.padEnd(30), fmtBytes.padStart(10), fmtGzip.padStart(10)].join(' '), + ); + } + + benchResults.bundleSize['total'] = { bytes: totalBytes, gzipBytes: totalGzip }; + + const fmtTotal = totalBytes < 1024 ? `${totalBytes} B` : `${(totalBytes / 1024).toFixed(1)} KB`; + const fmtTotalGz = + totalGzip < 1024 ? 
`${totalGzip} B` : `${(totalGzip / 1024).toFixed(1)} KB`; + console.log(bsSep); + console.log( + ['total'.padEnd(30), fmtTotal.padStart(10), fmtTotalGz.padStart(10)].join(' '), + ); + console.log(bsSep); + + // --------------------------------------------------------------------------- + // --save / --check + // --------------------------------------------------------------------------- + + const baselinesDir = resolve(import.meta.dirname, 'baselines'); + const version = JSON.parse( + readFileSync(resolve(import.meta.dirname, '..', 'package.json'), 'utf-8'), + ).version; + + if (flagSave) { + saveBaseline(baselinesDir, version, benchResults); + generateBenchmarkDocs( + baselinesDir, + resolve(import.meta.dirname, '..', 'docs', 'benchmark-results.md'), + ); + console.log(); + console.log(`Baseline saved (v${version}) and docs/benchmark-results.md regenerated.`); + } + + if (flagCheck) { + const current = loadCurrentBaseline(baselinesDir); + if (!current) { + console.error( + 'No baseline found at bench/baselines/current.json — run `npm run bench:save` first.', + ); + process.exit(1); + } + const regressions = compareResults(current.results, benchResults, tolerance); + if (regressions.length > 0) { + console.error(); + console.error(formatRegressions(regressions)); + process.exit(1); + } + console.log(); + console.log(`Baseline check passed (v${current.version}, tolerance ${tolerance * 100}%).`); + } + // --------------------------------------------------------------------------- // Real Claude Code sessions (if available locally) // --------------------------------------------------------------------------- runRealSessions(); - await runLlmBenchmark(); + // LLM benchmarks require explicit --llm flag (they cost money and take minutes) + if (flagLlm) { + await runLlmBenchmark(); + } console.log(); console.log('All benchmarks passed.'); } // --------------------------------------------------------------------------- -// LLM summarization benchmark (opt-in via env vars) 
+// LLM summarization benchmark (requires --llm flag) // --------------------------------------------------------------------------- function roundTrip(messages: Message[], cr: CompressResult): 'PASS' | 'FAIL' { @@ -936,12 +1265,13 @@ async function runLlmBenchmark(): Promise { if (providers.length === 0) { console.log(); console.log( - 'LLM Summarization Benchmark — skipped (no OPENAI_API_KEY, OLLAMA_MODEL, or ANTHROPIC_API_KEY set)', + 'LLM Summarization Benchmark — no providers detected (set OPENAI_API_KEY or ANTHROPIC_API_KEY in .env, or start Ollama)', ); return; } const scenarios = buildScenarios().filter((s) => s.name !== 'Short conversation'); + const baselinesDir = resolve(import.meta.dirname, 'baselines'); for (const provider of providers) { console.log(); @@ -955,6 +1285,7 @@ async function runLlmBenchmark(): Promise { method: 14, chr: 6, tkr: 6, + vsDet: 6, comp: 5, pres: 5, rt: 5, @@ -966,6 +1297,7 @@ async function runLlmBenchmark(): Promise { 'Method'.padStart(cols.method), 'ChR'.padStart(cols.chr), 'TkR'.padStart(cols.tkr), + 'vsDet'.padStart(cols.vsDet), 'Comp'.padStart(cols.comp), 'Pres'.padStart(cols.pres), 'R/T'.padStart(cols.rt), @@ -978,42 +1310,178 @@ async function runLlmBenchmark(): Promise { console.log(sep); let llmFails = 0; + const llmResult: LlmBenchmarkResult = { + provider: provider.name, + model: provider.model, + generated: new Date().toISOString(), + scenarios: {}, + }; for (const scenario of scenarios) { - // Deterministic baseline - const t0d = performance.now(); - const detResult = compress(scenario.messages, { recencyWindow: 0 }); - const t1d = performance.now(); - const detRt = roundTrip(scenario.messages, detResult); - - printLlmRow(scenario.name, 'deterministic', detResult, detRt, t1d - t0d, cols); - - // LLM basic summarizer - const t0b = performance.now(); - const llmBasicResult = await compress(scenario.messages, { - recencyWindow: 0, - summarizer: basicSummarizer, - }); - const t1b = performance.now(); - const basicRt = 
roundTrip(scenario.messages, llmBasicResult); - if (basicRt === 'FAIL') llmFails++; - - printLlmRow('', 'llm-basic', llmBasicResult, basicRt, t1b - t0b, cols); - - // LLM escalating summarizer - const t0e = performance.now(); - const llmEscResult = await compress(scenario.messages, { - recencyWindow: 0, - summarizer: escalatingSummarizer, - }); - const t1e = performance.now(); - const escRt = roundTrip(scenario.messages, llmEscResult); - if (escRt === 'FAIL') llmFails++; - - printLlmRow('', 'llm-escalate', llmEscResult, escRt, t1e - t0e, cols); - console.log(sep); + try { + const scenarioResult: Record = {}; + + // Deterministic baseline + const t0d = performance.now(); + const detResult = compress(scenario.messages, { recencyWindow: 0 }); + const t1d = performance.now(); + const detRt = roundTrip(scenario.messages, detResult); + const detRatio = detResult.compression.ratio; + + printLlmRow(scenario.name, 'deterministic', detResult, detRt, t1d - t0d, undefined, cols); + scenarioResult['deterministic'] = { + ratio: detRatio, + tokenRatio: detResult.compression.token_ratio, + compressed: detResult.compression.messages_compressed, + preserved: detResult.compression.messages_preserved, + roundTrip: detRt, + timeMs: t1d - t0d, + }; + + // LLM basic summarizer + const t0b = performance.now(); + const llmBasicResult = await compress(scenario.messages, { + recencyWindow: 0, + summarizer: basicSummarizer, + }); + const t1b = performance.now(); + const basicRt = roundTrip(scenario.messages, llmBasicResult); + if (basicRt === 'FAIL') llmFails++; + const basicVsDet = llmBasicResult.compression.ratio / detRatio; + + printLlmRow('', 'llm-basic', llmBasicResult, basicRt, t1b - t0b, basicVsDet, cols); + scenarioResult['llm-basic'] = { + ratio: llmBasicResult.compression.ratio, + tokenRatio: llmBasicResult.compression.token_ratio, + compressed: llmBasicResult.compression.messages_compressed, + preserved: llmBasicResult.compression.messages_preserved, + roundTrip: basicRt, + timeMs: 
t1b - t0b, + vsDet: basicVsDet, + }; + + // LLM escalating summarizer + const t0e = performance.now(); + const llmEscResult = await compress(scenario.messages, { + recencyWindow: 0, + summarizer: escalatingSummarizer, + }); + const t1e = performance.now(); + const escRt = roundTrip(scenario.messages, llmEscResult); + if (escRt === 'FAIL') llmFails++; + const escVsDet = llmEscResult.compression.ratio / detRatio; + + printLlmRow('', 'llm-escalate', llmEscResult, escRt, t1e - t0e, escVsDet, cols); + scenarioResult['llm-escalate'] = { + ratio: llmEscResult.compression.ratio, + tokenRatio: llmEscResult.compression.token_ratio, + compressed: llmEscResult.compression.messages_compressed, + preserved: llmEscResult.compression.messages_preserved, + roundTrip: escRt, + timeMs: t1e - t0e, + vsDet: escVsDet, + }; + + console.log(sep); + llmResult.scenarios[scenario.name] = { methods: scenarioResult }; + } catch (err) { + console.error(` ${scenario.name}: ERROR — ${(err as Error).message}`); + console.log(sep); + } } + // --- Token budget + LLM --- + const tokenBudget = 2000; + const budgetScenarios: Scenario[] = scenarios.filter( + (s) => s.name === 'Deep conversation' || s.name === 'Agentic coding session', + ); + + if (budgetScenarios.length > 0) { + console.log(); + console.log( + `LLM Token Budget — ${provider.name} (${provider.model}) — target: ${tokenBudget} tokens`, + ); + + const tbCols = { name: 24, method: 14, tokens: 7, fits: 5, rw: 4, chr: 6, rt: 5, time: 10 }; + const tbHeader = [ + 'Scenario'.padEnd(tbCols.name), + 'Method'.padStart(tbCols.method), + 'Tokens'.padStart(tbCols.tokens), + 'Fits'.padStart(tbCols.fits), + 'Rw'.padStart(tbCols.rw), + 'ChR'.padStart(tbCols.chr), + 'R/T'.padStart(tbCols.rt), + 'Time'.padStart(tbCols.time), + ].join(' '); + const tbSep = '-'.repeat(tbHeader.length); + + console.log(tbSep); + console.log(tbHeader); + console.log(tbSep); + + llmResult.tokenBudget = {}; + + for (const scenario of budgetScenarios) { + const entries: 
LlmTokenBudgetResult[] = []; + + try { + // Deterministic with token budget + const t0d = performance.now(); + const detCr = compress(scenario.messages, { tokenBudget }); + const t1d = performance.now(); + const detRt = roundTrip(scenario.messages, detCr); + + const detEntry: LlmTokenBudgetResult = { + budget: tokenBudget, + method: 'deterministic', + tokenCount: detCr.tokenCount ?? 0, + fits: detCr.fits ?? false, + ratio: detCr.compression.ratio, + recencyWindow: detCr.recencyWindow, + roundTrip: detRt, + timeMs: t1d - t0d, + }; + entries.push(detEntry); + printBudgetRow(scenario.name, detEntry, tbCols); + + // LLM escalating with token budget + const t0e = performance.now(); + const llmCr = await compress(scenario.messages, { + tokenBudget, + summarizer: escalatingSummarizer, + }); + const t1e = performance.now(); + const llmRt = roundTrip(scenario.messages, llmCr); + + const llmEntry: LlmTokenBudgetResult = { + budget: tokenBudget, + method: 'llm-escalate', + tokenCount: llmCr.tokenCount ?? 0, + fits: llmCr.fits ?? 
false, + ratio: llmCr.compression.ratio, + recencyWindow: llmCr.recencyWindow, + roundTrip: llmRt, + timeMs: t1e - t0e, + }; + entries.push(llmEntry); + printBudgetRow('', llmEntry, tbCols); + + console.log(tbSep); + } catch (err) { + console.error(` ${scenario.name}: ERROR — ${(err as Error).message}`); + console.log(tbSep); + } + + if (entries.length > 0) { + llmResult.tokenBudget[scenario.name] = entries; + } + } + } + + // Always save LLM results (informational, not gated behind --save) + saveLlmResult(baselinesDir, llmResult); + console.log(` Results saved to bench/baselines/llm/`); + if (llmFails > 0) { console.error(` WARNING: ${llmFails} LLM scenario(s) failed round-trip`); } @@ -1026,11 +1494,13 @@ function printLlmRow( cr: CompressResult, rt: string, timeMs: number, + vsDet: number | undefined, cols: { name: number; method: number; chr: number; tkr: number; + vsDet: number; comp: number; pres: number; rt: number; @@ -1043,6 +1513,7 @@ function printLlmRow( method.padStart(cols.method), cr.compression.ratio.toFixed(2).padStart(cols.chr), cr.compression.token_ratio.toFixed(2).padStart(cols.tkr), + (vsDet != null ? vsDet.toFixed(2) : '-').padStart(cols.vsDet), String(cr.compression.messages_compressed).padStart(cols.comp), String(cr.compression.messages_preserved).padStart(cols.pres), rt.padStart(cols.rt), @@ -1053,6 +1524,37 @@ function printLlmRow( ); } +function printBudgetRow( + name: string, + entry: LlmTokenBudgetResult, + cols: { + name: number; + method: number; + tokens: number; + fits: number; + rw: number; + chr: number; + rt: number; + time: number; + }, +): void { + console.log( + [ + name.padEnd(cols.name), + entry.method.padStart(cols.method), + String(entry.tokenCount).padStart(cols.tokens), + String(entry.fits).padStart(cols.fits), + String(entry.recencyWindow ?? '-').padStart(cols.rw), + entry.ratio.toFixed(2).padStart(cols.chr), + entry.roundTrip.padStart(cols.rt), + (entry.timeMs < 1000 + ? 
entry.timeMs.toFixed(0) + 'ms' + : (entry.timeMs / 1000).toFixed(1) + 's' + ).padStart(cols.time), + ].join(' '), + ); +} + // --------------------------------------------------------------------------- // Real session support — convert Claude Code JSONL transcripts to Message[] // --------------------------------------------------------------------------- diff --git a/docs/README.md b/docs/README.md index 658c442..e5f246d 100644 --- a/docs/README.md +++ b/docs/README.md @@ -13,3 +13,4 @@ | [Provenance](provenance.md) | `_cce_original` metadata, summary_id, parent_ids | | [Preservation Rules](preservation-rules.md) | What gets preserved, classification tiers, code-aware splitting | | [Benchmarks](benchmarks.md) | Running benchmarks, LLM comparison, interpreting results | +| [Benchmark Results](benchmark-results.md) | Auto-generated results with charts (regenerated by bench:save) | diff --git a/docs/api-reference.md b/docs/api-reference.md index 7fd7843..9f5973b 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -174,7 +174,7 @@ function defaultTokenCounter(msg: Message): number; Math.ceil(msg.content.length / 3.5); ``` -Approximates ~3.5 characters per token. Suitable for rough estimates. For accurate budgeting, replace with a real tokenizer. See [Token budget](token-budget.md). +The divisor of 3.5 chars/token reflects GPT-family BPE tokenizers (cl100k_base, o200k_base) on mixed English text, where the empirical range is roughly 3.2–4.5 chars/token; 3.5 deliberately sits toward the lower end so budget estimates stay conservative — over-counting tokens is safer than under-counting. For accurate budgeting, replace with a real tokenizer. See [Token budget](token-budget.md). 
--- diff --git a/docs/benchmark-results.md b/docs/benchmark-results.md new file mode 100644 index 0000000..8e54c61 --- /dev/null +++ b/docs/benchmark-results.md @@ -0,0 +1,271 @@ +# Benchmark Results + +[Back to README](../README.md) | [All docs](README.md) | [Handbook](benchmarks.md) + +*Auto-generated by `npm run bench:save`. Do not edit manually.* + +**v1.0.0** · Generated: 2026-02-26 + +![avg ratio](https://img.shields.io/badge/avg%20ratio-2.08x-blue) ![best](https://img.shields.io/badge/best-6.16x-blue) ![scenarios](https://img.shields.io/badge/scenarios-8-blue) ![round-trip](https://img.shields.io/badge/round--trip-all_PASS-brightgreen) ![gzip](https://img.shields.io/badge/gzip-16.6%20KB-blue) + +## Summary + +| Metric | Value | +| --- | --- | +| Scenarios | 8 | +| Average compression | 2.08x | +| Best compression | 6.16x | +| Round-trip integrity | all PASS | + +```mermaid +pie title "Message Outcomes" + "Preserved" : 90 + "Compressed" : 65 +``` + +## Compression by Scenario + +> **8 scenarios** · **2.08x** avg ratio · **1.00x** – **6.16x** range · all round-trips PASS + +```mermaid +xychart-beta + title "Compression Ratio by Scenario" + x-axis ["Coding", "Long Q&A", "Tool-heavy", "Short", "Deep", "Technical", "Structured", "Agentic"] + y-axis "Char Ratio" + bar [1.68, 6.16, 1.30, 1.00, 2.12, 1.00, 1.93, 1.43] +``` + +| Scenario | Ratio | Reduction | Token Ratio | Messages | Compressed | Preserved | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | +| Coding assistant | 1.68 | 41% | 1.67 | 13 | 5 | 8 | +| Long Q&A | 6.16 | 84% | 6.11 | 10 | 4 | 6 | +| Tool-heavy | 1.30 | 23% | 1.29 | 18 | 2 | 16 | +| Short conversation | 1.00 | 0% | 1.00 | 7 | 0 | 7 | +| Deep conversation | 2.12 | 53% | 2.12 | 51 | 50 | 1 | +| Technical explanation | 1.00 | 0% | 1.00 | 11 | 0 | 11 | +| Structured content | 1.93 | 48% | 1.92 | 12 | 2 | 10 | +| Agentic coding session | 1.43 | 30% | 1.43 | 33 | 2 | 31 | + +## Deduplication Impact + +```mermaid +xychart-beta + title 
"Deduplication Impact (recencyWindow=0)" + x-axis ["Long Q&A", "Agentic"] + y-axis "Char Ratio" + bar [5.14, 1.14] + bar [6.16, 1.43] +``` + +*First bar: no dedup · Second bar: with dedup* + +| Scenario | No Dedup (rw=0) | Dedup (rw=0) | No Dedup (rw=4) | Dedup (rw=4) | Deduped | +| --- | ---: | ---: | ---: | ---: | ---: | +| Coding assistant | 1.68 | 1.68 | 1.51 | 1.51 | 0 | +| Long Q&A | 5.14 | 6.16 | 1.90 | 2.03 | 1 | +| Tool-heavy | 1.30 | 1.30 | 1.30 | 1.30 | 0 | +| Short conversation | 1.00 | 1.00 | 1.00 | 1.00 | 0 | +| Deep conversation | 2.12 | 2.12 | 1.95 | 1.95 | 0 | +| Technical explanation | 1.00 | 1.00 | 1.00 | 1.00 | 0 | +| Structured content | 1.93 | 1.93 | 1.37 | 1.37 | 0 | +| Agentic coding session | 1.14 | 1.43 | 1.14 | 1.43 | 4 | + +### Fuzzy Dedup + +| Scenario | Exact Deduped | Fuzzy Deduped | Ratio | vs Base | +| --- | ---: | ---: | ---: | ---: | +| Coding assistant | 0 | 0 | 1.68 | - | +| Long Q&A | 1 | 0 | 6.16 | - | +| Tool-heavy | 0 | 0 | 1.30 | - | +| Short conversation | 0 | 0 | 1.00 | - | +| Deep conversation | 0 | 0 | 2.12 | - | +| Technical explanation | 0 | 0 | 1.00 | - | +| Structured content | 0 | 0 | 1.93 | - | +| Agentic coding session | 4 | 2 | 2.23 | +56% | + +## Token Budget + +Target: **2000 tokens** · 1/4 fit + +| Scenario | Dedup | Tokens | Fits | recencyWindow | Compressed | Preserved | Deduped | +| --- | --- | ---: | --- | ---: | ---: | ---: | ---: | +| Deep conversation | no | 3738 | no | 0 | 50 | 1 | 0 | +| Deep conversation | yes | 3738 | no | 0 | 50 | 1 | 0 | +| Agentic coding session | no | 2345 | no | 0 | 4 | 33 | 0 | +| Agentic coding session | yes | 1957 | yes | 9 | 1 | 32 | 4 | + +## Bundle Size + +> Zero-dependency ESM library — tracked per-file to catch regressions. 
+ +| File | Size | Gzip | +| --- | ---: | ---: | +| classify.js | 7.5 KB | 3.2 KB | +| compress.js | 33.1 KB | 8.5 KB | +| dedup.js | 10.0 KB | 2.8 KB | +| expand.js | 2.7 KB | 934 B | +| index.js | 225 B | 159 B | +| summarizer.js | 2.5 KB | 993 B | +| types.js | 11 B | 31 B | +| **total** | 56.2 KB | 16.6 KB | + +## LLM vs Deterministic + +> Results are **non-deterministic** — LLM outputs vary between runs. Saved as reference data, not used for regression testing. + +``` +Deterministic vs ollama/llama3.2 + +Coding assistant Det ████████░░░░░░░░░░░░░░░░░░░░░░ 1.68x + LLM ████████░░░░░░░░░░░░░░░░░░░░░░ 1.55x + +Long Q&A Det ██████████████████████████████ 6.16x + LLM ██████████████████████░░░░░░░░ 4.49x + +Tool-heavy Det ██████░░░░░░░░░░░░░░░░░░░░░░░░ 1.30x + LLM ██████░░░░░░░░░░░░░░░░░░░░░░░░ 1.28x + +Deep conversation Det ██████████░░░░░░░░░░░░░░░░░░░░ 2.12x + LLM ████████████████░░░░░░░░░░░░░░ 3.28x ★ + +Technical explanation Det █████░░░░░░░░░░░░░░░░░░░░░░░░░ 1.00x + LLM █████░░░░░░░░░░░░░░░░░░░░░░░░░ 1.00x + +Structured content Det █████████░░░░░░░░░░░░░░░░░░░░░ 1.93x + LLM ███████░░░░░░░░░░░░░░░░░░░░░░░ 1.46x + +Agentic coding session Det ███████░░░░░░░░░░░░░░░░░░░░░░░ 1.43x + LLM ███████░░░░░░░░░░░░░░░░░░░░░░░ 1.40x + +★ = LLM wins +``` + +``` +Deterministic vs openai/gpt-4.1-mini + +Coding assistant Det ████████░░░░░░░░░░░░░░░░░░░░░░ 1.68x + LLM ████████░░░░░░░░░░░░░░░░░░░░░░ 1.64x + +Long Q&A Det ██████████████████████████████ 6.16x + LLM ██████████████████████████░░░░ 5.37x + +Tool-heavy Det ██████░░░░░░░░░░░░░░░░░░░░░░░░ 1.30x + LLM █████░░░░░░░░░░░░░░░░░░░░░░░░░ 1.12x + +Deep conversation Det ██████████░░░░░░░░░░░░░░░░░░░░ 2.12x + LLM ████████████░░░░░░░░░░░░░░░░░░ 2.37x ★ + +Technical explanation Det █████░░░░░░░░░░░░░░░░░░░░░░░░░ 1.00x + LLM █████░░░░░░░░░░░░░░░░░░░░░░░░░ 1.00x + +Structured content Det █████████░░░░░░░░░░░░░░░░░░░░░ 1.93x + LLM ██████░░░░░░░░░░░░░░░░░░░░░░░░ 1.29x + +Agentic coding session Det ███████░░░░░░░░░░░░░░░░░░░░░░░ 1.43x + 
LLM ███████░░░░░░░░░░░░░░░░░░░░░░░ 1.43x + +★ = LLM wins +``` + +### Provider Summary + +| Provider | Model | Avg Ratio | Avg vsDet | Round-trip | Budget Fits | Avg Time | +| --- | --- | ---: | ---: | --- | --- | ---: | +| ollama | llama3.2 | 2.09x | 0.96 | all PASS | 1/4 | 4.2s | +| openai | gpt-4.1-mini | 2.09x | 0.92 | all PASS | 2/4 | 8.1s | + +> **Key findings:** +> LLM wins on prose-heavy scenarios: Deep conversation, Technical explanation +> Deterministic wins on structured/technical content: Coding assistant, Long Q&A, Tool-heavy, Structured content + +### ollama (llama3.2) + +*Generated: 2026-02-25* + +
+Scenario details + +| Scenario | Method | Char Ratio | Token Ratio | vsDet | Compressed | Preserved | Round-trip | Time | +| --- | --- | ---: | ---: | ---: | ---: | ---: | --- | ---: | +| Coding assistant | deterministic | 1.68 | 1.67 | - | 5 | 8 | PASS | 0ms | +| | llm-basic | 1.48 | 1.48 | 0.88 | 5 | 8 | PASS | 5.9s | +| | llm-escalate | 1.55 | 1.55 | 0.92 | 5 | 8 | PASS | 3.0s | +| Long Q&A | deterministic | 6.16 | 6.11 | - | 4 | 6 | PASS | 1ms | +| | llm-basic | 4.31 | 4.28 | 0.70 | 4 | 6 | PASS | 4.1s | +| | llm-escalate | 4.49 | 4.46 | 0.73 | 4 | 6 | PASS | 3.7s | +| Tool-heavy | deterministic | 1.30 | 1.29 | - | 2 | 16 | PASS | 2ms | +| | llm-basic | 1.12 | 1.11 | 0.86 | 2 | 16 | PASS | 2.3s | +| | llm-escalate | 1.28 | 1.28 | 0.99 | 2 | 16 | PASS | 2.8s | +| Deep conversation | deterministic | 2.12 | 2.12 | - | 50 | 1 | PASS | 3ms | +| | llm-basic | 3.12 | 3.11 | 1.47 | 50 | 1 | PASS | 22.7s | +| | llm-escalate | 3.28 | 3.26 | 1.54 | 50 | 1 | PASS | 23.3s | +| Technical explanation | deterministic | 1.00 | 1.00 | - | 0 | 11 | PASS | 1ms | +| | llm-basic | 1.00 | 1.00 | 1.00 | 0 | 11 | PASS | 3.2s | +| | llm-escalate | 1.00 | 1.00 | 1.00 | 2 | 9 | PASS | 785ms | +| Structured content | deterministic | 1.93 | 1.92 | - | 2 | 10 | PASS | 0ms | +| | llm-basic | 1.46 | 1.45 | 0.75 | 2 | 10 | PASS | 3.5s | +| | llm-escalate | 1.38 | 1.38 | 0.71 | 2 | 10 | PASS | 3.7s | +| Agentic coding session | deterministic | 1.43 | 1.43 | - | 2 | 31 | PASS | 1ms | +| | llm-basic | 1.35 | 1.34 | 0.94 | 2 | 31 | PASS | 3.3s | +| | llm-escalate | 1.40 | 1.40 | 0.98 | 2 | 31 | PASS | 5.4s | + +#### Token Budget (target: 2000 tokens) + +| Scenario | Method | Tokens | Fits | recencyWindow | Ratio | Round-trip | Time | +| --- | --- | ---: | --- | ---: | ---: | --- | ---: | +| Deep conversation | deterministic | 3738 | false | 0 | 2.12 | PASS | 12ms | +| | llm-escalate | 2593 | false | 0 | 3.08 | PASS | 132.0s | +| Agentic coding session | deterministic | 1957 | true | 9 | 1.36 | 
PASS | 2ms | +| | llm-escalate | 2003 | false | 9 | 1.33 | PASS | 4.1s | + +
+ +### openai (gpt-4.1-mini) + +*Generated: 2026-02-25* + +
+Scenario details + +| Scenario | Method | Char Ratio | Token Ratio | vsDet | Compressed | Preserved | Round-trip | Time | +| --- | --- | ---: | ---: | ---: | ---: | ---: | --- | ---: | +| Coding assistant | deterministic | 1.68 | 1.67 | - | 5 | 8 | PASS | 0ms | +| | llm-basic | 1.64 | 1.63 | 0.98 | 5 | 8 | PASS | 5.6s | +| | llm-escalate | 1.63 | 1.63 | 0.97 | 5 | 8 | PASS | 6.0s | +| Long Q&A | deterministic | 6.16 | 6.11 | - | 4 | 6 | PASS | 1ms | +| | llm-basic | 5.37 | 5.33 | 0.87 | 4 | 6 | PASS | 5.9s | +| | llm-escalate | 5.35 | 5.31 | 0.87 | 4 | 6 | PASS | 7.0s | +| Tool-heavy | deterministic | 1.30 | 1.29 | - | 2 | 16 | PASS | 0ms | +| | llm-basic | 1.11 | 1.10 | 0.85 | 2 | 16 | PASS | 3.5s | +| | llm-escalate | 1.12 | 1.12 | 0.86 | 2 | 16 | PASS | 5.3s | +| Deep conversation | deterministic | 2.12 | 2.12 | - | 50 | 1 | PASS | 3ms | +| | llm-basic | 2.34 | 2.33 | 1.10 | 50 | 1 | PASS | 50.4s | +| | llm-escalate | 2.37 | 2.36 | 1.11 | 50 | 1 | PASS | 50.8s | +| Technical explanation | deterministic | 1.00 | 1.00 | - | 0 | 11 | PASS | 1ms | +| | llm-basic | 1.00 | 1.00 | 1.00 | 1 | 10 | PASS | 2.6s | +| | llm-escalate | 1.00 | 1.00 | 1.00 | 1 | 10 | PASS | 3.3s | +| Structured content | deterministic | 1.93 | 1.92 | - | 2 | 10 | PASS | 0ms | +| | llm-basic | 1.23 | 1.23 | 0.64 | 2 | 10 | PASS | 10.2s | +| | llm-escalate | 1.29 | 1.29 | 0.67 | 2 | 10 | PASS | 4.8s | +| Agentic coding session | deterministic | 1.43 | 1.43 | - | 2 | 31 | PASS | 1ms | +| | llm-basic | 1.43 | 1.43 | 1.00 | 2 | 31 | PASS | 5.8s | +| | llm-escalate | 1.32 | 1.32 | 0.93 | 1 | 32 | PASS | 9.5s | + +#### Token Budget (target: 2000 tokens) + +| Scenario | Method | Tokens | Fits | recencyWindow | Ratio | Round-trip | Time | +| --- | --- | ---: | --- | ---: | ---: | --- | ---: | +| Deep conversation | deterministic | 3738 | false | 0 | 2.12 | PASS | 10ms | +| | llm-escalate | 3391 | false | 0 | 2.35 | PASS | 280.5s | +| Agentic coding session | deterministic | 1957 | true | 9 | 1.36 | 
PASS | 2ms | +| | llm-escalate | 1915 | true | 3 | 1.39 | PASS | 28.1s | + +
+ +## Methodology + +- All deterministic results use the same input → same output guarantee +- Metrics: compression ratio, token ratio, message counts, dedup counts +- Timing is excluded from baselines (hardware-dependent) +- LLM benchmarks are saved as reference data, not used for regression testing +- Round-trip integrity is verified for every scenario (compress then uncompress) diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 4111308..eca3acb 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -1,151 +1,85 @@ # Benchmarks -[Back to README](../README.md) | [All docs](README.md) +[Back to README](../README.md) | [All docs](README.md) | [Latest Results](benchmark-results.md) -Running benchmarks, interpreting results, and comparing compression methods. - -## Running tests - -```bash -# Run the test suite (333 tests) -npm test - -# Type check -npx tsc --noEmit -``` - -## Deterministic benchmarks - -No API keys needed. Runs entirely locally: +## Running Benchmarks ```bash -npm run bench +npm run bench # Run benchmarks (no baseline check) +npm run bench:check # Run and compare against baseline +npm run bench:save # Run, save new baseline, regenerate results doc +npm run bench:llm # Run with LLM summarization benchmarks ``` -### Scenarios - -The benchmark covers 7 conversation types: +### LLM benchmarks (opt-in) -| Scenario | Description | -| ---------------------- | -------------------------------------------------------- | -| Coding assistant | Mixed code fences and prose discussion | -| Long Q&A | Extended question-and-answer with detailed explanations | -| Tool-heavy | Messages with `tool_calls` arrays (preserved by default) | -| Short conversation | Brief exchanges, mostly under 120 chars | -| Deep conversation | Long, multi-paragraph prose exchanges | -| Structured content | JSON, YAML, SQL, test output | -| Agentic coding session | Repeated file reads, grep results, test runs | +LLM benchmarks require the `--llm` flag (`npm run bench:llm`). 
Set API keys in a `.env` file or export them. Ollama is auto-detected when running locally. -### What gets measured +| Variable | Provider | Default Model | Notes | +| --- | --- | --- | --- | +| `OPENAI_API_KEY` | OpenAI | `gpt-4.1-mini` | Override model: `OPENAI_MODEL` | +| `ANTHROPIC_API_KEY` | Anthropic | `claude-haiku-4-5-20251001` | Override model: `ANTHROPIC_MODEL` | +| *(none required)* | Ollama | `llama3.2` | Auto-detected on localhost:11434; override: `OLLAMA_HOST` / `OLLAMA_MODEL` | -For each scenario: +## Scenarios -- **Characters**: original vs. compressed character counts -- **Compression ratio**: `original_chars / compressed_chars` (>1 = savings) -- **Token ratio**: `original_tokens / compressed_tokens` -- **Messages compressed**: how many messages were summarized -- **Messages preserved**: how many were kept as-is -- **Messages deduped**: exact duplicates replaced (agentic scenario) -- **Timing**: milliseconds per compression +The benchmark covers 8 conversation types: -Additional benchmark sections: +| Scenario | Description | +| --- | --- | +| Coding assistant | Mixed code fences and prose discussion | +| Long Q&A | Extended question-and-answer with repeated paragraphs | +| Tool-heavy | Messages with `tool_calls` arrays (preserved by default) | +| Short conversation | Brief exchanges, mostly under 120 chars | +| Deep conversation | 25 turns of multi-paragraph prose | +| Technical explanation | Pure prose Q&A about event-driven architecture | +| Structured content | JSON, YAML, SQL, API keys, test output | +| Agentic coding session | Repeated file reads, grep results, near-duplicate edits | -- **Token budget optimization** with and without dedup -- **Fuzzy dedup accuracy** across thresholds -- **Real-session compression** on actual Claude Code transcripts (if `~/.claude/projects/` exists) - -### Real-session benchmarks - -The benchmark automatically scans for real Claude Code conversation files in `~/.claude/projects/`. It parses JSONL conversation files, extracts message arrays, and runs compression on actual production data. 
- -This provides the most realistic performance numbers since synthetic scenarios can't capture the full diversity of real conversations. - -## LLM benchmarks - -Compare deterministic compression against real LLM-powered summarization. Set one or more environment variables to enable: - -| Variable | Provider | Default model | -| ------------------- | --------- | --------------------------------------------------------- | -| `OPENAI_API_KEY` | OpenAI | `gpt-4.1-mini` (override: `OPENAI_MODEL`) | -| `ANTHROPIC_API_KEY` | Anthropic | `claude-haiku-4-5-20251001` (override: `ANTHROPIC_MODEL`) | -| `OLLAMA_MODEL` | Ollama | `llama3.2` (host override: `OLLAMA_HOST`) | - -```bash -# Run with OpenAI -OPENAI_API_KEY=sk-... npm run bench - -# Run with Ollama (local) -OLLAMA_MODEL=llama3.2 npm run bench - -# Run with multiple providers -OPENAI_API_KEY=sk-... ANTHROPIC_API_KEY=sk-ant-... npm run bench -``` - -### Three methods compared - -Each scenario runs three methods side-by-side: - -| Method | Description | -| --------------- | -------------------------------------------------------------------- | -| `deterministic` | No LLM, pure sentence scoring + entity extraction | -| `llm-basic` | `createSummarizer` with the detected provider | -| `llm-escalate` | `createEscalatingSummarizer` (normal -> aggressive -> deterministic) | - -All methods verify round-trip integrity — `uncompress()` is called to confirm originals are restored. 
- -### What to look for - -- **Ratio comparison** — deterministic often beats LLM on compression ratio because LLMs write fuller, more helpful summaries -- **Latency** — deterministic is < 2ms; LLM adds network round-trip time per message -- **Fallback rate** — how often the engine rejects LLM output and falls back to deterministic -- **Round-trip integrity** — all methods must pass (no data loss) - -### SDK requirements - -LLM providers require their SDKs: - -- OpenAI: `openai` package -- Anthropic: `@anthropic-ai/sdk` package -- Ollama: `openai` package (uses OpenAI-compatible API) - -Missing SDKs are detected at runtime and print a skip message — no crash, no hard dependency. - -## Interpreting results +## Interpreting Results ### Compression ratio -- `1.0` = no compression (all messages preserved) -- `1.5` = 33% reduction -- `2.0` = 50% reduction -- `3.0` = 67% reduction -- `6.0` = 83% reduction +| Ratio | Reduction | +| ---: | --- | +| 1.0x | no compression (all messages preserved) | +| 1.5x | 33% reduction | +| 2.0x | 50% reduction | +| 3.0x | 67% reduction | +| 6.0x | 83% reduction | + +Higher is better. Token ratio is more meaningful for LLM context budgeting; character ratio is useful for storage. -Higher is better. The deterministic engine typically achieves 1.3-6.1x on synthetic scenarios. +### Deduplication -### Token ratio vs. character ratio +Dedup effectiveness is measured across two axes: -Token ratio is more meaningful for LLM context budgeting since tokens are what models count. Character ratio is useful for storage optimization. +- **recencyWindow=0** vs **recencyWindow=4** — how much compression improves when recent messages are protected +- **With dedup** vs **without** — the marginal gain from exact + fuzzy duplicate detection -### When LLM wins +Scenarios with repeated content (Long Q&A, Agentic coding session) show the largest dedup gains. Scenarios with unique messages show no difference. 
-LLM summarization can outperform deterministic in: +### LLM vs deterministic -- Very long prose-heavy conversations where paraphrasing and concept merging genuinely helps -- Domain-specific content where the LLM understands what's important +The `vsDet` column shows LLM compression relative to deterministic: -### When deterministic wins +- **vsDet > 1.0** — LLM achieves better compression (common for long prose) +- **vsDet < 1.0** — deterministic wins (common for structured/technical content) +- **vsDet = 1.0** — no difference (content is already optimal or fully preserved) -Deterministic typically wins when: +## Regression Testing -- Messages contain mixed code and prose (code-aware splitting is already optimal) -- Messages are structured (test output, grep results) -- The LLM writes helpful but verbose summaries +Baselines are stored in [`bench/baselines/`](../bench/baselines/) as JSON. CI runs `npm run bench:check` on every push and PR to catch regressions. ---- +- **Tolerance:** 0% by default (all metrics are deterministic) +- **On regression:** CI fails with a diff showing which metrics changed +- **After intentional changes:** run `npm run bench:save` to update the baseline and regenerate the results doc +- **Custom tolerance:** `npx tsx bench/run.ts --check --tolerance 5` allows 5% deviation -## See also +### Baseline files -- [Compression pipeline](compression-pipeline.md) - the deterministic algorithm -- [LLM integration](llm-integration.md) - setting up providers for benchmarks -- [Token budget](token-budget.md) - budget optimization -- [Deduplication](deduplication.md) - dedup in benchmarks +| File | Purpose | +| --- | --- | +| `bench/baselines/current.json` | Active baseline compared in CI | +| `bench/baselines/history/v*.json` | Versioned snapshots, one per release | +| `bench/baselines/llm/*.json` | LLM benchmark reference data (non-deterministic) | diff --git a/docs/token-budget.md b/docs/token-budget.md index cb1a9f4..c1fabe2 100644 --- 
a/docs/token-budget.md +++ b/docs/token-budget.md @@ -49,7 +49,7 @@ function defaultTokenCounter(msg: Message): number { } ``` -~3.5 characters per token is a rough heuristic. It's fast and works for ballpark estimates, but real tokenizers vary: +~3.5 characters per token is derived from empirical measurements of GPT-family BPE tokenizers (cl100k_base, o200k_base) on mixed English text. We sit toward the lower end of the observed range so estimates are conservative — slightly over-counting tokens is safer than under-counting and blowing the budget. It's fast and works for ballpark estimates, but real tokenizers vary: | Tokenizer | Typical chars/token | | --------- | ------------------- | diff --git a/package.json b/package.json index 9b33f2f..f581ee3 100644 --- a/package.json +++ b/package.json @@ -14,6 +14,7 @@ "format": "prettier --write .", "format:check": "prettier --check .", "bench": "npx tsx bench/run.ts", + "bench:llm": "npx tsx bench/run.ts --llm", "bench:save": "npx tsx bench/run.ts --save", "bench:check": "npx tsx bench/run.ts --check", "test:e2e": "npm run build && npm pack && npm run test:e2e:lint && npm run test:e2e:smoke; EXIT=$?; npm run test:e2e:cleanup; exit $EXIT", diff --git a/src/compress.ts b/src/compress.ts index 68e2641..b77b72c 100644 --- a/src/compress.ts +++ b/src/compress.ts @@ -418,7 +418,16 @@ function contentLength(msg: Message): number { return typeof msg.content === 'string' ? msg.content.length : 0; } -/** Default token counter: ~3.5 chars/token heuristic. */ +/** + * Default token counter: ~3.5 chars/token heuristic. + * + * The 3.5 ratio sits toward the lower end of the empirical range for + * GPT-family BPE tokenizers (cl100k_base, o200k_base) on mixed English text. + * Real-world values range from ~3.2 (code-heavy) to ~4.5 (plain prose). + * Choosing a value near the low end keeps budget estimates conservative + * (slightly over-counting tokens is safer than under-counting). 
Users who need exact counts can + * supply a real tokenizer via the `tokenCounter` option. + */ export function defaultTokenCounter(msg: Message): number { return Math.ceil(contentLength(msg) / 3.5); } @@ -614,190 +623,10 @@ function computeStats( } // --------------------------------------------------------------------------- -// Sync compression (internal) +// Unified compression core (generator + sync/async runners) // --------------------------------------------------------------------------- -function compressSync(messages: Message[], options: CompressOptions = {}): CompressResult { - const sourceVersion = options.sourceVersion ?? 0; - const counter = options.tokenCounter ?? defaultTokenCounter; - - if (messages.length === 0) { - return { - messages: [], - compression: { - original_version: sourceVersion, - ratio: 1, - token_ratio: 1, - messages_compressed: 0, - messages_preserved: 0, - }, - verbatim: {}, - }; - } - - const preserveRoles = new Set(options.preserve ?? ['system']); - const recencyWindow = options.recencyWindow ?? 4; - const recencyStart = Math.max(0, messages.length - (recencyWindow > 0 ? recencyWindow : 0)); - let dedupAnnotations = - (options.dedup ?? true) ? analyzeDuplicates(messages, recencyStart, preserveRoles) : undefined; - - if (options.fuzzyDedup) { - const fuzzyAnnotations = analyzeFuzzyDuplicates( - messages, - recencyStart, - preserveRoles, - dedupAnnotations ?? new Map(), - options.fuzzyThreshold ?? 
0.85, - ); - if (fuzzyAnnotations.size > 0) { - if (!dedupAnnotations) dedupAnnotations = new Map(); - for (const [idx, ann] of fuzzyAnnotations) { - dedupAnnotations.set(idx, ann); - } - } - } - - const classified = classifyAll(messages, preserveRoles, recencyWindow, dedupAnnotations); - - const result: Message[] = []; - const verbatim: Record = {}; - let messagesCompressed = 0; - let messagesPreserved = 0; - let messagesDeduped = 0; - let messagesFuzzyDeduped = 0; - let i = 0; - - while (i < classified.length) { - const { msg, preserved } = classified[i]; - - if (preserved) { - result.push(msg); - messagesPreserved++; - i++; - continue; - } - - // Dedup: replace earlier duplicate/near-duplicate with compact reference - if (classified[i].dedup) { - const annotation = classified[i].dedup!; - const keepTargetId = messages[annotation.duplicateOfIndex].id; - const tag = - annotation.similarity != null - ? `[cce:near-dup of ${keepTargetId} — ${annotation.contentLength} chars, ~${Math.round(annotation.similarity * 100)}% match]` - : `[cce:dup of ${keepTargetId} — ${annotation.contentLength} chars]`; - result.push(buildCompressedMessage(msg, [msg.id], tag, sourceVersion, verbatim, [msg])); - if (annotation.similarity != null) { - messagesFuzzyDeduped++; - } else { - messagesDeduped++; - } - i++; - continue; - } - - // Code-split: extract fences verbatim, summarize surrounding prose - if (classified[i].codeSplit) { - const content = typeof msg.content === 'string' ? msg.content : ''; - const segments = splitCodeAndProse(content); - const proseText = segments - .filter((s) => s.type === 'prose') - .map((s) => s.content) - .join(' '); - const codeFences = segments.filter((s) => s.type === 'code').map((s) => s.content); - const proseBudget = proseText.length < 600 ? 200 : 400; - const summaryText = summarize(proseText, proseBudget); - const embeddedId = options.embedSummaryId ? 
makeSummaryId([msg.id]) : undefined; - const compressed = `${formatSummary(summaryText, proseText, undefined, true, embeddedId)}\n\n${codeFences.join('\n\n')}`; - - if (compressed.length >= content.length) { - result.push(msg); - messagesPreserved++; - i++; - continue; - } - - result.push( - buildCompressedMessage(msg, [msg.id], compressed, sourceVersion, verbatim, [msg]), - ); - messagesCompressed++; - i++; - continue; - } - - // Collect consecutive non-preserved messages with the SAME role - const { group, nextIdx } = collectGroup(classified, i); - i = nextIdx; - - const allContent = group - .map((g) => (typeof g.msg.content === 'string' ? g.msg.content : '')) - .join(' '); - const contentBudget = allContent.length < 600 ? 200 : 400; - const summaryText = isStructuredOutput(allContent) - ? summarizeStructured(allContent, contentBudget) - : summarize(allContent, contentBudget); - - if (group.length > 1) { - const mergeIds = group.map((g) => g.msg.id); - const embeddedId = options.embedSummaryId ? makeSummaryId(mergeIds) : undefined; - let summary = formatSummary(summaryText, allContent, group.length, undefined, embeddedId); - const combinedLength = group.reduce((sum, g) => sum + contentLength(g.msg), 0); - if (summary.length >= combinedLength) { - summary = formatSummary(summaryText, allContent, group.length, true, embeddedId); - } - - if (summary.length >= combinedLength) { - for (const g of group) { - result.push(g.msg); - messagesPreserved++; - } - } else { - const sourceMsgs = group.map((g) => g.msg); - const base: Message = { ...sourceMsgs[0] }; - result.push( - buildCompressedMessage(base, mergeIds, summary, sourceVersion, verbatim, sourceMsgs), - ); - messagesCompressed += group.length; - } - } else { - const single = group[0].msg; - const content = typeof single.content === 'string' ? single.content : ''; - const embeddedId = options.embedSummaryId ? 
makeSummaryId([single.id]) : undefined; - let summary = formatSummary(summaryText, allContent, undefined, undefined, embeddedId); - if (summary.length >= content.length) { - summary = formatSummary(summaryText, allContent, undefined, true, embeddedId); - } - - if (summary.length >= content.length) { - result.push(single); - messagesPreserved++; - } else { - result.push( - buildCompressedMessage(single, [single.id], summary, sourceVersion, verbatim, [single]), - ); - messagesCompressed++; - } - } - } - - return { - messages: result, - compression: computeStats( - messages, - result, - messagesCompressed, - messagesPreserved, - sourceVersion, - counter, - messagesDeduped, - messagesFuzzyDeduped, - ), - verbatim, - }; -} - -// --------------------------------------------------------------------------- -// Async compression (internal, LLM summarizer support) -// --------------------------------------------------------------------------- +type SummarizeRequest = { text: string; budget: number }; async function withFallback( text: string, @@ -816,13 +645,12 @@ async function withFallback( return summarize(text, maxBudget); } -async function compressAsync( +function* compressGen( messages: Message[], options: CompressOptions = {}, -): Promise { +): Generator { const sourceVersion = options.sourceVersion ?? 0; const counter = options.tokenCounter ?? defaultTokenCounter; - const userSummarizer = options.summarizer; if (messages.length === 0) { return { @@ -908,7 +736,7 @@ async function compressAsync( .join(' '); const codeFences = segments.filter((s) => s.type === 'code').map((s) => s.content); const proseBudget = proseText.length < 600 ? 200 : 400; - const summaryText = await withFallback(proseText, userSummarizer, proseBudget); + const summaryText: string = yield { text: proseText, budget: proseBudget }; const embeddedId = options.embedSummaryId ? 
makeSummaryId([msg.id]) : undefined; const compressed = `${formatSummary(summaryText, proseText, undefined, true, embeddedId)}\n\n${codeFences.join('\n\n')}`; @@ -937,7 +765,7 @@ async function compressAsync( const contentBudget = allContent.length < 600 ? 200 : 400; const summaryText = isStructuredOutput(allContent) ? summarizeStructured(allContent, contentBudget) - : await withFallback(allContent, userSummarizer, contentBudget); + : yield { text: allContent, budget: contentBudget }; if (group.length > 1) { const mergeIds = group.map((g) => g.msg.id); @@ -998,6 +826,38 @@ async function compressAsync( }; } +function runCompressSync(gen: Generator): CompressResult { + let next = gen.next(); + while (!next.done) { + const { text, budget } = next.value; + next = gen.next(summarize(text, budget)); + } + return next.value; +} + +async function runCompressAsync( + gen: Generator, + userSummarizer?: Summarizer, +): Promise { + let next = gen.next(); + while (!next.done) { + const { text, budget } = next.value; + next = gen.next(await withFallback(text, userSummarizer, budget)); + } + return next.value; +} + +function compressSync(messages: Message[], options: CompressOptions = {}): CompressResult { + return runCompressSync(compressGen(messages, options)); +} + +async function compressAsync( + messages: Message[], + options: CompressOptions = {}, +): Promise { + return runCompressAsync(compressGen(messages, options), options.summarizer); +} + // --------------------------------------------------------------------------- // Token budget helpers (absorbed from compressToFit) // --------------------------------------------------------------------------- diff --git a/src/types.ts b/src/types.ts index d885de3..16e4fd3 100644 --- a/src/types.ts +++ b/src/types.ts @@ -32,7 +32,7 @@ export type CompressOptions = { embedSummaryId?: boolean; /** Hard-truncate non-recency messages when binary search bottoms out and budget still exceeded. Default: false. 
*/ forceConverge?: boolean; - /** Custom token counter per message. Default: ceil(content.length / 3.5). */ + /** Custom token counter per message. Default: ceil(content.length / 3.5) — see defaultTokenCounter for rationale. */ tokenCounter?: (msg: Message) => number; };