From 578f6be84db8a3cd9bdc70d9bec847f5bba2d424 Mon Sep 17 00:00:00 2001 From: Lisa Date: Wed, 25 Feb 2026 12:52:04 +0100 Subject: [PATCH 01/13] feat: auto-load .env, auto-detect Ollama, persist LLM benchmark results - Add inline .env parser in bench/run.ts (no dependency, won't override existing vars) - Probe localhost:11434/api/tags to auto-detect Ollama without env vars - Add LLM result types and save/load in bench/baseline.ts - Auto-save LLM results to bench/baselines/llm/-.json - Extend doc generator with LLM comparison tables when result files exist - Add .env.example template with commented-out provider keys - Update skip message to mention Ollama auto-detection --- .env.example | 14 + bench/baseline.ts | 587 +++++++++++++++++++ bench/baselines/llm/ollama-llama3.2.json | 175 ++++++ bench/baselines/llm/openai-gpt-4.1-mini.json | 175 ++++++ bench/llm.ts | 80 ++- bench/run.ts | 129 ++-- 6 files changed, 1100 insertions(+), 60 deletions(-) create mode 100644 .env.example create mode 100644 bench/baseline.ts create mode 100644 bench/baselines/llm/ollama-llama3.2.json create mode 100644 bench/baselines/llm/openai-gpt-4.1-mini.json diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..d695863 --- /dev/null +++ b/.env.example @@ -0,0 +1,14 @@ +# LLM provider API keys for benchmark comparisons +# Copy to .env and uncomment the providers you want to test. + +# OpenAI (default model: gpt-4.1-mini) +# OPENAI_API_KEY=sk-... +# OPENAI_MODEL=gpt-4.1-mini + +# Anthropic (default model: claude-haiku-4-5-20251001) +# ANTHROPIC_API_KEY=sk-ant-... 
+# ANTHROPIC_MODEL=claude-haiku-4-5-20251001 + +# Ollama (auto-detected when running locally — no env vars required) +# OLLAMA_HOST=http://localhost:11434 +# OLLAMA_MODEL=llama3.2 diff --git a/bench/baseline.ts b/bench/baseline.ts new file mode 100644 index 0000000..25d7006 --- /dev/null +++ b/bench/baseline.ts @@ -0,0 +1,587 @@ +import { readFileSync, writeFileSync, mkdirSync, readdirSync, existsSync } from 'node:fs'; +import { dirname, join } from 'node:path'; + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +export interface BasicResult { + ratio: number; + tokenRatio: number; + compressed: number; + preserved: number; +} + +export interface TokenBudgetResult { + tokenCount: number; + fits: boolean; + recencyWindow: number | undefined; + compressed: number; + preserved: number; + deduped: number; +} + +export interface DedupResult { + rw0Base: number; + rw0Dup: number; + rw4Base: number; + rw4Dup: number; + deduped: number; +} + +export interface FuzzyDedupResult { + exact: number; + fuzzy: number; + ratio: number; +} + +export interface BenchmarkResults { + basic: Record; + tokenBudget: Record; + dedup: Record; + fuzzyDedup: Record; +} + +export interface Baseline { + version: string; + generated: string; + results: BenchmarkResults; +} + +// --------------------------------------------------------------------------- +// LLM benchmark types +// --------------------------------------------------------------------------- + +export interface LlmMethodResult { + ratio: number; + tokenRatio: number; + compressed: number; + preserved: number; + roundTrip: 'PASS' | 'FAIL'; + timeMs: number; +} + +export interface LlmScenarioResult { + methods: Record; +} + +export interface LlmBenchmarkResult { + provider: string; + model: string; + generated: string; + scenarios: Record; +} + +// 
--------------------------------------------------------------------------- +// Save / Load +// --------------------------------------------------------------------------- + +export function saveBaseline(path: string, version: string, results: BenchmarkResults): void { + const baseline: Baseline = { + version, + generated: new Date().toISOString(), + results, + }; + const dir = dirname(path); + mkdirSync(dir, { recursive: true }); + const json = JSON.stringify(baseline, null, 2) + '\n'; + writeFileSync(path, json); + // Also save a versioned copy for history + writeFileSync(join(dir, `v${version}.json`), json); +} + +export function loadBaseline(path: string): Baseline { + return JSON.parse(readFileSync(path, 'utf-8')); +} + +// --------------------------------------------------------------------------- +// LLM result persistence +// --------------------------------------------------------------------------- + +export function saveLlmResult(baselinesDir: string, result: LlmBenchmarkResult): void { + const llmDir = join(baselinesDir, 'llm'); + mkdirSync(llmDir, { recursive: true }); + const filename = `${result.provider}-${result.model.replace(/[/:]/g, '-')}.json`; + writeFileSync(join(llmDir, filename), JSON.stringify(result, null, 2) + '\n'); +} + +export function loadAllLlmResults(baselinesDir: string): LlmBenchmarkResult[] { + const llmDir = join(baselinesDir, 'llm'); + if (!existsSync(llmDir)) return []; + + const results: LlmBenchmarkResult[] = []; + for (const f of readdirSync(llmDir) + .filter((f) => f.endsWith('.json')) + .sort()) { + try { + results.push(JSON.parse(readFileSync(join(llmDir, f), 'utf-8'))); + } catch { + console.warn(` Warning: skipping malformed LLM result file: ${f}`); + } + } + return results; +} + +// --------------------------------------------------------------------------- +// Compare +// --------------------------------------------------------------------------- + +export interface Regression { + benchmark: string; + scenario: 
string; + metric: string; + expected: number | boolean; + actual: number | boolean; + delta?: string; +} + +function checkNum( + regressions: Regression[], + bench: string, + scenario: string, + metric: string, + expected: number, + actual: number, + tolerance: number, +): void { + const denom = Math.max(Math.abs(expected), 1); + const pctDiff = Math.abs(actual - expected) / denom; + if (pctDiff > tolerance) { + const sign = actual > expected ? '+' : ''; + regressions.push({ + benchmark: bench, + scenario, + metric, + expected, + actual, + delta: `${sign}${(((actual - expected) / denom) * 100).toFixed(1)}%`, + }); + } +} + +function checkBool( + regressions: Regression[], + bench: string, + scenario: string, + metric: string, + expected: boolean, + actual: boolean, +): void { + if (expected !== actual) { + regressions.push({ benchmark: bench, scenario, metric, expected, actual }); + } +} + +function missing(regressions: Regression[], bench: string, scenario: string): void { + regressions.push({ + benchmark: bench, + scenario, + metric: '(missing)', + expected: true, + actual: false, + }); +} + +export function compareResults( + baseline: BenchmarkResults, + current: BenchmarkResults, + tolerance: number = 0, +): Regression[] { + const regressions: Regression[] = []; + + // Basic + for (const [name, exp] of Object.entries(baseline.basic)) { + const act = current.basic[name]; + if (!act) { + missing(regressions, 'basic', name); + continue; + } + checkNum(regressions, 'basic', name, 'ratio', exp.ratio, act.ratio, tolerance); + checkNum(regressions, 'basic', name, 'tokenRatio', exp.tokenRatio, act.tokenRatio, tolerance); + checkNum(regressions, 'basic', name, 'compressed', exp.compressed, act.compressed, tolerance); + checkNum(regressions, 'basic', name, 'preserved', exp.preserved, act.preserved, tolerance); + } + + // Token budget + for (const [name, exp] of Object.entries(baseline.tokenBudget)) { + const act = current.tokenBudget[name]; + if (!act) { + 
missing(regressions, 'tokenBudget', name); + continue; + } + checkNum( + regressions, + 'tokenBudget', + name, + 'tokenCount', + exp.tokenCount, + act.tokenCount, + tolerance, + ); + checkBool(regressions, 'tokenBudget', name, 'fits', exp.fits, act.fits); + if (exp.recencyWindow != null && act.recencyWindow != null) { + checkNum( + regressions, + 'tokenBudget', + name, + 'recencyWindow', + exp.recencyWindow, + act.recencyWindow, + tolerance, + ); + } + checkNum( + regressions, + 'tokenBudget', + name, + 'compressed', + exp.compressed, + act.compressed, + tolerance, + ); + checkNum( + regressions, + 'tokenBudget', + name, + 'preserved', + exp.preserved, + act.preserved, + tolerance, + ); + checkNum(regressions, 'tokenBudget', name, 'deduped', exp.deduped, act.deduped, tolerance); + } + + // Dedup + for (const [name, exp] of Object.entries(baseline.dedup)) { + const act = current.dedup[name]; + if (!act) { + missing(regressions, 'dedup', name); + continue; + } + checkNum(regressions, 'dedup', name, 'rw0Base', exp.rw0Base, act.rw0Base, tolerance); + checkNum(regressions, 'dedup', name, 'rw0Dup', exp.rw0Dup, act.rw0Dup, tolerance); + checkNum(regressions, 'dedup', name, 'rw4Base', exp.rw4Base, act.rw4Base, tolerance); + checkNum(regressions, 'dedup', name, 'rw4Dup', exp.rw4Dup, act.rw4Dup, tolerance); + checkNum(regressions, 'dedup', name, 'deduped', exp.deduped, act.deduped, tolerance); + } + + // Fuzzy dedup + for (const [name, exp] of Object.entries(baseline.fuzzyDedup)) { + const act = current.fuzzyDedup[name]; + if (!act) { + missing(regressions, 'fuzzyDedup', name); + continue; + } + checkNum(regressions, 'fuzzyDedup', name, 'exact', exp.exact, act.exact, tolerance); + checkNum(regressions, 'fuzzyDedup', name, 'fuzzy', exp.fuzzy, act.fuzzy, tolerance); + checkNum(regressions, 'fuzzyDedup', name, 'ratio', exp.ratio, act.ratio, tolerance); + } + + return regressions; +} + +// --------------------------------------------------------------------------- +// Report +// 
--------------------------------------------------------------------------- + +export function formatRegressions(regressions: Regression[]): string { + if (regressions.length === 0) return 'No regressions detected.'; + + const lines: string[] = [`${regressions.length} regression(s) detected:`, '']; + + for (const r of regressions) { + const delta = r.delta ? ` (${r.delta})` : ''; + lines.push( + ` [${r.benchmark}] ${r.scenario} → ${r.metric}: expected ${r.expected}, got ${r.actual}${delta}`, + ); + } + + return lines.join('\n'); +} + +// --------------------------------------------------------------------------- +// Doc generation +// --------------------------------------------------------------------------- + +function loadAllBaselines(baselinesDir: string): Baseline[] { + const files = readdirSync(baselinesDir) + .filter((f) => f.startsWith('v') && f.endsWith('.json')) + .sort((a, b) => { + // Sort by semver: v1.0.0.json < v1.1.0.json < v2.0.0.json + const pa = a + .replace(/^v|\.json$/g, '') + .split('.') + .map(Number); + const pb = b + .replace(/^v|\.json$/g, '') + .split('.') + .map(Number); + for (let i = 0; i < 3; i++) { + if ((pa[i] ?? 0) !== (pb[i] ?? 0)) return (pa[i] ?? 0) - (pb[i] ?? 
0); + } + return 0; + }); + + return files.map((f) => loadBaseline(join(baselinesDir, f))); +} + +function fix(n: number, d: number = 2): string { + return n.toFixed(d); +} + +function generateSection(b: Baseline): string { + const lines: string[] = []; + const r = b.results; + + // Basic compression table + const basicEntries = Object.entries(r.basic); + const ratios = basicEntries.map(([, v]) => v.ratio); + const minR = Math.min(...ratios); + const maxR = Math.max(...ratios); + const avgR = ratios.reduce((a, b) => a + b, 0) / ratios.length; + + lines.push(`### Basic Compression`); + lines.push(''); + lines.push( + `**Range:** ${fix(minR)}x \u2013 ${fix(maxR)}x \u00b7 **Average:** ${fix(avgR)}x \u00b7 **Round-trip:** all PASS`, + ); + lines.push(''); + lines.push('| Scenario | Char Ratio | Token Ratio | Compressed | Preserved |'); + lines.push('| --- | ---: | ---: | ---: | ---: |'); + for (const [name, v] of basicEntries) { + lines.push( + `| ${name} | ${fix(v.ratio)} | ${fix(v.tokenRatio)} | ${v.compressed} | ${v.preserved} |`, + ); + } + + // Token budget table + lines.push(''); + lines.push('### Token Budget (target: 2000 tokens)'); + lines.push(''); + lines.push( + '| Scenario | Dedup | Tokens | Fits | recencyWindow | Compressed | Preserved | Deduped |', + ); + lines.push('| --- | --- | ---: | --- | ---: | ---: | ---: | ---: |'); + for (const [key, v] of Object.entries(r.tokenBudget)) { + const [name, dedupStr] = key.split('|'); + const dedup = dedupStr === 'dedup=true' ? 'yes' : 'no'; + lines.push( + `| ${name} | ${dedup} | ${v.tokenCount} | ${v.fits} | ${v.recencyWindow ?? 
'-'} | ${v.compressed} | ${v.preserved} | ${v.deduped} |`, + ); + } + + // Dedup comparison table + lines.push(''); + lines.push('### Dedup Effectiveness'); + lines.push(''); + lines.push( + '| Scenario | No Dedup (rw=0) | Dedup (rw=0) | No Dedup (rw=4) | Dedup (rw=4) | Deduped |', + ); + lines.push('| --- | ---: | ---: | ---: | ---: | ---: |'); + for (const [name, v] of Object.entries(r.dedup)) { + lines.push( + `| ${name} | ${fix(v.rw0Base)} | ${fix(v.rw0Dup)} | ${fix(v.rw4Base)} | ${fix(v.rw4Dup)} | ${v.deduped} |`, + ); + } + + // Fuzzy dedup table + lines.push(''); + lines.push('### Fuzzy Dedup'); + lines.push(''); + lines.push('| Scenario | Exact Deduped | Fuzzy Deduped | Ratio |'); + lines.push('| --- | ---: | ---: | ---: |'); + for (const [name, v] of Object.entries(r.fuzzyDedup)) { + lines.push(`| ${name} | ${v.exact} | ${v.fuzzy} | ${fix(v.ratio)} |`); + } + + return lines.join('\n'); +} + +export function generateBenchmarkDocs(baselinesDir: string, outputPath: string): void { + const baselines = loadAllBaselines(baselinesDir); + if (baselines.length === 0) return; + + const latest = baselines[baselines.length - 1]; + const lines: string[] = []; + + lines.push('# Benchmark Results'); + lines.push(''); + lines.push('[Back to README](../README.md) | [All docs](README.md)'); + lines.push(''); + lines.push(''); + lines.push(''); + lines.push(''); + + // --- How to run section --- + lines.push('## Running Benchmarks'); + lines.push(''); + lines.push('```bash'); + lines.push('npm run bench # Run benchmarks (no baseline check)'); + lines.push('npm run bench:check # Run and compare against baseline'); + lines.push('npm run bench:save # Run, save new baseline, regenerate this doc'); + lines.push('```'); + lines.push(''); + lines.push('### LLM benchmarks (optional)'); + lines.push(''); + lines.push( + 'Set environment variables (or add a `.env` file) to enable LLM-powered summarization comparison. 
Ollama is auto-detected when running locally.', + ); + lines.push(''); + lines.push('| Variable | Provider | Default Model | Notes |'); + lines.push('| --- | --- | --- | --- |'); + lines.push('| `OPENAI_API_KEY` | OpenAI | `gpt-4.1-mini` | |'); + lines.push('| `ANTHROPIC_API_KEY` | Anthropic | `claude-haiku-4-5-20251001` | |'); + lines.push('| *(none required)* | Ollama | `llama3.2` | Auto-detected on localhost:11434 |'); + lines.push(''); + + // --- Latest version results --- + lines.push(`## Current Results (v${latest.version})`); + lines.push(''); + lines.push(generateSection(latest)); + lines.push(''); + + // --- Version history --- + if (baselines.length > 1) { + lines.push('## Version History'); + lines.push(''); + lines.push('| Version | Date | Avg Char Ratio | Avg Token Ratio | Scenarios |'); + lines.push('| --- | --- | ---: | ---: | ---: |'); + for (const b of [...baselines].reverse()) { + const basicEntries = Object.values(b.results.basic); + const avgChr = basicEntries.reduce((s, v) => s + v.ratio, 0) / basicEntries.length; + const avgTkr = basicEntries.reduce((s, v) => s + v.tokenRatio, 0) / basicEntries.length; + const date = b.generated.split('T')[0]; + lines.push( + `| ${b.version} | ${date} | ${fix(avgChr)} | ${fix(avgTkr)} | ${basicEntries.length} |`, + ); + } + lines.push(''); + } + + // --- Per-version detail (older versions) --- + const olderVersions = baselines.slice(0, -1).reverse(); + if (olderVersions.length > 0) { + lines.push('## Previous Versions'); + lines.push(''); + for (const b of olderVersions) { + lines.push(`
`); + lines.push(`v${b.version} (${b.generated.split('T')[0]})`); + lines.push(''); + lines.push(generateSection(b)); + lines.push(''); + lines.push('
'); + lines.push(''); + } + } + + // --- Scenarios --- + lines.push('## Scenarios'); + lines.push(''); + lines.push('The benchmark covers 7 conversation types:'); + lines.push(''); + lines.push('| Scenario | Description |'); + lines.push('| --- | --- |'); + lines.push('| Coding assistant | Mixed code fences and prose discussion |'); + lines.push('| Long Q&A | Extended question-and-answer with repeated paragraphs |'); + lines.push('| Tool-heavy | Messages with `tool_calls` arrays (preserved by default) |'); + lines.push('| Short conversation | Brief exchanges, mostly under 120 chars |'); + lines.push('| Deep conversation | 25 turns of multi-paragraph prose |'); + lines.push('| Structured content | JSON, YAML, SQL, API keys, test output |'); + lines.push( + '| Agentic coding session | Repeated file reads, grep results, near-duplicate edits |', + ); + lines.push(''); + + // --- Interpreting results --- + lines.push('## Interpreting Results'); + lines.push(''); + lines.push('### Compression ratio'); + lines.push(''); + lines.push('| Ratio | Reduction |'); + lines.push('| ---: | --- |'); + lines.push('| 1.0x | no compression (all messages preserved) |'); + lines.push('| 1.5x | 33% reduction |'); + lines.push('| 2.0x | 50% reduction |'); + lines.push('| 3.0x | 67% reduction |'); + lines.push('| 6.0x | 83% reduction |'); + lines.push(''); + lines.push( + 'Higher is better. Token ratio is more meaningful for LLM context budgeting; character ratio is useful for storage.', + ); + lines.push(''); + + // --- Regression testing --- + lines.push('## Regression Testing'); + lines.push(''); + lines.push( + 'Baselines are stored in [`bench/baselines/`](../bench/baselines/) as JSON. 
CI runs `npm run bench:check` on every push and PR to catch regressions.', + ); + lines.push(''); + lines.push('- **Tolerance:** 0% by default (all metrics are deterministic)'); + lines.push('- **On regression:** CI fails with a diff showing which metrics changed'); + lines.push( + '- **After intentional changes:** run `npm run bench:save` to update the baseline and regenerate this doc', + ); + lines.push( + '- **Custom tolerance:** `npx tsx bench/run.ts --check --tolerance 5` allows 5% deviation', + ); + lines.push(''); + lines.push('### Baseline files'); + lines.push(''); + lines.push('| File | Purpose |'); + lines.push('| --- | --- |'); + lines.push('| `bench/baselines/current.json` | Active baseline compared in CI |'); + lines.push('| `bench/baselines/v*.json` | Versioned snapshots, one per release |'); + lines.push(''); + + // --- LLM comparison (if result files exist) --- + const llmResults = loadAllLlmResults(baselinesDir); + if (llmResults.length > 0) { + lines.push('## LLM Summarization Comparison'); + lines.push(''); + lines.push( + '> Results are **non-deterministic** — LLM outputs vary between runs. These are saved as reference data, not used for regression testing.', + ); + lines.push(''); + + for (const llm of llmResults) { + lines.push(`### ${llm.provider} (${llm.model})`); + lines.push(''); + lines.push(`*Generated: ${llm.generated.split('T')[0]}*`); + lines.push(''); + lines.push( + '| Scenario | Method | Char Ratio | Token Ratio | Compressed | Preserved | Round-trip | Time |', + ); + lines.push('| --- | --- | ---: | ---: | ---: | ---: | --- | ---: |'); + + for (const [scenario, sr] of Object.entries(llm.scenarios)) { + let first = true; + for (const [method, mr] of Object.entries(sr.methods)) { + const label = first ? scenario : ''; + const time = + mr.timeMs < 1000 ? 
`${Math.round(mr.timeMs)}ms` : `${(mr.timeMs / 1000).toFixed(1)}s`; + lines.push( + `| ${label} | ${method} | ${fix(mr.ratio)} | ${fix(mr.tokenRatio)} | ${mr.compressed} | ${mr.preserved} | ${mr.roundTrip} | ${time} |`, + ); + first = false; + } + } + + lines.push(''); + } + } + + // --- Methodology --- + lines.push('## Methodology'); + lines.push(''); + lines.push('- All results are **deterministic** — same input always produces the same output'); + lines.push('- Metrics tracked: compression ratio, token ratio, message counts, dedup counts'); + lines.push('- Timing is excluded from baselines (hardware-dependent)'); + lines.push( + '- Real-session and LLM benchmarks are excluded from baselines (environment-dependent)', + ); + lines.push('- Round-trip integrity is verified for every scenario (compress then uncompress)'); + lines.push(''); + + writeFileSync(outputPath, lines.join('\n')); +} diff --git a/bench/baselines/llm/ollama-llama3.2.json b/bench/baselines/llm/ollama-llama3.2.json new file mode 100644 index 0000000..df2daf8 --- /dev/null +++ b/bench/baselines/llm/ollama-llama3.2.json @@ -0,0 +1,175 @@ +{ + "provider": "ollama", + "model": "llama3.2", + "generated": "2026-02-25T09:04:08.797Z", + "scenarios": { + "Coding assistant": { + "methods": { + "deterministic": { + "ratio": 1.6812907904278462, + "tokenRatio": 1.6729559748427674, + "compressed": 5, + "preserved": 8, + "roundTrip": "PASS", + "timeMs": 0.25633400000515394 + }, + "llm-basic": { + "ratio": 1.542581503659348, + "tokenRatio": 1.5375722543352601, + "compressed": 5, + "preserved": 8, + "roundTrip": "PASS", + "timeMs": 4017.3837080000085 + }, + "llm-escalate": { + "ratio": 1.4490625, + "tokenRatio": 1.4472252448313385, + "compressed": 5, + "preserved": 8, + "roundTrip": "PASS", + "timeMs": 4231.839750000014 + } + } + }, + "Long Q&A": { + "methods": { + "deterministic": { + "ratio": 6.158536585365853, + "tokenRatio": 6.114164904862579, + "compressed": 4, + "preserved": 6, + "roundTrip": "PASS", + 
"timeMs": 0.9851250000065193 + }, + "llm-basic": { + "ratio": 4.482911673324456, + "tokenRatio": 4.449230769230769, + "compressed": 4, + "preserved": 6, + "roundTrip": "PASS", + "timeMs": 4514.40400000001 + }, + "llm-escalate": { + "ratio": 4.46113074204947, + "tokenRatio": 4.435582822085889, + "compressed": 4, + "preserved": 6, + "roundTrip": "PASS", + "timeMs": 4495.341832999984 + } + } + }, + "Tool-heavy": { + "methods": { + "deterministic": { + "ratio": 1.2991563919532771, + "tokenRatio": 1.2946428571428572, + "compressed": 2, + "preserved": 16, + "roundTrip": "PASS", + "timeMs": 0.43775000001187436 + }, + "llm-basic": { + "ratio": 1.1762632197414806, + "tokenRatio": 1.174089068825911, + "compressed": 2, + "preserved": 16, + "roundTrip": "PASS", + "timeMs": 2554.344207999995 + }, + "llm-escalate": { + "ratio": 1.0945872061235649, + "tokenRatio": 1.0922787193973635, + "compressed": 2, + "preserved": 16, + "roundTrip": "PASS", + "timeMs": 3072.135583000025 + } + } + }, + "Deep conversation": { + "methods": { + "deterministic": { + "ratio": 2.124913733609386, + "tokenRatio": 2.1241305510968433, + "compressed": 50, + "preserved": 1, + "roundTrip": "PASS", + "timeMs": 2.759959000017261 + }, + "llm-basic": { + "ratio": 3.1500511538024325, + "tokenRatio": 3.133385951065509, + "compressed": 50, + "preserved": 1, + "roundTrip": "PASS", + "timeMs": 24927.85712499998 + }, + "llm-escalate": { + "ratio": 3.3075913105753165, + "tokenRatio": 3.2891466445733224, + "compressed": 50, + "preserved": 1, + "roundTrip": "PASS", + "timeMs": 25502.586582999997 + } + } + }, + "Structured content": { + "methods": { + "deterministic": { + "ratio": 1.9338990620812864, + "tokenRatio": 1.9241486068111455, + "compressed": 2, + "preserved": 10, + "roundTrip": "PASS", + "timeMs": 0.7937499999825377 + }, + "llm-basic": { + "ratio": 1.338071693448702, + "tokenRatio": 1.3365591397849463, + "compressed": 2, + "preserved": 10, + "roundTrip": "PASS", + "timeMs": 4280.396999999997 + }, + 
"llm-escalate": { + "ratio": 1.338899196042053, + "tokenRatio": 1.3365591397849463, + "compressed": 2, + "preserved": 10, + "roundTrip": "PASS", + "timeMs": 4270.229416999995 + } + } + }, + "Agentic coding session": { + "methods": { + "deterministic": { + "ratio": 1.428351309707242, + "tokenRatio": 1.4258962011771001, + "compressed": 2, + "preserved": 31, + "roundTrip": "PASS", + "timeMs": 6.805083999992348 + }, + "llm-basic": { + "ratio": 1.3898050974512743, + "tokenRatio": 1.3880208333333333, + "compressed": 2, + "preserved": 31, + "roundTrip": "PASS", + "timeMs": 4489.758750000008 + }, + "llm-escalate": { + "ratio": 1.367660076718796, + "tokenRatio": 1.3659661711942594, + "compressed": 2, + "preserved": 31, + "roundTrip": "PASS", + "timeMs": 2974.9285839999793 + } + } + } + } +} diff --git a/bench/baselines/llm/openai-gpt-4.1-mini.json b/bench/baselines/llm/openai-gpt-4.1-mini.json new file mode 100644 index 0000000..986174d --- /dev/null +++ b/bench/baselines/llm/openai-gpt-4.1-mini.json @@ -0,0 +1,175 @@ +{ + "provider": "openai", + "model": "gpt-4.1-mini", + "generated": "2026-02-25T09:01:27.143Z", + "scenarios": { + "Coding assistant": { + "methods": { + "deterministic": { + "ratio": 1.6812907904278462, + "tokenRatio": 1.6729559748427674, + "compressed": 5, + "preserved": 8, + "roundTrip": "PASS", + "timeMs": 0.24499999999989086 + }, + "llm-basic": { + "ratio": 1.6385159010600707, + "tokenRatio": 1.6319018404907975, + "compressed": 5, + "preserved": 8, + "roundTrip": "PASS", + "timeMs": 5283.475208 + }, + "llm-escalate": { + "ratio": 1.634473034896017, + "tokenRatio": 1.627906976744186, + "compressed": 5, + "preserved": 8, + "roundTrip": "PASS", + "timeMs": 5459.967416999999 + } + } + }, + "Long Q&A": { + "methods": { + "deterministic": { + "ratio": 6.158536585365853, + "tokenRatio": 6.114164904862579, + "compressed": 4, + "preserved": 6, + "roundTrip": "PASS", + "timeMs": 0.747084000000541 + }, + "llm-basic": { + "ratio": 5.34957627118644, + "tokenRatio": 
5.3161764705882355, + "compressed": 4, + "preserved": 6, + "roundTrip": "PASS", + "timeMs": 4945.921291999999 + }, + "llm-escalate": { + "ratio": 5.324196099103848, + "tokenRatio": 5.287020109689214, + "compressed": 4, + "preserved": 6, + "roundTrip": "PASS", + "timeMs": 6080.180457999999 + } + } + }, + "Tool-heavy": { + "methods": { + "deterministic": { + "ratio": 1.2991563919532771, + "tokenRatio": 1.2946428571428572, + "compressed": 2, + "preserved": 16, + "roundTrip": "PASS", + "timeMs": 0.2543749999967986 + }, + "llm-basic": { + "ratio": 1.1159420289855073, + "tokenRatio": 1.1153846153846154, + "compressed": 2, + "preserved": 16, + "roundTrip": "PASS", + "timeMs": 3140.399959000002 + }, + "llm-escalate": { + "ratio": 1.114079020589872, + "tokenRatio": 1.1132437619961613, + "compressed": 2, + "preserved": 16, + "roundTrip": "PASS", + "timeMs": 9754.72075 + } + } + }, + "Deep conversation": { + "methods": { + "deterministic": { + "ratio": 2.124913733609386, + "tokenRatio": 2.1241305510968433, + "compressed": 50, + "preserved": 1, + "roundTrip": "PASS", + "timeMs": 2.977333999995608 + }, + "llm-basic": { + "ratio": 2.3589852728356178, + "tokenRatio": 2.350503256364713, + "compressed": 50, + "preserved": 1, + "roundTrip": "PASS", + "timeMs": 51435.262625 + }, + "llm-escalate": { + "ratio": 2.3507804546996947, + "tokenRatio": 2.344949793266391, + "compressed": 50, + "preserved": 1, + "roundTrip": "PASS", + "timeMs": 50272.36633300001 + } + } + }, + "Structured content": { + "methods": { + "deterministic": { + "ratio": 1.9338990620812864, + "tokenRatio": 1.9241486068111455, + "compressed": 2, + "preserved": 10, + "roundTrip": "PASS", + "timeMs": 0.5064999999885913 + }, + "llm-basic": { + "ratio": 1.2727807172251617, + "tokenRatio": 1.270961145194274, + "compressed": 2, + "preserved": 10, + "roundTrip": "PASS", + "timeMs": 4792.548290999985 + }, + "llm-escalate": { + "ratio": 1.3229453101130462, + "tokenRatio": 1.3209351753453773, + "compressed": 2, + "preserved": 
10, + "roundTrip": "PASS", + "timeMs": 4862.093292000005 + } + } + }, + "Agentic coding session": { + "methods": { + "deterministic": { + "ratio": 1.428351309707242, + "tokenRatio": 1.4258962011771001, + "compressed": 2, + "preserved": 31, + "roundTrip": "PASS", + "timeMs": 0.7409169999882579 + }, + "llm-basic": { + "ratio": 1.3914740318222756, + "tokenRatio": 1.389468196037539, + "compressed": 2, + "preserved": 31, + "roundTrip": "PASS", + "timeMs": 4425.688000000024 + }, + "llm-escalate": { + "ratio": 1.348559790514984, + "tokenRatio": 1.3473205257836198, + "compressed": 2, + "preserved": 31, + "roundTrip": "PASS", + "timeMs": 11189.863042000012 + } + } + } + } +} diff --git a/bench/llm.ts b/bench/llm.ts index 68c7197..e4615ef 100644 --- a/bench/llm.ts +++ b/bench/llm.ts @@ -6,7 +6,7 @@ * * Supported providers: * - OpenAI: OPENAI_API_KEY (model override: OPENAI_MODEL, default gpt-4.1-mini) - * - Ollama: OLLAMA_MODEL or OLLAMA_HOST (default host http://localhost:11434, model llama3.2) + * - Ollama: Auto-detected on localhost:11434, or OLLAMA_MODEL/OLLAMA_HOST (model default llama3.2) * - Anthropic: ANTHROPIC_API_KEY (model override: ANTHROPIC_MODEL, default claude-haiku-4-5-20251001) * * SDKs are dynamically imported — missing packages print a skip message @@ -47,31 +47,59 @@ export async function detectProviders(): Promise { } } - // --- Ollama (OpenAI-compatible API) --- - if (process.env.OLLAMA_MODEL || process.env.OLLAMA_HOST) { - try { - const { default: OpenAI } = await import('openai'); - const host = process.env.OLLAMA_HOST ?? 'http://localhost:11434'; - const model = process.env.OLLAMA_MODEL ?? 'llama3.2'; - const client = new OpenAI({ baseURL: `${host}/v1`, apiKey: 'ollama' }); + // --- Ollama (auto-detected or via env vars) --- + { + const host = process.env.OLLAMA_HOST ?? 'http://localhost:11434'; + const model = process.env.OLLAMA_MODEL ?? 
'llama3.2'; + const hasEnv = !!(process.env.OLLAMA_MODEL || process.env.OLLAMA_HOST); - providers.push({ - name: 'ollama', - model, - callLlm: async (prompt: string): Promise => { - const r = await client.chat.completions.create({ - model, - messages: [{ role: 'user', content: prompt }], - max_tokens: 400, - temperature: 0.3, - }); - return r.choices[0]?.message?.content ?? ''; - }, - }); - } catch (err) { - console.log( - ` OpenAI SDK not installed (needed for Ollama), skipping (${(err as Error).message})`, - ); + // Auto-detect: probe the Ollama API with a short timeout + let ollamaAvailable = hasEnv; + if (!hasEnv) { + try { + const res = await fetch(`${host}/api/tags`, { + signal: AbortSignal.timeout(2000), + }); + if (res.ok) { + const data = (await res.json()) as { models?: { name: string }[] }; + const models = data.models ?? []; + const hasModel = models.some((m) => m.name === model || m.name === `${model}:latest`); + if (hasModel) { + ollamaAvailable = true; + } else if (models.length > 0) { + console.log( + ` Ollama running but model "${model}" not found (available: ${models.map((m) => m.name).join(', ')})`, + ); + } + } + } catch { + // Not running — skip silently + } + } + + if (ollamaAvailable) { + try { + const { default: OpenAI } = await import('openai'); + const client = new OpenAI({ baseURL: `${host}/v1`, apiKey: 'ollama' }); + + providers.push({ + name: 'ollama', + model, + callLlm: async (prompt: string): Promise => { + const r = await client.chat.completions.create({ + model, + messages: [{ role: 'user', content: prompt }], + max_tokens: 400, + temperature: 0.3, + }); + return r.choices[0]?.message?.content ?? 
''; + }, + }); + } catch (err) { + console.log( + ` Ollama detected but openai SDK not installed — run \`npm install openai\` (${(err as Error).message})`, + ); + } } } @@ -92,7 +120,7 @@ export async function detectProviders(): Promise { messages: [{ role: 'user', content: prompt }], }); const block = msg.content[0]; - return block.type === 'text' ? block.text : ''; + return block?.type === 'text' ? block.text : ''; }, }); } catch (err) { diff --git a/bench/run.ts b/bench/run.ts index 3b69ff7..e887fed 100644 --- a/bench/run.ts +++ b/bench/run.ts @@ -3,9 +3,26 @@ import { uncompress } from '../src/expand.js'; import { createSummarizer, createEscalatingSummarizer } from '../src/summarizer.js'; import type { CompressResult, Message } from '../src/types.js'; import { readFileSync, readdirSync, statSync, existsSync } from 'node:fs'; -import { join } from 'node:path'; +import { join, resolve } from 'node:path'; import { homedir } from 'node:os'; import { detectProviders } from './llm.js'; +import type { LlmBenchmarkResult, LlmMethodResult } from './baseline.js'; +import { saveLlmResult } from './baseline.js'; + +// --------------------------------------------------------------------------- +// Auto-load .env (no dependency, won't override existing vars) +// --------------------------------------------------------------------------- + +const envPath = resolve(import.meta.dirname, '..', '.env'); +if (existsSync(envPath)) { + for (const line of readFileSync(envPath, 'utf-8').split('\n')) { + const match = line.match(/^\s*(?:export\s+)?([^#=]+?)\s*=\s*(.*?)\s*$/); + if (!match || process.env[match[1]]) continue; + // Strip wrapping quotes (single or double) + const val = match[2].replace(/^(['"])(.*)\1$/, '$2'); + process.env[match[1]] = val; + } +} // --------------------------------------------------------------------------- // Helpers @@ -936,12 +953,13 @@ async function runLlmBenchmark(): Promise { if (providers.length === 0) { console.log(); console.log( - 'LLM 
Summarization Benchmark — skipped (no OPENAI_API_KEY, OLLAMA_MODEL, or ANTHROPIC_API_KEY set)', + 'LLM Summarization Benchmark — skipped (no providers detected: set OPENAI_API_KEY or ANTHROPIC_API_KEY, or start Ollama)', ); return; } const scenarios = buildScenarios().filter((s) => s.name !== 'Short conversation'); + const baselinesDir = resolve(import.meta.dirname, 'baselines'); for (const provider of providers) { console.log(); @@ -978,42 +996,85 @@ async function runLlmBenchmark(): Promise { console.log(sep); let llmFails = 0; + const llmResult: LlmBenchmarkResult = { + provider: provider.name, + model: provider.model, + generated: new Date().toISOString(), + scenarios: {}, + }; for (const scenario of scenarios) { - // Deterministic baseline - const t0d = performance.now(); - const detResult = compress(scenario.messages, { recencyWindow: 0 }); - const t1d = performance.now(); - const detRt = roundTrip(scenario.messages, detResult); - - printLlmRow(scenario.name, 'deterministic', detResult, detRt, t1d - t0d, cols); - - // LLM basic summarizer - const t0b = performance.now(); - const llmBasicResult = await compress(scenario.messages, { - recencyWindow: 0, - summarizer: basicSummarizer, - }); - const t1b = performance.now(); - const basicRt = roundTrip(scenario.messages, llmBasicResult); - if (basicRt === 'FAIL') llmFails++; - - printLlmRow('', 'llm-basic', llmBasicResult, basicRt, t1b - t0b, cols); - - // LLM escalating summarizer - const t0e = performance.now(); - const llmEscResult = await compress(scenario.messages, { - recencyWindow: 0, - summarizer: escalatingSummarizer, - }); - const t1e = performance.now(); - const escRt = roundTrip(scenario.messages, llmEscResult); - if (escRt === 'FAIL') llmFails++; - - printLlmRow('', 'llm-escalate', llmEscResult, escRt, t1e - t0e, cols); - console.log(sep); + try { + const scenarioResult: Record = {}; + + // Deterministic baseline + const t0d = performance.now(); + const detResult = compress(scenario.messages, { 
recencyWindow: 0 }); + const t1d = performance.now(); + const detRt = roundTrip(scenario.messages, detResult); + + printLlmRow(scenario.name, 'deterministic', detResult, detRt, t1d - t0d, cols); + scenarioResult['deterministic'] = { + ratio: detResult.compression.ratio, + tokenRatio: detResult.compression.token_ratio, + compressed: detResult.compression.messages_compressed, + preserved: detResult.compression.messages_preserved, + roundTrip: detRt, + timeMs: t1d - t0d, + }; + + // LLM basic summarizer + const t0b = performance.now(); + const llmBasicResult = await compress(scenario.messages, { + recencyWindow: 0, + summarizer: basicSummarizer, + }); + const t1b = performance.now(); + const basicRt = roundTrip(scenario.messages, llmBasicResult); + if (basicRt === 'FAIL') llmFails++; + + printLlmRow('', 'llm-basic', llmBasicResult, basicRt, t1b - t0b, cols); + scenarioResult['llm-basic'] = { + ratio: llmBasicResult.compression.ratio, + tokenRatio: llmBasicResult.compression.token_ratio, + compressed: llmBasicResult.compression.messages_compressed, + preserved: llmBasicResult.compression.messages_preserved, + roundTrip: basicRt, + timeMs: t1b - t0b, + }; + + // LLM escalating summarizer + const t0e = performance.now(); + const llmEscResult = await compress(scenario.messages, { + recencyWindow: 0, + summarizer: escalatingSummarizer, + }); + const t1e = performance.now(); + const escRt = roundTrip(scenario.messages, llmEscResult); + if (escRt === 'FAIL') llmFails++; + + printLlmRow('', 'llm-escalate', llmEscResult, escRt, t1e - t0e, cols); + scenarioResult['llm-escalate'] = { + ratio: llmEscResult.compression.ratio, + tokenRatio: llmEscResult.compression.token_ratio, + compressed: llmEscResult.compression.messages_compressed, + preserved: llmEscResult.compression.messages_preserved, + roundTrip: escRt, + timeMs: t1e - t0e, + }; + + console.log(sep); + llmResult.scenarios[scenario.name] = { methods: scenarioResult }; + } catch (err) { + console.error(` ${scenario.name}: 
ERROR — ${(err as Error).message}`); + console.log(sep); + } } + // Always save LLM results (informational, not gated behind --save) + saveLlmResult(baselinesDir, llmResult); + console.log(` Results saved to bench/baselines/llm/`); + if (llmFails > 0) { console.error(` WARNING: ${llmFails} LLM scenario(s) failed round-trip`); } From 8958dc5f715c15cc7c7171879b454dbe7298bcc0 Mon Sep 17 00:00:00 2001 From: Lisa Date: Wed, 25 Feb 2026 14:09:56 +0100 Subject: [PATCH 02/13] feat(bench): gate LLM benchmarks behind --llm flag, add scenarios and metrics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LLM benchmarks previously ran automatically when API keys were detected, silently burning money on every `npm run bench`. Now requires explicit `--llm` flag (`npm run bench:llm`). Additions: - Technical explanation scenario (pure prose, no code fences) - vsDet expansion metric (LLM ratio / deterministic ratio) - Token budget + LLM section (deterministic vs llm-escalate) - bench:llm npm script Fixes: - .env parser: strip quotes, handle `export` prefix - loadAllLlmResults: try/catch per file for malformed JSON - Ollama: verify model availability via /api/tags response - Anthropic: guard against empty content array - LLM benchmark loop: per-scenario try/catch - Doc generation: scenario count 7→8, add Technical explanation --- .env.example | 2 +- bench/baseline.ts | 54 +++- bench/baselines/llm/ollama-llama3.2.json | 174 ++++++++--- bench/baselines/llm/openai-gpt-4.1-mini.json | 178 ++++++++--- bench/run.ts | 303 +++++++++++++++++- docs/benchmarks.md | 304 +++++++++++-------- package.json | 1 + 7 files changed, 789 insertions(+), 227 deletions(-) diff --git a/.env.example b/.env.example index d695863..77c0edf 100644 --- a/.env.example +++ b/.env.example @@ -1,4 +1,4 @@ -# LLM provider API keys for benchmark comparisons +# LLM provider API keys for benchmark comparisons (npm run bench:llm) # Copy to .env and uncomment the providers you want 
to test. # OpenAI (default model: gpt-4.1-mini) diff --git a/bench/baseline.ts b/bench/baseline.ts index 25d7006..ea0f2a3 100644 --- a/bench/baseline.ts +++ b/bench/baseline.ts @@ -59,17 +59,31 @@ export interface LlmMethodResult { preserved: number; roundTrip: 'PASS' | 'FAIL'; timeMs: number; + /** ratio / deterministic ratio — values < 1.0 mean LLM expanded instead of compressing */ + vsDet?: number; } export interface LlmScenarioResult { methods: Record; } +export interface LlmTokenBudgetResult { + budget: number; + method: string; + tokenCount: number; + fits: boolean; + ratio: number; + recencyWindow: number | undefined; + roundTrip: 'PASS' | 'FAIL'; + timeMs: number; +} + export interface LlmBenchmarkResult { provider: string; model: string; generated: string; scenarios: Record; + tokenBudget?: Record; } // --------------------------------------------------------------------------- @@ -421,10 +435,10 @@ export function generateBenchmarkDocs(baselinesDir: string, outputPath: string): lines.push('npm run bench:save # Run, save new baseline, regenerate this doc'); lines.push('```'); lines.push(''); - lines.push('### LLM benchmarks (optional)'); + lines.push('### LLM benchmarks (opt-in)'); lines.push(''); lines.push( - 'Set environment variables (or add a `.env` file) to enable LLM-powered summarization comparison. Ollama is auto-detected when running locally.', + 'LLM benchmarks require the `--llm` flag (`npm run bench:llm`). Set API keys in a `.env` file or export them. 
Ollama is auto-detected when running locally.', ); lines.push(''); lines.push('| Variable | Provider | Default Model | Notes |'); @@ -477,7 +491,7 @@ export function generateBenchmarkDocs(baselinesDir: string, outputPath: string): // --- Scenarios --- lines.push('## Scenarios'); lines.push(''); - lines.push('The benchmark covers 7 conversation types:'); + lines.push('The benchmark covers 8 conversation types:'); lines.push(''); lines.push('| Scenario | Description |'); lines.push('| --- | --- |'); @@ -486,6 +500,7 @@ export function generateBenchmarkDocs(baselinesDir: string, outputPath: string): lines.push('| Tool-heavy | Messages with `tool_calls` arrays (preserved by default) |'); lines.push('| Short conversation | Brief exchanges, mostly under 120 chars |'); lines.push('| Deep conversation | 25 turns of multi-paragraph prose |'); + lines.push('| Technical explanation | Pure prose Q&A about event-driven architecture |'); lines.push('| Structured content | JSON, YAML, SQL, API keys, test output |'); lines.push( '| Agentic coding session | Repeated file reads, grep results, near-duplicate edits |', @@ -550,9 +565,9 @@ export function generateBenchmarkDocs(baselinesDir: string, outputPath: string): lines.push(`*Generated: ${llm.generated.split('T')[0]}*`); lines.push(''); lines.push( - '| Scenario | Method | Char Ratio | Token Ratio | Compressed | Preserved | Round-trip | Time |', + '| Scenario | Method | Char Ratio | Token Ratio | vs Det | Compressed | Preserved | Round-trip | Time |', ); - lines.push('| --- | --- | ---: | ---: | ---: | ---: | --- | ---: |'); + lines.push('| --- | --- | ---: | ---: | ---: | ---: | ---: | --- | ---: |'); for (const [scenario, sr] of Object.entries(llm.scenarios)) { let first = true; @@ -560,13 +575,40 @@ export function generateBenchmarkDocs(baselinesDir: string, outputPath: string): const label = first ? scenario : ''; const time = mr.timeMs < 1000 ? 
`${Math.round(mr.timeMs)}ms` : `${(mr.timeMs / 1000).toFixed(1)}s`; + const vsDet = mr.vsDet != null ? fix(mr.vsDet) : '-'; lines.push( - `| ${label} | ${method} | ${fix(mr.ratio)} | ${fix(mr.tokenRatio)} | ${mr.compressed} | ${mr.preserved} | ${mr.roundTrip} | ${time} |`, + `| ${label} | ${method} | ${fix(mr.ratio)} | ${fix(mr.tokenRatio)} | ${vsDet} | ${mr.compressed} | ${mr.preserved} | ${mr.roundTrip} | ${time} |`, ); first = false; } } + // Token budget table (if present) + if (llm.tokenBudget && Object.keys(llm.tokenBudget).length > 0) { + lines.push(''); + lines.push('#### Token Budget (target: 2000 tokens)'); + lines.push(''); + lines.push( + '| Scenario | Method | Tokens | Fits | recencyWindow | Ratio | Round-trip | Time |', + ); + lines.push('| --- | --- | ---: | --- | ---: | ---: | --- | ---: |'); + + for (const [scenario, entries] of Object.entries(llm.tokenBudget)) { + let first = true; + for (const entry of entries) { + const label = first ? scenario : ''; + const time = + entry.timeMs < 1000 + ? `${Math.round(entry.timeMs)}ms` + : `${(entry.timeMs / 1000).toFixed(1)}s`; + lines.push( + `| ${label} | ${entry.method} | ${entry.tokenCount} | ${entry.fits} | ${entry.recencyWindow ?? 
'-'} | ${fix(entry.ratio)} | ${entry.roundTrip} | ${time} |`, + ); + first = false; + } + } + } + lines.push(''); } } diff --git a/bench/baselines/llm/ollama-llama3.2.json b/bench/baselines/llm/ollama-llama3.2.json index df2daf8..a0f393b 100644 --- a/bench/baselines/llm/ollama-llama3.2.json +++ b/bench/baselines/llm/ollama-llama3.2.json @@ -1,7 +1,7 @@ { "provider": "ollama", "model": "llama3.2", - "generated": "2026-02-25T09:04:08.797Z", + "generated": "2026-02-25T12:21:05.747Z", "scenarios": { "Coding assistant": { "methods": { @@ -11,23 +11,25 @@ "compressed": 5, "preserved": 8, "roundTrip": "PASS", - "timeMs": 0.25633400000515394 + "timeMs": 0.25966599996900186 }, "llm-basic": { - "ratio": 1.542581503659348, - "tokenRatio": 1.5375722543352601, + "ratio": 1.4847902657700929, + "tokenRatio": 1.4810690423162582, "compressed": 5, "preserved": 8, "roundTrip": "PASS", - "timeMs": 4017.3837080000085 + "timeMs": 5869.715916000016, + "vsDet": 0.883125200128082 }, "llm-escalate": { - "ratio": 1.4490625, - "tokenRatio": 1.4472252448313385, + "ratio": 1.5518741633199464, + "tokenRatio": 1.5501165501165501, "compressed": 5, "preserved": 8, "roundTrip": "PASS", - "timeMs": 4231.839750000014 + "timeMs": 3001.2509999999893, + "vsDet": 0.9230254350736279 } } }, @@ -39,23 +41,25 @@ "compressed": 4, "preserved": 6, "roundTrip": "PASS", - "timeMs": 0.9851250000065193 + "timeMs": 0.73641700000735 }, "llm-basic": { - "ratio": 4.482911673324456, - "tokenRatio": 4.449230769230769, + "ratio": 4.308873720136519, + "tokenRatio": 4.2844444444444445, "compressed": 4, "preserved": 6, "roundTrip": "PASS", - "timeMs": 4514.40400000001 + "timeMs": 4080.273957999947, + "vsDet": 0.6996587030716723 }, "llm-escalate": { - "ratio": 4.46113074204947, - "tokenRatio": 4.435582822085889, + "ratio": 4.486894713460684, + "tokenRatio": 4.456086286594761, "compressed": 4, "preserved": 6, "roundTrip": "PASS", - "timeMs": 4495.341832999984 + "timeMs": 3666.4759170000325, + "vsDet": 0.7285650821856953 } } }, 
@@ -67,23 +71,25 @@ "compressed": 2, "preserved": 16, "roundTrip": "PASS", - "timeMs": 0.43775000001187436 + "timeMs": 1.655417000001762 }, "llm-basic": { - "ratio": 1.1762632197414806, - "tokenRatio": 1.174089068825911, + "ratio": 1.1153203342618385, + "tokenRatio": 1.1132437619961613, "compressed": 2, "preserved": 16, "roundTrip": "PASS", - "timeMs": 2554.344207999995 + "timeMs": 2252.8222499999683, + "vsDet": 0.8584958217270195 }, "llm-escalate": { - "ratio": 1.0945872061235649, - "tokenRatio": 1.0922787193973635, + "ratio": 1.2816901408450705, + "tokenRatio": 1.277533039647577, "compressed": 2, "preserved": 16, "roundTrip": "PASS", - "timeMs": 3072.135583000025 + "timeMs": 2796.051916999975, + "vsDet": 0.9865556978233034 } } }, @@ -95,23 +101,55 @@ "compressed": 50, "preserved": 1, "roundTrip": "PASS", - "timeMs": 2.759959000017261 + "timeMs": 2.8401660000090487 }, "llm-basic": { - "ratio": 3.1500511538024325, - "tokenRatio": 3.133385951065509, + "ratio": 3.123774095366926, + "tokenRatio": 3.1088488645262333, "compressed": 50, "preserved": 1, "roundTrip": "PASS", - "timeMs": 24927.85712499998 + "timeMs": 22697.48683300003, + "vsDet": 1.470071017923571 }, "llm-escalate": { - "ratio": 3.3075913105753165, - "tokenRatio": 3.2891466445733224, + "ratio": 3.2790202342918, + "tokenRatio": 3.255432554325543, "compressed": 50, "preserved": 1, "roundTrip": "PASS", - "timeMs": 25502.586582999997 + "timeMs": 23293.247875, + "vsDet": 1.5431309904153354 + } + } + }, + "Technical explanation": { + "methods": { + "deterministic": { + "ratio": 1, + "tokenRatio": 1, + "compressed": 0, + "preserved": 11, + "roundTrip": "PASS", + "timeMs": 0.6284590000286698 + }, + "llm-basic": { + "ratio": 1, + "tokenRatio": 1, + "compressed": 0, + "preserved": 11, + "roundTrip": "PASS", + "timeMs": 3207.201915999991, + "vsDet": 1 + }, + "llm-escalate": { + "ratio": 1.0009776232891592, + "tokenRatio": 1.0007587253414265, + "compressed": 2, + "preserved": 9, + "roundTrip": "PASS", + "timeMs": 
784.6597920000786, + "vsDet": 1.0009776232891592 } } }, @@ -123,23 +161,25 @@ "compressed": 2, "preserved": 10, "roundTrip": "PASS", - "timeMs": 0.7937499999825377 + "timeMs": 0.48375000001396984 }, "llm-basic": { - "ratio": 1.338071693448702, - "tokenRatio": 1.3365591397849463, + "ratio": 1.4554621848739495, + "tokenRatio": 1.4521028037383177, "compressed": 2, "preserved": 10, "roundTrip": "PASS", - "timeMs": 4280.396999999997 + "timeMs": 3480.8887089999625, + "vsDet": 0.7526050420168067 }, "llm-escalate": { - "ratio": 1.338899196042053, - "tokenRatio": 1.3365591397849463, + "ratio": 1.3816209317166561, + "tokenRatio": 1.3795782463928967, "compressed": 2, "preserved": 10, "roundTrip": "PASS", - "timeMs": 4270.229416999995 + "timeMs": 3686.4468750000233, + "vsDet": 0.7144224633056797 } } }, @@ -151,25 +191,73 @@ "compressed": 2, "preserved": 31, "roundTrip": "PASS", - "timeMs": 6.805083999992348 + "timeMs": 0.749125000089407 }, "llm-basic": { - "ratio": 1.3898050974512743, - "tokenRatio": 1.3880208333333333, + "ratio": 1.3462097008422886, + "tokenRatio": 1.34460141271443, "compressed": 2, "preserved": 31, "roundTrip": "PASS", - "timeMs": 4489.758750000008 + "timeMs": 3328.690416999976, + "vsDet": 0.9424920127795526 }, "llm-escalate": { - "ratio": 1.367660076718796, - "tokenRatio": 1.3659661711942594, + "ratio": 1.3975576662143827, + "tokenRatio": 1.3952879581151831, "compressed": 2, "preserved": 31, "roundTrip": "PASS", - "timeMs": 2974.9285839999793 + "timeMs": 5422.445708999992, + "vsDet": 0.978441127694859 } } } + }, + "tokenBudget": { + "Deep conversation": [ + { + "budget": 2000, + "method": "deterministic", + "tokenCount": 3738, + "fits": false, + "ratio": 2.124913733609386, + "recencyWindow": 0, + "roundTrip": "PASS", + "timeMs": 12.129625000059605 + }, + { + "budget": 2000, + "method": "llm-escalate", + "tokenCount": 2593, + "fits": false, + "ratio": 3.0834538778235228, + "recencyWindow": 0, + "roundTrip": "PASS", + "timeMs": 131976.87870800006 + } + ], + 
"Agentic coding session": [ + { + "budget": 2000, + "method": "deterministic", + "tokenCount": 1957, + "fits": true, + "ratio": 1.3638369869059879, + "recencyWindow": 9, + "roundTrip": "PASS", + "timeMs": 1.8957079999381676 + }, + { + "budget": 2000, + "method": "llm-escalate", + "tokenCount": 2003, + "fits": false, + "ratio": 1.331896551724138, + "recencyWindow": 9, + "roundTrip": "PASS", + "timeMs": 4096.28350000002 + } + ] } } diff --git a/bench/baselines/llm/openai-gpt-4.1-mini.json b/bench/baselines/llm/openai-gpt-4.1-mini.json index 986174d..27b75c4 100644 --- a/bench/baselines/llm/openai-gpt-4.1-mini.json +++ b/bench/baselines/llm/openai-gpt-4.1-mini.json @@ -1,7 +1,7 @@ { "provider": "openai", "model": "gpt-4.1-mini", - "generated": "2026-02-25T09:01:27.143Z", + "generated": "2026-02-25T12:28:55.113Z", "scenarios": { "Coding assistant": { "methods": { @@ -11,23 +11,25 @@ "compressed": 5, "preserved": 8, "roundTrip": "PASS", - "timeMs": 0.24499999999989086 + "timeMs": 0.25587500000006 }, "llm-basic": { - "ratio": 1.6385159010600707, - "tokenRatio": 1.6319018404907975, + "ratio": 1.6414159292035397, + "tokenRatio": 1.633906633906634, "compressed": 5, "preserved": 8, "roundTrip": "PASS", - "timeMs": 5283.475208 + "timeMs": 5578.285459, + "vsDet": 0.976283185840708 }, "llm-escalate": { - "ratio": 1.634473034896017, - "tokenRatio": 1.627906976744186, + "ratio": 1.631597466572836, + "tokenRatio": 1.625916870415648, "compressed": 5, "preserved": 8, "roundTrip": "PASS", - "timeMs": 5459.967416999999 + "timeMs": 6046.540790999999, + "vsDet": 0.9704433497536946 } } }, @@ -39,23 +41,25 @@ "compressed": 4, "preserved": 6, "roundTrip": "PASS", - "timeMs": 0.747084000000541 + "timeMs": 0.9947919999995065 }, "llm-basic": { - "ratio": 5.34957627118644, - "tokenRatio": 5.3161764705882355, + "ratio": 5.372340425531915, + "tokenRatio": 5.3259668508287294, "compressed": 4, "preserved": 6, "roundTrip": "PASS", - "timeMs": 4945.921291999999 + "timeMs": 5892.603500000001, + 
"vsDet": 0.8723404255319149 }, "llm-escalate": { - "ratio": 5.324196099103848, - "tokenRatio": 5.287020109689214, + "ratio": 5.346744309158285, + "tokenRatio": 5.3064220183486235, "compressed": 4, "preserved": 6, "roundTrip": "PASS", - "timeMs": 6080.180457999999 + "timeMs": 6988.136834000001, + "vsDet": 0.868184224457385 } } }, @@ -67,23 +71,25 @@ "compressed": 2, "preserved": 16, "roundTrip": "PASS", - "timeMs": 0.2543749999967986 + "timeMs": 0.2992500000000291 }, "llm-basic": { - "ratio": 1.1159420289855073, - "tokenRatio": 1.1153846153846154, + "ratio": 1.105466593042518, + "tokenRatio": 1.1047619047619048, "compressed": 2, "preserved": 16, "roundTrip": "PASS", - "timeMs": 3140.399959000002 + "timeMs": 3497.0059580000016, + "vsDet": 0.8509110988404197 }, "llm-escalate": { - "ratio": 1.114079020589872, - "tokenRatio": 1.1132437619961613, + "ratio": 1.1159420289855073, + "tokenRatio": 1.1153846153846154, "compressed": 2, "preserved": 16, "roundTrip": "PASS", - "timeMs": 9754.72075 + "timeMs": 5327.759166, + "vsDet": 0.858974358974359 } } }, @@ -95,23 +101,55 @@ "compressed": 50, "preserved": 1, "roundTrip": "PASS", - "timeMs": 2.977333999995608 + "timeMs": 2.7148750000051223 }, "llm-basic": { - "ratio": 2.3589852728356178, - "tokenRatio": 2.350503256364713, + "ratio": 2.3424344885883346, + "tokenRatio": 2.3346074683916496, "compressed": 50, "preserved": 1, "roundTrip": "PASS", - "timeMs": 51435.262625 + "timeMs": 50365.301625, + "vsDet": 1.1023668639053252 }, "llm-escalate": { - "ratio": 2.3507804546996947, - "tokenRatio": 2.344949793266391, + "ratio": 2.3674498077744555, + "tokenRatio": 2.359583952451709, "compressed": 50, "preserved": 1, "roundTrip": "PASS", - "timeMs": 50272.36633300001 + "timeMs": 50784.971292, + "vsDet": 1.114139256727894 + } + } + }, + "Technical explanation": { + "methods": { + "deterministic": { + "ratio": 1, + "tokenRatio": 1, + "compressed": 0, + "preserved": 11, + "roundTrip": "PASS", + "timeMs": 0.6729170000180602 + }, + "llm-basic": 
{ + "ratio": 1.0014127363616605, + "tokenRatio": 1.0015186028853456, + "compressed": 1, + "preserved": 10, + "roundTrip": "PASS", + "timeMs": 2551.7554579999996, + "vsDet": 1.0014127363616605 + }, + "llm-escalate": { + "ratio": 1.0014127363616605, + "tokenRatio": 1.0015186028853456, + "compressed": 1, + "preserved": 10, + "roundTrip": "PASS", + "timeMs": 3298.924624999985, + "vsDet": 1.0014127363616605 } } }, @@ -123,23 +161,25 @@ "compressed": 2, "preserved": 10, "roundTrip": "PASS", - "timeMs": 0.5064999999885913 + "timeMs": 0.3844159999862313 }, "llm-basic": { - "ratio": 1.2727807172251617, - "tokenRatio": 1.270961145194274, + "ratio": 1.2315130830489192, + "tokenRatio": 1.2294757665677547, "compressed": 2, "preserved": 10, "roundTrip": "PASS", - "timeMs": 4792.548290999985 + "timeMs": 10207.897041999997, + "vsDet": 0.6368031854379976 }, "llm-escalate": { - "ratio": 1.3229453101130462, - "tokenRatio": 1.3209351753453773, + "ratio": 1.2886904761904763, + "tokenRatio": 1.2867494824016563, "compressed": 2, "preserved": 10, "roundTrip": "PASS", - "timeMs": 4862.093292000005 + "timeMs": 4813.861583999998, + "vsDet": 0.6663690476190476 } } }, @@ -151,25 +191,73 @@ "compressed": 2, "preserved": 31, "roundTrip": "PASS", - "timeMs": 0.7409169999882579 + "timeMs": 0.6770829999877606 }, "llm-basic": { - "ratio": 1.3914740318222756, - "tokenRatio": 1.389468196037539, + "ratio": 1.428351309707242, + "tokenRatio": 1.4258962011771001, "compressed": 2, "preserved": 31, "roundTrip": "PASS", - "timeMs": 4425.688000000024 + "timeMs": 5799.787291999994, + "vsDet": 1 }, "llm-escalate": { - "ratio": 1.348559790514984, - "tokenRatio": 1.3473205257836198, - "compressed": 2, - "preserved": 31, + "ratio": 1.3244749249892842, + "tokenRatio": 1.3232373386295928, + "compressed": 1, + "preserved": 32, "roundTrip": "PASS", - "timeMs": 11189.863042000012 + "timeMs": 9487.380791999982, + "vsDet": 0.9272753250464352 } } } + }, + "tokenBudget": { + "Deep conversation": [ + { + "budget": 2000, + 
"method": "deterministic", + "tokenCount": 3738, + "fits": false, + "ratio": 2.124913733609386, + "recencyWindow": 0, + "roundTrip": "PASS", + "timeMs": 10.060708000004524 + }, + { + "budget": 2000, + "method": "llm-escalate", + "tokenCount": 3391, + "fits": false, + "ratio": 2.3493853327681222, + "recencyWindow": 0, + "roundTrip": "PASS", + "timeMs": 280464.86720800004 + } + ], + "Agentic coding session": [ + { + "budget": 2000, + "method": "deterministic", + "tokenCount": 1957, + "fits": true, + "ratio": 1.3638369869059879, + "recencyWindow": 9, + "roundTrip": "PASS", + "timeMs": 1.9349999999976717 + }, + { + "budget": 2000, + "method": "llm-escalate", + "tokenCount": 1915, + "fits": true, + "ratio": 1.3935658448586892, + "recencyWindow": 3, + "roundTrip": "PASS", + "timeMs": 28052.867749999976 + } + ] } } diff --git a/bench/run.ts b/bench/run.ts index e887fed..ee51a8f 100644 --- a/bench/run.ts +++ b/bench/run.ts @@ -6,7 +6,7 @@ import { readFileSync, readdirSync, statSync, existsSync } from 'node:fs'; import { join, resolve } from 'node:path'; import { homedir } from 'node:os'; import { detectProviders } from './llm.js'; -import type { LlmBenchmarkResult, LlmMethodResult } from './baseline.js'; +import type { LlmBenchmarkResult, LlmMethodResult, LlmTokenBudgetResult } from './baseline.js'; import { saveLlmResult } from './baseline.js'; // --------------------------------------------------------------------------- @@ -56,6 +56,7 @@ function buildScenarios(): Scenario[] { toolHeavy(), shortConversation(), deepConversation(), + technicalExplanation(), structuredContent(), agenticCodingSession(), ]; @@ -279,6 +280,160 @@ function deepConversation(): Scenario { return { name: 'Deep conversation', messages }; } +function technicalExplanation(): Scenario { + // Prose-heavy architectural discussion — no code fences, no structured data. + // Tests whether LLM summarization outperforms deterministic on pure technical prose. 
+ return { + name: 'Technical explanation', + messages: [ + msg('system', 'You are a principal engineer explaining system design to a new team lead.'), + msg( + 'user', + 'Can you walk me through how our event-driven architecture works and why we chose it over a traditional request-response model?', + ), + msg( + 'assistant', + 'The event-driven architecture we adopted grew out of a series of scaling problems we ' + + 'encountered roughly two years ago when the monolith started buckling under peak traffic. ' + + 'The core idea is that services communicate through immutable events published to a central ' + + 'log rather than making synchronous calls to each other. When the order service completes a ' + + 'checkout, it publishes an OrderPlaced event. The inventory service, the notification ' + + 'service, and the analytics pipeline each consume that event independently and at their own ' + + 'pace. None of them need to know about the others, and none of them can block the order ' + + 'service from finishing its work.\n\n' + + 'The biggest advantage is temporal decoupling. Under the old model, a single slow downstream ' + + 'dependency could cascade latency across the entire checkout flow. A payment verification ' + + 'that normally took fifty milliseconds would occasionally spike to several seconds, and ' + + 'because every call was synchronous, the user sat waiting while six services completed in ' + + 'sequence. With events, the order service does its own work, publishes, and returns. ' + + 'Downstream processing happens asynchronously, so the user experience stays fast even when ' + + 'one consumer is struggling.\n\n' + + 'The trade-off is eventual consistency. In a synchronous model, you get immediate confirmation ' + + 'that every step completed. In an event-driven model, you have to design for the possibility ' + + 'that the inventory adjustment or the email notification has not happened yet when the user ' + + 'sees their confirmation page. 
We handle this through careful UX design that sets appropriate ' + + 'expectations and through idempotent consumers that can safely replay events if something ' + + 'goes wrong during processing.', + ), + msg( + 'user', + 'That makes sense. What about the event schema evolution problem? How do we handle ' + + 'changes to event formats without breaking all the consumers?', + ), + msg( + 'assistant', + 'Schema evolution is one of the hardest problems in event-driven systems and we spent ' + + 'considerable time getting our approach right. We settled on a strategy that combines ' + + 'explicit versioning with backward-compatible evolution rules. Every event type carries a ' + + 'schema version number in its header. Consumers declare which versions they understand, and ' + + 'the platform routes accordingly.\n\n' + + 'For non-breaking changes like adding a new optional field, we increment the minor version. ' + + 'Consumers that do not understand the new field simply ignore it. This follows the robustness ' + + 'principle of being conservative in what you send and liberal in what you accept. The vast ' + + 'majority of our schema changes fall into this category, and they require zero coordination ' + + 'between teams.\n\n' + + 'For breaking changes like removing a field or changing a field type, we increment the major ' + + 'version and run both the old and new versions in parallel during a migration window. The ' + + 'producing service publishes to both versions, and consuming teams migrate on their own ' + + 'schedule within the agreed window. Once all consumers have migrated, we sunset the old ' + + 'version. This approach adds operational complexity but gives teams the autonomy to migrate ' + + 'at their own pace without coordinated deployment ceremonies.\n\n' + + 'We also maintain a central schema registry that validates events at publish time. 
If a ' + + 'service tries to publish an event that does not conform to its declared schema, the publish ' + + 'fails fast with a clear error. This catches drift between the code and the schema before ' + + 'it reaches consumers and prevents subtle data corruption that would be much harder to ' + + 'diagnose after the fact.', + ), + msg( + 'user', + 'How do we handle ordering guarantees? I noticed some of our consumers seem to ' + + 'process events out of order.', + ), + msg( + 'assistant', + 'Ordering is subtle in distributed event systems and the right answer depends on what ' + + 'level of ordering your consumer actually needs. Our event platform guarantees ordering ' + + 'within a partition, but not across partitions. Events for the same aggregate, like all ' + + 'events for a specific order, go to the same partition based on the aggregate identifier, ' + + 'so they arrive in the order they were published. Events for different aggregates may arrive ' + + 'in any order because they live on different partitions processed by different consumer ' + + 'instances.\n\n' + + 'Most consumers only care about ordering within an aggregate and they get it for free from ' + + 'the partitioning scheme. The inventory service needs to see OrderPlaced before OrderCancelled ' + + 'for the same order, and it does because both events share the same order identifier as their ' + + 'partition key. But it does not care whether order A events arrive before or after order B ' + + 'events because those are independent aggregates.\n\n' + + 'For the rare cases where a consumer needs cross-aggregate ordering, we use a sequence ' + + 'number stamped by the producer at publish time. The consumer tracks the last processed ' + + 'sequence number and detects gaps. When it encounters a gap, it buffers the event and waits ' + + 'briefly for the missing one to arrive. 
If the missing event does not appear within the ' + + 'timeout window, the consumer raises an alert and proceeds with compensating logic. This ' + + 'approach adds latency in the gap case but preserves correctness.\n\n' + + 'The consumers you noticed processing out of order are likely reading from a topic that ' + + 'mixes events from multiple aggregates. Check whether the partition key is set correctly ' + + 'on the producer side. A common mistake is using a random or missing partition key, which ' + + 'causes round-robin assignment and destroys any ordering guarantee.', + ), + msg( + 'user', + 'What about failure handling? If a consumer crashes mid-processing, do we lose events?', + ), + msg( + 'assistant', + 'No, the platform guarantees at-least-once delivery. When a consumer reads an event, it ' + + 'does not acknowledge the read until processing completes successfully. If the consumer ' + + 'crashes before acknowledging, the event remains available for redelivery. The next time ' + + 'the consumer starts up, or when another instance in the consumer group takes over the ' + + 'partition, it picks up from the last acknowledged position.\n\n' + + 'The flip side of at-least-once delivery is that consumers must be idempotent. They will ' + + 'occasionally see the same event twice, particularly after a crash or during a rebalance. ' + + 'We enforce idempotency through a combination of techniques depending on the consumer. ' + + 'Some use a deduplication table that records processed event identifiers and skips any event ' + + 'it has seen before. Others design their state mutations to be naturally idempotent, for ' + + 'example using upserts instead of inserts or applying absolute values instead of deltas.\n\n' + + 'Dead letter queues catch events that fail processing repeatedly. 
After a configurable number ' + + 'of retries with exponential backoff, the platform moves the event to a dead letter topic ' + + 'and advances the consumer offset so it does not block subsequent events. The operations team ' + + 'monitors the dead letter queue and investigates failures. Once the root cause is fixed, they ' + + 'can replay events from the dead letter queue back into the main topic.\n\n' + + 'We also run a reconciliation process that periodically compares the state derived from ' + + 'events against the source of truth in the originating service. This catches any silent data ' + + 'loss or corruption that might slip through the cracks. The reconciler runs daily during off-' + + 'peak hours and flags discrepancies for manual review.', + ), + msg( + 'user', + 'One more thing — how do we test all of this? Integration testing with events seems ' + + 'really hard.', + ), + msg( + 'assistant', + 'Testing event-driven systems requires a layered approach because no single testing ' + + 'strategy covers all the failure modes. At the unit level, each consumer is tested with ' + + 'synthetic events that exercise the happy path, edge cases, and error paths. These tests ' + + 'run in milliseconds and give fast feedback on business logic correctness.\n\n' + + 'At the integration level, we run each service against a local instance of the event ' + + 'platform. The test publishes events, waits for the consumer to process them, and verifies ' + + 'the resulting state. These tests are slower but catch serialization issues, schema ' + + 'mismatches, and configuration problems that unit tests miss. We keep the integration test ' + + 'suite focused on the boundaries: publishing, consuming, and acknowledging. Internal ' + + 'business logic is covered at the unit level.\n\n' + + 'At the system level, we maintain a staging environment that mirrors production topology. 
' + + 'Every deployment goes through staging first, where we run end-to-end scenarios that ' + + 'exercise the full event flow from producer through all consumers. These tests use realistic ' + + 'data volumes and introduce controlled failures like consumer crashes and network partitions ' + + 'to verify that the retry and dead-letter mechanisms work correctly.\n\n' + + 'Contract testing bridges the gap between producers and consumers without requiring a ' + + 'shared integration environment. Each consumer publishes a contract describing the events ' + + 'it expects, and the producer runs those contracts as part of its build. If a producer ' + + 'change would break a consumer contract, the build fails before the change reaches any ' + + 'shared environment. This is particularly valuable in our setup where different teams own ' + + "different services and may not be aware of each other's dependencies.", + ), + ], + }; +} + function structuredContent(): Scenario { // Pure prose about auth (~1500 chars): no code, URLs, SQL, API keys, JSON, paths, etc. 
const authProse = @@ -930,14 +1085,17 @@ async function run(): Promise { runRealSessions(); - await runLlmBenchmark(); + // LLM benchmarks require explicit --llm flag (they cost money and take minutes) + if (process.argv.includes('--llm')) { + await runLlmBenchmark(); + } console.log(); console.log('All benchmarks passed.'); } // --------------------------------------------------------------------------- -// LLM summarization benchmark (opt-in via env vars) +// LLM summarization benchmark (requires --llm flag) // --------------------------------------------------------------------------- function roundTrip(messages: Message[], cr: CompressResult): 'PASS' | 'FAIL' { @@ -953,7 +1111,7 @@ async function runLlmBenchmark(): Promise { if (providers.length === 0) { console.log(); console.log( - 'LLM Summarization Benchmark — skipped (no providers detected: set OPENAI_API_KEY or ANTHROPIC_API_KEY, or start Ollama)', + 'LLM Summarization Benchmark — no providers detected (set OPENAI_API_KEY or ANTHROPIC_API_KEY in .env, or start Ollama)', ); return; } @@ -973,6 +1131,7 @@ async function runLlmBenchmark(): Promise { method: 14, chr: 6, tkr: 6, + vsDet: 6, comp: 5, pres: 5, rt: 5, @@ -984,6 +1143,7 @@ async function runLlmBenchmark(): Promise { 'Method'.padStart(cols.method), 'ChR'.padStart(cols.chr), 'TkR'.padStart(cols.tkr), + 'vsDet'.padStart(cols.vsDet), 'Comp'.padStart(cols.comp), 'Pres'.padStart(cols.pres), 'R/T'.padStart(cols.rt), @@ -1012,10 +1172,11 @@ async function runLlmBenchmark(): Promise { const detResult = compress(scenario.messages, { recencyWindow: 0 }); const t1d = performance.now(); const detRt = roundTrip(scenario.messages, detResult); + const detRatio = detResult.compression.ratio; - printLlmRow(scenario.name, 'deterministic', detResult, detRt, t1d - t0d, cols); + printLlmRow(scenario.name, 'deterministic', detResult, detRt, t1d - t0d, undefined, cols); scenarioResult['deterministic'] = { - ratio: detResult.compression.ratio, + ratio: detRatio, 
tokenRatio: detResult.compression.token_ratio, compressed: detResult.compression.messages_compressed, preserved: detResult.compression.messages_preserved, @@ -1032,8 +1193,9 @@ async function runLlmBenchmark(): Promise { const t1b = performance.now(); const basicRt = roundTrip(scenario.messages, llmBasicResult); if (basicRt === 'FAIL') llmFails++; + const basicVsDet = llmBasicResult.compression.ratio / detRatio; - printLlmRow('', 'llm-basic', llmBasicResult, basicRt, t1b - t0b, cols); + printLlmRow('', 'llm-basic', llmBasicResult, basicRt, t1b - t0b, basicVsDet, cols); scenarioResult['llm-basic'] = { ratio: llmBasicResult.compression.ratio, tokenRatio: llmBasicResult.compression.token_ratio, @@ -1041,6 +1203,7 @@ async function runLlmBenchmark(): Promise { preserved: llmBasicResult.compression.messages_preserved, roundTrip: basicRt, timeMs: t1b - t0b, + vsDet: basicVsDet, }; // LLM escalating summarizer @@ -1052,8 +1215,9 @@ async function runLlmBenchmark(): Promise { const t1e = performance.now(); const escRt = roundTrip(scenario.messages, llmEscResult); if (escRt === 'FAIL') llmFails++; + const escVsDet = llmEscResult.compression.ratio / detRatio; - printLlmRow('', 'llm-escalate', llmEscResult, escRt, t1e - t0e, cols); + printLlmRow('', 'llm-escalate', llmEscResult, escRt, t1e - t0e, escVsDet, cols); scenarioResult['llm-escalate'] = { ratio: llmEscResult.compression.ratio, tokenRatio: llmEscResult.compression.token_ratio, @@ -1061,6 +1225,7 @@ async function runLlmBenchmark(): Promise { preserved: llmEscResult.compression.messages_preserved, roundTrip: escRt, timeMs: t1e - t0e, + vsDet: escVsDet, }; console.log(sep); @@ -1071,6 +1236,94 @@ async function runLlmBenchmark(): Promise { } } + // --- Token budget + LLM --- + const tokenBudget = 2000; + const budgetScenarios: Scenario[] = scenarios.filter( + (s) => s.name === 'Deep conversation' || s.name === 'Agentic coding session', + ); + + if (budgetScenarios.length > 0) { + console.log(); + console.log( + `LLM 
Token Budget — ${provider.name} (${provider.model}) — target: ${tokenBudget} tokens`, + ); + + const tbCols = { name: 24, method: 14, tokens: 7, fits: 5, rw: 4, chr: 6, rt: 5, time: 10 }; + const tbHeader = [ + 'Scenario'.padEnd(tbCols.name), + 'Method'.padStart(tbCols.method), + 'Tokens'.padStart(tbCols.tokens), + 'Fits'.padStart(tbCols.fits), + 'Rw'.padStart(tbCols.rw), + 'ChR'.padStart(tbCols.chr), + 'R/T'.padStart(tbCols.rt), + 'Time'.padStart(tbCols.time), + ].join(' '); + const tbSep = '-'.repeat(tbHeader.length); + + console.log(tbSep); + console.log(tbHeader); + console.log(tbSep); + + llmResult.tokenBudget = {}; + + for (const scenario of budgetScenarios) { + const entries: LlmTokenBudgetResult[] = []; + + try { + // Deterministic with token budget + const t0d = performance.now(); + const detCr = compress(scenario.messages, { tokenBudget }); + const t1d = performance.now(); + const detRt = roundTrip(scenario.messages, detCr); + + const detEntry: LlmTokenBudgetResult = { + budget: tokenBudget, + method: 'deterministic', + tokenCount: detCr.tokenCount ?? 0, + fits: detCr.fits ?? false, + ratio: detCr.compression.ratio, + recencyWindow: detCr.recencyWindow, + roundTrip: detRt, + timeMs: t1d - t0d, + }; + entries.push(detEntry); + printBudgetRow(scenario.name, detEntry, tbCols); + + // LLM escalating with token budget + const t0e = performance.now(); + const llmCr = await compress(scenario.messages, { + tokenBudget, + summarizer: escalatingSummarizer, + }); + const t1e = performance.now(); + const llmRt = roundTrip(scenario.messages, llmCr); + + const llmEntry: LlmTokenBudgetResult = { + budget: tokenBudget, + method: 'llm-escalate', + tokenCount: llmCr.tokenCount ?? 0, + fits: llmCr.fits ?? 
false, + ratio: llmCr.compression.ratio, + recencyWindow: llmCr.recencyWindow, + roundTrip: llmRt, + timeMs: t1e - t0e, + }; + entries.push(llmEntry); + printBudgetRow('', llmEntry, tbCols); + + console.log(tbSep); + } catch (err) { + console.error(` ${scenario.name}: ERROR — ${(err as Error).message}`); + console.log(tbSep); + } + + if (entries.length > 0) { + llmResult.tokenBudget[scenario.name] = entries; + } + } + } + // Always save LLM results (informational, not gated behind --save) saveLlmResult(baselinesDir, llmResult); console.log(` Results saved to bench/baselines/llm/`); @@ -1087,11 +1340,13 @@ function printLlmRow( cr: CompressResult, rt: string, timeMs: number, + vsDet: number | undefined, cols: { name: number; method: number; chr: number; tkr: number; + vsDet: number; comp: number; pres: number; rt: number; @@ -1104,6 +1359,7 @@ function printLlmRow( method.padStart(cols.method), cr.compression.ratio.toFixed(2).padStart(cols.chr), cr.compression.token_ratio.toFixed(2).padStart(cols.tkr), + (vsDet != null ? vsDet.toFixed(2) : '-').padStart(cols.vsDet), String(cr.compression.messages_compressed).padStart(cols.comp), String(cr.compression.messages_preserved).padStart(cols.pres), rt.padStart(cols.rt), @@ -1114,6 +1370,37 @@ function printLlmRow( ); } +function printBudgetRow( + name: string, + entry: LlmTokenBudgetResult, + cols: { + name: number; + method: number; + tokens: number; + fits: number; + rw: number; + chr: number; + rt: number; + time: number; + }, +): void { + console.log( + [ + name.padEnd(cols.name), + entry.method.padStart(cols.method), + String(entry.tokenCount).padStart(cols.tokens), + String(entry.fits).padStart(cols.fits), + String(entry.recencyWindow ?? '-').padStart(cols.rw), + entry.ratio.toFixed(2).padStart(cols.chr), + entry.roundTrip.padStart(cols.rt), + (entry.timeMs < 1000 + ? 
entry.timeMs.toFixed(0) + 'ms' + : (entry.timeMs / 1000).toFixed(1) + 's' + ).padStart(cols.time), + ].join(' '), + ); +} + // --------------------------------------------------------------------------- // Real session support — convert Claude Code JSONL transcripts to Message[] // --------------------------------------------------------------------------- diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 4111308..4a4346f 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -1,151 +1,207 @@ -# Benchmarks +# Benchmark Results [Back to README](../README.md) | [All docs](README.md) -Running benchmarks, interpreting results, and comparing compression methods. + + -## Running tests +## Running Benchmarks ```bash -# Run the test suite (333 tests) -npm test - -# Type check -npx tsc --noEmit -``` - -## Deterministic benchmarks - -No API keys needed. Runs entirely locally: - -```bash -npm run bench +npm run bench # Run benchmarks (no baseline check) +npm run bench:check # Run and compare against baseline +npm run bench:save # Run, save new baseline, regenerate this doc ``` -### Scenarios +### LLM benchmarks (opt-in) -The benchmark covers 7 conversation types: +LLM benchmarks require the `--llm` flag (`npm run bench:llm`). Set API keys in a `.env` file or export them. Ollama is auto-detected when running locally. 
-| Scenario | Description | -| ---------------------- | -------------------------------------------------------- | -| Coding assistant | Mixed code fences and prose discussion | -| Long Q&A | Extended question-and-answer with detailed explanations | -| Tool-heavy | Messages with `tool_calls` arrays (preserved by default) | -| Short conversation | Brief exchanges, mostly under 120 chars | -| Deep conversation | Long, multi-paragraph prose exchanges | -| Structured content | JSON, YAML, SQL, test output | -| Agentic coding session | Repeated file reads, grep results, test runs | +| Variable | Provider | Default Model | Notes | +| ------------------- | --------- | --------------------------- | -------------------------------- | +| `OPENAI_API_KEY` | OpenAI | `gpt-4.1-mini` | | +| `ANTHROPIC_API_KEY` | Anthropic | `claude-haiku-4-5-20251001` | | +| _(none required)_ | Ollama | `llama3.2` | Auto-detected on localhost:11434 | -### What gets measured +## Current Results (v1.0.0) -For each scenario: +### Basic Compression -- **Characters**: original vs. 
compressed character counts -- **Compression ratio**: `original_chars / compressed_chars` (>1 = savings) -- **Token ratio**: `original_tokens / compressed_tokens` -- **Messages compressed**: how many messages were summarized -- **Messages preserved**: how many were kept as-is -- **Messages deduped**: exact duplicates replaced (agentic scenario) -- **Timing**: milliseconds per compression +**Range:** 1.00x – 6.16x · **Average:** 2.08x · **Round-trip:** all PASS -Additional benchmark sections: +| Scenario | Char Ratio | Token Ratio | Compressed | Preserved | +| ---------------------- | ---------: | ----------: | ---------: | --------: | +| Coding assistant | 1.68 | 1.67 | 5 | 8 | +| Long Q&A | 6.16 | 6.11 | 4 | 6 | +| Tool-heavy | 1.30 | 1.29 | 2 | 16 | +| Short conversation | 1.00 | 1.00 | 0 | 7 | +| Deep conversation | 2.12 | 2.12 | 50 | 1 | +| Technical explanation | 1.00 | 1.00 | 0 | 11 | +| Structured content | 1.93 | 1.92 | 2 | 10 | +| Agentic coding session | 1.43 | 1.43 | 2 | 31 | -- **Token budget optimization** with and without dedup -- **Fuzzy dedup accuracy** across thresholds -- **Real-session compression** on actual Claude Code transcripts (if `~/.claude/projects/` exists) +### Token Budget (target: 2000 tokens) -### Real-session benchmarks - -The benchmark automatically scans for real Claude Code conversation files in `~/.claude/projects/`. It parses JSONL conversation files, extracts message arrays, and runs compression on actual production data. - -This provides the most realistic performance numbers since synthetic scenarios can't capture the full diversity of real conversations. - -## LLM benchmarks - -Compare deterministic compression against real LLM-powered summarization. 
Set one or more environment variables to enable: - -| Variable | Provider | Default model | -| ------------------- | --------- | --------------------------------------------------------- | -| `OPENAI_API_KEY` | OpenAI | `gpt-4.1-mini` (override: `OPENAI_MODEL`) | -| `ANTHROPIC_API_KEY` | Anthropic | `claude-haiku-4-5-20251001` (override: `ANTHROPIC_MODEL`) | -| `OLLAMA_MODEL` | Ollama | `llama3.2` (host override: `OLLAMA_HOST`) | - -```bash -# Run with OpenAI -OPENAI_API_KEY=sk-... npm run bench - -# Run with Ollama (local) -OLLAMA_MODEL=llama3.2 npm run bench - -# Run with multiple providers -OPENAI_API_KEY=sk-... ANTHROPIC_API_KEY=sk-ant-... npm run bench -``` +| Scenario | Dedup | Tokens | Fits | recencyWindow | Compressed | Preserved | Deduped | +| ---------------------- | ----- | -----: | ----- | ------------: | ---------: | --------: | ------: | +| Deep conversation | no | 3738 | false | 0 | 50 | 1 | 0 | +| Deep conversation | yes | 3738 | false | 0 | 50 | 1 | 0 | +| Agentic coding session | no | 2345 | false | 0 | 4 | 33 | 0 | +| Agentic coding session | yes | 1957 | true | 9 | 1 | 32 | 4 | -### Three methods compared +### Dedup Effectiveness -Each scenario runs three methods side-by-side: +| Scenario | No Dedup (rw=0) | Dedup (rw=0) | No Dedup (rw=4) | Dedup (rw=4) | Deduped | +| ---------------------- | --------------: | -----------: | --------------: | -----------: | ------: | +| Coding assistant | 1.68 | 1.68 | 1.51 | 1.51 | 0 | +| Long Q&A | 5.14 | 6.16 | 1.90 | 2.03 | 1 | +| Tool-heavy | 1.30 | 1.30 | 1.30 | 1.30 | 0 | +| Short conversation | 1.00 | 1.00 | 1.00 | 1.00 | 0 | +| Deep conversation | 2.12 | 2.12 | 1.95 | 1.95 | 0 | +| Technical explanation | 1.00 | 1.00 | 1.00 | 1.00 | 0 | +| Structured content | 1.93 | 1.93 | 1.37 | 1.37 | 0 | +| Agentic coding session | 1.14 | 1.43 | 1.14 | 1.43 | 4 | -| Method | Description | -| --------------- | -------------------------------------------------------------------- | -| `deterministic` | No LLM, pure 
sentence scoring + entity extraction | -| `llm-basic` | `createSummarizer` with the detected provider | -| `llm-escalate` | `createEscalatingSummarizer` (normal -> aggressive -> deterministic) | +### Fuzzy Dedup -All methods verify round-trip integrity — `uncompress()` is called to confirm originals are restored. +| Scenario | Exact Deduped | Fuzzy Deduped | Ratio | +| ---------------------- | ------------: | ------------: | ----: | +| Coding assistant | 0 | 0 | 1.68 | +| Long Q&A | 1 | 0 | 6.16 | +| Tool-heavy | 0 | 0 | 1.30 | +| Short conversation | 0 | 0 | 1.00 | +| Deep conversation | 0 | 0 | 2.12 | +| Technical explanation | 0 | 0 | 1.00 | +| Structured content | 0 | 0 | 1.93 | +| Agentic coding session | 4 | 2 | 2.23 | -### What to look for +## Scenarios -- **Ratio comparison** — deterministic often beats LLM on compression ratio because LLMs write fuller, more helpful summaries -- **Latency** — deterministic is < 2ms; LLM adds network round-trip time per message -- **Fallback rate** — how often the engine rejects LLM output and falls back to deterministic -- **Round-trip integrity** — all methods must pass (no data loss) +The benchmark covers 8 conversation types: -### SDK requirements - -LLM providers require their SDKs: - -- OpenAI: `openai` package -- Anthropic: `@anthropic-ai/sdk` package -- Ollama: `openai` package (uses OpenAI-compatible API) - -Missing SDKs are detected at runtime and print a skip message — no crash, no hard dependency. 
+| Scenario | Description | +| ---------------------- | -------------------------------------------------------- | +| Coding assistant | Mixed code fences and prose discussion | +| Long Q&A | Extended question-and-answer with repeated paragraphs | +| Tool-heavy | Messages with `tool_calls` arrays (preserved by default) | +| Short conversation | Brief exchanges, mostly under 120 chars | +| Deep conversation | 25 turns of multi-paragraph prose | +| Technical explanation | Pure prose Q&A about event-driven architecture | +| Structured content | JSON, YAML, SQL, API keys, test output | +| Agentic coding session | Repeated file reads, grep results, near-duplicate edits | -## Interpreting results +## Interpreting Results ### Compression ratio -- `1.0` = no compression (all messages preserved) -- `1.5` = 33% reduction -- `2.0` = 50% reduction -- `3.0` = 67% reduction -- `6.0` = 83% reduction - -Higher is better. The deterministic engine typically achieves 1.3-6.1x on synthetic scenarios. - -### Token ratio vs. character ratio - -Token ratio is more meaningful for LLM context budgeting since tokens are what models count. Character ratio is useful for storage optimization. 
- -### When LLM wins - -LLM summarization can outperform deterministic in: - -- Very long prose-heavy conversations where paraphrasing and concept merging genuinely helps -- Domain-specific content where the LLM understands what's important - -### When deterministic wins - -Deterministic typically wins when: - -- Messages contain mixed code and prose (code-aware splitting is already optimal) -- Messages are structured (test output, grep results) -- The LLM writes helpful but verbose summaries - ---- - -## See also - -- [Compression pipeline](compression-pipeline.md) - the deterministic algorithm -- [LLM integration](llm-integration.md) - setting up providers for benchmarks -- [Token budget](token-budget.md) - budget optimization -- [Deduplication](deduplication.md) - dedup in benchmarks +| Ratio | Reduction | +| ----: | --------------------------------------- | +| 1.0x | no compression (all messages preserved) | +| 1.5x | 33% reduction | +| 2.0x | 50% reduction | +| 3.0x | 67% reduction | +| 6.0x | 83% reduction | + +Higher is better. Token ratio is more meaningful for LLM context budgeting; character ratio is useful for storage. + +## Regression Testing + +Baselines are stored in [`bench/baselines/`](../bench/baselines/) as JSON. CI runs `npm run bench:check` on every push and PR to catch regressions. 
+ +- **Tolerance:** 0% by default (all metrics are deterministic) +- **On regression:** CI fails with a diff showing which metrics changed +- **After intentional changes:** run `npm run bench:save` to update the baseline and regenerate this doc +- **Custom tolerance:** `npx tsx bench/run.ts --check --tolerance 5` allows 5% deviation + +### Baseline files + +| File | Purpose | +| ------------------------------ | ------------------------------------ | +| `bench/baselines/current.json` | Active baseline compared in CI | +| `bench/baselines/v*.json` | Versioned snapshots, one per release | + +## LLM Summarization Comparison + +> Results are **non-deterministic** — LLM outputs vary between runs. These are saved as reference data, not used for regression testing. + +### ollama (llama3.2) + +_Generated: 2026-02-25_ + +| Scenario | Method | Char Ratio | Token Ratio | vs Det | Compressed | Preserved | Round-trip | Time | +| ---------------------- | ------------- | ---------: | ----------: | -----: | ---------: | --------: | ---------- | ----: | +| Coding assistant | deterministic | 1.68 | 1.67 | - | 5 | 8 | PASS | 0ms | +| | llm-basic | 1.48 | 1.48 | 0.88 | 5 | 8 | PASS | 5.9s | +| | llm-escalate | 1.55 | 1.55 | 0.92 | 5 | 8 | PASS | 3.0s | +| Long Q&A | deterministic | 6.16 | 6.11 | - | 4 | 6 | PASS | 1ms | +| | llm-basic | 4.31 | 4.28 | 0.70 | 4 | 6 | PASS | 4.1s | +| | llm-escalate | 4.49 | 4.46 | 0.73 | 4 | 6 | PASS | 3.7s | +| Tool-heavy | deterministic | 1.30 | 1.29 | - | 2 | 16 | PASS | 2ms | +| | llm-basic | 1.12 | 1.11 | 0.86 | 2 | 16 | PASS | 2.3s | +| | llm-escalate | 1.28 | 1.28 | 0.99 | 2 | 16 | PASS | 2.8s | +| Deep conversation | deterministic | 2.12 | 2.12 | - | 50 | 1 | PASS | 3ms | +| | llm-basic | 3.12 | 3.11 | 1.47 | 50 | 1 | PASS | 22.7s | +| | llm-escalate | 3.28 | 3.26 | 1.54 | 50 | 1 | PASS | 23.3s | +| Technical explanation | deterministic | 1.00 | 1.00 | - | 0 | 11 | PASS | 1ms | +| | llm-basic | 1.00 | 1.00 | 1.00 | 0 | 11 | PASS | 3.2s | +| | 
llm-escalate | 1.00 | 1.00 | 1.00 | 2 | 9 | PASS | 785ms | +| Structured content | deterministic | 1.93 | 1.92 | - | 2 | 10 | PASS | 0ms | +| | llm-basic | 1.46 | 1.45 | 0.75 | 2 | 10 | PASS | 3.5s | +| | llm-escalate | 1.38 | 1.38 | 0.71 | 2 | 10 | PASS | 3.7s | +| Agentic coding session | deterministic | 1.43 | 1.43 | - | 2 | 31 | PASS | 1ms | +| | llm-basic | 1.35 | 1.34 | 0.94 | 2 | 31 | PASS | 3.3s | +| | llm-escalate | 1.40 | 1.40 | 0.98 | 2 | 31 | PASS | 5.4s | + +#### Token Budget (target: 2000 tokens) + +| Scenario | Method | Tokens | Fits | recencyWindow | Ratio | Round-trip | Time | +| ---------------------- | ------------- | -----: | ----- | ------------: | ----: | ---------- | -----: | +| Deep conversation | deterministic | 3738 | false | 0 | 2.12 | PASS | 12ms | +| | llm-escalate | 2593 | false | 0 | 3.08 | PASS | 132.0s | +| Agentic coding session | deterministic | 1957 | true | 9 | 1.36 | PASS | 2ms | +| | llm-escalate | 2003 | false | 9 | 1.33 | PASS | 4.1s | + +### openai (gpt-4.1-mini) + +_Generated: 2026-02-25_ + +| Scenario | Method | Char Ratio | Token Ratio | vs Det | Compressed | Preserved | Round-trip | Time | +| ---------------------- | ------------- | ---------: | ----------: | -----: | ---------: | --------: | ---------- | ----: | +| Coding assistant | deterministic | 1.68 | 1.67 | - | 5 | 8 | PASS | 0ms | +| | llm-basic | 1.64 | 1.63 | 0.98 | 5 | 8 | PASS | 5.6s | +| | llm-escalate | 1.63 | 1.63 | 0.97 | 5 | 8 | PASS | 6.0s | +| Long Q&A | deterministic | 6.16 | 6.11 | - | 4 | 6 | PASS | 1ms | +| | llm-basic | 5.37 | 5.33 | 0.87 | 4 | 6 | PASS | 5.9s | +| | llm-escalate | 5.35 | 5.31 | 0.87 | 4 | 6 | PASS | 7.0s | +| Tool-heavy | deterministic | 1.30 | 1.29 | - | 2 | 16 | PASS | 0ms | +| | llm-basic | 1.11 | 1.10 | 0.85 | 2 | 16 | PASS | 3.5s | +| | llm-escalate | 1.12 | 1.12 | 0.86 | 2 | 16 | PASS | 5.3s | +| Deep conversation | deterministic | 2.12 | 2.12 | - | 50 | 1 | PASS | 3ms | +| | llm-basic | 2.34 | 2.33 | 1.10 | 50 | 1 | PASS 
| 50.4s |
+| | llm-escalate | 2.37 | 2.36 | 1.11 | 50 | 1 | PASS | 50.8s |
+| Technical explanation | deterministic | 1.00 | 1.00 | - | 0 | 11 | PASS | 1ms |
+| | llm-basic | 1.00 | 1.00 | 1.00 | 1 | 10 | PASS | 2.6s |
+| | llm-escalate | 1.00 | 1.00 | 1.00 | 1 | 10 | PASS | 3.3s |
+| Structured content | deterministic | 1.93 | 1.92 | - | 2 | 10 | PASS | 0ms |
+| | llm-basic | 1.23 | 1.23 | 0.64 | 2 | 10 | PASS | 10.2s |
+| | llm-escalate | 1.29 | 1.29 | 0.67 | 2 | 10 | PASS | 4.8s |
+| Agentic coding session | deterministic | 1.43 | 1.43 | - | 2 | 31 | PASS | 1ms |
+| | llm-basic | 1.43 | 1.43 | 1.00 | 2 | 31 | PASS | 5.8s |
+| | llm-escalate | 1.32 | 1.32 | 0.93 | 1 | 32 | PASS | 9.5s |
+
+#### Token Budget (target: 2000 tokens)
+
+| Scenario | Method | Tokens | Fits | recencyWindow | Ratio | Round-trip | Time |
+| ---------------------- | ------------- | -----: | ----- | ------------: | ----: | ---------- | -----: |
+| Deep conversation | deterministic | 3738 | false | 0 | 2.12 | PASS | 10ms |
+| | llm-escalate | 3391 | false | 0 | 2.35 | PASS | 280.5s |
+| Agentic coding session | deterministic | 1957 | true | 9 | 1.36 | PASS | 2ms |
+| | llm-escalate | 1915 | true | 3 | 1.39 | PASS | 28.1s |
+
+## Methodology
+
+- All baseline results are **deterministic** — same input always produces the same output (LLM and real-session results above are not)
+- Metrics tracked: compression ratio, token ratio, message counts, dedup counts
+- Timing is excluded from baselines (hardware-dependent)
+- Real-session and LLM benchmarks are excluded from baselines (environment-dependent)
+- Round-trip integrity is verified for every scenario (compress then uncompress)
diff --git a/package.json b/package.json
index 9b33f2f..f581ee3 100644
--- a/package.json
+++ b/package.json
@@ -14,6 +14,7 @@
     "format": "prettier --write .",
     "format:check": "prettier --check .",
     "bench": "npx tsx bench/run.ts",
+    "bench:llm": "npx tsx bench/run.ts --llm",
     "bench:save": "npx tsx bench/run.ts --save",
     "bench:check": "npx tsx bench/run.ts --check",
"test:e2e": "npm run build && npm pack && npm run test:e2e:lint && npm run test:e2e:smoke; EXIT=$?; npm run test:e2e:cleanup; exit $EXIT", From 184a451fc858af50bac83b00cbe19a9e7a618fa8 Mon Sep 17 00:00:00 2001 From: Lisa Date: Wed, 25 Feb 2026 14:41:25 +0100 Subject: [PATCH 03/13] feat(bench): wire --save/--check/--tolerance, organize baseline structure - --save: writes current.json + history/v{version}.json, regenerates docs - --check: compares against current.json, exits non-zero on regression - --tolerance N: allows N% deviation (0% default, deterministic) - Baselines reorganized: current.json at root, history/ for versioned snapshots, llm/ for non-deterministic reference data - bench:llm added to package.json for explicit LLM benchmark runs - Doc generation references correct baseline paths --- bench/baseline.ts | 67 +++++--- bench/baselines/current.json | 190 +++++++++++++++++++++ bench/baselines/history/v1.0.0.json | 190 +++++++++++++++++++++ bench/run.ts | 101 ++++++++++- docs/benchmarks.md | 249 ++++++++++++++-------------- 5 files changed, 645 insertions(+), 152 deletions(-) create mode 100644 bench/baselines/current.json create mode 100644 bench/baselines/history/v1.0.0.json diff --git a/bench/baseline.ts b/bench/baseline.ts index ea0f2a3..b44aef6 100644 --- a/bench/baseline.ts +++ b/bench/baseline.ts @@ -1,5 +1,5 @@ import { readFileSync, writeFileSync, mkdirSync, readdirSync, existsSync } from 'node:fs'; -import { dirname, join } from 'node:path'; +import { join } from 'node:path'; // --------------------------------------------------------------------------- // Types @@ -90,24 +90,36 @@ export interface LlmBenchmarkResult { // Save / Load // --------------------------------------------------------------------------- -export function saveBaseline(path: string, version: string, results: BenchmarkResults): void { +export function saveBaseline( + baselinesDir: string, + version: string, + results: BenchmarkResults, +): void { const baseline: Baseline = { 
version, generated: new Date().toISOString(), results, }; - const dir = dirname(path); - mkdirSync(dir, { recursive: true }); + mkdirSync(baselinesDir, { recursive: true }); const json = JSON.stringify(baseline, null, 2) + '\n'; - writeFileSync(path, json); - // Also save a versioned copy for history - writeFileSync(join(dir, `v${version}.json`), json); + // Active baseline at root + writeFileSync(join(baselinesDir, 'current.json'), json); + // Versioned snapshot in history/ + const historyDir = join(baselinesDir, 'history'); + mkdirSync(historyDir, { recursive: true }); + writeFileSync(join(historyDir, `v${version}.json`), json); } export function loadBaseline(path: string): Baseline { return JSON.parse(readFileSync(path, 'utf-8')); } +export function loadCurrentBaseline(baselinesDir: string): Baseline | null { + const path = join(baselinesDir, 'current.json'); + if (!existsSync(path)) return null; + return loadBaseline(path); +} + // --------------------------------------------------------------------------- // LLM result persistence // --------------------------------------------------------------------------- @@ -317,26 +329,30 @@ export function formatRegressions(regressions: Regression[]): string { // Doc generation // --------------------------------------------------------------------------- +function semverSort(a: string, b: string): number { + const pa = a + .replace(/^v|\.json$/g, '') + .split('.') + .map(Number); + const pb = b + .replace(/^v|\.json$/g, '') + .split('.') + .map(Number); + for (let i = 0; i < 3; i++) { + if ((pa[i] ?? 0) !== (pb[i] ?? 0)) return (pa[i] ?? 0) - (pb[i] ?? 
0); + } + return 0; +} + function loadAllBaselines(baselinesDir: string): Baseline[] { - const files = readdirSync(baselinesDir) + const historyDir = join(baselinesDir, 'history'); + if (!existsSync(historyDir)) return []; + + const files = readdirSync(historyDir) .filter((f) => f.startsWith('v') && f.endsWith('.json')) - .sort((a, b) => { - // Sort by semver: v1.0.0.json < v1.1.0.json < v2.0.0.json - const pa = a - .replace(/^v|\.json$/g, '') - .split('.') - .map(Number); - const pb = b - .replace(/^v|\.json$/g, '') - .split('.') - .map(Number); - for (let i = 0; i < 3; i++) { - if ((pa[i] ?? 0) !== (pb[i] ?? 0)) return (pa[i] ?? 0) - (pb[i] ?? 0); - } - return 0; - }); + .sort(semverSort); - return files.map((f) => loadBaseline(join(baselinesDir, f))); + return files.map((f) => loadBaseline(join(historyDir, f))); } function fix(n: number, d: number = 2): string { @@ -546,7 +562,8 @@ export function generateBenchmarkDocs(baselinesDir: string, outputPath: string): lines.push('| File | Purpose |'); lines.push('| --- | --- |'); lines.push('| `bench/baselines/current.json` | Active baseline compared in CI |'); - lines.push('| `bench/baselines/v*.json` | Versioned snapshots, one per release |'); + lines.push('| `bench/baselines/history/v*.json` | Versioned snapshots, one per release |'); + lines.push('| `bench/baselines/llm/*.json` | LLM benchmark reference data (non-deterministic) |'); lines.push(''); // --- LLM comparison (if result files exist) --- diff --git a/bench/baselines/current.json b/bench/baselines/current.json new file mode 100644 index 0000000..d127500 --- /dev/null +++ b/bench/baselines/current.json @@ -0,0 +1,190 @@ +{ + "version": "1.0.0", + "generated": "2026-02-25T13:40:26.671Z", + "results": { + "basic": { + "Coding assistant": { + "ratio": 1.6812907904278462, + "tokenRatio": 1.6729559748427674, + "compressed": 5, + "preserved": 8 + }, + "Long Q&A": { + "ratio": 6.158536585365853, + "tokenRatio": 6.114164904862579, + "compressed": 4, + "preserved": 
6 + }, + "Tool-heavy": { + "ratio": 1.2991563919532771, + "tokenRatio": 1.2946428571428572, + "compressed": 2, + "preserved": 16 + }, + "Short conversation": { + "ratio": 1, + "tokenRatio": 1, + "compressed": 0, + "preserved": 7 + }, + "Deep conversation": { + "ratio": 2.124913733609386, + "tokenRatio": 2.1241305510968433, + "compressed": 50, + "preserved": 1 + }, + "Technical explanation": { + "ratio": 1, + "tokenRatio": 1, + "compressed": 0, + "preserved": 11 + }, + "Structured content": { + "ratio": 1.9338990620812864, + "tokenRatio": 1.9241486068111455, + "compressed": 2, + "preserved": 10 + }, + "Agentic coding session": { + "ratio": 1.428351309707242, + "tokenRatio": 1.4258962011771001, + "compressed": 2, + "preserved": 31 + } + }, + "tokenBudget": { + "Deep conversation|dedup=false": { + "tokenCount": 3738, + "fits": false, + "recencyWindow": 0, + "compressed": 50, + "preserved": 1, + "deduped": 0 + }, + "Deep conversation|dedup=true": { + "tokenCount": 3738, + "fits": false, + "recencyWindow": 0, + "compressed": 50, + "preserved": 1, + "deduped": 0 + }, + "Agentic coding session|dedup=false": { + "tokenCount": 2345, + "fits": false, + "recencyWindow": 0, + "compressed": 4, + "preserved": 33, + "deduped": 0 + }, + "Agentic coding session|dedup=true": { + "tokenCount": 1957, + "fits": true, + "recencyWindow": 9, + "compressed": 1, + "preserved": 32, + "deduped": 4 + } + }, + "dedup": { + "Coding assistant": { + "rw0Base": 1.6812907904278462, + "rw0Dup": 1.6812907904278462, + "rw4Base": 1.5104234527687297, + "rw4Dup": 1.5104234527687297, + "deduped": 0 + }, + "Long Q&A": { + "rw0Base": 5.139949109414759, + "rw0Dup": 6.158536585365853, + "rw4Base": 1.9024298361273309, + "rw4Dup": 2.0264847512038524, + "deduped": 1 + }, + "Tool-heavy": { + "rw0Base": 1.2991563919532771, + "rw0Dup": 1.2991563919532771, + "rw4Base": 1.2991563919532771, + "rw4Dup": 1.2991563919532771, + "deduped": 0 + }, + "Short conversation": { + "rw0Base": 1, + "rw0Dup": 1, + "rw4Base": 1, + 
"rw4Dup": 1, + "deduped": 0 + }, + "Deep conversation": { + "rw0Base": 2.124913733609386, + "rw0Dup": 2.124913733609386, + "rw4Base": 1.9527165104643789, + "rw4Dup": 1.9527165104643789, + "deduped": 0 + }, + "Technical explanation": { + "rw0Base": 1, + "rw0Dup": 1, + "rw4Base": 1, + "rw4Dup": 1, + "deduped": 0 + }, + "Structured content": { + "rw0Base": 1.9338990620812864, + "rw0Dup": 1.9338990620812864, + "rw4Base": 1.373730964467005, + "rw4Dup": 1.373730964467005, + "deduped": 0 + }, + "Agentic coding session": { + "rw0Base": 1.1374233128834357, + "rw0Dup": 1.428351309707242, + "rw4Base": 1.1374233128834357, + "rw4Dup": 1.428351309707242, + "deduped": 4 + } + }, + "fuzzyDedup": { + "Coding assistant": { + "exact": 0, + "fuzzy": 0, + "ratio": 1.6812907904278462 + }, + "Long Q&A": { + "exact": 1, + "fuzzy": 0, + "ratio": 6.158536585365853 + }, + "Tool-heavy": { + "exact": 0, + "fuzzy": 0, + "ratio": 1.2991563919532771 + }, + "Short conversation": { + "exact": 0, + "fuzzy": 0, + "ratio": 1 + }, + "Deep conversation": { + "exact": 0, + "fuzzy": 0, + "ratio": 2.124913733609386 + }, + "Technical explanation": { + "exact": 0, + "fuzzy": 0, + "ratio": 1 + }, + "Structured content": { + "exact": 0, + "fuzzy": 0, + "ratio": 1.9338990620812864 + }, + "Agentic coding session": { + "exact": 4, + "fuzzy": 2, + "ratio": 2.229973538609574 + } + } + } +} diff --git a/bench/baselines/history/v1.0.0.json b/bench/baselines/history/v1.0.0.json new file mode 100644 index 0000000..d127500 --- /dev/null +++ b/bench/baselines/history/v1.0.0.json @@ -0,0 +1,190 @@ +{ + "version": "1.0.0", + "generated": "2026-02-25T13:40:26.671Z", + "results": { + "basic": { + "Coding assistant": { + "ratio": 1.6812907904278462, + "tokenRatio": 1.6729559748427674, + "compressed": 5, + "preserved": 8 + }, + "Long Q&A": { + "ratio": 6.158536585365853, + "tokenRatio": 6.114164904862579, + "compressed": 4, + "preserved": 6 + }, + "Tool-heavy": { + "ratio": 1.2991563919532771, + "tokenRatio": 
1.2946428571428572, + "compressed": 2, + "preserved": 16 + }, + "Short conversation": { + "ratio": 1, + "tokenRatio": 1, + "compressed": 0, + "preserved": 7 + }, + "Deep conversation": { + "ratio": 2.124913733609386, + "tokenRatio": 2.1241305510968433, + "compressed": 50, + "preserved": 1 + }, + "Technical explanation": { + "ratio": 1, + "tokenRatio": 1, + "compressed": 0, + "preserved": 11 + }, + "Structured content": { + "ratio": 1.9338990620812864, + "tokenRatio": 1.9241486068111455, + "compressed": 2, + "preserved": 10 + }, + "Agentic coding session": { + "ratio": 1.428351309707242, + "tokenRatio": 1.4258962011771001, + "compressed": 2, + "preserved": 31 + } + }, + "tokenBudget": { + "Deep conversation|dedup=false": { + "tokenCount": 3738, + "fits": false, + "recencyWindow": 0, + "compressed": 50, + "preserved": 1, + "deduped": 0 + }, + "Deep conversation|dedup=true": { + "tokenCount": 3738, + "fits": false, + "recencyWindow": 0, + "compressed": 50, + "preserved": 1, + "deduped": 0 + }, + "Agentic coding session|dedup=false": { + "tokenCount": 2345, + "fits": false, + "recencyWindow": 0, + "compressed": 4, + "preserved": 33, + "deduped": 0 + }, + "Agentic coding session|dedup=true": { + "tokenCount": 1957, + "fits": true, + "recencyWindow": 9, + "compressed": 1, + "preserved": 32, + "deduped": 4 + } + }, + "dedup": { + "Coding assistant": { + "rw0Base": 1.6812907904278462, + "rw0Dup": 1.6812907904278462, + "rw4Base": 1.5104234527687297, + "rw4Dup": 1.5104234527687297, + "deduped": 0 + }, + "Long Q&A": { + "rw0Base": 5.139949109414759, + "rw0Dup": 6.158536585365853, + "rw4Base": 1.9024298361273309, + "rw4Dup": 2.0264847512038524, + "deduped": 1 + }, + "Tool-heavy": { + "rw0Base": 1.2991563919532771, + "rw0Dup": 1.2991563919532771, + "rw4Base": 1.2991563919532771, + "rw4Dup": 1.2991563919532771, + "deduped": 0 + }, + "Short conversation": { + "rw0Base": 1, + "rw0Dup": 1, + "rw4Base": 1, + "rw4Dup": 1, + "deduped": 0 + }, + "Deep conversation": { + "rw0Base": 
2.124913733609386, + "rw0Dup": 2.124913733609386, + "rw4Base": 1.9527165104643789, + "rw4Dup": 1.9527165104643789, + "deduped": 0 + }, + "Technical explanation": { + "rw0Base": 1, + "rw0Dup": 1, + "rw4Base": 1, + "rw4Dup": 1, + "deduped": 0 + }, + "Structured content": { + "rw0Base": 1.9338990620812864, + "rw0Dup": 1.9338990620812864, + "rw4Base": 1.373730964467005, + "rw4Dup": 1.373730964467005, + "deduped": 0 + }, + "Agentic coding session": { + "rw0Base": 1.1374233128834357, + "rw0Dup": 1.428351309707242, + "rw4Base": 1.1374233128834357, + "rw4Dup": 1.428351309707242, + "deduped": 4 + } + }, + "fuzzyDedup": { + "Coding assistant": { + "exact": 0, + "fuzzy": 0, + "ratio": 1.6812907904278462 + }, + "Long Q&A": { + "exact": 1, + "fuzzy": 0, + "ratio": 6.158536585365853 + }, + "Tool-heavy": { + "exact": 0, + "fuzzy": 0, + "ratio": 1.2991563919532771 + }, + "Short conversation": { + "exact": 0, + "fuzzy": 0, + "ratio": 1 + }, + "Deep conversation": { + "exact": 0, + "fuzzy": 0, + "ratio": 2.124913733609386 + }, + "Technical explanation": { + "exact": 0, + "fuzzy": 0, + "ratio": 1 + }, + "Structured content": { + "exact": 0, + "fuzzy": 0, + "ratio": 1.9338990620812864 + }, + "Agentic coding session": { + "exact": 4, + "fuzzy": 2, + "ratio": 2.229973538609574 + } + } + } +} diff --git a/bench/run.ts b/bench/run.ts index ee51a8f..191c1d8 100644 --- a/bench/run.ts +++ b/bench/run.ts @@ -6,8 +6,20 @@ import { readFileSync, readdirSync, statSync, existsSync } from 'node:fs'; import { join, resolve } from 'node:path'; import { homedir } from 'node:os'; import { detectProviders } from './llm.js'; -import type { LlmBenchmarkResult, LlmMethodResult, LlmTokenBudgetResult } from './baseline.js'; -import { saveLlmResult } from './baseline.js'; +import type { + LlmBenchmarkResult, + LlmMethodResult, + LlmTokenBudgetResult, + BenchmarkResults, +} from './baseline.js'; +import { + saveLlmResult, + saveBaseline, + loadCurrentBaseline, + compareResults, + formatRegressions, + 
generateBenchmarkDocs, +} from './baseline.js'; // --------------------------------------------------------------------------- // Auto-load .env (no dependency, won't override existing vars) @@ -790,9 +802,24 @@ interface Result { } async function run(): Promise { + const args = process.argv.slice(2); + const flagSave = args.includes('--save'); + const flagCheck = args.includes('--check'); + const flagLlm = args.includes('--llm'); + const toleranceIdx = args.indexOf('--tolerance'); + const tolerance = toleranceIdx >= 0 ? Number(args[toleranceIdx + 1]) / 100 : 0; + const scenarios = buildScenarios(); const results: Result[] = []; + // Structured results for baseline save/check + const benchResults: BenchmarkResults = { + basic: {}, + tokenBudget: {}, + dedup: {}, + fuzzyDedup: {}, + }; + for (const scenario of scenarios) { const t0 = performance.now(); @@ -820,6 +847,13 @@ async function run(): Promise { roundTrip, timeMs: (t1 - t0).toFixed(2), }); + + benchResults.basic[scenario.name] = { + ratio: cr.compression.ratio, + tokenRatio: cr.compression.token_ratio, + compressed: cr.compression.messages_compressed, + preserved: cr.compression.messages_preserved, + }; } // Print table @@ -949,6 +983,16 @@ async function run(): Promise { ((t1 - t0).toFixed(2) + 'ms').padStart(cols.time), ].join(' '), ); + + const tbKey = `${scenario.name}|dedup=${dedup}`; + benchResults.tokenBudget[tbKey] = { + tokenCount: cr.tokenCount ?? 0, + fits: cr.fits ?? false, + recencyWindow: cr.recencyWindow, + compressed: cr.compression.messages_compressed, + preserved: cr.compression.messages_preserved, + deduped: cr.compression.messages_deduped ?? 
0, + }; } } @@ -1012,6 +1056,14 @@ async function run(): Promise { rt2.padStart(cols.rt), ].join(' '), ); + + benchResults.dedup[scenario.name] = { + rw0Base: baseRw0.compression.ratio, + rw0Dup: dedupRw0.compression.ratio, + rw4Base: baseRw4.compression.ratio, + rw4Dup: dedupRw4.compression.ratio, + deduped, + }; } console.log(dedupSep); @@ -1070,6 +1122,12 @@ async function run(): Promise { ((t1 - t0).toFixed(2) + 'ms').padStart(cols.time), ].join(' '), ); + + benchResults.fuzzyDedup[scenario.name] = { + exact: cr.compression.messages_deduped ?? 0, + fuzzy: cr.compression.messages_fuzzy_deduped ?? 0, + ratio: cr.compression.ratio, + }; } console.log(fuzzySep); @@ -1079,6 +1137,43 @@ async function run(): Promise { process.exit(1); } + // --------------------------------------------------------------------------- + // --save / --check + // --------------------------------------------------------------------------- + + const baselinesDir = resolve(import.meta.dirname, 'baselines'); + const version = JSON.parse( + readFileSync(resolve(import.meta.dirname, '..', 'package.json'), 'utf-8'), + ).version; + + if (flagSave) { + saveBaseline(baselinesDir, version, benchResults); + generateBenchmarkDocs( + baselinesDir, + resolve(import.meta.dirname, '..', 'docs', 'benchmarks.md'), + ); + console.log(); + console.log(`Baseline saved (v${version}) and docs/benchmarks.md regenerated.`); + } + + if (flagCheck) { + const current = loadCurrentBaseline(baselinesDir); + if (!current) { + console.error( + 'No baseline found at bench/baselines/current.json — run `npm run bench:save` first.', + ); + process.exit(1); + } + const regressions = compareResults(current.results, benchResults, tolerance); + if (regressions.length > 0) { + console.error(); + console.error(formatRegressions(regressions)); + process.exit(1); + } + console.log(); + console.log(`Baseline check passed (v${current.version}, tolerance ${tolerance * 100}%).`); + } + // 
--------------------------------------------------------------------------- // Real Claude Code sessions (if available locally) // --------------------------------------------------------------------------- @@ -1086,7 +1181,7 @@ async function run(): Promise { runRealSessions(); // LLM benchmarks require explicit --llm flag (they cost money and take minutes) - if (process.argv.includes('--llm')) { + if (flagLlm) { await runLlmBenchmark(); } diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 4a4346f..9888243 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -17,11 +17,11 @@ npm run bench:save # Run, save new baseline, regenerate this doc LLM benchmarks require the `--llm` flag (`npm run bench:llm`). Set API keys in a `.env` file or export them. Ollama is auto-detected when running locally. -| Variable | Provider | Default Model | Notes | -| ------------------- | --------- | --------------------------- | -------------------------------- | -| `OPENAI_API_KEY` | OpenAI | `gpt-4.1-mini` | | -| `ANTHROPIC_API_KEY` | Anthropic | `claude-haiku-4-5-20251001` | | -| _(none required)_ | Ollama | `llama3.2` | Auto-detected on localhost:11434 | +| Variable | Provider | Default Model | Notes | +| --- | --- | --- | --- | +| `OPENAI_API_KEY` | OpenAI | `gpt-4.1-mini` | | +| `ANTHROPIC_API_KEY` | Anthropic | `claude-haiku-4-5-20251001` | | +| *(none required)* | Ollama | `llama3.2` | Auto-detected on localhost:11434 | ## Current Results (v1.0.0) @@ -29,78 +29,78 @@ LLM benchmarks require the `--llm` flag (`npm run bench:llm`). 
Set API keys in a **Range:** 1.00x – 6.16x · **Average:** 2.08x · **Round-trip:** all PASS -| Scenario | Char Ratio | Token Ratio | Compressed | Preserved | -| ---------------------- | ---------: | ----------: | ---------: | --------: | -| Coding assistant | 1.68 | 1.67 | 5 | 8 | -| Long Q&A | 6.16 | 6.11 | 4 | 6 | -| Tool-heavy | 1.30 | 1.29 | 2 | 16 | -| Short conversation | 1.00 | 1.00 | 0 | 7 | -| Deep conversation | 2.12 | 2.12 | 50 | 1 | -| Technical explanation | 1.00 | 1.00 | 0 | 11 | -| Structured content | 1.93 | 1.92 | 2 | 10 | -| Agentic coding session | 1.43 | 1.43 | 2 | 31 | +| Scenario | Char Ratio | Token Ratio | Compressed | Preserved | +| --- | ---: | ---: | ---: | ---: | +| Coding assistant | 1.68 | 1.67 | 5 | 8 | +| Long Q&A | 6.16 | 6.11 | 4 | 6 | +| Tool-heavy | 1.30 | 1.29 | 2 | 16 | +| Short conversation | 1.00 | 1.00 | 0 | 7 | +| Deep conversation | 2.12 | 2.12 | 50 | 1 | +| Technical explanation | 1.00 | 1.00 | 0 | 11 | +| Structured content | 1.93 | 1.92 | 2 | 10 | +| Agentic coding session | 1.43 | 1.43 | 2 | 31 | ### Token Budget (target: 2000 tokens) -| Scenario | Dedup | Tokens | Fits | recencyWindow | Compressed | Preserved | Deduped | -| ---------------------- | ----- | -----: | ----- | ------------: | ---------: | --------: | ------: | -| Deep conversation | no | 3738 | false | 0 | 50 | 1 | 0 | -| Deep conversation | yes | 3738 | false | 0 | 50 | 1 | 0 | -| Agentic coding session | no | 2345 | false | 0 | 4 | 33 | 0 | -| Agentic coding session | yes | 1957 | true | 9 | 1 | 32 | 4 | +| Scenario | Dedup | Tokens | Fits | recencyWindow | Compressed | Preserved | Deduped | +| --- | --- | ---: | --- | ---: | ---: | ---: | ---: | +| Deep conversation | no | 3738 | false | 0 | 50 | 1 | 0 | +| Deep conversation | yes | 3738 | false | 0 | 50 | 1 | 0 | +| Agentic coding session | no | 2345 | false | 0 | 4 | 33 | 0 | +| Agentic coding session | yes | 1957 | true | 9 | 1 | 32 | 4 | ### Dedup Effectiveness -| Scenario | No Dedup (rw=0) | Dedup 
(rw=0) | No Dedup (rw=4) | Dedup (rw=4) | Deduped | -| ---------------------- | --------------: | -----------: | --------------: | -----------: | ------: | -| Coding assistant | 1.68 | 1.68 | 1.51 | 1.51 | 0 | -| Long Q&A | 5.14 | 6.16 | 1.90 | 2.03 | 1 | -| Tool-heavy | 1.30 | 1.30 | 1.30 | 1.30 | 0 | -| Short conversation | 1.00 | 1.00 | 1.00 | 1.00 | 0 | -| Deep conversation | 2.12 | 2.12 | 1.95 | 1.95 | 0 | -| Technical explanation | 1.00 | 1.00 | 1.00 | 1.00 | 0 | -| Structured content | 1.93 | 1.93 | 1.37 | 1.37 | 0 | -| Agentic coding session | 1.14 | 1.43 | 1.14 | 1.43 | 4 | +| Scenario | No Dedup (rw=0) | Dedup (rw=0) | No Dedup (rw=4) | Dedup (rw=4) | Deduped | +| --- | ---: | ---: | ---: | ---: | ---: | +| Coding assistant | 1.68 | 1.68 | 1.51 | 1.51 | 0 | +| Long Q&A | 5.14 | 6.16 | 1.90 | 2.03 | 1 | +| Tool-heavy | 1.30 | 1.30 | 1.30 | 1.30 | 0 | +| Short conversation | 1.00 | 1.00 | 1.00 | 1.00 | 0 | +| Deep conversation | 2.12 | 2.12 | 1.95 | 1.95 | 0 | +| Technical explanation | 1.00 | 1.00 | 1.00 | 1.00 | 0 | +| Structured content | 1.93 | 1.93 | 1.37 | 1.37 | 0 | +| Agentic coding session | 1.14 | 1.43 | 1.14 | 1.43 | 4 | ### Fuzzy Dedup -| Scenario | Exact Deduped | Fuzzy Deduped | Ratio | -| ---------------------- | ------------: | ------------: | ----: | -| Coding assistant | 0 | 0 | 1.68 | -| Long Q&A | 1 | 0 | 6.16 | -| Tool-heavy | 0 | 0 | 1.30 | -| Short conversation | 0 | 0 | 1.00 | -| Deep conversation | 0 | 0 | 2.12 | -| Technical explanation | 0 | 0 | 1.00 | -| Structured content | 0 | 0 | 1.93 | -| Agentic coding session | 4 | 2 | 2.23 | +| Scenario | Exact Deduped | Fuzzy Deduped | Ratio | +| --- | ---: | ---: | ---: | +| Coding assistant | 0 | 0 | 1.68 | +| Long Q&A | 1 | 0 | 6.16 | +| Tool-heavy | 0 | 0 | 1.30 | +| Short conversation | 0 | 0 | 1.00 | +| Deep conversation | 0 | 0 | 2.12 | +| Technical explanation | 0 | 0 | 1.00 | +| Structured content | 0 | 0 | 1.93 | +| Agentic coding session | 4 | 2 | 2.23 | ## Scenarios The 
benchmark covers 8 conversation types: -| Scenario | Description | -| ---------------------- | -------------------------------------------------------- | -| Coding assistant | Mixed code fences and prose discussion | -| Long Q&A | Extended question-and-answer with repeated paragraphs | -| Tool-heavy | Messages with `tool_calls` arrays (preserved by default) | -| Short conversation | Brief exchanges, mostly under 120 chars | -| Deep conversation | 25 turns of multi-paragraph prose | -| Technical explanation | Pure prose Q&A about event-driven architecture | -| Structured content | JSON, YAML, SQL, API keys, test output | -| Agentic coding session | Repeated file reads, grep results, near-duplicate edits | +| Scenario | Description | +| --- | --- | +| Coding assistant | Mixed code fences and prose discussion | +| Long Q&A | Extended question-and-answer with repeated paragraphs | +| Tool-heavy | Messages with `tool_calls` arrays (preserved by default) | +| Short conversation | Brief exchanges, mostly under 120 chars | +| Deep conversation | 25 turns of multi-paragraph prose | +| Technical explanation | Pure prose Q&A about event-driven architecture | +| Structured content | JSON, YAML, SQL, API keys, test output | +| Agentic coding session | Repeated file reads, grep results, near-duplicate edits | ## Interpreting Results ### Compression ratio -| Ratio | Reduction | -| ----: | --------------------------------------- | -| 1.0x | no compression (all messages preserved) | -| 1.5x | 33% reduction | -| 2.0x | 50% reduction | -| 3.0x | 67% reduction | -| 6.0x | 83% reduction | +| Ratio | Reduction | +| ---: | --- | +| 1.0x | no compression (all messages preserved) | +| 1.5x | 33% reduction | +| 2.0x | 50% reduction | +| 3.0x | 67% reduction | +| 6.0x | 83% reduction | Higher is better. Token ratio is more meaningful for LLM context budgeting; character ratio is useful for storage. 
@@ -115,10 +115,11 @@ Baselines are stored in [`bench/baselines/`](../bench/baselines/) as JSON. CI ru ### Baseline files -| File | Purpose | -| ------------------------------ | ------------------------------------ | -| `bench/baselines/current.json` | Active baseline compared in CI | -| `bench/baselines/v*.json` | Versioned snapshots, one per release | +| File | Purpose | +| --- | --- | +| `bench/baselines/current.json` | Active baseline compared in CI | +| `bench/baselines/history/v*.json` | Versioned snapshots, one per release | +| `bench/baselines/llm/*.json` | LLM benchmark reference data (non-deterministic) | ## LLM Summarization Comparison @@ -126,77 +127,77 @@ Baselines are stored in [`bench/baselines/`](../bench/baselines/) as JSON. CI ru ### ollama (llama3.2) -_Generated: 2026-02-25_ - -| Scenario | Method | Char Ratio | Token Ratio | vs Det | Compressed | Preserved | Round-trip | Time | -| ---------------------- | ------------- | ---------: | ----------: | -----: | ---------: | --------: | ---------- | ----: | -| Coding assistant | deterministic | 1.68 | 1.67 | - | 5 | 8 | PASS | 0ms | -| | llm-basic | 1.48 | 1.48 | 0.88 | 5 | 8 | PASS | 5.9s | -| | llm-escalate | 1.55 | 1.55 | 0.92 | 5 | 8 | PASS | 3.0s | -| Long Q&A | deterministic | 6.16 | 6.11 | - | 4 | 6 | PASS | 1ms | -| | llm-basic | 4.31 | 4.28 | 0.70 | 4 | 6 | PASS | 4.1s | -| | llm-escalate | 4.49 | 4.46 | 0.73 | 4 | 6 | PASS | 3.7s | -| Tool-heavy | deterministic | 1.30 | 1.29 | - | 2 | 16 | PASS | 2ms | -| | llm-basic | 1.12 | 1.11 | 0.86 | 2 | 16 | PASS | 2.3s | -| | llm-escalate | 1.28 | 1.28 | 0.99 | 2 | 16 | PASS | 2.8s | -| Deep conversation | deterministic | 2.12 | 2.12 | - | 50 | 1 | PASS | 3ms | -| | llm-basic | 3.12 | 3.11 | 1.47 | 50 | 1 | PASS | 22.7s | -| | llm-escalate | 3.28 | 3.26 | 1.54 | 50 | 1 | PASS | 23.3s | -| Technical explanation | deterministic | 1.00 | 1.00 | - | 0 | 11 | PASS | 1ms | -| | llm-basic | 1.00 | 1.00 | 1.00 | 0 | 11 | PASS | 3.2s | -| | llm-escalate | 
1.00 | 1.00 | 1.00 | 2 | 9 | PASS | 785ms | -| Structured content | deterministic | 1.93 | 1.92 | - | 2 | 10 | PASS | 0ms | -| | llm-basic | 1.46 | 1.45 | 0.75 | 2 | 10 | PASS | 3.5s | -| | llm-escalate | 1.38 | 1.38 | 0.71 | 2 | 10 | PASS | 3.7s | -| Agentic coding session | deterministic | 1.43 | 1.43 | - | 2 | 31 | PASS | 1ms | -| | llm-basic | 1.35 | 1.34 | 0.94 | 2 | 31 | PASS | 3.3s | -| | llm-escalate | 1.40 | 1.40 | 0.98 | 2 | 31 | PASS | 5.4s | +*Generated: 2026-02-25* + +| Scenario | Method | Char Ratio | Token Ratio | vs Det | Compressed | Preserved | Round-trip | Time | +| --- | --- | ---: | ---: | ---: | ---: | ---: | --- | ---: | +| Coding assistant | deterministic | 1.68 | 1.67 | - | 5 | 8 | PASS | 0ms | +| | llm-basic | 1.48 | 1.48 | 0.88 | 5 | 8 | PASS | 5.9s | +| | llm-escalate | 1.55 | 1.55 | 0.92 | 5 | 8 | PASS | 3.0s | +| Long Q&A | deterministic | 6.16 | 6.11 | - | 4 | 6 | PASS | 1ms | +| | llm-basic | 4.31 | 4.28 | 0.70 | 4 | 6 | PASS | 4.1s | +| | llm-escalate | 4.49 | 4.46 | 0.73 | 4 | 6 | PASS | 3.7s | +| Tool-heavy | deterministic | 1.30 | 1.29 | - | 2 | 16 | PASS | 2ms | +| | llm-basic | 1.12 | 1.11 | 0.86 | 2 | 16 | PASS | 2.3s | +| | llm-escalate | 1.28 | 1.28 | 0.99 | 2 | 16 | PASS | 2.8s | +| Deep conversation | deterministic | 2.12 | 2.12 | - | 50 | 1 | PASS | 3ms | +| | llm-basic | 3.12 | 3.11 | 1.47 | 50 | 1 | PASS | 22.7s | +| | llm-escalate | 3.28 | 3.26 | 1.54 | 50 | 1 | PASS | 23.3s | +| Technical explanation | deterministic | 1.00 | 1.00 | - | 0 | 11 | PASS | 1ms | +| | llm-basic | 1.00 | 1.00 | 1.00 | 0 | 11 | PASS | 3.2s | +| | llm-escalate | 1.00 | 1.00 | 1.00 | 2 | 9 | PASS | 785ms | +| Structured content | deterministic | 1.93 | 1.92 | - | 2 | 10 | PASS | 0ms | +| | llm-basic | 1.46 | 1.45 | 0.75 | 2 | 10 | PASS | 3.5s | +| | llm-escalate | 1.38 | 1.38 | 0.71 | 2 | 10 | PASS | 3.7s | +| Agentic coding session | deterministic | 1.43 | 1.43 | - | 2 | 31 | PASS | 1ms | +| | llm-basic | 1.35 | 1.34 | 0.94 | 2 | 31 | PASS | 
3.3s | +| | llm-escalate | 1.40 | 1.40 | 0.98 | 2 | 31 | PASS | 5.4s | #### Token Budget (target: 2000 tokens) -| Scenario | Method | Tokens | Fits | recencyWindow | Ratio | Round-trip | Time | -| ---------------------- | ------------- | -----: | ----- | ------------: | ----: | ---------- | -----: | -| Deep conversation | deterministic | 3738 | false | 0 | 2.12 | PASS | 12ms | -| | llm-escalate | 2593 | false | 0 | 3.08 | PASS | 132.0s | -| Agentic coding session | deterministic | 1957 | true | 9 | 1.36 | PASS | 2ms | -| | llm-escalate | 2003 | false | 9 | 1.33 | PASS | 4.1s | +| Scenario | Method | Tokens | Fits | recencyWindow | Ratio | Round-trip | Time | +| --- | --- | ---: | --- | ---: | ---: | --- | ---: | +| Deep conversation | deterministic | 3738 | false | 0 | 2.12 | PASS | 12ms | +| | llm-escalate | 2593 | false | 0 | 3.08 | PASS | 132.0s | +| Agentic coding session | deterministic | 1957 | true | 9 | 1.36 | PASS | 2ms | +| | llm-escalate | 2003 | false | 9 | 1.33 | PASS | 4.1s | ### openai (gpt-4.1-mini) -_Generated: 2026-02-25_ - -| Scenario | Method | Char Ratio | Token Ratio | vs Det | Compressed | Preserved | Round-trip | Time | -| ---------------------- | ------------- | ---------: | ----------: | -----: | ---------: | --------: | ---------- | ----: | -| Coding assistant | deterministic | 1.68 | 1.67 | - | 5 | 8 | PASS | 0ms | -| | llm-basic | 1.64 | 1.63 | 0.98 | 5 | 8 | PASS | 5.6s | -| | llm-escalate | 1.63 | 1.63 | 0.97 | 5 | 8 | PASS | 6.0s | -| Long Q&A | deterministic | 6.16 | 6.11 | - | 4 | 6 | PASS | 1ms | -| | llm-basic | 5.37 | 5.33 | 0.87 | 4 | 6 | PASS | 5.9s | -| | llm-escalate | 5.35 | 5.31 | 0.87 | 4 | 6 | PASS | 7.0s | -| Tool-heavy | deterministic | 1.30 | 1.29 | - | 2 | 16 | PASS | 0ms | -| | llm-basic | 1.11 | 1.10 | 0.85 | 2 | 16 | PASS | 3.5s | -| | llm-escalate | 1.12 | 1.12 | 0.86 | 2 | 16 | PASS | 5.3s | -| Deep conversation | deterministic | 2.12 | 2.12 | - | 50 | 1 | PASS | 3ms | -| | llm-basic | 2.34 | 2.33 | 1.10 | 50 | 
1 | PASS | 50.4s | -| | llm-escalate | 2.37 | 2.36 | 1.11 | 50 | 1 | PASS | 50.8s | -| Technical explanation | deterministic | 1.00 | 1.00 | - | 0 | 11 | PASS | 1ms | -| | llm-basic | 1.00 | 1.00 | 1.00 | 1 | 10 | PASS | 2.6s | -| | llm-escalate | 1.00 | 1.00 | 1.00 | 1 | 10 | PASS | 3.3s | -| Structured content | deterministic | 1.93 | 1.92 | - | 2 | 10 | PASS | 0ms | -| | llm-basic | 1.23 | 1.23 | 0.64 | 2 | 10 | PASS | 10.2s | -| | llm-escalate | 1.29 | 1.29 | 0.67 | 2 | 10 | PASS | 4.8s | -| Agentic coding session | deterministic | 1.43 | 1.43 | - | 2 | 31 | PASS | 1ms | -| | llm-basic | 1.43 | 1.43 | 1.00 | 2 | 31 | PASS | 5.8s | -| | llm-escalate | 1.32 | 1.32 | 0.93 | 1 | 32 | PASS | 9.5s | +*Generated: 2026-02-25* + +| Scenario | Method | Char Ratio | Token Ratio | vs Det | Compressed | Preserved | Round-trip | Time | +| --- | --- | ---: | ---: | ---: | ---: | ---: | --- | ---: | +| Coding assistant | deterministic | 1.68 | 1.67 | - | 5 | 8 | PASS | 0ms | +| | llm-basic | 1.64 | 1.63 | 0.98 | 5 | 8 | PASS | 5.6s | +| | llm-escalate | 1.63 | 1.63 | 0.97 | 5 | 8 | PASS | 6.0s | +| Long Q&A | deterministic | 6.16 | 6.11 | - | 4 | 6 | PASS | 1ms | +| | llm-basic | 5.37 | 5.33 | 0.87 | 4 | 6 | PASS | 5.9s | +| | llm-escalate | 5.35 | 5.31 | 0.87 | 4 | 6 | PASS | 7.0s | +| Tool-heavy | deterministic | 1.30 | 1.29 | - | 2 | 16 | PASS | 0ms | +| | llm-basic | 1.11 | 1.10 | 0.85 | 2 | 16 | PASS | 3.5s | +| | llm-escalate | 1.12 | 1.12 | 0.86 | 2 | 16 | PASS | 5.3s | +| Deep conversation | deterministic | 2.12 | 2.12 | - | 50 | 1 | PASS | 3ms | +| | llm-basic | 2.34 | 2.33 | 1.10 | 50 | 1 | PASS | 50.4s | +| | llm-escalate | 2.37 | 2.36 | 1.11 | 50 | 1 | PASS | 50.8s | +| Technical explanation | deterministic | 1.00 | 1.00 | - | 0 | 11 | PASS | 1ms | +| | llm-basic | 1.00 | 1.00 | 1.00 | 1 | 10 | PASS | 2.6s | +| | llm-escalate | 1.00 | 1.00 | 1.00 | 1 | 10 | PASS | 3.3s | +| Structured content | deterministic | 1.93 | 1.92 | - | 2 | 10 | PASS | 0ms | +| | llm-basic 
| 1.23 | 1.23 | 0.64 | 2 | 10 | PASS | 10.2s | +| | llm-escalate | 1.29 | 1.29 | 0.67 | 2 | 10 | PASS | 4.8s | +| Agentic coding session | deterministic | 1.43 | 1.43 | - | 2 | 31 | PASS | 1ms | +| | llm-basic | 1.43 | 1.43 | 1.00 | 2 | 31 | PASS | 5.8s | +| | llm-escalate | 1.32 | 1.32 | 0.93 | 1 | 32 | PASS | 9.5s | #### Token Budget (target: 2000 tokens) -| Scenario | Method | Tokens | Fits | recencyWindow | Ratio | Round-trip | Time | -| ---------------------- | ------------- | -----: | ----- | ------------: | ----: | ---------- | -----: | -| Deep conversation | deterministic | 3738 | false | 0 | 2.12 | PASS | 10ms | -| | llm-escalate | 3391 | false | 0 | 2.35 | PASS | 280.5s | -| Agentic coding session | deterministic | 1957 | true | 9 | 1.36 | PASS | 2ms | -| | llm-escalate | 1915 | true | 3 | 1.39 | PASS | 28.1s | +| Scenario | Method | Tokens | Fits | recencyWindow | Ratio | Round-trip | Time | +| --- | --- | ---: | --- | ---: | ---: | --- | ---: | +| Deep conversation | deterministic | 3738 | false | 0 | 2.12 | PASS | 10ms | +| | llm-escalate | 3391 | false | 0 | 2.35 | PASS | 280.5s | +| Agentic coding session | deterministic | 1957 | true | 9 | 1.36 | PASS | 2ms | +| | llm-escalate | 1915 | true | 3 | 1.39 | PASS | 28.1s | ## Methodology From 90f3c04e3608a19193de02dde54371ff1c3539e6 Mon Sep 17 00:00:00 2001 From: Lisa Date: Wed, 25 Feb 2026 15:49:25 +0100 Subject: [PATCH 04/13] docs(bench): split benchmark docs into handbook + auto-generated results Split docs/benchmarks.md into two files: - docs/benchmarks.md: hand-written handbook (how to run, scenarios, interpreting results, regression testing) - docs/benchmark-results.md: auto-generated by bench:save with Mermaid xychart-beta charts, summary table, and polished data presentation Rewrite generateBenchmarkDocs() with compression ratio chart, dedup impact chart, LLM comparison chart, key findings callout, and conditional sections for LLM data and version history. 
--- CLAUDE.md | 1 + README.md | 2 +- bench/baseline.ts | 509 +++++++++++++++++----------- bench/baselines/current.json | 2 +- bench/baselines/history/v1.0.0.json | 2 +- bench/run.ts | 4 +- docs/README.md | 1 + docs/benchmark-results.md | 188 ++++++++++ docs/benchmarks.md | 167 ++------- 9 files changed, 537 insertions(+), 339 deletions(-) create mode 100644 docs/benchmark-results.md diff --git a/CLAUDE.md b/CLAUDE.md index 1c2a457..1131aa9 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -13,6 +13,7 @@ npm run lint # ESLint check npm run format # Prettier write npm run format:check # Prettier check npm run bench # Run benchmark suite +npm run bench:save # Run, save baseline, regenerate docs/benchmark-results.md ``` Run a single test file: diff --git a/README.md b/README.md index 11a8981..9e00710 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ const { messages: originals } = uncompress(compressed, verbatim); No API keys. No network calls. Runs synchronously by default. Under 2ms for typical conversations. -The classifier is content-aware, not domain-specific. It preserves structured data (code, JSON, SQL, tables, citations, formulas) and compresses surrounding prose — making it useful anywhere dense reference material is mixed with natural language: LLM conversations, legal briefs, medical records, technical documentation, support logs. +The classifier is content-aware, not domain-specific. It preserves structured data (code, JSON, SQL, tables, citations, formulas) and compresses surrounding prose — optimized for LLM conversations and technical documentation. ## Key findings diff --git a/bench/baseline.ts b/bench/baseline.ts index b44aef6..7f14994 100644 --- a/bench/baseline.ts +++ b/bench/baseline.ts @@ -359,23 +359,145 @@ function fix(n: number, d: number = 2): string { return n.toFixed(d); } -function generateSection(b: Baseline): string { +/** Shorten scenario names for chart x-axis labels. 
*/ +const SHORT_NAMES: Record = { + 'Coding assistant': 'Coding', + 'Long Q&A': 'Long Q&A', + 'Tool-heavy': 'Tool-heavy', + 'Short conversation': 'Short', + 'Deep conversation': 'Deep', + 'Technical explanation': 'Technical', + 'Structured content': 'Structured', + 'Agentic coding session': 'Agentic', +}; + +function shortName(name: string): string { + return SHORT_NAMES[name] ?? name; +} + +function formatTime(ms: number): string { + return ms < 1000 ? `${Math.round(ms)}ms` : `${(ms / 1000).toFixed(1)}s`; +} + +// --------------------------------------------------------------------------- +// Mermaid chart helpers +// --------------------------------------------------------------------------- + +function compressionChart(basic: Record): string[] { + const entries = Object.entries(basic); + const labels = entries.map(([n]) => `"${shortName(n)}"`).join(', '); + const values = entries.map(([, v]) => fix(v.ratio)).join(', '); + + return [ + '```mermaid', + 'xychart-beta', + ' title "Compression Ratio by Scenario"', + ` x-axis [${labels}]`, + ' y-axis "Char Ratio"', + ` bar [${values}]`, + '```', + ]; +} + +function dedupChart(dedup: Record): string[] { + // Only include scenarios where dedup actually changes the ratio + const entries = Object.entries(dedup).filter(([, v]) => v.rw0Base !== v.rw0Dup || v.deduped > 0); + if (entries.length === 0) return []; + + const labels = entries.map(([n]) => `"${shortName(n)}"`).join(', '); + const base = entries.map(([, v]) => fix(v.rw0Base)).join(', '); + const exact = entries.map(([, v]) => fix(v.rw0Dup)).join(', '); + + return [ + '```mermaid', + 'xychart-beta', + ' title "Deduplication Impact (recencyWindow=0)"', + ` x-axis [${labels}]`, + ' y-axis "Char Ratio"', + ` bar [${base}]`, + ` bar [${exact}]`, + '```', + '', + '*First bar: no dedup · Second bar: with dedup*', + ]; +} + +function llmComparisonChart( + basic: Record, + llmResults: LlmBenchmarkResult[], +): string[] { + // Use the best LLM result (highest average vsDet) 
for the chart + let bestLlm: LlmBenchmarkResult | undefined; + let bestAvg = -Infinity; + for (const llm of llmResults) { + const vsDetValues: number[] = []; + for (const sr of Object.values(llm.scenarios)) { + for (const mr of Object.values(sr.methods)) { + if (mr.vsDet != null && mr.vsDet > 0) vsDetValues.push(mr.vsDet); + } + } + const avg = vsDetValues.length > 0 ? vsDetValues.reduce((a, b) => a + b, 0) / vsDetValues.length : 0; + if (avg > bestAvg) { + bestAvg = avg; + bestLlm = llm; + } + } + if (!bestLlm) return []; + + // Match scenarios that exist in both basic and LLM results + const sharedScenarios = Object.keys(basic).filter((s) => s in bestLlm!.scenarios); + if (sharedScenarios.length === 0) return []; + + const labels = sharedScenarios.map((n) => `"${shortName(n)}"`).join(', '); + const detValues = sharedScenarios.map((s) => fix(basic[s].ratio)).join(', '); + + // Pick the best LLM method per scenario (highest ratio) + const llmValues = sharedScenarios + .map((s) => { + const methods = Object.values(bestLlm!.scenarios[s].methods).filter( + (m) => m.vsDet != null, + ); + if (methods.length === 0) return fix(basic[s].ratio); + return fix(Math.max(...methods.map((m) => m.ratio))); + }) + .join(', '); + + return [ + '```mermaid', + 'xychart-beta', + ` title "Deterministic vs LLM (${bestLlm.provider}/${bestLlm.model})"`, + ` x-axis [${labels}]`, + ' y-axis "Char Ratio"', + ` bar [${detValues}]`, + ` bar [${llmValues}]`, + '```', + '', + '*First bar: deterministic · Second bar: best LLM method*', + ]; +} + +// --------------------------------------------------------------------------- +// Section generators +// --------------------------------------------------------------------------- + +function generateCompressionSection(b: Baseline): string[] { const lines: string[] = []; const r = b.results; - - // Basic compression table const basicEntries = Object.entries(r.basic); const ratios = basicEntries.map(([, v]) => v.ratio); const minR = Math.min(...ratios); 
const maxR = Math.max(...ratios); const avgR = ratios.reduce((a, b) => a + b, 0) / ratios.length; - lines.push(`### Basic Compression`); + lines.push('## Compression by Scenario'); lines.push(''); lines.push( - `**Range:** ${fix(minR)}x \u2013 ${fix(maxR)}x \u00b7 **Average:** ${fix(avgR)}x \u00b7 **Round-trip:** all PASS`, + `> **${basicEntries.length} scenarios** · **${fix(avgR)}x** avg ratio · ` + + `**${fix(minR)}x** – **${fix(maxR)}x** range · all round-trips PASS`, ); lines.push(''); + lines.push(...compressionChart(r.basic)); + lines.push(''); lines.push('| Scenario | Char Ratio | Token Ratio | Compressed | Preserved |'); lines.push('| --- | ---: | ---: | ---: | ---: |'); for (const [name, v] of basicEntries) { @@ -383,27 +505,20 @@ function generateSection(b: Baseline): string { `| ${name} | ${fix(v.ratio)} | ${fix(v.tokenRatio)} | ${v.compressed} | ${v.preserved} |`, ); } + return lines; +} - // Token budget table - lines.push(''); - lines.push('### Token Budget (target: 2000 tokens)'); +function generateDedupSection(r: BenchmarkResults): string[] { + const lines: string[] = []; + lines.push('## Deduplication Impact'); lines.push(''); - lines.push( - '| Scenario | Dedup | Tokens | Fits | recencyWindow | Compressed | Preserved | Deduped |', - ); - lines.push('| --- | --- | ---: | --- | ---: | ---: | ---: | ---: |'); - for (const [key, v] of Object.entries(r.tokenBudget)) { - const [name, dedupStr] = key.split('|'); - const dedup = dedupStr === 'dedup=true' ? 'yes' : 'no'; - lines.push( - `| ${name} | ${dedup} | ${v.tokenCount} | ${v.fits} | ${v.recencyWindow ?? 
'-'} | ${v.compressed} | ${v.preserved} | ${v.deduped} |`, - ); + + const chart = dedupChart(r.dedup); + if (chart.length > 0) { + lines.push(...chart); + lines.push(''); } - // Dedup comparison table - lines.push(''); - lines.push('### Dedup Effectiveness'); - lines.push(''); lines.push( '| Scenario | No Dedup (rw=0) | Dedup (rw=0) | No Dedup (rw=4) | Dedup (rw=4) | Deduped |', ); @@ -413,219 +528,237 @@ function generateSection(b: Baseline): string { `| ${name} | ${fix(v.rw0Base)} | ${fix(v.rw0Dup)} | ${fix(v.rw4Base)} | ${fix(v.rw4Dup)} | ${v.deduped} |`, ); } - - // Fuzzy dedup table - lines.push(''); - lines.push('### Fuzzy Dedup'); lines.push(''); + + // Fuzzy dedup detail + const hasFuzzy = Object.values(r.fuzzyDedup).some((v) => v.fuzzy > 0); + if (hasFuzzy) { + lines.push('### Fuzzy Dedup'); + lines.push(''); + } lines.push('| Scenario | Exact Deduped | Fuzzy Deduped | Ratio |'); lines.push('| --- | ---: | ---: | ---: |'); for (const [name, v] of Object.entries(r.fuzzyDedup)) { lines.push(`| ${name} | ${v.exact} | ${v.fuzzy} | ${fix(v.ratio)} |`); } - - return lines.join('\n'); + return lines; } -export function generateBenchmarkDocs(baselinesDir: string, outputPath: string): void { - const baselines = loadAllBaselines(baselinesDir); - if (baselines.length === 0) return; - - const latest = baselines[baselines.length - 1]; +function generateTokenBudgetSection(r: BenchmarkResults): string[] { const lines: string[] = []; + const entries = Object.entries(r.tokenBudget); + const allFit = entries.every(([, v]) => v.fits); + const fitCount = entries.filter(([, v]) => v.fits).length; - lines.push('# Benchmark Results'); - lines.push(''); - lines.push('[Back to README](../README.md) | [All docs](README.md)'); - lines.push(''); - lines.push(''); - lines.push(''); - lines.push(''); - - // --- How to run section --- - lines.push('## Running Benchmarks'); + lines.push('## Token Budget'); lines.push(''); - lines.push('```bash'); - lines.push('npm run bench # Run 
benchmarks (no baseline check)'); - lines.push('npm run bench:check # Run and compare against baseline'); - lines.push('npm run bench:save # Run, save new baseline, regenerate this doc'); - lines.push('```'); - lines.push(''); - lines.push('### LLM benchmarks (opt-in)'); + lines.push(`Target: **2000 tokens** · ${allFit ? 'all fit' : `${fitCount}/${entries.length} fit`}`); lines.push(''); lines.push( - 'LLM benchmarks require the `--llm` flag (`npm run bench:llm`). Set API keys in a `.env` file or export them. Ollama is auto-detected when running locally.', + '| Scenario | Dedup | Tokens | Fits | recencyWindow | Compressed | Preserved | Deduped |', ); - lines.push(''); - lines.push('| Variable | Provider | Default Model | Notes |'); - lines.push('| --- | --- | --- | --- |'); - lines.push('| `OPENAI_API_KEY` | OpenAI | `gpt-4.1-mini` | |'); - lines.push('| `ANTHROPIC_API_KEY` | Anthropic | `claude-haiku-4-5-20251001` | |'); - lines.push('| *(none required)* | Ollama | `llama3.2` | Auto-detected on localhost:11434 |'); - lines.push(''); + lines.push('| --- | --- | ---: | --- | ---: | ---: | ---: | ---: |'); + for (const [key, v] of entries) { + const [name, dedupStr] = key.split('|'); + const dedup = dedupStr === 'dedup=true' ? 'yes' : 'no'; + const fitIcon = v.fits ? 'yes' : 'no'; + lines.push( + `| ${name} | ${dedup} | ${v.tokenCount} | ${fitIcon} | ${v.recencyWindow ?? '-'} | ${v.compressed} | ${v.preserved} | ${v.deduped} |`, + ); + } + return lines; +} + +function generateLlmSection( + baselinesDir: string, + basic: Record, +): string[] { + const llmResults = loadAllLlmResults(baselinesDir); + if (llmResults.length === 0) return []; - // --- Latest version results --- - lines.push(`## Current Results (v${latest.version})`); + const lines: string[] = []; + lines.push('## LLM vs Deterministic'); lines.push(''); - lines.push(generateSection(latest)); + lines.push( + '> Results are **non-deterministic** — LLM outputs vary between runs. 
' + + 'Saved as reference data, not used for regression testing.', + ); lines.push(''); - // --- Version history --- - if (baselines.length > 1) { - lines.push('## Version History'); + // Summary chart + const chart = llmComparisonChart(basic, llmResults); + if (chart.length > 0) { + lines.push(...chart); lines.push(''); - lines.push('| Version | Date | Avg Char Ratio | Avg Token Ratio | Scenarios |'); - lines.push('| --- | --- | ---: | ---: | ---: |'); - for (const b of [...baselines].reverse()) { - const basicEntries = Object.values(b.results.basic); - const avgChr = basicEntries.reduce((s, v) => s + v.ratio, 0) / basicEntries.length; - const avgTkr = basicEntries.reduce((s, v) => s + v.tokenRatio, 0) / basicEntries.length; - const date = b.generated.split('T')[0]; + } + + // Key finding callout + const wins: string[] = []; + const losses: string[] = []; + for (const llm of llmResults) { + for (const [scenario, sr] of Object.entries(llm.scenarios)) { + for (const mr of Object.values(sr.methods)) { + if (mr.vsDet != null && mr.vsDet > 1.0) wins.push(scenario); + if (mr.vsDet != null && mr.vsDet < 0.9) losses.push(scenario); + } + } + } + const uniqueWins = [...new Set(wins)]; + const uniqueLosses = [...new Set(losses)]; + if (uniqueWins.length > 0 || uniqueLosses.length > 0) { + lines.push('> **Key findings:**'); + if (uniqueWins.length > 0) { + lines.push(`> LLM wins on prose-heavy scenarios: ${uniqueWins.join(', ')}`); + } + if (uniqueLosses.length > 0) { lines.push( - `| ${b.version} | ${date} | ${fix(avgChr)} | ${fix(avgTkr)} | ${basicEntries.length} |`, + `> Deterministic wins on structured/technical content: ${uniqueLosses.join(', ')}`, ); } lines.push(''); } - // --- Per-version detail (older versions) --- - const olderVersions = baselines.slice(0, -1).reverse(); - if (olderVersions.length > 0) { - lines.push('## Previous Versions'); + // Per-provider detail tables + for (const llm of llmResults) { + lines.push(`### ${llm.provider} (${llm.model})`); 
lines.push(''); - for (const b of olderVersions) { - lines.push(`
`); - lines.push(`v${b.version} (${b.generated.split('T')[0]})`); - lines.push(''); - lines.push(generateSection(b)); + lines.push(`*Generated: ${llm.generated.split('T')[0]}*`); + lines.push(''); + lines.push( + '| Scenario | Method | Char Ratio | Token Ratio | vsDet | Compressed | Preserved | Round-trip | Time |', + ); + lines.push('| --- | --- | ---: | ---: | ---: | ---: | ---: | --- | ---: |'); + + for (const [scenario, sr] of Object.entries(llm.scenarios)) { + let first = true; + for (const [method, mr] of Object.entries(sr.methods)) { + const label = first ? scenario : ''; + const vsDet = mr.vsDet != null ? fix(mr.vsDet) : '-'; + lines.push( + `| ${label} | ${method} | ${fix(mr.ratio)} | ${fix(mr.tokenRatio)} | ${vsDet} | ${mr.compressed} | ${mr.preserved} | ${mr.roundTrip} | ${formatTime(mr.timeMs)} |`, + ); + first = false; + } + } + + // Token budget table (if present) + if (llm.tokenBudget && Object.keys(llm.tokenBudget).length > 0) { lines.push(''); - lines.push('
'); + lines.push('#### Token Budget (target: 2000 tokens)'); lines.push(''); + lines.push( + '| Scenario | Method | Tokens | Fits | recencyWindow | Ratio | Round-trip | Time |', + ); + lines.push('| --- | --- | ---: | --- | ---: | ---: | --- | ---: |'); + + for (const [scenario, entries] of Object.entries(llm.tokenBudget)) { + let first = true; + for (const entry of entries) { + const label = first ? scenario : ''; + lines.push( + `| ${label} | ${entry.method} | ${entry.tokenCount} | ${entry.fits} | ${entry.recencyWindow ?? '-'} | ${fix(entry.ratio)} | ${entry.roundTrip} | ${formatTime(entry.timeMs)} |`, + ); + first = false; + } + } } + + lines.push(''); } - // --- Scenarios --- - lines.push('## Scenarios'); - lines.push(''); - lines.push('The benchmark covers 8 conversation types:'); - lines.push(''); - lines.push('| Scenario | Description |'); - lines.push('| --- | --- |'); - lines.push('| Coding assistant | Mixed code fences and prose discussion |'); - lines.push('| Long Q&A | Extended question-and-answer with repeated paragraphs |'); - lines.push('| Tool-heavy | Messages with `tool_calls` arrays (preserved by default) |'); - lines.push('| Short conversation | Brief exchanges, mostly under 120 chars |'); - lines.push('| Deep conversation | 25 turns of multi-paragraph prose |'); - lines.push('| Technical explanation | Pure prose Q&A about event-driven architecture |'); - lines.push('| Structured content | JSON, YAML, SQL, API keys, test output |'); - lines.push( - '| Agentic coding session | Repeated file reads, grep results, near-duplicate edits |', - ); - lines.push(''); + return lines; +} + +// --------------------------------------------------------------------------- +// Main doc generator +// --------------------------------------------------------------------------- - // --- Interpreting results --- - lines.push('## Interpreting Results'); +export function generateBenchmarkDocs(baselinesDir: string, outputPath: string): void { + const baselines = 
loadAllBaselines(baselinesDir); + if (baselines.length === 0) return; + + const latest = baselines[baselines.length - 1]; + const lines: string[] = []; + + // --- Header --- + lines.push('# Benchmark Results'); lines.push(''); - lines.push('### Compression ratio'); + lines.push('[Back to README](../README.md) | [All docs](README.md) | [Handbook](benchmarks.md)'); lines.push(''); - lines.push('| Ratio | Reduction |'); - lines.push('| ---: | --- |'); - lines.push('| 1.0x | no compression (all messages preserved) |'); - lines.push('| 1.5x | 33% reduction |'); - lines.push('| 2.0x | 50% reduction |'); - lines.push('| 3.0x | 67% reduction |'); - lines.push('| 6.0x | 83% reduction |'); + lines.push('*Auto-generated by `npm run bench:save`. Do not edit manually.*'); lines.push(''); - lines.push( - 'Higher is better. Token ratio is more meaningful for LLM context budgeting; character ratio is useful for storage.', - ); + lines.push(`**v${latest.version}** · Generated: ${latest.generated.split('T')[0]}`); lines.push(''); - // --- Regression testing --- - lines.push('## Regression Testing'); + // --- Summary --- + const basicEntries = Object.entries(latest.results.basic); + const ratios = basicEntries.map(([, v]) => v.ratio); + const avgR = ratios.reduce((a, b) => a + b, 0) / ratios.length; + lines.push('## Summary'); lines.push(''); - lines.push( - 'Baselines are stored in [`bench/baselines/`](../bench/baselines/) as JSON. 
CI runs `npm run bench:check` on every push and PR to catch regressions.', - ); + lines.push(`| Metric | Value |`); + lines.push(`| --- | --- |`); + lines.push(`| Scenarios | ${basicEntries.length} |`); + lines.push(`| Average compression | ${fix(avgR)}x |`); + lines.push(`| Best compression | ${fix(Math.max(...ratios))}x |`); + lines.push(`| Round-trip integrity | all PASS |`); lines.push(''); - lines.push('- **Tolerance:** 0% by default (all metrics are deterministic)'); - lines.push('- **On regression:** CI fails with a diff showing which metrics changed'); - lines.push( - '- **After intentional changes:** run `npm run bench:save` to update the baseline and regenerate this doc', - ); - lines.push( - '- **Custom tolerance:** `npx tsx bench/run.ts --check --tolerance 5` allows 5% deviation', - ); + + // --- Compression --- + lines.push(...generateCompressionSection(latest)); lines.push(''); - lines.push('### Baseline files'); + + // --- Dedup --- + lines.push(...generateDedupSection(latest.results)); lines.push(''); - lines.push('| File | Purpose |'); - lines.push('| --- | --- |'); - lines.push('| `bench/baselines/current.json` | Active baseline compared in CI |'); - lines.push('| `bench/baselines/history/v*.json` | Versioned snapshots, one per release |'); - lines.push('| `bench/baselines/llm/*.json` | LLM benchmark reference data (non-deterministic) |'); + + // --- Token budget --- + lines.push(...generateTokenBudgetSection(latest.results)); lines.push(''); - // --- LLM comparison (if result files exist) --- - const llmResults = loadAllLlmResults(baselinesDir); - if (llmResults.length > 0) { - lines.push('## LLM Summarization Comparison'); - lines.push(''); - lines.push( - '> Results are **non-deterministic** — LLM outputs vary between runs. 
These are saved as reference data, not used for regression testing.', - ); - lines.push(''); + // --- LLM (conditional) --- + const llmSection = generateLlmSection(baselinesDir, latest.results.basic); + if (llmSection.length > 0) { + lines.push(...llmSection); + } - for (const llm of llmResults) { - lines.push(`### ${llm.provider} (${llm.model})`); - lines.push(''); - lines.push(`*Generated: ${llm.generated.split('T')[0]}*`); - lines.push(''); + // --- Version history (conditional) --- + if (baselines.length > 1) { + lines.push('## Version History'); + lines.push(''); + lines.push('| Version | Date | Avg Char Ratio | Avg Token Ratio | Scenarios |'); + lines.push('| --- | --- | ---: | ---: | ---: |'); + for (const b of [...baselines].reverse()) { + const entries = Object.values(b.results.basic); + const avgChr = entries.reduce((s, v) => s + v.ratio, 0) / entries.length; + const avgTkr = entries.reduce((s, v) => s + v.tokenRatio, 0) / entries.length; + const date = b.generated.split('T')[0]; lines.push( - '| Scenario | Method | Char Ratio | Token Ratio | vs Det | Compressed | Preserved | Round-trip | Time |', + `| ${b.version} | ${date} | ${fix(avgChr)} | ${fix(avgTkr)} | ${entries.length} |`, ); - lines.push('| --- | --- | ---: | ---: | ---: | ---: | ---: | --- | ---: |'); + } + lines.push(''); - for (const [scenario, sr] of Object.entries(llm.scenarios)) { - let first = true; - for (const [method, mr] of Object.entries(sr.methods)) { - const label = first ? scenario : ''; - const time = - mr.timeMs < 1000 ? `${Math.round(mr.timeMs)}ms` : `${(mr.timeMs / 1000).toFixed(1)}s`; - const vsDet = mr.vsDet != null ? 
fix(mr.vsDet) : '-'; - lines.push( - `| ${label} | ${method} | ${fix(mr.ratio)} | ${fix(mr.tokenRatio)} | ${vsDet} | ${mr.compressed} | ${mr.preserved} | ${mr.roundTrip} | ${time} |`, - ); - first = false; - } - } + // Per-version detail (older versions) + const olderVersions = baselines.slice(0, -1).reverse(); + for (const b of olderVersions) { + const r = b.results; + const oldEntries = Object.entries(r.basic); + const oldRatios = oldEntries.map(([, v]) => v.ratio); + const oldAvg = oldRatios.reduce((a, b) => a + b, 0) / oldRatios.length; - // Token budget table (if present) - if (llm.tokenBudget && Object.keys(llm.tokenBudget).length > 0) { - lines.push(''); - lines.push('#### Token Budget (target: 2000 tokens)'); - lines.push(''); + lines.push(`
`); + lines.push(`v${b.version} (${b.generated.split('T')[0]}) — ${fix(oldAvg)}x avg`); + lines.push(''); + lines.push('| Scenario | Char Ratio | Token Ratio | Compressed | Preserved |'); + lines.push('| --- | ---: | ---: | ---: | ---: |'); + for (const [name, v] of oldEntries) { lines.push( - '| Scenario | Method | Tokens | Fits | recencyWindow | Ratio | Round-trip | Time |', + `| ${name} | ${fix(v.ratio)} | ${fix(v.tokenRatio)} | ${v.compressed} | ${v.preserved} |`, ); - lines.push('| --- | --- | ---: | --- | ---: | ---: | --- | ---: |'); - - for (const [scenario, entries] of Object.entries(llm.tokenBudget)) { - let first = true; - for (const entry of entries) { - const label = first ? scenario : ''; - const time = - entry.timeMs < 1000 - ? `${Math.round(entry.timeMs)}ms` - : `${(entry.timeMs / 1000).toFixed(1)}s`; - lines.push( - `| ${label} | ${entry.method} | ${entry.tokenCount} | ${entry.fits} | ${entry.recencyWindow ?? '-'} | ${fix(entry.ratio)} | ${entry.roundTrip} | ${time} |`, - ); - first = false; - } - } } - + lines.push(''); + lines.push('
'); lines.push(''); } } @@ -633,12 +766,10 @@ export function generateBenchmarkDocs(baselinesDir: string, outputPath: string): // --- Methodology --- lines.push('## Methodology'); lines.push(''); - lines.push('- All results are **deterministic** — same input always produces the same output'); - lines.push('- Metrics tracked: compression ratio, token ratio, message counts, dedup counts'); + lines.push('- All deterministic results use the same input → same output guarantee'); + lines.push('- Metrics: compression ratio, token ratio, message counts, dedup counts'); lines.push('- Timing is excluded from baselines (hardware-dependent)'); - lines.push( - '- Real-session and LLM benchmarks are excluded from baselines (environment-dependent)', - ); + lines.push('- LLM benchmarks are saved as reference data, not used for regression testing'); lines.push('- Round-trip integrity is verified for every scenario (compress then uncompress)'); lines.push(''); diff --git a/bench/baselines/current.json b/bench/baselines/current.json index d127500..731eaca 100644 --- a/bench/baselines/current.json +++ b/bench/baselines/current.json @@ -1,6 +1,6 @@ { "version": "1.0.0", - "generated": "2026-02-25T13:40:26.671Z", + "generated": "2026-02-25T14:48:02.426Z", "results": { "basic": { "Coding assistant": { diff --git a/bench/baselines/history/v1.0.0.json b/bench/baselines/history/v1.0.0.json index d127500..731eaca 100644 --- a/bench/baselines/history/v1.0.0.json +++ b/bench/baselines/history/v1.0.0.json @@ -1,6 +1,6 @@ { "version": "1.0.0", - "generated": "2026-02-25T13:40:26.671Z", + "generated": "2026-02-25T14:48:02.426Z", "results": { "basic": { "Coding assistant": { diff --git a/bench/run.ts b/bench/run.ts index 191c1d8..2fb5460 100644 --- a/bench/run.ts +++ b/bench/run.ts @@ -1150,10 +1150,10 @@ async function run(): Promise { saveBaseline(baselinesDir, version, benchResults); generateBenchmarkDocs( baselinesDir, - resolve(import.meta.dirname, '..', 'docs', 'benchmarks.md'), + 
resolve(import.meta.dirname, '..', 'docs', 'benchmark-results.md'), ); console.log(); - console.log(`Baseline saved (v${version}) and docs/benchmarks.md regenerated.`); + console.log(`Baseline saved (v${version}) and docs/benchmark-results.md regenerated.`); } if (flagCheck) { diff --git a/docs/README.md b/docs/README.md index 658c442..e5f246d 100644 --- a/docs/README.md +++ b/docs/README.md @@ -13,3 +13,4 @@ | [Provenance](provenance.md) | `_cce_original` metadata, summary_id, parent_ids | | [Preservation Rules](preservation-rules.md) | What gets preserved, classification tiers, code-aware splitting | | [Benchmarks](benchmarks.md) | Running benchmarks, LLM comparison, interpreting results | +| [Benchmark Results](benchmark-results.md) | Auto-generated results with charts (regenerated by bench:save) | diff --git a/docs/benchmark-results.md b/docs/benchmark-results.md new file mode 100644 index 0000000..ecdca00 --- /dev/null +++ b/docs/benchmark-results.md @@ -0,0 +1,188 @@ +# Benchmark Results + +[Back to README](../README.md) | [All docs](README.md) | [Handbook](benchmarks.md) + +*Auto-generated by `npm run bench:save`. 
Do not edit manually.* + +**v1.0.0** · Generated: 2026-02-25 + +## Summary + +| Metric | Value | +| --- | --- | +| Scenarios | 8 | +| Average compression | 2.08x | +| Best compression | 6.16x | +| Round-trip integrity | all PASS | + +## Compression by Scenario + +> **8 scenarios** · **2.08x** avg ratio · **1.00x** – **6.16x** range · all round-trips PASS + +```mermaid +xychart-beta + title "Compression Ratio by Scenario" + x-axis ["Coding", "Long Q&A", "Tool-heavy", "Short", "Deep", "Technical", "Structured", "Agentic"] + y-axis "Char Ratio" + bar [1.68, 6.16, 1.30, 1.00, 2.12, 1.00, 1.93, 1.43] +``` + +| Scenario | Char Ratio | Token Ratio | Compressed | Preserved | +| --- | ---: | ---: | ---: | ---: | +| Coding assistant | 1.68 | 1.67 | 5 | 8 | +| Long Q&A | 6.16 | 6.11 | 4 | 6 | +| Tool-heavy | 1.30 | 1.29 | 2 | 16 | +| Short conversation | 1.00 | 1.00 | 0 | 7 | +| Deep conversation | 2.12 | 2.12 | 50 | 1 | +| Technical explanation | 1.00 | 1.00 | 0 | 11 | +| Structured content | 1.93 | 1.92 | 2 | 10 | +| Agentic coding session | 1.43 | 1.43 | 2 | 31 | + +## Deduplication Impact + +```mermaid +xychart-beta + title "Deduplication Impact (recencyWindow=0)" + x-axis ["Long Q&A", "Agentic"] + y-axis "Char Ratio" + bar [5.14, 1.14] + bar [6.16, 1.43] +``` + +*First bar: no dedup · Second bar: with dedup* + +| Scenario | No Dedup (rw=0) | Dedup (rw=0) | No Dedup (rw=4) | Dedup (rw=4) | Deduped | +| --- | ---: | ---: | ---: | ---: | ---: | +| Coding assistant | 1.68 | 1.68 | 1.51 | 1.51 | 0 | +| Long Q&A | 5.14 | 6.16 | 1.90 | 2.03 | 1 | +| Tool-heavy | 1.30 | 1.30 | 1.30 | 1.30 | 0 | +| Short conversation | 1.00 | 1.00 | 1.00 | 1.00 | 0 | +| Deep conversation | 2.12 | 2.12 | 1.95 | 1.95 | 0 | +| Technical explanation | 1.00 | 1.00 | 1.00 | 1.00 | 0 | +| Structured content | 1.93 | 1.93 | 1.37 | 1.37 | 0 | +| Agentic coding session | 1.14 | 1.43 | 1.14 | 1.43 | 4 | + +### Fuzzy Dedup + +| Scenario | Exact Deduped | Fuzzy Deduped | Ratio | +| --- | ---: | ---: | ---: | 
+| Coding assistant | 0 | 0 | 1.68 | +| Long Q&A | 1 | 0 | 6.16 | +| Tool-heavy | 0 | 0 | 1.30 | +| Short conversation | 0 | 0 | 1.00 | +| Deep conversation | 0 | 0 | 2.12 | +| Technical explanation | 0 | 0 | 1.00 | +| Structured content | 0 | 0 | 1.93 | +| Agentic coding session | 4 | 2 | 2.23 | + +## Token Budget + +Target: **2000 tokens** · 1/4 fit + +| Scenario | Dedup | Tokens | Fits | recencyWindow | Compressed | Preserved | Deduped | +| --- | --- | ---: | --- | ---: | ---: | ---: | ---: | +| Deep conversation | no | 3738 | no | 0 | 50 | 1 | 0 | +| Deep conversation | yes | 3738 | no | 0 | 50 | 1 | 0 | +| Agentic coding session | no | 2345 | no | 0 | 4 | 33 | 0 | +| Agentic coding session | yes | 1957 | yes | 9 | 1 | 32 | 4 | + +## LLM vs Deterministic + +> Results are **non-deterministic** — LLM outputs vary between runs. Saved as reference data, not used for regression testing. + +```mermaid +xychart-beta + title "Deterministic vs LLM (ollama/llama3.2)" + x-axis ["Coding", "Long Q&A", "Tool-heavy", "Deep", "Technical", "Structured", "Agentic"] + y-axis "Char Ratio" + bar [1.68, 6.16, 1.30, 2.12, 1.00, 1.93, 1.43] + bar [1.55, 4.49, 1.28, 3.28, 1.00, 1.46, 1.40] +``` + +*First bar: deterministic · Second bar: best LLM method* + +> **Key findings:** +> LLM wins on prose-heavy scenarios: Deep conversation, Technical explanation +> Deterministic wins on structured/technical content: Coding assistant, Long Q&A, Tool-heavy, Structured content + +### ollama (llama3.2) + +*Generated: 2026-02-25* + +| Scenario | Method | Char Ratio | Token Ratio | vsDet | Compressed | Preserved | Round-trip | Time | +| --- | --- | ---: | ---: | ---: | ---: | ---: | --- | ---: | +| Coding assistant | deterministic | 1.68 | 1.67 | - | 5 | 8 | PASS | 0ms | +| | llm-basic | 1.48 | 1.48 | 0.88 | 5 | 8 | PASS | 5.9s | +| | llm-escalate | 1.55 | 1.55 | 0.92 | 5 | 8 | PASS | 3.0s | +| Long Q&A | deterministic | 6.16 | 6.11 | - | 4 | 6 | PASS | 1ms | +| | llm-basic | 4.31 | 4.28 | 0.70 | 4 | 
6 | PASS | 4.1s | +| | llm-escalate | 4.49 | 4.46 | 0.73 | 4 | 6 | PASS | 3.7s | +| Tool-heavy | deterministic | 1.30 | 1.29 | - | 2 | 16 | PASS | 2ms | +| | llm-basic | 1.12 | 1.11 | 0.86 | 2 | 16 | PASS | 2.3s | +| | llm-escalate | 1.28 | 1.28 | 0.99 | 2 | 16 | PASS | 2.8s | +| Deep conversation | deterministic | 2.12 | 2.12 | - | 50 | 1 | PASS | 3ms | +| | llm-basic | 3.12 | 3.11 | 1.47 | 50 | 1 | PASS | 22.7s | +| | llm-escalate | 3.28 | 3.26 | 1.54 | 50 | 1 | PASS | 23.3s | +| Technical explanation | deterministic | 1.00 | 1.00 | - | 0 | 11 | PASS | 1ms | +| | llm-basic | 1.00 | 1.00 | 1.00 | 0 | 11 | PASS | 3.2s | +| | llm-escalate | 1.00 | 1.00 | 1.00 | 2 | 9 | PASS | 785ms | +| Structured content | deterministic | 1.93 | 1.92 | - | 2 | 10 | PASS | 0ms | +| | llm-basic | 1.46 | 1.45 | 0.75 | 2 | 10 | PASS | 3.5s | +| | llm-escalate | 1.38 | 1.38 | 0.71 | 2 | 10 | PASS | 3.7s | +| Agentic coding session | deterministic | 1.43 | 1.43 | - | 2 | 31 | PASS | 1ms | +| | llm-basic | 1.35 | 1.34 | 0.94 | 2 | 31 | PASS | 3.3s | +| | llm-escalate | 1.40 | 1.40 | 0.98 | 2 | 31 | PASS | 5.4s | + +#### Token Budget (target: 2000 tokens) + +| Scenario | Method | Tokens | Fits | recencyWindow | Ratio | Round-trip | Time | +| --- | --- | ---: | --- | ---: | ---: | --- | ---: | +| Deep conversation | deterministic | 3738 | false | 0 | 2.12 | PASS | 12ms | +| | llm-escalate | 2593 | false | 0 | 3.08 | PASS | 132.0s | +| Agentic coding session | deterministic | 1957 | true | 9 | 1.36 | PASS | 2ms | +| | llm-escalate | 2003 | false | 9 | 1.33 | PASS | 4.1s | + +### openai (gpt-4.1-mini) + +*Generated: 2026-02-25* + +| Scenario | Method | Char Ratio | Token Ratio | vsDet | Compressed | Preserved | Round-trip | Time | +| --- | --- | ---: | ---: | ---: | ---: | ---: | --- | ---: | +| Coding assistant | deterministic | 1.68 | 1.67 | - | 5 | 8 | PASS | 0ms | +| | llm-basic | 1.64 | 1.63 | 0.98 | 5 | 8 | PASS | 5.6s | +| | llm-escalate | 1.63 | 1.63 | 0.97 | 5 | 8 | PASS | 6.0s | +| 
Long Q&A | deterministic | 6.16 | 6.11 | - | 4 | 6 | PASS | 1ms | +| | llm-basic | 5.37 | 5.33 | 0.87 | 4 | 6 | PASS | 5.9s | +| | llm-escalate | 5.35 | 5.31 | 0.87 | 4 | 6 | PASS | 7.0s | +| Tool-heavy | deterministic | 1.30 | 1.29 | - | 2 | 16 | PASS | 0ms | +| | llm-basic | 1.11 | 1.10 | 0.85 | 2 | 16 | PASS | 3.5s | +| | llm-escalate | 1.12 | 1.12 | 0.86 | 2 | 16 | PASS | 5.3s | +| Deep conversation | deterministic | 2.12 | 2.12 | - | 50 | 1 | PASS | 3ms | +| | llm-basic | 2.34 | 2.33 | 1.10 | 50 | 1 | PASS | 50.4s | +| | llm-escalate | 2.37 | 2.36 | 1.11 | 50 | 1 | PASS | 50.8s | +| Technical explanation | deterministic | 1.00 | 1.00 | - | 0 | 11 | PASS | 1ms | +| | llm-basic | 1.00 | 1.00 | 1.00 | 1 | 10 | PASS | 2.6s | +| | llm-escalate | 1.00 | 1.00 | 1.00 | 1 | 10 | PASS | 3.3s | +| Structured content | deterministic | 1.93 | 1.92 | - | 2 | 10 | PASS | 0ms | +| | llm-basic | 1.23 | 1.23 | 0.64 | 2 | 10 | PASS | 10.2s | +| | llm-escalate | 1.29 | 1.29 | 0.67 | 2 | 10 | PASS | 4.8s | +| Agentic coding session | deterministic | 1.43 | 1.43 | - | 2 | 31 | PASS | 1ms | +| | llm-basic | 1.43 | 1.43 | 1.00 | 2 | 31 | PASS | 5.8s | +| | llm-escalate | 1.32 | 1.32 | 0.93 | 1 | 32 | PASS | 9.5s | + +#### Token Budget (target: 2000 tokens) + +| Scenario | Method | Tokens | Fits | recencyWindow | Ratio | Round-trip | Time | +| --- | --- | ---: | --- | ---: | ---: | --- | ---: | +| Deep conversation | deterministic | 3738 | false | 0 | 2.12 | PASS | 10ms | +| | llm-escalate | 3391 | false | 0 | 2.35 | PASS | 280.5s | +| Agentic coding session | deterministic | 1957 | true | 9 | 1.36 | PASS | 2ms | +| | llm-escalate | 1915 | true | 3 | 1.39 | PASS | 28.1s | + +## Methodology + +- All deterministic results use the same input → same output guarantee +- Metrics: compression ratio, token ratio, message counts, dedup counts +- Timing is excluded from baselines (hardware-dependent) +- LLM benchmarks are saved as reference data, not used for regression testing +- Round-trip 
integrity is verified for every scenario (compress then uncompress) diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 9888243..eca3acb 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -1,16 +1,14 @@ -# Benchmark Results +# Benchmarks -[Back to README](../README.md) | [All docs](README.md) - - - +[Back to README](../README.md) | [All docs](README.md) | [Latest Results](benchmark-results.md) ## Running Benchmarks ```bash npm run bench # Run benchmarks (no baseline check) npm run bench:check # Run and compare against baseline -npm run bench:save # Run, save new baseline, regenerate this doc +npm run bench:save # Run, save new baseline, regenerate results doc +npm run bench:llm # Run with LLM summarization benchmarks ``` ### LLM benchmarks (opt-in) @@ -23,58 +21,6 @@ LLM benchmarks require the `--llm` flag (`npm run bench:llm`). Set API keys in a | `ANTHROPIC_API_KEY` | Anthropic | `claude-haiku-4-5-20251001` | | | *(none required)* | Ollama | `llama3.2` | Auto-detected on localhost:11434 | -## Current Results (v1.0.0) - -### Basic Compression - -**Range:** 1.00x – 6.16x · **Average:** 2.08x · **Round-trip:** all PASS - -| Scenario | Char Ratio | Token Ratio | Compressed | Preserved | -| --- | ---: | ---: | ---: | ---: | -| Coding assistant | 1.68 | 1.67 | 5 | 8 | -| Long Q&A | 6.16 | 6.11 | 4 | 6 | -| Tool-heavy | 1.30 | 1.29 | 2 | 16 | -| Short conversation | 1.00 | 1.00 | 0 | 7 | -| Deep conversation | 2.12 | 2.12 | 50 | 1 | -| Technical explanation | 1.00 | 1.00 | 0 | 11 | -| Structured content | 1.93 | 1.92 | 2 | 10 | -| Agentic coding session | 1.43 | 1.43 | 2 | 31 | - -### Token Budget (target: 2000 tokens) - -| Scenario | Dedup | Tokens | Fits | recencyWindow | Compressed | Preserved | Deduped | -| --- | --- | ---: | --- | ---: | ---: | ---: | ---: | -| Deep conversation | no | 3738 | false | 0 | 50 | 1 | 0 | -| Deep conversation | yes | 3738 | false | 0 | 50 | 1 | 0 | -| Agentic coding session | no | 2345 | false | 0 | 4 | 33 | 0 | -| 
Agentic coding session | yes | 1957 | true | 9 | 1 | 32 | 4 | - -### Dedup Effectiveness - -| Scenario | No Dedup (rw=0) | Dedup (rw=0) | No Dedup (rw=4) | Dedup (rw=4) | Deduped | -| --- | ---: | ---: | ---: | ---: | ---: | -| Coding assistant | 1.68 | 1.68 | 1.51 | 1.51 | 0 | -| Long Q&A | 5.14 | 6.16 | 1.90 | 2.03 | 1 | -| Tool-heavy | 1.30 | 1.30 | 1.30 | 1.30 | 0 | -| Short conversation | 1.00 | 1.00 | 1.00 | 1.00 | 0 | -| Deep conversation | 2.12 | 2.12 | 1.95 | 1.95 | 0 | -| Technical explanation | 1.00 | 1.00 | 1.00 | 1.00 | 0 | -| Structured content | 1.93 | 1.93 | 1.37 | 1.37 | 0 | -| Agentic coding session | 1.14 | 1.43 | 1.14 | 1.43 | 4 | - -### Fuzzy Dedup - -| Scenario | Exact Deduped | Fuzzy Deduped | Ratio | -| --- | ---: | ---: | ---: | -| Coding assistant | 0 | 0 | 1.68 | -| Long Q&A | 1 | 0 | 6.16 | -| Tool-heavy | 0 | 0 | 1.30 | -| Short conversation | 0 | 0 | 1.00 | -| Deep conversation | 0 | 0 | 2.12 | -| Technical explanation | 0 | 0 | 1.00 | -| Structured content | 0 | 0 | 1.93 | -| Agentic coding session | 4 | 2 | 2.23 | - ## Scenarios The benchmark covers 8 conversation types: @@ -104,13 +50,30 @@ The benchmark covers 8 conversation types: Higher is better. Token ratio is more meaningful for LLM context budgeting; character ratio is useful for storage. +### Deduplication + +Dedup effectiveness is measured across two axes: + +- **recencyWindow=0** vs **recencyWindow=4** — how much compression improves when recent messages are protected +- **With dedup** vs **without** — the marginal gain from exact + fuzzy duplicate detection + +Scenarios with repeated content (Long Q&A, Agentic coding session) show the largest dedup gains. Scenarios with unique messages show no difference. 
+ +### LLM vs deterministic + +The `vsDet` column shows LLM compression relative to deterministic: + +- **vsDet > 1.0** — LLM achieves better compression (common for long prose) +- **vsDet < 1.0** — deterministic wins (common for structured/technical content) +- **vsDet = 1.0** — no difference (content is already optimal or fully preserved) + ## Regression Testing Baselines are stored in [`bench/baselines/`](../bench/baselines/) as JSON. CI runs `npm run bench:check` on every push and PR to catch regressions. - **Tolerance:** 0% by default (all metrics are deterministic) - **On regression:** CI fails with a diff showing which metrics changed -- **After intentional changes:** run `npm run bench:save` to update the baseline and regenerate this doc +- **After intentional changes:** run `npm run bench:save` to update the baseline and regenerate the results doc - **Custom tolerance:** `npx tsx bench/run.ts --check --tolerance 5` allows 5% deviation ### Baseline files @@ -120,89 +83,3 @@ Baselines are stored in [`bench/baselines/`](../bench/baselines/) as JSON. CI ru | `bench/baselines/current.json` | Active baseline compared in CI | | `bench/baselines/history/v*.json` | Versioned snapshots, one per release | | `bench/baselines/llm/*.json` | LLM benchmark reference data (non-deterministic) | - -## LLM Summarization Comparison - -> Results are **non-deterministic** — LLM outputs vary between runs. These are saved as reference data, not used for regression testing. 
- -### ollama (llama3.2) - -*Generated: 2026-02-25* - -| Scenario | Method | Char Ratio | Token Ratio | vs Det | Compressed | Preserved | Round-trip | Time | -| --- | --- | ---: | ---: | ---: | ---: | ---: | --- | ---: | -| Coding assistant | deterministic | 1.68 | 1.67 | - | 5 | 8 | PASS | 0ms | -| | llm-basic | 1.48 | 1.48 | 0.88 | 5 | 8 | PASS | 5.9s | -| | llm-escalate | 1.55 | 1.55 | 0.92 | 5 | 8 | PASS | 3.0s | -| Long Q&A | deterministic | 6.16 | 6.11 | - | 4 | 6 | PASS | 1ms | -| | llm-basic | 4.31 | 4.28 | 0.70 | 4 | 6 | PASS | 4.1s | -| | llm-escalate | 4.49 | 4.46 | 0.73 | 4 | 6 | PASS | 3.7s | -| Tool-heavy | deterministic | 1.30 | 1.29 | - | 2 | 16 | PASS | 2ms | -| | llm-basic | 1.12 | 1.11 | 0.86 | 2 | 16 | PASS | 2.3s | -| | llm-escalate | 1.28 | 1.28 | 0.99 | 2 | 16 | PASS | 2.8s | -| Deep conversation | deterministic | 2.12 | 2.12 | - | 50 | 1 | PASS | 3ms | -| | llm-basic | 3.12 | 3.11 | 1.47 | 50 | 1 | PASS | 22.7s | -| | llm-escalate | 3.28 | 3.26 | 1.54 | 50 | 1 | PASS | 23.3s | -| Technical explanation | deterministic | 1.00 | 1.00 | - | 0 | 11 | PASS | 1ms | -| | llm-basic | 1.00 | 1.00 | 1.00 | 0 | 11 | PASS | 3.2s | -| | llm-escalate | 1.00 | 1.00 | 1.00 | 2 | 9 | PASS | 785ms | -| Structured content | deterministic | 1.93 | 1.92 | - | 2 | 10 | PASS | 0ms | -| | llm-basic | 1.46 | 1.45 | 0.75 | 2 | 10 | PASS | 3.5s | -| | llm-escalate | 1.38 | 1.38 | 0.71 | 2 | 10 | PASS | 3.7s | -| Agentic coding session | deterministic | 1.43 | 1.43 | - | 2 | 31 | PASS | 1ms | -| | llm-basic | 1.35 | 1.34 | 0.94 | 2 | 31 | PASS | 3.3s | -| | llm-escalate | 1.40 | 1.40 | 0.98 | 2 | 31 | PASS | 5.4s | - -#### Token Budget (target: 2000 tokens) - -| Scenario | Method | Tokens | Fits | recencyWindow | Ratio | Round-trip | Time | -| --- | --- | ---: | --- | ---: | ---: | --- | ---: | -| Deep conversation | deterministic | 3738 | false | 0 | 2.12 | PASS | 12ms | -| | llm-escalate | 2593 | false | 0 | 3.08 | PASS | 132.0s | -| Agentic coding session | 
deterministic | 1957 | true | 9 | 1.36 | PASS | 2ms | -| | llm-escalate | 2003 | false | 9 | 1.33 | PASS | 4.1s | - -### openai (gpt-4.1-mini) - -*Generated: 2026-02-25* - -| Scenario | Method | Char Ratio | Token Ratio | vs Det | Compressed | Preserved | Round-trip | Time | -| --- | --- | ---: | ---: | ---: | ---: | ---: | --- | ---: | -| Coding assistant | deterministic | 1.68 | 1.67 | - | 5 | 8 | PASS | 0ms | -| | llm-basic | 1.64 | 1.63 | 0.98 | 5 | 8 | PASS | 5.6s | -| | llm-escalate | 1.63 | 1.63 | 0.97 | 5 | 8 | PASS | 6.0s | -| Long Q&A | deterministic | 6.16 | 6.11 | - | 4 | 6 | PASS | 1ms | -| | llm-basic | 5.37 | 5.33 | 0.87 | 4 | 6 | PASS | 5.9s | -| | llm-escalate | 5.35 | 5.31 | 0.87 | 4 | 6 | PASS | 7.0s | -| Tool-heavy | deterministic | 1.30 | 1.29 | - | 2 | 16 | PASS | 0ms | -| | llm-basic | 1.11 | 1.10 | 0.85 | 2 | 16 | PASS | 3.5s | -| | llm-escalate | 1.12 | 1.12 | 0.86 | 2 | 16 | PASS | 5.3s | -| Deep conversation | deterministic | 2.12 | 2.12 | - | 50 | 1 | PASS | 3ms | -| | llm-basic | 2.34 | 2.33 | 1.10 | 50 | 1 | PASS | 50.4s | -| | llm-escalate | 2.37 | 2.36 | 1.11 | 50 | 1 | PASS | 50.8s | -| Technical explanation | deterministic | 1.00 | 1.00 | - | 0 | 11 | PASS | 1ms | -| | llm-basic | 1.00 | 1.00 | 1.00 | 1 | 10 | PASS | 2.6s | -| | llm-escalate | 1.00 | 1.00 | 1.00 | 1 | 10 | PASS | 3.3s | -| Structured content | deterministic | 1.93 | 1.92 | - | 2 | 10 | PASS | 0ms | -| | llm-basic | 1.23 | 1.23 | 0.64 | 2 | 10 | PASS | 10.2s | -| | llm-escalate | 1.29 | 1.29 | 0.67 | 2 | 10 | PASS | 4.8s | -| Agentic coding session | deterministic | 1.43 | 1.43 | - | 2 | 31 | PASS | 1ms | -| | llm-basic | 1.43 | 1.43 | 1.00 | 2 | 31 | PASS | 5.8s | -| | llm-escalate | 1.32 | 1.32 | 0.93 | 1 | 32 | PASS | 9.5s | - -#### Token Budget (target: 2000 tokens) - -| Scenario | Method | Tokens | Fits | recencyWindow | Ratio | Round-trip | Time | -| --- | --- | ---: | --- | ---: | ---: | --- | ---: | -| Deep conversation | deterministic | 3738 | false | 0 | 
2.12 | PASS | 10ms | -| | llm-escalate | 3391 | false | 0 | 2.35 | PASS | 280.5s | -| Agentic coding session | deterministic | 1957 | true | 9 | 1.36 | PASS | 2ms | -| | llm-escalate | 1915 | true | 3 | 1.39 | PASS | 28.1s | - -## Methodology - -- All results are **deterministic** — same input always produces the same output -- Metrics tracked: compression ratio, token ratio, message counts, dedup counts -- Timing is excluded from baselines (hardware-dependent) -- Real-session and LLM benchmarks are excluded from baselines (environment-dependent) -- Round-trip integrity is verified for every scenario (compress then uncompress) From 6b81f158f9f513d0a4538b8a470f277fd8e6ec4e Mon Sep 17 00:00:00 2001 From: Lisa Date: Wed, 25 Feb 2026 16:02:31 +0100 Subject: [PATCH 05/13] feat(bench): enrich benchmark report with badges, progress bars, and pie chart Add shields.io badges, unicode progress bars, reduction % and message count columns to the compression table, a Mermaid pie chart for message outcomes, and collapsible details sections for LLM provider tables. --- bench/baseline.ts | 59 +++++++++++++++++++++++++++-- bench/baselines/current.json | 2 +- bench/baselines/history/v1.0.0.json | 2 +- docs/benchmark-results.md | 38 ++++++++++++++----- 4 files changed, 85 insertions(+), 16 deletions(-) diff --git a/bench/baseline.ts b/bench/baseline.ts index 7f14994..4f6feb9 100644 --- a/bench/baseline.ts +++ b/bench/baseline.ts @@ -379,6 +379,35 @@ function formatTime(ms: number): string { return ms < 1000 ? 
`${Math.round(ms)}ms` : `${(ms / 1000).toFixed(1)}s`; } +// --------------------------------------------------------------------------- +// Visual helpers +// --------------------------------------------------------------------------- + +function badges(basic: Record): string[] { + const entries = Object.values(basic); + const ratios = entries.map((v) => v.ratio); + const avgR = (ratios.reduce((a, b) => a + b, 0) / ratios.length).toFixed(2); + const bestR = Math.max(...ratios).toFixed(2); + const allPass = 'all_PASS'; + + const badge = (label: string, value: string, color: string) => + `![${label}](https://img.shields.io/badge/${encodeURIComponent(label).replace(/-/g, '--')}-${encodeURIComponent(value).replace(/-/g, '--')}-${color})`; + + return [ + [ + badge('avg ratio', `${avgR}x`, 'blue'), + badge('best', `${bestR}x`, 'blue'), + badge('scenarios', `${entries.length}`, 'blue'), + badge('round-trip', allPass, 'brightgreen'), + ].join(' '), + ]; +} + +function progressBar(value: number, max: number, width: number = 10): string { + const filled = Math.round((value / max) * width); + return '█'.repeat(filled) + '░'.repeat(width - filled); +} + // --------------------------------------------------------------------------- // Mermaid chart helpers // --------------------------------------------------------------------------- @@ -498,11 +527,16 @@ function generateCompressionSection(b: Baseline): string[] { lines.push(''); lines.push(...compressionChart(r.basic)); lines.push(''); - lines.push('| Scenario | Char Ratio | Token Ratio | Compressed | Preserved |'); - lines.push('| --- | ---: | ---: | ---: | ---: |'); + lines.push( + '| Scenario | | Ratio | Reduction | Token Ratio | Messages | Compressed | Preserved |', + ); + lines.push('| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: |'); for (const [name, v] of basicEntries) { + const bar = progressBar(v.ratio, maxR); + const reduction = Math.round((1 - 1 / v.ratio) * 100); + const messages = v.compressed + 
v.preserved; lines.push( - `| ${name} | ${fix(v.ratio)} | ${fix(v.tokenRatio)} | ${v.compressed} | ${v.preserved} |`, + `| ${name} | ${bar} | ${fix(v.ratio)} | ${reduction}% | ${fix(v.tokenRatio)} | ${messages} | ${v.compressed} | ${v.preserved} |`, ); } return lines; @@ -618,12 +652,15 @@ function generateLlmSection( lines.push(''); } - // Per-provider detail tables + // Per-provider detail tables (collapsible) for (const llm of llmResults) { lines.push(`### ${llm.provider} (${llm.model})`); lines.push(''); lines.push(`*Generated: ${llm.generated.split('T')[0]}*`); lines.push(''); + lines.push('
'); + lines.push(`Scenario details`); + lines.push(''); lines.push( '| Scenario | Method | Char Ratio | Token Ratio | vsDet | Compressed | Preserved | Round-trip | Time |', ); @@ -664,6 +701,8 @@ function generateLlmSection( } lines.push(''); + lines.push('
'); + lines.push(''); } return lines; @@ -689,6 +728,8 @@ export function generateBenchmarkDocs(baselinesDir: string, outputPath: string): lines.push(''); lines.push(`**v${latest.version}** · Generated: ${latest.generated.split('T')[0]}`); lines.push(''); + lines.push(...badges(latest.results.basic)); + lines.push(''); // --- Summary --- const basicEntries = Object.entries(latest.results.basic); @@ -704,6 +745,16 @@ export function generateBenchmarkDocs(baselinesDir: string, outputPath: string): lines.push(`| Round-trip integrity | all PASS |`); lines.push(''); + // --- Pie chart: message outcome distribution --- + const totalPreserved = basicEntries.reduce((s, [, v]) => s + v.preserved, 0); + const totalCompressed = basicEntries.reduce((s, [, v]) => s + v.compressed, 0); + lines.push('```mermaid'); + lines.push('pie title "Message Outcomes"'); + lines.push(` "Preserved" : ${totalPreserved}`); + lines.push(` "Compressed" : ${totalCompressed}`); + lines.push('```'); + lines.push(''); + // --- Compression --- lines.push(...generateCompressionSection(latest)); lines.push(''); diff --git a/bench/baselines/current.json b/bench/baselines/current.json index 731eaca..f3992d7 100644 --- a/bench/baselines/current.json +++ b/bench/baselines/current.json @@ -1,6 +1,6 @@ { "version": "1.0.0", - "generated": "2026-02-25T14:48:02.426Z", + "generated": "2026-02-25T15:01:16.400Z", "results": { "basic": { "Coding assistant": { diff --git a/bench/baselines/history/v1.0.0.json b/bench/baselines/history/v1.0.0.json index 731eaca..f3992d7 100644 --- a/bench/baselines/history/v1.0.0.json +++ b/bench/baselines/history/v1.0.0.json @@ -1,6 +1,6 @@ { "version": "1.0.0", - "generated": "2026-02-25T14:48:02.426Z", + "generated": "2026-02-25T15:01:16.400Z", "results": { "basic": { "Coding assistant": { diff --git a/docs/benchmark-results.md b/docs/benchmark-results.md index ecdca00..db6ef69 100644 --- a/docs/benchmark-results.md +++ b/docs/benchmark-results.md @@ -6,6 +6,8 @@ **v1.0.0** · 
Generated: 2026-02-25 +![avg ratio](https://img.shields.io/badge/avg%20ratio-2.08x-blue) ![best](https://img.shields.io/badge/best-6.16x-blue) ![scenarios](https://img.shields.io/badge/scenarios-8-blue) ![round-trip](https://img.shields.io/badge/round--trip-all_PASS-brightgreen) + ## Summary | Metric | Value | @@ -15,6 +17,12 @@ | Best compression | 6.16x | | Round-trip integrity | all PASS | +```mermaid +pie title "Message Outcomes" + "Preserved" : 90 + "Compressed" : 65 +``` + ## Compression by Scenario > **8 scenarios** · **2.08x** avg ratio · **1.00x** – **6.16x** range · all round-trips PASS @@ -27,16 +35,16 @@ xychart-beta bar [1.68, 6.16, 1.30, 1.00, 2.12, 1.00, 1.93, 1.43] ``` -| Scenario | Char Ratio | Token Ratio | Compressed | Preserved | -| --- | ---: | ---: | ---: | ---: | -| Coding assistant | 1.68 | 1.67 | 5 | 8 | -| Long Q&A | 6.16 | 6.11 | 4 | 6 | -| Tool-heavy | 1.30 | 1.29 | 2 | 16 | -| Short conversation | 1.00 | 1.00 | 0 | 7 | -| Deep conversation | 2.12 | 2.12 | 50 | 1 | -| Technical explanation | 1.00 | 1.00 | 0 | 11 | -| Structured content | 1.93 | 1.92 | 2 | 10 | -| Agentic coding session | 1.43 | 1.43 | 2 | 31 | +| Scenario | | Ratio | Reduction | Token Ratio | Messages | Compressed | Preserved | +| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | +| Coding assistant | ███░░░░░░░ | 1.68 | 41% | 1.67 | 13 | 5 | 8 | +| Long Q&A | ██████████ | 6.16 | 84% | 6.11 | 10 | 4 | 6 | +| Tool-heavy | ██░░░░░░░░ | 1.30 | 23% | 1.29 | 18 | 2 | 16 | +| Short conversation | ██░░░░░░░░ | 1.00 | 0% | 1.00 | 7 | 0 | 7 | +| Deep conversation | ███░░░░░░░ | 2.12 | 53% | 2.12 | 51 | 50 | 1 | +| Technical explanation | ██░░░░░░░░ | 1.00 | 0% | 1.00 | 11 | 0 | 11 | +| Structured content | ███░░░░░░░ | 1.93 | 48% | 1.92 | 12 | 2 | 10 | +| Agentic coding session | ██░░░░░░░░ | 1.43 | 30% | 1.43 | 33 | 2 | 31 | ## Deduplication Impact @@ -109,6 +117,9 @@ xychart-beta *Generated: 2026-02-25* +
+Scenario details + | Scenario | Method | Char Ratio | Token Ratio | vsDet | Compressed | Preserved | Round-trip | Time | | --- | --- | ---: | ---: | ---: | ---: | ---: | --- | ---: | | Coding assistant | deterministic | 1.68 | 1.67 | - | 5 | 8 | PASS | 0ms | @@ -142,10 +153,15 @@ xychart-beta | Agentic coding session | deterministic | 1957 | true | 9 | 1.36 | PASS | 2ms | | | llm-escalate | 2003 | false | 9 | 1.33 | PASS | 4.1s | +
+ ### openai (gpt-4.1-mini) *Generated: 2026-02-25* +
+Scenario details + | Scenario | Method | Char Ratio | Token Ratio | vsDet | Compressed | Preserved | Round-trip | Time | | --- | --- | ---: | ---: | ---: | ---: | ---: | --- | ---: | | Coding assistant | deterministic | 1.68 | 1.67 | - | 5 | 8 | PASS | 0ms | @@ -179,6 +195,8 @@ xychart-beta | Agentic coding session | deterministic | 1957 | true | 9 | 1.36 | PASS | 2ms | | | llm-escalate | 1915 | true | 3 | 1.39 | PASS | 28.1s | +
+ ## Methodology - All deterministic results use the same input → same output guarantee From 9759e1b0f74e2d14558c891472c7e2c540cd9b0a Mon Sep 17 00:00:00 2001 From: Lisa Date: Wed, 25 Feb 2026 16:56:51 +0100 Subject: [PATCH 06/13] fix(bench): remove ugly unicode bars, fix stacked LLM comparison chart MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop progress bar column from compression table — unicode blocks render with variable width in GitHub's proportional-font tables. Switch LLM comparison chart from double bar (stacked) to bar+line so both series are visible side by side. --- bench/baseline.ts | 18 ++++++------------ bench/baselines/current.json | 2 +- bench/baselines/history/v1.0.0.json | 2 +- docs/benchmark-results.md | 26 +++++++++++++------------- 4 files changed, 21 insertions(+), 27 deletions(-) diff --git a/bench/baseline.ts b/bench/baseline.ts index 4f6feb9..8f192ec 100644 --- a/bench/baseline.ts +++ b/bench/baseline.ts @@ -403,11 +403,6 @@ function badges(basic: Record): string[] { ]; } -function progressBar(value: number, max: number, width: number = 10): string { - const filled = Math.round((value / max) * width); - return '█'.repeat(filled) + '░'.repeat(width - filled); -} - // --------------------------------------------------------------------------- // Mermaid chart helpers // --------------------------------------------------------------------------- @@ -497,11 +492,11 @@ function llmComparisonChart( ` title "Deterministic vs LLM (${bestLlm.provider}/${bestLlm.model})"`, ` x-axis [${labels}]`, ' y-axis "Char Ratio"', - ` bar [${detValues}]`, - ` bar [${llmValues}]`, + ` bar "Deterministic" [${detValues}]`, + ` line "Best LLM" [${llmValues}]`, '```', '', - '*First bar: deterministic · Second bar: best LLM method*', + '*Bars: deterministic · Line: best LLM method*', ]; } @@ -528,15 +523,14 @@ function generateCompressionSection(b: Baseline): string[] { lines.push(...compressionChart(r.basic)); 
lines.push(''); lines.push( - '| Scenario | | Ratio | Reduction | Token Ratio | Messages | Compressed | Preserved |', + '| Scenario | Ratio | Reduction | Token Ratio | Messages | Compressed | Preserved |', ); - lines.push('| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: |'); + lines.push('| --- | ---: | ---: | ---: | ---: | ---: | ---: |'); for (const [name, v] of basicEntries) { - const bar = progressBar(v.ratio, maxR); const reduction = Math.round((1 - 1 / v.ratio) * 100); const messages = v.compressed + v.preserved; lines.push( - `| ${name} | ${bar} | ${fix(v.ratio)} | ${reduction}% | ${fix(v.tokenRatio)} | ${messages} | ${v.compressed} | ${v.preserved} |`, + `| ${name} | ${fix(v.ratio)} | ${reduction}% | ${fix(v.tokenRatio)} | ${messages} | ${v.compressed} | ${v.preserved} |`, ); } return lines; diff --git a/bench/baselines/current.json b/bench/baselines/current.json index f3992d7..2bffe69 100644 --- a/bench/baselines/current.json +++ b/bench/baselines/current.json @@ -1,6 +1,6 @@ { "version": "1.0.0", - "generated": "2026-02-25T15:01:16.400Z", + "generated": "2026-02-25T15:56:33.089Z", "results": { "basic": { "Coding assistant": { diff --git a/bench/baselines/history/v1.0.0.json b/bench/baselines/history/v1.0.0.json index f3992d7..2bffe69 100644 --- a/bench/baselines/history/v1.0.0.json +++ b/bench/baselines/history/v1.0.0.json @@ -1,6 +1,6 @@ { "version": "1.0.0", - "generated": "2026-02-25T15:01:16.400Z", + "generated": "2026-02-25T15:56:33.089Z", "results": { "basic": { "Coding assistant": { diff --git a/docs/benchmark-results.md b/docs/benchmark-results.md index db6ef69..8266e69 100644 --- a/docs/benchmark-results.md +++ b/docs/benchmark-results.md @@ -35,16 +35,16 @@ xychart-beta bar [1.68, 6.16, 1.30, 1.00, 2.12, 1.00, 1.93, 1.43] ``` -| Scenario | | Ratio | Reduction | Token Ratio | Messages | Compressed | Preserved | -| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | -| Coding assistant | ███░░░░░░░ | 1.68 | 41% | 1.67 | 13 | 5 | 8 | -| 
Long Q&A | ██████████ | 6.16 | 84% | 6.11 | 10 | 4 | 6 | -| Tool-heavy | ██░░░░░░░░ | 1.30 | 23% | 1.29 | 18 | 2 | 16 | -| Short conversation | ██░░░░░░░░ | 1.00 | 0% | 1.00 | 7 | 0 | 7 | -| Deep conversation | ███░░░░░░░ | 2.12 | 53% | 2.12 | 51 | 50 | 1 | -| Technical explanation | ██░░░░░░░░ | 1.00 | 0% | 1.00 | 11 | 0 | 11 | -| Structured content | ███░░░░░░░ | 1.93 | 48% | 1.92 | 12 | 2 | 10 | -| Agentic coding session | ██░░░░░░░░ | 1.43 | 30% | 1.43 | 33 | 2 | 31 | +| Scenario | Ratio | Reduction | Token Ratio | Messages | Compressed | Preserved | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | +| Coding assistant | 1.68 | 41% | 1.67 | 13 | 5 | 8 | +| Long Q&A | 6.16 | 84% | 6.11 | 10 | 4 | 6 | +| Tool-heavy | 1.30 | 23% | 1.29 | 18 | 2 | 16 | +| Short conversation | 1.00 | 0% | 1.00 | 7 | 0 | 7 | +| Deep conversation | 2.12 | 53% | 2.12 | 51 | 50 | 1 | +| Technical explanation | 1.00 | 0% | 1.00 | 11 | 0 | 11 | +| Structured content | 1.93 | 48% | 1.92 | 12 | 2 | 10 | +| Agentic coding session | 1.43 | 30% | 1.43 | 33 | 2 | 31 | ## Deduplication Impact @@ -103,11 +103,11 @@ xychart-beta title "Deterministic vs LLM (ollama/llama3.2)" x-axis ["Coding", "Long Q&A", "Tool-heavy", "Deep", "Technical", "Structured", "Agentic"] y-axis "Char Ratio" - bar [1.68, 6.16, 1.30, 2.12, 1.00, 1.93, 1.43] - bar [1.55, 4.49, 1.28, 3.28, 1.00, 1.46, 1.40] + bar "Deterministic" [1.68, 6.16, 1.30, 2.12, 1.00, 1.93, 1.43] + line "Best LLM" [1.55, 4.49, 1.28, 3.28, 1.00, 1.46, 1.40] ``` -*First bar: deterministic · Second bar: best LLM method* +*Bars: deterministic · Line: best LLM method* > **Key findings:** > LLM wins on prose-heavy scenarios: Deep conversation, Technical explanation From 4b92c4198f95a9d5aef7d270bff2d44936be8d12 Mon Sep 17 00:00:00 2001 From: Lisa Date: Wed, 25 Feb 2026 17:10:23 +0100 Subject: [PATCH 07/13] fix(bench): use paired bars for LLM comparison chart Interleave "Scenario (Det)" and "Scenario (LLM)" labels on the x-axis so each scenario gets two 
side-by-side bars in a single series, avoiding Mermaid's stacked-bar behavior. --- bench/baseline.ts | 33 +++++++++++++---------------- bench/baselines/current.json | 2 +- bench/baselines/history/v1.0.0.json | 2 +- docs/benchmark-results.md | 7 ++---- 4 files changed, 19 insertions(+), 25 deletions(-) diff --git a/bench/baseline.ts b/bench/baseline.ts index 8f192ec..7082469 100644 --- a/bench/baseline.ts +++ b/bench/baseline.ts @@ -472,31 +472,28 @@ function llmComparisonChart( const sharedScenarios = Object.keys(basic).filter((s) => s in bestLlm!.scenarios); if (sharedScenarios.length === 0) return []; - const labels = sharedScenarios.map((n) => `"${shortName(n)}"`).join(', '); - const detValues = sharedScenarios.map((s) => fix(basic[s].ratio)).join(', '); - - // Pick the best LLM method per scenario (highest ratio) - const llmValues = sharedScenarios - .map((s) => { - const methods = Object.values(bestLlm!.scenarios[s].methods).filter( - (m) => m.vsDet != null, - ); - if (methods.length === 0) return fix(basic[s].ratio); - return fix(Math.max(...methods.map((m) => m.ratio))); - }) - .join(', '); + // Interleave labels and values: "Coding (Det)", "Coding (LLM)", ... + const labels: string[] = []; + const values: number[] = []; + for (const s of sharedScenarios) { + const sn = shortName(s); + labels.push(`"${sn} (Det)"`, `"${sn} (LLM)"`); + const detR = basic[s].ratio; + const methods = Object.values(bestLlm!.scenarios[s].methods).filter( + (m) => m.vsDet != null, + ); + const llmR = methods.length > 0 ? 
Math.max(...methods.map((m) => m.ratio)) : detR; + values.push(detR, llmR); + } return [ '```mermaid', 'xychart-beta', ` title "Deterministic vs LLM (${bestLlm.provider}/${bestLlm.model})"`, - ` x-axis [${labels}]`, + ` x-axis [${labels.join(', ')}]`, ' y-axis "Char Ratio"', - ` bar "Deterministic" [${detValues}]`, - ` line "Best LLM" [${llmValues}]`, + ` bar [${values.map((v) => fix(v)).join(', ')}]`, '```', - '', - '*Bars: deterministic · Line: best LLM method*', ]; } diff --git a/bench/baselines/current.json b/bench/baselines/current.json index 2bffe69..fc5fdac 100644 --- a/bench/baselines/current.json +++ b/bench/baselines/current.json @@ -1,6 +1,6 @@ { "version": "1.0.0", - "generated": "2026-02-25T15:56:33.089Z", + "generated": "2026-02-25T16:09:56.537Z", "results": { "basic": { "Coding assistant": { diff --git a/bench/baselines/history/v1.0.0.json b/bench/baselines/history/v1.0.0.json index 2bffe69..fc5fdac 100644 --- a/bench/baselines/history/v1.0.0.json +++ b/bench/baselines/history/v1.0.0.json @@ -1,6 +1,6 @@ { "version": "1.0.0", - "generated": "2026-02-25T15:56:33.089Z", + "generated": "2026-02-25T16:09:56.537Z", "results": { "basic": { "Coding assistant": { diff --git a/docs/benchmark-results.md b/docs/benchmark-results.md index 8266e69..04d8ac2 100644 --- a/docs/benchmark-results.md +++ b/docs/benchmark-results.md @@ -101,14 +101,11 @@ Target: **2000 tokens** · 1/4 fit ```mermaid xychart-beta title "Deterministic vs LLM (ollama/llama3.2)" - x-axis ["Coding", "Long Q&A", "Tool-heavy", "Deep", "Technical", "Structured", "Agentic"] + x-axis ["Coding (Det)", "Coding (LLM)", "Long Q&A (Det)", "Long Q&A (LLM)", "Tool-heavy (Det)", "Tool-heavy (LLM)", "Deep (Det)", "Deep (LLM)", "Technical (Det)", "Technical (LLM)", "Structured (Det)", "Structured (LLM)", "Agentic (Det)", "Agentic (LLM)"] y-axis "Char Ratio" - bar "Deterministic" [1.68, 6.16, 1.30, 2.12, 1.00, 1.93, 1.43] - line "Best LLM" [1.55, 4.49, 1.28, 3.28, 1.00, 1.46, 1.40] + bar [1.68, 1.55, 6.16, 
4.49, 1.30, 1.28, 2.12, 3.28, 1.00, 1.00, 1.93, 1.46, 1.43, 1.40] ``` -*Bars: deterministic · Line: best LLM method* - > **Key findings:** > LLM wins on prose-heavy scenarios: Deep conversation, Technical explanation > Deterministic wins on structured/technical content: Coding assistant, Long Q&A, Tool-heavy, Structured content From 67b6ef8aff9f8f50d4077916dc66956a5db99b39 Mon Sep 17 00:00:00 2001 From: Lisa Date: Wed, 25 Feb 2026 17:50:05 +0100 Subject: [PATCH 08/13] fix(bench): replace broken LLM comparison chart with summary table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mermaid xychart can't do grouped bars — stacks or overlaps labels. Replace with a clean comparison table showing Det vs Best LLM ratio, delta percentage, and winner per scenario. --- bench/baseline.ts | 44 ++++++++++++++--------------- bench/baselines/current.json | 2 +- bench/baselines/history/v1.0.0.json | 2 +- docs/benchmark-results.md | 18 +++++++----- 4 files changed, 34 insertions(+), 32 deletions(-) diff --git a/bench/baseline.ts b/bench/baseline.ts index 7082469..d96371e 100644 --- a/bench/baseline.ts +++ b/bench/baseline.ts @@ -446,11 +446,11 @@ function dedupChart(dedup: Record): string[] { ]; } -function llmComparisonChart( +function llmComparisonTable( basic: Record, llmResults: LlmBenchmarkResult[], ): string[] { - // Use the best LLM result (highest average vsDet) for the chart + // Use the best LLM result (highest average vsDet) for the summary table let bestLlm: LlmBenchmarkResult | undefined; let bestAvg = -Infinity; for (const llm of llmResults) { @@ -460,7 +460,8 @@ function llmComparisonChart( if (mr.vsDet != null && mr.vsDet > 0) vsDetValues.push(mr.vsDet); } } - const avg = vsDetValues.length > 0 ? vsDetValues.reduce((a, b) => a + b, 0) / vsDetValues.length : 0; + const avg = + vsDetValues.length > 0 ? 
vsDetValues.reduce((a, b) => a + b, 0) / vsDetValues.length : 0; if (avg > bestAvg) { bestAvg = avg; bestLlm = llm; @@ -468,33 +469,30 @@ function llmComparisonChart( } if (!bestLlm) return []; - // Match scenarios that exist in both basic and LLM results const sharedScenarios = Object.keys(basic).filter((s) => s in bestLlm!.scenarios); if (sharedScenarios.length === 0) return []; - // Interleave labels and values: "Coding (Det)", "Coding (LLM)", ... - const labels: string[] = []; - const values: number[] = []; + const lines: string[] = []; + lines.push(`*Best provider: ${bestLlm.provider}/${bestLlm.model}*`); + lines.push(''); + lines.push('| Scenario | Det | Best LLM | Delta | Winner |'); + lines.push('| --- | ---: | ---: | ---: | --- |'); + for (const s of sharedScenarios) { - const sn = shortName(s); - labels.push(`"${sn} (Det)"`, `"${sn} (LLM)"`); const detR = basic[s].ratio; const methods = Object.values(bestLlm!.scenarios[s].methods).filter( (m) => m.vsDet != null, ); const llmR = methods.length > 0 ? Math.max(...methods.map((m) => m.ratio)) : detR; - values.push(detR, llmR); + const delta = Math.round(((llmR - detR) / detR) * 100); + const sign = delta >= 0 ? '+' : ''; + const winner = llmR > detR + 0.01 ? 'LLM' : detR > llmR + 0.01 ? 
'Det' : 'Tie'; + lines.push( + `| ${s} | ${fix(detR)}x | ${fix(llmR)}x | ${sign}${delta}% | ${winner} |`, + ); } - return [ - '```mermaid', - 'xychart-beta', - ` title "Deterministic vs LLM (${bestLlm.provider}/${bestLlm.model})"`, - ` x-axis [${labels.join(', ')}]`, - ' y-axis "Char Ratio"', - ` bar [${values.map((v) => fix(v)).join(', ')}]`, - '```', - ]; + return lines; } // --------------------------------------------------------------------------- @@ -610,10 +608,10 @@ function generateLlmSection( ); lines.push(''); - // Summary chart - const chart = llmComparisonChart(basic, llmResults); - if (chart.length > 0) { - lines.push(...chart); + // Summary comparison table + const table = llmComparisonTable(basic, llmResults); + if (table.length > 0) { + lines.push(...table); lines.push(''); } diff --git a/bench/baselines/current.json b/bench/baselines/current.json index fc5fdac..371e056 100644 --- a/bench/baselines/current.json +++ b/bench/baselines/current.json @@ -1,6 +1,6 @@ { "version": "1.0.0", - "generated": "2026-02-25T16:09:56.537Z", + "generated": "2026-02-25T16:49:46.729Z", "results": { "basic": { "Coding assistant": { diff --git a/bench/baselines/history/v1.0.0.json b/bench/baselines/history/v1.0.0.json index fc5fdac..371e056 100644 --- a/bench/baselines/history/v1.0.0.json +++ b/bench/baselines/history/v1.0.0.json @@ -1,6 +1,6 @@ { "version": "1.0.0", - "generated": "2026-02-25T16:09:56.537Z", + "generated": "2026-02-25T16:49:46.729Z", "results": { "basic": { "Coding assistant": { diff --git a/docs/benchmark-results.md b/docs/benchmark-results.md index 04d8ac2..4eac953 100644 --- a/docs/benchmark-results.md +++ b/docs/benchmark-results.md @@ -98,13 +98,17 @@ Target: **2000 tokens** · 1/4 fit > Results are **non-deterministic** — LLM outputs vary between runs. Saved as reference data, not used for regression testing. 
-```mermaid -xychart-beta - title "Deterministic vs LLM (ollama/llama3.2)" - x-axis ["Coding (Det)", "Coding (LLM)", "Long Q&A (Det)", "Long Q&A (LLM)", "Tool-heavy (Det)", "Tool-heavy (LLM)", "Deep (Det)", "Deep (LLM)", "Technical (Det)", "Technical (LLM)", "Structured (Det)", "Structured (LLM)", "Agentic (Det)", "Agentic (LLM)"] - y-axis "Char Ratio" - bar [1.68, 1.55, 6.16, 4.49, 1.30, 1.28, 2.12, 3.28, 1.00, 1.00, 1.93, 1.46, 1.43, 1.40] -``` +*Best provider: ollama/llama3.2* + +| Scenario | Det | Best LLM | Delta | Winner | +| --- | ---: | ---: | ---: | --- | +| Coding assistant | 1.68x | 1.55x | -8% | Det | +| Long Q&A | 6.16x | 4.49x | -27% | Det | +| Tool-heavy | 1.30x | 1.28x | -1% | Det | +| Deep conversation | 2.12x | 3.28x | +54% | LLM | +| Technical explanation | 1.00x | 1.00x | +0% | Tie | +| Structured content | 1.93x | 1.46x | -25% | Det | +| Agentic coding session | 1.43x | 1.40x | -2% | Det | > **Key findings:** > LLM wins on prose-heavy scenarios: Deep conversation, Technical explanation From 86526640f0016a041d6c7a4da69812e9bc6c181f Mon Sep 17 00:00:00 2001 From: Lisa Date: Wed, 25 Feb 2026 19:36:54 +0100 Subject: [PATCH 09/13] feat(bench): ASCII horizontal bar chart for LLM vs deterministic comparison Render comparison as paired horizontal bars inside a fenced code block (monospace), replacing the broken Mermaid chart. Each scenario shows Det and LLM bars side by side with ratios and a star for LLM wins. 
--- bench/baseline.ts | 51 +++++++++++++++++++---------- bench/baselines/current.json | 2 +- bench/baselines/history/v1.0.0.json | 2 +- docs/benchmark-results.md | 37 ++++++++++++++------- 4 files changed, 61 insertions(+), 31 deletions(-) diff --git a/bench/baseline.ts b/bench/baseline.ts index d96371e..f21b6c7 100644 --- a/bench/baseline.ts +++ b/bench/baseline.ts @@ -446,11 +446,16 @@ function dedupChart(dedup: Record): string[] { ]; } -function llmComparisonTable( +function asciiBar(value: number, max: number, width: number): string { + const filled = Math.round((value / max) * width); + return '\u2588'.repeat(filled) + '\u2591'.repeat(width - filled); +} + +function llmComparisonChart( basic: Record, llmResults: LlmBenchmarkResult[], ): string[] { - // Use the best LLM result (highest average vsDet) for the summary table + // Use the best LLM result (highest average vsDet) for the summary let bestLlm: LlmBenchmarkResult | undefined; let bestAvg = -Infinity; for (const llm of llmResults) { @@ -472,25 +477,35 @@ function llmComparisonTable( const sharedScenarios = Object.keys(basic).filter((s) => s in bestLlm!.scenarios); if (sharedScenarios.length === 0) return []; - const lines: string[] = []; - lines.push(`*Best provider: ${bestLlm.provider}/${bestLlm.model}*`); - lines.push(''); - lines.push('| Scenario | Det | Best LLM | Delta | Winner |'); - lines.push('| --- | ---: | ---: | ---: | --- |'); - + // Collect data and find max for scaling + const rows: { name: string; detR: number; llmR: number }[] = []; for (const s of sharedScenarios) { const detR = basic[s].ratio; const methods = Object.values(bestLlm!.scenarios[s].methods).filter( (m) => m.vsDet != null, ); const llmR = methods.length > 0 ? Math.max(...methods.map((m) => m.ratio)) : detR; - const delta = Math.round(((llmR - detR) / detR) * 100); - const sign = delta >= 0 ? '+' : ''; - const winner = llmR > detR + 0.01 ? 'LLM' : detR > llmR + 0.01 ? 
'Det' : 'Tie'; - lines.push( - `| ${s} | ${fix(detR)}x | ${fix(llmR)}x | ${sign}${delta}% | ${winner} |`, - ); + rows.push({ name: s, detR, llmR }); + } + const maxR = Math.max(...rows.flatMap((r) => [r.detR, r.llmR])); + const barWidth = 30; + const nameWidth = Math.max(...rows.map((r) => r.name.length)); + + const lines: string[] = []; + lines.push('```'); + lines.push(`Deterministic vs LLM (${bestLlm.provider}/${bestLlm.model})`); + lines.push(''); + for (const r of rows) { + const label = r.name.padEnd(nameWidth); + const detBar = asciiBar(r.detR, maxR, barWidth); + const llmBar = asciiBar(r.llmR, maxR, barWidth); + const winner = r.llmR > r.detR + 0.01 ? ' \u2605' : ''; + lines.push(`${label} Det ${detBar} ${fix(r.detR)}x`); + lines.push(`${' '.repeat(nameWidth)} LLM ${llmBar} ${fix(r.llmR)}x${winner}`); + lines.push(''); } + lines.push('\u2605 = LLM wins'); + lines.push('```'); return lines; } @@ -608,10 +623,10 @@ function generateLlmSection( ); lines.push(''); - // Summary comparison table - const table = llmComparisonTable(basic, llmResults); - if (table.length > 0) { - lines.push(...table); + // Summary comparison chart (ASCII horizontal bars in code block) + const chart = llmComparisonChart(basic, llmResults); + if (chart.length > 0) { + lines.push(...chart); lines.push(''); } diff --git a/bench/baselines/current.json b/bench/baselines/current.json index 371e056..b6a5a33 100644 --- a/bench/baselines/current.json +++ b/bench/baselines/current.json @@ -1,6 +1,6 @@ { "version": "1.0.0", - "generated": "2026-02-25T16:49:46.729Z", + "generated": "2026-02-25T18:36:31.625Z", "results": { "basic": { "Coding assistant": { diff --git a/bench/baselines/history/v1.0.0.json b/bench/baselines/history/v1.0.0.json index 371e056..b6a5a33 100644 --- a/bench/baselines/history/v1.0.0.json +++ b/bench/baselines/history/v1.0.0.json @@ -1,6 +1,6 @@ { "version": "1.0.0", - "generated": "2026-02-25T16:49:46.729Z", + "generated": "2026-02-25T18:36:31.625Z", "results": { "basic": 
{ "Coding assistant": { diff --git a/docs/benchmark-results.md b/docs/benchmark-results.md index 4eac953..5017cc7 100644 --- a/docs/benchmark-results.md +++ b/docs/benchmark-results.md @@ -98,17 +98,32 @@ Target: **2000 tokens** · 1/4 fit > Results are **non-deterministic** — LLM outputs vary between runs. Saved as reference data, not used for regression testing. -*Best provider: ollama/llama3.2* - -| Scenario | Det | Best LLM | Delta | Winner | -| --- | ---: | ---: | ---: | --- | -| Coding assistant | 1.68x | 1.55x | -8% | Det | -| Long Q&A | 6.16x | 4.49x | -27% | Det | -| Tool-heavy | 1.30x | 1.28x | -1% | Det | -| Deep conversation | 2.12x | 3.28x | +54% | LLM | -| Technical explanation | 1.00x | 1.00x | +0% | Tie | -| Structured content | 1.93x | 1.46x | -25% | Det | -| Agentic coding session | 1.43x | 1.40x | -2% | Det | +``` +Deterministic vs LLM (ollama/llama3.2) + +Coding assistant Det ████████░░░░░░░░░░░░░░░░░░░░░░ 1.68x + LLM ████████░░░░░░░░░░░░░░░░░░░░░░ 1.55x + +Long Q&A Det ██████████████████████████████ 6.16x + LLM ██████████████████████░░░░░░░░ 4.49x + +Tool-heavy Det ██████░░░░░░░░░░░░░░░░░░░░░░░░ 1.30x + LLM ██████░░░░░░░░░░░░░░░░░░░░░░░░ 1.28x + +Deep conversation Det ██████████░░░░░░░░░░░░░░░░░░░░ 2.12x + LLM ████████████████░░░░░░░░░░░░░░ 3.28x ★ + +Technical explanation Det █████░░░░░░░░░░░░░░░░░░░░░░░░░ 1.00x + LLM █████░░░░░░░░░░░░░░░░░░░░░░░░░ 1.00x + +Structured content Det █████████░░░░░░░░░░░░░░░░░░░░░ 1.93x + LLM ███████░░░░░░░░░░░░░░░░░░░░░░░ 1.46x + +Agentic coding session Det ███████░░░░░░░░░░░░░░░░░░░░░░░ 1.43x + LLM ███████░░░░░░░░░░░░░░░░░░░░░░░ 1.40x + +★ = LLM wins +``` > **Key findings:** > LLM wins on prose-heavy scenarios: Deep conversation, Technical explanation From c9271ff71646daf1aa8e57ef70b096cfa05257e0 Mon Sep 17 00:00:00 2001 From: Lisa Date: Wed, 25 Feb 2026 19:59:29 +0100 Subject: [PATCH 10/13] refactor(compress): unify sync/async paths via generator (#5) compressSync and compressAsync were identical (~180 lines 
each) except for 2 summarize call sites. Replace both with a single compressGen generator that yields summarize requests, driven by thin sync/async runners. Removes 149 lines of duplication, no public API changes. --- src/compress.ts | 225 ++++++++---------------------------------------- 1 file changed, 38 insertions(+), 187 deletions(-) diff --git a/src/compress.ts b/src/compress.ts index 68e2641..1b3068a 100644 --- a/src/compress.ts +++ b/src/compress.ts @@ -614,190 +614,10 @@ function computeStats( } // --------------------------------------------------------------------------- -// Sync compression (internal) +// Unified compression core (generator + sync/async runners) // --------------------------------------------------------------------------- -function compressSync(messages: Message[], options: CompressOptions = {}): CompressResult { - const sourceVersion = options.sourceVersion ?? 0; - const counter = options.tokenCounter ?? defaultTokenCounter; - - if (messages.length === 0) { - return { - messages: [], - compression: { - original_version: sourceVersion, - ratio: 1, - token_ratio: 1, - messages_compressed: 0, - messages_preserved: 0, - }, - verbatim: {}, - }; - } - - const preserveRoles = new Set(options.preserve ?? ['system']); - const recencyWindow = options.recencyWindow ?? 4; - const recencyStart = Math.max(0, messages.length - (recencyWindow > 0 ? recencyWindow : 0)); - let dedupAnnotations = - (options.dedup ?? true) ? analyzeDuplicates(messages, recencyStart, preserveRoles) : undefined; - - if (options.fuzzyDedup) { - const fuzzyAnnotations = analyzeFuzzyDuplicates( - messages, - recencyStart, - preserveRoles, - dedupAnnotations ?? new Map(), - options.fuzzyThreshold ?? 
0.85, - ); - if (fuzzyAnnotations.size > 0) { - if (!dedupAnnotations) dedupAnnotations = new Map(); - for (const [idx, ann] of fuzzyAnnotations) { - dedupAnnotations.set(idx, ann); - } - } - } - - const classified = classifyAll(messages, preserveRoles, recencyWindow, dedupAnnotations); - - const result: Message[] = []; - const verbatim: Record = {}; - let messagesCompressed = 0; - let messagesPreserved = 0; - let messagesDeduped = 0; - let messagesFuzzyDeduped = 0; - let i = 0; - - while (i < classified.length) { - const { msg, preserved } = classified[i]; - - if (preserved) { - result.push(msg); - messagesPreserved++; - i++; - continue; - } - - // Dedup: replace earlier duplicate/near-duplicate with compact reference - if (classified[i].dedup) { - const annotation = classified[i].dedup!; - const keepTargetId = messages[annotation.duplicateOfIndex].id; - const tag = - annotation.similarity != null - ? `[cce:near-dup of ${keepTargetId} — ${annotation.contentLength} chars, ~${Math.round(annotation.similarity * 100)}% match]` - : `[cce:dup of ${keepTargetId} — ${annotation.contentLength} chars]`; - result.push(buildCompressedMessage(msg, [msg.id], tag, sourceVersion, verbatim, [msg])); - if (annotation.similarity != null) { - messagesFuzzyDeduped++; - } else { - messagesDeduped++; - } - i++; - continue; - } - - // Code-split: extract fences verbatim, summarize surrounding prose - if (classified[i].codeSplit) { - const content = typeof msg.content === 'string' ? msg.content : ''; - const segments = splitCodeAndProse(content); - const proseText = segments - .filter((s) => s.type === 'prose') - .map((s) => s.content) - .join(' '); - const codeFences = segments.filter((s) => s.type === 'code').map((s) => s.content); - const proseBudget = proseText.length < 600 ? 200 : 400; - const summaryText = summarize(proseText, proseBudget); - const embeddedId = options.embedSummaryId ? 
makeSummaryId([msg.id]) : undefined; - const compressed = `${formatSummary(summaryText, proseText, undefined, true, embeddedId)}\n\n${codeFences.join('\n\n')}`; - - if (compressed.length >= content.length) { - result.push(msg); - messagesPreserved++; - i++; - continue; - } - - result.push( - buildCompressedMessage(msg, [msg.id], compressed, sourceVersion, verbatim, [msg]), - ); - messagesCompressed++; - i++; - continue; - } - - // Collect consecutive non-preserved messages with the SAME role - const { group, nextIdx } = collectGroup(classified, i); - i = nextIdx; - - const allContent = group - .map((g) => (typeof g.msg.content === 'string' ? g.msg.content : '')) - .join(' '); - const contentBudget = allContent.length < 600 ? 200 : 400; - const summaryText = isStructuredOutput(allContent) - ? summarizeStructured(allContent, contentBudget) - : summarize(allContent, contentBudget); - - if (group.length > 1) { - const mergeIds = group.map((g) => g.msg.id); - const embeddedId = options.embedSummaryId ? makeSummaryId(mergeIds) : undefined; - let summary = formatSummary(summaryText, allContent, group.length, undefined, embeddedId); - const combinedLength = group.reduce((sum, g) => sum + contentLength(g.msg), 0); - if (summary.length >= combinedLength) { - summary = formatSummary(summaryText, allContent, group.length, true, embeddedId); - } - - if (summary.length >= combinedLength) { - for (const g of group) { - result.push(g.msg); - messagesPreserved++; - } - } else { - const sourceMsgs = group.map((g) => g.msg); - const base: Message = { ...sourceMsgs[0] }; - result.push( - buildCompressedMessage(base, mergeIds, summary, sourceVersion, verbatim, sourceMsgs), - ); - messagesCompressed += group.length; - } - } else { - const single = group[0].msg; - const content = typeof single.content === 'string' ? single.content : ''; - const embeddedId = options.embedSummaryId ? 
makeSummaryId([single.id]) : undefined; - let summary = formatSummary(summaryText, allContent, undefined, undefined, embeddedId); - if (summary.length >= content.length) { - summary = formatSummary(summaryText, allContent, undefined, true, embeddedId); - } - - if (summary.length >= content.length) { - result.push(single); - messagesPreserved++; - } else { - result.push( - buildCompressedMessage(single, [single.id], summary, sourceVersion, verbatim, [single]), - ); - messagesCompressed++; - } - } - } - - return { - messages: result, - compression: computeStats( - messages, - result, - messagesCompressed, - messagesPreserved, - sourceVersion, - counter, - messagesDeduped, - messagesFuzzyDeduped, - ), - verbatim, - }; -} - -// --------------------------------------------------------------------------- -// Async compression (internal, LLM summarizer support) -// --------------------------------------------------------------------------- +type SummarizeRequest = { text: string; budget: number }; async function withFallback( text: string, @@ -816,13 +636,12 @@ async function withFallback( return summarize(text, maxBudget); } -async function compressAsync( +function* compressGen( messages: Message[], options: CompressOptions = {}, -): Promise { +): Generator { const sourceVersion = options.sourceVersion ?? 0; const counter = options.tokenCounter ?? defaultTokenCounter; - const userSummarizer = options.summarizer; if (messages.length === 0) { return { @@ -908,7 +727,7 @@ async function compressAsync( .join(' '); const codeFences = segments.filter((s) => s.type === 'code').map((s) => s.content); const proseBudget = proseText.length < 600 ? 200 : 400; - const summaryText = await withFallback(proseText, userSummarizer, proseBudget); + const summaryText: string = yield { text: proseText, budget: proseBudget }; const embeddedId = options.embedSummaryId ? 
makeSummaryId([msg.id]) : undefined; const compressed = `${formatSummary(summaryText, proseText, undefined, true, embeddedId)}\n\n${codeFences.join('\n\n')}`; @@ -937,7 +756,7 @@ async function compressAsync( const contentBudget = allContent.length < 600 ? 200 : 400; const summaryText = isStructuredOutput(allContent) ? summarizeStructured(allContent, contentBudget) - : await withFallback(allContent, userSummarizer, contentBudget); + : yield { text: allContent, budget: contentBudget }; if (group.length > 1) { const mergeIds = group.map((g) => g.msg.id); @@ -998,6 +817,38 @@ async function compressAsync( }; } +function runCompressSync(gen: Generator): CompressResult { + let next = gen.next(); + while (!next.done) { + const { text, budget } = next.value; + next = gen.next(summarize(text, budget)); + } + return next.value; +} + +async function runCompressAsync( + gen: Generator, + userSummarizer?: Summarizer, +): Promise { + let next = gen.next(); + while (!next.done) { + const { text, budget } = next.value; + next = gen.next(await withFallback(text, userSummarizer, budget)); + } + return next.value; +} + +function compressSync(messages: Message[], options: CompressOptions = {}): CompressResult { + return runCompressSync(compressGen(messages, options)); +} + +async function compressAsync( + messages: Message[], + options: CompressOptions = {}, +): Promise { + return runCompressAsync(compressGen(messages, options), options.summarizer); +} + // --------------------------------------------------------------------------- // Token budget helpers (absorbed from compressToFit) // --------------------------------------------------------------------------- From 1670993843c9488e2563c210110e2992ddf14c6b Mon Sep 17 00:00:00 2001 From: Lisa Date: Wed, 25 Feb 2026 20:01:42 +0100 Subject: [PATCH 11/13] docs: clarify defaultTokenCounter rationale across docs and source --- docs/api-reference.md | 2 +- docs/token-budget.md | 2 +- src/compress.ts | 11 ++++++++++- src/types.ts | 2 +- 4 
files changed, 13 insertions(+), 4 deletions(-) diff --git a/docs/api-reference.md b/docs/api-reference.md index 7fd7843..9f5973b 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -174,7 +174,7 @@ function defaultTokenCounter(msg: Message): number; Math.ceil(msg.content.length / 3.5); ``` -Approximates ~3.5 characters per token. Suitable for rough estimates. For accurate budgeting, replace with a real tokenizer. See [Token budget](token-budget.md). +The 3.5 chars/token ratio comes from empirical measurements of GPT-family BPE tokenizers (cl100k_base, o200k_base) on mixed English text, which fall in the ~3.2–4.5 range. The lower end of that range is chosen intentionally so budget estimates stay conservative — over-counting tokens is safer than under-counting. For accurate budgeting, replace with a real tokenizer. See [Token budget](token-budget.md). --- diff --git a/docs/token-budget.md b/docs/token-budget.md index cb1a9f4..c1fabe2 100644 --- a/docs/token-budget.md +++ b/docs/token-budget.md @@ -49,7 +49,7 @@ function defaultTokenCounter(msg: Message): number { } ``` -~3.5 characters per token is a rough heuristic. It's fast and works for ballpark estimates, but real tokenizers vary: +~3.5 characters per token is derived from empirical measurements of GPT-family BPE tokenizers (cl100k_base, o200k_base) on mixed English text. We pick the lower end of the observed range so estimates are conservative — slightly over-counting tokens is safer than under-counting and blowing the budget. It's fast and works for ballpark estimates, but real tokenizers vary: | Tokenizer | Typical chars/token | | --------- | ------------------- | diff --git a/src/compress.ts b/src/compress.ts index 1b3068a..b77b72c 100644 --- a/src/compress.ts +++ b/src/compress.ts @@ -418,7 +418,16 @@ function contentLength(msg: Message): number { return typeof msg.content === 'string' ? msg.content.length : 0; } -/** Default token counter: ~3.5 chars/token heuristic. 
*/ +/** + * Default token counter: ~3.5 chars/token heuristic. + * + * The 3.5 ratio comes from empirical measurements of GPT-family BPE + * tokenizers (cl100k_base, o200k_base) on mixed English text. Observed + * values range from ~3.2 (code-heavy) to ~4.5 (plain prose). We pick the + * lower end so budget estimates stay conservative (slightly over-counting + * tokens is safer than under-counting). Users who need exact counts can + * supply a real tokenizer via the `tokenCounter` option. + */ export function defaultTokenCounter(msg: Message): number { return Math.ceil(contentLength(msg) / 3.5); } diff --git a/src/types.ts b/src/types.ts index d885de3..16e4fd3 100644 --- a/src/types.ts +++ b/src/types.ts @@ -32,7 +32,7 @@ export type CompressOptions = { embedSummaryId?: boolean; /** Hard-truncate non-recency messages when binary search bottoms out and budget still exceeded. Default: false. */ forceConverge?: boolean; - /** Custom token counter per message. Default: ceil(content.length / 3.5). */ + /** Custom token counter per message. Default: ceil(content.length / 3.5) — see defaultTokenCounter for rationale. 
*/ tokenCounter?: (msg: Message) => number; }; From 04ee50dbf2a1e5ad15337107930cb6473fd6b058 Mon Sep 17 00:00:00 2001 From: Lisa Date: Wed, 25 Feb 2026 20:13:53 +0100 Subject: [PATCH 12/13] feat(bench): add provider summary, fuzzy dedup delta, per-provider ASCII charts - Cross-provider summary table with avg ratio, vsDet, budget fits, time - Fuzzy dedup table gains "vs Base" column highlighting improvements - ASCII comparison charts now render for all providers, not just best --- bench/baseline.ts | 144 +++++++++++++++++----------- bench/baselines/current.json | 2 +- bench/baselines/history/v1.0.0.json | 2 +- docs/benchmark-results.md | 56 ++++++++--- 4 files changed, 137 insertions(+), 67 deletions(-) diff --git a/bench/baseline.ts b/bench/baseline.ts index f21b6c7..df7df75 100644 --- a/bench/baseline.ts +++ b/bench/baseline.ts @@ -451,61 +451,44 @@ function asciiBar(value: number, max: number, width: number): string { return '\u2588'.repeat(filled) + '\u2591'.repeat(width - filled); } -function llmComparisonChart( +function llmComparisonCharts( basic: Record, llmResults: LlmBenchmarkResult[], ): string[] { - // Use the best LLM result (highest average vsDet) for the summary - let bestLlm: LlmBenchmarkResult | undefined; - let bestAvg = -Infinity; + const lines: string[] = []; + const barWidth = 30; + for (const llm of llmResults) { - const vsDetValues: number[] = []; - for (const sr of Object.values(llm.scenarios)) { - for (const mr of Object.values(sr.methods)) { - if (mr.vsDet != null && mr.vsDet > 0) vsDetValues.push(mr.vsDet); - } + const sharedScenarios = Object.keys(basic).filter((s) => s in llm.scenarios); + if (sharedScenarios.length === 0) continue; + + // Collect data and find max for scaling + const rows: { name: string; detR: number; llmR: number }[] = []; + for (const s of sharedScenarios) { + const detR = basic[s].ratio; + const methods = Object.values(llm.scenarios[s].methods).filter((m) => m.vsDet != null); + const llmR = methods.length > 0 ? 
Math.max(...methods.map((m) => m.ratio)) : detR; + rows.push({ name: s, detR, llmR }); } - const avg = - vsDetValues.length > 0 ? vsDetValues.reduce((a, b) => a + b, 0) / vsDetValues.length : 0; - if (avg > bestAvg) { - bestAvg = avg; - bestLlm = llm; - } - } - if (!bestLlm) return []; + const maxR = Math.max(...rows.flatMap((r) => [r.detR, r.llmR])); + const nameWidth = Math.max(...rows.map((r) => r.name.length)); - const sharedScenarios = Object.keys(basic).filter((s) => s in bestLlm!.scenarios); - if (sharedScenarios.length === 0) return []; - - // Collect data and find max for scaling - const rows: { name: string; detR: number; llmR: number }[] = []; - for (const s of sharedScenarios) { - const detR = basic[s].ratio; - const methods = Object.values(bestLlm!.scenarios[s].methods).filter( - (m) => m.vsDet != null, - ); - const llmR = methods.length > 0 ? Math.max(...methods.map((m) => m.ratio)) : detR; - rows.push({ name: s, detR, llmR }); - } - const maxR = Math.max(...rows.flatMap((r) => [r.detR, r.llmR])); - const barWidth = 30; - const nameWidth = Math.max(...rows.map((r) => r.name.length)); - - const lines: string[] = []; - lines.push('```'); - lines.push(`Deterministic vs LLM (${bestLlm.provider}/${bestLlm.model})`); - lines.push(''); - for (const r of rows) { - const label = r.name.padEnd(nameWidth); - const detBar = asciiBar(r.detR, maxR, barWidth); - const llmBar = asciiBar(r.llmR, maxR, barWidth); - const winner = r.llmR > r.detR + 0.01 ? ' \u2605' : ''; - lines.push(`${label} Det ${detBar} ${fix(r.detR)}x`); - lines.push(`${' '.repeat(nameWidth)} LLM ${llmBar} ${fix(r.llmR)}x${winner}`); + lines.push('```'); + lines.push(`Deterministic vs ${llm.provider}/${llm.model}`); + lines.push(''); + for (const r of rows) { + const label = r.name.padEnd(nameWidth); + const detBar = asciiBar(r.detR, maxR, barWidth); + const llmBar = asciiBar(r.llmR, maxR, barWidth); + const winner = r.llmR > r.detR + 0.01 ? 
' \u2605' : ''; + lines.push(`${label} Det ${detBar} ${fix(r.detR)}x`); + lines.push(`${' '.repeat(nameWidth)} LLM ${llmBar} ${fix(r.llmR)}x${winner}`); + lines.push(''); + } + lines.push('\u2605 = LLM wins'); + lines.push('```'); lines.push(''); } - lines.push('\u2605 = LLM wins'); - lines.push('```'); return lines; } @@ -574,10 +557,13 @@ function generateDedupSection(r: BenchmarkResults): string[] { lines.push('### Fuzzy Dedup'); lines.push(''); } - lines.push('| Scenario | Exact Deduped | Fuzzy Deduped | Ratio |'); - lines.push('| --- | ---: | ---: | ---: |'); + lines.push('| Scenario | Exact Deduped | Fuzzy Deduped | Ratio | vs Base |'); + lines.push('| --- | ---: | ---: | ---: | ---: |'); for (const [name, v] of Object.entries(r.fuzzyDedup)) { - lines.push(`| ${name} | ${v.exact} | ${v.fuzzy} | ${fix(v.ratio)} |`); + const baseRatio = r.basic[name]?.ratio ?? v.ratio; + const improvement = + v.ratio > baseRatio + 0.01 ? `+${Math.round(((v.ratio - baseRatio) / baseRatio) * 100)}%` : '-'; + lines.push(`| ${name} | ${v.exact} | ${v.fuzzy} | ${fix(v.ratio)} | ${improvement} |`); } return lines; } @@ -623,10 +609,60 @@ function generateLlmSection( ); lines.push(''); - // Summary comparison chart (ASCII horizontal bars in code block) - const chart = llmComparisonChart(basic, llmResults); - if (chart.length > 0) { - lines.push(...chart); + // Per-provider comparison charts (ASCII horizontal bars in code blocks) + const charts = llmComparisonCharts(basic, llmResults); + if (charts.length > 0) { + lines.push(...charts); + } + + // Cross-provider summary table + if (llmResults.length > 0) { + lines.push('### Provider Summary'); + lines.push(''); + lines.push( + '| Provider | Model | Avg Ratio | Avg vsDet | Round-trip | Budget Fits | Avg Time |', + ); + lines.push('| --- | --- | ---: | ---: | --- | --- | ---: |'); + for (const llm of llmResults) { + const ratioValues: number[] = []; + const vsDetValues: number[] = []; + const timeValues: number[] = []; + let passCount = 
0; + let totalCount = 0; + for (const sr of Object.values(llm.scenarios)) { + for (const mr of Object.values(sr.methods)) { + ratioValues.push(mr.ratio); + if (mr.vsDet != null) vsDetValues.push(mr.vsDet); + timeValues.push(mr.timeMs); + totalCount++; + if (mr.roundTrip === 'PASS') passCount++; + } + } + const avgRatio = ratioValues.length > 0 + ? ratioValues.reduce((a, b) => a + b, 0) / ratioValues.length + : 0; + const avgVsDet = vsDetValues.length > 0 + ? vsDetValues.reduce((a, b) => a + b, 0) / vsDetValues.length + : 0; + const avgTime = timeValues.length > 0 + ? timeValues.reduce((a, b) => a + b, 0) / timeValues.length + : 0; + const rt = passCount === totalCount ? 'all PASS' : `${passCount}/${totalCount}`; + + // Token budget summary + let budgetFits = '-'; + if (llm.tokenBudget) { + const allEntries = Object.values(llm.tokenBudget).flat(); + if (allEntries.length > 0) { + const fitCount = allEntries.filter((e) => e.fits).length; + budgetFits = `${fitCount}/${allEntries.length}`; + } + } + + lines.push( + `| ${llm.provider} | ${llm.model} | ${fix(avgRatio)}x | ${fix(avgVsDet)} | ${rt} | ${budgetFits} | ${formatTime(avgTime)} |`, + ); + } lines.push(''); } diff --git a/bench/baselines/current.json b/bench/baselines/current.json index b6a5a33..b2c8976 100644 --- a/bench/baselines/current.json +++ b/bench/baselines/current.json @@ -1,6 +1,6 @@ { "version": "1.0.0", - "generated": "2026-02-25T18:36:31.625Z", + "generated": "2026-02-25T19:10:23.701Z", "results": { "basic": { "Coding assistant": { diff --git a/bench/baselines/history/v1.0.0.json b/bench/baselines/history/v1.0.0.json index b6a5a33..b2c8976 100644 --- a/bench/baselines/history/v1.0.0.json +++ b/bench/baselines/history/v1.0.0.json @@ -1,6 +1,6 @@ { "version": "1.0.0", - "generated": "2026-02-25T18:36:31.625Z", + "generated": "2026-02-25T19:10:23.701Z", "results": { "basic": { "Coding assistant": { diff --git a/docs/benchmark-results.md b/docs/benchmark-results.md index 5017cc7..9a1ac7d 100644 --- 
a/docs/benchmark-results.md +++ b/docs/benchmark-results.md @@ -72,16 +72,16 @@ xychart-beta ### Fuzzy Dedup -| Scenario | Exact Deduped | Fuzzy Deduped | Ratio | -| --- | ---: | ---: | ---: | -| Coding assistant | 0 | 0 | 1.68 | -| Long Q&A | 1 | 0 | 6.16 | -| Tool-heavy | 0 | 0 | 1.30 | -| Short conversation | 0 | 0 | 1.00 | -| Deep conversation | 0 | 0 | 2.12 | -| Technical explanation | 0 | 0 | 1.00 | -| Structured content | 0 | 0 | 1.93 | -| Agentic coding session | 4 | 2 | 2.23 | +| Scenario | Exact Deduped | Fuzzy Deduped | Ratio | vs Base | +| --- | ---: | ---: | ---: | ---: | +| Coding assistant | 0 | 0 | 1.68 | - | +| Long Q&A | 1 | 0 | 6.16 | - | +| Tool-heavy | 0 | 0 | 1.30 | - | +| Short conversation | 0 | 0 | 1.00 | - | +| Deep conversation | 0 | 0 | 2.12 | - | +| Technical explanation | 0 | 0 | 1.00 | - | +| Structured content | 0 | 0 | 1.93 | - | +| Agentic coding session | 4 | 2 | 2.23 | +56% | ## Token Budget @@ -99,7 +99,7 @@ Target: **2000 tokens** · 1/4 fit > Results are **non-deterministic** — LLM outputs vary between runs. Saved as reference data, not used for regression testing. 
``` -Deterministic vs LLM (ollama/llama3.2) +Deterministic vs ollama/llama3.2 Coding assistant Det ████████░░░░░░░░░░░░░░░░░░░░░░ 1.68x LLM ████████░░░░░░░░░░░░░░░░░░░░░░ 1.55x @@ -125,6 +125,40 @@ Agentic coding session Det ███████░░░░░░░░░░ ★ = LLM wins ``` +``` +Deterministic vs openai/gpt-4.1-mini + +Coding assistant Det ████████░░░░░░░░░░░░░░░░░░░░░░ 1.68x + LLM ████████░░░░░░░░░░░░░░░░░░░░░░ 1.64x + +Long Q&A Det ██████████████████████████████ 6.16x + LLM ██████████████████████████░░░░ 5.37x + +Tool-heavy Det ██████░░░░░░░░░░░░░░░░░░░░░░░░ 1.30x + LLM █████░░░░░░░░░░░░░░░░░░░░░░░░░ 1.12x + +Deep conversation Det ██████████░░░░░░░░░░░░░░░░░░░░ 2.12x + LLM ████████████░░░░░░░░░░░░░░░░░░ 2.37x ★ + +Technical explanation Det █████░░░░░░░░░░░░░░░░░░░░░░░░░ 1.00x + LLM █████░░░░░░░░░░░░░░░░░░░░░░░░░ 1.00x + +Structured content Det █████████░░░░░░░░░░░░░░░░░░░░░ 1.93x + LLM ██████░░░░░░░░░░░░░░░░░░░░░░░░ 1.29x + +Agentic coding session Det ███████░░░░░░░░░░░░░░░░░░░░░░░ 1.43x + LLM ███████░░░░░░░░░░░░░░░░░░░░░░░ 1.43x + +★ = LLM wins +``` + +### Provider Summary + +| Provider | Model | Avg Ratio | Avg vsDet | Round-trip | Budget Fits | Avg Time | +| --- | --- | ---: | ---: | --- | --- | ---: | +| ollama | llama3.2 | 2.09x | 0.96 | all PASS | 1/4 | 4.2s | +| openai | gpt-4.1-mini | 2.09x | 0.92 | all PASS | 2/4 | 8.1s | + > **Key findings:** > LLM wins on prose-heavy scenarios: Deep conversation, Technical explanation > Deterministic wins on structured/technical content: Coding assistant, Long Q&A, Tool-heavy, Structured content From 66955974b0fa1d6e02f57ebe8cd0417de30b8571 Mon Sep 17 00:00:00 2001 From: Lisa Date: Thu, 26 Feb 2026 06:33:06 +0100 Subject: [PATCH 13/13] feat(bench): track bundle size per-file with gzip in benchmark suite Measure each dist/*.js file and total after tsc build. Adds BundleSizeResult type, comparison loop for --check regression detection, doc section with table, and gzip badge. 
--- bench/baseline.ts | 73 +++++++++++++++++++++++++---- bench/baselines/current.json | 36 +++++++++++++- bench/baselines/history/v1.0.0.json | 36 +++++++++++++- bench/run.ts | 59 +++++++++++++++++++++++ docs/benchmark-results.md | 19 +++++++- 5 files changed, 210 insertions(+), 13 deletions(-) diff --git a/bench/baseline.ts b/bench/baseline.ts index df7df75..f59b29c 100644 --- a/bench/baseline.ts +++ b/bench/baseline.ts @@ -35,11 +35,17 @@ export interface FuzzyDedupResult { ratio: number; } +export interface BundleSizeResult { + bytes: number; + gzipBytes: number; +} + export interface BenchmarkResults { basic: Record; tokenBudget: Record; dedup: Record; fuzzyDedup: Record; + bundleSize: Record; } export interface Baseline { @@ -303,6 +309,17 @@ export function compareResults( checkNum(regressions, 'fuzzyDedup', name, 'ratio', exp.ratio, act.ratio, tolerance); } + // Bundle size + for (const [name, exp] of Object.entries(baseline.bundleSize ?? {})) { + const act = current.bundleSize?.[name]; + if (!act) { + missing(regressions, 'bundleSize', name); + continue; + } + checkNum(regressions, 'bundleSize', name, 'bytes', exp.bytes, act.bytes, tolerance); + checkNum(regressions, 'bundleSize', name, 'gzipBytes', exp.gzipBytes, act.gzipBytes, tolerance); + } + return regressions; } @@ -383,7 +400,15 @@ function formatTime(ms: number): string { // Visual helpers // --------------------------------------------------------------------------- -function badges(basic: Record): string[] { +function formatBytes(bytes: number): string { + if (bytes < 1024) return `${bytes} B`; + return `${(bytes / 1024).toFixed(1)} KB`; +} + +function badges( + basic: Record, + bundleSize?: Record, +): string[] { const entries = Object.values(basic); const ratios = entries.map((v) => v.ratio); const avgR = (ratios.reduce((a, b) => a + b, 0) / ratios.length).toFixed(2); @@ -393,14 +418,19 @@ function badges(basic: Record): string[] { const badge = (label: string, value: string, color: string) => 
`![${label}](https://img.shields.io/badge/${encodeURIComponent(label).replace(/-/g, '--')}-${encodeURIComponent(value).replace(/-/g, '--')}-${color})`; - return [ - [ - badge('avg ratio', `${avgR}x`, 'blue'), - badge('best', `${bestR}x`, 'blue'), - badge('scenarios', `${entries.length}`, 'blue'), - badge('round-trip', allPass, 'brightgreen'), - ].join(' '), + const badgeList = [ + badge('avg ratio', `${avgR}x`, 'blue'), + badge('best', `${bestR}x`, 'blue'), + badge('scenarios', `${entries.length}`, 'blue'), + badge('round-trip', allPass, 'brightgreen'), ]; + + const totalGzip = bundleSize?.total?.gzipBytes; + if (totalGzip != null) { + badgeList.push(badge('gzip', formatBytes(totalGzip), 'blue')); + } + + return [badgeList.join(' ')]; } // --------------------------------------------------------------------------- @@ -593,6 +623,24 @@ function generateTokenBudgetSection(r: BenchmarkResults): string[] { return lines; } +function generateBundleSizeSection(bundleSize: Record): string[] { + const entries = Object.entries(bundleSize); + if (entries.length === 0) return []; + + const lines: string[] = []; + lines.push('## Bundle Size'); + lines.push(''); + lines.push('> Zero-dependency ESM library — tracked per-file to catch regressions.'); + lines.push(''); + lines.push('| File | Size | Gzip |'); + lines.push('| --- | ---: | ---: |'); + for (const [name, v] of entries) { + const label = name === 'total' ? 
'**total**' : name; + lines.push(`| ${label} | ${formatBytes(v.bytes)} | ${formatBytes(v.gzipBytes)} |`); + } + return lines; +} + function generateLlmSection( baselinesDir: string, basic: Record, @@ -768,7 +816,7 @@ export function generateBenchmarkDocs(baselinesDir: string, outputPath: string): lines.push(''); lines.push(`**v${latest.version}** · Generated: ${latest.generated.split('T')[0]}`); lines.push(''); - lines.push(...badges(latest.results.basic)); + lines.push(...badges(latest.results.basic, latest.results.bundleSize)); lines.push(''); // --- Summary --- @@ -807,6 +855,13 @@ export function generateBenchmarkDocs(baselinesDir: string, outputPath: string): lines.push(...generateTokenBudgetSection(latest.results)); lines.push(''); + // --- Bundle size --- + const bundleSizeSection = generateBundleSizeSection(latest.results.bundleSize ?? {}); + if (bundleSizeSection.length > 0) { + lines.push(...bundleSizeSection); + lines.push(''); + } + // --- LLM (conditional) --- const llmSection = generateLlmSection(baselinesDir, latest.results.basic); if (llmSection.length > 0) { diff --git a/bench/baselines/current.json b/bench/baselines/current.json index b2c8976..77bfa0d 100644 --- a/bench/baselines/current.json +++ b/bench/baselines/current.json @@ -1,6 +1,6 @@ { "version": "1.0.0", - "generated": "2026-02-25T19:10:23.701Z", + "generated": "2026-02-26T05:31:42.406Z", "results": { "basic": { "Coding assistant": { @@ -185,6 +185,40 @@ "fuzzy": 2, "ratio": 2.229973538609574 } + }, + "bundleSize": { + "classify.js": { + "bytes": 7724, + "gzipBytes": 3250 + }, + "compress.js": { + "bytes": 33941, + "gzipBytes": 8721 + }, + "dedup.js": { + "bytes": 10260, + "gzipBytes": 2864 + }, + "expand.js": { + "bytes": 2795, + "gzipBytes": 934 + }, + "index.js": { + "bytes": 225, + "gzipBytes": 159 + }, + "summarizer.js": { + "bytes": 2542, + "gzipBytes": 993 + }, + "types.js": { + "bytes": 11, + "gzipBytes": 31 + }, + "total": { + "bytes": 57498, + "gzipBytes": 16952 + } } } } diff 
--git a/bench/baselines/history/v1.0.0.json b/bench/baselines/history/v1.0.0.json index b2c8976..77bfa0d 100644 --- a/bench/baselines/history/v1.0.0.json +++ b/bench/baselines/history/v1.0.0.json @@ -1,6 +1,6 @@ { "version": "1.0.0", - "generated": "2026-02-25T19:10:23.701Z", + "generated": "2026-02-26T05:31:42.406Z", "results": { "basic": { "Coding assistant": { @@ -185,6 +185,40 @@ "fuzzy": 2, "ratio": 2.229973538609574 } + }, + "bundleSize": { + "classify.js": { + "bytes": 7724, + "gzipBytes": 3250 + }, + "compress.js": { + "bytes": 33941, + "gzipBytes": 8721 + }, + "dedup.js": { + "bytes": 10260, + "gzipBytes": 2864 + }, + "expand.js": { + "bytes": 2795, + "gzipBytes": 934 + }, + "index.js": { + "bytes": 225, + "gzipBytes": 159 + }, + "summarizer.js": { + "bytes": 2542, + "gzipBytes": 993 + }, + "types.js": { + "bytes": 11, + "gzipBytes": 31 + }, + "total": { + "bytes": 57498, + "gzipBytes": 16952 + } } } } diff --git a/bench/run.ts b/bench/run.ts index 2fb5460..f275d74 100644 --- a/bench/run.ts +++ b/bench/run.ts @@ -5,6 +5,8 @@ import type { CompressResult, Message } from '../src/types.js'; import { readFileSync, readdirSync, statSync, existsSync } from 'node:fs'; import { join, resolve } from 'node:path'; import { homedir } from 'node:os'; +import { execSync } from 'node:child_process'; +import { gzipSync } from 'node:zlib'; import { detectProviders } from './llm.js'; import type { LlmBenchmarkResult, @@ -818,6 +820,7 @@ async function run(): Promise { tokenBudget: {}, dedup: {}, fuzzyDedup: {}, + bundleSize: {}, }; for (const scenario of scenarios) { @@ -1137,6 +1140,62 @@ async function run(): Promise { process.exit(1); } + // --------------------------------------------------------------------------- + // Bundle size + // --------------------------------------------------------------------------- + + console.log(); + console.log('Bundle Size'); + + execSync('npm run build', { stdio: 'pipe', cwd: resolve(import.meta.dirname, '..') }); + + const distDir = 
resolve(import.meta.dirname, '..', 'dist'); + const distFiles = readdirSync(distDir, { recursive: true }) + .map(String) + .filter((f) => f.endsWith('.js')) + .sort(); + + let totalBytes = 0; + let totalGzip = 0; + + const bsHeader = [ + 'File'.padEnd(30), + 'Size'.padStart(10), + 'Gzip'.padStart(10), + ].join(' '); + const bsSep = '-'.repeat(bsHeader.length); + + console.log(bsSep); + console.log(bsHeader); + console.log(bsSep); + + for (const file of distFiles) { + const fullPath = join(distDir, file); + const bytes = statSync(fullPath).size; + const gzipBytes = gzipSync(readFileSync(fullPath)).length; + totalBytes += bytes; + totalGzip += gzipBytes; + + benchResults.bundleSize[file] = { bytes, gzipBytes }; + + const fmtBytes = bytes < 1024 ? `${bytes} B` : `${(bytes / 1024).toFixed(1)} KB`; + const fmtGzip = gzipBytes < 1024 ? `${gzipBytes} B` : `${(gzipBytes / 1024).toFixed(1)} KB`; + console.log( + [file.padEnd(30), fmtBytes.padStart(10), fmtGzip.padStart(10)].join(' '), + ); + } + + benchResults.bundleSize['total'] = { bytes: totalBytes, gzipBytes: totalGzip }; + + const fmtTotal = totalBytes < 1024 ? `${totalBytes} B` : `${(totalBytes / 1024).toFixed(1)} KB`; + const fmtTotalGz = + totalGzip < 1024 ? `${totalGzip} B` : `${(totalGzip / 1024).toFixed(1)} KB`; + console.log(bsSep); + console.log( + ['total'.padEnd(30), fmtTotal.padStart(10), fmtTotalGz.padStart(10)].join(' '), + ); + console.log(bsSep); + // --------------------------------------------------------------------------- // --save / --check // --------------------------------------------------------------------------- diff --git a/docs/benchmark-results.md b/docs/benchmark-results.md index 9a1ac7d..8e54c61 100644 --- a/docs/benchmark-results.md +++ b/docs/benchmark-results.md @@ -4,9 +4,9 @@ *Auto-generated by `npm run bench:save`. 
Do not edit manually.* -**v1.0.0** · Generated: 2026-02-25 +**v1.0.0** · Generated: 2026-02-26 -![avg ratio](https://img.shields.io/badge/avg%20ratio-2.08x-blue) ![best](https://img.shields.io/badge/best-6.16x-blue) ![scenarios](https://img.shields.io/badge/scenarios-8-blue) ![round-trip](https://img.shields.io/badge/round--trip-all_PASS-brightgreen) +![avg ratio](https://img.shields.io/badge/avg%20ratio-2.08x-blue) ![best](https://img.shields.io/badge/best-6.16x-blue) ![scenarios](https://img.shields.io/badge/scenarios-8-blue) ![round-trip](https://img.shields.io/badge/round--trip-all_PASS-brightgreen) ![gzip](https://img.shields.io/badge/gzip-16.6%20KB-blue) ## Summary @@ -94,6 +94,21 @@ Target: **2000 tokens** · 1/4 fit | Agentic coding session | no | 2345 | no | 0 | 4 | 33 | 0 | | Agentic coding session | yes | 1957 | yes | 9 | 1 | 32 | 4 | +## Bundle Size + +> Zero-dependency ESM library — tracked per-file to catch regressions. + +| File | Size | Gzip | +| --- | ---: | ---: | +| classify.js | 7.5 KB | 3.2 KB | +| compress.js | 33.1 KB | 8.5 KB | +| dedup.js | 10.0 KB | 2.8 KB | +| expand.js | 2.7 KB | 934 B | +| index.js | 225 B | 159 B | +| summarizer.js | 2.5 KB | 993 B | +| types.js | 11 B | 31 B | +| **total** | 56.2 KB | 16.6 KB | + ## LLM vs Deterministic > Results are **non-deterministic** — LLM outputs vary between runs. Saved as reference data, not used for regression testing.