diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..77c0edf --- /dev/null +++ b/.env.example @@ -0,0 +1,14 @@ +# LLM provider API keys for benchmark comparisons (npm run bench:llm) +# Copy to .env and uncomment the providers you want to test. + +# OpenAI (default model: gpt-4.1-mini) +# OPENAI_API_KEY=sk-... +# OPENAI_MODEL=gpt-4.1-mini + +# Anthropic (default model: claude-haiku-4-5-20251001) +# ANTHROPIC_API_KEY=sk-ant-... +# ANTHROPIC_MODEL=claude-haiku-4-5-20251001 + +# Ollama (auto-detected when running locally — no env vars required) +# OLLAMA_HOST=http://localhost:11434 +# OLLAMA_MODEL=llama3.2 diff --git a/CLAUDE.md b/CLAUDE.md index 1c2a457..1131aa9 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -13,6 +13,7 @@ npm run lint # ESLint check npm run format # Prettier write npm run format:check # Prettier check npm run bench # Run benchmark suite +npm run bench:save # Run, save baseline, regenerate docs/benchmark-results.md ``` Run a single test file: diff --git a/README.md b/README.md index 11a8981..9e00710 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ const { messages: originals } = uncompress(compressed, verbatim); No API keys. No network calls. Runs synchronously by default. Under 2ms for typical conversations. -The classifier is content-aware, not domain-specific. It preserves structured data (code, JSON, SQL, tables, citations, formulas) and compresses surrounding prose — making it useful anywhere dense reference material is mixed with natural language: LLM conversations, legal briefs, medical records, technical documentation, support logs. +The classifier is content-aware, not domain-specific. It preserves structured data (code, JSON, SQL, tables, citations, formulas) and compresses surrounding prose — optimized for LLM conversations and technical documentation. 
## Key findings diff --git a/bench/baseline.ts b/bench/baseline.ts new file mode 100644 index 0000000..f59b29c --- /dev/null +++ b/bench/baseline.ts @@ -0,0 +1,923 @@ +import { readFileSync, writeFileSync, mkdirSync, readdirSync, existsSync } from 'node:fs'; +import { join } from 'node:path'; + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +export interface BasicResult { + ratio: number; + tokenRatio: number; + compressed: number; + preserved: number; +} + +export interface TokenBudgetResult { + tokenCount: number; + fits: boolean; + recencyWindow: number | undefined; + compressed: number; + preserved: number; + deduped: number; +} + +export interface DedupResult { + rw0Base: number; + rw0Dup: number; + rw4Base: number; + rw4Dup: number; + deduped: number; +} + +export interface FuzzyDedupResult { + exact: number; + fuzzy: number; + ratio: number; +} + +export interface BundleSizeResult { + bytes: number; + gzipBytes: number; +} + +export interface BenchmarkResults { + basic: Record; + tokenBudget: Record; + dedup: Record; + fuzzyDedup: Record; + bundleSize: Record; +} + +export interface Baseline { + version: string; + generated: string; + results: BenchmarkResults; +} + +// --------------------------------------------------------------------------- +// LLM benchmark types +// --------------------------------------------------------------------------- + +export interface LlmMethodResult { + ratio: number; + tokenRatio: number; + compressed: number; + preserved: number; + roundTrip: 'PASS' | 'FAIL'; + timeMs: number; + /** ratio / deterministic ratio — values < 1.0 mean LLM expanded instead of compressing */ + vsDet?: number; +} + +export interface LlmScenarioResult { + methods: Record; +} + +export interface LlmTokenBudgetResult { + budget: number; + method: string; + tokenCount: number; + fits: boolean; + ratio: number; + recencyWindow: 
number | undefined; + roundTrip: 'PASS' | 'FAIL'; + timeMs: number; +} + +export interface LlmBenchmarkResult { + provider: string; + model: string; + generated: string; + scenarios: Record; + tokenBudget?: Record; +} + +// --------------------------------------------------------------------------- +// Save / Load +// --------------------------------------------------------------------------- + +export function saveBaseline( + baselinesDir: string, + version: string, + results: BenchmarkResults, +): void { + const baseline: Baseline = { + version, + generated: new Date().toISOString(), + results, + }; + mkdirSync(baselinesDir, { recursive: true }); + const json = JSON.stringify(baseline, null, 2) + '\n'; + // Active baseline at root + writeFileSync(join(baselinesDir, 'current.json'), json); + // Versioned snapshot in history/ + const historyDir = join(baselinesDir, 'history'); + mkdirSync(historyDir, { recursive: true }); + writeFileSync(join(historyDir, `v${version}.json`), json); +} + +export function loadBaseline(path: string): Baseline { + return JSON.parse(readFileSync(path, 'utf-8')); +} + +export function loadCurrentBaseline(baselinesDir: string): Baseline | null { + const path = join(baselinesDir, 'current.json'); + if (!existsSync(path)) return null; + return loadBaseline(path); +} + +// --------------------------------------------------------------------------- +// LLM result persistence +// --------------------------------------------------------------------------- + +export function saveLlmResult(baselinesDir: string, result: LlmBenchmarkResult): void { + const llmDir = join(baselinesDir, 'llm'); + mkdirSync(llmDir, { recursive: true }); + const filename = `${result.provider}-${result.model.replace(/[/:]/g, '-')}.json`; + writeFileSync(join(llmDir, filename), JSON.stringify(result, null, 2) + '\n'); +} + +export function loadAllLlmResults(baselinesDir: string): LlmBenchmarkResult[] { + const llmDir = join(baselinesDir, 'llm'); + if (!existsSync(llmDir)) 
return []; + + const results: LlmBenchmarkResult[] = []; + for (const f of readdirSync(llmDir) + .filter((f) => f.endsWith('.json')) + .sort()) { + try { + results.push(JSON.parse(readFileSync(join(llmDir, f), 'utf-8'))); + } catch { + console.warn(` Warning: skipping malformed LLM result file: ${f}`); + } + } + return results; +} + +// --------------------------------------------------------------------------- +// Compare +// --------------------------------------------------------------------------- + +export interface Regression { + benchmark: string; + scenario: string; + metric: string; + expected: number | boolean; + actual: number | boolean; + delta?: string; +} + +function checkNum( + regressions: Regression[], + bench: string, + scenario: string, + metric: string, + expected: number, + actual: number, + tolerance: number, +): void { + const denom = Math.max(Math.abs(expected), 1); + const pctDiff = Math.abs(actual - expected) / denom; + if (pctDiff > tolerance) { + const sign = actual > expected ? 
'+' : ''; + regressions.push({ + benchmark: bench, + scenario, + metric, + expected, + actual, + delta: `${sign}${(((actual - expected) / denom) * 100).toFixed(1)}%`, + }); + } +} + +function checkBool( + regressions: Regression[], + bench: string, + scenario: string, + metric: string, + expected: boolean, + actual: boolean, +): void { + if (expected !== actual) { + regressions.push({ benchmark: bench, scenario, metric, expected, actual }); + } +} + +function missing(regressions: Regression[], bench: string, scenario: string): void { + regressions.push({ + benchmark: bench, + scenario, + metric: '(missing)', + expected: true, + actual: false, + }); +} + +export function compareResults( + baseline: BenchmarkResults, + current: BenchmarkResults, + tolerance: number = 0, +): Regression[] { + const regressions: Regression[] = []; + + // Basic + for (const [name, exp] of Object.entries(baseline.basic)) { + const act = current.basic[name]; + if (!act) { + missing(regressions, 'basic', name); + continue; + } + checkNum(regressions, 'basic', name, 'ratio', exp.ratio, act.ratio, tolerance); + checkNum(regressions, 'basic', name, 'tokenRatio', exp.tokenRatio, act.tokenRatio, tolerance); + checkNum(regressions, 'basic', name, 'compressed', exp.compressed, act.compressed, tolerance); + checkNum(regressions, 'basic', name, 'preserved', exp.preserved, act.preserved, tolerance); + } + + // Token budget + for (const [name, exp] of Object.entries(baseline.tokenBudget)) { + const act = current.tokenBudget[name]; + if (!act) { + missing(regressions, 'tokenBudget', name); + continue; + } + checkNum( + regressions, + 'tokenBudget', + name, + 'tokenCount', + exp.tokenCount, + act.tokenCount, + tolerance, + ); + checkBool(regressions, 'tokenBudget', name, 'fits', exp.fits, act.fits); + if (exp.recencyWindow != null && act.recencyWindow != null) { + checkNum( + regressions, + 'tokenBudget', + name, + 'recencyWindow', + exp.recencyWindow, + act.recencyWindow, + tolerance, + ); + } + 
checkNum( + regressions, + 'tokenBudget', + name, + 'compressed', + exp.compressed, + act.compressed, + tolerance, + ); + checkNum( + regressions, + 'tokenBudget', + name, + 'preserved', + exp.preserved, + act.preserved, + tolerance, + ); + checkNum(regressions, 'tokenBudget', name, 'deduped', exp.deduped, act.deduped, tolerance); + } + + // Dedup + for (const [name, exp] of Object.entries(baseline.dedup)) { + const act = current.dedup[name]; + if (!act) { + missing(regressions, 'dedup', name); + continue; + } + checkNum(regressions, 'dedup', name, 'rw0Base', exp.rw0Base, act.rw0Base, tolerance); + checkNum(regressions, 'dedup', name, 'rw0Dup', exp.rw0Dup, act.rw0Dup, tolerance); + checkNum(regressions, 'dedup', name, 'rw4Base', exp.rw4Base, act.rw4Base, tolerance); + checkNum(regressions, 'dedup', name, 'rw4Dup', exp.rw4Dup, act.rw4Dup, tolerance); + checkNum(regressions, 'dedup', name, 'deduped', exp.deduped, act.deduped, tolerance); + } + + // Fuzzy dedup + for (const [name, exp] of Object.entries(baseline.fuzzyDedup)) { + const act = current.fuzzyDedup[name]; + if (!act) { + missing(regressions, 'fuzzyDedup', name); + continue; + } + checkNum(regressions, 'fuzzyDedup', name, 'exact', exp.exact, act.exact, tolerance); + checkNum(regressions, 'fuzzyDedup', name, 'fuzzy', exp.fuzzy, act.fuzzy, tolerance); + checkNum(regressions, 'fuzzyDedup', name, 'ratio', exp.ratio, act.ratio, tolerance); + } + + // Bundle size + for (const [name, exp] of Object.entries(baseline.bundleSize ?? 
{})) { + const act = current.bundleSize?.[name]; + if (!act) { + missing(regressions, 'bundleSize', name); + continue; + } + checkNum(regressions, 'bundleSize', name, 'bytes', exp.bytes, act.bytes, tolerance); + checkNum(regressions, 'bundleSize', name, 'gzipBytes', exp.gzipBytes, act.gzipBytes, tolerance); + } + + return regressions; +} + +// --------------------------------------------------------------------------- +// Report +// --------------------------------------------------------------------------- + +export function formatRegressions(regressions: Regression[]): string { + if (regressions.length === 0) return 'No regressions detected.'; + + const lines: string[] = [`${regressions.length} regression(s) detected:`, '']; + + for (const r of regressions) { + const delta = r.delta ? ` (${r.delta})` : ''; + lines.push( + ` [${r.benchmark}] ${r.scenario} → ${r.metric}: expected ${r.expected}, got ${r.actual}${delta}`, + ); + } + + return lines.join('\n'); +} + +// --------------------------------------------------------------------------- +// Doc generation +// --------------------------------------------------------------------------- + +function semverSort(a: string, b: string): number { + const pa = a + .replace(/^v|\.json$/g, '') + .split('.') + .map(Number); + const pb = b + .replace(/^v|\.json$/g, '') + .split('.') + .map(Number); + for (let i = 0; i < 3; i++) { + if ((pa[i] ?? 0) !== (pb[i] ?? 0)) return (pa[i] ?? 0) - (pb[i] ?? 0); + } + return 0; +} + +function loadAllBaselines(baselinesDir: string): Baseline[] { + const historyDir = join(baselinesDir, 'history'); + if (!existsSync(historyDir)) return []; + + const files = readdirSync(historyDir) + .filter((f) => f.startsWith('v') && f.endsWith('.json')) + .sort(semverSort); + + return files.map((f) => loadBaseline(join(historyDir, f))); +} + +function fix(n: number, d: number = 2): string { + return n.toFixed(d); +} + +/** Shorten scenario names for chart x-axis labels. 
*/ +const SHORT_NAMES: Record = { + 'Coding assistant': 'Coding', + 'Long Q&A': 'Long Q&A', + 'Tool-heavy': 'Tool-heavy', + 'Short conversation': 'Short', + 'Deep conversation': 'Deep', + 'Technical explanation': 'Technical', + 'Structured content': 'Structured', + 'Agentic coding session': 'Agentic', +}; + +function shortName(name: string): string { + return SHORT_NAMES[name] ?? name; +} + +function formatTime(ms: number): string { + return ms < 1000 ? `${Math.round(ms)}ms` : `${(ms / 1000).toFixed(1)}s`; +} + +// --------------------------------------------------------------------------- +// Visual helpers +// --------------------------------------------------------------------------- + +function formatBytes(bytes: number): string { + if (bytes < 1024) return `${bytes} B`; + return `${(bytes / 1024).toFixed(1)} KB`; +} + +function badges( + basic: Record, + bundleSize?: Record, +): string[] { + const entries = Object.values(basic); + const ratios = entries.map((v) => v.ratio); + const avgR = (ratios.reduce((a, b) => a + b, 0) / ratios.length).toFixed(2); + const bestR = Math.max(...ratios).toFixed(2); + const allPass = 'all_PASS'; + + const badge = (label: string, value: string, color: string) => + `![${label}](https://img.shields.io/badge/${encodeURIComponent(label).replace(/-/g, '--')}-${encodeURIComponent(value).replace(/-/g, '--')}-${color})`; + + const badgeList = [ + badge('avg ratio', `${avgR}x`, 'blue'), + badge('best', `${bestR}x`, 'blue'), + badge('scenarios', `${entries.length}`, 'blue'), + badge('round-trip', allPass, 'brightgreen'), + ]; + + const totalGzip = bundleSize?.total?.gzipBytes; + if (totalGzip != null) { + badgeList.push(badge('gzip', formatBytes(totalGzip), 'blue')); + } + + return [badgeList.join(' ')]; +} + +// --------------------------------------------------------------------------- +// Mermaid chart helpers +// --------------------------------------------------------------------------- + +function compressionChart(basic: Record): 
string[] { + const entries = Object.entries(basic); + const labels = entries.map(([n]) => `"${shortName(n)}"`).join(', '); + const values = entries.map(([, v]) => fix(v.ratio)).join(', '); + + return [ + '```mermaid', + 'xychart-beta', + ' title "Compression Ratio by Scenario"', + ` x-axis [${labels}]`, + ' y-axis "Char Ratio"', + ` bar [${values}]`, + '```', + ]; +} + +function dedupChart(dedup: Record): string[] { + // Only include scenarios where dedup actually changes the ratio + const entries = Object.entries(dedup).filter(([, v]) => v.rw0Base !== v.rw0Dup || v.deduped > 0); + if (entries.length === 0) return []; + + const labels = entries.map(([n]) => `"${shortName(n)}"`).join(', '); + const base = entries.map(([, v]) => fix(v.rw0Base)).join(', '); + const exact = entries.map(([, v]) => fix(v.rw0Dup)).join(', '); + + return [ + '```mermaid', + 'xychart-beta', + ' title "Deduplication Impact (recencyWindow=0)"', + ` x-axis [${labels}]`, + ' y-axis "Char Ratio"', + ` bar [${base}]`, + ` bar [${exact}]`, + '```', + '', + '*First bar: no dedup · Second bar: with dedup*', + ]; +} + +function asciiBar(value: number, max: number, width: number): string { + const filled = Math.round((value / max) * width); + return '\u2588'.repeat(filled) + '\u2591'.repeat(width - filled); +} + +function llmComparisonCharts( + basic: Record, + llmResults: LlmBenchmarkResult[], +): string[] { + const lines: string[] = []; + const barWidth = 30; + + for (const llm of llmResults) { + const sharedScenarios = Object.keys(basic).filter((s) => s in llm.scenarios); + if (sharedScenarios.length === 0) continue; + + // Collect data and find max for scaling + const rows: { name: string; detR: number; llmR: number }[] = []; + for (const s of sharedScenarios) { + const detR = basic[s].ratio; + const methods = Object.values(llm.scenarios[s].methods).filter((m) => m.vsDet != null); + const llmR = methods.length > 0 ? 
Math.max(...methods.map((m) => m.ratio)) : detR; + rows.push({ name: s, detR, llmR }); + } + const maxR = Math.max(...rows.flatMap((r) => [r.detR, r.llmR])); + const nameWidth = Math.max(...rows.map((r) => r.name.length)); + + lines.push('```'); + lines.push(`Deterministic vs ${llm.provider}/${llm.model}`); + lines.push(''); + for (const r of rows) { + const label = r.name.padEnd(nameWidth); + const detBar = asciiBar(r.detR, maxR, barWidth); + const llmBar = asciiBar(r.llmR, maxR, barWidth); + const winner = r.llmR > r.detR + 0.01 ? ' \u2605' : ''; + lines.push(`${label} Det ${detBar} ${fix(r.detR)}x`); + lines.push(`${' '.repeat(nameWidth)} LLM ${llmBar} ${fix(r.llmR)}x${winner}`); + lines.push(''); + } + lines.push('\u2605 = LLM wins'); + lines.push('```'); + lines.push(''); + } + + return lines; +} + +// --------------------------------------------------------------------------- +// Section generators +// --------------------------------------------------------------------------- + +function generateCompressionSection(b: Baseline): string[] { + const lines: string[] = []; + const r = b.results; + const basicEntries = Object.entries(r.basic); + const ratios = basicEntries.map(([, v]) => v.ratio); + const minR = Math.min(...ratios); + const maxR = Math.max(...ratios); + const avgR = ratios.reduce((a, b) => a + b, 0) / ratios.length; + + lines.push('## Compression by Scenario'); + lines.push(''); + lines.push( + `> **${basicEntries.length} scenarios** · **${fix(avgR)}x** avg ratio · ` + + `**${fix(minR)}x** – **${fix(maxR)}x** range · all round-trips PASS`, + ); + lines.push(''); + lines.push(...compressionChart(r.basic)); + lines.push(''); + lines.push( + '| Scenario | Ratio | Reduction | Token Ratio | Messages | Compressed | Preserved |', + ); + lines.push('| --- | ---: | ---: | ---: | ---: | ---: | ---: |'); + for (const [name, v] of basicEntries) { + const reduction = Math.round((1 - 1 / v.ratio) * 100); + const messages = v.compressed + v.preserved; + 
lines.push( + `| ${name} | ${fix(v.ratio)} | ${reduction}% | ${fix(v.tokenRatio)} | ${messages} | ${v.compressed} | ${v.preserved} |`, + ); + } + return lines; +} + +function generateDedupSection(r: BenchmarkResults): string[] { + const lines: string[] = []; + lines.push('## Deduplication Impact'); + lines.push(''); + + const chart = dedupChart(r.dedup); + if (chart.length > 0) { + lines.push(...chart); + lines.push(''); + } + + lines.push( + '| Scenario | No Dedup (rw=0) | Dedup (rw=0) | No Dedup (rw=4) | Dedup (rw=4) | Deduped |', + ); + lines.push('| --- | ---: | ---: | ---: | ---: | ---: |'); + for (const [name, v] of Object.entries(r.dedup)) { + lines.push( + `| ${name} | ${fix(v.rw0Base)} | ${fix(v.rw0Dup)} | ${fix(v.rw4Base)} | ${fix(v.rw4Dup)} | ${v.deduped} |`, + ); + } + lines.push(''); + + // Fuzzy dedup detail + const hasFuzzy = Object.values(r.fuzzyDedup).some((v) => v.fuzzy > 0); + if (hasFuzzy) { + lines.push('### Fuzzy Dedup'); + lines.push(''); + } + lines.push('| Scenario | Exact Deduped | Fuzzy Deduped | Ratio | vs Base |'); + lines.push('| --- | ---: | ---: | ---: | ---: |'); + for (const [name, v] of Object.entries(r.fuzzyDedup)) { + const baseRatio = r.basic[name]?.ratio ?? v.ratio; + const improvement = + v.ratio > baseRatio + 0.01 ? `+${Math.round(((v.ratio - baseRatio) / baseRatio) * 100)}%` : '-'; + lines.push(`| ${name} | ${v.exact} | ${v.fuzzy} | ${fix(v.ratio)} | ${improvement} |`); + } + return lines; +} + +function generateTokenBudgetSection(r: BenchmarkResults): string[] { + const lines: string[] = []; + const entries = Object.entries(r.tokenBudget); + const allFit = entries.every(([, v]) => v.fits); + const fitCount = entries.filter(([, v]) => v.fits).length; + + lines.push('## Token Budget'); + lines.push(''); + lines.push(`Target: **2000 tokens** · ${allFit ? 
'all fit' : `${fitCount}/${entries.length} fit`}`); + lines.push(''); + lines.push( + '| Scenario | Dedup | Tokens | Fits | recencyWindow | Compressed | Preserved | Deduped |', + ); + lines.push('| --- | --- | ---: | --- | ---: | ---: | ---: | ---: |'); + for (const [key, v] of entries) { + const [name, dedupStr] = key.split('|'); + const dedup = dedupStr === 'dedup=true' ? 'yes' : 'no'; + const fitIcon = v.fits ? 'yes' : 'no'; + lines.push( + `| ${name} | ${dedup} | ${v.tokenCount} | ${fitIcon} | ${v.recencyWindow ?? '-'} | ${v.compressed} | ${v.preserved} | ${v.deduped} |`, + ); + } + return lines; +} + +function generateBundleSizeSection(bundleSize: Record): string[] { + const entries = Object.entries(bundleSize); + if (entries.length === 0) return []; + + const lines: string[] = []; + lines.push('## Bundle Size'); + lines.push(''); + lines.push('> Zero-dependency ESM library — tracked per-file to catch regressions.'); + lines.push(''); + lines.push('| File | Size | Gzip |'); + lines.push('| --- | ---: | ---: |'); + for (const [name, v] of entries) { + const label = name === 'total' ? '**total**' : name; + lines.push(`| ${label} | ${formatBytes(v.bytes)} | ${formatBytes(v.gzipBytes)} |`); + } + return lines; +} + +function generateLlmSection( + baselinesDir: string, + basic: Record, +): string[] { + const llmResults = loadAllLlmResults(baselinesDir); + if (llmResults.length === 0) return []; + + const lines: string[] = []; + lines.push('## LLM vs Deterministic'); + lines.push(''); + lines.push( + '> Results are **non-deterministic** — LLM outputs vary between runs. 
' + + 'Saved as reference data, not used for regression testing.', + ); + lines.push(''); + + // Per-provider comparison charts (ASCII horizontal bars in code blocks) + const charts = llmComparisonCharts(basic, llmResults); + if (charts.length > 0) { + lines.push(...charts); + } + + // Cross-provider summary table + if (llmResults.length > 0) { + lines.push('### Provider Summary'); + lines.push(''); + lines.push( + '| Provider | Model | Avg Ratio | Avg vsDet | Round-trip | Budget Fits | Avg Time |', + ); + lines.push('| --- | --- | ---: | ---: | --- | --- | ---: |'); + for (const llm of llmResults) { + const ratioValues: number[] = []; + const vsDetValues: number[] = []; + const timeValues: number[] = []; + let passCount = 0; + let totalCount = 0; + for (const sr of Object.values(llm.scenarios)) { + for (const mr of Object.values(sr.methods)) { + ratioValues.push(mr.ratio); + if (mr.vsDet != null) vsDetValues.push(mr.vsDet); + timeValues.push(mr.timeMs); + totalCount++; + if (mr.roundTrip === 'PASS') passCount++; + } + } + const avgRatio = ratioValues.length > 0 + ? ratioValues.reduce((a, b) => a + b, 0) / ratioValues.length + : 0; + const avgVsDet = vsDetValues.length > 0 + ? vsDetValues.reduce((a, b) => a + b, 0) / vsDetValues.length + : 0; + const avgTime = timeValues.length > 0 + ? timeValues.reduce((a, b) => a + b, 0) / timeValues.length + : 0; + const rt = passCount === totalCount ? 
'all PASS' : `${passCount}/${totalCount}`; + + // Token budget summary + let budgetFits = '-'; + if (llm.tokenBudget) { + const allEntries = Object.values(llm.tokenBudget).flat(); + if (allEntries.length > 0) { + const fitCount = allEntries.filter((e) => e.fits).length; + budgetFits = `${fitCount}/${allEntries.length}`; + } + } + + lines.push( + `| ${llm.provider} | ${llm.model} | ${fix(avgRatio)}x | ${fix(avgVsDet)} | ${rt} | ${budgetFits} | ${formatTime(avgTime)} |`, + ); + } + lines.push(''); + } + + // Key finding callout + const wins: string[] = []; + const losses: string[] = []; + for (const llm of llmResults) { + for (const [scenario, sr] of Object.entries(llm.scenarios)) { + for (const mr of Object.values(sr.methods)) { + if (mr.vsDet != null && mr.vsDet > 1.0) wins.push(scenario); + if (mr.vsDet != null && mr.vsDet < 0.9) losses.push(scenario); + } + } + } + const uniqueWins = [...new Set(wins)]; + const uniqueLosses = [...new Set(losses)]; + if (uniqueWins.length > 0 || uniqueLosses.length > 0) { + lines.push('> **Key findings:**'); + if (uniqueWins.length > 0) { + lines.push(`> LLM wins on prose-heavy scenarios: ${uniqueWins.join(', ')}`); + } + if (uniqueLosses.length > 0) { + lines.push( + `> Deterministic wins on structured/technical content: ${uniqueLosses.join(', ')}`, + ); + } + lines.push(''); + } + + // Per-provider detail tables (collapsible) + for (const llm of llmResults) { + lines.push(`### ${llm.provider} (${llm.model})`); + lines.push(''); + lines.push(`*Generated: ${llm.generated.split('T')[0]}*`); + lines.push(''); + lines.push('
'); + lines.push(`Scenario details`); + lines.push(''); + lines.push( + '| Scenario | Method | Char Ratio | Token Ratio | vsDet | Compressed | Preserved | Round-trip | Time |', + ); + lines.push('| --- | --- | ---: | ---: | ---: | ---: | ---: | --- | ---: |'); + + for (const [scenario, sr] of Object.entries(llm.scenarios)) { + let first = true; + for (const [method, mr] of Object.entries(sr.methods)) { + const label = first ? scenario : ''; + const vsDet = mr.vsDet != null ? fix(mr.vsDet) : '-'; + lines.push( + `| ${label} | ${method} | ${fix(mr.ratio)} | ${fix(mr.tokenRatio)} | ${vsDet} | ${mr.compressed} | ${mr.preserved} | ${mr.roundTrip} | ${formatTime(mr.timeMs)} |`, + ); + first = false; + } + } + + // Token budget table (if present) + if (llm.tokenBudget && Object.keys(llm.tokenBudget).length > 0) { + lines.push(''); + lines.push('#### Token Budget (target: 2000 tokens)'); + lines.push(''); + lines.push( + '| Scenario | Method | Tokens | Fits | recencyWindow | Ratio | Round-trip | Time |', + ); + lines.push('| --- | --- | ---: | --- | ---: | ---: | --- | ---: |'); + + for (const [scenario, entries] of Object.entries(llm.tokenBudget)) { + let first = true; + for (const entry of entries) { + const label = first ? scenario : ''; + lines.push( + `| ${label} | ${entry.method} | ${entry.tokenCount} | ${entry.fits} | ${entry.recencyWindow ?? '-'} | ${fix(entry.ratio)} | ${entry.roundTrip} | ${formatTime(entry.timeMs)} |`, + ); + first = false; + } + } + } + + lines.push(''); + lines.push('
'); + lines.push(''); + } + + return lines; +} + +// --------------------------------------------------------------------------- +// Main doc generator +// --------------------------------------------------------------------------- + +export function generateBenchmarkDocs(baselinesDir: string, outputPath: string): void { + const baselines = loadAllBaselines(baselinesDir); + if (baselines.length === 0) return; + + const latest = baselines[baselines.length - 1]; + const lines: string[] = []; + + // --- Header --- + lines.push('# Benchmark Results'); + lines.push(''); + lines.push('[Back to README](../README.md) | [All docs](README.md) | [Handbook](benchmarks.md)'); + lines.push(''); + lines.push('*Auto-generated by `npm run bench:save`. Do not edit manually.*'); + lines.push(''); + lines.push(`**v${latest.version}** · Generated: ${latest.generated.split('T')[0]}`); + lines.push(''); + lines.push(...badges(latest.results.basic, latest.results.bundleSize)); + lines.push(''); + + // --- Summary --- + const basicEntries = Object.entries(latest.results.basic); + const ratios = basicEntries.map(([, v]) => v.ratio); + const avgR = ratios.reduce((a, b) => a + b, 0) / ratios.length; + lines.push('## Summary'); + lines.push(''); + lines.push(`| Metric | Value |`); + lines.push(`| --- | --- |`); + lines.push(`| Scenarios | ${basicEntries.length} |`); + lines.push(`| Average compression | ${fix(avgR)}x |`); + lines.push(`| Best compression | ${fix(Math.max(...ratios))}x |`); + lines.push(`| Round-trip integrity | all PASS |`); + lines.push(''); + + // --- Pie chart: message outcome distribution --- + const totalPreserved = basicEntries.reduce((s, [, v]) => s + v.preserved, 0); + const totalCompressed = basicEntries.reduce((s, [, v]) => s + v.compressed, 0); + lines.push('```mermaid'); + lines.push('pie title "Message Outcomes"'); + lines.push(` "Preserved" : ${totalPreserved}`); + lines.push(` "Compressed" : ${totalCompressed}`); + lines.push('```'); + lines.push(''); + + // --- 
Compression --- + lines.push(...generateCompressionSection(latest)); + lines.push(''); + + // --- Dedup --- + lines.push(...generateDedupSection(latest.results)); + lines.push(''); + + // --- Token budget --- + lines.push(...generateTokenBudgetSection(latest.results)); + lines.push(''); + + // --- Bundle size --- + const bundleSizeSection = generateBundleSizeSection(latest.results.bundleSize ?? {}); + if (bundleSizeSection.length > 0) { + lines.push(...bundleSizeSection); + lines.push(''); + } + + // --- LLM (conditional) --- + const llmSection = generateLlmSection(baselinesDir, latest.results.basic); + if (llmSection.length > 0) { + lines.push(...llmSection); + } + + // --- Version history (conditional) --- + if (baselines.length > 1) { + lines.push('## Version History'); + lines.push(''); + lines.push('| Version | Date | Avg Char Ratio | Avg Token Ratio | Scenarios |'); + lines.push('| --- | --- | ---: | ---: | ---: |'); + for (const b of [...baselines].reverse()) { + const entries = Object.values(b.results.basic); + const avgChr = entries.reduce((s, v) => s + v.ratio, 0) / entries.length; + const avgTkr = entries.reduce((s, v) => s + v.tokenRatio, 0) / entries.length; + const date = b.generated.split('T')[0]; + lines.push( + `| ${b.version} | ${date} | ${fix(avgChr)} | ${fix(avgTkr)} | ${entries.length} |`, + ); + } + lines.push(''); + + // Per-version detail (older versions) + const olderVersions = baselines.slice(0, -1).reverse(); + for (const b of olderVersions) { + const r = b.results; + const oldEntries = Object.entries(r.basic); + const oldRatios = oldEntries.map(([, v]) => v.ratio); + const oldAvg = oldRatios.reduce((a, b) => a + b, 0) / oldRatios.length; + + lines.push(`
`); + lines.push(`v${b.version} (${b.generated.split('T')[0]}) — ${fix(oldAvg)}x avg`); + lines.push(''); + lines.push('| Scenario | Char Ratio | Token Ratio | Compressed | Preserved |'); + lines.push('| --- | ---: | ---: | ---: | ---: |'); + for (const [name, v] of oldEntries) { + lines.push( + `| ${name} | ${fix(v.ratio)} | ${fix(v.tokenRatio)} | ${v.compressed} | ${v.preserved} |`, + ); + } + lines.push(''); + lines.push('
'); + lines.push(''); + } + } + + // --- Methodology --- + lines.push('## Methodology'); + lines.push(''); + lines.push('- All deterministic results use the same input → same output guarantee'); + lines.push('- Metrics: compression ratio, token ratio, message counts, dedup counts'); + lines.push('- Timing is excluded from baselines (hardware-dependent)'); + lines.push('- LLM benchmarks are saved as reference data, not used for regression testing'); + lines.push('- Round-trip integrity is verified for every scenario (compress then uncompress)'); + lines.push(''); + + writeFileSync(outputPath, lines.join('\n')); +} diff --git a/bench/baselines/current.json b/bench/baselines/current.json new file mode 100644 index 0000000..77bfa0d --- /dev/null +++ b/bench/baselines/current.json @@ -0,0 +1,224 @@ +{ + "version": "1.0.0", + "generated": "2026-02-26T05:31:42.406Z", + "results": { + "basic": { + "Coding assistant": { + "ratio": 1.6812907904278462, + "tokenRatio": 1.6729559748427674, + "compressed": 5, + "preserved": 8 + }, + "Long Q&A": { + "ratio": 6.158536585365853, + "tokenRatio": 6.114164904862579, + "compressed": 4, + "preserved": 6 + }, + "Tool-heavy": { + "ratio": 1.2991563919532771, + "tokenRatio": 1.2946428571428572, + "compressed": 2, + "preserved": 16 + }, + "Short conversation": { + "ratio": 1, + "tokenRatio": 1, + "compressed": 0, + "preserved": 7 + }, + "Deep conversation": { + "ratio": 2.124913733609386, + "tokenRatio": 2.1241305510968433, + "compressed": 50, + "preserved": 1 + }, + "Technical explanation": { + "ratio": 1, + "tokenRatio": 1, + "compressed": 0, + "preserved": 11 + }, + "Structured content": { + "ratio": 1.9338990620812864, + "tokenRatio": 1.9241486068111455, + "compressed": 2, + "preserved": 10 + }, + "Agentic coding session": { + "ratio": 1.428351309707242, + "tokenRatio": 1.4258962011771001, + "compressed": 2, + "preserved": 31 + } + }, + "tokenBudget": { + "Deep conversation|dedup=false": { + "tokenCount": 3738, + "fits": false, + 
"recencyWindow": 0, + "compressed": 50, + "preserved": 1, + "deduped": 0 + }, + "Deep conversation|dedup=true": { + "tokenCount": 3738, + "fits": false, + "recencyWindow": 0, + "compressed": 50, + "preserved": 1, + "deduped": 0 + }, + "Agentic coding session|dedup=false": { + "tokenCount": 2345, + "fits": false, + "recencyWindow": 0, + "compressed": 4, + "preserved": 33, + "deduped": 0 + }, + "Agentic coding session|dedup=true": { + "tokenCount": 1957, + "fits": true, + "recencyWindow": 9, + "compressed": 1, + "preserved": 32, + "deduped": 4 + } + }, + "dedup": { + "Coding assistant": { + "rw0Base": 1.6812907904278462, + "rw0Dup": 1.6812907904278462, + "rw4Base": 1.5104234527687297, + "rw4Dup": 1.5104234527687297, + "deduped": 0 + }, + "Long Q&A": { + "rw0Base": 5.139949109414759, + "rw0Dup": 6.158536585365853, + "rw4Base": 1.9024298361273309, + "rw4Dup": 2.0264847512038524, + "deduped": 1 + }, + "Tool-heavy": { + "rw0Base": 1.2991563919532771, + "rw0Dup": 1.2991563919532771, + "rw4Base": 1.2991563919532771, + "rw4Dup": 1.2991563919532771, + "deduped": 0 + }, + "Short conversation": { + "rw0Base": 1, + "rw0Dup": 1, + "rw4Base": 1, + "rw4Dup": 1, + "deduped": 0 + }, + "Deep conversation": { + "rw0Base": 2.124913733609386, + "rw0Dup": 2.124913733609386, + "rw4Base": 1.9527165104643789, + "rw4Dup": 1.9527165104643789, + "deduped": 0 + }, + "Technical explanation": { + "rw0Base": 1, + "rw0Dup": 1, + "rw4Base": 1, + "rw4Dup": 1, + "deduped": 0 + }, + "Structured content": { + "rw0Base": 1.9338990620812864, + "rw0Dup": 1.9338990620812864, + "rw4Base": 1.373730964467005, + "rw4Dup": 1.373730964467005, + "deduped": 0 + }, + "Agentic coding session": { + "rw0Base": 1.1374233128834357, + "rw0Dup": 1.428351309707242, + "rw4Base": 1.1374233128834357, + "rw4Dup": 1.428351309707242, + "deduped": 4 + } + }, + "fuzzyDedup": { + "Coding assistant": { + "exact": 0, + "fuzzy": 0, + "ratio": 1.6812907904278462 + }, + "Long Q&A": { + "exact": 1, + "fuzzy": 0, + "ratio": 
6.158536585365853 + }, + "Tool-heavy": { + "exact": 0, + "fuzzy": 0, + "ratio": 1.2991563919532771 + }, + "Short conversation": { + "exact": 0, + "fuzzy": 0, + "ratio": 1 + }, + "Deep conversation": { + "exact": 0, + "fuzzy": 0, + "ratio": 2.124913733609386 + }, + "Technical explanation": { + "exact": 0, + "fuzzy": 0, + "ratio": 1 + }, + "Structured content": { + "exact": 0, + "fuzzy": 0, + "ratio": 1.9338990620812864 + }, + "Agentic coding session": { + "exact": 4, + "fuzzy": 2, + "ratio": 2.229973538609574 + } + }, + "bundleSize": { + "classify.js": { + "bytes": 7724, + "gzipBytes": 3250 + }, + "compress.js": { + "bytes": 33941, + "gzipBytes": 8721 + }, + "dedup.js": { + "bytes": 10260, + "gzipBytes": 2864 + }, + "expand.js": { + "bytes": 2795, + "gzipBytes": 934 + }, + "index.js": { + "bytes": 225, + "gzipBytes": 159 + }, + "summarizer.js": { + "bytes": 2542, + "gzipBytes": 993 + }, + "types.js": { + "bytes": 11, + "gzipBytes": 31 + }, + "total": { + "bytes": 57498, + "gzipBytes": 16952 + } + } + } +} diff --git a/bench/baselines/history/v1.0.0.json b/bench/baselines/history/v1.0.0.json new file mode 100644 index 0000000..77bfa0d --- /dev/null +++ b/bench/baselines/history/v1.0.0.json @@ -0,0 +1,224 @@ +{ + "version": "1.0.0", + "generated": "2026-02-26T05:31:42.406Z", + "results": { + "basic": { + "Coding assistant": { + "ratio": 1.6812907904278462, + "tokenRatio": 1.6729559748427674, + "compressed": 5, + "preserved": 8 + }, + "Long Q&A": { + "ratio": 6.158536585365853, + "tokenRatio": 6.114164904862579, + "compressed": 4, + "preserved": 6 + }, + "Tool-heavy": { + "ratio": 1.2991563919532771, + "tokenRatio": 1.2946428571428572, + "compressed": 2, + "preserved": 16 + }, + "Short conversation": { + "ratio": 1, + "tokenRatio": 1, + "compressed": 0, + "preserved": 7 + }, + "Deep conversation": { + "ratio": 2.124913733609386, + "tokenRatio": 2.1241305510968433, + "compressed": 50, + "preserved": 1 + }, + "Technical explanation": { + "ratio": 1, + "tokenRatio": 1, + 
"compressed": 0, + "preserved": 11 + }, + "Structured content": { + "ratio": 1.9338990620812864, + "tokenRatio": 1.9241486068111455, + "compressed": 2, + "preserved": 10 + }, + "Agentic coding session": { + "ratio": 1.428351309707242, + "tokenRatio": 1.4258962011771001, + "compressed": 2, + "preserved": 31 + } + }, + "tokenBudget": { + "Deep conversation|dedup=false": { + "tokenCount": 3738, + "fits": false, + "recencyWindow": 0, + "compressed": 50, + "preserved": 1, + "deduped": 0 + }, + "Deep conversation|dedup=true": { + "tokenCount": 3738, + "fits": false, + "recencyWindow": 0, + "compressed": 50, + "preserved": 1, + "deduped": 0 + }, + "Agentic coding session|dedup=false": { + "tokenCount": 2345, + "fits": false, + "recencyWindow": 0, + "compressed": 4, + "preserved": 33, + "deduped": 0 + }, + "Agentic coding session|dedup=true": { + "tokenCount": 1957, + "fits": true, + "recencyWindow": 9, + "compressed": 1, + "preserved": 32, + "deduped": 4 + } + }, + "dedup": { + "Coding assistant": { + "rw0Base": 1.6812907904278462, + "rw0Dup": 1.6812907904278462, + "rw4Base": 1.5104234527687297, + "rw4Dup": 1.5104234527687297, + "deduped": 0 + }, + "Long Q&A": { + "rw0Base": 5.139949109414759, + "rw0Dup": 6.158536585365853, + "rw4Base": 1.9024298361273309, + "rw4Dup": 2.0264847512038524, + "deduped": 1 + }, + "Tool-heavy": { + "rw0Base": 1.2991563919532771, + "rw0Dup": 1.2991563919532771, + "rw4Base": 1.2991563919532771, + "rw4Dup": 1.2991563919532771, + "deduped": 0 + }, + "Short conversation": { + "rw0Base": 1, + "rw0Dup": 1, + "rw4Base": 1, + "rw4Dup": 1, + "deduped": 0 + }, + "Deep conversation": { + "rw0Base": 2.124913733609386, + "rw0Dup": 2.124913733609386, + "rw4Base": 1.9527165104643789, + "rw4Dup": 1.9527165104643789, + "deduped": 0 + }, + "Technical explanation": { + "rw0Base": 1, + "rw0Dup": 1, + "rw4Base": 1, + "rw4Dup": 1, + "deduped": 0 + }, + "Structured content": { + "rw0Base": 1.9338990620812864, + "rw0Dup": 1.9338990620812864, + "rw4Base": 
1.373730964467005, + "rw4Dup": 1.373730964467005, + "deduped": 0 + }, + "Agentic coding session": { + "rw0Base": 1.1374233128834357, + "rw0Dup": 1.428351309707242, + "rw4Base": 1.1374233128834357, + "rw4Dup": 1.428351309707242, + "deduped": 4 + } + }, + "fuzzyDedup": { + "Coding assistant": { + "exact": 0, + "fuzzy": 0, + "ratio": 1.6812907904278462 + }, + "Long Q&A": { + "exact": 1, + "fuzzy": 0, + "ratio": 6.158536585365853 + }, + "Tool-heavy": { + "exact": 0, + "fuzzy": 0, + "ratio": 1.2991563919532771 + }, + "Short conversation": { + "exact": 0, + "fuzzy": 0, + "ratio": 1 + }, + "Deep conversation": { + "exact": 0, + "fuzzy": 0, + "ratio": 2.124913733609386 + }, + "Technical explanation": { + "exact": 0, + "fuzzy": 0, + "ratio": 1 + }, + "Structured content": { + "exact": 0, + "fuzzy": 0, + "ratio": 1.9338990620812864 + }, + "Agentic coding session": { + "exact": 4, + "fuzzy": 2, + "ratio": 2.229973538609574 + } + }, + "bundleSize": { + "classify.js": { + "bytes": 7724, + "gzipBytes": 3250 + }, + "compress.js": { + "bytes": 33941, + "gzipBytes": 8721 + }, + "dedup.js": { + "bytes": 10260, + "gzipBytes": 2864 + }, + "expand.js": { + "bytes": 2795, + "gzipBytes": 934 + }, + "index.js": { + "bytes": 225, + "gzipBytes": 159 + }, + "summarizer.js": { + "bytes": 2542, + "gzipBytes": 993 + }, + "types.js": { + "bytes": 11, + "gzipBytes": 31 + }, + "total": { + "bytes": 57498, + "gzipBytes": 16952 + } + } + } +} diff --git a/bench/baselines/llm/ollama-llama3.2.json b/bench/baselines/llm/ollama-llama3.2.json new file mode 100644 index 0000000..a0f393b --- /dev/null +++ b/bench/baselines/llm/ollama-llama3.2.json @@ -0,0 +1,263 @@ +{ + "provider": "ollama", + "model": "llama3.2", + "generated": "2026-02-25T12:21:05.747Z", + "scenarios": { + "Coding assistant": { + "methods": { + "deterministic": { + "ratio": 1.6812907904278462, + "tokenRatio": 1.6729559748427674, + "compressed": 5, + "preserved": 8, + "roundTrip": "PASS", + "timeMs": 0.25966599996900186 + }, + 
"llm-basic": { + "ratio": 1.4847902657700929, + "tokenRatio": 1.4810690423162582, + "compressed": 5, + "preserved": 8, + "roundTrip": "PASS", + "timeMs": 5869.715916000016, + "vsDet": 0.883125200128082 + }, + "llm-escalate": { + "ratio": 1.5518741633199464, + "tokenRatio": 1.5501165501165501, + "compressed": 5, + "preserved": 8, + "roundTrip": "PASS", + "timeMs": 3001.2509999999893, + "vsDet": 0.9230254350736279 + } + } + }, + "Long Q&A": { + "methods": { + "deterministic": { + "ratio": 6.158536585365853, + "tokenRatio": 6.114164904862579, + "compressed": 4, + "preserved": 6, + "roundTrip": "PASS", + "timeMs": 0.73641700000735 + }, + "llm-basic": { + "ratio": 4.308873720136519, + "tokenRatio": 4.2844444444444445, + "compressed": 4, + "preserved": 6, + "roundTrip": "PASS", + "timeMs": 4080.273957999947, + "vsDet": 0.6996587030716723 + }, + "llm-escalate": { + "ratio": 4.486894713460684, + "tokenRatio": 4.456086286594761, + "compressed": 4, + "preserved": 6, + "roundTrip": "PASS", + "timeMs": 3666.4759170000325, + "vsDet": 0.7285650821856953 + } + } + }, + "Tool-heavy": { + "methods": { + "deterministic": { + "ratio": 1.2991563919532771, + "tokenRatio": 1.2946428571428572, + "compressed": 2, + "preserved": 16, + "roundTrip": "PASS", + "timeMs": 1.655417000001762 + }, + "llm-basic": { + "ratio": 1.1153203342618385, + "tokenRatio": 1.1132437619961613, + "compressed": 2, + "preserved": 16, + "roundTrip": "PASS", + "timeMs": 2252.8222499999683, + "vsDet": 0.8584958217270195 + }, + "llm-escalate": { + "ratio": 1.2816901408450705, + "tokenRatio": 1.277533039647577, + "compressed": 2, + "preserved": 16, + "roundTrip": "PASS", + "timeMs": 2796.051916999975, + "vsDet": 0.9865556978233034 + } + } + }, + "Deep conversation": { + "methods": { + "deterministic": { + "ratio": 2.124913733609386, + "tokenRatio": 2.1241305510968433, + "compressed": 50, + "preserved": 1, + "roundTrip": "PASS", + "timeMs": 2.8401660000090487 + }, + "llm-basic": { + "ratio": 3.123774095366926, + 
"tokenRatio": 3.1088488645262333, + "compressed": 50, + "preserved": 1, + "roundTrip": "PASS", + "timeMs": 22697.48683300003, + "vsDet": 1.470071017923571 + }, + "llm-escalate": { + "ratio": 3.2790202342918, + "tokenRatio": 3.255432554325543, + "compressed": 50, + "preserved": 1, + "roundTrip": "PASS", + "timeMs": 23293.247875, + "vsDet": 1.5431309904153354 + } + } + }, + "Technical explanation": { + "methods": { + "deterministic": { + "ratio": 1, + "tokenRatio": 1, + "compressed": 0, + "preserved": 11, + "roundTrip": "PASS", + "timeMs": 0.6284590000286698 + }, + "llm-basic": { + "ratio": 1, + "tokenRatio": 1, + "compressed": 0, + "preserved": 11, + "roundTrip": "PASS", + "timeMs": 3207.201915999991, + "vsDet": 1 + }, + "llm-escalate": { + "ratio": 1.0009776232891592, + "tokenRatio": 1.0007587253414265, + "compressed": 2, + "preserved": 9, + "roundTrip": "PASS", + "timeMs": 784.6597920000786, + "vsDet": 1.0009776232891592 + } + } + }, + "Structured content": { + "methods": { + "deterministic": { + "ratio": 1.9338990620812864, + "tokenRatio": 1.9241486068111455, + "compressed": 2, + "preserved": 10, + "roundTrip": "PASS", + "timeMs": 0.48375000001396984 + }, + "llm-basic": { + "ratio": 1.4554621848739495, + "tokenRatio": 1.4521028037383177, + "compressed": 2, + "preserved": 10, + "roundTrip": "PASS", + "timeMs": 3480.8887089999625, + "vsDet": 0.7526050420168067 + }, + "llm-escalate": { + "ratio": 1.3816209317166561, + "tokenRatio": 1.3795782463928967, + "compressed": 2, + "preserved": 10, + "roundTrip": "PASS", + "timeMs": 3686.4468750000233, + "vsDet": 0.7144224633056797 + } + } + }, + "Agentic coding session": { + "methods": { + "deterministic": { + "ratio": 1.428351309707242, + "tokenRatio": 1.4258962011771001, + "compressed": 2, + "preserved": 31, + "roundTrip": "PASS", + "timeMs": 0.749125000089407 + }, + "llm-basic": { + "ratio": 1.3462097008422886, + "tokenRatio": 1.34460141271443, + "compressed": 2, + "preserved": 31, + "roundTrip": "PASS", + "timeMs": 
3328.690416999976, + "vsDet": 0.9424920127795526 + }, + "llm-escalate": { + "ratio": 1.3975576662143827, + "tokenRatio": 1.3952879581151831, + "compressed": 2, + "preserved": 31, + "roundTrip": "PASS", + "timeMs": 5422.445708999992, + "vsDet": 0.978441127694859 + } + } + } + }, + "tokenBudget": { + "Deep conversation": [ + { + "budget": 2000, + "method": "deterministic", + "tokenCount": 3738, + "fits": false, + "ratio": 2.124913733609386, + "recencyWindow": 0, + "roundTrip": "PASS", + "timeMs": 12.129625000059605 + }, + { + "budget": 2000, + "method": "llm-escalate", + "tokenCount": 2593, + "fits": false, + "ratio": 3.0834538778235228, + "recencyWindow": 0, + "roundTrip": "PASS", + "timeMs": 131976.87870800006 + } + ], + "Agentic coding session": [ + { + "budget": 2000, + "method": "deterministic", + "tokenCount": 1957, + "fits": true, + "ratio": 1.3638369869059879, + "recencyWindow": 9, + "roundTrip": "PASS", + "timeMs": 1.8957079999381676 + }, + { + "budget": 2000, + "method": "llm-escalate", + "tokenCount": 2003, + "fits": false, + "ratio": 1.331896551724138, + "recencyWindow": 9, + "roundTrip": "PASS", + "timeMs": 4096.28350000002 + } + ] + } +} diff --git a/bench/baselines/llm/openai-gpt-4.1-mini.json b/bench/baselines/llm/openai-gpt-4.1-mini.json new file mode 100644 index 0000000..27b75c4 --- /dev/null +++ b/bench/baselines/llm/openai-gpt-4.1-mini.json @@ -0,0 +1,263 @@ +{ + "provider": "openai", + "model": "gpt-4.1-mini", + "generated": "2026-02-25T12:28:55.113Z", + "scenarios": { + "Coding assistant": { + "methods": { + "deterministic": { + "ratio": 1.6812907904278462, + "tokenRatio": 1.6729559748427674, + "compressed": 5, + "preserved": 8, + "roundTrip": "PASS", + "timeMs": 0.25587500000006 + }, + "llm-basic": { + "ratio": 1.6414159292035397, + "tokenRatio": 1.633906633906634, + "compressed": 5, + "preserved": 8, + "roundTrip": "PASS", + "timeMs": 5578.285459, + "vsDet": 0.976283185840708 + }, + "llm-escalate": { + "ratio": 1.631597466572836, + 
"tokenRatio": 1.625916870415648, + "compressed": 5, + "preserved": 8, + "roundTrip": "PASS", + "timeMs": 6046.540790999999, + "vsDet": 0.9704433497536946 + } + } + }, + "Long Q&A": { + "methods": { + "deterministic": { + "ratio": 6.158536585365853, + "tokenRatio": 6.114164904862579, + "compressed": 4, + "preserved": 6, + "roundTrip": "PASS", + "timeMs": 0.9947919999995065 + }, + "llm-basic": { + "ratio": 5.372340425531915, + "tokenRatio": 5.3259668508287294, + "compressed": 4, + "preserved": 6, + "roundTrip": "PASS", + "timeMs": 5892.603500000001, + "vsDet": 0.8723404255319149 + }, + "llm-escalate": { + "ratio": 5.346744309158285, + "tokenRatio": 5.3064220183486235, + "compressed": 4, + "preserved": 6, + "roundTrip": "PASS", + "timeMs": 6988.136834000001, + "vsDet": 0.868184224457385 + } + } + }, + "Tool-heavy": { + "methods": { + "deterministic": { + "ratio": 1.2991563919532771, + "tokenRatio": 1.2946428571428572, + "compressed": 2, + "preserved": 16, + "roundTrip": "PASS", + "timeMs": 0.2992500000000291 + }, + "llm-basic": { + "ratio": 1.105466593042518, + "tokenRatio": 1.1047619047619048, + "compressed": 2, + "preserved": 16, + "roundTrip": "PASS", + "timeMs": 3497.0059580000016, + "vsDet": 0.8509110988404197 + }, + "llm-escalate": { + "ratio": 1.1159420289855073, + "tokenRatio": 1.1153846153846154, + "compressed": 2, + "preserved": 16, + "roundTrip": "PASS", + "timeMs": 5327.759166, + "vsDet": 0.858974358974359 + } + } + }, + "Deep conversation": { + "methods": { + "deterministic": { + "ratio": 2.124913733609386, + "tokenRatio": 2.1241305510968433, + "compressed": 50, + "preserved": 1, + "roundTrip": "PASS", + "timeMs": 2.7148750000051223 + }, + "llm-basic": { + "ratio": 2.3424344885883346, + "tokenRatio": 2.3346074683916496, + "compressed": 50, + "preserved": 1, + "roundTrip": "PASS", + "timeMs": 50365.301625, + "vsDet": 1.1023668639053252 + }, + "llm-escalate": { + "ratio": 2.3674498077744555, + "tokenRatio": 2.359583952451709, + "compressed": 50, + 
"preserved": 1, + "roundTrip": "PASS", + "timeMs": 50784.971292, + "vsDet": 1.114139256727894 + } + } + }, + "Technical explanation": { + "methods": { + "deterministic": { + "ratio": 1, + "tokenRatio": 1, + "compressed": 0, + "preserved": 11, + "roundTrip": "PASS", + "timeMs": 0.6729170000180602 + }, + "llm-basic": { + "ratio": 1.0014127363616605, + "tokenRatio": 1.0015186028853456, + "compressed": 1, + "preserved": 10, + "roundTrip": "PASS", + "timeMs": 2551.7554579999996, + "vsDet": 1.0014127363616605 + }, + "llm-escalate": { + "ratio": 1.0014127363616605, + "tokenRatio": 1.0015186028853456, + "compressed": 1, + "preserved": 10, + "roundTrip": "PASS", + "timeMs": 3298.924624999985, + "vsDet": 1.0014127363616605 + } + } + }, + "Structured content": { + "methods": { + "deterministic": { + "ratio": 1.9338990620812864, + "tokenRatio": 1.9241486068111455, + "compressed": 2, + "preserved": 10, + "roundTrip": "PASS", + "timeMs": 0.3844159999862313 + }, + "llm-basic": { + "ratio": 1.2315130830489192, + "tokenRatio": 1.2294757665677547, + "compressed": 2, + "preserved": 10, + "roundTrip": "PASS", + "timeMs": 10207.897041999997, + "vsDet": 0.6368031854379976 + }, + "llm-escalate": { + "ratio": 1.2886904761904763, + "tokenRatio": 1.2867494824016563, + "compressed": 2, + "preserved": 10, + "roundTrip": "PASS", + "timeMs": 4813.861583999998, + "vsDet": 0.6663690476190476 + } + } + }, + "Agentic coding session": { + "methods": { + "deterministic": { + "ratio": 1.428351309707242, + "tokenRatio": 1.4258962011771001, + "compressed": 2, + "preserved": 31, + "roundTrip": "PASS", + "timeMs": 0.6770829999877606 + }, + "llm-basic": { + "ratio": 1.428351309707242, + "tokenRatio": 1.4258962011771001, + "compressed": 2, + "preserved": 31, + "roundTrip": "PASS", + "timeMs": 5799.787291999994, + "vsDet": 1 + }, + "llm-escalate": { + "ratio": 1.3244749249892842, + "tokenRatio": 1.3232373386295928, + "compressed": 1, + "preserved": 32, + "roundTrip": "PASS", + "timeMs": 9487.380791999982, + 
"vsDet": 0.9272753250464352 + } + } + } + }, + "tokenBudget": { + "Deep conversation": [ + { + "budget": 2000, + "method": "deterministic", + "tokenCount": 3738, + "fits": false, + "ratio": 2.124913733609386, + "recencyWindow": 0, + "roundTrip": "PASS", + "timeMs": 10.060708000004524 + }, + { + "budget": 2000, + "method": "llm-escalate", + "tokenCount": 3391, + "fits": false, + "ratio": 2.3493853327681222, + "recencyWindow": 0, + "roundTrip": "PASS", + "timeMs": 280464.86720800004 + } + ], + "Agentic coding session": [ + { + "budget": 2000, + "method": "deterministic", + "tokenCount": 1957, + "fits": true, + "ratio": 1.3638369869059879, + "recencyWindow": 9, + "roundTrip": "PASS", + "timeMs": 1.9349999999976717 + }, + { + "budget": 2000, + "method": "llm-escalate", + "tokenCount": 1915, + "fits": true, + "ratio": 1.3935658448586892, + "recencyWindow": 3, + "roundTrip": "PASS", + "timeMs": 28052.867749999976 + } + ] + } +} diff --git a/bench/llm.ts b/bench/llm.ts index 68c7197..e4615ef 100644 --- a/bench/llm.ts +++ b/bench/llm.ts @@ -6,7 +6,7 @@ * * Supported providers: * - OpenAI: OPENAI_API_KEY (model override: OPENAI_MODEL, default gpt-4.1-mini) - * - Ollama: OLLAMA_MODEL or OLLAMA_HOST (default host http://localhost:11434, model llama3.2) + * - Ollama: Auto-detected on localhost:11434, or OLLAMA_MODEL/OLLAMA_HOST (model default llama3.2) * - Anthropic: ANTHROPIC_API_KEY (model override: ANTHROPIC_MODEL, default claude-haiku-4-5-20251001) * * SDKs are dynamically imported — missing packages print a skip message @@ -47,31 +47,59 @@ export async function detectProviders(): Promise { } } - // --- Ollama (OpenAI-compatible API) --- - if (process.env.OLLAMA_MODEL || process.env.OLLAMA_HOST) { - try { - const { default: OpenAI } = await import('openai'); - const host = process.env.OLLAMA_HOST ?? 'http://localhost:11434'; - const model = process.env.OLLAMA_MODEL ?? 
'llama3.2'; - const client = new OpenAI({ baseURL: `${host}/v1`, apiKey: 'ollama' }); + // --- Ollama (auto-detected or via env vars) --- + { + const host = process.env.OLLAMA_HOST ?? 'http://localhost:11434'; + const model = process.env.OLLAMA_MODEL ?? 'llama3.2'; + const hasEnv = !!(process.env.OLLAMA_MODEL || process.env.OLLAMA_HOST); - providers.push({ - name: 'ollama', - model, - callLlm: async (prompt: string): Promise => { - const r = await client.chat.completions.create({ - model, - messages: [{ role: 'user', content: prompt }], - max_tokens: 400, - temperature: 0.3, - }); - return r.choices[0]?.message?.content ?? ''; - }, - }); - } catch (err) { - console.log( - ` OpenAI SDK not installed (needed for Ollama), skipping (${(err as Error).message})`, - ); + // Auto-detect: probe the Ollama API with a short timeout + let ollamaAvailable = hasEnv; + if (!hasEnv) { + try { + const res = await fetch(`${host}/api/tags`, { + signal: AbortSignal.timeout(2000), + }); + if (res.ok) { + const data = (await res.json()) as { models?: { name: string }[] }; + const models = data.models ?? []; + const hasModel = models.some((m) => m.name === model || m.name === `${model}:latest`); + if (hasModel) { + ollamaAvailable = true; + } else if (models.length > 0) { + console.log( + ` Ollama running but model "${model}" not found (available: ${models.map((m) => m.name).join(', ')})`, + ); + } + } + } catch { + // Not running — skip silently + } + } + + if (ollamaAvailable) { + try { + const { default: OpenAI } = await import('openai'); + const client = new OpenAI({ baseURL: `${host}/v1`, apiKey: 'ollama' }); + + providers.push({ + name: 'ollama', + model, + callLlm: async (prompt: string): Promise => { + const r = await client.chat.completions.create({ + model, + messages: [{ role: 'user', content: prompt }], + max_tokens: 400, + temperature: 0.3, + }); + return r.choices[0]?.message?.content ?? 
''; + }, + }); + } catch (err) { + console.log( + ` Ollama detected but openai SDK not installed — run \`npm install openai\` (${(err as Error).message})`, + ); + } } } @@ -92,7 +120,7 @@ export async function detectProviders(): Promise { messages: [{ role: 'user', content: prompt }], }); const block = msg.content[0]; - return block.type === 'text' ? block.text : ''; + return block?.type === 'text' ? block.text : ''; }, }); } catch (err) { diff --git a/bench/run.ts b/bench/run.ts index 3b69ff7..f275d74 100644 --- a/bench/run.ts +++ b/bench/run.ts @@ -3,9 +3,40 @@ import { uncompress } from '../src/expand.js'; import { createSummarizer, createEscalatingSummarizer } from '../src/summarizer.js'; import type { CompressResult, Message } from '../src/types.js'; import { readFileSync, readdirSync, statSync, existsSync } from 'node:fs'; -import { join } from 'node:path'; +import { join, resolve } from 'node:path'; import { homedir } from 'node:os'; +import { execSync } from 'node:child_process'; +import { gzipSync } from 'node:zlib'; import { detectProviders } from './llm.js'; +import type { + LlmBenchmarkResult, + LlmMethodResult, + LlmTokenBudgetResult, + BenchmarkResults, +} from './baseline.js'; +import { + saveLlmResult, + saveBaseline, + loadCurrentBaseline, + compareResults, + formatRegressions, + generateBenchmarkDocs, +} from './baseline.js'; + +// --------------------------------------------------------------------------- +// Auto-load .env (no dependency, won't override existing vars) +// --------------------------------------------------------------------------- + +const envPath = resolve(import.meta.dirname, '..', '.env'); +if (existsSync(envPath)) { + for (const line of readFileSync(envPath, 'utf-8').split('\n')) { + const match = line.match(/^\s*(?:export\s+)?([^#=]+?)\s*=\s*(.*?)\s*$/); + if (!match || process.env[match[1]]) continue; + // Strip wrapping quotes (single or double) + const val = match[2].replace(/^(['"])(.*)\1$/, '$2'); + 
process.env[match[1]] = val; + } +} // --------------------------------------------------------------------------- // Helpers @@ -39,6 +70,7 @@ function buildScenarios(): Scenario[] { toolHeavy(), shortConversation(), deepConversation(), + technicalExplanation(), structuredContent(), agenticCodingSession(), ]; @@ -262,6 +294,160 @@ function deepConversation(): Scenario { return { name: 'Deep conversation', messages }; } +function technicalExplanation(): Scenario { + // Prose-heavy architectural discussion — no code fences, no structured data. + // Tests whether LLM summarization outperforms deterministic on pure technical prose. + return { + name: 'Technical explanation', + messages: [ + msg('system', 'You are a principal engineer explaining system design to a new team lead.'), + msg( + 'user', + 'Can you walk me through how our event-driven architecture works and why we chose it over a traditional request-response model?', + ), + msg( + 'assistant', + 'The event-driven architecture we adopted grew out of a series of scaling problems we ' + + 'encountered roughly two years ago when the monolith started buckling under peak traffic. ' + + 'The core idea is that services communicate through immutable events published to a central ' + + 'log rather than making synchronous calls to each other. When the order service completes a ' + + 'checkout, it publishes an OrderPlaced event. The inventory service, the notification ' + + 'service, and the analytics pipeline each consume that event independently and at their own ' + + 'pace. None of them need to know about the others, and none of them can block the order ' + + 'service from finishing its work.\n\n' + + 'The biggest advantage is temporal decoupling. Under the old model, a single slow downstream ' + + 'dependency could cascade latency across the entire checkout flow. 
A payment verification ' + + 'that normally took fifty milliseconds would occasionally spike to several seconds, and ' + + 'because every call was synchronous, the user sat waiting while six services completed in ' + + 'sequence. With events, the order service does its own work, publishes, and returns. ' + + 'Downstream processing happens asynchronously, so the user experience stays fast even when ' + + 'one consumer is struggling.\n\n' + + 'The trade-off is eventual consistency. In a synchronous model, you get immediate confirmation ' + + 'that every step completed. In an event-driven model, you have to design for the possibility ' + + 'that the inventory adjustment or the email notification has not happened yet when the user ' + + 'sees their confirmation page. We handle this through careful UX design that sets appropriate ' + + 'expectations and through idempotent consumers that can safely replay events if something ' + + 'goes wrong during processing.', + ), + msg( + 'user', + 'That makes sense. What about the event schema evolution problem? How do we handle ' + + 'changes to event formats without breaking all the consumers?', + ), + msg( + 'assistant', + 'Schema evolution is one of the hardest problems in event-driven systems and we spent ' + + 'considerable time getting our approach right. We settled on a strategy that combines ' + + 'explicit versioning with backward-compatible evolution rules. Every event type carries a ' + + 'schema version number in its header. Consumers declare which versions they understand, and ' + + 'the platform routes accordingly.\n\n' + + 'For non-breaking changes like adding a new optional field, we increment the minor version. ' + + 'Consumers that do not understand the new field simply ignore it. This follows the robustness ' + + 'principle of being conservative in what you send and liberal in what you accept. 
The vast ' + + 'majority of our schema changes fall into this category, and they require zero coordination ' + + 'between teams.\n\n' + + 'For breaking changes like removing a field or changing a field type, we increment the major ' + + 'version and run both the old and new versions in parallel during a migration window. The ' + + 'producing service publishes to both versions, and consuming teams migrate on their own ' + + 'schedule within the agreed window. Once all consumers have migrated, we sunset the old ' + + 'version. This approach adds operational complexity but gives teams the autonomy to migrate ' + + 'at their own pace without coordinated deployment ceremonies.\n\n' + + 'We also maintain a central schema registry that validates events at publish time. If a ' + + 'service tries to publish an event that does not conform to its declared schema, the publish ' + + 'fails fast with a clear error. This catches drift between the code and the schema before ' + + 'it reaches consumers and prevents subtle data corruption that would be much harder to ' + + 'diagnose after the fact.', + ), + msg( + 'user', + 'How do we handle ordering guarantees? I noticed some of our consumers seem to ' + + 'process events out of order.', + ), + msg( + 'assistant', + 'Ordering is subtle in distributed event systems and the right answer depends on what ' + + 'level of ordering your consumer actually needs. Our event platform guarantees ordering ' + + 'within a partition, but not across partitions. Events for the same aggregate, like all ' + + 'events for a specific order, go to the same partition based on the aggregate identifier, ' + + 'so they arrive in the order they were published. Events for different aggregates may arrive ' + + 'in any order because they live on different partitions processed by different consumer ' + + 'instances.\n\n' + + 'Most consumers only care about ordering within an aggregate and they get it for free from ' + + 'the partitioning scheme. 
The inventory service needs to see OrderPlaced before OrderCancelled ' + + 'for the same order, and it does because both events share the same order identifier as their ' + + 'partition key. But it does not care whether order A events arrive before or after order B ' + + 'events because those are independent aggregates.\n\n' + + 'For the rare cases where a consumer needs cross-aggregate ordering, we use a sequence ' + + 'number stamped by the producer at publish time. The consumer tracks the last processed ' + + 'sequence number and detects gaps. When it encounters a gap, it buffers the event and waits ' + + 'briefly for the missing one to arrive. If the missing event does not appear within the ' + + 'timeout window, the consumer raises an alert and proceeds with compensating logic. This ' + + 'approach adds latency in the gap case but preserves correctness.\n\n' + + 'The consumers you noticed processing out of order are likely reading from a topic that ' + + 'mixes events from multiple aggregates. Check whether the partition key is set correctly ' + + 'on the producer side. A common mistake is using a random or missing partition key, which ' + + 'causes round-robin assignment and destroys any ordering guarantee.', + ), + msg( + 'user', + 'What about failure handling? If a consumer crashes mid-processing, do we lose events?', + ), + msg( + 'assistant', + 'No, the platform guarantees at-least-once delivery. When a consumer reads an event, it ' + + 'does not acknowledge the read until processing completes successfully. If the consumer ' + + 'crashes before acknowledging, the event remains available for redelivery. The next time ' + + 'the consumer starts up, or when another instance in the consumer group takes over the ' + + 'partition, it picks up from the last acknowledged position.\n\n' + + 'The flip side of at-least-once delivery is that consumers must be idempotent. 
They will ' + + 'occasionally see the same event twice, particularly after a crash or during a rebalance. ' + + 'We enforce idempotency through a combination of techniques depending on the consumer. ' + + 'Some use a deduplication table that records processed event identifiers and skips any event ' + + 'it has seen before. Others design their state mutations to be naturally idempotent, for ' + + 'example using upserts instead of inserts or applying absolute values instead of deltas.\n\n' + + 'Dead letter queues catch events that fail processing repeatedly. After a configurable number ' + + 'of retries with exponential backoff, the platform moves the event to a dead letter topic ' + + 'and advances the consumer offset so it does not block subsequent events. The operations team ' + + 'monitors the dead letter queue and investigates failures. Once the root cause is fixed, they ' + + 'can replay events from the dead letter queue back into the main topic.\n\n' + + 'We also run a reconciliation process that periodically compares the state derived from ' + + 'events against the source of truth in the originating service. This catches any silent data ' + + 'loss or corruption that might slip through the cracks. The reconciler runs daily during off-' + + 'peak hours and flags discrepancies for manual review.', + ), + msg( + 'user', + 'One more thing — how do we test all of this? Integration testing with events seems ' + + 'really hard.', + ), + msg( + 'assistant', + 'Testing event-driven systems requires a layered approach because no single testing ' + + 'strategy covers all the failure modes. At the unit level, each consumer is tested with ' + + 'synthetic events that exercise the happy path, edge cases, and error paths. These tests ' + + 'run in milliseconds and give fast feedback on business logic correctness.\n\n' + + 'At the integration level, we run each service against a local instance of the event ' + + 'platform. 
The test publishes events, waits for the consumer to process them, and verifies ' + + 'the resulting state. These tests are slower but catch serialization issues, schema ' + + 'mismatches, and configuration problems that unit tests miss. We keep the integration test ' + + 'suite focused on the boundaries: publishing, consuming, and acknowledging. Internal ' + + 'business logic is covered at the unit level.\n\n' + + 'At the system level, we maintain a staging environment that mirrors production topology. ' + + 'Every deployment goes through staging first, where we run end-to-end scenarios that ' + + 'exercise the full event flow from producer through all consumers. These tests use realistic ' + + 'data volumes and introduce controlled failures like consumer crashes and network partitions ' + + 'to verify that the retry and dead-letter mechanisms work correctly.\n\n' + + 'Contract testing bridges the gap between producers and consumers without requiring a ' + + 'shared integration environment. Each consumer publishes a contract describing the events ' + + 'it expects, and the producer runs those contracts as part of its build. If a producer ' + + 'change would break a consumer contract, the build fails before the change reaches any ' + + 'shared environment. This is particularly valuable in our setup where different teams own ' + + "different services and may not be aware of each other's dependencies.", + ), + ], + }; +} + function structuredContent(): Scenario { // Pure prose about auth (~1500 chars): no code, URLs, SQL, API keys, JSON, paths, etc. const authProse = @@ -618,9 +804,25 @@ interface Result { } async function run(): Promise { + const args = process.argv.slice(2); + const flagSave = args.includes('--save'); + const flagCheck = args.includes('--check'); + const flagLlm = args.includes('--llm'); + const toleranceIdx = args.indexOf('--tolerance'); + const tolerance = toleranceIdx >= 0 ? 
Number(args[toleranceIdx + 1]) / 100 : 0; + const scenarios = buildScenarios(); const results: Result[] = []; + // Structured results for baseline save/check + const benchResults: BenchmarkResults = { + basic: {}, + tokenBudget: {}, + dedup: {}, + fuzzyDedup: {}, + bundleSize: {}, + }; + for (const scenario of scenarios) { const t0 = performance.now(); @@ -648,6 +850,13 @@ async function run(): Promise { roundTrip, timeMs: (t1 - t0).toFixed(2), }); + + benchResults.basic[scenario.name] = { + ratio: cr.compression.ratio, + tokenRatio: cr.compression.token_ratio, + compressed: cr.compression.messages_compressed, + preserved: cr.compression.messages_preserved, + }; } // Print table @@ -777,6 +986,16 @@ async function run(): Promise { ((t1 - t0).toFixed(2) + 'ms').padStart(cols.time), ].join(' '), ); + + const tbKey = `${scenario.name}|dedup=${dedup}`; + benchResults.tokenBudget[tbKey] = { + tokenCount: cr.tokenCount ?? 0, + fits: cr.fits ?? false, + recencyWindow: cr.recencyWindow, + compressed: cr.compression.messages_compressed, + preserved: cr.compression.messages_preserved, + deduped: cr.compression.messages_deduped ?? 0, + }; } } @@ -840,6 +1059,14 @@ async function run(): Promise { rt2.padStart(cols.rt), ].join(' '), ); + + benchResults.dedup[scenario.name] = { + rw0Base: baseRw0.compression.ratio, + rw0Dup: dedupRw0.compression.ratio, + rw4Base: baseRw4.compression.ratio, + rw4Dup: dedupRw4.compression.ratio, + deduped, + }; } console.log(dedupSep); @@ -898,6 +1125,12 @@ async function run(): Promise { ((t1 - t0).toFixed(2) + 'ms').padStart(cols.time), ].join(' '), ); + + benchResults.fuzzyDedup[scenario.name] = { + exact: cr.compression.messages_deduped ?? 0, + fuzzy: cr.compression.messages_fuzzy_deduped ?? 
0, + ratio: cr.compression.ratio, + }; } console.log(fuzzySep); @@ -907,20 +1140,116 @@ async function run(): Promise { process.exit(1); } + // --------------------------------------------------------------------------- + // Bundle size + // --------------------------------------------------------------------------- + + console.log(); + console.log('Bundle Size'); + + execSync('npm run build', { stdio: 'pipe', cwd: resolve(import.meta.dirname, '..') }); + + const distDir = resolve(import.meta.dirname, '..', 'dist'); + const distFiles = readdirSync(distDir, { recursive: true }) + .map(String) + .filter((f) => f.endsWith('.js')) + .sort(); + + let totalBytes = 0; + let totalGzip = 0; + + const bsHeader = [ + 'File'.padEnd(30), + 'Size'.padStart(10), + 'Gzip'.padStart(10), + ].join(' '); + const bsSep = '-'.repeat(bsHeader.length); + + console.log(bsSep); + console.log(bsHeader); + console.log(bsSep); + + for (const file of distFiles) { + const fullPath = join(distDir, file); + const bytes = statSync(fullPath).size; + const gzipBytes = gzipSync(readFileSync(fullPath)).length; + totalBytes += bytes; + totalGzip += gzipBytes; + + benchResults.bundleSize[file] = { bytes, gzipBytes }; + + const fmtBytes = bytes < 1024 ? `${bytes} B` : `${(bytes / 1024).toFixed(1)} KB`; + const fmtGzip = gzipBytes < 1024 ? `${gzipBytes} B` : `${(gzipBytes / 1024).toFixed(1)} KB`; + console.log( + [file.padEnd(30), fmtBytes.padStart(10), fmtGzip.padStart(10)].join(' '), + ); + } + + benchResults.bundleSize['total'] = { bytes: totalBytes, gzipBytes: totalGzip }; + + const fmtTotal = totalBytes < 1024 ? `${totalBytes} B` : `${(totalBytes / 1024).toFixed(1)} KB`; + const fmtTotalGz = + totalGzip < 1024 ? 
`${totalGzip} B` : `${(totalGzip / 1024).toFixed(1)} KB`; + console.log(bsSep); + console.log( + ['total'.padEnd(30), fmtTotal.padStart(10), fmtTotalGz.padStart(10)].join(' '), + ); + console.log(bsSep); + + // --------------------------------------------------------------------------- + // --save / --check + // --------------------------------------------------------------------------- + + const baselinesDir = resolve(import.meta.dirname, 'baselines'); + const version = JSON.parse( + readFileSync(resolve(import.meta.dirname, '..', 'package.json'), 'utf-8'), + ).version; + + if (flagSave) { + saveBaseline(baselinesDir, version, benchResults); + generateBenchmarkDocs( + baselinesDir, + resolve(import.meta.dirname, '..', 'docs', 'benchmark-results.md'), + ); + console.log(); + console.log(`Baseline saved (v${version}) and docs/benchmark-results.md regenerated.`); + } + + if (flagCheck) { + const current = loadCurrentBaseline(baselinesDir); + if (!current) { + console.error( + 'No baseline found at bench/baselines/current.json — run `npm run bench:save` first.', + ); + process.exit(1); + } + const regressions = compareResults(current.results, benchResults, tolerance); + if (regressions.length > 0) { + console.error(); + console.error(formatRegressions(regressions)); + process.exit(1); + } + console.log(); + console.log(`Baseline check passed (v${current.version}, tolerance ${tolerance * 100}%).`); + } + // --------------------------------------------------------------------------- // Real Claude Code sessions (if available locally) // --------------------------------------------------------------------------- runRealSessions(); - await runLlmBenchmark(); + // LLM benchmarks require explicit --llm flag (they cost money and take minutes) + if (flagLlm) { + await runLlmBenchmark(); + } console.log(); console.log('All benchmarks passed.'); } // --------------------------------------------------------------------------- -// LLM summarization benchmark (opt-in via env vars) 
+// LLM summarization benchmark (requires --llm flag) // --------------------------------------------------------------------------- function roundTrip(messages: Message[], cr: CompressResult): 'PASS' | 'FAIL' { @@ -936,12 +1265,13 @@ async function runLlmBenchmark(): Promise { if (providers.length === 0) { console.log(); console.log( - 'LLM Summarization Benchmark — skipped (no OPENAI_API_KEY, OLLAMA_MODEL, or ANTHROPIC_API_KEY set)', + 'LLM Summarization Benchmark — no providers detected (set OPENAI_API_KEY or ANTHROPIC_API_KEY in .env, or start Ollama)', ); return; } const scenarios = buildScenarios().filter((s) => s.name !== 'Short conversation'); + const baselinesDir = resolve(import.meta.dirname, 'baselines'); for (const provider of providers) { console.log(); @@ -955,6 +1285,7 @@ async function runLlmBenchmark(): Promise { method: 14, chr: 6, tkr: 6, + vsDet: 6, comp: 5, pres: 5, rt: 5, @@ -966,6 +1297,7 @@ async function runLlmBenchmark(): Promise { 'Method'.padStart(cols.method), 'ChR'.padStart(cols.chr), 'TkR'.padStart(cols.tkr), + 'vsDet'.padStart(cols.vsDet), 'Comp'.padStart(cols.comp), 'Pres'.padStart(cols.pres), 'R/T'.padStart(cols.rt), @@ -978,42 +1310,178 @@ async function runLlmBenchmark(): Promise { console.log(sep); let llmFails = 0; + const llmResult: LlmBenchmarkResult = { + provider: provider.name, + model: provider.model, + generated: new Date().toISOString(), + scenarios: {}, + }; for (const scenario of scenarios) { - // Deterministic baseline - const t0d = performance.now(); - const detResult = compress(scenario.messages, { recencyWindow: 0 }); - const t1d = performance.now(); - const detRt = roundTrip(scenario.messages, detResult); - - printLlmRow(scenario.name, 'deterministic', detResult, detRt, t1d - t0d, cols); - - // LLM basic summarizer - const t0b = performance.now(); - const llmBasicResult = await compress(scenario.messages, { - recencyWindow: 0, - summarizer: basicSummarizer, - }); - const t1b = performance.now(); - const basicRt = 
roundTrip(scenario.messages, llmBasicResult); - if (basicRt === 'FAIL') llmFails++; - - printLlmRow('', 'llm-basic', llmBasicResult, basicRt, t1b - t0b, cols); - - // LLM escalating summarizer - const t0e = performance.now(); - const llmEscResult = await compress(scenario.messages, { - recencyWindow: 0, - summarizer: escalatingSummarizer, - }); - const t1e = performance.now(); - const escRt = roundTrip(scenario.messages, llmEscResult); - if (escRt === 'FAIL') llmFails++; - - printLlmRow('', 'llm-escalate', llmEscResult, escRt, t1e - t0e, cols); - console.log(sep); + try { + const scenarioResult: Record = {}; + + // Deterministic baseline + const t0d = performance.now(); + const detResult = compress(scenario.messages, { recencyWindow: 0 }); + const t1d = performance.now(); + const detRt = roundTrip(scenario.messages, detResult); + const detRatio = detResult.compression.ratio; + + printLlmRow(scenario.name, 'deterministic', detResult, detRt, t1d - t0d, undefined, cols); + scenarioResult['deterministic'] = { + ratio: detRatio, + tokenRatio: detResult.compression.token_ratio, + compressed: detResult.compression.messages_compressed, + preserved: detResult.compression.messages_preserved, + roundTrip: detRt, + timeMs: t1d - t0d, + }; + + // LLM basic summarizer + const t0b = performance.now(); + const llmBasicResult = await compress(scenario.messages, { + recencyWindow: 0, + summarizer: basicSummarizer, + }); + const t1b = performance.now(); + const basicRt = roundTrip(scenario.messages, llmBasicResult); + if (basicRt === 'FAIL') llmFails++; + const basicVsDet = llmBasicResult.compression.ratio / detRatio; + + printLlmRow('', 'llm-basic', llmBasicResult, basicRt, t1b - t0b, basicVsDet, cols); + scenarioResult['llm-basic'] = { + ratio: llmBasicResult.compression.ratio, + tokenRatio: llmBasicResult.compression.token_ratio, + compressed: llmBasicResult.compression.messages_compressed, + preserved: llmBasicResult.compression.messages_preserved, + roundTrip: basicRt, + timeMs: 
t1b - t0b, + vsDet: basicVsDet, + }; + + // LLM escalating summarizer + const t0e = performance.now(); + const llmEscResult = await compress(scenario.messages, { + recencyWindow: 0, + summarizer: escalatingSummarizer, + }); + const t1e = performance.now(); + const escRt = roundTrip(scenario.messages, llmEscResult); + if (escRt === 'FAIL') llmFails++; + const escVsDet = llmEscResult.compression.ratio / detRatio; + + printLlmRow('', 'llm-escalate', llmEscResult, escRt, t1e - t0e, escVsDet, cols); + scenarioResult['llm-escalate'] = { + ratio: llmEscResult.compression.ratio, + tokenRatio: llmEscResult.compression.token_ratio, + compressed: llmEscResult.compression.messages_compressed, + preserved: llmEscResult.compression.messages_preserved, + roundTrip: escRt, + timeMs: t1e - t0e, + vsDet: escVsDet, + }; + + console.log(sep); + llmResult.scenarios[scenario.name] = { methods: scenarioResult }; + } catch (err) { + console.error(` ${scenario.name}: ERROR — ${(err as Error).message}`); + console.log(sep); + } } + // --- Token budget + LLM --- + const tokenBudget = 2000; + const budgetScenarios: Scenario[] = scenarios.filter( + (s) => s.name === 'Deep conversation' || s.name === 'Agentic coding session', + ); + + if (budgetScenarios.length > 0) { + console.log(); + console.log( + `LLM Token Budget — ${provider.name} (${provider.model}) — target: ${tokenBudget} tokens`, + ); + + const tbCols = { name: 24, method: 14, tokens: 7, fits: 5, rw: 4, chr: 6, rt: 5, time: 10 }; + const tbHeader = [ + 'Scenario'.padEnd(tbCols.name), + 'Method'.padStart(tbCols.method), + 'Tokens'.padStart(tbCols.tokens), + 'Fits'.padStart(tbCols.fits), + 'Rw'.padStart(tbCols.rw), + 'ChR'.padStart(tbCols.chr), + 'R/T'.padStart(tbCols.rt), + 'Time'.padStart(tbCols.time), + ].join(' '); + const tbSep = '-'.repeat(tbHeader.length); + + console.log(tbSep); + console.log(tbHeader); + console.log(tbSep); + + llmResult.tokenBudget = {}; + + for (const scenario of budgetScenarios) { + const entries: 
LlmTokenBudgetResult[] = []; + + try { + // Deterministic with token budget + const t0d = performance.now(); + const detCr = compress(scenario.messages, { tokenBudget }); + const t1d = performance.now(); + const detRt = roundTrip(scenario.messages, detCr); + + const detEntry: LlmTokenBudgetResult = { + budget: tokenBudget, + method: 'deterministic', + tokenCount: detCr.tokenCount ?? 0, + fits: detCr.fits ?? false, + ratio: detCr.compression.ratio, + recencyWindow: detCr.recencyWindow, + roundTrip: detRt, + timeMs: t1d - t0d, + }; + entries.push(detEntry); + printBudgetRow(scenario.name, detEntry, tbCols); + + // LLM escalating with token budget + const t0e = performance.now(); + const llmCr = await compress(scenario.messages, { + tokenBudget, + summarizer: escalatingSummarizer, + }); + const t1e = performance.now(); + const llmRt = roundTrip(scenario.messages, llmCr); + + const llmEntry: LlmTokenBudgetResult = { + budget: tokenBudget, + method: 'llm-escalate', + tokenCount: llmCr.tokenCount ?? 0, + fits: llmCr.fits ?? 
false, + ratio: llmCr.compression.ratio, + recencyWindow: llmCr.recencyWindow, + roundTrip: llmRt, + timeMs: t1e - t0e, + }; + entries.push(llmEntry); + printBudgetRow('', llmEntry, tbCols); + + console.log(tbSep); + } catch (err) { + console.error(` ${scenario.name}: ERROR — ${(err as Error).message}`); + console.log(tbSep); + } + + if (entries.length > 0) { + llmResult.tokenBudget[scenario.name] = entries; + } + } + } + + // Always save LLM results (informational, not gated behind --save) + saveLlmResult(baselinesDir, llmResult); + console.log(` Results saved to bench/baselines/llm/`); + if (llmFails > 0) { console.error(` WARNING: ${llmFails} LLM scenario(s) failed round-trip`); } @@ -1026,11 +1494,13 @@ function printLlmRow( cr: CompressResult, rt: string, timeMs: number, + vsDet: number | undefined, cols: { name: number; method: number; chr: number; tkr: number; + vsDet: number; comp: number; pres: number; rt: number; @@ -1043,6 +1513,7 @@ function printLlmRow( method.padStart(cols.method), cr.compression.ratio.toFixed(2).padStart(cols.chr), cr.compression.token_ratio.toFixed(2).padStart(cols.tkr), + (vsDet != null ? vsDet.toFixed(2) : '-').padStart(cols.vsDet), String(cr.compression.messages_compressed).padStart(cols.comp), String(cr.compression.messages_preserved).padStart(cols.pres), rt.padStart(cols.rt), @@ -1053,6 +1524,37 @@ function printLlmRow( ); } +function printBudgetRow( + name: string, + entry: LlmTokenBudgetResult, + cols: { + name: number; + method: number; + tokens: number; + fits: number; + rw: number; + chr: number; + rt: number; + time: number; + }, +): void { + console.log( + [ + name.padEnd(cols.name), + entry.method.padStart(cols.method), + String(entry.tokenCount).padStart(cols.tokens), + String(entry.fits).padStart(cols.fits), + String(entry.recencyWindow ?? '-').padStart(cols.rw), + entry.ratio.toFixed(2).padStart(cols.chr), + entry.roundTrip.padStart(cols.rt), + (entry.timeMs < 1000 + ? 
entry.timeMs.toFixed(0) + 'ms' + : (entry.timeMs / 1000).toFixed(1) + 's' + ).padStart(cols.time), + ].join(' '), + ); +} + // --------------------------------------------------------------------------- // Real session support — convert Claude Code JSONL transcripts to Message[] // --------------------------------------------------------------------------- diff --git a/docs/README.md b/docs/README.md index 658c442..e5f246d 100644 --- a/docs/README.md +++ b/docs/README.md @@ -13,3 +13,4 @@ | [Provenance](provenance.md) | `_cce_original` metadata, summary_id, parent_ids | | [Preservation Rules](preservation-rules.md) | What gets preserved, classification tiers, code-aware splitting | | [Benchmarks](benchmarks.md) | Running benchmarks, LLM comparison, interpreting results | +| [Benchmark Results](benchmark-results.md) | Auto-generated results with charts (regenerated by bench:save) | diff --git a/docs/api-reference.md b/docs/api-reference.md index 7fd7843..9f5973b 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -174,7 +174,7 @@ function defaultTokenCounter(msg: Message): number; Math.ceil(msg.content.length / 3.5); ``` -Approximates ~3.5 characters per token. Suitable for rough estimates. For accurate budgeting, replace with a real tokenizer. See [Token budget](token-budget.md). +The divisor of 3.5 chars/token reflects GPT-family BPE tokenizers (cl100k_base, o200k_base) on mixed English text, where the empirical range is roughly 3.2–4.5 chars/token; 3.5 deliberately sits toward the lower end so budget estimates stay conservative — over-counting tokens is safer than under-counting. For accurate budgeting, replace with a real tokenizer. See [Token budget](token-budget.md). 
--- diff --git a/docs/benchmark-results.md b/docs/benchmark-results.md new file mode 100644 index 0000000..8e54c61 --- /dev/null +++ b/docs/benchmark-results.md @@ -0,0 +1,271 @@ +# Benchmark Results + +[Back to README](../README.md) | [All docs](README.md) | [Handbook](benchmarks.md) + +*Auto-generated by `npm run bench:save`. Do not edit manually.* + +**v1.0.0** · Generated: 2026-02-26 + +![avg ratio](https://img.shields.io/badge/avg%20ratio-2.08x-blue) ![best](https://img.shields.io/badge/best-6.16x-blue) ![scenarios](https://img.shields.io/badge/scenarios-8-blue) ![round-trip](https://img.shields.io/badge/round--trip-all_PASS-brightgreen) ![gzip](https://img.shields.io/badge/gzip-16.6%20KB-blue) + +## Summary + +| Metric | Value | +| --- | --- | +| Scenarios | 8 | +| Average compression | 2.08x | +| Best compression | 6.16x | +| Round-trip integrity | all PASS | + +```mermaid +pie title "Message Outcomes" + "Preserved" : 90 + "Compressed" : 65 +``` + +## Compression by Scenario + +> **8 scenarios** · **2.08x** avg ratio · **1.00x** – **6.16x** range · all round-trips PASS + +```mermaid +xychart-beta + title "Compression Ratio by Scenario" + x-axis ["Coding", "Long Q&A", "Tool-heavy", "Short", "Deep", "Technical", "Structured", "Agentic"] + y-axis "Char Ratio" + bar [1.68, 6.16, 1.30, 1.00, 2.12, 1.00, 1.93, 1.43] +``` + +| Scenario | Ratio | Reduction | Token Ratio | Messages | Compressed | Preserved | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | +| Coding assistant | 1.68 | 41% | 1.67 | 13 | 5 | 8 | +| Long Q&A | 6.16 | 84% | 6.11 | 10 | 4 | 6 | +| Tool-heavy | 1.30 | 23% | 1.29 | 18 | 2 | 16 | +| Short conversation | 1.00 | 0% | 1.00 | 7 | 0 | 7 | +| Deep conversation | 2.12 | 53% | 2.12 | 51 | 50 | 1 | +| Technical explanation | 1.00 | 0% | 1.00 | 11 | 0 | 11 | +| Structured content | 1.93 | 48% | 1.92 | 12 | 2 | 10 | +| Agentic coding session | 1.43 | 30% | 1.43 | 33 | 2 | 31 | + +## Deduplication Impact + +```mermaid +xychart-beta + title 
"Deduplication Impact (recencyWindow=0)" + x-axis ["Long Q&A", "Agentic"] + y-axis "Char Ratio" + bar [5.14, 1.14] + bar [6.16, 1.43] +``` + +*First bar: no dedup · Second bar: with dedup* + +| Scenario | No Dedup (rw=0) | Dedup (rw=0) | No Dedup (rw=4) | Dedup (rw=4) | Deduped | +| --- | ---: | ---: | ---: | ---: | ---: | +| Coding assistant | 1.68 | 1.68 | 1.51 | 1.51 | 0 | +| Long Q&A | 5.14 | 6.16 | 1.90 | 2.03 | 1 | +| Tool-heavy | 1.30 | 1.30 | 1.30 | 1.30 | 0 | +| Short conversation | 1.00 | 1.00 | 1.00 | 1.00 | 0 | +| Deep conversation | 2.12 | 2.12 | 1.95 | 1.95 | 0 | +| Technical explanation | 1.00 | 1.00 | 1.00 | 1.00 | 0 | +| Structured content | 1.93 | 1.93 | 1.37 | 1.37 | 0 | +| Agentic coding session | 1.14 | 1.43 | 1.14 | 1.43 | 4 | + +### Fuzzy Dedup + +| Scenario | Exact Deduped | Fuzzy Deduped | Ratio | vs Base | +| --- | ---: | ---: | ---: | ---: | +| Coding assistant | 0 | 0 | 1.68 | - | +| Long Q&A | 1 | 0 | 6.16 | - | +| Tool-heavy | 0 | 0 | 1.30 | - | +| Short conversation | 0 | 0 | 1.00 | - | +| Deep conversation | 0 | 0 | 2.12 | - | +| Technical explanation | 0 | 0 | 1.00 | - | +| Structured content | 0 | 0 | 1.93 | - | +| Agentic coding session | 4 | 2 | 2.23 | +56% | + +## Token Budget + +Target: **2000 tokens** · 1/4 fit + +| Scenario | Dedup | Tokens | Fits | recencyWindow | Compressed | Preserved | Deduped | +| --- | --- | ---: | --- | ---: | ---: | ---: | ---: | +| Deep conversation | no | 3738 | no | 0 | 50 | 1 | 0 | +| Deep conversation | yes | 3738 | no | 0 | 50 | 1 | 0 | +| Agentic coding session | no | 2345 | no | 0 | 4 | 33 | 0 | +| Agentic coding session | yes | 1957 | yes | 9 | 1 | 32 | 4 | + +## Bundle Size + +> Zero-dependency ESM library — tracked per-file to catch regressions. 
+ +| File | Size | Gzip | +| --- | ---: | ---: | +| classify.js | 7.5 KB | 3.2 KB | +| compress.js | 33.1 KB | 8.5 KB | +| dedup.js | 10.0 KB | 2.8 KB | +| expand.js | 2.7 KB | 934 B | +| index.js | 225 B | 159 B | +| summarizer.js | 2.5 KB | 993 B | +| types.js | 11 B | 31 B | +| **total** | 56.2 KB | 16.6 KB | + +## LLM vs Deterministic + +> Results are **non-deterministic** — LLM outputs vary between runs. Saved as reference data, not used for regression testing. + +``` +Deterministic vs ollama/llama3.2 + +Coding assistant Det ████████░░░░░░░░░░░░░░░░░░░░░░ 1.68x + LLM ████████░░░░░░░░░░░░░░░░░░░░░░ 1.55x + +Long Q&A Det ██████████████████████████████ 6.16x + LLM ██████████████████████░░░░░░░░ 4.49x + +Tool-heavy Det ██████░░░░░░░░░░░░░░░░░░░░░░░░ 1.30x + LLM ██████░░░░░░░░░░░░░░░░░░░░░░░░ 1.28x + +Deep conversation Det ██████████░░░░░░░░░░░░░░░░░░░░ 2.12x + LLM ████████████████░░░░░░░░░░░░░░ 3.28x ★ + +Technical explanation Det █████░░░░░░░░░░░░░░░░░░░░░░░░░ 1.00x + LLM █████░░░░░░░░░░░░░░░░░░░░░░░░░ 1.00x + +Structured content Det █████████░░░░░░░░░░░░░░░░░░░░░ 1.93x + LLM ███████░░░░░░░░░░░░░░░░░░░░░░░ 1.46x + +Agentic coding session Det ███████░░░░░░░░░░░░░░░░░░░░░░░ 1.43x + LLM ███████░░░░░░░░░░░░░░░░░░░░░░░ 1.40x + +★ = LLM wins +``` + +``` +Deterministic vs openai/gpt-4.1-mini + +Coding assistant Det ████████░░░░░░░░░░░░░░░░░░░░░░ 1.68x + LLM ████████░░░░░░░░░░░░░░░░░░░░░░ 1.64x + +Long Q&A Det ██████████████████████████████ 6.16x + LLM ██████████████████████████░░░░ 5.37x + +Tool-heavy Det ██████░░░░░░░░░░░░░░░░░░░░░░░░ 1.30x + LLM █████░░░░░░░░░░░░░░░░░░░░░░░░░ 1.12x + +Deep conversation Det ██████████░░░░░░░░░░░░░░░░░░░░ 2.12x + LLM ████████████░░░░░░░░░░░░░░░░░░ 2.37x ★ + +Technical explanation Det █████░░░░░░░░░░░░░░░░░░░░░░░░░ 1.00x + LLM █████░░░░░░░░░░░░░░░░░░░░░░░░░ 1.00x + +Structured content Det █████████░░░░░░░░░░░░░░░░░░░░░ 1.93x + LLM ██████░░░░░░░░░░░░░░░░░░░░░░░░ 1.29x + +Agentic coding session Det ███████░░░░░░░░░░░░░░░░░░░░░░░ 1.43x + 
LLM ███████░░░░░░░░░░░░░░░░░░░░░░░ 1.43x + +★ = LLM wins +``` + +### Provider Summary + +| Provider | Model | Avg Ratio | Avg vsDet | Round-trip | Budget Fits | Avg Time | +| --- | --- | ---: | ---: | --- | --- | ---: | +| ollama | llama3.2 | 2.09x | 0.96 | all PASS | 1/4 | 4.2s | +| openai | gpt-4.1-mini | 2.09x | 0.92 | all PASS | 2/4 | 8.1s | + +> **Key findings:** +> LLM wins on prose-heavy scenarios: Deep conversation, Technical explanation +> Deterministic wins on structured/technical content: Coding assistant, Long Q&A, Tool-heavy, Structured content + +### ollama (llama3.2) + +*Generated: 2026-02-25* + +
+Scenario details + +| Scenario | Method | Char Ratio | Token Ratio | vsDet | Compressed | Preserved | Round-trip | Time | +| --- | --- | ---: | ---: | ---: | ---: | ---: | --- | ---: | +| Coding assistant | deterministic | 1.68 | 1.67 | - | 5 | 8 | PASS | 0ms | +| | llm-basic | 1.48 | 1.48 | 0.88 | 5 | 8 | PASS | 5.9s | +| | llm-escalate | 1.55 | 1.55 | 0.92 | 5 | 8 | PASS | 3.0s | +| Long Q&A | deterministic | 6.16 | 6.11 | - | 4 | 6 | PASS | 1ms | +| | llm-basic | 4.31 | 4.28 | 0.70 | 4 | 6 | PASS | 4.1s | +| | llm-escalate | 4.49 | 4.46 | 0.73 | 4 | 6 | PASS | 3.7s | +| Tool-heavy | deterministic | 1.30 | 1.29 | - | 2 | 16 | PASS | 2ms | +| | llm-basic | 1.12 | 1.11 | 0.86 | 2 | 16 | PASS | 2.3s | +| | llm-escalate | 1.28 | 1.28 | 0.99 | 2 | 16 | PASS | 2.8s | +| Deep conversation | deterministic | 2.12 | 2.12 | - | 50 | 1 | PASS | 3ms | +| | llm-basic | 3.12 | 3.11 | 1.47 | 50 | 1 | PASS | 22.7s | +| | llm-escalate | 3.28 | 3.26 | 1.54 | 50 | 1 | PASS | 23.3s | +| Technical explanation | deterministic | 1.00 | 1.00 | - | 0 | 11 | PASS | 1ms | +| | llm-basic | 1.00 | 1.00 | 1.00 | 0 | 11 | PASS | 3.2s | +| | llm-escalate | 1.00 | 1.00 | 1.00 | 2 | 9 | PASS | 785ms | +| Structured content | deterministic | 1.93 | 1.92 | - | 2 | 10 | PASS | 0ms | +| | llm-basic | 1.46 | 1.45 | 0.75 | 2 | 10 | PASS | 3.5s | +| | llm-escalate | 1.38 | 1.38 | 0.71 | 2 | 10 | PASS | 3.7s | +| Agentic coding session | deterministic | 1.43 | 1.43 | - | 2 | 31 | PASS | 1ms | +| | llm-basic | 1.35 | 1.34 | 0.94 | 2 | 31 | PASS | 3.3s | +| | llm-escalate | 1.40 | 1.40 | 0.98 | 2 | 31 | PASS | 5.4s | + +#### Token Budget (target: 2000 tokens) + +| Scenario | Method | Tokens | Fits | recencyWindow | Ratio | Round-trip | Time | +| --- | --- | ---: | --- | ---: | ---: | --- | ---: | +| Deep conversation | deterministic | 3738 | false | 0 | 2.12 | PASS | 12ms | +| | llm-escalate | 2593 | false | 0 | 3.08 | PASS | 132.0s | +| Agentic coding session | deterministic | 1957 | true | 9 | 1.36 | 
PASS | 2ms | +| | llm-escalate | 2003 | false | 9 | 1.33 | PASS | 4.1s | + +
+ +### openai (gpt-4.1-mini) + +*Generated: 2026-02-25* + +
+Scenario details + +| Scenario | Method | Char Ratio | Token Ratio | vsDet | Compressed | Preserved | Round-trip | Time | +| --- | --- | ---: | ---: | ---: | ---: | ---: | --- | ---: | +| Coding assistant | deterministic | 1.68 | 1.67 | - | 5 | 8 | PASS | 0ms | +| | llm-basic | 1.64 | 1.63 | 0.98 | 5 | 8 | PASS | 5.6s | +| | llm-escalate | 1.63 | 1.63 | 0.97 | 5 | 8 | PASS | 6.0s | +| Long Q&A | deterministic | 6.16 | 6.11 | - | 4 | 6 | PASS | 1ms | +| | llm-basic | 5.37 | 5.33 | 0.87 | 4 | 6 | PASS | 5.9s | +| | llm-escalate | 5.35 | 5.31 | 0.87 | 4 | 6 | PASS | 7.0s | +| Tool-heavy | deterministic | 1.30 | 1.29 | - | 2 | 16 | PASS | 0ms | +| | llm-basic | 1.11 | 1.10 | 0.85 | 2 | 16 | PASS | 3.5s | +| | llm-escalate | 1.12 | 1.12 | 0.86 | 2 | 16 | PASS | 5.3s | +| Deep conversation | deterministic | 2.12 | 2.12 | - | 50 | 1 | PASS | 3ms | +| | llm-basic | 2.34 | 2.33 | 1.10 | 50 | 1 | PASS | 50.4s | +| | llm-escalate | 2.37 | 2.36 | 1.11 | 50 | 1 | PASS | 50.8s | +| Technical explanation | deterministic | 1.00 | 1.00 | - | 0 | 11 | PASS | 1ms | +| | llm-basic | 1.00 | 1.00 | 1.00 | 1 | 10 | PASS | 2.6s | +| | llm-escalate | 1.00 | 1.00 | 1.00 | 1 | 10 | PASS | 3.3s | +| Structured content | deterministic | 1.93 | 1.92 | - | 2 | 10 | PASS | 0ms | +| | llm-basic | 1.23 | 1.23 | 0.64 | 2 | 10 | PASS | 10.2s | +| | llm-escalate | 1.29 | 1.29 | 0.67 | 2 | 10 | PASS | 4.8s | +| Agentic coding session | deterministic | 1.43 | 1.43 | - | 2 | 31 | PASS | 1ms | +| | llm-basic | 1.43 | 1.43 | 1.00 | 2 | 31 | PASS | 5.8s | +| | llm-escalate | 1.32 | 1.32 | 0.93 | 1 | 32 | PASS | 9.5s | + +#### Token Budget (target: 2000 tokens) + +| Scenario | Method | Tokens | Fits | recencyWindow | Ratio | Round-trip | Time | +| --- | --- | ---: | --- | ---: | ---: | --- | ---: | +| Deep conversation | deterministic | 3738 | false | 0 | 2.12 | PASS | 10ms | +| | llm-escalate | 3391 | false | 0 | 2.35 | PASS | 280.5s | +| Agentic coding session | deterministic | 1957 | true | 9 | 1.36 | 
PASS | 2ms | +| | llm-escalate | 1915 | true | 3 | 1.39 | PASS | 28.1s | + +
+ +## Methodology + +- All deterministic results use the same input → same output guarantee +- Metrics: compression ratio, token ratio, message counts, dedup counts +- Timing is excluded from baselines (hardware-dependent) +- LLM benchmarks are saved as reference data, not used for regression testing +- Round-trip integrity is verified for every scenario (compress then uncompress) diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 4111308..eca3acb 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -1,151 +1,85 @@ # Benchmarks -[Back to README](../README.md) | [All docs](README.md) +[Back to README](../README.md) | [All docs](README.md) | [Latest Results](benchmark-results.md) -Running benchmarks, interpreting results, and comparing compression methods. - -## Running tests - -```bash -# Run the test suite (333 tests) -npm test - -# Type check -npx tsc --noEmit -``` - -## Deterministic benchmarks - -No API keys needed. Runs entirely locally: +## Running Benchmarks ```bash -npm run bench +npm run bench # Run benchmarks (no baseline check) +npm run bench:check # Run and compare against baseline +npm run bench:save # Run, save new baseline, regenerate results doc +npm run bench:llm # Run with LLM summarization benchmarks ``` -### Scenarios - -The benchmark covers 7 conversation types: +### LLM benchmarks (opt-in) -| Scenario | Description | -| ---------------------- | -------------------------------------------------------- | -| Coding assistant | Mixed code fences and prose discussion | -| Long Q&A | Extended question-and-answer with detailed explanations | -| Tool-heavy | Messages with `tool_calls` arrays (preserved by default) | -| Short conversation | Brief exchanges, mostly under 120 chars | -| Deep conversation | Long, multi-paragraph prose exchanges | -| Structured content | JSON, YAML, SQL, test output | -| Agentic coding session | Repeated file reads, grep results, test runs | +LLM benchmarks require the `--llm` flag (`npm run bench:llm`). 
Set API keys in a `.env` file or export them. Ollama is auto-detected when running locally. -### What gets measured +| Variable | Provider | Default Model | Notes | +| --- | --- | --- | --- | +| `OPENAI_API_KEY` | OpenAI | `gpt-4.1-mini` | Override model: `OPENAI_MODEL` | +| `ANTHROPIC_API_KEY` | Anthropic | `claude-haiku-4-5-20251001` | Override model: `ANTHROPIC_MODEL` | +| *(none required)* | Ollama | `llama3.2` | Auto-detected on localhost:11434; override: `OLLAMA_HOST` / `OLLAMA_MODEL` | -For each scenario: +## Scenarios -- **Characters**: original vs. compressed character counts -- **Compression ratio**: `original_chars / compressed_chars` (>1 = savings) -- **Token ratio**: `original_tokens / compressed_tokens` -- **Messages compressed**: how many messages were summarized -- **Messages preserved**: how many were kept as-is -- **Messages deduped**: exact duplicates replaced (agentic scenario) -- **Timing**: milliseconds per compression +The benchmark covers 8 conversation types: -Additional benchmark sections: +| Scenario | Description | +| --- | --- | +| Coding assistant | Mixed code fences and prose discussion | +| Long Q&A | Extended question-and-answer with repeated paragraphs | +| Tool-heavy | Messages with `tool_calls` arrays (preserved by default) | +| Short conversation | Brief exchanges, mostly under 120 chars | +| Deep conversation | 25 turns of multi-paragraph prose | +| Technical explanation | Pure prose Q&A about event-driven architecture | +| Structured content | JSON, YAML, SQL, API keys, test output | +| Agentic coding session | Repeated file reads, grep results, near-duplicate edits | -- **Token budget optimization** with and without dedup -- **Fuzzy dedup accuracy** across thresholds -- **Real-session compression** on actual Claude Code transcripts (if `~/.claude/projects/` exists) - -### Real-session benchmarks - -The benchmark automatically scans for real Claude Code conversation files in `~/.claude/projects/`. It parses JSONL conversation files, extracts message arrays, and runs compression on actual production data. 
- -This provides the most realistic performance numbers since synthetic scenarios can't capture the full diversity of real conversations. - -## LLM benchmarks - -Compare deterministic compression against real LLM-powered summarization. Set one or more environment variables to enable: - -| Variable | Provider | Default model | -| ------------------- | --------- | --------------------------------------------------------- | -| `OPENAI_API_KEY` | OpenAI | `gpt-4.1-mini` (override: `OPENAI_MODEL`) | -| `ANTHROPIC_API_KEY` | Anthropic | `claude-haiku-4-5-20251001` (override: `ANTHROPIC_MODEL`) | -| `OLLAMA_MODEL` | Ollama | `llama3.2` (host override: `OLLAMA_HOST`) | - -```bash -# Run with OpenAI -OPENAI_API_KEY=sk-... npm run bench - -# Run with Ollama (local) -OLLAMA_MODEL=llama3.2 npm run bench - -# Run with multiple providers -OPENAI_API_KEY=sk-... ANTHROPIC_API_KEY=sk-ant-... npm run bench -``` - -### Three methods compared - -Each scenario runs three methods side-by-side: - -| Method | Description | -| --------------- | -------------------------------------------------------------------- | -| `deterministic` | No LLM, pure sentence scoring + entity extraction | -| `llm-basic` | `createSummarizer` with the detected provider | -| `llm-escalate` | `createEscalatingSummarizer` (normal -> aggressive -> deterministic) | - -All methods verify round-trip integrity — `uncompress()` is called to confirm originals are restored. 
- -### What to look for - -- **Ratio comparison** — deterministic often beats LLM on compression ratio because LLMs write fuller, more helpful summaries -- **Latency** — deterministic is < 2ms; LLM adds network round-trip time per message -- **Fallback rate** — how often the engine rejects LLM output and falls back to deterministic -- **Round-trip integrity** — all methods must pass (no data loss) - -### SDK requirements - -LLM providers require their SDKs: - -- OpenAI: `openai` package -- Anthropic: `@anthropic-ai/sdk` package -- Ollama: `openai` package (uses OpenAI-compatible API) - -Missing SDKs are detected at runtime and print a skip message — no crash, no hard dependency. - -## Interpreting results +## Interpreting Results ### Compression ratio -- `1.0` = no compression (all messages preserved) -- `1.5` = 33% reduction -- `2.0` = 50% reduction -- `3.0` = 67% reduction -- `6.0` = 83% reduction +| Ratio | Reduction | +| ---: | --- | +| 1.0x | no compression (all messages preserved) | +| 1.5x | 33% reduction | +| 2.0x | 50% reduction | +| 3.0x | 67% reduction | +| 6.0x | 83% reduction | + +Higher is better. Token ratio is more meaningful for LLM context budgeting; character ratio is useful for storage. -Higher is better. The deterministic engine typically achieves 1.3-6.1x on synthetic scenarios. +### Deduplication -### Token ratio vs. character ratio +Dedup effectiveness is measured across two axes: -Token ratio is more meaningful for LLM context budgeting since tokens are what models count. Character ratio is useful for storage optimization. +- **recencyWindow=0** vs **recencyWindow=4** — how much compression improves when recent messages are protected +- **With dedup** vs **without** — the marginal gain from exact + fuzzy duplicate detection -### When LLM wins +Scenarios with repeated content (Long Q&A, Agentic coding session) show the largest dedup gains. Scenarios with unique messages show no difference. 
-LLM summarization can outperform deterministic in: +### LLM vs deterministic -- Very long prose-heavy conversations where paraphrasing and concept merging genuinely helps -- Domain-specific content where the LLM understands what's important +The `vsDet` column shows LLM compression relative to deterministic: -### When deterministic wins +- **vsDet > 1.0** — LLM achieves better compression (common for long prose) +- **vsDet < 1.0** — deterministic wins (common for structured/technical content) +- **vsDet = 1.0** — no difference (content is already optimal or fully preserved) -Deterministic typically wins when: +## Regression Testing -- Messages contain mixed code and prose (code-aware splitting is already optimal) -- Messages are structured (test output, grep results) -- The LLM writes helpful but verbose summaries +Baselines are stored in [`bench/baselines/`](../bench/baselines/) as JSON. CI runs `npm run bench:check` on every push and PR to catch regressions. ---- +- **Tolerance:** 0% by default (all metrics are deterministic) +- **On regression:** CI fails with a diff showing which metrics changed +- **After intentional changes:** run `npm run bench:save` to update the baseline and regenerate the results doc +- **Custom tolerance:** `npx tsx bench/run.ts --check --tolerance 5` allows 5% deviation -## See also +### Baseline files -- [Compression pipeline](compression-pipeline.md) - the deterministic algorithm -- [LLM integration](llm-integration.md) - setting up providers for benchmarks -- [Token budget](token-budget.md) - budget optimization -- [Deduplication](deduplication.md) - dedup in benchmarks +| File | Purpose | +| --- | --- | +| `bench/baselines/current.json` | Active baseline compared in CI | +| `bench/baselines/history/v*.json` | Versioned snapshots, one per release | +| `bench/baselines/llm/*.json` | LLM benchmark reference data (non-deterministic) | diff --git a/docs/token-budget.md b/docs/token-budget.md index cb1a9f4..c1fabe2 100644 --- 
a/docs/token-budget.md +++ b/docs/token-budget.md @@ -49,7 +49,7 @@ function defaultTokenCounter(msg: Message): number { } ``` -~3.5 characters per token is a rough heuristic. It's fast and works for ballpark estimates, but real tokenizers vary: +~3.5 characters per token is derived from empirical measurements of GPT-family BPE tokenizers (cl100k_base, o200k_base) on mixed English text. We sit toward the lower end of the observed range so estimates are conservative — slightly over-counting tokens is safer than under-counting and blowing the budget. It's fast and works for ballpark estimates, but real tokenizers vary: | Tokenizer | Typical chars/token | | --------- | ------------------- | diff --git a/package.json b/package.json index 9b33f2f..f581ee3 100644 --- a/package.json +++ b/package.json @@ -14,6 +14,7 @@ "format": "prettier --write .", "format:check": "prettier --check .", "bench": "npx tsx bench/run.ts", + "bench:llm": "npx tsx bench/run.ts --llm", "bench:save": "npx tsx bench/run.ts --save", "bench:check": "npx tsx bench/run.ts --check", "test:e2e": "npm run build && npm pack && npm run test:e2e:lint && npm run test:e2e:smoke; EXIT=$?; npm run test:e2e:cleanup; exit $EXIT", diff --git a/src/compress.ts b/src/compress.ts index 68e2641..b77b72c 100644 --- a/src/compress.ts +++ b/src/compress.ts @@ -418,7 +418,16 @@ function contentLength(msg: Message): number { return typeof msg.content === 'string' ? msg.content.length : 0; } -/** Default token counter: ~3.5 chars/token heuristic. */ +/** + * Default token counter: ~3.5 chars/token heuristic. + * + * The 3.5 ratio sits toward the lower end of the empirical range for + * GPT-family BPE tokenizers (cl100k_base, o200k_base) on mixed English text. + * Real-world values range from ~3.2 (code-heavy) to ~4.5 (plain prose). + * Choosing a value near the low end keeps budget estimates conservative + * (slightly over-counting tokens is safer than under-counting). 
Users who need exact counts can + * supply a real tokenizer via the `tokenCounter` option. + */ export function defaultTokenCounter(msg: Message): number { return Math.ceil(contentLength(msg) / 3.5); } @@ -614,190 +623,10 @@ function computeStats( } // --------------------------------------------------------------------------- -// Sync compression (internal) +// Unified compression core (generator + sync/async runners) // --------------------------------------------------------------------------- -function compressSync(messages: Message[], options: CompressOptions = {}): CompressResult { - const sourceVersion = options.sourceVersion ?? 0; - const counter = options.tokenCounter ?? defaultTokenCounter; - - if (messages.length === 0) { - return { - messages: [], - compression: { - original_version: sourceVersion, - ratio: 1, - token_ratio: 1, - messages_compressed: 0, - messages_preserved: 0, - }, - verbatim: {}, - }; - } - - const preserveRoles = new Set(options.preserve ?? ['system']); - const recencyWindow = options.recencyWindow ?? 4; - const recencyStart = Math.max(0, messages.length - (recencyWindow > 0 ? recencyWindow : 0)); - let dedupAnnotations = - (options.dedup ?? true) ? analyzeDuplicates(messages, recencyStart, preserveRoles) : undefined; - - if (options.fuzzyDedup) { - const fuzzyAnnotations = analyzeFuzzyDuplicates( - messages, - recencyStart, - preserveRoles, - dedupAnnotations ?? new Map(), - options.fuzzyThreshold ?? 
0.85, - ); - if (fuzzyAnnotations.size > 0) { - if (!dedupAnnotations) dedupAnnotations = new Map(); - for (const [idx, ann] of fuzzyAnnotations) { - dedupAnnotations.set(idx, ann); - } - } - } - - const classified = classifyAll(messages, preserveRoles, recencyWindow, dedupAnnotations); - - const result: Message[] = []; - const verbatim: Record = {}; - let messagesCompressed = 0; - let messagesPreserved = 0; - let messagesDeduped = 0; - let messagesFuzzyDeduped = 0; - let i = 0; - - while (i < classified.length) { - const { msg, preserved } = classified[i]; - - if (preserved) { - result.push(msg); - messagesPreserved++; - i++; - continue; - } - - // Dedup: replace earlier duplicate/near-duplicate with compact reference - if (classified[i].dedup) { - const annotation = classified[i].dedup!; - const keepTargetId = messages[annotation.duplicateOfIndex].id; - const tag = - annotation.similarity != null - ? `[cce:near-dup of ${keepTargetId} — ${annotation.contentLength} chars, ~${Math.round(annotation.similarity * 100)}% match]` - : `[cce:dup of ${keepTargetId} — ${annotation.contentLength} chars]`; - result.push(buildCompressedMessage(msg, [msg.id], tag, sourceVersion, verbatim, [msg])); - if (annotation.similarity != null) { - messagesFuzzyDeduped++; - } else { - messagesDeduped++; - } - i++; - continue; - } - - // Code-split: extract fences verbatim, summarize surrounding prose - if (classified[i].codeSplit) { - const content = typeof msg.content === 'string' ? msg.content : ''; - const segments = splitCodeAndProse(content); - const proseText = segments - .filter((s) => s.type === 'prose') - .map((s) => s.content) - .join(' '); - const codeFences = segments.filter((s) => s.type === 'code').map((s) => s.content); - const proseBudget = proseText.length < 600 ? 200 : 400; - const summaryText = summarize(proseText, proseBudget); - const embeddedId = options.embedSummaryId ? 
makeSummaryId([msg.id]) : undefined; - const compressed = `${formatSummary(summaryText, proseText, undefined, true, embeddedId)}\n\n${codeFences.join('\n\n')}`; - - if (compressed.length >= content.length) { - result.push(msg); - messagesPreserved++; - i++; - continue; - } - - result.push( - buildCompressedMessage(msg, [msg.id], compressed, sourceVersion, verbatim, [msg]), - ); - messagesCompressed++; - i++; - continue; - } - - // Collect consecutive non-preserved messages with the SAME role - const { group, nextIdx } = collectGroup(classified, i); - i = nextIdx; - - const allContent = group - .map((g) => (typeof g.msg.content === 'string' ? g.msg.content : '')) - .join(' '); - const contentBudget = allContent.length < 600 ? 200 : 400; - const summaryText = isStructuredOutput(allContent) - ? summarizeStructured(allContent, contentBudget) - : summarize(allContent, contentBudget); - - if (group.length > 1) { - const mergeIds = group.map((g) => g.msg.id); - const embeddedId = options.embedSummaryId ? makeSummaryId(mergeIds) : undefined; - let summary = formatSummary(summaryText, allContent, group.length, undefined, embeddedId); - const combinedLength = group.reduce((sum, g) => sum + contentLength(g.msg), 0); - if (summary.length >= combinedLength) { - summary = formatSummary(summaryText, allContent, group.length, true, embeddedId); - } - - if (summary.length >= combinedLength) { - for (const g of group) { - result.push(g.msg); - messagesPreserved++; - } - } else { - const sourceMsgs = group.map((g) => g.msg); - const base: Message = { ...sourceMsgs[0] }; - result.push( - buildCompressedMessage(base, mergeIds, summary, sourceVersion, verbatim, sourceMsgs), - ); - messagesCompressed += group.length; - } - } else { - const single = group[0].msg; - const content = typeof single.content === 'string' ? single.content : ''; - const embeddedId = options.embedSummaryId ? 
makeSummaryId([single.id]) : undefined; - let summary = formatSummary(summaryText, allContent, undefined, undefined, embeddedId); - if (summary.length >= content.length) { - summary = formatSummary(summaryText, allContent, undefined, true, embeddedId); - } - - if (summary.length >= content.length) { - result.push(single); - messagesPreserved++; - } else { - result.push( - buildCompressedMessage(single, [single.id], summary, sourceVersion, verbatim, [single]), - ); - messagesCompressed++; - } - } - } - - return { - messages: result, - compression: computeStats( - messages, - result, - messagesCompressed, - messagesPreserved, - sourceVersion, - counter, - messagesDeduped, - messagesFuzzyDeduped, - ), - verbatim, - }; -} - -// --------------------------------------------------------------------------- -// Async compression (internal, LLM summarizer support) -// --------------------------------------------------------------------------- +type SummarizeRequest = { text: string; budget: number }; async function withFallback( text: string, @@ -816,13 +645,12 @@ async function withFallback( return summarize(text, maxBudget); } -async function compressAsync( +function* compressGen( messages: Message[], options: CompressOptions = {}, -): Promise { +): Generator { const sourceVersion = options.sourceVersion ?? 0; const counter = options.tokenCounter ?? defaultTokenCounter; - const userSummarizer = options.summarizer; if (messages.length === 0) { return { @@ -908,7 +736,7 @@ async function compressAsync( .join(' '); const codeFences = segments.filter((s) => s.type === 'code').map((s) => s.content); const proseBudget = proseText.length < 600 ? 200 : 400; - const summaryText = await withFallback(proseText, userSummarizer, proseBudget); + const summaryText: string = yield { text: proseText, budget: proseBudget }; const embeddedId = options.embedSummaryId ? 
makeSummaryId([msg.id]) : undefined; const compressed = `${formatSummary(summaryText, proseText, undefined, true, embeddedId)}\n\n${codeFences.join('\n\n')}`; @@ -937,7 +765,7 @@ async function compressAsync( const contentBudget = allContent.length < 600 ? 200 : 400; const summaryText = isStructuredOutput(allContent) ? summarizeStructured(allContent, contentBudget) - : await withFallback(allContent, userSummarizer, contentBudget); + : yield { text: allContent, budget: contentBudget }; if (group.length > 1) { const mergeIds = group.map((g) => g.msg.id); @@ -998,6 +826,38 @@ async function compressAsync( }; } +function runCompressSync(gen: Generator): CompressResult { + let next = gen.next(); + while (!next.done) { + const { text, budget } = next.value; + next = gen.next(summarize(text, budget)); + } + return next.value; +} + +async function runCompressAsync( + gen: Generator, + userSummarizer?: Summarizer, +): Promise { + let next = gen.next(); + while (!next.done) { + const { text, budget } = next.value; + next = gen.next(await withFallback(text, userSummarizer, budget)); + } + return next.value; +} + +function compressSync(messages: Message[], options: CompressOptions = {}): CompressResult { + return runCompressSync(compressGen(messages, options)); +} + +async function compressAsync( + messages: Message[], + options: CompressOptions = {}, +): Promise { + return runCompressAsync(compressGen(messages, options), options.summarizer); +} + // --------------------------------------------------------------------------- // Token budget helpers (absorbed from compressToFit) // --------------------------------------------------------------------------- diff --git a/src/types.ts b/src/types.ts index d885de3..16e4fd3 100644 --- a/src/types.ts +++ b/src/types.ts @@ -32,7 +32,7 @@ export type CompressOptions = { embedSummaryId?: boolean; /** Hard-truncate non-recency messages when binary search bottoms out and budget still exceeded. Default: false. 
*/ forceConverge?: boolean; - /** Custom token counter per message. Default: ceil(content.length / 3.5). */ + /** Custom token counter per message. Default: ceil(content.length / 3.5) — see defaultTokenCounter for rationale. */ tokenCounter?: (msg: Message) => number; };