From ebf021508403bc03ca8840f118f325520e583e0a Mon Sep 17 00:00:00 2001
From: Eamon <eamon112009@gmail.com>
Date: Sat, 16 May 2026 18:17:15 +0530
Subject: [PATCH] feat(benchmarks): add script to compare C++ and Python
 backend results (#40)

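Introduces a CLI tool that loads, indexes, and aligns benchmark JSON results from both backends. It prints a side-by-side comparison table showing latency (ms), throughput (tokens/s), and the percentage change in C++ latency relative to Python. Passing --current and --baseline instead compares two runs and reports latency regressions above --threshold-pct.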
---
 benchmark/compare.py | 141 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 141 insertions(+)
 create mode 100644 benchmark/compare.py

diff --git a/benchmark/compare.py b/benchmark/compare.py
new file mode 100644
index 0000000..94f0f5b
--- /dev/null
+++ b/benchmark/compare.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+"""Compare Quadtrix C++ and Python benchmark JSON files."""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Any
+
+
+DEFAULT_RESULTS = Path(__file__).resolve().parent / "results"  # directory holding the default --cpp/--python JSON files
+
+
+def load(path: Path) -> dict[str, Any]:
+    """Read the UTF-8 encoded JSON file at *path* and return the decoded document."""
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def index_rows(result: dict[str, Any]) -> dict[tuple[str, str, int, int], dict[str, Any]]:
+    """Key each row of ``result["results"]`` by (suite, name, batch, seq).
+
+    Missing or falsy batch/sequence values become 0; a later row with the
+    same key replaces an earlier one.
+    """
+    def key_of(entry: dict[str, Any]) -> tuple[str, str, int, int]:
+        suite, name = entry.get("suite", ""), entry.get("name", "")
+        return (suite, name, *(int(entry.get(field) or 0) for field in ("batch_size", "sequence_length")))
+
+    return {key_of(entry): entry for entry in result.get("results", [])}
+
+
+def pct(new: float, old: float) -> float:
+    """Return the percent change from *old* to *new* (0.0 when *old* is zero)."""
+    # A zero reference has no defined ratio, so it is reported as "no change".
+    return 0.0 if old == 0 else (new - old) / old * 100.0
+
+
+def compare_backends(cpp_path: Path, python_path: Path) -> int:
+    """Print a side-by-side latency/throughput table for the C++ and Python results.
+
+    Rows are paired on (suite, name, batch size, sequence length). Returns 0
+    after printing the table, or 1 if a file is missing or no rows pair up.
+    """
+    absent = [str(p) for p in (cpp_path, python_path) if not p.exists()]
+    if absent:
+        print("Missing benchmark result file(s):")
+        print("\n".join(f"  {entry}" for entry in absent))
+        print("Run benchmark/run_all.py first, or pass explicit --cpp/--python paths.")
+        return 1
+
+    cpp_data, py_data = load(cpp_path), load(python_path)
+    cpp_rows, py_rows = index_rows(cpp_data), index_rows(py_data)
+    shared = sorted(cpp_rows.keys() & py_rows.keys())
+    if not shared:
+        print("No matching benchmark rows found.")
+        return 1
+
+    def metric(row: dict[str, Any], field: str) -> float:
+        return float(row.get(field) or 0.0)
+
+    print("Quadtrix C++ vs Python Benchmark Comparison")
+    print(f"C++:    {cpp_path}")
+    print(f"Python: {python_path}")
+    print()
+    print(f"{'suite':<12} {'name':<24} {'shape':<10} {'cpp ms':>10} {'py ms':>10} {'cpp tok/s':>12} {'py tok/s':>12} {'latency':>10}")
+    print("-" * 110)
+
+    for key in shared:
+        suite, name, batch, seq = key
+        cpp_ms, py_ms = metric(cpp_rows[key], "avg_ms"), metric(py_rows[key], "avg_ms")
+        cpp_tps, py_tps = metric(cpp_rows[key], "tokens_per_sec"), metric(py_rows[key], "tokens_per_sec")
+        shape = "-" if not (batch or seq) else f"{batch}x{seq}"
+        delta = pct(cpp_ms, py_ms)  # C++ latency change relative to Python, in percent
+        print(
+            f"{suite:<12} {name:<24} {shape:<10} "
+            f"{cpp_ms:10.3f} {py_ms:10.3f} {cpp_tps:12.1f} {py_tps:12.1f} {delta:+9.1f}%"
+        )
+    return 0
+
+
+def compare_baseline(current_path: Path, baseline_path: Path, threshold_pct: float) -> int:
+    """Report rows whose ``avg_ms`` grew by more than *threshold_pct* percent over the baseline.
+
+    Returns 0 if nothing regressed, 2 on regressions, and 1 if a file is missing or no rows match.
+    """
+    missing = [str(path) for path in (current_path, baseline_path) if not path.exists()]
+    if missing:
+        print("Missing benchmark result file(s):")
+        print("\n".join(f"  {path}" for path in missing))
+        return 1
+
+    current_rows = index_rows(load(current_path))
+    baseline_rows = index_rows(load(baseline_path))
+    common = sorted(set(current_rows) & set(baseline_rows))
+    # With no shared rows nothing was checked, so fail instead of reporting a vacuous pass.
+    if not common:
+        print("No matching benchmark rows found.")
+        return 1
+
+    print("Quadtrix Benchmark Baseline Comparison")
+    print(f"Current:  {current_path}")
+    print(f"Baseline: {baseline_path}")
+    print()
+
+    regressions = []
+    for key in common:
+        old_ms = float(baseline_rows[key].get("avg_ms") or 0.0)
+        new_ms = float(current_rows[key].get("avg_ms") or 0.0)
+        delta = pct(new_ms, old_ms)
+        if delta > threshold_pct:
+            regressions.append((key, old_ms, new_ms, delta))
+    if not regressions:
+        print(f"No latency regressions over {threshold_pct:.1f}%.")
+        return 0
+
+    print(f"Latency regressions over {threshold_pct:.1f}%:")
+    for (suite, name, batch, seq), old_ms, new_ms, delta in regressions:
+        print(f"  {suite}/{name} {batch}x{seq}: {old_ms:.3f} ms -> {new_ms:.3f} ms ({delta:+.1f}%)")
+    return 2
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command-line options.
+
+    ``--cpp`` and ``--python`` default to the JSON files under
+    ``DEFAULT_RESULTS``. ``--current`` and ``--baseline`` select the baseline
+    check and are only accepted together; passing just one of them is
+    reported as a usage error.
+    """
+    parser = argparse.ArgumentParser(description="Compare Quadtrix benchmark results.")
+    parser.add_argument("--cpp", type=Path, default=DEFAULT_RESULTS / "cpp_benchmark.json")
+    parser.add_argument("--python", type=Path, default=DEFAULT_RESULTS / "python_benchmark.json")
+    parser.add_argument("--current", type=Path, default=None)
+    parser.add_argument("--baseline", type=Path, default=None)
+    parser.add_argument("--threshold-pct", type=float, default=10.0)
+    args = parser.parse_args()
+    # A lone --current or --baseline would otherwise be dropped silently and the
+    # backend comparison would run instead of the requested baseline check.
+    if (args.current is None) != (args.baseline is None):
+        parser.error("--current and --baseline must be given together")
+    return args
+
+
+def main() -> int:
+    """Run the baseline check when --current/--baseline are set, else compare the backends."""
+    args = parse_args()
+    if args.current is not None and args.baseline is not None:
+        return compare_baseline(args.current, args.baseline, args.threshold_pct)
+    return compare_backends(args.cpp, args.python)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # exit with the status code returned by main()