From 979212e1b7c2bff0f860e51978408507974de671 Mon Sep 17 00:00:00 2001 From: Eamon Date: Wed, 13 May 2026 22:16:31 +0530 Subject: [PATCH 1/6] Add GitHub Packages publish workflow --- .github/workflows/github-package.yml | 44 ++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 .github/workflows/github-package.yml diff --git a/.github/workflows/github-package.yml b/.github/workflows/github-package.yml new file mode 100644 index 0000000..cbcb92a --- /dev/null +++ b/.github/workflows/github-package.yml @@ -0,0 +1,44 @@ +name: Publish GitHub Package + +on: + workflow_dispatch: + push: + tags: + - "v*" + +permissions: + contents: read + packages: write + +jobs: + publish-github-package: + name: Publish to GitHub Packages + runs-on: ubuntu-latest + + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Node.js for GitHub Packages + uses: actions/setup-node@v4 + with: + node-version: "20" + registry-url: "https://npm.pkg.github.com" + scope: "@eamon2009" + cache: "npm" + cache-dependency-path: frontend/package-lock.json + + - name: Build frontend assets + run: | + npm --prefix frontend ci + npm --prefix frontend run build + + - name: Prepare GitHub Packages metadata + run: | + npm pkg set name="@eamon2009/quadtrix" + npm pkg set publishConfig.registry="https://npm.pkg.github.com" + + - name: Publish package + run: npm publish + env: + NODE_AUTH_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 124bdcf3da252433a4cd0c5494cf3b4c332e0518 Mon Sep 17 00:00:00 2001 From: Eamon Date: Sat, 16 May 2026 17:57:47 +0530 Subject: [PATCH 2/6] feat: add reference benchmark dimensions for Quadtrix Introduces configuration for real C++ and Python Quadtrix benchmark runs, including warmup, token generation, and training step dimensions. --- benchmark/benchmark_config.json | 59 +++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 benchmark/benchmark_config.json diff --git a/benchmark/benchmark_config.json b/benchmark/benchmark_config.json new file mode 100644 index 0000000..9397319 --- /dev/null +++ b/benchmark/benchmark_config.json @@ -0,0 +1,59 @@ +{ + "schema_version": 1, + "purpose": "Reference benchmark dimensions for the real C++ and Python Quadtrix benchmark suites.", + "common": { + "runs": 10, + "warmup": 3, + "quick_runs": 2, + "quick_warmup": 1, + "generate_tokens": 32, + "quick_generate_tokens": 4, + "train_steps": 5, + "quick_train_steps": 1, + "max_data_chars": 1000000, + "quick_max_data_chars": 50000 + }, + "suites": { + "data": [ + "tokenizer_or_char_encode", + "batch_sample_to_device" + ], + "primitive": [ + "matmul_3d", + "attention_scores_or_softmax3d", + "layer_norm" + ], + "forward": [ + "batch1_seq8", + "batch1_full_context", + "configured_batch_full_context" + ], + "training": [ + "adamw_step_forward_backward_update" + ], + "generation": [ + "empty_prompt", + "short_prompt", + "long_prompt" + ] + }, + "metrics": [ + "avg_ms", + "median_ms", + "min_ms", + "max_ms", + "p90_ms", + "p95_ms", + "std_ms", + "tokens_per_sec", + "loss", + "parameter_mb_fp32", + "memory_mb" + ], + "outputs": { + "cpp_json": "benchmark/results/cpp_benchmark.json", + "cpp_csv": "benchmark/results/cpp_benchmark.csv", + "python_json": "benchmark/results/python_benchmark.json", + "python_csv": "benchmark/results/python_benchmark.csv" + } +} From ebf021508403bc03ca8840f118f325520e583e0a Mon Sep 17 00:00:00 2001 From: Eamon Date: Sat, 16 May 2026 18:17:15 +0530 Subject: [PATCH 3/6] feat(benchmarks): add script to compare C++ and Python backend results (#40) Introduces a CLI tool to load, index, and align benchmark JSON results from both backends. It displays a side-by-side comparison table showing latency (ms), throughput (tokens/s), and the percentage speedup/slowdown. --- benchmark/compare.py | 141 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 benchmark/compare.py diff --git a/benchmark/compare.py b/benchmark/compare.py new file mode 100644 index 0000000..94f0f5b --- /dev/null +++ b/benchmark/compare.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +"""Compare Quadtrix C++ and Python benchmark JSON files.""" + +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Any + + +DEFAULT_RESULTS = Path(__file__).resolve().parent / "results" + + +def load(path: Path) -> dict[str, Any]: + with path.open("r", encoding="utf-8") as f: + return json.load(f) + + +def index_rows(result: dict[str, Any]) -> dict[tuple[str, str, int, int], dict[str, Any]]: + indexed = {} + for row in result.get("results", []): + key = ( + row.get("suite", ""), + row.get("name", ""), + int(row.get("batch_size") or 0), + int(row.get("sequence_length") or 0), + ) + indexed[key] = row + return indexed + + +def pct(new: float, old: float) -> float: + if old == 0: + return 0.0 + return (new - old) / old * 100.0 + + +def compare_backends(cpp_path: Path, python_path: Path) -> int: + missing = [str(path) for path in (cpp_path, python_path) if not path.exists()] + if missing: + print("Missing benchmark result file(s):") + for path in missing: + print(f" {path}") + print("Run benchmark/run_all.py first, or pass explicit --cpp/--python paths.") + return 1 + + cpp = load(cpp_path) + py = load(python_path) + cpp_rows = index_rows(cpp) + py_rows = index_rows(py) + + common = sorted(set(cpp_rows) & set(py_rows)) + if not common: + print("No matching benchmark rows found.") + return 1 + + print("Quadtrix C++ vs Python Benchmark Comparison") + print(f"C++: {cpp_path}") + print(f"Python: {python_path}") + print() + print(f"{'suite':<12} {'name':<24} {'shape':<10} {'cpp ms':>10} {'py ms':>10} {'cpp tok/s':>12} {'py tok/s':>12} {'latency':>10}") + print("-" * 110) + + for key in common: + suite, name, batch, seq = key + c = cpp_rows[key] + p = py_rows[key] + cpp_ms = float(c.get("avg_ms") or 0.0) + py_ms = float(p.get("avg_ms") or 0.0) + cpp_tps = float(c.get("tokens_per_sec") or 0.0) + py_tps = float(p.get("tokens_per_sec") or 0.0) + shape = f"{batch}x{seq}" if batch or seq else "-" + delta = pct(cpp_ms, py_ms) + print( + f"{suite:<12} {name:<24} {shape:<10} " + f"{cpp_ms:10.3f} {py_ms:10.3f} {cpp_tps:12.1f} {py_tps:12.1f} {delta:+9.1f}%" + ) + return 0 + + +def compare_baseline(current_path: Path, baseline_path: Path, threshold_pct: float) -> int: + missing = [str(path) for path in (current_path, baseline_path) if not path.exists()] + if missing: + print("Missing benchmark result file(s):") + for path in missing: + print(f" {path}") + return 1 + + current = load(current_path) + baseline = load(baseline_path) + current_rows = index_rows(current) + baseline_rows = index_rows(baseline) + common = sorted(set(current_rows) & set(baseline_rows)) + + print("Quadtrix Benchmark Baseline Comparison") + print(f"Current: {current_path}") + print(f"Baseline: {baseline_path}") + print() + + regressions = [] + for key in common: + c = current_rows[key] + b = baseline_rows[key] + delta = pct(float(c.get("avg_ms") or 0.0), float(b.get("avg_ms") or 0.0)) + if delta > threshold_pct: + regressions.append((key, delta, b, c)) + + if not regressions: + print(f"No latency regressions over {threshold_pct:.1f}%.") + return 0 + + print(f"Latency regressions over {threshold_pct:.1f}%:") + for key, delta, b, c in regressions: + suite, name, batch, seq = key + print( + f" {suite}/{name} {batch}x{seq}: " + f"{float(b.get('avg_ms') or 0.0):.3f} ms -> {float(c.get('avg_ms') or 0.0):.3f} ms ({delta:+.1f}%)" + ) + return 2 + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Compare Quadtrix benchmark results.") + parser.add_argument("--cpp", type=Path, default=DEFAULT_RESULTS / "cpp_benchmark.json") + parser.add_argument("--python", type=Path, default=DEFAULT_RESULTS / "python_benchmark.json") + parser.add_argument("--current", type=Path, default=None) + parser.add_argument("--baseline", type=Path, default=None) + parser.add_argument("--threshold-pct", type=float, default=10.0) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + if args.current and args.baseline: + return compare_baseline(args.current, args.baseline, args.threshold_pct) + return compare_backends(args.cpp, args.python) + + +if __name__ == "__main__": + raise SystemExit(main()) From 2b9413042dff566b75858e7d43b02d9a9f368ada Mon Sep 17 00:00:00 2001 From: Eamon Date: Sat, 16 May 2026 18:20:44 +0530 Subject: [PATCH 4/6] feat: entry point for Python benchmark (#41) suite Introduces a standard entry point script that invokes the core python_benchmark module execution flow. --- benchmark/benchmark_training.py | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 benchmark/benchmark_training.py diff --git a/benchmark/benchmark_training.py b/benchmark/benchmark_training.py new file mode 100644 index 0000000..07ec369 --- /dev/null +++ b/benchmark/benchmark_training.py @@ -0,0 +1,8 @@ +#!/usr/bin/env python3 +"""Compatibility entry point for the real Python benchmark suite.""" + +from python_benchmark import main + + +if __name__ == "__main__": + raise SystemExit(main()) From 410f1fe54e2cc027294df57b801be18b3e5e443a Mon Sep 17 00:00:00 2001 From: Eamon Date: Sat, 16 May 2026 18:35:49 +0530 Subject: [PATCH 5/6] feat(benchmarks): implement core PyTorch benchmark suite for Quadtrix Introduces the primary Python benchmark runner, measuring model metadata, data throughput, forward latency, training-step latency, and autoregressive generation. Includes utility functions for dynamic module loading, timing, and percentile calculation. --- benchmark/python_benchmark.py | 456 ++++++++++++++++++++++++++++++++++ 1 file changed, 456 insertions(+) create mode 100644 benchmark/python_benchmark.py diff --git a/benchmark/python_benchmark.py b/benchmark/python_benchmark.py new file mode 100644 index 0000000..09c9d38 --- /dev/null +++ b/benchmark/python_benchmark.py @@ -0,0 +1,456 @@ +#!/usr/bin/env python3 +"""Real PyTorch benchmark suite for Quadtrix. + +Measures the things an ML/AI engineer usually asks for: +model metadata, tokenizer/data throughput, forward latency, training-step +latency, autoregressive generation latency, memory, and JSON/CSV output. +""" + +from __future__ import annotations + +import argparse +import csv +import gc +import importlib.util +import json +import math +import platform +import statistics +import sys +import time +from dataclasses import dataclass, asdict +from pathlib import Path +from typing import Any, Callable + + +ROOT = Path(__file__).resolve().parents[1] +ENGINE_INFERENCE = ROOT / "engine" / "inference.py" +DEFAULT_DATA = ROOT / "engine" / "input.txt" +DEFAULT_OUT = ROOT / "benchmark" / "results" + + +def load_engine_module(): + spec = importlib.util.spec_from_file_location("quadtrix_engine_inference", ENGINE_INFERENCE) + if spec is None or spec.loader is None: + raise RuntimeError(f"Cannot import {ENGINE_INFERENCE}") + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def now_iso() -> str: + return time.strftime("%Y-%m-%dT%H:%M:%S%z") + + +def percentile(values: list[float], pct: float) -> float: + if not values: + return 0.0 + ordered = sorted(values) + pos = (len(ordered) - 1) * pct + lo = math.floor(pos) + hi = math.ceil(pos) + if lo == hi: + return ordered[lo] + return ordered[lo] + (ordered[hi] - ordered[lo]) * (pos - lo) + + +def summarize_ms(samples: list[float]) -> dict[str, float]: + mean = statistics.fmean(samples) + return { + "avg_ms": mean, + "median_ms": statistics.median(samples), + "min_ms": min(samples), + "max_ms": max(samples), + "p90_ms": percentile(samples, 0.90), + "p95_ms": percentile(samples, 0.95), + "std_ms": statistics.pstdev(samples) if len(samples) > 1 else 0.0, + } + + +def sync(torch: Any, device: Any) -> None: + if str(device).startswith("cuda"): + torch.cuda.synchronize() + + +def timed_samples( + torch: Any, + device: Any, + fn: Callable[[], Any], + runs: int, + warmup: int, +) -> tuple[list[float], Any]: + last = None + for _ in range(warmup): + last = fn() + sync(torch, device) + + samples: list[float] = [] + for _ in range(runs): + start = time.perf_counter() + last = fn() + sync(torch, device) + samples.append((time.perf_counter() - start) * 1000.0) + return samples, last + + +def cuda_memory(torch: Any, device: Any) -> dict[str, float]: + if not str(device).startswith("cuda"): + return {} + return { + "cuda_allocated_mb": torch.cuda.memory_allocated(device) / (1024**2), + "cuda_reserved_mb": torch.cuda.memory_reserved(device) / (1024**2), + "cuda_peak_allocated_mb": torch.cuda.max_memory_allocated(device) / (1024**2), + } + + +def process_rss_mb() -> float | None: + try: + import psutil + + return psutil.Process().memory_info().rss / (1024**2) + except Exception: + return None + + +@dataclass +class BenchRow: + suite: str + name: str + backend: str + batch_size: int = 0 + sequence_length: int = 0 + tokens: int = 0 + avg_ms: float = 0.0 + median_ms: float = 0.0 + min_ms: float = 0.0 + max_ms: float = 0.0 + p90_ms: float = 0.0 + p95_ms: float = 0.0 + std_ms: float = 0.0 + tokens_per_sec: float = 0.0 + samples: int = 0 + loss: float | None = None + memory_mb: float | None = None + notes: str = "" + + +class QuadtrixPythonBenchmark: + def __init__(self, args: argparse.Namespace): + self.args = args + self.engine = load_engine_module() + self.torch = __import__("torch") + self.torch.manual_seed(args.seed) + self.device = self.engine.device + self.rows: list[BenchRow] = [] + + if str(self.device).startswith("cuda"): + self.torch.cuda.reset_peak_memory_stats(self.device) + + self.model = self._make_model() + self.model.eval() + + def _make_model(self): + checkpoint = Path(self.args.checkpoint) if self.args.checkpoint else self.engine.default_checkpoint_path() + if checkpoint.exists() and not self.args.random_weights: + return self.engine.load_model(checkpoint) + + model = self.engine.GPTLanguageModel().to(self.device) + model.eval() + return model + + def _record(self, row: BenchRow) -> None: + self.rows.append(row) + print( + f"{row.suite:<14} {row.name:<24} " + f"avg={row.avg_ms:9.3f} ms " + f"p95={row.p95_ms:9.3f} ms " + f"tok/s={row.tokens_per_sec:10.1f}" + ) + + def run(self) -> dict[str, Any]: + print("Quadtrix Python Benchmark") + print(f"Device: {self.device}") + print(f"Runs: {self.args.runs}, warmup: {self.args.warmup}") + + self.bench_tokenizer_and_data() + self.bench_primitives() + self.bench_forward() + self.bench_training_step() + self.bench_generation() + + return self.save() + + def bench_tokenizer_and_data(self) -> None: + data_path = Path(self.args.data) + text = data_path.read_text(encoding="utf-8") if data_path.exists() else "Quadtrix benchmark text. " * 512 + if self.args.max_data_chars and len(text) > self.args.max_data_chars: + text = text[: self.args.max_data_chars] + tokenizer = self.engine.tokenizer + + samples, encoded = timed_samples( + self.torch, + self.device, + lambda: tokenizer.encode(text), + self.args.runs, + self.args.warmup, + ) + stats = summarize_ms(samples) + self._record( + BenchRow( + suite="data", + name="tokenizer_encode", + backend="python", + tokens=len(encoded), + tokens_per_sec=len(encoded) / (stats["avg_ms"] / 1000.0), + samples=len(samples), + memory_mb=process_rss_mb(), + **stats, + ) + ) + + tensor = self.torch.tensor(encoded, dtype=self.torch.long) + max_block = min(self.engine.block_size, max(2, len(tensor) - 2)) + batch_size = min(self.args.batch_size, max(1, len(tensor) - max_block - 1)) + + def make_batch(): + ix = self.torch.randint(len(tensor) - max_block - 1, (batch_size,)) + x = self.torch.stack([tensor[i : i + max_block] for i in ix]).to(self.device) + y = self.torch.stack([tensor[i + 1 : i + max_block + 1] for i in ix]).to(self.device) + return x, y + + samples, _ = timed_samples(self.torch, self.device, make_batch, self.args.runs, self.args.warmup) + stats = summarize_ms(samples) + self._record( + BenchRow( + suite="data", + name="batch_sample_to_device", + backend="python", + batch_size=batch_size, + sequence_length=max_block, + tokens=batch_size * max_block, + tokens_per_sec=(batch_size * max_block) / (stats["avg_ms"] / 1000.0), + samples=len(samples), + memory_mb=process_rss_mb(), + **stats, + ) + ) + + def bench_primitives(self) -> None: + torch = self.torch + cases = [ + ("matmul_3d", (1, 16, self.engine.n_embd), (self.engine.n_embd, self.engine.n_embd)), + ("matmul_3d", (self.args.batch_size, self.engine.block_size, self.engine.n_embd), (self.engine.n_embd, self.engine.n_embd)), + ("attention_scores", (self.args.batch_size, self.engine.block_size, self.engine.n_embd), None), + ] + for name, x_shape, w_shape in cases: + x = torch.randn(*x_shape, device=self.device) + if name == "attention_scores": + fn = lambda: torch.softmax((x @ x.transpose(-2, -1)) / math.sqrt(x_shape[-1]), dim=-1) + else: + w = torch.randn(*w_shape, device=self.device) + fn = lambda: x @ w + samples, _ = timed_samples(torch, self.device, fn, self.args.runs, self.args.warmup) + stats = summarize_ms(samples) + tokens = x_shape[0] * x_shape[1] + self._record( + BenchRow( + suite="primitive", + name=f"{name}_{x_shape[0]}x{x_shape[1]}", + backend="python", + batch_size=x_shape[0], + sequence_length=x_shape[1], + tokens=tokens, + tokens_per_sec=tokens / (stats["avg_ms"] / 1000.0), + samples=len(samples), + memory_mb=process_rss_mb(), + **stats, + ) + ) + + def bench_forward(self) -> None: + torch = self.torch + cases = [(1, 8), (1, self.engine.block_size), (self.args.batch_size, self.engine.block_size)] + self.model.eval() + for batch_size, seq_len in cases: + idx = torch.randint(self.engine.vocab_size, (batch_size, seq_len), device=self.device) + targets = torch.randint(self.engine.vocab_size, (batch_size, seq_len), device=self.device) + + @torch.no_grad() + def fn(): + return self.model(idx, targets) + + samples, last = timed_samples(torch, self.device, fn, self.args.runs, self.args.warmup) + stats = summarize_ms(samples) + loss = float(last[1].item()) + tokens = batch_size * seq_len + self._record( + BenchRow( + suite="forward", + name=f"batch{batch_size}_seq{seq_len}", + backend="python", + batch_size=batch_size, + sequence_length=seq_len, + tokens=tokens, + tokens_per_sec=tokens / (stats["avg_ms"] / 1000.0), + samples=len(samples), + loss=loss, + memory_mb=process_rss_mb(), + **stats, + ) + ) + + def bench_training_step(self) -> None: + torch = self.torch + model = self.engine.GPTLanguageModel().to(self.device) + model.train() + optimizer = torch.optim.AdamW(model.parameters(), lr=self.args.learning_rate) + batch_size = self.args.batch_size + seq_len = self.engine.block_size + idx = torch.randint(self.engine.vocab_size, (batch_size, seq_len), device=self.device) + targets = torch.randint(self.engine.vocab_size, (batch_size, seq_len), device=self.device) + + def fn(): + optimizer.zero_grad(set_to_none=True) + _, loss = model(idx, targets) + loss.backward() + optimizer.step() + return loss.detach() + + samples, loss = timed_samples(torch, self.device, fn, self.args.train_steps, self.args.warmup) + stats = summarize_ms(samples) + tokens = batch_size * seq_len + self._record( + BenchRow( + suite="training", + name=f"adamw_step_b{batch_size}_s{seq_len}", + backend="python", + batch_size=batch_size, + sequence_length=seq_len, + tokens=tokens, + tokens_per_sec=tokens / (stats["avg_ms"] / 1000.0), + samples=len(samples), + loss=float(loss.item()), + memory_mb=process_rss_mb(), + **stats, + ) + ) + del model + gc.collect() + + def bench_generation(self) -> None: + torch = self.torch + tokenizer = self.engine.tokenizer + prompts = [ + ("empty", ""), + ("short", "The future of local AI is"), + ("long", "Quadtrix is a compact transformer benchmark that measures " * 4), + ] + self.model.eval() + for label, prompt in prompts: + encoded = tokenizer.encode(prompt) or [0] + encoded = encoded[-self.engine.block_size :] + idx = torch.tensor([encoded], dtype=torch.long, device=self.device) + + @torch.no_grad() + def fn(): + return self.model.generate(idx, self.args.generate_tokens, temperature=1.0, top_k=self.args.top_k) + + samples, _ = timed_samples(torch, self.device, fn, self.args.runs, self.args.warmup) + stats = summarize_ms(samples) + self._record( + BenchRow( + suite="generation", + name=label, + backend="python", + batch_size=1, + sequence_length=len(encoded), + tokens=self.args.generate_tokens, + tokens_per_sec=self.args.generate_tokens / (stats["avg_ms"] / 1000.0), + samples=len(samples), + memory_mb=process_rss_mb(), + **stats, + ) + ) + + def save(self) -> dict[str, Any]: + out_dir = Path(self.args.out) + out_dir.mkdir(parents=True, exist_ok=True) + n_params = sum(p.numel() for p in self.model.parameters()) + result = { + "schema_version": 1, + "timestamp": now_iso(), + "backend": "python", + "system": { + "platform": platform.platform(), + "python": sys.version.split()[0], + "torch": self.torch.__version__, + "device": str(self.device), + "cuda": getattr(self.torch.version, "cuda", None), + "rss_mb": process_rss_mb(), + **cuda_memory(self.torch, self.device), + }, + "model": { + "vocab_size": self.engine.vocab_size, + "block_size": self.engine.block_size, + "n_embd": self.engine.n_embd, + "n_head": self.engine.n_head, + "n_layer": self.engine.n_layer, + "dropout": self.engine.dropout, + "parameters": n_params, + "parameter_mb_fp32": n_params * 4 / (1024**2), + }, + "config": {key: str(value) if isinstance(value, Path) else value for key, value in vars(self.args).items()}, + "results": [asdict(row) for row in self.rows], + } + json_path = out_dir / "python_benchmark.json" + csv_path = out_dir / "python_benchmark.csv" + json_path.write_text(json.dumps(result, indent=2), encoding="utf-8") + with csv_path.open("w", newline="", encoding="utf-8") as f: + writer = csv.DictWriter(f, fieldnames=list(asdict(self.rows[0]).keys())) + writer.writeheader() + for row in self.rows: + writer.writerow(asdict(row)) + print(f"Saved {json_path}") + print(f"Saved {csv_path}") + return result + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run real Quadtrix PyTorch benchmarks.") + parser.add_argument("--data", type=Path, default=DEFAULT_DATA) + parser.add_argument("--checkpoint", type=Path, default=None) + parser.add_argument("--out", type=Path, default=DEFAULT_OUT) + parser.add_argument("--runs", type=int, default=10) + parser.add_argument("--warmup", type=int, default=3) + parser.add_argument("--batch-size", type=int, default=4) + parser.add_argument("--train-steps", type=int, default=5) + parser.add_argument("--generate-tokens", type=int, default=32) + parser.add_argument("--learning-rate", type=float, default=3e-4) + parser.add_argument("--top-k", type=int, default=None) + parser.add_argument("--seed", type=int, default=1337) + parser.add_argument("--max-data-chars", type=int, default=1_000_000) + parser.add_argument("--random-weights", action="store_true", help="Do not load a checkpoint even if one exists.") + parser.add_argument("--quick", action="store_true", help="Short run for smoke tests.") + args = parser.parse_args() + if args.quick: + args.runs = 2 + args.warmup = 1 + args.train_steps = 1 + args.generate_tokens = 4 + args.max_data_chars = min(args.max_data_chars, 50_000) + return args + + +def main() -> int: + try: + benchmark = QuadtrixPythonBenchmark(parse_args()) + benchmark.run() + return 0 + except ImportError as exc: + print(f"Missing Python benchmark dependency: {exc}", file=sys.stderr) + print("Install the engine requirements, including torch and tiktoken.", file=sys.stderr) + return 2 + + +if __name__ == "__main__": + raise SystemExit(main()) From 0e671ec3828205c74bbf2b5f23b569bf23fbbf98 Mon Sep 17 00:00:00 2001 From: Eamon Date: Sat, 16 May 2026 19:08:38 +0530 Subject: [PATCH 6/6] feat(benchmarks): implement core C++ benchmark for Quadtrix Introduces the primary C++ benchmark runner (cpp_benchmark.cpp). It defines the parsing configurations, tracking metrics structures (Stats and BenchRow), and basic time/utility abstractions needed to mirror the Python benchmark suite capabilities. --- benchmark/cpp_benchmark.cpp | 605 ++++++++++++++++++++++++++++++++++++ 1 file changed, 605 insertions(+) create mode 100644 benchmark/cpp_benchmark.cpp diff --git a/benchmark/cpp_benchmark.cpp b/benchmark/cpp_benchmark.cpp new file mode 100644 index 0000000..6422813 --- /dev/null +++ b/benchmark/cpp_benchmark.cpp @@ -0,0 +1,605 @@ +// Real C++ benchmark suite for Quadtrix.cpp. +// +// Build: +// g++ -std=c++17 -O3 -DNDEBUG -I. benchmark/cpp_benchmark.cpp -o benchmark/quadtrix_cpp_bench +// +// Run: +// benchmark/quadtrix_cpp_bench --quick +// benchmark/quadtrix_cpp_bench --data data/input.txt --model best_model.bin + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../config/config.h" +#include "../include/backward.h" +#include "../include/dataloader.h" +#include "../include/gpt.h" + +#if __has_include() +#include +namespace fs = std::filesystem; +#endif + +struct Options +{ + std::string data_path = DEFAULT_CLEANED_PATH; + std::string model_path = BEST_MODEL_PATH; + std::string out_dir = "benchmark/results"; + int runs = 10; + int warmup = 3; + int batch_size = BATCH_SIZE; + int train_steps = 3; + int generate_tokens = 32; + int max_data_chars = 0; + bool quick = false; + bool random_weights = false; +}; + +struct Stats +{ + double avg_ms{0.0}, median_ms{0.0}, min_ms{0.0}, max_ms{0.0}; + double p90_ms{0.0}, p95_ms{0.0}, std_ms{0.0}; +}; + +struct BenchRow +{ + std::string suite; + std::string name; + int batch_size{0}; + int sequence_length{0}; + int tokens{0}; + Stats stats; + double tokens_per_sec{0.0}; + int samples{0}; + double loss{0.0}; + bool has_loss{false}; + double memory_mb{0.0}; + std::string notes; +}; + +static double wall_ms() +{ + using namespace std::chrono; + return duration(steady_clock::now().time_since_epoch()).count(); +} + +static bool file_exists(const std::string &path) +{ + std::ifstream f(path.c_str(), std::ios::binary); + return f.good(); +} + +static std::string now_iso() +{ + std::time_t t = std::time(nullptr); + char buf[40]; + std::strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%S%z", std::localtime(&t)); + return buf; +} + +static std::string json_escape(const std::string &s) +{ + std::ostringstream out; + for (char ch : s) + { + switch (ch) + { + case '\\': + out << "\\\\"; + break; + case '"': + out << "\\\""; + break; + case '\n': + out << "\\n"; + break; + case '\r': + out << "\\r"; + break; + case '\t': + out << "\\t"; + break; + default: + out << ch; + } + } + return out.str(); +} + +static double percentile(std::vector values, double pct) +{ + if (values.empty()) + return 0.0; + std::sort(values.begin(), values.end()); + double pos = (values.size() - 1) * pct; + int lo = (int)std::floor(pos); + int hi = (int)std::ceil(pos); + if (lo == hi) + return values[lo]; + return values[lo] + (values[hi] - values[lo]) * (pos - lo); +} + +static Stats summarize(const std::vector &samples) +{ + Stats s; + std::vector sorted = samples; + std::sort(sorted.begin(), sorted.end()); + s.avg_ms = std::accumulate(samples.begin(), samples.end(), 0.0) / samples.size(); + s.median_ms = sorted[sorted.size() / 2]; + if (sorted.size() % 2 == 0) + s.median_ms = 0.5 * (sorted[sorted.size() / 2 - 1] + sorted[sorted.size() / 2]); + s.min_ms = sorted.front(); + s.max_ms = sorted.back(); + s.p90_ms = percentile(samples, 0.90); + s.p95_ms = percentile(samples, 0.95); + double sq = 0.0; + for (double x : samples) + sq += (x - s.avg_ms) * (x - s.avg_ms); + s.std_ms = std::sqrt(sq / samples.size()); + return s; +} + +template +static std::pair run_timed(int runs, int warmup, Fn fn) +{ + double sink = 0.0; + for (int i = 0; i < warmup; ++i) + sink += fn(); + std::vector samples; + samples.reserve(runs); + for (int i = 0; i < runs; ++i) + { + double t0 = wall_ms(); + sink += fn(); + samples.push_back(wall_ms() - t0); + } + volatile double keep = sink; + (void)keep; + return {summarize(samples), sink}; +} + +static double estimated_model_mb(const GPTLanguageModel &model) +{ + return (double)model.num_params() * sizeof(float) / (1024.0 * 1024.0); +} + +static void push_row(std::vector &rows, const BenchRow &row) +{ + rows.push_back(row); + const BenchRow &r = rows.back(); + std::cout << std::left << std::setw(14) << r.suite + << std::setw(24) << r.name + << "avg=" << std::right << std::setw(9) << std::fixed << std::setprecision(3) + << r.stats.avg_ms << " ms p95=" << std::setw(9) + << r.stats.p95_ms << " ms tok/s=" << std::setw(10) + << std::setprecision(1) << r.tokens_per_sec << "\n"; +} + +static void prepare_dataloader_from_text(DataLoader &dl, const std::string &text, double train_split) +{ + if (text.empty()) + throw std::runtime_error("[Benchmark] Input text is empty."); + + dl.text = text; + std::set charset(text.begin(), text.end()); + dl.chars = std::vector(charset.begin(), charset.end()); + std::sort(dl.chars.begin(), dl.chars.end()); + dl.vocab_size = (int)dl.chars.size(); + dl.stoi.clear(); + dl.itos.clear(); + for (int i = 0; i < dl.vocab_size; ++i) + { + dl.stoi[dl.chars[i]] = i; + dl.itos[i] = dl.chars[i]; + } + + std::vector data; + data.reserve(text.size()); + for (char c : text) + data.push_back(dl.stoi.at(c)); + + int n = (int)(train_split * data.size()); + dl.train_data = std::vector(data.begin(), data.begin() + n); + dl.val_data = std::vector(data.begin() + n, data.end()); + if (dl.train_data.size() <= (size_t)BLOCK_SIZE || dl.val_data.size() <= (size_t)BLOCK_SIZE) + throw std::runtime_error("[Benchmark] Capped dataset is too small for BLOCK_SIZE."); +} + +static void load_data(const Options &opt, DataLoader &dl) +{ + if (opt.max_data_chars <= 0) + { + dl.load(opt.data_path); + return; + } + + std::ifstream f(opt.data_path.c_str(), std::ios::binary); + if (!f.is_open()) + throw std::runtime_error("[Benchmark] Cannot open file: " + opt.data_path); + + std::string text((std::istreambuf_iterator(f)), std::istreambuf_iterator()); + if ((int)text.size() > opt.max_data_chars) + text.resize(opt.max_data_chars); + prepare_dataloader_from_text(dl, text, TRAIN_SPLIT); + std::cout << "[DATA] Total characters : " << dl.text.size() << " (capped)\n"; + std::cout << "[DATA] Vocabulary size : " << dl.vocab_size << "\n"; + std::cout << "[DATA] Train tokens : " << dl.train_data.size() << "\n"; + std::cout << "[DATA] Val tokens : " << dl.val_data.size() << "\n"; +} + +static void bench_data(const Options &opt, DataLoader &dl, std::vector &rows) +{ + std::string text = dl.text; + auto encode_fn = [&]() -> double + { + std::vector ids = dl.encode(text); + return (double)ids.size(); + }; + auto timed = run_timed(opt.runs, opt.warmup, encode_fn); + BenchRow row; + row.suite = "data"; + row.name = "char_encode"; + row.tokens = (int)text.size(); + row.stats = timed.first; + row.tokens_per_sec = row.tokens / (row.stats.avg_ms / 1000.0); + row.samples = opt.runs; + row.memory_mb = 0.0; + push_row(rows, row); + + std::mt19937 rng(SEED); + int seq_len = BLOCK_SIZE; + int batch_size = opt.batch_size; + auto batch_fn = [&]() -> double + { + auto batch = dl.get_batch("train", batch_size, seq_len, rng); + return (double)(batch.first[0] + batch.second[0]); + }; + timed = run_timed(opt.runs, opt.warmup, batch_fn); + row = BenchRow(); + row.suite = "data"; + row.name = "batch_sample"; + row.batch_size = batch_size; + row.sequence_length = seq_len; + row.tokens = batch_size * seq_len; + row.stats = timed.first; + row.tokens_per_sec = row.tokens / (row.stats.avg_ms / 1000.0); + row.samples = opt.runs; + push_row(rows, row); +} + +static void bench_primitives(const Options &opt, std::vector &rows) +{ + std::mt19937 rng(SEED); + struct Case + { + std::string name; + int B, T, D, E; + }; + std::vector cases = { + {"matmul_3d", 1, 16, N_EMBD, N_EMBD}, + {"matmul_3d", opt.batch_size, BLOCK_SIZE, N_EMBD, N_EMBD}, + {"softmax3d", opt.batch_size, BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE}, + {"layer_norm", opt.batch_size, BLOCK_SIZE, N_EMBD, N_EMBD}, + }; + + for (const auto &c : cases) + { + Tensor x = Tensor::randn({c.B, c.T, c.D}, 0.0f, 1.0f, rng); + Tensor w = Tensor::randn({c.D, c.E}, 0.0f, 0.02f, rng); + Tensor gamma = Tensor::ones({c.D}); + Tensor beta = Tensor::zeros({c.D}); + auto fn = [&]() -> double + { + Tensor out; + if (c.name == "softmax3d") + out = softmax3d(x); + else if (c.name == "layer_norm") + out = layer_norm(x, gamma, beta); + else + out = matmul(x, w); + return out.data.empty() ? 0.0 : out.data[0]; + }; + auto timed = run_timed(opt.runs, opt.warmup, fn); + BenchRow row; + row.suite = "primitive"; + row.name = c.name + "_" + std::to_string(c.B) + "x" + std::to_string(c.T); + row.batch_size = c.B; + row.sequence_length = c.T; + row.tokens = c.B * c.T; + row.stats = timed.first; + row.tokens_per_sec = row.tokens / (row.stats.avg_ms / 1000.0); + row.samples = opt.runs; + push_row(rows, row); + } +} + +static void bench_forward(const Options &opt, GPTLanguageModel &model, DataLoader &dl, std::vector &rows) +{ + std::mt19937 rng(SEED); + std::vector> cases = {{1, 8}, {1, BLOCK_SIZE}, {opt.batch_size, BLOCK_SIZE}}; + for (auto c : cases) + { + int B = c.first; + int T = c.second; + auto batch = dl.get_batch("train", B, T, rng); + float last_loss = 0.0f; + auto fn = [&]() -> double + { + auto out = model.forward(batch.first, B, T, batch.second, false); + last_loss = out.second; + return out.first.data.empty() ? 0.0 : out.first.data[0]; + }; + auto timed = run_timed(opt.runs, opt.warmup, fn); + BenchRow row; + row.suite = "forward"; + row.name = "batch" + std::to_string(B) + "_seq" + std::to_string(T); + row.batch_size = B; + row.sequence_length = T; + row.tokens = B * T; + row.stats = timed.first; + row.tokens_per_sec = row.tokens / (row.stats.avg_ms / 1000.0); + row.samples = opt.runs; + row.loss = last_loss; + row.has_loss = true; + row.memory_mb = estimated_model_mb(model); + push_row(rows, row); + } +} + +static void bench_training_step(const Options &opt, GPTLanguageModel &model, DataLoader &dl, std::vector &rows) +{ + std::mt19937 rng(SEED); + AdamWState optimizer = build_optimizer(model, LEARNING_RATE); + auto batch = dl.get_batch("train", opt.batch_size, BLOCK_SIZE, rng); + float last_loss = 0.0f; + + auto fn = [&]() -> double + { + SavedForward saved = forward_save(model, batch.first, opt.batch_size, BLOCK_SIZE, batch.second, true); + last_loss = cross_entropy(saved.logits2d, batch.second); + Grads grads = backward(model, saved); + apply_grads(model, grads, optimizer); + return last_loss; + }; + auto timed = run_timed(opt.train_steps, opt.warmup, fn); + BenchRow row; + row.suite = "training"; + row.name = "adamw_step"; + row.batch_size = opt.batch_size; + row.sequence_length = BLOCK_SIZE; + row.tokens = opt.batch_size * BLOCK_SIZE; + row.stats = timed.first; + row.tokens_per_sec = row.tokens / (row.stats.avg_ms / 1000.0); + row.samples = opt.train_steps; + row.loss = last_loss; + row.has_loss = true; + row.memory_mb = estimated_model_mb(model); + push_row(rows, row); +} + +static void bench_generation(const Options &opt, GPTLanguageModel &model, DataLoader &dl, std::vector &rows) +{ + std::vector> prompts = { + {"empty", ""}, + {"short", "The future of local AI is"}, + {"long", "Quadtrix is a compact transformer benchmark that measures "}}; + for (const auto &p : prompts) + { + std::vector ctx = dl.encode(p.second); + if (ctx.empty()) + ctx = {0}; + if ((int)ctx.size() > BLOCK_SIZE) + ctx = std::vector(ctx.end() - BLOCK_SIZE, ctx.end()); + int prompt_len = (int)ctx.size(); + auto fn = [&]() -> double + { + std::vector out = model.generate(ctx, opt.generate_tokens); + return (double)out.back(); + }; + auto timed = run_timed(opt.runs, opt.warmup, fn); + BenchRow row; + row.suite = "generation"; + row.name = p.first; + row.batch_size = 1; + row.sequence_length = prompt_len; + row.tokens = opt.generate_tokens; + row.stats = timed.first; + row.tokens_per_sec = row.tokens / (row.stats.avg_ms / 1000.0); + row.samples = opt.runs; + row.memory_mb = estimated_model_mb(model); + push_row(rows, row); + } +} + +static void ensure_dir(const std::string &dir) +{ +#if __has_include() + fs::create_directories(dir); +#else + (void)dir; +#endif +} + +static void save_results(const Options &opt, const GPTLanguageModel &model, const DataLoader &dl, const std::vector &rows) +{ + ensure_dir(opt.out_dir); + std::string json_path = opt.out_dir + "/cpp_benchmark.json"; + std::string csv_path = opt.out_dir + "/cpp_benchmark.csv"; + + std::ofstream j(json_path.c_str()); + j << std::fixed << std::setprecision(6); + j << "{\n"; + j << " \"schema_version\": 1,\n"; + j << " \"timestamp\": \"" << json_escape(now_iso()) << "\",\n"; + j << " \"backend\": \"cpp\",\n"; + j << " \"system\": {\n"; + j << " \"compiler\": \"" << json_escape( +#ifdef __VERSION__ + __VERSION__ +#else + "unknown" +#endif + ) << "\",\n"; + j << " \"standard\": \"C++17\"\n"; + j << " },\n"; + j << " \"model\": {\n"; + j << " \"vocab_size\": " << dl.vocab_size << ",\n"; + j << " \"block_size\": " << BLOCK_SIZE << ",\n"; + j << " \"n_embd\": " << N_EMBD << ",\n"; + j << " \"n_head\": " << N_HEAD << ",\n"; + j << " \"n_layer\": " << N_LAYER << ",\n"; + j << " \"dropout\": " << DROPOUT << ",\n"; + j << " \"parameters\": " << model.num_params() << ",\n"; + j << " \"parameter_mb_fp32\": " << estimated_model_mb(model) << "\n"; + j << " },\n"; + j << " \"config\": {\n"; + j << " \"data\": \"" << json_escape(opt.data_path) << "\",\n"; + j << " \"model\": \"" << json_escape(opt.model_path) << "\",\n"; + j << " \"runs\": " << opt.runs << ",\n"; + j << " \"warmup\": " << opt.warmup << ",\n"; + j << " \"batch_size\": " << opt.batch_size << ",\n"; + j << " \"train_steps\": " << opt.train_steps << ",\n"; + j << " \"generate_tokens\": " << opt.generate_tokens << ",\n"; + j << " \"max_data_chars\": " << opt.max_data_chars << ",\n"; + j << " \"random_weights\": " << (opt.random_weights ? "true" : "false") << "\n"; + j << " },\n"; + j << " \"results\": [\n"; + for (size_t i = 0; i < rows.size(); ++i) + { + const BenchRow &r = rows[i]; + j << " {\n"; + j << " \"suite\": \"" << json_escape(r.suite) << "\",\n"; + j << " \"name\": \"" << json_escape(r.name) << "\",\n"; + j << " \"backend\": \"cpp\",\n"; + j << " \"batch_size\": " << r.batch_size << ",\n"; + j << " \"sequence_length\": " << r.sequence_length << ",\n"; + j << " \"tokens\": " << r.tokens << ",\n"; + j << " \"avg_ms\": " << r.stats.avg_ms << ",\n"; + j << " \"median_ms\": " << r.stats.median_ms << ",\n"; + j << " \"min_ms\": " << r.stats.min_ms << ",\n"; + j << " \"max_ms\": " << r.stats.max_ms << ",\n"; + j << " \"p90_ms\": " << r.stats.p90_ms << ",\n"; + j << " \"p95_ms\": " << r.stats.p95_ms << ",\n"; + j << " \"std_ms\": " << r.stats.std_ms << ",\n"; + j << " \"tokens_per_sec\": " << r.tokens_per_sec << ",\n"; + j << " \"samples\": " << r.samples << ",\n"; + if (r.has_loss) + j << " \"loss\": " << r.loss << ",\n"; + else + j << " \"loss\": null,\n"; + j << " \"memory_mb\": " << r.memory_mb << ",\n"; + j << " \"notes\": \"" << json_escape(r.notes) << "\"\n"; + j << " }" << (i + 1 < rows.size() ? "," : "") << "\n"; + } + j << " ]\n"; + j << "}\n"; + + std::ofstream c(csv_path.c_str()); + c << "suite,name,backend,batch_size,sequence_length,tokens,avg_ms,median_ms,min_ms,max_ms,p90_ms,p95_ms,std_ms,tokens_per_sec,samples,loss,memory_mb,notes\n"; + c << std::fixed << std::setprecision(6); + for (const auto &r : rows) + { + c << r.suite << "," << r.name << ",cpp," + << r.batch_size << "," << r.sequence_length << "," << r.tokens << "," + << r.stats.avg_ms << "," << r.stats.median_ms << "," << r.stats.min_ms << "," + << r.stats.max_ms << "," << r.stats.p90_ms << "," << r.stats.p95_ms << "," + << r.stats.std_ms << "," << r.tokens_per_sec << "," << r.samples << ","; + if (r.has_loss) + c << r.loss; + c << "," << r.memory_mb << ",\"" << json_escape(r.notes) << "\"\n"; + } + + std::cout << "Saved " << json_path << "\n"; + std::cout << "Saved " << csv_path << "\n"; +} + +static Options parse_args(int argc, char **argv) +{ + Options opt; + for (int i = 1; i < argc; ++i) + { + std::string a = argv[i]; + if (a == "--data" && i + 1 < argc) + opt.data_path = argv[++i]; + else if (a == "--model" && i + 1 < argc) + opt.model_path = argv[++i]; + else if (a == "--out" && i + 1 < argc) + opt.out_dir = argv[++i]; + else if (a == "--runs" && i + 1 < argc) + opt.runs = std::atoi(argv[++i]); + else if (a == "--warmup" && i + 1 < argc) + opt.warmup = std::atoi(argv[++i]); + else if (a == "--batch-size" && i + 1 < argc) + opt.batch_size = std::atoi(argv[++i]); + else if (a == "--train-steps" && i + 1 < argc) + opt.train_steps = std::atoi(argv[++i]); + else if (a == "--generate-tokens" && i + 1 < argc) + opt.generate_tokens = std::atoi(argv[++i]); + else if (a == "--max-data-chars" && i + 1 < argc) + opt.max_data_chars = std::atoi(argv[++i]); + else if (a == "--random-weights") + opt.random_weights = true; + else if (a == "--quick") + opt.quick = true; + } + if (opt.quick) + { + opt.runs = 2; + opt.warmup = 1; + opt.train_steps = 1; + opt.generate_tokens = 4; + opt.max_data_chars = opt.max_data_chars > 0 ? std::min(opt.max_data_chars, 50000) : 50000; + } + opt.runs = std::max(1, opt.runs); + opt.warmup = std::max(0, opt.warmup); + opt.batch_size = std::max(1, opt.batch_size); + opt.train_steps = std::max(1, opt.train_steps); + opt.generate_tokens = std::max(1, opt.generate_tokens); + opt.max_data_chars = std::max(0, opt.max_data_chars); + return opt; +} + +int main(int argc, char **argv) +{ + Options opt = parse_args(argc, argv); + std::cout << "Quadtrix C++ Benchmark\n"; + std::cout << "Runs: " << opt.runs << ", warmup: " << opt.warmup << "\n"; + + DataLoader dl; + try + { + load_data(opt, dl); + } + catch (const std::exception &e) + { + std::cerr << e.what() << "\n"; + return 1; + } + + GPTLanguageModel model(dl.vocab_size, N_EMBD, N_HEAD, N_LAYER, BLOCK_SIZE, SEED); + if (!opt.random_weights && file_exists(opt.model_path)) + model.load(opt.model_path); + else + std::cout << "[INFO] Using random weights for model benchmarks.\n"; + + std::cout << "Parameters: " << model.num_params() << "\n"; + std::vector rows; + bench_data(opt, dl, rows); + bench_primitives(opt, rows); + bench_forward(opt, model, dl, rows); + bench_training_step(opt, model, dl, rows); + bench_generation(opt, model, dl, rows); + save_results(opt, model, dl, rows); + return 0; +}