diff --git a/benchmarks/speaker-id/.gitignore b/benchmarks/speaker-id/.gitignore
new file mode 100644
index 0000000..b9fca71
--- /dev/null
+++ b/benchmarks/speaker-id/.gitignore
@@ -0,0 +1,6 @@
+ground/
+meetings/
+results/
+.venv/
+__pycache__/
+*.pyc
diff --git a/benchmarks/speaker-id/README.md b/benchmarks/speaker-id/README.md
new file mode 100644
index 0000000..e513628
--- /dev/null
+++ b/benchmarks/speaker-id/README.md
@@ -0,0 +1,76 @@
+# Speaker ID Benchmark — dogfood edition
+
+A harness for measuring speaker identification quality against **your own meetings**. No synthetic corpora, no stitching — just label the meetings you already have and optimize against the conditions Char actually ships in.
+
+## Why this shape
+
+Speaker ID in `src/store.ts` uses hand-tuned thresholds with no way to measure the impact of changes. Academic corpora (VoxCeleb, AMI) measure different conditions from the ones Char users hit — read speech, studio mics, celebrity interviews.
+
+Your last 20 Char meetings are the right benchmark. You know who was there. You remember which identifications went wrong. An evening of labeling is faster than a week of synthetic harness plumbing that ends up measuring the wrong thing.
+
+## Pipeline
+
+```
+meeting-*.md (your exported meetings)
+  ↓ label.py (play turn, type name, once per speaker)
+ground/<meeting-id>.json
+  ↓ extract_embeddings.py (TODO — Swift bridge)
+meetings/<meeting-id>/embeddings.json
+  ↓ score.py
+results/<run>.json
+  ↓ report.py
+accuracy · unknown_rejection · false_accept · calibration_error
+```
+
+## Metrics
+
+Four numbers tracked together. Optimizing only the first is how you ship regressions:
+
+- **Identification accuracy** — enrolled speakers correctly identified
+- **Unknown rejection rate** — unseen speakers correctly marked "no match"
+- **False-accept rate** — unseen speakers confidently mislabeled as someone else
+- **Calibration error** — gap between claimed confidence and actual correctness
+
+For each meeting, half the labeled humans become "enrolled profiles" and the other half are "strangers" the matcher should reject. This keeps the 0.04 margin gate honest.
+
+## Labeling
+
+```
+cd benchmarks/speaker-id
+brew install ffmpeg   # for auto-preview on macOS
+python3 label.py ~/Documents/unsigned\ char/meeting-abc123.md
+```
+
+The labeler picks one random long turn per speaker, plays 5 seconds, and asks who they are. A 3-speaker meeting takes about a minute. Aim for 20 meetings — that gives you meaningful numbers without losing your evening.
+
+**Important:** only label meetings whose speakers are still unlabeled in the app (`Speaker 1`, `Speaker 2` in the export). If you've already assigned human names inside Char, the matcher's guesses will leak into the markdown and contaminate ground truth. Either export a fresh meeting before labeling, or switch to the upcoming `uchar meetings export --raw` command once it lands.
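+
+Each labeled meeting becomes one JSON file under `ground/` (the same shape `score.py` reads). A minimal example, assuming the export above, would be `ground/meeting-abc123.json`, with the names being whatever you typed:
+
+```
+{
+  "speakers": {
+    "Speaker 1": "John",
+    "Speaker 2": "Yujong"
+  }
+}
+```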
+
+## Scoring
+
+```
+python3 score.py --out results/latest.json
+python3 report.py results/latest.json --baseline baseline.json
+```
+
+## Directory layout
+
+```
+benchmarks/speaker-id/
+├── README.md       # this file
+├── label.py        # interactive labeler
+├── score.py        # runs store.ts matching logic against embeddings + ground
+├── report.py       # prints metrics table with baseline delta
+├── ground/         # gitignored — your labels, your voices
+├── meetings/       # gitignored — per-meeting embeddings.json
+├── results/        # gitignored — metric runs
+└── baseline.json   # committed — main-branch numbers, no audio in it
+```
+
+## What's missing
+
+- **`extract_embeddings.py`** — the Swift speaker-embedding extractor needs a CLI entry point. Until that lands, `embeddings.json` doesn't get generated. Tracked in #3.
+- **`baseline.json`** — populated once the pipeline runs end-to-end on real meetings.
+
+## Note on privacy
+
+`ground/` and `meetings/` are gitignored on purpose. These directories hold your voice samples. Never commit them. `baseline.json` contains only aggregate numbers and is safe to share.
diff --git a/benchmarks/speaker-id/label.py b/benchmarks/speaker-id/label.py
new file mode 100644
index 0000000..2a7d015
--- /dev/null
+++ b/benchmarks/speaker-id/label.py
@@ -0,0 +1,235 @@
+#!/usr/bin/env python3
+"""Label speakers in an unsigned Char meeting.
+
+Reads a meeting markdown export, plays a sample turn for each speaker, and
+asks who they are. Saves ground-truth labels to ground/<meeting-id>.json.
+
+Label by SPEAKER, not by turn. For a 30-minute meeting with 3 speakers, you
+answer 3 questions — not 80.
+
+Works against meetings whose speakers are still unlabeled in the app
+(Speaker 1, Speaker 2, ...). If you've already typed human names into the
+app, export a fresh meeting before labeling — otherwise the matcher's
+guesses leak into ground truth.
+
+Usage:
+    python label.py ~/Documents/unsigned\\ char/meeting-abc123.md
+
+Keys:
+  <name>   assign this speaker
+  ?  
replay a different turn from this speaker
+  s        skip (not labeled — excluded from scoring)
+  q        save and quit
+"""
+
+import argparse
+import json
+import random
+import re
+import subprocess
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+
+
+PREVIEW_SECONDS = 5.0
+# Matches turn lines like "- Speaker 1: 01:23-01:47" in the "## Speaker Turns" section.
+SPEAKER_TURN_PATTERN = re.compile(
+    r"^\s*-\s+(?P<speaker>.+?)\s*:\s+"
+    r"(?P<start>\d{1,2}:\d{2})-(?P<end>\d{1,2}:\d{2})\s*$",
+    re.MULTILINE,
+)
+
+
+@dataclass
+class Turn:
+    speaker: str
+    start_seconds: float
+    end_seconds: float
+
+
+def parse_clock(value: str) -> float:
+    minutes, seconds = value.split(":")
+    return int(minutes) * 60 + int(seconds)
+
+
+def parse_meeting(markdown_path: Path) -> tuple[dict, list[Turn], Path]:
+    text = markdown_path.read_text()
+
+    frontmatter = {}
+    if text.startswith("---"):
+        end = text.find("\n---", 3)
+        if end > 0:
+            for line in text[3:end].splitlines():
+                if ":" in line:
+                    key, value = line.split(":", 1)
+                    frontmatter[key.strip()] = value.strip()
+            text = text[end + 4 :]
+
+    audio_path_raw = frontmatter.get("audio_path", "").strip()
+    if not audio_path_raw:
+        raise SystemExit(f"No audio_path in frontmatter of {markdown_path}")
+    audio_path = Path(audio_path_raw).expanduser()
+    if not audio_path.is_absolute():
+        audio_path = (markdown_path.parent / audio_path).resolve()
+    if not audio_path.exists():
+        raise SystemExit(f"Audio file not found: {audio_path}")
+
+    turns_section = text.split("## Speaker Turns", 1)
+    if len(turns_section) < 2:
+        raise SystemExit(f"No Speaker Turns section in {markdown_path}")
+
+    turns = []
+    for match in SPEAKER_TURN_PATTERN.finditer(turns_section[1]):
+        speaker = match.group("speaker").strip()
+        if speaker.lower() == "pipeline":
+            continue
+        turns.append(
+            Turn(
+                speaker=speaker,
+                start_seconds=parse_clock(match.group("start")),
+                end_seconds=parse_clock(match.group("end")),
+            )
+        )
+
+    if not turns:
+        raise SystemExit(f"No diarization turns parsed from {markdown_path}")
+
+    return frontmatter, turns, audio_path
+
+
+def group_by_speaker(turns: list[Turn]) -> dict[str, list[Turn]]:
+    groups: dict[str, list[Turn]] = {}
+    for turn in turns:
+        groups.setdefault(turn.speaker, []).append(turn)
+    for turns_list in groups.values():
+        turns_list.sort(key=lambda t: t.end_seconds - t.start_seconds, reverse=True)
+    return groups
+
+
+def play_clip(audio_path: Path, start: float, duration: float) -> None:
+    if sys.platform != "darwin":
+        print(f"  (auto-play unsupported on {sys.platform}: seek to {start:.1f}s)")
+        return
+    tmp = Path("/tmp") / f"uchar_label_{start:.0f}.wav"
+    try:
+        subprocess.run(
+            [
+                "ffmpeg",
+                "-y",
+                "-loglevel",
+                "error",
+                "-ss",
+                str(start),
+                "-t",
+                str(duration),
+                "-i",
+                str(audio_path),
+                str(tmp),
+            ],
+            check=True,
+        )
+    except FileNotFoundError:
+        print("  ffmpeg not installed. 
`brew install ffmpeg` to enable preview.") + return + subprocess.run(["afplay", str(tmp)], check=False) + tmp.unlink(missing_ok=True) + + +def pick_preview_turn(turns: list[Turn], rng: random.Random) -> Turn: + long_enough = [t for t in turns if t.end_seconds - t.start_seconds >= 2.0] + pool = long_enough or turns + return rng.choice(pool[: max(5, len(pool) // 2)]) + + +def load_existing(path: Path) -> dict: + if not path.exists(): + return {"speakers": {}} + data = json.loads(path.read_text()) + data.setdefault("speakers", {}) + return data + + +def save_labels(path: Path, payload: dict) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, indent=2)) + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("meeting", type=Path, help="Path to meeting-*.md export") + parser.add_argument( + "--out", + type=Path, + default=Path(__file__).parent / "ground", + help="Output directory for ground-truth labels", + ) + parser.add_argument( + "--preview-seconds", + type=float, + default=PREVIEW_SECONDS, + ) + parser.add_argument("--seed", type=int, default=42) + args = parser.parse_args() + + frontmatter, turns, audio_path = parse_meeting(args.meeting) + meeting_id = frontmatter.get("id", args.meeting.stem) + ground_path = args.out / f"{meeting_id}.json" + existing = load_existing(ground_path) + rng = random.Random(args.seed) + + groups = group_by_speaker(turns) + print(f"meeting: {meeting_id}") + print(f"audio: {audio_path}") + print(f"speakers: {len(groups)} turns: {len(turns)}") + print() + + for speaker_id, speaker_turns in groups.items(): + if speaker_id in existing["speakers"]: + print(f"{speaker_id}: already labeled as '{existing['speakers'][speaker_id]}' — skipping") + continue + + total = sum(t.end_seconds - t.start_seconds for t in speaker_turns) + print(f"— {speaker_id} — {len(speaker_turns)} turns, {total:.0f}s total") + + while True: + preview = pick_preview_turn(speaker_turns, rng) + duration = min(args.preview_seconds, preview.end_seconds - preview.start_seconds) + print(f" sample: {preview.start_seconds:.0f}s–{preview.end_seconds:.0f}s") + play_clip(audio_path, preview.start_seconds, duration) + + try: + response = input(" who is this? > ").strip() + except EOFError: + response = "q" + + if response == "?": + continue + if response == "s": + label = None + break + if response == "q": + save_labels(ground_path, existing) + print(f"saved {ground_path}") + return 0 + if not response: + print(" (type a name, '?' to replay, 's' to skip, 'q' to quit)") + continue + + label = response + break + + if label is not None: + existing["speakers"][speaker_id] = label + save_labels(ground_path, existing) + print(f" → {label}") + else: + print(" → skipped") + + save_labels(ground_path, existing) + labeled = sum(1 for v in existing["speakers"].values() if v) + print(f"\ndone. {labeled}/{len(groups)} speakers labeled → {ground_path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/benchmarks/speaker-id/report.py b/benchmarks/speaker-id/report.py new file mode 100755 index 0000000..815e118 --- /dev/null +++ b/benchmarks/speaker-id/report.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 +"""Print metrics from a score.py results file. 
+ +Emits four numbers that must move together — optimizing any one alone is +how regressions ship: + + accuracy enrolled speakers correctly identified + unknown_rejection strangers correctly marked "no match" + false_accept strangers confidently labeled as someone else + calibration_error mean gap between claimed confidence and actual correctness +""" + +import argparse +import json +from pathlib import Path +from statistics import mean + + +def metrics(results: dict) -> dict: + turns = [t for m in results["meetings"] for t in m["predictions"]] + enrolled = [t for t in turns if t["enrolled"]] + strangers = [t for t in turns if not t["enrolled"]] + + if enrolled: + correct = sum(1 for t in enrolled if t["predicted"] == t["speaker_truth"]) + missed = sum(1 for t in enrolled if t["predicted"] is None) + wrong = sum(1 for t in enrolled if t["predicted"] and t["predicted"] != t["speaker_truth"]) + accuracy = correct / len(enrolled) + miss_rate = missed / len(enrolled) + confusion_rate = wrong / len(enrolled) + else: + accuracy = miss_rate = confusion_rate = 0.0 + + if strangers: + rejected = sum(1 for t in strangers if t["predicted"] is None) + unknown_rejection = rejected / len(strangers) + false_accept = 1 - unknown_rejection + else: + unknown_rejection = false_accept = 0.0 + + confidences = [t for t in enrolled if t["confidence"] is not None] + if confidences: + bins = [[] for _ in range(10)] + for t in confidences: + idx = min(9, int(t["confidence"] * 10)) + bins[idx].append(1 if t["predicted"] == t["speaker_truth"] else 0) + per_bin = [ + (i / 10 + 0.05, mean(b)) for i, b in enumerate(bins) if b + ] + calibration_error = mean(abs(claimed - actual) for claimed, actual in per_bin) + else: + calibration_error = 0.0 + + return { + "n_enrolled_turns": len(enrolled), + "n_stranger_turns": len(strangers), + "accuracy": round(accuracy, 4), + "miss_rate": round(miss_rate, 4), + "confusion_rate": round(confusion_rate, 4), + "unknown_rejection": round(unknown_rejection, 4), + "false_accept": round(false_accept, 4), + "calibration_error": round(calibration_error, 4), + } + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("results", type=Path) + parser.add_argument("--baseline", type=Path, help="baseline.json to diff against") + args = parser.parse_args() + + current = metrics(json.loads(args.results.read_text())) + print(f"{'metric':20} {'value':>10}", end="") + if args.baseline and args.baseline.exists(): + baseline = json.loads(args.baseline.read_text()) + print(f" {'baseline':>10} {'delta':>10}") + for key, value in current.items(): + base = baseline.get(key, 0) + delta = value - base if isinstance(value, (int, float)) else "" + delta_str = f"{delta:+.4f}" if isinstance(delta, float) else str(delta) + print(f"{key:20} {value:>10} {base:>10} {delta_str:>10}") + else: + print() + for key, value in current.items(): + print(f"{key:20} {value:>10}") + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/benchmarks/speaker-id/score.py b/benchmarks/speaker-id/score.py new file mode 100755 index 0000000..88abcf5 --- /dev/null +++ b/benchmarks/speaker-id/score.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python3 +"""Score unsigned Char's speaker matching against labeled meetings. 
+ +Expects ground truth from label.py — a per-meeting JSON with: + + {"speakers": {"Speaker 1": "John", "Speaker 2": "Yujong", ...}} + +And an embeddings.json per meeting produced by the Swift pipeline: + + { + "speakers": { + "Speaker 1": [[...], [...]], // one embedding per turn + "Speaker 2": [[...]] + } + } + +Runs store.ts matching logic in Python. For each meeting, half the *labeled* +human identities become enrolled profiles, the other half are strangers the +matcher should reject. + +Emits results/.json with per-turn predictions. +""" + +import argparse +import json +import random +from pathlib import Path + + +def cosine(a: list[float], b: list[float]) -> float: + if not a or len(a) != len(b): + return 0.0 + dot = sum(x * y for x, y in zip(a, b)) + na = sum(x * x for x in a) ** 0.5 + nb = sum(x * x for x in b) ** 0.5 + return dot / (na * nb) if na and nb else 0.0 + + +def normalized_centroid(embeddings: list[list[float]]) -> list[float]: + if not embeddings or not embeddings[0]: + return [] + dim = len(embeddings[0]) + acc = [0.0] * dim + for emb in embeddings: + if len(emb) == dim: + for i, v in enumerate(emb): + acc[i] += v + norm = sum(v * v for v in acc) ** 0.5 + return [v / norm for v in acc] if norm else acc + + +def score_profile(embedding: list[float], profile: dict) -> dict: + centroid_score = cosine(embedding, profile["centroid"]) + sample_scores = sorted( + (cosine(embedding, s) for s in profile["samples"]), + reverse=True, + ) + best_sample_score = sample_scores[0] if sample_scores else 0.0 + return { + "profile_id": profile["id"], + "profile_name": profile["name"], + "score": max(best_sample_score, centroid_score), + "centroid_score": centroid_score, + "best_sample_score": best_sample_score, + "sample_count": len(profile["samples"]), + } + + +def recommend(embedding: list[float], profiles: list[dict]) -> dict | None: + ranked = sorted( + (score_profile(embedding, p) for p in profiles), + key=lambda r: r["score"], + reverse=True, + ) + ranked = [r for r in ranked if r["score"] > 0] + if not ranked: + return None + best = ranked[0] + alternate_score = ranked[1]["score"] if len(ranked) > 1 else 0.0 + + threshold = 0.7 if best["sample_count"] >= 3 else 0.74 + if best["score"] < threshold: + return None + if best["score"] < 0.82 and best["score"] - alternate_score < 0.04: + return None + + return { + "profile_name": best["profile_name"], + "confidence": best["score"], + "alternate_confidence": alternate_score, + } + + +def split_people(people: list[str], seed: int) -> tuple[set[str], set[str]]: + rng = random.Random(seed) + shuffled = sorted(people) + rng.shuffle(shuffled) + half = max(1, len(shuffled) // 2) + return set(shuffled[:half]), set(shuffled[half:]) + + +def score_meeting(meeting_dir: Path, seed: int) -> dict: + ground = json.loads((meeting_dir / "ground.json").read_text()) + embeddings = json.loads((meeting_dir / "embeddings.json").read_text()) + + speaker_to_person = { + speaker: person + for speaker, person in ground["speakers"].items() + if person and not person.startswith("__") + } + if not speaker_to_person: + return {"meeting": meeting_dir.name, "predictions": [], "skipped": "no labels"} + + people = sorted(set(speaker_to_person.values())) + enrolled, strangers = split_people(people, seed) + + profiles = [] + for speaker, person in speaker_to_person.items(): + if person not in enrolled: + continue + samples = embeddings["speakers"].get(speaker, []) + if len(samples) < 2: + continue + profiles.append( + { + "id": f"profile_{person}", + "name": person, + 
"centroid": normalized_centroid(samples), + "samples": samples, + } + ) + + predictions = [] + for speaker, person in speaker_to_person.items(): + turn_embeddings = embeddings["speakers"].get(speaker, []) + for turn_embedding in turn_embeddings: + suggestion = recommend(turn_embedding, profiles) + predictions.append( + { + "speaker_id": speaker, + "truth": person, + "enrolled": person in enrolled, + "predicted": suggestion["profile_name"] if suggestion else None, + "confidence": suggestion["confidence"] if suggestion else None, + } + ) + + return { + "meeting": meeting_dir.name, + "enrolled": sorted(enrolled), + "strangers": sorted(strangers), + "predictions": predictions, + } + + +def collect_meetings(root: Path, ground_dir: Path) -> list[Path]: + """Pair every ground/.json with meetings//embeddings.json.""" + pairs = [] + for ground_file in sorted(ground_dir.glob("*.json")): + meeting_dir = root / ground_file.stem + if (meeting_dir / "embeddings.json").exists(): + ground_target = meeting_dir / "ground.json" + if not ground_target.exists() or ground_target.read_text() != ground_file.read_text(): + meeting_dir.mkdir(parents=True, exist_ok=True) + ground_target.write_text(ground_file.read_text()) + pairs.append(meeting_dir) + return pairs + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--meetings", + type=Path, + default=Path(__file__).parent / "meetings", + ) + parser.add_argument( + "--ground", + type=Path, + default=Path(__file__).parent / "ground", + ) + parser.add_argument("--out", type=Path, required=True) + parser.add_argument("--seed", type=int, default=42) + args = parser.parse_args() + + meetings = collect_meetings(args.meetings, args.ground) + if not meetings: + raise SystemExit( + f"No meetings with both ground/*.json and {args.meetings}/*/embeddings.json" + ) + + results = [score_meeting(meeting_dir, args.seed) for meeting_dir in meetings] + for result in results: + print(f"scored {result['meeting']} turns={len(result.get('predictions', []))}") + + args.out.parent.mkdir(parents=True, exist_ok=True) + args.out.write_text(json.dumps({"meetings": results}, indent=2)) + print(f"wrote {args.out}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main())