diff --git a/benchmarks/speaker-id/.gitignore b/benchmarks/speaker-id/.gitignore
new file mode 100644
index 0000000..b9fca71
--- /dev/null
+++ b/benchmarks/speaker-id/.gitignore
@@ -0,0 +1,6 @@
+ground/
+meetings/
+results/
+.venv/
+__pycache__/
+*.pyc
diff --git a/benchmarks/speaker-id/README.md b/benchmarks/speaker-id/README.md
new file mode 100644
index 0000000..e513628
--- /dev/null
+++ b/benchmarks/speaker-id/README.md
@@ -0,0 +1,76 @@
+# Speaker ID Benchmark — dogfood edition
+
+A harness for measuring speaker identification quality against **your own meetings**. No synthetic corpora, no stitching — just label the meetings you already have and optimize against the conditions Char actually ships in.
+
+## Why this shape
+
+Speaker ID in `src/store.ts` uses hand-tuned thresholds with no way to measure the impact of changes. Academic corpora (VoxCeleb, AMI) measure different conditions from the ones Char users hit — read speech, studio mics, celebrity interviews.
+
+Your last 20 Char meetings are the right benchmark. You know who was there. You remember which identifications went wrong. An evening of labeling is faster than a week of synthetic harness plumbing that ends up measuring the wrong thing.
+
+## Pipeline
+
+```
+meeting-*.md (your exported meetings)
+  ↓ label.py (play turn, type name, once per speaker)
+ground/<meeting-id>.json
+  ↓ extract_embeddings.py (TODO — Swift bridge)
+meetings/<meeting-id>/embeddings.json
+  ↓ score.py
+results/<run>.json
+  ↓ report.py
+accuracy · unknown_rejection · false_accept · calibration_error
+```
+
+## Metrics
+
+Four numbers tracked together. Optimizing only the first is how you ship regressions:
+
+- **Identification accuracy** — enrolled speakers correctly identified
+- **Unknown rejection rate** — unseen speakers correctly marked "no match"
+- **False-accept rate** — unseen speakers confidently mislabeled as someone else
+- **Calibration error** — gap between claimed confidence and actual correctness
+
+For each meeting, half the labeled humans become "enrolled profiles" and the other half are "strangers" the matcher should reject. This keeps the 0.04 margin gate honest.
+
+## Labeling
+
+```
+cd benchmarks/speaker-id
+brew install ffmpeg   # for auto-preview on macOS
+python3 label.py ~/Documents/unsigned\ char/meeting-abc123.md
+```
+
+The labeler picks one random long turn per speaker, plays 5 seconds, and asks who they are. A 3-speaker meeting takes about a minute. Aim for 20 meetings — that gives you meaningful numbers without losing your evening.
+
+**Important:** only label meetings whose speakers are still unlabeled in the app (`Speaker 1`, `Speaker 2` in the export). If you've already assigned human names inside Char, the matcher's guesses will leak into the markdown and contaminate ground truth. Either export a fresh meeting before labeling, or switch to the upcoming `uchar meetings export --raw` command once it lands.
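+
+Each labeled meeting becomes one JSON file under `ground/` (the same shape `score.py` reads). A minimal example, assuming the export above, would be `ground/meeting-abc123.json`, with the names being whatever you typed:
+
+```
+{
+  "speakers": {
+    "Speaker 1": "John",
+    "Speaker 2": "Yujong"
+  }
+}
+```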
+
+## Scoring
+
+```
+python3 score.py --out results/latest.json
+python3 report.py results/latest.json --baseline baseline.json
+```
+
+## Directory layout
+
+```
+benchmarks/speaker-id/
+├── README.md       # this file
+├── label.py        # interactive labeler
+├── score.py        # runs store.ts matching logic against embeddings + ground
+├── report.py       # prints metrics table with baseline delta
+├── ground/         # gitignored — your labels, your voices
+├── meetings/       # gitignored — per-meeting embeddings.json
+├── results/        # gitignored — metric runs
+└── baseline.json   # committed — main-branch numbers, no audio in it
+```
+
+## What's missing
+
+- **`extract_embeddings.py`** — the Swift speaker-embedding extractor needs a CLI entry point. Until that lands, `embeddings.json` doesn't get generated. Tracked in #3.
+- **`baseline.json`** — populated once the pipeline runs end-to-end on real meetings.
+
+## Note on privacy
+
+`ground/` and `meetings/` are gitignored on purpose. These directories hold your voice samples. Never commit them. `baseline.json` contains only aggregate numbers and is safe to share.
diff --git a/benchmarks/speaker-id/label.py b/benchmarks/speaker-id/label.py
new file mode 100644
index 0000000..2a7d015
--- /dev/null
+++ b/benchmarks/speaker-id/label.py
@@ -0,0 +1,235 @@
+#!/usr/bin/env python3
+"""Label speakers in an unsigned Char meeting.
+
+Reads a meeting markdown export, plays a sample turn for each speaker, and
+asks who they are. Saves ground-truth labels to ground/<meeting-id>.json.
+
+Label by SPEAKER, not by turn. For a 30-minute meeting with 3 speakers, you
+answer 3 questions — not 80.
+
+Works against meetings whose speakers are still unlabeled in the app
+(Speaker 1, Speaker 2, ...). If you've already typed human names into the
+app, export a fresh meeting before labeling — otherwise the matcher's
+guesses leak into ground truth.
+
+Usage:
+    python label.py ~/Documents/unsigned\\ char/meeting-abc123.md
+
+Keys:
+  <name>   assign this speaker
+  ?  
replay a different turn from this speaker
+  s        skip (not labeled — excluded from scoring)
+  q        save and quit
+"""
+
+import argparse
+import json
+import random
+import re
+import subprocess
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+
+
+PREVIEW_SECONDS = 5.0
+# Matches turn lines like "- Speaker 1: 01:23-01:47" in the "## Speaker Turns" section.
+SPEAKER_TURN_PATTERN = re.compile(
+    r"^\s*-\s+(?P<speaker>.+?)\s*:\s+"
+    r"(?P<start>\d{1,2}:\d{2})-(?P<end>\d{1,2}:\d{2})\s*$",
+    re.MULTILINE,
+)
+
+
+@dataclass
+class Turn:
+    speaker: str
+    start_seconds: float
+    end_seconds: float
+
+
+def parse_clock(value: str) -> float:
+    minutes, seconds = value.split(":")
+    return int(minutes) * 60 + int(seconds)
+
+
+def parse_meeting(markdown_path: Path) -> tuple[dict, list[Turn], Path]:
+    text = markdown_path.read_text()
+
+    frontmatter = {}
+    if text.startswith("---"):
+        end = text.find("\n---", 3)
+        if end > 0:
+            for line in text[3:end].splitlines():
+                if ":" in line:
+                    key, value = line.split(":", 1)
+                    frontmatter[key.strip()] = value.strip()
+            text = text[end + 4 :]
+
+    audio_path_raw = frontmatter.get("audio_path", "").strip()
+    if not audio_path_raw:
+        raise SystemExit(f"No audio_path in frontmatter of {markdown_path}")
+    audio_path = Path(audio_path_raw).expanduser()
+    if not audio_path.is_absolute():
+        audio_path = (markdown_path.parent / audio_path).resolve()
+    if not audio_path.exists():
+        raise SystemExit(f"Audio file not found: {audio_path}")
+
+    turns_section = text.split("## Speaker Turns", 1)
+    if len(turns_section) < 2:
+        raise SystemExit(f"No Speaker Turns section in {markdown_path}")
+
+    turns = []
+    for match in SPEAKER_TURN_PATTERN.finditer(turns_section[1]):
+        speaker = match.group("speaker").strip()
+        if speaker.lower() == "pipeline":
+            continue
+        turns.append(
+            Turn(
+                speaker=speaker,
+                start_seconds=parse_clock(match.group("start")),
+                end_seconds=parse_clock(match.group("end")),
+            )
+        )
+
+    if not turns:
+        raise SystemExit(f"No diarization turns parsed from {markdown_path}")
+
+    return frontmatter, turns, audio_path
+
+
+def group_by_speaker(turns: list[Turn]) -> dict[str, list[Turn]]:
+    groups: dict[str, list[Turn]] = {}
+    for turn in turns:
+        groups.setdefault(turn.speaker, []).append(turn)
+    for turns_list in groups.values():
+        turns_list.sort(key=lambda t: t.end_seconds - t.start_seconds, reverse=True)
+    return groups
+
+
+def play_clip(audio_path: Path, start: float, duration: float) -> None:
+    if sys.platform != "darwin":
+        print(f"  (auto-play unsupported on {sys.platform}: seek to {start:.1f}s)")
+        return
+    tmp = Path("/tmp") / f"uchar_label_{start:.0f}.wav"
+    try:
+        subprocess.run(
+            [
+                "ffmpeg",
+                "-y",
+                "-loglevel",
+                "error",
+                "-ss",
+                str(start),
+                "-t",
+                str(duration),
+                "-i",
+                str(audio_path),
+                str(tmp),
+            ],
+            check=True,
+        )
+    except FileNotFoundError:
+        print("  ffmpeg not installed. 
`brew install ffmpeg` to enable preview.") + return + subprocess.run(["afplay", str(tmp)], check=False) + tmp.unlink(missing_ok=True) + + +def pick_preview_turn(turns: list[Turn], rng: random.Random) -> Turn: + long_enough = [t for t in turns if t.end_seconds - t.start_seconds >= 2.0] + pool = long_enough or turns + return rng.choice(pool[: max(5, len(pool) // 2)]) + + +def load_existing(path: Path) -> dict: + if not path.exists(): + return {"speakers": {}} + data = json.loads(path.read_text()) + data.setdefault("speakers", {}) + return data + + +def save_labels(path: Path, payload: dict) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, indent=2)) + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("meeting", type=Path, help="Path to meeting-*.md export") + parser.add_argument( + "--out", + type=Path, + default=Path(__file__).parent / "ground", + help="Output directory for ground-truth labels", + ) + parser.add_argument( + "--preview-seconds", + type=float, + default=PREVIEW_SECONDS, + ) + parser.add_argument("--seed", type=int, default=42) + args = parser.parse_args() + + frontmatter, turns, audio_path = parse_meeting(args.meeting) + meeting_id = frontmatter.get("id", args.meeting.stem) + ground_path = args.out / f"{meeting_id}.json" + existing = load_existing(ground_path) + rng = random.Random(args.seed) + + groups = group_by_speaker(turns) + print(f"meeting: {meeting_id}") + print(f"audio: {audio_path}") + print(f"speakers: {len(groups)} turns: {len(turns)}") + print() + + for speaker_id, speaker_turns in groups.items(): + if speaker_id in existing["speakers"]: + print(f"{speaker_id}: already labeled as '{existing['speakers'][speaker_id]}' — skipping") + continue + + total = sum(t.end_seconds - t.start_seconds for t in speaker_turns) + print(f"— {speaker_id} — {len(speaker_turns)} turns, {total:.0f}s total") + + while True: + preview = pick_preview_turn(speaker_turns, rng) + duration = min(args.preview_seconds, preview.end_seconds - preview.start_seconds) + print(f" sample: {preview.start_seconds:.0f}s–{preview.end_seconds:.0f}s") + play_clip(audio_path, preview.start_seconds, duration) + + try: + response = input(" who is this? > ").strip() + except EOFError: + response = "q" + + if response == "?": + continue + if response == "s": + label = None + break + if response == "q": + save_labels(ground_path, existing) + print(f"saved {ground_path}") + return 0 + if not response: + print(" (type a name, '?' to replay, 's' to skip, 'q' to quit)") + continue + + label = response + break + + if label is not None: + existing["speakers"][speaker_id] = label + save_labels(ground_path, existing) + print(f" → {label}") + else: + print(" → skipped") + + save_labels(ground_path, existing) + labeled = sum(1 for v in existing["speakers"].values() if v) + print(f"\ndone. {labeled}/{len(groups)} speakers labeled → {ground_path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/benchmarks/speaker-id/report.py b/benchmarks/speaker-id/report.py new file mode 100755 index 0000000..815e118 --- /dev/null +++ b/benchmarks/speaker-id/report.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 +"""Print metrics from a score.py results file. 
+ +Emits four numbers that must move together — optimizing any one alone is +how regressions ship: + + accuracy enrolled speakers correctly identified + unknown_rejection strangers correctly marked "no match" + false_accept strangers confidently labeled as someone else + calibration_error mean gap between claimed confidence and actual correctness +""" + +import argparse +import json +from pathlib import Path +from statistics import mean + + +def metrics(results: dict) -> dict: + turns = [t for m in results["meetings"] for t in m["predictions"]] + enrolled = [t for t in turns if t["enrolled"]] + strangers = [t for t in turns if not t["enrolled"]] + + if enrolled: + correct = sum(1 for t in enrolled if t["predicted"] == t["speaker_truth"]) + missed = sum(1 for t in enrolled if t["predicted"] is None) + wrong = sum(1 for t in enrolled if t["predicted"] and t["predicted"] != t["speaker_truth"]) + accuracy = correct / len(enrolled) + miss_rate = missed / len(enrolled) + confusion_rate = wrong / len(enrolled) + else: + accuracy = miss_rate = confusion_rate = 0.0 + + if strangers: + rejected = sum(1 for t in strangers if t["predicted"] is None) + unknown_rejection = rejected / len(strangers) + false_accept = 1 - unknown_rejection + else: + unknown_rejection = false_accept = 0.0 + + confidences = [t for t in enrolled if t["confidence"] is not None] + if confidences: + bins = [[] for _ in range(10)] + for t in confidences: + idx = min(9, int(t["confidence"] * 10)) + bins[idx].append(1 if t["predicted"] == t["speaker_truth"] else 0) + per_bin = [ + (i / 10 + 0.05, mean(b)) for i, b in enumerate(bins) if b + ] + calibration_error = mean(abs(claimed - actual) for claimed, actual in per_bin) + else: + calibration_error = 0.0 + + return { + "n_enrolled_turns": len(enrolled), + "n_stranger_turns": len(strangers), + "accuracy": round(accuracy, 4), + "miss_rate": round(miss_rate, 4), + "confusion_rate": round(confusion_rate, 4), + "unknown_rejection": round(unknown_rejection, 4), + "false_accept": round(false_accept, 4), + "calibration_error": round(calibration_error, 4), + } + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("results", type=Path) + parser.add_argument("--baseline", type=Path, help="baseline.json to diff against") + args = parser.parse_args() + + current = metrics(json.loads(args.results.read_text())) + print(f"{'metric':20} {'value':>10}", end="") + if args.baseline and args.baseline.exists(): + baseline = json.loads(args.baseline.read_text()) + print(f" {'baseline':>10} {'delta':>10}") + for key, value in current.items(): + base = baseline.get(key, 0) + delta = value - base if isinstance(value, (int, float)) else "" + delta_str = f"{delta:+.4f}" if isinstance(delta, float) else str(delta) + print(f"{key:20} {value:>10} {base:>10} {delta_str:>10}") + else: + print() + for key, value in current.items(): + print(f"{key:20} {value:>10}") + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/benchmarks/speaker-id/score.py b/benchmarks/speaker-id/score.py new file mode 100755 index 0000000..88abcf5 --- /dev/null +++ b/benchmarks/speaker-id/score.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python3 +"""Score unsigned Char's speaker matching against labeled meetings. 
+ +Expects ground truth from label.py — a per-meeting JSON with: + + {"speakers": {"Speaker 1": "John", "Speaker 2": "Yujong", ...}} + +And an embeddings.json per meeting produced by the Swift pipeline: + + { + "speakers": { + "Speaker 1": [[...], [...]], // one embedding per turn + "Speaker 2": [[...]] + } + } + +Runs store.ts matching logic in Python. For each meeting, half the *labeled* +human identities become enrolled profiles, the other half are strangers the +matcher should reject. + +Emits results/.json with per-turn predictions. +""" + +import argparse +import json +import random +from pathlib import Path + + +def cosine(a: list[float], b: list[float]) -> float: + if not a or len(a) != len(b): + return 0.0 + dot = sum(x * y for x, y in zip(a, b)) + na = sum(x * x for x in a) ** 0.5 + nb = sum(x * x for x in b) ** 0.5 + return dot / (na * nb) if na and nb else 0.0 + + +def normalized_centroid(embeddings: list[list[float]]) -> list[float]: + if not embeddings or not embeddings[0]: + return [] + dim = len(embeddings[0]) + acc = [0.0] * dim + for emb in embeddings: + if len(emb) == dim: + for i, v in enumerate(emb): + acc[i] += v + norm = sum(v * v for v in acc) ** 0.5 + return [v / norm for v in acc] if norm else acc + + +def score_profile(embedding: list[float], profile: dict) -> dict: + centroid_score = cosine(embedding, profile["centroid"]) + sample_scores = sorted( + (cosine(embedding, s) for s in profile["samples"]), + reverse=True, + ) + best_sample_score = sample_scores[0] if sample_scores else 0.0 + return { + "profile_id": profile["id"], + "profile_name": profile["name"], + "score": max(best_sample_score, centroid_score), + "centroid_score": centroid_score, + "best_sample_score": best_sample_score, + "sample_count": len(profile["samples"]), + } + + +def recommend(embedding: list[float], profiles: list[dict]) -> dict | None: + ranked = sorted( + (score_profile(embedding, p) for p in profiles), + key=lambda r: r["score"], + reverse=True, + ) + ranked = [r for r in ranked if r["score"] > 0] + if not ranked: + return None + best = ranked[0] + alternate_score = ranked[1]["score"] if len(ranked) > 1 else 0.0 + + threshold = 0.7 if best["sample_count"] >= 3 else 0.74 + if best["score"] < threshold: + return None + if best["score"] < 0.82 and best["score"] - alternate_score < 0.04: + return None + + return { + "profile_name": best["profile_name"], + "confidence": best["score"], + "alternate_confidence": alternate_score, + } + + +def split_people(people: list[str], seed: int) -> tuple[set[str], set[str]]: + rng = random.Random(seed) + shuffled = sorted(people) + rng.shuffle(shuffled) + half = max(1, len(shuffled) // 2) + return set(shuffled[:half]), set(shuffled[half:]) + + +def score_meeting(meeting_dir: Path, seed: int) -> dict: + ground = json.loads((meeting_dir / "ground.json").read_text()) + embeddings = json.loads((meeting_dir / "embeddings.json").read_text()) + + speaker_to_person = { + speaker: person + for speaker, person in ground["speakers"].items() + if person and not person.startswith("__") + } + if not speaker_to_person: + return {"meeting": meeting_dir.name, "predictions": [], "skipped": "no labels"} + + people = sorted(set(speaker_to_person.values())) + enrolled, strangers = split_people(people, seed) + + profiles = [] + for speaker, person in speaker_to_person.items(): + if person not in enrolled: + continue + samples = embeddings["speakers"].get(speaker, []) + if len(samples) < 2: + continue + profiles.append( + { + "id": f"profile_{person}", + "name": person, + 
"centroid": normalized_centroid(samples), + "samples": samples, + } + ) + + predictions = [] + for speaker, person in speaker_to_person.items(): + turn_embeddings = embeddings["speakers"].get(speaker, []) + for turn_embedding in turn_embeddings: + suggestion = recommend(turn_embedding, profiles) + predictions.append( + { + "speaker_id": speaker, + "truth": person, + "enrolled": person in enrolled, + "predicted": suggestion["profile_name"] if suggestion else None, + "confidence": suggestion["confidence"] if suggestion else None, + } + ) + + return { + "meeting": meeting_dir.name, + "enrolled": sorted(enrolled), + "strangers": sorted(strangers), + "predictions": predictions, + } + + +def collect_meetings(root: Path, ground_dir: Path) -> list[Path]: + """Pair every ground/.json with meetings//embeddings.json.""" + pairs = [] + for ground_file in sorted(ground_dir.glob("*.json")): + meeting_dir = root / ground_file.stem + if (meeting_dir / "embeddings.json").exists(): + ground_target = meeting_dir / "ground.json" + if not ground_target.exists() or ground_target.read_text() != ground_file.read_text(): + meeting_dir.mkdir(parents=True, exist_ok=True) + ground_target.write_text(ground_file.read_text()) + pairs.append(meeting_dir) + return pairs + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--meetings", + type=Path, + default=Path(__file__).parent / "meetings", + ) + parser.add_argument( + "--ground", + type=Path, + default=Path(__file__).parent / "ground", + ) + parser.add_argument("--out", type=Path, required=True) + parser.add_argument("--seed", type=int, default=42) + args = parser.parse_args() + + meetings = collect_meetings(args.meetings, args.ground) + if not meetings: + raise SystemExit( + f"No meetings with both ground/*.json and {args.meetings}/*/embeddings.json" + ) + + results = [score_meeting(meeting_dir, args.seed) for meeting_dir in meetings] + for result in results: + print(f"scored {result['meeting']} turns={len(result.get('predictions', []))}") + + args.out.parent.mkdir(parents=True, exist_ok=True) + args.out.write_text(json.dumps({"meetings": results}, indent=2)) + print(f"wrote {args.out}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main())