From 831cee50acf907c8dac1efe2338344075f8074d4 Mon Sep 17 00:00:00 2001 From: ComputelessComputer <63365510+ComputelessComputer@users.noreply.github.com> Date: Fri, 17 Apr 2026 03:06:00 +0900 Subject: [PATCH 1/8] Add speaker ID benchmark harness scaffold README, pinned deps, and gitignore for benchmarks/speaker-id. Harness measures speaker identification against labeled synthetic meetings. --- benchmarks/speaker-id/.gitignore | 5 ++ benchmarks/speaker-id/README.md | 68 ++++++++++++++++++++++++++ benchmarks/speaker-id/requirements.txt | 4 ++ 3 files changed, 77 insertions(+) create mode 100644 benchmarks/speaker-id/.gitignore create mode 100644 benchmarks/speaker-id/README.md create mode 100644 benchmarks/speaker-id/requirements.txt diff --git a/benchmarks/speaker-id/.gitignore b/benchmarks/speaker-id/.gitignore new file mode 100644 index 0000000..6c350c5 --- /dev/null +++ b/benchmarks/speaker-id/.gitignore @@ -0,0 +1,5 @@ +fixtures/ +results/ +.venv/ +__pycache__/ +*.pyc diff --git a/benchmarks/speaker-id/README.md b/benchmarks/speaker-id/README.md new file mode 100644 index 0000000..f57a4d8 --- /dev/null +++ b/benchmarks/speaker-id/README.md @@ -0,0 +1,68 @@ +# Speaker ID Benchmark Harness + +A reproducible harness for measuring speaker identification quality in unsigned Char. + +## Why + +Speaker ID in `src/store.ts` uses hand-tuned thresholds with no way to measure impact of changes. This harness builds labeled "meetings" from public speaker-verification corpora and scores unsigned Char's matching logic against ground truth — so you can tell whether a tweak actually helps. 
+ +## Pipeline + +``` +labeled clips (VoxCeleb / LibriSpeech / your own) + ↓ stitch.py +synthetic meetings with ground-truth turn labels + ↓ score.py (runs TS scoring via bun) +predictions vs ground truth + ↓ report.py +accuracy, precision/recall, unknown-speaker rejection, calibration +``` + +## Metrics + +Four numbers we track together — optimizing only the first is how you ship regressions: + +- **Identification accuracy** — % of enrolled speakers correctly identified +- **Unknown rejection rate** — % of unseen speakers correctly marked as "no match" +- **False-accept rate** — unseen speakers confidently labeled as someone else +- **Calibration error** — gap between claimed confidence and actual accuracy + +## Tiers + +| Tier | Source | Purpose | Runtime | +|------|--------|---------|---------| +| unit | Synthetic (VCTK/LibriSpeech) | Fast iteration | seconds | +| integration | AMI Meeting Corpus | Real meeting dynamics | minutes | +| reality | Dogfood meetings | Actual Char conditions | minutes | + +## Usage + +``` +cd benchmarks/speaker-id +python3 -m venv .venv && source .venv/bin/activate +pip install -r requirements.txt + +python stitch.py --corpus vctk --out fixtures/unit --num-meetings 50 +python score.py --fixtures fixtures/unit --out results/unit.json +python report.py results/unit.json +``` + +Baseline numbers go in `baseline.json`. Any PR that changes speaker ID logic must report deltas against baseline. + +## Directory layout + +``` +benchmarks/speaker-id/ +├── README.md # this file +├── requirements.txt # pinned deps +├── stitch.py # build synthetic meetings from labeled clips +├── score.py # run matching logic against fixtures +├── report.py # print metrics table +├── fixtures/ # generated meetings (gitignored) +├── results/ # metric runs (gitignored) +└── baseline.json # reference numbers, committed +``` + +## Notes + +Fixtures and results are gitignored — they're large and reproducible from the seed. 
`baseline.json` is the only tracked output and represents current main's performance. diff --git a/benchmarks/speaker-id/requirements.txt b/benchmarks/speaker-id/requirements.txt new file mode 100644 index 0000000..d5697ff --- /dev/null +++ b/benchmarks/speaker-id/requirements.txt @@ -0,0 +1,4 @@ +# Minimal deps — stdlib does most of the work +soundfile==0.12.1 +numpy==1.26.4 +pyroomacoustics==0.7.4 From ccfbc8227f29a5703c75aa158e81bf0fb9828331 Mon Sep 17 00:00:00 2001 From: ComputelessComputer <63365510+ComputelessComputer@users.noreply.github.com> Date: Fri, 17 Apr 2026 03:06:00 +0900 Subject: [PATCH 2/8] Add meeting stitcher for speaker ID benchmark stitch.py builds synthetic meetings from a speaker-keyed corpus (VCTK, LibriSpeech, VoxCeleb). Emits audio.wav and ground.json per meeting. --- benchmarks/speaker-id/stitch.py | 173 ++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100755 benchmarks/speaker-id/stitch.py diff --git a/benchmarks/speaker-id/stitch.py b/benchmarks/speaker-id/stitch.py new file mode 100755 index 0000000..fcab6fb --- /dev/null +++ b/benchmarks/speaker-id/stitch.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python3 +"""Stitch labeled speaker clips into synthetic meetings with ground truth. + +Input: a corpus directory where each subdirectory is a speaker id + containing that speaker's audio clips. + + corpus/ + speaker_001/ + clip_a.wav + clip_b.wav + speaker_002/ + ... + +Output: a fixtures directory with one meeting per subdir — audio.wav plus + ground.json listing the ordered turns (speaker, start, end). 
+""" + +import argparse +import json +import random +from dataclasses import asdict, dataclass +from pathlib import Path + +import numpy as np +import soundfile as sf + + +TARGET_SR = 16000 +MIN_TURN_S = 2.0 +MAX_TURN_S = 8.0 +PAUSE_MIN_S = 0.2 +PAUSE_MAX_S = 1.2 + + +@dataclass +class Turn: + speaker: str + start_seconds: float + end_seconds: float + source_clip: str + + +def load_clip(path: Path) -> np.ndarray: + audio, sr = sf.read(path, dtype="float32", always_2d=False) + if audio.ndim > 1: + audio = audio.mean(axis=1) + if sr != TARGET_SR: + ratio = TARGET_SR / sr + new_len = int(round(len(audio) * ratio)) + audio = np.interp( + np.linspace(0, len(audio) - 1, new_len), + np.arange(len(audio)), + audio, + ).astype(np.float32) + return audio + + +def trim_to_duration(audio: np.ndarray, rng: random.Random) -> np.ndarray: + target_s = rng.uniform(MIN_TURN_S, MAX_TURN_S) + target_samples = int(target_s * TARGET_SR) + if len(audio) <= target_samples: + return audio + start = rng.randint(0, len(audio) - target_samples) + return audio[start : start + target_samples] + + +def load_corpus(root: Path, min_clips: int) -> dict[str, list[Path]]: + speakers = {} + for speaker_dir in sorted(root.iterdir()): + if not speaker_dir.is_dir(): + continue + clips = sorted( + p for p in speaker_dir.rglob("*") if p.suffix.lower() in {".wav", ".flac", ".mp3"} + ) + if len(clips) >= min_clips: + speakers[speaker_dir.name] = clips + if not speakers: + raise SystemExit(f"No speakers with >={min_clips} clips found in {root}") + return speakers + + +def build_meeting( + meeting_dir: Path, + speakers: dict[str, list[Path]], + speaker_ids: list[str], + rng: random.Random, + turns_per_meeting: int, +) -> None: + meeting_dir.mkdir(parents=True, exist_ok=True) + audio_chunks: list[np.ndarray] = [] + turns: list[Turn] = [] + cursor_s = 0.0 + last_speaker: str | None = None + + for _ in range(turns_per_meeting): + # avoid same speaker twice in a row when more than one is in the meeting + 
candidates = [s for s in speaker_ids if s != last_speaker] or speaker_ids + speaker = rng.choice(candidates) + clip_path = rng.choice(speakers[speaker]) + clip = trim_to_duration(load_clip(clip_path), rng) + if len(clip) == 0: + continue + + duration_s = len(clip) / TARGET_SR + turns.append( + Turn( + speaker=speaker, + start_seconds=round(cursor_s, 3), + end_seconds=round(cursor_s + duration_s, 3), + source_clip=str(clip_path), + ) + ) + audio_chunks.append(clip) + cursor_s += duration_s + + pause_s = rng.uniform(PAUSE_MIN_S, PAUSE_MAX_S) + audio_chunks.append(np.zeros(int(pause_s * TARGET_SR), dtype=np.float32)) + cursor_s += pause_s + last_speaker = speaker + + audio = np.concatenate(audio_chunks) + sf.write(meeting_dir / "audio.wav", audio, TARGET_SR, subtype="PCM_16") + + ground = { + "sample_rate": TARGET_SR, + "duration_seconds": round(cursor_s, 3), + "speakers": sorted(set(speaker_ids)), + "turns": [asdict(turn) for turn in turns], + } + (meeting_dir / "ground.json").write_text(json.dumps(ground, indent=2)) + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--corpus", type=Path, required=True, help="Directory of speaker subdirs") + parser.add_argument("--out", type=Path, required=True, help="Output fixtures directory") + parser.add_argument("--num-meetings", type=int, default=50) + parser.add_argument("--speakers-per-meeting", type=int, default=3) + parser.add_argument("--turns-per-meeting", type=int, default=20) + parser.add_argument("--min-clips-per-speaker", type=int, default=8) + parser.add_argument("--seed", type=int, default=42) + args = parser.parse_args() + + rng = random.Random(args.seed) + speakers = load_corpus(args.corpus, args.min_clips_per_speaker) + speaker_pool = list(speakers.keys()) + if len(speaker_pool) < args.speakers_per_meeting: + raise SystemExit( + f"Need >={args.speakers_per_meeting} speakers, corpus has {len(speaker_pool)}" + ) + + args.out.mkdir(parents=True, exist_ok=True) + + 
for index in range(args.num_meetings): + chosen = rng.sample(speaker_pool, args.speakers_per_meeting) + meeting_dir = args.out / f"meeting_{index:04d}" + build_meeting(meeting_dir, speakers, chosen, rng, args.turns_per_meeting) + print(f"built {meeting_dir.name}: speakers={chosen}") + + manifest = { + "seed": args.seed, + "num_meetings": args.num_meetings, + "speakers_per_meeting": args.speakers_per_meeting, + "turns_per_meeting": args.turns_per_meeting, + "corpus": str(args.corpus), + } + (args.out / "manifest.json").write_text(json.dumps(manifest, indent=2)) + print(f"wrote {args.num_meetings} meetings to {args.out}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From 57b7c40ae9bda2c7cb22d7c78f1e3dda43ff7a2c Mon Sep 17 00:00:00 2001 From: ComputelessComputer <63365510+ComputelessComputer@users.noreply.github.com> Date: Fri, 17 Apr 2026 03:06:00 +0900 Subject: [PATCH 3/8] Add score runner mirroring store.ts matching logic score.py re-implements scoreSpeakerProfile and recommendSpeakerProfile in Python so the harness can iterate on thresholds without rebuilding the Tauri app. Splits speakers into enrolled and stranger cohorts to measure unknown rejection. --- benchmarks/speaker-id/score.py | 183 +++++++++++++++++++++++++++++++++ 1 file changed, 183 insertions(+) create mode 100755 benchmarks/speaker-id/score.py diff --git a/benchmarks/speaker-id/score.py b/benchmarks/speaker-id/score.py new file mode 100755 index 0000000..c47fbe4 --- /dev/null +++ b/benchmarks/speaker-id/score.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +"""Score unsigned Char's speaker matching against stitched fixtures. + +For each fixture meeting: + 1. Pick half the speakers as "enrolled" — their clips become stored profiles + 2. The other half are "strangers" — the matcher should reject them + 3. Run diarization + embedding on the meeting audio + 4. Feed each turn's embedding to scoreSpeakerProfile via the TS bridge + 5. 
Record the top suggestion (or "no match") per turn + +Emits results/.json with per-turn predictions and ground-truth labels. + +This script assumes embeddings are produced by the same Swift pipeline used +in-app. For the harness we accept a pre-computed embeddings file per meeting +(embeddings.json) so we can iterate on scoring logic without rebuilding the +Swift layer on every run. See `extract_embeddings.py` to generate those. +""" + +import argparse +import json +from pathlib import Path + + +def cosine(a: list[float], b: list[float]) -> float: + if not a or len(a) != len(b): + return 0.0 + dot = sum(x * y for x, y in zip(a, b)) + na = sum(x * x for x in a) ** 0.5 + nb = sum(x * x for x in b) ** 0.5 + return dot / (na * nb) if na and nb else 0.0 + + +def normalized_centroid(embeddings: list[list[float]]) -> list[float]: + if not embeddings or not embeddings[0]: + return [] + dim = len(embeddings[0]) + acc = [0.0] * dim + for emb in embeddings: + if len(emb) == dim: + for i, v in enumerate(emb): + acc[i] += v + norm = sum(v * v for v in acc) ** 0.5 + return [v / norm for v in acc] if norm else acc + + +def score_profile(embedding: list[float], profile: dict) -> dict: + """Mirror of store.ts scoreSpeakerProfile.""" + centroid_score = cosine(embedding, profile["centroid"]) + sample_scores = sorted( + (cosine(embedding, s) for s in profile["samples"]), + reverse=True, + ) + best_sample_score = sample_scores[0] if sample_scores else 0.0 + return { + "profile_id": profile["id"], + "profile_name": profile["name"], + "score": max(best_sample_score, centroid_score), + "centroid_score": centroid_score, + "best_sample_score": best_sample_score, + "sample_count": len(profile["samples"]), + } + + +def recommend(embedding: list[float], profiles: list[dict]) -> dict | None: + """Mirror of store.ts recommendSpeakerProfile.""" + ranked = sorted( + (score_profile(embedding, p) for p in profiles), + key=lambda r: r["score"], + reverse=True, + ) + ranked = [r for r in ranked if 
r["score"] > 0] + if not ranked: + return None + best = ranked[0] + alternate_score = ranked[1]["score"] if len(ranked) > 1 else 0.0 + + threshold = 0.7 if best["sample_count"] >= 3 else 0.74 + if best["score"] < threshold: + return None + if best["score"] < 0.82 and best["score"] - alternate_score < 0.04: + return None + + return { + "profile_id": best["profile_id"], + "profile_name": best["profile_name"], + "confidence": best["score"], + "alternate_confidence": alternate_score, + } + + +def split_speakers(speakers: list[str], rng_seed: int) -> tuple[list[str], list[str]]: + import random + + rng = random.Random(rng_seed) + shuffled = sorted(speakers) + rng.shuffle(shuffled) + half = len(shuffled) // 2 + return shuffled[:half], shuffled[half:] + + +def load_embeddings(meeting_dir: Path) -> dict: + path = meeting_dir / "embeddings.json" + if not path.exists(): + raise SystemExit( + f"Missing {path}. Run extract_embeddings.py first (or wire up the Swift bridge)." + ) + return json.loads(path.read_text()) + + +def build_profiles( + speakers_to_enroll: list[str], + enrollment_embeddings: dict[str, list[list[float]]], +) -> list[dict]: + profiles = [] + for speaker in speakers_to_enroll: + samples = enrollment_embeddings.get(speaker, []) + if len(samples) < 2: + continue + profiles.append( + { + "id": f"profile_{speaker}", + "name": speaker, + "centroid": normalized_centroid(samples), + "samples": samples, + } + ) + return profiles + + +def score_meeting(meeting_dir: Path, seed: int) -> dict: + ground = json.loads((meeting_dir / "ground.json").read_text()) + data = load_embeddings(meeting_dir) + enrollment = data["enrollment"] + per_turn = data["turns"] + + enrolled, strangers = split_speakers(ground["speakers"], seed) + profiles = build_profiles(enrolled, enrollment) + + predictions = [] + for turn, turn_embedding in zip(ground["turns"], per_turn): + suggestion = recommend(turn_embedding, profiles) + is_enrolled = turn["speaker"] in enrolled + predictions.append( + { 
+ "speaker_truth": turn["speaker"], + "enrolled": is_enrolled, + "predicted": suggestion["profile_name"] if suggestion else None, + "confidence": suggestion["confidence"] if suggestion else None, + "start": turn["start_seconds"], + "end": turn["end_seconds"], + } + ) + + return { + "meeting": meeting_dir.name, + "enrolled_speakers": enrolled, + "stranger_speakers": strangers, + "predictions": predictions, + } + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--fixtures", type=Path, required=True) + parser.add_argument("--out", type=Path, required=True) + parser.add_argument("--seed", type=int, default=42) + args = parser.parse_args() + + meetings = [] + for meeting_dir in sorted(args.fixtures.iterdir()): + if not (meeting_dir / "ground.json").exists(): + continue + meetings.append(score_meeting(meeting_dir, args.seed)) + print(f"scored {meeting_dir.name}") + + args.out.parent.mkdir(parents=True, exist_ok=True) + args.out.write_text(json.dumps({"meetings": meetings}, indent=2)) + print(f"wrote {args.out}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From 015123a43106e77c349031c7d75442c0674a1813 Mon Sep 17 00:00:00 2001 From: ComputelessComputer <63365510+ComputelessComputer@users.noreply.github.com> Date: Fri, 17 Apr 2026 03:06:00 +0900 Subject: [PATCH 4/8] Add metrics report with baseline diff report.py prints accuracy, unknown rejection, false accept, and calibration error. Diffs against baseline.json when provided. --- benchmarks/speaker-id/report.py | 91 +++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100755 benchmarks/speaker-id/report.py diff --git a/benchmarks/speaker-id/report.py b/benchmarks/speaker-id/report.py new file mode 100755 index 0000000..815e118 --- /dev/null +++ b/benchmarks/speaker-id/report.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 +"""Print metrics from a score.py results file. 
+ +Emits four numbers that must move together — optimizing any one alone is +how regressions ship: + + accuracy enrolled speakers correctly identified + unknown_rejection strangers correctly marked "no match" + false_accept strangers confidently labeled as someone else + calibration_error mean gap between claimed confidence and actual correctness +""" + +import argparse +import json +from pathlib import Path +from statistics import mean + + +def metrics(results: dict) -> dict: + turns = [t for m in results["meetings"] for t in m["predictions"]] + enrolled = [t for t in turns if t["enrolled"]] + strangers = [t for t in turns if not t["enrolled"]] + + if enrolled: + correct = sum(1 for t in enrolled if t["predicted"] == t["speaker_truth"]) + missed = sum(1 for t in enrolled if t["predicted"] is None) + wrong = sum(1 for t in enrolled if t["predicted"] and t["predicted"] != t["speaker_truth"]) + accuracy = correct / len(enrolled) + miss_rate = missed / len(enrolled) + confusion_rate = wrong / len(enrolled) + else: + accuracy = miss_rate = confusion_rate = 0.0 + + if strangers: + rejected = sum(1 for t in strangers if t["predicted"] is None) + unknown_rejection = rejected / len(strangers) + false_accept = 1 - unknown_rejection + else: + unknown_rejection = false_accept = 0.0 + + confidences = [t for t in enrolled if t["confidence"] is not None] + if confidences: + bins = [[] for _ in range(10)] + for t in confidences: + idx = min(9, int(t["confidence"] * 10)) + bins[idx].append(1 if t["predicted"] == t["speaker_truth"] else 0) + per_bin = [ + (i / 10 + 0.05, mean(b)) for i, b in enumerate(bins) if b + ] + calibration_error = mean(abs(claimed - actual) for claimed, actual in per_bin) + else: + calibration_error = 0.0 + + return { + "n_enrolled_turns": len(enrolled), + "n_stranger_turns": len(strangers), + "accuracy": round(accuracy, 4), + "miss_rate": round(miss_rate, 4), + "confusion_rate": round(confusion_rate, 4), + "unknown_rejection": round(unknown_rejection, 4), + 
"false_accept": round(false_accept, 4), + "calibration_error": round(calibration_error, 4), + } + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("results", type=Path) + parser.add_argument("--baseline", type=Path, help="baseline.json to diff against") + args = parser.parse_args() + + current = metrics(json.loads(args.results.read_text())) + print(f"{'metric':20} {'value':>10}", end="") + if args.baseline and args.baseline.exists(): + baseline = json.loads(args.baseline.read_text()) + print(f" {'baseline':>10} {'delta':>10}") + for key, value in current.items(): + base = baseline.get(key, 0) + delta = value - base if isinstance(value, (int, float)) else "" + delta_str = f"{delta:+.4f}" if isinstance(delta, float) else str(delta) + print(f"{key:20} {value:>10} {base:>10} {delta_str:>10}") + else: + print() + for key, value in current.items(): + print(f"{key:20} {value:>10}") + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From 93d488322c72af54855a5c823b9af901c1f99ec9 Mon Sep 17 00:00:00 2001 From: ComputelessComputer <63365510+ComputelessComputer@users.noreply.github.com> Date: Fri, 17 Apr 2026 03:12:48 +0900 Subject: [PATCH 5/8] Drop synthetic stitcher from speaker ID benchmark MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stitching LibriSpeech into fake meetings optimizes the wrong thing. Replacing with a labeler that runs on actual Char meetings — the conditions we ship in. 
--- benchmarks/speaker-id/requirements.txt | 4 - benchmarks/speaker-id/stitch.py | 173 ------------------------- 2 files changed, 177 deletions(-) delete mode 100644 benchmarks/speaker-id/requirements.txt delete mode 100755 benchmarks/speaker-id/stitch.py diff --git a/benchmarks/speaker-id/requirements.txt b/benchmarks/speaker-id/requirements.txt deleted file mode 100644 index d5697ff..0000000 --- a/benchmarks/speaker-id/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -# Minimal deps — stdlib does most of the work -soundfile==0.12.1 -numpy==1.26.4 -pyroomacoustics==0.7.4 diff --git a/benchmarks/speaker-id/stitch.py b/benchmarks/speaker-id/stitch.py deleted file mode 100755 index fcab6fb..0000000 --- a/benchmarks/speaker-id/stitch.py +++ /dev/null @@ -1,173 +0,0 @@ -#!/usr/bin/env python3 -"""Stitch labeled speaker clips into synthetic meetings with ground truth. - -Input: a corpus directory where each subdirectory is a speaker id - containing that speaker's audio clips. - - corpus/ - speaker_001/ - clip_a.wav - clip_b.wav - speaker_002/ - ... - -Output: a fixtures directory with one meeting per subdir — audio.wav plus - ground.json listing the ordered turns (speaker, start, end). 
-""" - -import argparse -import json -import random -from dataclasses import asdict, dataclass -from pathlib import Path - -import numpy as np -import soundfile as sf - - -TARGET_SR = 16000 -MIN_TURN_S = 2.0 -MAX_TURN_S = 8.0 -PAUSE_MIN_S = 0.2 -PAUSE_MAX_S = 1.2 - - -@dataclass -class Turn: - speaker: str - start_seconds: float - end_seconds: float - source_clip: str - - -def load_clip(path: Path) -> np.ndarray: - audio, sr = sf.read(path, dtype="float32", always_2d=False) - if audio.ndim > 1: - audio = audio.mean(axis=1) - if sr != TARGET_SR: - ratio = TARGET_SR / sr - new_len = int(round(len(audio) * ratio)) - audio = np.interp( - np.linspace(0, len(audio) - 1, new_len), - np.arange(len(audio)), - audio, - ).astype(np.float32) - return audio - - -def trim_to_duration(audio: np.ndarray, rng: random.Random) -> np.ndarray: - target_s = rng.uniform(MIN_TURN_S, MAX_TURN_S) - target_samples = int(target_s * TARGET_SR) - if len(audio) <= target_samples: - return audio - start = rng.randint(0, len(audio) - target_samples) - return audio[start : start + target_samples] - - -def load_corpus(root: Path, min_clips: int) -> dict[str, list[Path]]: - speakers = {} - for speaker_dir in sorted(root.iterdir()): - if not speaker_dir.is_dir(): - continue - clips = sorted( - p for p in speaker_dir.rglob("*") if p.suffix.lower() in {".wav", ".flac", ".mp3"} - ) - if len(clips) >= min_clips: - speakers[speaker_dir.name] = clips - if not speakers: - raise SystemExit(f"No speakers with >={min_clips} clips found in {root}") - return speakers - - -def build_meeting( - meeting_dir: Path, - speakers: dict[str, list[Path]], - speaker_ids: list[str], - rng: random.Random, - turns_per_meeting: int, -) -> None: - meeting_dir.mkdir(parents=True, exist_ok=True) - audio_chunks: list[np.ndarray] = [] - turns: list[Turn] = [] - cursor_s = 0.0 - last_speaker: str | None = None - - for _ in range(turns_per_meeting): - # avoid same speaker twice in a row when more than one is in the meeting - 
candidates = [s for s in speaker_ids if s != last_speaker] or speaker_ids - speaker = rng.choice(candidates) - clip_path = rng.choice(speakers[speaker]) - clip = trim_to_duration(load_clip(clip_path), rng) - if len(clip) == 0: - continue - - duration_s = len(clip) / TARGET_SR - turns.append( - Turn( - speaker=speaker, - start_seconds=round(cursor_s, 3), - end_seconds=round(cursor_s + duration_s, 3), - source_clip=str(clip_path), - ) - ) - audio_chunks.append(clip) - cursor_s += duration_s - - pause_s = rng.uniform(PAUSE_MIN_S, PAUSE_MAX_S) - audio_chunks.append(np.zeros(int(pause_s * TARGET_SR), dtype=np.float32)) - cursor_s += pause_s - last_speaker = speaker - - audio = np.concatenate(audio_chunks) - sf.write(meeting_dir / "audio.wav", audio, TARGET_SR, subtype="PCM_16") - - ground = { - "sample_rate": TARGET_SR, - "duration_seconds": round(cursor_s, 3), - "speakers": sorted(set(speaker_ids)), - "turns": [asdict(turn) for turn in turns], - } - (meeting_dir / "ground.json").write_text(json.dumps(ground, indent=2)) - - -def main() -> int: - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--corpus", type=Path, required=True, help="Directory of speaker subdirs") - parser.add_argument("--out", type=Path, required=True, help="Output fixtures directory") - parser.add_argument("--num-meetings", type=int, default=50) - parser.add_argument("--speakers-per-meeting", type=int, default=3) - parser.add_argument("--turns-per-meeting", type=int, default=20) - parser.add_argument("--min-clips-per-speaker", type=int, default=8) - parser.add_argument("--seed", type=int, default=42) - args = parser.parse_args() - - rng = random.Random(args.seed) - speakers = load_corpus(args.corpus, args.min_clips_per_speaker) - speaker_pool = list(speakers.keys()) - if len(speaker_pool) < args.speakers_per_meeting: - raise SystemExit( - f"Need >={args.speakers_per_meeting} speakers, corpus has {len(speaker_pool)}" - ) - - args.out.mkdir(parents=True, exist_ok=True) - - 
for index in range(args.num_meetings): - chosen = rng.sample(speaker_pool, args.speakers_per_meeting) - meeting_dir = args.out / f"meeting_{index:04d}" - build_meeting(meeting_dir, speakers, chosen, rng, args.turns_per_meeting) - print(f"built {meeting_dir.name}: speakers={chosen}") - - manifest = { - "seed": args.seed, - "num_meetings": args.num_meetings, - "speakers_per_meeting": args.speakers_per_meeting, - "turns_per_meeting": args.turns_per_meeting, - "corpus": str(args.corpus), - } - (args.out / "manifest.json").write_text(json.dumps(manifest, indent=2)) - print(f"wrote {args.num_meetings} meetings to {args.out}") - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) From 30935ad0bf077ce7146fedf1e8280cb7b777dd42 Mon Sep 17 00:00:00 2001 From: ComputelessComputer <63365510+ComputelessComputer@users.noreply.github.com> Date: Fri, 17 Apr 2026 03:16:26 +0900 Subject: [PATCH 6/8] Add speaker labeler for real Char meetings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit label.py plays a random long turn per speaker and asks who they are. One pass per meeting, not per turn — a 3-speaker meeting takes a minute. macOS-first: ffmpeg to slice, afplay to preview. --- benchmarks/speaker-id/label.py | 235 +++++++++++++++++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100644 benchmarks/speaker-id/label.py diff --git a/benchmarks/speaker-id/label.py b/benchmarks/speaker-id/label.py new file mode 100644 index 0000000..2a7d015 --- /dev/null +++ b/benchmarks/speaker-id/label.py @@ -0,0 +1,235 @@ +#!/usr/bin/env python3 +"""Label speakers in an unsigned Char meeting. + +Reads a meeting markdown export, plays a sample turn for each speaker, and +asks who they are. Saves ground-truth labels to ground/<meeting_id>.json. + +Label by SPEAKER, not by turn. For a 30-minute meeting with 3 speakers, you +answer 3 questions — not 80. 
+ +Works against meetings whose speakers are still unlabeled in the app +(Speaker 1, Speaker 2, ...). If you've already typed human names into the +app, export a fresh meeting before labeling — otherwise the matcher's +guesses leak into ground truth. + +Usage: + python label.py ~/Documents/unsigned\\ char/meeting-abc123.md + +Keys: + <name> assign this speaker + ? replay a different turn from this speaker + s skip (not labeled — excluded from scoring) + q save and quit +""" + +import argparse +import json +import random +import re +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path + + +PREVIEW_SECONDS = 5.0 +SPEAKER_TURN_PATTERN = re.compile( + r"^\s*-\s+(?P<speaker>.+?)\s*:\s+" + r"(?P<start>\d{1,2}:\d{2})-(?P<end>\d{1,2}:\d{2})\s*$", + re.MULTILINE, +) + + +@dataclass +class Turn: + speaker: str + start_seconds: float + end_seconds: float + + +def parse_clock(value: str) -> float: + minutes, seconds = value.split(":") + return int(minutes) * 60 + int(seconds) + + +def parse_meeting(markdown_path: Path) -> tuple[dict, list[Turn], Path]: + text = markdown_path.read_text() + + frontmatter = {} + if text.startswith("---"): + end = text.find("\n---", 3) + if end > 0: + for line in text[3:end].splitlines(): + if ":" in line: + key, value = line.split(":", 1) + frontmatter[key.strip()] = value.strip() + text = text[end + 4 :] + + audio_path_raw = frontmatter.get("audio_path", "").strip() + if not audio_path_raw: + raise SystemExit(f"No audio_path in frontmatter of {markdown_path}") + audio_path = Path(audio_path_raw).expanduser() + if not audio_path.is_absolute(): + audio_path = (markdown_path.parent / audio_path).resolve() + if not audio_path.exists(): + raise SystemExit(f"Audio file not found: {audio_path}") + + turns_section = text.split("## Speaker Turns", 1) + if len(turns_section) < 2: + raise SystemExit(f"No Speaker Turns section in {markdown_path}") + + turns = [] + for match in SPEAKER_TURN_PATTERN.finditer(turns_section[1]): + speaker = 
match.group("speaker").strip() + if speaker.lower() == "pipeline": + continue + turns.append( + Turn( + speaker=speaker, + start_seconds=parse_clock(match.group("start")), + end_seconds=parse_clock(match.group("end")), + ) + ) + + if not turns: + raise SystemExit(f"No diarization turns parsed from {markdown_path}") + + return frontmatter, turns, audio_path + + +def group_by_speaker(turns: list[Turn]) -> dict[str, list[Turn]]: + groups: dict[str, list[Turn]] = {} + for turn in turns: + groups.setdefault(turn.speaker, []).append(turn) + for turns_list in groups.values(): + turns_list.sort(key=lambda t: t.end_seconds - t.start_seconds, reverse=True) + return groups + + +def play_clip(audio_path: Path, start: float, duration: float) -> None: + if sys.platform != "darwin": + print(f" (auto-play unsupported on {sys.platform}: seek to {start:.1f}s)") + return + tmp = Path("/tmp") / f"uchar_label_{start:.0f}.wav" + try: + subprocess.run( + [ + "ffmpeg", + "-y", + "-loglevel", + "error", + "-ss", + str(start), + "-t", + str(duration), + "-i", + str(audio_path), + str(tmp), + ], + check=True, + ) + except FileNotFoundError: + print(" ffmpeg not installed. 
`brew install ffmpeg` to enable preview.") + return + subprocess.run(["afplay", str(tmp)], check=False) + tmp.unlink(missing_ok=True) + + +def pick_preview_turn(turns: list[Turn], rng: random.Random) -> Turn: + long_enough = [t for t in turns if t.end_seconds - t.start_seconds >= 2.0] + pool = long_enough or turns + return rng.choice(pool[: max(5, len(pool) // 2)]) + + +def load_existing(path: Path) -> dict: + if not path.exists(): + return {"speakers": {}} + data = json.loads(path.read_text()) + data.setdefault("speakers", {}) + return data + + +def save_labels(path: Path, payload: dict) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, indent=2)) + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("meeting", type=Path, help="Path to meeting-*.md export") + parser.add_argument( + "--out", + type=Path, + default=Path(__file__).parent / "ground", + help="Output directory for ground-truth labels", + ) + parser.add_argument( + "--preview-seconds", + type=float, + default=PREVIEW_SECONDS, + ) + parser.add_argument("--seed", type=int, default=42) + args = parser.parse_args() + + frontmatter, turns, audio_path = parse_meeting(args.meeting) + meeting_id = frontmatter.get("id", args.meeting.stem) + ground_path = args.out / f"{meeting_id}.json" + existing = load_existing(ground_path) + rng = random.Random(args.seed) + + groups = group_by_speaker(turns) + print(f"meeting: {meeting_id}") + print(f"audio: {audio_path}") + print(f"speakers: {len(groups)} turns: {len(turns)}") + print() + + for speaker_id, speaker_turns in groups.items(): + if speaker_id in existing["speakers"]: + print(f"{speaker_id}: already labeled as '{existing['speakers'][speaker_id]}' — skipping") + continue + + total = sum(t.end_seconds - t.start_seconds for t in speaker_turns) + print(f"— {speaker_id} — {len(speaker_turns)} turns, {total:.0f}s total") + + while True: + preview = 
pick_preview_turn(speaker_turns, rng) + duration = min(args.preview_seconds, preview.end_seconds - preview.start_seconds) + print(f" sample: {preview.start_seconds:.0f}s–{preview.end_seconds:.0f}s") + play_clip(audio_path, preview.start_seconds, duration) + + try: + response = input(" who is this? > ").strip() + except EOFError: + response = "q" + + if response == "?": + continue + if response == "s": + label = None + break + if response == "q": + save_labels(ground_path, existing) + print(f"saved {ground_path}") + return 0 + if not response: + print(" (type a name, '?' to replay, 's' to skip, 'q' to quit)") + continue + + label = response + break + + if label is not None: + existing["speakers"][speaker_id] = label + save_labels(ground_path, existing) + print(f" → {label}") + else: + print(" → skipped") + + save_labels(ground_path, existing) + labeled = sum(1 for v in existing["speakers"].values() if v) + print(f"\ndone. {labeled}/{len(groups)} speakers labeled → {ground_path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From cd3c22126ddca179a36e482a42e42ec97e234e2c Mon Sep 17 00:00:00 2001 From: ComputelessComputer <63365510+ComputelessComputer@users.noreply.github.com> Date: Fri, 17 Apr 2026 03:16:26 +0900 Subject: [PATCH 7/8] Rework scorer for per-speaker ground truth Consume the per-speaker labels emitted by label.py instead of synthetic per-turn fixtures. Split enrolled vs stranger identities per meeting so unknown-speaker rejection stays in the metric set. --- benchmarks/speaker-id/score.py | 162 ++++++++++++++++++--------------- 1 file changed, 91 insertions(+), 71 deletions(-) diff --git a/benchmarks/speaker-id/score.py b/benchmarks/speaker-id/score.py index c47fbe4..88abcf5 100755 --- a/benchmarks/speaker-id/score.py +++ b/benchmarks/speaker-id/score.py @@ -1,23 +1,29 @@ #!/usr/bin/env python3 -"""Score unsigned Char's speaker matching against stitched fixtures. +"""Score unsigned Char's speaker matching against labeled meetings. 
-For each fixture meeting: - 1. Pick half the speakers as "enrolled" — their clips become stored profiles - 2. The other half are "strangers" — the matcher should reject them - 3. Run diarization + embedding on the meeting audio - 4. Feed each turn's embedding to scoreSpeakerProfile via the TS bridge - 5. Record the top suggestion (or "no match") per turn +Expects ground truth from label.py — a per-meeting JSON with: -Emits results/.json with per-turn predictions and ground-truth labels. + {"speakers": {"Speaker 1": "John", "Speaker 2": "Yujong", ...}} -This script assumes embeddings are produced by the same Swift pipeline used -in-app. For the harness we accept a pre-computed embeddings file per meeting -(embeddings.json) so we can iterate on scoring logic without rebuilding the -Swift layer on every run. See `extract_embeddings.py` to generate those. +And an embeddings.json per meeting produced by the Swift pipeline: + + { + "speakers": { + "Speaker 1": [[...], [...]], // one embedding per turn + "Speaker 2": [[...]] + } + } + +Runs store.ts matching logic in Python. For each meeting, half the *labeled* +human identities become enrolled profiles, the other half are strangers the +matcher should reject. + +Emits results/.json with per-turn predictions. 
""" import argparse import json +import random from pathlib import Path @@ -44,7 +50,6 @@ def normalized_centroid(embeddings: list[list[float]]) -> list[float]: def score_profile(embedding: list[float], profile: dict) -> dict: - """Mirror of store.ts scoreSpeakerProfile.""" centroid_score = cosine(embedding, profile["centroid"]) sample_scores = sorted( (cosine(embedding, s) for s in profile["samples"]), @@ -62,7 +67,6 @@ def score_profile(embedding: list[float], profile: dict) -> dict: def recommend(embedding: list[float], profiles: list[dict]) -> dict | None: - """Mirror of store.ts recommendSpeakerProfile.""" ranked = sorted( (score_profile(embedding, p) for p in profiles), key=lambda r: r["score"], @@ -81,100 +85,116 @@ def recommend(embedding: list[float], profiles: list[dict]) -> dict | None: return None return { - "profile_id": best["profile_id"], "profile_name": best["profile_name"], "confidence": best["score"], "alternate_confidence": alternate_score, } -def split_speakers(speakers: list[str], rng_seed: int) -> tuple[list[str], list[str]]: - import random - - rng = random.Random(rng_seed) - shuffled = sorted(speakers) +def split_people(people: list[str], seed: int) -> tuple[set[str], set[str]]: + rng = random.Random(seed) + shuffled = sorted(people) rng.shuffle(shuffled) - half = len(shuffled) // 2 - return shuffled[:half], shuffled[half:] + half = max(1, len(shuffled) // 2) + return set(shuffled[:half]), set(shuffled[half:]) -def load_embeddings(meeting_dir: Path) -> dict: - path = meeting_dir / "embeddings.json" - if not path.exists(): - raise SystemExit( - f"Missing {path}. Run extract_embeddings.py first (or wire up the Swift bridge)." 
- ) - return json.loads(path.read_text()) +def score_meeting(meeting_dir: Path, seed: int) -> dict: + ground = json.loads((meeting_dir / "ground.json").read_text()) + embeddings = json.loads((meeting_dir / "embeddings.json").read_text()) + + speaker_to_person = { + speaker: person + for speaker, person in ground["speakers"].items() + if person and not person.startswith("__") + } + if not speaker_to_person: + return {"meeting": meeting_dir.name, "predictions": [], "skipped": "no labels"} + people = sorted(set(speaker_to_person.values())) + enrolled, strangers = split_people(people, seed) -def build_profiles( - speakers_to_enroll: list[str], - enrollment_embeddings: dict[str, list[list[float]]], -) -> list[dict]: profiles = [] - for speaker in speakers_to_enroll: - samples = enrollment_embeddings.get(speaker, []) + for speaker, person in speaker_to_person.items(): + if person not in enrolled: + continue + samples = embeddings["speakers"].get(speaker, []) if len(samples) < 2: continue profiles.append( { - "id": f"profile_{speaker}", - "name": speaker, + "id": f"profile_{person}", + "name": person, "centroid": normalized_centroid(samples), "samples": samples, } ) - return profiles - - -def score_meeting(meeting_dir: Path, seed: int) -> dict: - ground = json.loads((meeting_dir / "ground.json").read_text()) - data = load_embeddings(meeting_dir) - enrollment = data["enrollment"] - per_turn = data["turns"] - - enrolled, strangers = split_speakers(ground["speakers"], seed) - profiles = build_profiles(enrolled, enrollment) predictions = [] - for turn, turn_embedding in zip(ground["turns"], per_turn): - suggestion = recommend(turn_embedding, profiles) - is_enrolled = turn["speaker"] in enrolled - predictions.append( - { - "speaker_truth": turn["speaker"], - "enrolled": is_enrolled, - "predicted": suggestion["profile_name"] if suggestion else None, - "confidence": suggestion["confidence"] if suggestion else None, - "start": turn["start_seconds"], - "end": turn["end_seconds"], 
- } - ) + for speaker, person in speaker_to_person.items(): + turn_embeddings = embeddings["speakers"].get(speaker, []) + for turn_embedding in turn_embeddings: + suggestion = recommend(turn_embedding, profiles) + predictions.append( + { + "speaker_id": speaker, + "truth": person, + "enrolled": person in enrolled, + "predicted": suggestion["profile_name"] if suggestion else None, + "confidence": suggestion["confidence"] if suggestion else None, + } + ) return { "meeting": meeting_dir.name, - "enrolled_speakers": enrolled, - "stranger_speakers": strangers, + "enrolled": sorted(enrolled), + "strangers": sorted(strangers), "predictions": predictions, } +def collect_meetings(root: Path, ground_dir: Path) -> list[Path]: + """Pair every ground/.json with meetings//embeddings.json.""" + pairs = [] + for ground_file in sorted(ground_dir.glob("*.json")): + meeting_dir = root / ground_file.stem + if (meeting_dir / "embeddings.json").exists(): + ground_target = meeting_dir / "ground.json" + if not ground_target.exists() or ground_target.read_text() != ground_file.read_text(): + meeting_dir.mkdir(parents=True, exist_ok=True) + ground_target.write_text(ground_file.read_text()) + pairs.append(meeting_dir) + return pairs + + def main() -> int: parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--fixtures", type=Path, required=True) + parser.add_argument( + "--meetings", + type=Path, + default=Path(__file__).parent / "meetings", + ) + parser.add_argument( + "--ground", + type=Path, + default=Path(__file__).parent / "ground", + ) parser.add_argument("--out", type=Path, required=True) parser.add_argument("--seed", type=int, default=42) args = parser.parse_args() - meetings = [] - for meeting_dir in sorted(args.fixtures.iterdir()): - if not (meeting_dir / "ground.json").exists(): - continue - meetings.append(score_meeting(meeting_dir, args.seed)) - print(f"scored {meeting_dir.name}") + meetings = collect_meetings(args.meetings, args.ground) + if not 
meetings: + raise SystemExit( + f"No meetings with both ground/*.json and {args.meetings}/*/embeddings.json" + ) + + results = [score_meeting(meeting_dir, args.seed) for meeting_dir in meetings] + for result in results: + print(f"scored {result['meeting']} turns={len(result.get('predictions', []))}") args.out.parent.mkdir(parents=True, exist_ok=True) - args.out.write_text(json.dumps({"meetings": meetings}, indent=2)) + args.out.write_text(json.dumps({"meetings": results}, indent=2)) print(f"wrote {args.out}") return 0 From de2e60696b54107797ae3a7ddef52d1c004132cf Mon Sep 17 00:00:00 2001 From: ComputelessComputer <63365510+ComputelessComputer@users.noreply.github.com> Date: Fri, 17 Apr 2026 03:16:26 +0900 Subject: [PATCH 8/8] Rewrite benchmark README around dogfood workflow Drops the tiered-corpus plan. Measures speaker ID on the user's own labeled meetings. Gitignore adds ground/ and meetings/ to keep voice samples off GitHub. --- benchmarks/speaker-id/.gitignore | 3 +- benchmarks/speaker-id/README.md | 88 +++++++++++++++++--------------- 2 files changed, 50 insertions(+), 41 deletions(-) diff --git a/benchmarks/speaker-id/.gitignore b/benchmarks/speaker-id/.gitignore index 6c350c5..b9fca71 100644 --- a/benchmarks/speaker-id/.gitignore +++ b/benchmarks/speaker-id/.gitignore @@ -1,4 +1,5 @@ -fixtures/ +ground/ +meetings/ results/ .venv/ __pycache__/ diff --git a/benchmarks/speaker-id/README.md b/benchmarks/speaker-id/README.md index f57a4d8..e513628 100644 --- a/benchmarks/speaker-id/README.md +++ b/benchmarks/speaker-id/README.md @@ -1,68 +1,76 @@ -# Speaker ID Benchmark Harness +# Speaker ID Benchmark — dogfood edition -A reproducible harness for measuring speaker identification quality in unsigned Char. +A harness for measuring speaker identification quality against **your own meetings**. No synthetic corpora, no stitching — just label the meetings you already have and optimize against the conditions Char actually ships in. 
-## Why +## Why this shape -Speaker ID in `src/store.ts` uses hand-tuned thresholds with no way to measure impact of changes. This harness builds labeled "meetings" from public speaker-verification corpora and scores unsigned Char's matching logic against ground truth — so you can tell whether a tweak actually helps. +Speaker ID in `src/store.ts` uses hand-tuned thresholds with no way to measure impact of changes. Academic corpora (VoxCeleb, AMI) measure different conditions than what Char users hit — read speech, studio mics, celebrity interviews. + +Your last 20 Char meetings are the right benchmark. You know who was there. You remember which identifications went wrong. An evening of labeling is faster than a week of synthetic harness plumbing that ends up measuring the wrong thing. ## Pipeline ``` -labeled clips (VoxCeleb / LibriSpeech / your own) - ↓ stitch.py -synthetic meetings with ground-truth turn labels - ↓ score.py (runs TS scoring via bun) -predictions vs ground truth +meeting-*.md (your exported meetings) + ↓ label.py (play turn, type name, once per speaker) +ground/<meeting-id>.json + ↓ extract_embeddings.py (TODO — Swift bridge) +meetings/<meeting-id>/embeddings.json + ↓ score.py +results/<run>.json ↓ report.py -accuracy, precision/recall, unknown-speaker rejection, calibration +accuracy · unknown_rejection · false_accept · calibration_error ``` ## Metrics -Four numbers we track together — optimizing only the first is how you ship regressions: - -- **Identification accuracy** — % of enrolled speakers correctly identified -- **Unknown rejection rate** — % of unseen speakers correctly marked as "no match" -- **False-accept rate** — unseen speakers confidently labeled as someone else -- **Calibration error** — gap between claimed confidence and actual accuracy +Four numbers tracked together. 
Optimizing only the first is how you ship regressions: -## Tiers +- **Identification accuracy** — enrolled speakers correctly identified +- **Unknown rejection rate** — unseen speakers correctly marked "no match" +- **False-accept rate** — unseen speakers confidently mislabeled as someone else +- **Calibration error** — gap between claimed confidence and actual correctness -| Tier | Source | Purpose | Runtime | -|------|--------|---------|---------| -| unit | Synthetic (VCTK/LibriSpeech) | Fast iteration | seconds | -| integration | AMI Meeting Corpus | Real meeting dynamics | minutes | -| reality | Dogfood meetings | Actual Char conditions | minutes | +For each meeting, half the labeled humans become "enrolled profiles" and the other half are "strangers" the matcher should reject. This keeps the 0.04 margin gate honest. -## Usage +## Labeling ``` cd benchmarks/speaker-id -python3 -m venv .venv && source .venv/bin/activate -pip install -r requirements.txt - -python stitch.py --corpus vctk --out fixtures/unit --num-meetings 50 -python score.py --fixtures fixtures/unit --out results/unit.json -python report.py results/unit.json +brew install ffmpeg # for auto-preview on macOS +python3 label.py ~/Documents/unsigned\ char/meeting-abc123.md ``` -Baseline numbers go in `baseline.json`. Any PR that changes speaker ID logic must report deltas against baseline. +The labeler picks one random long turn per speaker, plays 5 seconds, asks who they are. A 3-speaker meeting takes about a minute. Aim for 20 meetings — that gives you meaningful numbers without losing your evening. + +**Important:** only label meetings whose speakers are still unlabeled in the app (`Speaker 1`, `Speaker 2` in the export). If you've already assigned human names inside Char, the matcher's guesses will leak into the markdown and contaminate ground truth. Either export a fresh meeting before labeling, or switch to the upcoming `uchar meetings export --raw` command once it lands. 
+ +## Scoring + +``` +python3 score.py --out results/latest.json +python3 report.py results/latest.json --baseline baseline.json +``` ## Directory layout ``` benchmarks/speaker-id/ -├── README.md # this file -├── requirements.txt # pinned deps -├── stitch.py # build synthetic meetings from labeled clips -├── score.py # run matching logic against fixtures -├── report.py # print metrics table -├── fixtures/ # generated meetings (gitignored) -├── results/ # metric runs (gitignored) -└── baseline.json # reference numbers, committed +├── README.md # this file +├── label.py # interactive labeler +├── score.py # runs store.ts matching logic against embeddings + ground +├── report.py # prints metrics table with baseline delta +├── ground/ # gitignored — your labels, your voices +├── meetings/ # gitignored — per-meeting embeddings.json +├── results/ # gitignored — metric runs +└── baseline.json # committed — main-branch numbers, no audio in it ``` -## Notes +## What's missing + +- **`extract_embeddings.py`** — the Swift speaker-embedding extractor needs a CLI entry point. Until that lands, `embeddings.json` doesn't get generated. Tracked in #3. +- **`baseline.json`** — populated once the pipeline runs end-to-end on real meetings. + +## Note on privacy -Fixtures and results are gitignored — they're large and reproducible from the seed. `baseline.json` is the only tracked output and represents current main's performance. +`ground/` and `meetings/` are gitignored on purpose. These directories hold your voice samples. Never commit them. `baseline.json` contains only aggregate numbers and is safe to share.