From 56e12a84b76eb45c42fed5d4b96f6ea202819686 Mon Sep 17 00:00:00 2001 From: ComputelessComputer <63365510+ComputelessComputer@users.noreply.github.com> Date: Fri, 17 Apr 2026 13:37:49 +0900 Subject: [PATCH] Normalize per-sample embeddings before averaging centroid Speaker embeddings must be L2-normalized before averaging so high-magnitude samples don't dominate the centroid. The old code summed raw WeSpeaker outputs and only normalized at the end, which biases the centroid toward louder or longer clips. Now each sample is L2-normalized before summation; the resulting mean is re-normalized as before. --- src-tauri/swift-permissions/src/speech_bridge.swift | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src-tauri/swift-permissions/src/speech_bridge.swift b/src-tauri/swift-permissions/src/speech_bridge.swift index cb5744d..8465704 100644 --- a/src-tauri/swift-permissions/src/speech_bridge.swift +++ b/src-tauri/swift-permissions/src/speech_bridge.swift @@ -403,8 +403,15 @@ private func normalizedEmbeddingCentroid(_ embeddings: [[Float]]) -> [Float] { var centroid = [Float](repeating: 0, count: first.count) for embedding in embeddings where embedding.count == centroid.count { + let sampleNorm = sqrt(embedding.reduce(Float.zero) { partialResult, value in + partialResult + (value * value) + }) + guard sampleNorm > 0 else { + continue + } + for (index, value) in embedding.enumerated() { - centroid[index] += value + centroid[index] += value / sampleNorm } }