Tolerate invalid UTF-8 in whisper JSON output.

whisper_cpp.py: read the JSON file as bytes, decode with errors="replace",
log a warning when U+FFFD appears, and wrap JSONDecodeError/OSError in
RuntimeError.
whisper_server.py: route both response branches through the new
_load_json_response_bytes helper; the "json" branch returns "" when the
decoded payload is not a Mapping.
tests/test_whisper_server.py: new unit tests for the helper and for
_load_transcript_from_verbose_json with a replacement character.

diff --git a/stackchan_server/speech_recognition/whisper_cpp.py b/stackchan_server/speech_recognition/whisper_cpp.py
index f0925a2..268015c 100644
--- a/stackchan_server/speech_recognition/whisper_cpp.py
+++ b/stackchan_server/speech_recognition/whisper_cpp.py
@@ -165,7 +165,14 @@ def _normalize_transcript(text: str) -> str:
 
 
 def _load_transcript_from_json(path: Path) -> str:
-    data = json.loads(path.read_text(encoding="utf-8"))
+    try:
+        raw_bytes = path.read_bytes()
+        text = raw_bytes.decode("utf-8", errors="replace")
+        if "\ufffd" in text:
+            logger.warning("whisper.cpp JSON output contains invalid UTF-8 bytes")
+        data = json.loads(text)
+    except (json.JSONDecodeError, OSError) as exc:
+        raise RuntimeError(f"Failed to read whisper.cpp JSON output: {exc}") from exc
     transcription = data.get("transcription")
     if not isinstance(transcription, list):
         return ""
diff --git a/stackchan_server/speech_recognition/whisper_server.py b/stackchan_server/speech_recognition/whisper_server.py
index c3b5d69..eb5d44d 100644
--- a/stackchan_server/speech_recognition/whisper_server.py
+++ b/stackchan_server/speech_recognition/whisper_server.py
@@ -96,11 +96,14 @@ def _request_transcript(self, wav_bytes: bytes, language: str) -> str:
             raise RuntimeError(f"whisper-server request failed: {exc.reason}") from exc
 
         if self._response_format == "json":
-            payload = json.loads(response_body.decode("utf-8"))
+            payload = _load_json_response_bytes(response_body)
+            if not isinstance(payload, Mapping):
+                return ""
+            payload = cast(Mapping[str, object], payload)
             text = payload.get("text")
             return text.strip() if isinstance(text, str) else ""
 
-        payload = json.loads(response_body.decode("utf-8"))
+        payload = _load_json_response_bytes(response_body)
         return _load_transcript_from_verbose_json(payload)
 
 
@@ -118,6 +121,13 @@ def _normalize_language(language_code: str) -> str:
     return language_code.split("-", 1)[0].lower()
 
 
+def _load_json_response_bytes(response_body: bytes) -> object:
+    response_text = response_body.decode("utf-8", errors="replace")
+    if "\ufffd" in response_text:
+        logger.warning("whisper-server JSON output contains invalid UTF-8 bytes")
+    return json.loads(response_text)
+
+
 def _load_transcript_from_verbose_json(payload: object) -> str:
     if not isinstance(payload, Mapping):
         return ""
diff --git a/tests/test_whisper_server.py b/tests/test_whisper_server.py
new file mode 100644
index 0000000..5469702
--- /dev/null
+++ b/tests/test_whisper_server.py
@@ -0,0 +1,31 @@
+from __future__ import annotations
+
+import unittest
+
+from stackchan_server.speech_recognition.whisper_server import (
+    _load_json_response_bytes,
+    _load_transcript_from_verbose_json,
+)
+
+
+class WhisperServerJsonTests(unittest.TestCase):
+    def test_load_json_response_bytes_replaces_invalid_utf8(self) -> None:
+        payload = _load_json_response_bytes(b'{"transcription":[{"text":"\xe6\x90"},{"text":"ok"}]}')
+
+        self.assertEqual(payload, {"transcription": [{"text": "�"}, {"text": "ok"}]})
+
+    def test_load_transcript_from_verbose_json_with_replacement_char(self) -> None:
+        payload = {
+            "transcription": [
+                {"text": "�"},
+                {"text": "ok"},
+            ]
+        }
+
+        transcript = _load_transcript_from_verbose_json(payload)
+
+        self.assertEqual(transcript, "� ok")
+
+
+if __name__ == "__main__":
+    unittest.main()