From 47f20db1bcb314f531889e96aa1ef4b3f6f5d70f Mon Sep 17 00:00:00 2001 From: Niko Nousiainen Date: Thu, 26 Mar 2026 08:59:10 +0200 Subject: [PATCH] Doing some decoupling of whisper.cpp to allow greater flexibility in the future and easier testing now --- AGENTS.md | 8 +- CMakeLists.txt | 2 + src/app/applicationcommands.cpp | 13 +-- src/transcription/transcriptionengine.cpp | 37 +++++++ src/transcription/transcriptionengine.h | 90 ++++++++++++++++ src/transcription/transcriptionworker.cpp | 17 ++- src/transcription/transcriptionworker.h | 13 ++- src/transcription/whispercpptranscriber.cpp | 7 +- src/transcription/whispercpptranscriber.h | 12 ++- tests/CMakeLists.txt | 13 ++- tests/transcriptionworkertest.cpp | 113 ++++++++++++++++++++ 11 files changed, 302 insertions(+), 23 deletions(-) create mode 100644 src/transcription/transcriptionengine.cpp create mode 100644 src/transcription/transcriptionengine.h create mode 100644 tests/transcriptionworkertest.cpp diff --git a/AGENTS.md b/AGENTS.md index 634f50a..dd001c7 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -33,6 +33,7 @@ This repository is intentionally kept minimal: - `src/clipboardwriter.*`: clipboard integration, preferring KDE system clipboard support - `src/audio/recordingnormalizer.*`: conversion to Whisper-ready mono `float32` at `16 kHz` - `src/transcription/whispercpptranscriber.*`: in-process Whisper integration +- `src/transcription/transcriptionengine.*`: app-owned engine/session seam for backend selection and future runtime evolution - `src/transcription/transcriptionworker.*`: worker object hosted on a dedicated `QThread` - `src/transcription/transcriptiontypes.h`: normalized audio and transcription result value types - `src/config.*`: JSON config loading and defaults @@ -136,6 +137,7 @@ Notes: - When validating inside a restricted sandbox, be ready to disable `ccache` with `CCACHE_DISABLE=1` if the cache location is read-only; that is an execution-environment issue, not a Mutterkey build failure - Prefer fixing the code over weakening `.clang-tidy` or the Clazy check set; only relax tool config when the warning is clearly low-value for this repo - In this Qt-heavy repo, treat `misc-include-cleaner` and `readability-redundant-access-specifiers` as low-value `clang-tidy` noise unless the underlying tool behavior improves; they conflict with Qt header-provider reality and `signals` / `slots` / `Q_SLOTS` sectioning more than they improve safety +- Prefer anonymous-namespace `Q_LOGGING_CATEGORY` for file-local logging categories; `Q_STATIC_LOGGING_CATEGORY` is not portable enough across the Qt versions this repo may build against - Do not add broad Valgrind suppressions by default; only add narrow suppressions after reproducing stable third-party noise and keep them clearly scoped - When adding tests, prefer small `Qt Test` cases that run headlessly under `CTest` and avoid microphone, clipboard, or KDE session dependencies unless the task is specifically integration-focused - For tool-driven cleanups, preserve the existing design and behavior; do not perform broad rewrites just to satisfy style-oriented recommendations @@ -146,7 +148,7 @@ Notes: - Stay within the existing style and structure; do not reformat unrelated code - Prefer small, direct classes over adding abstraction layers without a concrete need - Keep Qt usage idiomatic: `QObject` ownership, signal/slot wiring, and `QThread` boundaries should remain explicit -- Prefer `Q_STATIC_LOGGING_CATEGORY` for translation-unit-local logging categories instead of global `Q_LOGGING_CATEGORY` declarations when no cross-file declaration is needed +- Prefer anonymous-namespace `Q_LOGGING_CATEGORY` for translation-unit-local logging categories when no cross-file declaration is needed; keep the pattern compatible with older Qt builds used in CI - When refactoring Qt class declarations, remember that `moc` still cares about section structure: keep explicit `signals`, `slots`, `Q_SLOTS`, and access sections valid for Qt even if a generic style check suggests flattening them - Prefer explicit validation and safe fallback behavior for config-driven runtime values - Avoid introducing optional backends, plugin systems, or cross-platform abstractions unless the task requires them @@ -154,6 +156,8 @@ Notes: - Prefer narrow shared value types across subsystems; for example, consumers that only need captured audio should include `src/audio/recording.h`, not the full recorder class - Keep JSON and other transport details at subsystem boundaries; prefer typed C++ snapshots/results once data crosses into app-owned control, tray, or service code - Prefer dependency injection for tray-shell and control-surface code from the first implementation so headless Qt tests stay simple +- When preparing the transcription path for future runtime work, prefer app-owned engine/session seams and injected sessions over leaking concrete backend types into CLI, service, or worker orchestration +- Prefer product-owned runtime interfaces, model/session separation, and deterministic backend selection before adding new inference backends or widening cross-platform support - Preserve the current product direction: embedded `whisper.cpp`, KDE-first, CLI/service-first ## C++ Core Guidelines Priorities @@ -225,9 +229,11 @@ Typical model location: - Update `LICENSE`, `THIRD_PARTY_NOTICES.md`, CMake install rules, and `third_party/whisper.cpp.UPSTREAM.md` when packaging, licensing, or vendored dependency behavior changes - Keep `README.md`, `AGENTS.md`, and any relevant local skills aligned with the current `scripts/update-whisper.sh` workflow when the vendor-update process changes - Store upcoming feature plans in `next_feature/` as Markdown files, and update the existing plan there when refining the same upcoming feature instead of scattering notes across the repo +- Keep architecture-evolution plans grounded in incremental slices that preserve the current shipped `whisper.cpp` path while moving ownership of interfaces, tests, and packaging toward repo-owned code - Treat `mutterkey-tray` as a shipped artifact once it is installed or validated in CI; keep install rules, README/setup notes, release checklist items, and workflow checks aligned with that status - Verify with a fresh CMake build when the change affects compilation or linkage - Run `ctest` when touching covered code in `src/config.*` or `src/audio/recordingnormalizer.*`, and extend the deterministic headless tests when practical +- When touching transcription orchestration or backend seams, prefer small headless tests with fake/injected sessions over model-dependent integration tests - When adding or fixing Qt GUI tests, make the `CTest` registration itself headless with `QT_QPA_PLATFORM=offscreen` so CI does not try to load `xcb` - Prefer expanding tests around pure parsing, value normalization, and other environment-independent logic before adding KDE-session or device-heavy coverage - Use `-DMUTTERKEY_ENABLE_ASAN=ON` and `-DMUTTERKEY_ENABLE_UBSAN=ON` for fast iteration on memory and UB bugs, and use the repo-owned Valgrind lane as the slower release-focused confirmation step diff --git a/CMakeLists.txt b/CMakeLists.txt index 8b35c4e..255ec35 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,6 +43,8 @@ set(MUTTERKEY_CORE_SOURCES src/service.cpp src/service.h src/transcription/transcriptiontypes.h + src/transcription/transcriptionengine.cpp + src/transcription/transcriptionengine.h src/transcription/transcriptionworker.cpp src/transcription/transcriptionworker.h src/transcription/whispercpptranscriber.cpp diff --git a/src/app/applicationcommands.cpp b/src/app/applicationcommands.cpp index 33b534f..56a0a1b 100644 --- a/src/app/applicationcommands.cpp +++ b/src/app/applicationcommands.cpp @@ -6,8 +6,8 @@ #include "clipboardwriter.h" #include "control/daemoncontrolserver.h" #include "service.h" +#include "transcription/transcriptionengine.h" #include "transcription/transcriptiontypes.h" -#include "transcription/whispercpptranscriber.h" #include #include @@ -59,18 +59,19 @@ int runDaemon(QGuiApplication &app, const AppConfig &config, const QString &conf int runOnce(QGuiApplication &app, const AppConfig &config, double seconds) { AudioRecorder recorder(config.audio); - WhisperCppTranscriber transcriber(config.transcriber); + const std::unique_ptr transcriptionEngine = createTranscriptionEngine(config.transcriber); + std::unique_ptr transcriber = transcriptionEngine->createSession(); ClipboardWriter clipboardWriter(QGuiApplication::clipboard()); if (config.transcriber.warmupOnStart) { QString warmupError; - if (!transcriber.warmup(&warmupError)) { + if (!transcriber->warmup(&warmupError)) { qCCritical(appLog) << "Failed to warm up transcriber:" << warmupError; return 1; } } - QTimer::singleShot(0, &app, [&app, &recorder, &transcriber, &clipboardWriter, seconds]() { + QTimer::singleShot(0, &app, [&app, &recorder, transcriber = transcriber.get(), &clipboardWriter, seconds]() { QString errorMessage; if (!recorder.start(&errorMessage)) { qCCritical(appLog) << "Failed to start one-shot recording:" << errorMessage; @@ -79,7 +80,7 @@ int runOnce(QGuiApplication &app, const AppConfig &config, double seconds) } qCInfo(appLog) << "Recording for" << seconds << "seconds"; - QTimer::singleShot(static_cast(seconds * 1000), &app, [&app, &recorder, &transcriber, &clipboardWriter]() { + QTimer::singleShot(static_cast(seconds * 1000), &app, [&app, &recorder, transcriber, &clipboardWriter]() { const Recording recording = recorder.stop(); if (!recording.isValid()) { qCCritical(appLog) << "Recorder returned no audio"; @@ -87,7 +88,7 @@ int runOnce(QGuiApplication &app, const AppConfig &config, double seconds) return; } - const TranscriptionResult result = transcriber.transcribe(recording); + const TranscriptionResult result = transcriber->transcribe(recording); if (!result.success) { qCCritical(appLog) << "One-shot transcription failed:" << result.error; QGuiApplication::exit(1); diff --git a/src/transcription/transcriptionengine.cpp b/src/transcription/transcriptionengine.cpp new file mode 100644 index 0000000..0adb8bb --- /dev/null +++ b/src/transcription/transcriptionengine.cpp @@ -0,0 +1,37 @@ +#include "transcription/transcriptionengine.h" + +#include "transcription/whispercpptranscriber.h" + +#include +#include + +namespace { + +class WhisperCppTranscriptionEngine final : public TranscriptionEngine +{ +public: + explicit WhisperCppTranscriptionEngine(TranscriberConfig config) + : m_config(std::move(config)) + { + } + + [[nodiscard]] QString backendName() const override + { + return WhisperCppTranscriber::backendNameStatic(); + } + + [[nodiscard]] std::unique_ptr createSession() const override + { + return std::make_unique(m_config); + } + +private: + TranscriberConfig m_config; +}; + +} // namespace + +std::unique_ptr createTranscriptionEngine(const TranscriberConfig &config) +{ + return std::make_unique(config); +} diff --git a/src/transcription/transcriptionengine.h b/src/transcription/transcriptionengine.h new file mode 100644 index 0000000..ef280c2 --- /dev/null +++ b/src/transcription/transcriptionengine.h @@ -0,0 +1,90 @@ +#pragma once + +#include "config.h" +#include "transcription/transcriptiontypes.h" + +#include + +struct Recording; + +/** + * @file + * @brief Stable engine/session boundary for embedded transcription backends. + */ + +/** + * @brief Mutable per-session transcription interface. + * + * Sessions own backend state that may be warmed up, reused, and kept isolated + * per thread or request flow. + */ +class TranscriptionSession +{ +public: + virtual ~TranscriptionSession() = default; + TranscriptionSession(const TranscriptionSession &) = delete; + TranscriptionSession &operator=(const TranscriptionSession &) = delete; + TranscriptionSession(TranscriptionSession &&) = delete; + TranscriptionSession &operator=(TranscriptionSession &&) = delete; + + /** + * @brief Returns the backend identifier for this live session. + * @return Short backend name used for logs and diagnostics. + */ + [[nodiscard]] virtual QString backendName() const = 0; + + /** + * @brief Performs optional backend warmup for this session. + * @param errorMessage Optional destination for a human-readable failure reason. + * @return `true` if the session is ready for transcription, otherwise `false`. + */ + virtual bool warmup(QString *errorMessage = nullptr) = 0; + + /** + * @brief Transcribes a single captured recording. + * @param recording Captured audio payload to normalize and transcribe. + * @return Normalized transcription result for the provided recording. + */ + [[nodiscard]] virtual TranscriptionResult transcribe(const Recording &recording) = 0; + +protected: + TranscriptionSession() = default; +}; + +/** + * @brief Immutable engine configuration that creates backend sessions. + * + * The engine boundary keeps future backend selection and model-loading policy + * out of the app/service orchestration layers. + */ +class TranscriptionEngine +{ +public: + virtual ~TranscriptionEngine() = default; + TranscriptionEngine(const TranscriptionEngine &) = delete; + TranscriptionEngine &operator=(const TranscriptionEngine &) = delete; + TranscriptionEngine(TranscriptionEngine &&) = delete; + TranscriptionEngine &operator=(TranscriptionEngine &&) = delete; + + /** + * @brief Returns the backend identifier for sessions created by this engine. + * @return Short backend name used for logs and diagnostics. + */ + [[nodiscard]] virtual QString backendName() const = 0; + + /** + * @brief Creates a new isolated transcription session. + * @return Newly constructed session that owns its backend state. + */ + [[nodiscard]] virtual std::unique_ptr createSession() const = 0; + +protected: + TranscriptionEngine() = default; +}; + +/** + * @brief Creates the configured embedded transcription engine. + * @param config Backend configuration copied into the engine. + * @return Engine suitable for creating isolated transcription sessions. + */ +[[nodiscard]] std::unique_ptr createTranscriptionEngine(const TranscriberConfig &config); diff --git a/src/transcription/transcriptionworker.cpp b/src/transcription/transcriptionworker.cpp index 90c2555..f3f72a4 100644 --- a/src/transcription/transcriptionworker.cpp +++ b/src/transcription/transcriptionworker.cpp @@ -1,26 +1,33 @@ #include "transcription/transcriptionworker.h" +#include #include -TranscriptionWorker::TranscriptionWorker(TranscriberConfig config, QObject *parent) +TranscriptionWorker::TranscriptionWorker(const TranscriberConfig &config, QObject *parent) + : TranscriptionWorker(createTranscriptionEngine(config)->createSession(), parent) +{ +} + +TranscriptionWorker::TranscriptionWorker(std::unique_ptr transcriber, QObject *parent) : QObject(parent) - , m_transcriber(std::move(config)) + , m_transcriber(std::move(transcriber)) { + assert(m_transcriber != nullptr); } QString TranscriptionWorker::backendName() const { - return WhisperCppTranscriber::backendName(); + return m_transcriber->backendName(); } bool TranscriptionWorker::warmup(QString *errorMessage) { - return m_transcriber.warmup(errorMessage); + return m_transcriber->warmup(errorMessage); } void TranscriptionWorker::transcribe(const Recording &recording) { - const TranscriptionResult result = m_transcriber.transcribe(recording); + const TranscriptionResult result = m_transcriber->transcribe(recording); if (!result.success) { emit transcriptionFailed(result.error); return; diff --git a/src/transcription/transcriptionworker.h b/src/transcription/transcriptionworker.h index c17ecfb..0cce128 100644 --- a/src/transcription/transcriptionworker.h +++ b/src/transcription/transcriptionworker.h @@ -2,9 +2,10 @@ #include "audio/recording.h" #include "config.h" -#include "transcription/whispercpptranscriber.h" +#include "transcription/transcriptionengine.h" #include +#include /** * @file @@ -28,7 +29,13 @@ class TranscriptionWorker final : public QObject * @param config Transcriber settings copied into the owned backend. * @param parent Optional QObject parent. */ - explicit TranscriptionWorker(TranscriberConfig config, QObject *parent = nullptr); + explicit TranscriptionWorker(const TranscriberConfig &config, QObject *parent = nullptr); + /** + * @brief Creates a worker around an already-constructed session. + * @param transcriber Owned session implementation. + * @param parent Optional QObject parent. + */ + explicit TranscriptionWorker(std::unique_ptr transcriber, QObject *parent = nullptr); ~TranscriptionWorker() override = default; Q_DISABLE_COPY_MOVE(TranscriptionWorker) @@ -67,5 +74,5 @@ class TranscriptionWorker final : public QObject private: /// Owned transcription backend implementation. - WhisperCppTranscriber m_transcriber; + std::unique_ptr m_transcriber; }; diff --git a/src/transcription/whispercpptranscriber.cpp b/src/transcription/whispercpptranscriber.cpp index 4ac784b..e376fe8 100644 --- a/src/transcription/whispercpptranscriber.cpp +++ b/src/transcription/whispercpptranscriber.cpp @@ -76,11 +76,16 @@ void WhisperCppTranscriber::freeContext(whisper_context *context) noexcept } } -QString WhisperCppTranscriber::backendName() +QString WhisperCppTranscriber::backendNameStatic() { return QStringLiteral("whisper.cpp"); } +QString WhisperCppTranscriber::backendName() const +{ + return backendNameStatic(); +} + bool WhisperCppTranscriber::warmup(QString *errorMessage) { return ensureInitialized(errorMessage); diff --git a/src/transcription/whispercpptranscriber.h b/src/transcription/whispercpptranscriber.h index 83633cc..079bdcf 100644 --- a/src/transcription/whispercpptranscriber.h +++ b/src/transcription/whispercpptranscriber.h @@ -2,6 +2,7 @@ #include "audio/recordingnormalizer.h" #include "config.h" +#include "transcription/transcriptionengine.h" #include "transcription/transcriptiontypes.h" #include @@ -20,7 +21,7 @@ struct whisper_context; * RAII-managed smart pointer so service shutdown and worker teardown stay * deterministic. */ -class WhisperCppTranscriber final +class WhisperCppTranscriber final : public TranscriptionSession { public: /** @@ -32,7 +33,7 @@ class WhisperCppTranscriber final /** * @brief Releases the owned whisper.cpp context. */ - ~WhisperCppTranscriber(); + ~WhisperCppTranscriber() override; WhisperCppTranscriber(const WhisperCppTranscriber &) = delete; WhisperCppTranscriber &operator=(const WhisperCppTranscriber &) = delete; @@ -43,21 +44,22 @@ class WhisperCppTranscriber final * @brief Returns the backend name used in diagnostics. * @return Human-readable backend identifier. */ - [[nodiscard]] static QString backendName(); + [[nodiscard]] static QString backendNameStatic(); + [[nodiscard]] QString backendName() const override; /** * @brief Eagerly initializes the whisper.cpp context. * @param errorMessage Optional output for initialization failures. * @return `true` when the backend is ready for transcription. */ - bool warmup(QString *errorMessage = nullptr); + bool warmup(QString *errorMessage = nullptr) override; /** * @brief Normalizes and transcribes one captured recording. * @param recording Captured audio payload and format metadata. * @return Structured transcription result. */ - [[nodiscard]] TranscriptionResult transcribe(const Recording &recording); + [[nodiscard]] TranscriptionResult transcribe(const Recording &recording) override; private: /** diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index cdd3d30..a96f789 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -35,6 +35,14 @@ mutterkey_add_qt_test(daemoncontroltypestest ../src/control/daemoncontroltypes.cpp ) +mutterkey_add_qt_test(transcriptionworkertest + transcriptionworkertest.cpp + ../src/audio/recordingnormalizer.cpp + ../src/transcription/transcriptionengine.cpp + ../src/transcription/transcriptionworker.cpp + ../src/transcription/whispercpptranscriber.cpp +) + mutterkey_add_qt_test(traystatuswindowtest traystatuswindowtest.cpp ../src/control/daemoncontrolclient.cpp @@ -47,12 +55,13 @@ mutterkey_add_qt_test(traystatuswindowtest target_link_libraries(configtest PRIVATE whisper) target_link_libraries(commanddispatchtest PRIVATE whisper) target_link_libraries(daemoncontroltypestest PRIVATE whisper) +target_link_libraries(transcriptionworkertest PRIVATE whisper) target_link_libraries(traystatuswindowtest PRIVATE whisper) if(TARGET clang-tidy) - add_dependencies(clang-tidy configtest_autogen commanddispatchtest_autogen recordingnormalizertest_autogen daemoncontrolprotocoltest_autogen daemoncontroltypestest_autogen traystatuswindowtest_autogen) + add_dependencies(clang-tidy configtest_autogen commanddispatchtest_autogen recordingnormalizertest_autogen daemoncontrolprotocoltest_autogen daemoncontroltypestest_autogen transcriptionworkertest_autogen traystatuswindowtest_autogen) endif() if(TARGET clazy) - add_dependencies(clazy configtest_autogen commanddispatchtest_autogen recordingnormalizertest_autogen daemoncontrolprotocoltest_autogen daemoncontroltypestest_autogen traystatuswindowtest_autogen) + add_dependencies(clazy configtest_autogen commanddispatchtest_autogen recordingnormalizertest_autogen daemoncontrolprotocoltest_autogen daemoncontroltypestest_autogen transcriptionworkertest_autogen traystatuswindowtest_autogen) endif() diff --git a/tests/transcriptionworkertest.cpp b/tests/transcriptionworkertest.cpp new file mode 100644 index 0000000..ee3d5a8 --- /dev/null +++ b/tests/transcriptionworkertest.cpp @@ -0,0 +1,113 @@ +#include "audio/recording.h" +#include "transcription/transcriptionengine.h" +#include "transcription/transcriptionworker.h" + +#include +#include + +#include +#include + +namespace { + +class FakeTranscriptionSession final : public TranscriptionSession +{ +public: + explicit FakeTranscriptionSession(TranscriptionResult result) + : m_result(std::move(result)) + { + } + + [[nodiscard]] QString backendName() const override + { + return QStringLiteral("fake"); + } + + bool warmup(QString *errorMessage) override + { + if (!m_warmupError.isEmpty() && errorMessage != nullptr) { + *errorMessage = m_warmupError; + } + return m_warmupError.isEmpty(); + } + + [[nodiscard]] TranscriptionResult transcribe(const Recording &) override + { + return m_result; + } + + void setWarmupError(QString warmupError) + { + m_warmupError = std::move(warmupError); + } + +private: + TranscriptionResult m_result; + QString m_warmupError; +}; + +class TranscriptionWorkerTest final : public QObject +{ + Q_OBJECT + +private slots: + void reportsInjectedBackendName(); + void emitsReadySignalForSuccessfulTranscription(); + void emitsFailureSignalForFailedTranscription(); + void surfacesWarmupFailure(); +}; + +} // namespace + +void TranscriptionWorkerTest::reportsInjectedBackendName() +{ + auto session = std::make_unique(TranscriptionResult{.success = true, .text = {}}); + const TranscriptionWorker worker(std::move(session)); + + QCOMPARE(worker.backendName(), QStringLiteral("fake")); +} + +void TranscriptionWorkerTest::emitsReadySignalForSuccessfulTranscription() +{ + auto session = + std::make_unique(TranscriptionResult{.success = true, .text = QStringLiteral("hello world")}); + TranscriptionWorker worker(std::move(session)); + const QSignalSpy readySpy(&worker, &TranscriptionWorker::transcriptionReady); + const QSignalSpy failedSpy(&worker, &TranscriptionWorker::transcriptionFailed); + + worker.transcribe(Recording{}); + + QCOMPARE(readySpy.count(), 1); + QCOMPARE(failedSpy.count(), 0); + QCOMPARE(readySpy.at(0).at(0).toString(), QStringLiteral("hello world")); +} + +void TranscriptionWorkerTest::emitsFailureSignalForFailedTranscription() +{ + auto session = std::make_unique( + TranscriptionResult{.success = false, .text = {}, .error = QStringLiteral("decode failed")}); + TranscriptionWorker worker(std::move(session)); + const QSignalSpy readySpy(&worker, &TranscriptionWorker::transcriptionReady); + const QSignalSpy failedSpy(&worker, &TranscriptionWorker::transcriptionFailed); + + worker.transcribe(Recording{}); + + QCOMPARE(readySpy.count(), 0); + QCOMPARE(failedSpy.count(), 1); + QCOMPARE(failedSpy.at(0).at(0).toString(), QStringLiteral("decode failed")); +} + +void TranscriptionWorkerTest::surfacesWarmupFailure() +{ + auto session = std::make_unique(TranscriptionResult{.success = true, .text = {}}); + session->setWarmupError(QStringLiteral("model unavailable")); + TranscriptionWorker worker(std::move(session)); + + QString errorMessage; + QVERIFY(!worker.warmup(&errorMessage)); + QCOMPARE(errorMessage, QStringLiteral("model unavailable")); +} + +QTEST_APPLESS_MAIN(TranscriptionWorkerTest) + +#include "transcriptionworkertest.moc"