From 27014d04ea71b70883d71073466e82f35611f599 Mon Sep 17 00:00:00 2001 From: Young Han Date: Tue, 24 Mar 2026 10:37:12 -0700 Subject: [PATCH 01/23] Fix Voxtral macOS packaging for runtime libraries. Bundle libc++ alongside the runner and add the rpaths needed for the generated Metal library to load reliably at runtime. Made-with: Cursor --- .../macos/VoxtralRealtime.xcodeproj/project.pbxproj | 10 ++++++---- voxtral_realtime/macos/project.yml | 11 +++++++++++ voxtral_realtime/macos/scripts/create_dmg.sh | 1 + 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/voxtral_realtime/macos/VoxtralRealtime.xcodeproj/project.pbxproj b/voxtral_realtime/macos/VoxtralRealtime.xcodeproj/project.pbxproj index 7b9bfb751..cc7f201ba 100644 --- a/voxtral_realtime/macos/VoxtralRealtime.xcodeproj/project.pbxproj +++ b/voxtral_realtime/macos/VoxtralRealtime.xcodeproj/project.pbxproj @@ -174,9 +174,10 @@ attributes = { BuildIndependentTargetsInParallel = YES; LastUpgradeCheck = 1600; + TargetAttributes = { + }; }; buildConfigurationList = DD10F8C9604ABCBFD61FA78C /* Build configuration list for PBXProject "VoxtralRealtime" */; - compatibilityVersion = "Xcode 14.0"; developmentRegion = en; hasScannedForEncodings = 0; knownRegions = ( @@ -186,6 +187,7 @@ mainGroup = E7943F393DA257C006D6411D; minimizedProjectReferenceProxies = 1; preferredProjectObjectVersion = 77; + productRefGroup = C62DD9BE8818F70C8044D7FF /* Products */; projectDirPath = ""; projectRoot = ""; targets = ( @@ -223,7 +225,7 @@ ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; - shellScript = "set -euo pipefail\n\nET_PATH=\"${EXECUTORCH_PATH:-${HOME}/executorch}\"\nRUNNER_SRC=\"${ET_PATH}/cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner\"\nMODEL_DIR=\"${MODEL_DIR:-${HOME}/voxtral_realtime_quant_metal}\"\nLIBOMP_SRC=\"/opt/homebrew/opt/libomp/lib/libomp.dylib\"\nDEST=\"${BUILT_PRODUCTS_DIR}/${CONTENTS_FOLDER_PATH}/Resources\"\n\nmkdir -p \"${DEST}\"\n\ncopy_if_newer() {\n local 
src=\"$1\" dst=\"$2\"\n if [ ! -f \"${src}\" ]; then\n echo \"warning: Not found: ${src}\"\n return\n fi\n if [ ! -f \"${dst}\" ] || [ \"${src}\" -nt \"${dst}\" ]; then\n cp -fL \"${src}\" \"${dst}\"\n echo \"✓ Bundled $(basename \"${dst}\")\"\n else\n echo \"· $(basename \"${dst}\") up to date\"\n fi\n}\n\n# Runner binary\ncopy_if_newer \"${RUNNER_SRC}\" \"${DEST}/voxtral_realtime_runner\"\nchmod +x \"${DEST}/voxtral_realtime_runner\" 2>/dev/null || true\n\n# libomp (runner dependency)\ncopy_if_newer \"${LIBOMP_SRC}\" \"${DEST}/libomp.dylib\"\n\n# Patch runner to find libomp via @executable_path (SIP strips DYLD_LIBRARY_PATH)\nif [ -f \"${DEST}/voxtral_realtime_runner\" ] && [ -f \"${DEST}/libomp.dylib\" ]; then\n install_name_tool -change /opt/llvm-openmp/lib/libomp.dylib @executable_path/libomp.dylib \"${DEST}/voxtral_realtime_runner\" 2>/dev/null || true\n echo \"✓ Patched runner rpath for libomp\"\nfi\n\n# Model artifacts\ncopy_if_newer \"/Users/younghan/voxtral_realtime_quant_metal/model-metal-int4.pte\" \"${DEST}/model-metal-int4.pte\"\ncopy_if_newer \"/Users/younghan/voxtral_realtime_quant_metal/preprocessor.pte\" \"${DEST}/preprocessor.pte\"\ncopy_if_newer \"/Users/younghan/voxtral_realtime_quant_metal/tekken.json\" \"${DEST}/tekken.json\"\n"; + shellScript = "set -euo pipefail\n\nET_PATH=\"${EXECUTORCH_PATH:-${HOME}/executorch}\"\nRUNNER_SRC=\"${ET_PATH}/cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner\"\nMODEL_DIR=\"${MODEL_DIR:-${HOME}/voxtral_realtime_quant_metal}\"\nLIBOMP_SRC=\"/opt/homebrew/opt/libomp/lib/libomp.dylib\"\nLIBCXX_SRC=\"/usr/lib/libc++.1.dylib\"\nDEST=\"${BUILT_PRODUCTS_DIR}/${CONTENTS_FOLDER_PATH}/Resources\"\n\nmkdir -p \"${DEST}\"\n\ncopy_if_newer() {\n local src=\"$1\" dst=\"$2\"\n if [ ! -f \"${src}\" ]; then\n echo \"warning: Not found: ${src}\"\n return\n fi\n if [ ! 
-f \"${dst}\" ] || [ \"${src}\" -nt \"${dst}\" ]; then\n cp -fL \"${src}\" \"${dst}\"\n echo \"✓ Bundled $(basename \"${dst}\")\"\n else\n echo \"· $(basename \"${dst}\") up to date\"\n fi\n}\n\n# Runner binary\ncopy_if_newer \"${RUNNER_SRC}\" \"${DEST}/voxtral_realtime_runner\"\nchmod +x \"${DEST}/voxtral_realtime_runner\" 2>/dev/null || true\n\n# libomp (runner dependency)\ncopy_if_newer \"${LIBOMP_SRC}\" \"${DEST}/libomp.dylib\"\n\n# libc++ (required by Metal blob dylib loaded at runtime)\ncopy_if_newer \"${LIBCXX_SRC}\" \"${DEST}/libc++.1.dylib\"\n\n# Patch runner to find libomp via @executable_path (SIP strips DYLD_LIBRARY_PATH)\nif [ -f \"${DEST}/voxtral_realtime_runner\" ] && [ -f \"${DEST}/libomp.dylib\" ]; then\n install_name_tool -change /opt/llvm-openmp/lib/libomp.dylib @executable_path/libomp.dylib \"${DEST}/voxtral_realtime_runner\" 2>/dev/null || true\n echo \"✓ Patched runner rpath for libomp\"\nfi\n\n# Ensure @rpath can resolve libc++ for runtime-generated Metal dylibs\nif [ -f \"${DEST}/voxtral_realtime_runner\" ]; then\n install_name_tool -add_rpath @executable_path \"${DEST}/voxtral_realtime_runner\" 2>/dev/null || true\n install_name_tool -add_rpath /usr/lib \"${DEST}/voxtral_realtime_runner\" 2>/dev/null || true\n echo \"✓ Added rpaths for libc++\"\nfi\n\n# Model artifacts\ncopy_if_newer \"${MODEL_DIR}/model-metal-int4.pte\" \"${DEST}/model-metal-int4.pte\"\ncopy_if_newer \"${MODEL_DIR}/preprocessor.pte\" \"${DEST}/preprocessor.pte\"\ncopy_if_newer \"${MODEL_DIR}/tekken.json\" \"${DEST}/tekken.json\"\n"; }; /* End PBXShellScriptBuildPhase section */ @@ -272,7 +274,7 @@ "$(inherited)", "@executable_path/../Frameworks", ); - PRODUCT_BUNDLE_IDENTIFIER = com.younghan.VoxtralRealtime; + PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.VoxtralRealtime; PRODUCT_NAME = "Voxtral Realtime"; SDKROOT = macosx; }; @@ -292,7 +294,7 @@ "$(inherited)", "@executable_path/../Frameworks", ); - PRODUCT_BUNDLE_IDENTIFIER = com.younghan.VoxtralRealtime; + 
PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.VoxtralRealtime; PRODUCT_NAME = "Voxtral Realtime"; SDKROOT = macosx; }; diff --git a/voxtral_realtime/macos/project.yml b/voxtral_realtime/macos/project.yml index 2586da9ab..de289c635 100644 --- a/voxtral_realtime/macos/project.yml +++ b/voxtral_realtime/macos/project.yml @@ -46,6 +46,7 @@ targets: RUNNER_SRC="${ET_PATH}/cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner" MODEL_DIR="${MODEL_DIR:-${HOME}/voxtral_realtime_quant_metal}" LIBOMP_SRC="/opt/homebrew/opt/libomp/lib/libomp.dylib" + LIBCXX_SRC="/usr/lib/libc++.1.dylib" DEST="${BUILT_PRODUCTS_DIR}/${CONTENTS_FOLDER_PATH}/Resources" mkdir -p "${DEST}" @@ -71,12 +72,22 @@ targets: # libomp (runner dependency) copy_if_newer "${LIBOMP_SRC}" "${DEST}/libomp.dylib" + # libc++ (required by Metal blob dylib loaded at runtime) + copy_if_newer "${LIBCXX_SRC}" "${DEST}/libc++.1.dylib" + # Patch runner to find libomp via @executable_path (SIP strips DYLD_LIBRARY_PATH) if [ -f "${DEST}/voxtral_realtime_runner" ] && [ -f "${DEST}/libomp.dylib" ]; then install_name_tool -change /opt/llvm-openmp/lib/libomp.dylib @executable_path/libomp.dylib "${DEST}/voxtral_realtime_runner" 2>/dev/null || true echo "✓ Patched runner rpath for libomp" fi + # Ensure @rpath can resolve libc++ for runtime-generated Metal dylibs + if [ -f "${DEST}/voxtral_realtime_runner" ]; then + install_name_tool -add_rpath @executable_path "${DEST}/voxtral_realtime_runner" 2>/dev/null || true + install_name_tool -add_rpath /usr/lib "${DEST}/voxtral_realtime_runner" 2>/dev/null || true + echo "✓ Added rpaths for libc++" + fi + # Model artifacts copy_if_newer "${MODEL_DIR}/model-metal-int4.pte" "${DEST}/model-metal-int4.pte" copy_if_newer "${MODEL_DIR}/preprocessor.pte" "${DEST}/preprocessor.pte" diff --git a/voxtral_realtime/macos/scripts/create_dmg.sh b/voxtral_realtime/macos/scripts/create_dmg.sh index 1c6e12f62..df29025b5 100755 --- a/voxtral_realtime/macos/scripts/create_dmg.sh +++ 
b/voxtral_realtime/macos/scripts/create_dmg.sh @@ -29,6 +29,7 @@ RESOURCES="${APP_PATH}/Contents/Resources" REQUIRED_FILES=( "voxtral_realtime_runner" "libomp.dylib" + "libc++.1.dylib" "model-metal-int4.pte" "preprocessor.pte" "tekken.json" From 95f1053f3983e72437f86effeb93fb7186379ba0 Mon Sep 17 00:00:00 2001 From: Young Han Date: Wed, 25 Mar 2026 14:17:56 -0700 Subject: [PATCH 02/23] Add Flow-like features and Silero VAD wake to Voxtral macOS app Add text replacements (post-ASR word/phrase substitution), snippets (voice-triggered template expansion), history polish (rich session metadata, pinning, recency grouping, multi-format export), and a Silero VAD "hey torch" wake pipeline that runs dictation silently in background until the wake keyword is confirmed. Replacements and Snippets are now first-class sidebar pages instead of being buried in Settings tabs. The silence monitor uses consecutive-poll tracking for more robust endpointing. VAD hangover is wired to the preference slider. Microphone silence is detected and surfaced as an error banner. Authored with assistance from Claude. 
Made-with: Cursor --- .../VoxtralRealtime.xcodeproj/project.pbxproj | 146 ++++++++++- .../xcschemes/VoxtralRealtime.xcscheme | 113 +++++++++ .../VoxtralRealtime/Models/Preferences.swift | 78 ++++++ .../Models/ReplacementEntry.swift | 37 +++ .../VoxtralRealtime/Models/Session.swift | 58 ++++- .../VoxtralRealtime/Models/Snippet.swift | 37 +++ .../Models/TranscriptStore.swift | 114 ++++++++- .../VoxtralRealtime/Models/WakeState.swift | 17 ++ .../Services/DictationManager.swift | 170 ++++++++++++- .../Services/ReplacementStore.swift | 77 ++++++ .../Services/RunnerBridge.swift | 11 + .../Services/SnippetStore.swift | 81 ++++++ .../Services/TextPipeline.swift | 168 +++++++++++++ .../VoxtralRealtime/Services/VadService.swift | 231 ++++++++++++++++++ .../Utilities/PersistencePaths.swift | 30 +++ .../Utilities/RunnerError.swift | 3 + .../Utilities/SessionExportFormat.swift | 88 +++++++ .../VoxtralRealtime/Views/ContentView.swift | 42 +++- .../Views/DictationOverlayView.swift | 17 +- .../Views/ReplacementManagementView.swift | 153 ++++++++++++ .../VoxtralRealtime/Views/SettingsView.swift | 68 +++++- .../VoxtralRealtime/Views/SidebarView.swift | 148 +++++++++-- .../Views/SnippetManagementView.swift | 177 ++++++++++++++ .../VoxtralRealtime/Views/WelcomeView.swift | 28 ++- .../VoxtralRealtime/VoxtralRealtimeApp.swift | 11 +- .../SessionCompatibilityTests.swift | 62 +++++ .../TextPipelineTests.swift | 88 +++++++ voxtral_realtime/macos/project.yml | 34 +++ 28 files changed, 2231 insertions(+), 56 deletions(-) create mode 100644 voxtral_realtime/macos/VoxtralRealtime.xcodeproj/xcshareddata/xcschemes/VoxtralRealtime.xcscheme create mode 100644 voxtral_realtime/macos/VoxtralRealtime/Models/ReplacementEntry.swift create mode 100644 voxtral_realtime/macos/VoxtralRealtime/Models/Snippet.swift create mode 100644 voxtral_realtime/macos/VoxtralRealtime/Models/WakeState.swift create mode 100644 voxtral_realtime/macos/VoxtralRealtime/Services/ReplacementStore.swift create mode 100644 
voxtral_realtime/macos/VoxtralRealtime/Services/SnippetStore.swift create mode 100644 voxtral_realtime/macos/VoxtralRealtime/Services/TextPipeline.swift create mode 100644 voxtral_realtime/macos/VoxtralRealtime/Services/VadService.swift create mode 100644 voxtral_realtime/macos/VoxtralRealtime/Utilities/PersistencePaths.swift create mode 100644 voxtral_realtime/macos/VoxtralRealtime/Utilities/SessionExportFormat.swift create mode 100644 voxtral_realtime/macos/VoxtralRealtime/Views/ReplacementManagementView.swift create mode 100644 voxtral_realtime/macos/VoxtralRealtime/Views/SnippetManagementView.swift create mode 100644 voxtral_realtime/macos/VoxtralRealtimeTests/SessionCompatibilityTests.swift create mode 100644 voxtral_realtime/macos/VoxtralRealtimeTests/TextPipelineTests.swift diff --git a/voxtral_realtime/macos/VoxtralRealtime.xcodeproj/project.pbxproj b/voxtral_realtime/macos/VoxtralRealtime.xcodeproj/project.pbxproj index cc7f201ba..b87106d18 100644 --- a/voxtral_realtime/macos/VoxtralRealtime.xcodeproj/project.pbxproj +++ b/voxtral_realtime/macos/VoxtralRealtime.xcodeproj/project.pbxproj @@ -8,9 +8,13 @@ /* Begin PBXBuildFile section */ 00C95410EB7173CB8EC29546 /* DictationManager.swift in Sources */ = {isa = PBXBuildFile; fileRef = 085054419A1BAA91DC852C48 /* DictationManager.swift */; }; + 0902FE224C2886A2B01C53FC /* ReplacementStore.swift in Sources */ = {isa = PBXBuildFile; fileRef = 23662220D8DC0A9E945AE589 /* ReplacementStore.swift */; }; + 0F53A5D1181652FA4510B10C /* PersistencePaths.swift in Sources */ = {isa = PBXBuildFile; fileRef = AB52D50C17A7A729A04F559D /* PersistencePaths.swift */; }; 1850513C3735A844666D0217 /* DictationOverlayView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 54933FCE568E7DC5C512B5D8 /* DictationOverlayView.swift */; }; + 1C4C9D6705E1843D518C5EC1 /* ReplacementEntry.swift in Sources */ = {isa = PBXBuildFile; fileRef = FF25A732CF8FA1A7A982A8D8 /* ReplacementEntry.swift */; }; 2C0D9096EB59C859E877427A /* 
WelcomeView.swift in Sources */ = {isa = PBXBuildFile; fileRef = D2245F740610F505B2D2905E /* WelcomeView.swift */; }; 2C387248936D9F2313B8E2FB /* AudioEngine.swift in Sources */ = {isa = PBXBuildFile; fileRef = D14DCD3E95F36CEACCEF4335 /* AudioEngine.swift */; }; + 402BE72495113F445B8C693E /* TextPipelineTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = AF2BCF50209E617141188684 /* TextPipelineTests.swift */; }; 437CDA8161D03BC53BF5ACAA /* SettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = B977C4A3EF8EE9770D939E98 /* SettingsView.swift */; }; 4E3EFCC72CC492D94CC9494E /* Session.swift in Sources */ = {isa = PBXBuildFile; fileRef = E0D0245890DF77D480572858 /* Session.swift */; }; 4EDEB7964D71C06220AE2CE5 /* Preferences.swift in Sources */ = {isa = PBXBuildFile; fileRef = 74547233C8BECECCBCAC2E8C /* Preferences.swift */; }; @@ -18,34 +22,64 @@ 51009A5D18D14D7B19A77645 /* DictationPanel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 97AF5A7A49BCC1D9F9404C02 /* DictationPanel.swift */; }; 54B04C6AFEC0BEC16D5A427B /* RunnerBridge.swift in Sources */ = {isa = PBXBuildFile; fileRef = 65D90AECAFB4D759F980E25A /* RunnerBridge.swift */; }; 5E5BEB5668CE4A42F6B67569 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 9069BFAFAFDDBB137F34675E /* ContentView.swift */; }; + 6248A7B9E31EB5CBC9451656 /* Session.swift in Sources */ = {isa = PBXBuildFile; fileRef = E0D0245890DF77D480572858 /* Session.swift */; }; 6ABE5D7F989ACBF0E68BF8AD /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = E4B018A732C7D539547855A3 /* Assets.xcassets */; }; 710AF905B60EB23C212AAC30 /* HealthCheck.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6CAB3C82C8E7393E06E6EE6E /* HealthCheck.swift */; }; 75FD2E801951086DFE9E28E3 /* AudioLevelView.swift in Sources */ = {isa = PBXBuildFile; fileRef = B5282DE2DFDC7D992AC7D81D /* AudioLevelView.swift */; }; + 7CF7B1783B1948A4CA22BBB9 /* SessionCompatibilityTests.swift in Sources */ = {isa = 
PBXBuildFile; fileRef = 5521AE4951FF78020CF91D18 /* SessionCompatibilityTests.swift */; }; + 7F4D25A854D63B9F528C56A0 /* ReplacementEntry.swift in Sources */ = {isa = PBXBuildFile; fileRef = FF25A732CF8FA1A7A982A8D8 /* ReplacementEntry.swift */; }; 83C2211EB3926FCE3AE55BA8 /* TranscriptView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 9539C4044A50A585F04687CA /* TranscriptView.swift */; }; + 8E1D5D4563D1FEA413FD49C6 /* PersistencePaths.swift in Sources */ = {isa = PBXBuildFile; fileRef = AB52D50C17A7A729A04F559D /* PersistencePaths.swift */; }; + 92C09998C9463D1BD392F8C0 /* ReplacementStore.swift in Sources */ = {isa = PBXBuildFile; fileRef = 23662220D8DC0A9E945AE589 /* ReplacementStore.swift */; }; + 980B7949F856A2C50A65C1AD /* Snippet.swift in Sources */ = {isa = PBXBuildFile; fileRef = 592F5A369D8A7B187E7AC800 /* Snippet.swift */; }; 9D0149A93C0234587B0A7655 /* RecordingControls.swift in Sources */ = {isa = PBXBuildFile; fileRef = BE9D72CEB451260E46996613 /* RecordingControls.swift */; }; A23029B5977019EBC0CC217F /* SetupGuideView.swift in Sources */ = {isa = PBXBuildFile; fileRef = A7DC59AEFE4416E91D2A001F /* SetupGuideView.swift */; }; B4086DFD7477802852F31D0E /* VoxtralRealtimeApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = D1FF01EEFDD18E056509CEB0 /* VoxtralRealtimeApp.swift */; }; + B4AF3290D7820297BA97100E /* TextPipeline.swift in Sources */ = {isa = PBXBuildFile; fileRef = 31283A9DAEC7965B08CB6A55 /* TextPipeline.swift */; }; + B5A4C34832F186322489706E /* VadService.swift in Sources */ = {isa = PBXBuildFile; fileRef = 62135A876AFC6B0DDCBF306F /* VadService.swift */; }; BA4D276D05BDEC9EE49C8979 /* RunnerError.swift in Sources */ = {isa = PBXBuildFile; fileRef = 887F4BCA7C5EBE3951E6BB56 /* RunnerError.swift */; }; BDC0CBA04D99E79D5EBA5DC6 /* TranscriptStore.swift in Sources */ = {isa = PBXBuildFile; fileRef = 26CC8EBA770C5AE5AE9FE5AF /* TranscriptStore.swift */; }; + BFD3E022202BBB6E261C5530 /* SnippetStore.swift in Sources */ = {isa = 
PBXBuildFile; fileRef = 96482F9055320E605013E741 /* SnippetStore.swift */; }; + CD94243BBE7525C8BBA1EA4B /* SessionExportFormat.swift in Sources */ = {isa = PBXBuildFile; fileRef = 12715E449A97150F1BE9A5E3 /* SessionExportFormat.swift */; }; + D6C1940C018A1B7A21007749 /* SnippetManagementView.swift in Sources */ = {isa = PBXBuildFile; fileRef = A16CFDDF3005489D5CEFD58B /* SnippetManagementView.swift */; }; DB7FD45624EA9EECDA9DD799 /* ErrorBannerView.swift in Sources */ = {isa = PBXBuildFile; fileRef = A2CB74DFC873A590526E6138 /* ErrorBannerView.swift */; }; + EA47EF0DBDDEB7A6B19125BB /* TextPipeline.swift in Sources */ = {isa = PBXBuildFile; fileRef = 31283A9DAEC7965B08CB6A55 /* TextPipeline.swift */; }; + EE2CF6B261CC691736E9BBDE /* WakeState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8CC6853F668DED1C9E00D31A /* WakeState.swift */; }; + F012D0ADEAFFA55D329C9470 /* SnippetStore.swift in Sources */ = {isa = PBXBuildFile; fileRef = 96482F9055320E605013E741 /* SnippetStore.swift */; }; + F2871C888DE94A48AE31E175 /* SessionExportFormat.swift in Sources */ = {isa = PBXBuildFile; fileRef = 12715E449A97150F1BE9A5E3 /* SessionExportFormat.swift */; }; + F99C64CE2B73E3488A6943DB /* ReplacementManagementView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 3DA671728594173ABC914B67 /* ReplacementManagementView.swift */; }; + FE02B324621EAC80BEADC1B5 /* Snippet.swift in Sources */ = {isa = PBXBuildFile; fileRef = 592F5A369D8A7B187E7AC800 /* Snippet.swift */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ 085054419A1BAA91DC852C48 /* DictationManager.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DictationManager.swift; sourceTree = ""; }; + 12715E449A97150F1BE9A5E3 /* SessionExportFormat.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SessionExportFormat.swift; sourceTree = ""; }; 1FB850226CD44666C82C7740 /* VoxtralRealtime.entitlements */ = {isa = PBXFileReference; 
lastKnownFileType = text.plist.entitlements; path = VoxtralRealtime.entitlements; sourceTree = ""; }; + 23662220D8DC0A9E945AE589 /* ReplacementStore.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ReplacementStore.swift; sourceTree = ""; }; 26CC8EBA770C5AE5AE9FE5AF /* TranscriptStore.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TranscriptStore.swift; sourceTree = ""; }; 26D5AB3AA91015848E6CC122 /* VoxtralRealtime.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = VoxtralRealtime.app; sourceTree = BUILT_PRODUCTS_DIR; }; + 31283A9DAEC7965B08CB6A55 /* TextPipeline.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TextPipeline.swift; sourceTree = ""; }; + 3DA671728594173ABC914B67 /* ReplacementManagementView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ReplacementManagementView.swift; sourceTree = ""; }; + 408871360ABEA43A93E5BA1E /* VoxtralRealtimeTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = VoxtralRealtimeTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; }; 54933FCE568E7DC5C512B5D8 /* DictationOverlayView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DictationOverlayView.swift; sourceTree = ""; }; + 5521AE4951FF78020CF91D18 /* SessionCompatibilityTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SessionCompatibilityTests.swift; sourceTree = ""; }; + 592F5A369D8A7B187E7AC800 /* Snippet.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Snippet.swift; sourceTree = ""; }; + 62135A876AFC6B0DDCBF306F /* VadService.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = VadService.swift; sourceTree = ""; }; 65D90AECAFB4D759F980E25A /* RunnerBridge.swift */ = {isa = PBXFileReference; lastKnownFileType = 
sourcecode.swift; path = RunnerBridge.swift; sourceTree = ""; }; 68D216F7ED4F81E147F2CFEE /* SidebarView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SidebarView.swift; sourceTree = ""; }; 6CAB3C82C8E7393E06E6EE6E /* HealthCheck.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = HealthCheck.swift; sourceTree = ""; }; 74547233C8BECECCBCAC2E8C /* Preferences.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Preferences.swift; sourceTree = ""; }; 887F4BCA7C5EBE3951E6BB56 /* RunnerError.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = RunnerError.swift; sourceTree = ""; }; + 8CC6853F668DED1C9E00D31A /* WakeState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = WakeState.swift; sourceTree = ""; }; 9069BFAFAFDDBB137F34675E /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; 9539C4044A50A585F04687CA /* TranscriptView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TranscriptView.swift; sourceTree = ""; }; + 96482F9055320E605013E741 /* SnippetStore.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SnippetStore.swift; sourceTree = ""; }; 97AF5A7A49BCC1D9F9404C02 /* DictationPanel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DictationPanel.swift; sourceTree = ""; }; + A16CFDDF3005489D5CEFD58B /* SnippetManagementView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SnippetManagementView.swift; sourceTree = ""; }; A2CB74DFC873A590526E6138 /* ErrorBannerView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ErrorBannerView.swift; sourceTree = ""; }; A7DC59AEFE4416E91D2A001F /* SetupGuideView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = 
SetupGuideView.swift; sourceTree = ""; }; + AB52D50C17A7A729A04F559D /* PersistencePaths.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PersistencePaths.swift; sourceTree = ""; }; + AF2BCF50209E617141188684 /* TextPipelineTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TextPipelineTests.swift; sourceTree = ""; }; B5282DE2DFDC7D992AC7D81D /* AudioLevelView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AudioLevelView.swift; sourceTree = ""; }; B977C4A3EF8EE9770D939E98 /* SettingsView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SettingsView.swift; sourceTree = ""; }; BE9D72CEB451260E46996613 /* RecordingControls.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = RecordingControls.swift; sourceTree = ""; }; @@ -55,13 +89,16 @@ D2245F740610F505B2D2905E /* WelcomeView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = WelcomeView.swift; sourceTree = ""; }; E0D0245890DF77D480572858 /* Session.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Session.swift; sourceTree = ""; }; E4B018A732C7D539547855A3 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; + FF25A732CF8FA1A7A982A8D8 /* ReplacementEntry.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ReplacementEntry.swift; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXGroup section */ 30572FD24E5528E224F7ED10 /* Utilities */ = { isa = PBXGroup; children = ( + AB52D50C17A7A729A04F559D /* PersistencePaths.swift */, 887F4BCA7C5EBE3951E6BB56 /* RunnerError.swift */, + 12715E449A97150F1BE9A5E3 /* SessionExportFormat.swift */, ); path = Utilities; sourceTree = ""; @@ -81,6 +118,15 @@ path = VoxtralRealtime; sourceTree = ""; }; + BB65A4B67667E366A9A7B235 /* 
VoxtralRealtimeTests */ = { + isa = PBXGroup; + children = ( + 5521AE4951FF78020CF91D18 /* SessionCompatibilityTests.swift */, + AF2BCF50209E617141188684 /* TextPipelineTests.swift */, + ); + path = VoxtralRealtimeTests; + sourceTree = ""; + }; C452E644F7EA840AFBDB43A4 /* Views */ = { isa = PBXGroup; children = ( @@ -90,9 +136,11 @@ 97AF5A7A49BCC1D9F9404C02 /* DictationPanel.swift */, A2CB74DFC873A590526E6138 /* ErrorBannerView.swift */, BE9D72CEB451260E46996613 /* RecordingControls.swift */, + 3DA671728594173ABC914B67 /* ReplacementManagementView.swift */, B977C4A3EF8EE9770D939E98 /* SettingsView.swift */, A7DC59AEFE4416E91D2A001F /* SetupGuideView.swift */, 68D216F7ED4F81E147F2CFEE /* SidebarView.swift */, + A16CFDDF3005489D5CEFD58B /* SnippetManagementView.swift */, 9539C4044A50A585F04687CA /* TranscriptView.swift */, D2245F740610F505B2D2905E /* WelcomeView.swift */, ); @@ -103,6 +151,7 @@ isa = PBXGroup; children = ( 26D5AB3AA91015848E6CC122 /* VoxtralRealtime.app */, + 408871360ABEA43A93E5BA1E /* VoxtralRealtimeTests.xctest */, ); name = Products; sourceTree = ""; @@ -113,7 +162,11 @@ D14DCD3E95F36CEACCEF4335 /* AudioEngine.swift */, 085054419A1BAA91DC852C48 /* DictationManager.swift */, 6CAB3C82C8E7393E06E6EE6E /* HealthCheck.swift */, + 23662220D8DC0A9E945AE589 /* ReplacementStore.swift */, 65D90AECAFB4D759F980E25A /* RunnerBridge.swift */, + 96482F9055320E605013E741 /* SnippetStore.swift */, + 31283A9DAEC7965B08CB6A55 /* TextPipeline.swift */, + 62135A876AFC6B0DDCBF306F /* VadService.swift */, ); path = Services; sourceTree = ""; @@ -130,6 +183,7 @@ isa = PBXGroup; children = ( 34E1BE9EC8ACD121D7795352 /* VoxtralRealtime */, + BB65A4B67667E366A9A7B235 /* VoxtralRealtimeTests */, C62DD9BE8818F70C8044D7FF /* Products */, ); sourceTree = ""; @@ -138,8 +192,11 @@ isa = PBXGroup; children = ( 74547233C8BECECCBCAC2E8C /* Preferences.swift */, + FF25A732CF8FA1A7A982A8D8 /* ReplacementEntry.swift */, E0D0245890DF77D480572858 /* Session.swift */, + 
592F5A369D8A7B187E7AC800 /* Snippet.swift */, 26CC8EBA770C5AE5AE9FE5AF /* TranscriptStore.swift */, + 8CC6853F668DED1C9E00D31A /* WakeState.swift */, ); path = Models; sourceTree = ""; @@ -166,6 +223,23 @@ productReference = 26D5AB3AA91015848E6CC122 /* VoxtralRealtime.app */; productType = "com.apple.product-type.application"; }; + C824B0DF381376921DE5A144 /* VoxtralRealtimeTests */ = { + isa = PBXNativeTarget; + buildConfigurationList = 196053E02BAA1A0CC976C1BA /* Build configuration list for PBXNativeTarget "VoxtralRealtimeTests" */; + buildPhases = ( + 2EA864A9F061F9F864C0A80A /* Sources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = VoxtralRealtimeTests; + packageProductDependencies = ( + ); + productName = VoxtralRealtimeTests; + productReference = 408871360ABEA43A93E5BA1E /* VoxtralRealtimeTests.xctest */; + productType = "com.apple.product-type.bundle.unit-test"; + }; /* End PBXNativeTarget section */ /* Begin PBXProject section */ @@ -192,6 +266,7 @@ projectRoot = ""; targets = ( B8C73BD33D08CD15C03EC05D /* VoxtralRealtime */, + C824B0DF381376921DE5A144 /* VoxtralRealtimeTests */, ); }; /* End PBXProject section */ @@ -225,11 +300,28 @@ ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; - shellScript = "set -euo pipefail\n\nET_PATH=\"${EXECUTORCH_PATH:-${HOME}/executorch}\"\nRUNNER_SRC=\"${ET_PATH}/cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner\"\nMODEL_DIR=\"${MODEL_DIR:-${HOME}/voxtral_realtime_quant_metal}\"\nLIBOMP_SRC=\"/opt/homebrew/opt/libomp/lib/libomp.dylib\"\nLIBCXX_SRC=\"/usr/lib/libc++.1.dylib\"\nDEST=\"${BUILT_PRODUCTS_DIR}/${CONTENTS_FOLDER_PATH}/Resources\"\n\nmkdir -p \"${DEST}\"\n\ncopy_if_newer() {\n local src=\"$1\" dst=\"$2\"\n if [ ! -f \"${src}\" ]; then\n echo \"warning: Not found: ${src}\"\n return\n fi\n if [ ! 
-f \"${dst}\" ] || [ \"${src}\" -nt \"${dst}\" ]; then\n cp -fL \"${src}\" \"${dst}\"\n echo \"✓ Bundled $(basename \"${dst}\")\"\n else\n echo \"· $(basename \"${dst}\") up to date\"\n fi\n}\n\n# Runner binary\ncopy_if_newer \"${RUNNER_SRC}\" \"${DEST}/voxtral_realtime_runner\"\nchmod +x \"${DEST}/voxtral_realtime_runner\" 2>/dev/null || true\n\n# libomp (runner dependency)\ncopy_if_newer \"${LIBOMP_SRC}\" \"${DEST}/libomp.dylib\"\n\n# libc++ (required by Metal blob dylib loaded at runtime)\ncopy_if_newer \"${LIBCXX_SRC}\" \"${DEST}/libc++.1.dylib\"\n\n# Patch runner to find libomp via @executable_path (SIP strips DYLD_LIBRARY_PATH)\nif [ -f \"${DEST}/voxtral_realtime_runner\" ] && [ -f \"${DEST}/libomp.dylib\" ]; then\n install_name_tool -change /opt/llvm-openmp/lib/libomp.dylib @executable_path/libomp.dylib \"${DEST}/voxtral_realtime_runner\" 2>/dev/null || true\n echo \"✓ Patched runner rpath for libomp\"\nfi\n\n# Ensure @rpath can resolve libc++ for runtime-generated Metal dylibs\nif [ -f \"${DEST}/voxtral_realtime_runner\" ]; then\n install_name_tool -add_rpath @executable_path \"${DEST}/voxtral_realtime_runner\" 2>/dev/null || true\n install_name_tool -add_rpath /usr/lib \"${DEST}/voxtral_realtime_runner\" 2>/dev/null || true\n echo \"✓ Added rpaths for libc++\"\nfi\n\n# Model artifacts\ncopy_if_newer \"${MODEL_DIR}/model-metal-int4.pte\" \"${DEST}/model-metal-int4.pte\"\ncopy_if_newer \"${MODEL_DIR}/preprocessor.pte\" \"${DEST}/preprocessor.pte\"\ncopy_if_newer \"${MODEL_DIR}/tekken.json\" \"${DEST}/tekken.json\"\n"; + shellScript = "set -euo 
set -euo pipefail

# Xcode run-script build phase: copies the runner binaries, the dylibs they
# load, and the model artifacts into the app bundle's Resources directory,
# then patches the main runner's load commands so the dylibs resolve from
# next to the executable.
#
# Environment:
#   EXECUTORCH_PATH       optional, ExecuTorch checkout (default: $HOME/executorch)
#   MODEL_DIR             optional, model directory
#                         (default: $HOME/voxtral_realtime_quant_metal)
#   BUILT_PRODUCTS_DIR    required, provided by Xcode
#   CONTENTS_FOLDER_PATH  required, provided by Xcode

ET_PATH="${EXECUTORCH_PATH:-${HOME}/executorch}"
RUNNER_SRC="${ET_PATH}/cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner"
VAD_RUNNER_SRC="${ET_PATH}/cmake-out/examples/models/silero_vad/silero_vad_stream_runner"
MODEL_DIR="${MODEL_DIR:-${HOME}/voxtral_realtime_quant_metal}"
LIBOMP_SRC="/opt/homebrew/opt/libomp/lib/libomp.dylib"
LIBCXX_SRC="/usr/lib/libc++.1.dylib"
DEST="${BUILT_PRODUCTS_DIR}/${CONTENTS_FOLDER_PATH}/Resources"
RUNNER_DST="${DEST}/voxtral_realtime_runner"
VAD_RUNNER_DST="${DEST}/silero_vad_stream_runner"

mkdir -p "${DEST}"

# Copies $1 to $2 when $2 is missing or older than $1.
# A missing source is reported as an Xcode-style warning, not an error, so
# optional artifacts do not fail the build.
copy_if_newer() {
  local src="$1" dst="$2"
  if [ ! -f "${src}" ]; then
    echo "warning: Not found: ${src}"
    return
  fi
  if [ ! -f "${dst}" ] || [ "${src}" -nt "${dst}" ]; then
    cp -fL "${src}" "${dst}"
    echo "✓ Bundled $(basename "${dst}")"
  else
    echo "· $(basename "${dst}") up to date"
  fi
}

# Succeeds when Mach-O file $1 already carries an LC_RPATH entry equal to $2.
has_rpath() {
  local binary="$1" rpath="$2"
  otool -l "${binary}" 2>/dev/null | awk -v want="${rpath}" '
    $1 == "cmd" { in_rpath = ($2 == "LC_RPATH") }
    in_rpath && $1 == "path" && $2 == want { found = 1 }
    END { exit !found }
  '
}

# Adds rpath $2 to binary $1 unless it is already present. Incremental builds
# re-run this phase on an already-patched runner, where a blind -add_rpath
# would fail on the duplicate entry.
add_rpath_once() {
  local binary="$1" rpath="$2"
  if has_rpath "${binary}" "${rpath}"; then
    return 0
  fi
  install_name_tool -add_rpath "${rpath}" "${binary}" 2>/dev/null
}

# Runner binary
copy_if_newer "${RUNNER_SRC}" "${RUNNER_DST}"
if [ -f "${RUNNER_DST}" ]; then
  chmod +x "${RUNNER_DST}" 2>/dev/null || true
fi

# libomp (runner dependency)
copy_if_newer "${LIBOMP_SRC}" "${DEST}/libomp.dylib"

# libc++ (required by Metal blob dylib loaded at runtime)
# NOTE(review): on recent macOS releases system dylibs may exist only in the
# dyld shared cache; if so this copy is skipped with a warning and the
# /usr/lib rpath below is what resolves libc++. Confirm on the target OS.
copy_if_newer "${LIBCXX_SRC}" "${DEST}/libc++.1.dylib"

# Patch runner to find libomp via @executable_path (SIP strips DYLD_LIBRARY_PATH)
if [ -f "${RUNNER_DST}" ] && [ -f "${DEST}/libomp.dylib" ]; then
  # Report success only when the tool actually succeeded.
  if install_name_tool -change /opt/llvm-openmp/lib/libomp.dylib \
    @executable_path/libomp.dylib "${RUNNER_DST}" 2>/dev/null; then
    echo "✓ Patched runner rpath for libomp"
  else
    echo "warning: Could not patch libomp install name in $(basename "${RUNNER_DST}")"
  fi
fi

# Ensure @rpath can resolve libc++ for runtime-generated Metal dylibs
if [ -f "${RUNNER_DST}" ]; then
  rpaths_ok=1
  for rpath in @executable_path /usr/lib; do
    if ! add_rpath_once "${RUNNER_DST}" "${rpath}"; then
      echo "warning: Could not add rpath ${rpath} to $(basename "${RUNNER_DST}")"
      rpaths_ok=0
    fi
  done
  if [ "${rpaths_ok}" -eq 1 ]; then
    echo "✓ Added rpaths for libc++"
  fi
fi

# Model artifacts
copy_if_newer "${MODEL_DIR}/model-metal-int4.pte" "${DEST}/model-metal-int4.pte"
copy_if_newer "${MODEL_DIR}/preprocessor.pte" "${DEST}/preprocessor.pte"
copy_if_newer "${MODEL_DIR}/tekken.json" "${DEST}/tekken.json"
copy_if_newer "${MODEL_DIR}/silero_vad.pte" "${DEST}/silero_vad.pte"

# Optional Silero VAD helper
copy_if_newer "${VAD_RUNNER_SRC}" "${VAD_RUNNER_DST}"
if [ -f "${VAD_RUNNER_DST}" ]; then
  chmod +x "${VAD_RUNNER_DST}" 2>/dev/null || true
fi
ReplacementManagementView.swift in Sources */, + 92C09998C9463D1BD392F8C0 /* ReplacementStore.swift in Sources */, 54B04C6AFEC0BEC16D5A427B /* RunnerBridge.swift in Sources */, BA4D276D05BDEC9EE49C8979 /* RunnerError.swift in Sources */, 4E3EFCC72CC492D94CC9494E /* Session.swift in Sources */, + CD94243BBE7525C8BBA1EA4B /* SessionExportFormat.swift in Sources */, 437CDA8161D03BC53BF5ACAA /* SettingsView.swift in Sources */, A23029B5977019EBC0CC217F /* SetupGuideView.swift in Sources */, 50B57C3652FDC3D30EE08D50 /* SidebarView.swift in Sources */, + 980B7949F856A2C50A65C1AD /* Snippet.swift in Sources */, + D6C1940C018A1B7A21007749 /* SnippetManagementView.swift in Sources */, + BFD3E022202BBB6E261C5530 /* SnippetStore.swift in Sources */, + EA47EF0DBDDEB7A6B19125BB /* TextPipeline.swift in Sources */, BDC0CBA04D99E79D5EBA5DC6 /* TranscriptStore.swift in Sources */, 83C2211EB3926FCE3AE55BA8 /* TranscriptView.swift in Sources */, + B5A4C34832F186322489706E /* VadService.swift in Sources */, B4086DFD7477802852F31D0E /* VoxtralRealtimeApp.swift in Sources */, + EE2CF6B261CC691736E9BBDE /* WakeState.swift in Sources */, 2C0D9096EB59C859E877427A /* WelcomeView.swift in Sources */, ); runOnlyForDeploymentPostprocessing = 0; @@ -300,6 +403,22 @@ }; name = Release; }; + 771FA134122860FF59C3CD77 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + BUNDLE_LOADER = "$(TEST_HOST)"; + COMBINE_HIDPI_IMAGES = YES; + GENERATE_INFOPLIST_FILE = YES; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/../Frameworks", + "@loader_path/../Frameworks", + ); + PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.VoxtralRealtimeTests; + SDKROOT = macosx; + }; + name = Debug; + }; 824D2A17D67FC8808FAC2F63 /* Release */ = { isa = XCBuildConfiguration; buildSettings = { @@ -357,6 +476,22 @@ }; name = Release; }; + 8ED466D22BF982493F9BE040 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + BUNDLE_LOADER = "$(TEST_HOST)"; + COMBINE_HIDPI_IMAGES = 
YES; + GENERATE_INFOPLIST_FILE = YES; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/../Frameworks", + "@loader_path/../Frameworks", + ); + PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.VoxtralRealtimeTests; + SDKROOT = macosx; + }; + name = Release; + }; B51BF99527053D861C58E2BE /* Debug */ = { isa = XCBuildConfiguration; buildSettings = { @@ -424,6 +559,15 @@ /* End XCBuildConfiguration section */ /* Begin XCConfigurationList section */ + 196053E02BAA1A0CC976C1BA /* Build configuration list for PBXNativeTarget "VoxtralRealtimeTests" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 771FA134122860FF59C3CD77 /* Debug */, + 8ED466D22BF982493F9BE040 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Debug; + }; 6C9DE1DB881A58FAD15232B9 /* Build configuration list for PBXNativeTarget "VoxtralRealtime" */ = { isa = XCConfigurationList; buildConfigurations = ( diff --git a/voxtral_realtime/macos/VoxtralRealtime.xcodeproj/xcshareddata/xcschemes/VoxtralRealtime.xcscheme b/voxtral_realtime/macos/VoxtralRealtime.xcodeproj/xcshareddata/xcschemes/VoxtralRealtime.xcscheme new file mode 100644 index 000000000..d0426854a --- /dev/null +++ b/voxtral_realtime/macos/VoxtralRealtime.xcodeproj/xcshareddata/xcschemes/VoxtralRealtime.xcscheme @@ -0,0 +1,113 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/voxtral_realtime/macos/VoxtralRealtime/Models/Preferences.swift b/voxtral_realtime/macos/VoxtralRealtime/Models/Preferences.swift index 0e55c92c7..ccb80f8c4 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Models/Preferences.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Models/Preferences.swift @@ -8,6 +8,10 @@ import Foundation +enum StyleProfile: String, CaseIterable, Codable { + case none +} + @MainActor @Observable final class Preferences { var runnerPath: String { @@ -30,9 +34,48 @@ final class Preferences { didSet { 
UserDefaults.standard.set(silenceTimeout, forKey: "silenceTimeout") } } + var styleProfile: StyleProfile { + didSet { UserDefaults.standard.set(styleProfile.rawValue, forKey: "styleProfile") } + } + + var enableSileroVAD: Bool { + didSet { UserDefaults.standard.set(enableSileroVAD, forKey: "enableSileroVAD") } + } + + var vadRunnerPath: String { + didSet { UserDefaults.standard.set(vadRunnerPath, forKey: "vadRunnerPath") } + } + + var vadModelPath: String { + didSet { UserDefaults.standard.set(vadModelPath, forKey: "vadModelPath") } + } + + var vadThreshold: Double { + didSet { UserDefaults.standard.set(vadThreshold, forKey: "vadThreshold") } + } + + var vadHangoverMilliseconds: Double { + didSet { UserDefaults.standard.set(vadHangoverMilliseconds, forKey: "vadHangoverMilliseconds") } + } + + var enableWakePhrase: Bool { + didSet { UserDefaults.standard.set(enableWakePhrase, forKey: "enableWakePhrase") } + } + + var wakeKeyword: String { + didSet { UserDefaults.standard.set(wakeKeyword, forKey: "wakeKeyword") } + } + + var wakePhrase: String { "hey \(wakeKeyword)" } + + var wakeCheckSeconds: Double { + didSet { UserDefaults.standard.set(wakeCheckSeconds, forKey: "wakeCheckSeconds") } + } + var modelPath: String { "\(modelDirectory)/model-metal-int4.pte" } var tokenizerPath: String { "\(modelDirectory)/tekken.json" } var preprocessorPath: String { "\(modelDirectory)/preprocessor.pte" } + var defaultVadModelPath: String { "\(modelDirectory)/silero_vad.pte" } var usingBundledResources: Bool { runnerPath.hasPrefix(Bundle.main.bundlePath) @@ -47,6 +90,8 @@ final class Preferences { let bundledRunner = "\(bundleResources)/voxtral_realtime_runner" let bundledModel = "\(bundleResources)/model-metal-int4.pte" let buildRunner = "\(home)/executorch/cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner" + let bundledVadRunner = "\(bundleResources)/silero_vad_stream_runner" + let buildVadRunner = 
"\(home)/executorch/cmake-out/examples/models/silero_vad/silero_vad_stream_runner" let hasBundledRunner = FileManager.default.isExecutableFile(atPath: bundledRunner) let hasBundledModel = FileManager.default.fileExists(atPath: bundledModel) @@ -67,5 +112,38 @@ final class Preferences { self.audioDeviceID = defaults.string(forKey: "audioDeviceID") self.silenceThreshold = defaults.object(forKey: "silenceThreshold") as? Double ?? 0.02 self.silenceTimeout = defaults.object(forKey: "silenceTimeout") as? Double ?? 2.0 + self.styleProfile = StyleProfile(rawValue: defaults.string(forKey: "styleProfile") ?? "") ?? .none + self.enableSileroVAD = defaults.object(forKey: "enableSileroVAD") as? Bool ?? true + self.vadRunnerPath = defaults.string(forKey: "vadRunnerPath") ?? buildVadRunner + let storedVadModelPath = defaults.string(forKey: "vadModelPath") + let bundledVadModelPath = "\(bundleResources)/silero_vad.pte" + var resolvedVadModelPath = storedVadModelPath ?? bundledVadModelPath + if resolvedVadModelPath.isEmpty { + resolvedVadModelPath = "\(home)/voxtral_realtime_quant_metal/silero_vad.pte" + } + self.vadThreshold = defaults.object(forKey: "vadThreshold") as? Double ?? 0.55 + self.vadHangoverMilliseconds = defaults.object(forKey: "vadHangoverMilliseconds") as? Double ?? 320 + self.enableWakePhrase = defaults.object(forKey: "enableWakePhrase") as? Bool ?? true + self.wakeKeyword = defaults.string(forKey: "wakeKeyword") ?? defaults.string(forKey: "wakePhrase").flatMap { + let words = $0.lowercased().split(separator: " ") + return words.count > 1 ? words.dropFirst().joined(separator: " ") : words.first.map(String.init) + } ?? "torch" + self.wakeCheckSeconds = defaults.object(forKey: "wakeCheckSeconds") as? Double ?? 
2.0 + + if !FileManager.default.fileExists(atPath: resolvedVadModelPath) { + let probePaths = [ + storedVadModelPath, + "\(home)/silero_vad_xnnpack/silero_vad.pte", + "\(home)/voxtral_realtime_quant_metal/silero_vad.pte", + ] + resolvedVadModelPath = probePaths + .compactMap { $0 } + .first(where: { FileManager.default.fileExists(atPath: $0) }) + ?? "\(home)/silero_vad_xnnpack/silero_vad.pte" + } + self.vadModelPath = resolvedVadModelPath + if FileManager.default.isExecutableFile(atPath: bundledVadRunner) { + self.vadRunnerPath = bundledVadRunner + } } } diff --git a/voxtral_realtime/macos/VoxtralRealtime/Models/ReplacementEntry.swift b/voxtral_realtime/macos/VoxtralRealtime/Models/ReplacementEntry.swift new file mode 100644 index 000000000..545c26a7e --- /dev/null +++ b/voxtral_realtime/macos/VoxtralRealtime/Models/ReplacementEntry.swift @@ -0,0 +1,37 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
import Foundation

/// A single replacement rule: a `trigger` string paired with its
/// `replacement` text, plus flags describing how it should be matched.
///
/// This type only stores the rule. How `isCaseSensitive` and
/// `requiresWordBoundary` are applied is up to whatever consumes it.
struct ReplacementEntry: Identifiable, Codable, Sendable, Hashable {
    /// Stable identity of the rule.
    let id: UUID
    /// Text on the match side of the rule.
    var trigger: String
    /// Text that stands in for `trigger`.
    var replacement: String
    /// Whether the rule is switched on.
    var isEnabled: Bool
    /// Case-sensitivity flag for matching.
    var isCaseSensitive: Bool
    /// Word-boundary flag for matching.
    var requiresWordBoundary: Bool
    /// Free-form user notes.
    var notes: String

    /// Creates a rule. Every argument is optional; the defaults give an
    /// enabled, case-insensitive, word-bounded rule with empty text fields
    /// and a freshly generated identifier.
    init(
        id: UUID = UUID(),
        trigger: String = "",
        replacement: String = "",
        isEnabled: Bool = true,
        isCaseSensitive: Bool = false,
        requiresWordBoundary: Bool = true,
        notes: String = ""
    ) {
        // Identity
        self.id = id

        // Text payload
        self.trigger = trigger
        self.replacement = replacement
        self.notes = notes

        // Matching flags
        self.isEnabled = isEnabled
        self.isCaseSensitive = isCaseSensitive
        self.requiresWordBoundary = requiresWordBoundary
    }
}
= nil, + tags: [String] = [], + wakeTriggered: Bool = false, + pinned: Bool = false, + usedSnippetIDs: [UUID] = [] ) { self.id = id self.date = date self.title = title self.transcript = transcript self.duration = duration + self.source = source + self.rawTranscript = rawTranscript + self.tags = tags + self.wakeTriggered = wakeTriggered + self.pinned = pinned + self.usedSnippetIDs = usedSnippetIDs + } + + enum CodingKeys: String, CodingKey { + case id + case date + case title + case transcript + case duration + case source + case rawTranscript + case tags + case wakeTriggered + case pinned + case usedSnippetIDs + } + + init(from decoder: Decoder) throws { + let container = try decoder.container(keyedBy: CodingKeys.self) + id = try container.decodeIfPresent(UUID.self, forKey: .id) ?? UUID() + date = try container.decodeIfPresent(Date.self, forKey: .date) ?? .now + title = try container.decodeIfPresent(String.self, forKey: .title) ?? "" + transcript = try container.decodeIfPresent(String.self, forKey: .transcript) ?? "" + duration = try container.decodeIfPresent(TimeInterval.self, forKey: .duration) ?? 0 + source = try container.decodeIfPresent(SessionSource.self, forKey: .source) ?? .transcription + rawTranscript = try container.decodeIfPresent(String.self, forKey: .rawTranscript) + tags = try container.decodeIfPresent([String].self, forKey: .tags) ?? [] + wakeTriggered = try container.decodeIfPresent(Bool.self, forKey: .wakeTriggered) ?? false + pinned = try container.decodeIfPresent(Bool.self, forKey: .pinned) ?? false + usedSnippetIDs = try container.decodeIfPresent([UUID].self, forKey: .usedSnippetIDs) ?? [] } var displayTitle: String { @@ -36,4 +88,8 @@ struct Session: Identifiable, Codable, Sendable, Hashable { formatter.timeStyle = .short return formatter.string(from: date) } + + var previewText: String { + transcript.isEmpty ? rawTranscript ?? 
import Foundation

/// A named block of reusable text together with the trigger string
/// associated with it.
///
/// This type only stores the snippet. How `trigger` is recognised is up to
/// whatever consumes it.
struct Snippet: Identifiable, Codable, Sendable, Hashable {
    /// Stable identity of the snippet.
    let id: UUID
    /// Display name.
    var name: String
    /// Trigger text associated with the snippet.
    var trigger: String
    /// The snippet body.
    var content: String
    /// Whether the snippet is switched on.
    var isEnabled: Bool
    /// Free-form user notes.
    var notes: String
    /// Timestamp of the most recent use, or `nil` if none was recorded.
    var lastUsedAt: Date?

    /// Creates a snippet. Every argument is optional; the defaults give an
    /// enabled snippet with empty text fields, no recorded use and a freshly
    /// generated identifier.
    init(
        id: UUID = UUID(),
        name: String = "",
        trigger: String = "",
        content: String = "",
        isEnabled: Bool = true,
        notes: String = "",
        lastUsedAt: Date? = nil
    ) {
        // Identity
        self.id = id

        // Text payload
        self.name = name
        self.trigger = trigger
        self.content = content
        self.notes = notes

        // State
        self.isEnabled = isEnabled
        self.lastUsedAt = lastUsedAt
    }
}
*/ +import AppKit import Foundation @MainActor @Observable @@ -35,20 +36,29 @@ final class TranscriptStore { var dictationText = "" var isDictating = false + var wakeState: WakeState = .disabled var hasActiveSession: Bool { sessionState != .idle } var isTranscribing: Bool { sessionState == .transcribing } var isPaused: Bool { sessionState == .paused } var isLoading: Bool { sessionState == .loading } var isModelReady: Bool { modelState == .ready } + var recentDictationSessions: [Session] { + sessions.filter { $0.source == .dictation }.prefix(5).map { $0 } + } private let runner = RunnerBridge() private let preferences: Preferences + private let textPipeline: TextPipeline private var startDate: Date? private var streamTask: Task? - init(preferences: Preferences) { + init( + preferences: Preferences, + textPipeline: TextPipeline + ) { self.preferences = preferences + self.textPipeline = textPipeline loadSessions() } @@ -139,10 +149,15 @@ final class TranscriptStore { let duration = startDate.map { Date.now.timeIntervalSince($0) } ?? 0 if !liveTranscript.isEmpty { + let processed = textPipeline.process(liveTranscript, context: .sessionSave) let session = Session( date: startDate ?? .now, - transcript: liveTranscript, - duration: duration + transcript: processed.outputText, + duration: duration, + source: .transcription, + rawTranscript: processed.rawText == processed.outputText ? 
nil : processed.rawText, + tags: processed.tags, + usedSnippetIDs: processed.usedSnippetIDs ) sessions.insert(session, at: 0) selectedSessionID = session.id @@ -247,7 +262,7 @@ final class TranscriptStore { // MARK: - Dictation - func startDictation() async { + func startDictation(initialSamples: [Float] = []) async { guard !isDictating else { return } let micOK = await checkMicPermissionLive() @@ -266,6 +281,7 @@ final class TranscriptStore { audioLevel = 0 do { + try await runner.primeAudioSamples(initialSamples) try await runner.startAudioCapture() } catch { isDictating = false @@ -298,6 +314,79 @@ final class TranscriptStore { saveSessions() } + func togglePinned(_ session: Session) { + guard let index = sessions.firstIndex(where: { $0.id == session.id }) else { return } + sessions[index].pinned.toggle() + saveSessions() + } + + func saveDictationSession( + result: TextProcessingResult, + duration: TimeInterval, + wakeTriggered: Bool + ) { + guard !result.outputText.isEmpty else { return } + let session = Session( + date: .now, + transcript: result.outputText, + duration: duration, + source: .dictation, + rawTranscript: result.rawText == result.outputText ? 
nil : result.rawText, + tags: result.tags, + wakeTriggered: wakeTriggered, + usedSnippetIDs: result.usedSnippetIDs + ) + sessions.insert(session, at: 0) + selectedSessionID = session.id + saveSessions() + } + + func processDictationText(_ rawText: String) -> TextProcessingResult { + textPipeline.process(rawText, context: .dictation) + } + + func normalizeWakePhrase(_ text: String) -> String { + textPipeline.normalizeForWakePhrase(text) + } + + func stripLeadingWakePhrase(_ wakePhrase: String) { + guard !dictationText.isEmpty, !wakePhrase.isEmpty else { return } + + let words = wakePhrase.lowercased() + .components(separatedBy: .whitespacesAndNewlines) + .filter { !$0.isEmpty } + + var farthestEnd = dictationText.startIndex + let searchLimit = dictationText.index( + dictationText.startIndex, + offsetBy: min(dictationText.count, max(wakePhrase.count + 20, 30)) + ) + let leadingSlice = String(dictationText[.. farthestEnd { + farthestEnd = range.upperBound + } + } + } + + guard farthestEnd > dictationText.startIndex else { return } + + let remainder = String(dictationText[farthestEnd...]) + .trimmingCharacters(in: CharacterSet(charactersIn: " ,.:;-").union(.whitespacesAndNewlines)) + dictationText = remainder + } + + func exportSession(_ session: Session, format: SessionExportFormat) { + let panel = NSSavePanel() + panel.canCreateDirectories = true + panel.nameFieldStringValue = suggestedExportName(for: session, format: format) + panel.allowedContentTypes = [format.contentType] + guard panel.runModal() == .OK, let url = panel.url else { return } + try? format.render(session).write(to: url, atomically: true, encoding: .utf8) + } + func clearError() { currentError = nil } @@ -333,22 +422,21 @@ final class TranscriptStore { // MARK: - Persistence - private var sessionsURL: URL { - let appSupport = FileManager.default.urls(for: .applicationSupportDirectory, in: .userDomainMask).first! 
- let dir = appSupport.appendingPathComponent("VoxtralRealtime", isDirectory: true) - try? FileManager.default.createDirectory(at: dir, withIntermediateDirectories: true) - return dir.appendingPathComponent("sessions.json") - } - private func saveSessions() { guard let data = try? JSONEncoder().encode(sessions) else { return } - try? data.write(to: sessionsURL, options: .atomic) + try? data.write(to: PersistencePaths.sessionsURL, options: .atomic) } private func loadSessions() { - guard let data = try? Data(contentsOf: sessionsURL), + guard let data = try? Data(contentsOf: PersistencePaths.sessionsURL), let decoded = try? JSONDecoder().decode([Session].self, from: data) else { return } sessions = decoded } + + private func suggestedExportName(for session: Session, format: SessionExportFormat) -> String { + let formatter = ISO8601DateFormatter() + let stamp = formatter.string(from: session.date).replacingOccurrences(of: ":", with: "-") + return "voxtral-\(session.source.rawValue)-\(stamp).\(format.fileExtension)" + } } diff --git a/voxtral_realtime/macos/VoxtralRealtime/Models/WakeState.swift b/voxtral_realtime/macos/VoxtralRealtime/Models/WakeState.swift new file mode 100644 index 000000000..93012e06f --- /dev/null +++ b/voxtral_realtime/macos/VoxtralRealtime/Models/WakeState.swift @@ -0,0 +1,17 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
import Foundation

/// Wake-listening phases. Raw values are the case names.
enum WakeState: String, Sendable, Equatable {
    case disabled, listening, speechDetected, checkingPhrase, active
}
lastVoiceTime = .now + dictationStartedAt = .now + wakeTriggeredForCurrentSession = false + store.wakeState = .active showPanel() await store.startDictation() @@ -149,12 +163,18 @@ final class DictationManager { silenceTimer?.cancel() silenceTimer = nil - let text = await store.stopDictation() + let rawText = await store.stopDictation() state = .idle + let duration = dictationStartedAt.map { Date.now.timeIntervalSince($0) } ?? 0 + dictationStartedAt = nil dismissPanel() - log.info("Dictation stopped, text length: \(text.count)") + log.info("Dictation stopped, text length: \(rawText.count)") + + guard !rawText.isEmpty else { return } + let result = store.processDictationText(rawText) + let text = result.outputText guard !text.isEmpty else { return } NSPasteboard.general.clearContents() @@ -173,28 +193,48 @@ final class DictationManager { log.warning("Accessibility permission lost — text is on clipboard, prompting user to re-grant") _ = Self.checkAccessibility(prompt: true) } + + store.saveDictationSession( + result: result, + duration: duration, + wakeTriggered: wakeTriggeredForCurrentSession + ) + store.wakeState = preferences.enableSileroVAD ? .listening : .disabled + await startWakeListeningIfNeeded() } // MARK: - Silence Detection + private var consecutiveSilencePolls = 0 + private func startSilenceMonitor() { silenceTimer?.cancel() + consecutiveSilencePolls = 0 silenceTimer = Task { @MainActor [weak self] in + let pollIntervalMs = 250 while !Task.isCancelled { - try? await Task.sleep(for: .milliseconds(250)) + try? 
await Task.sleep(for: .milliseconds(pollIntervalMs)) guard let self, self.state == .listening else { break } let level = self.store.audioLevel if level > Float(self.preferences.silenceThreshold) { self.lastVoiceTime = .now + self.consecutiveSilencePolls = 0 + } else { + self.consecutiveSilencePolls += 1 + } + + if self.store.wakeState == .checkingPhrase { + continue } - let silenceDuration = Date.now.timeIntervalSince(self.lastVoiceTime) let hasText = !self.store.dictationText.isEmpty + let requiredPolls = max(1, Int(self.preferences.silenceTimeout * 1000) / pollIntervalMs) - if hasText && silenceDuration >= self.preferences.silenceTimeout { - log.info("Auto-stop: \(String(format: "%.1f", silenceDuration))s silence (level: \(String(format: "%.4f", level)))") + if hasText && self.consecutiveSilencePolls >= requiredPolls { + let silenceDuration = Date.now.timeIntervalSince(self.lastVoiceTime) + log.info("Auto-stop: \(String(format: "%.1f", silenceDuration))s silence (\(self.consecutiveSilencePolls) consecutive polls, level: \(String(format: "%.4f", level)))") await self.stopAndPaste() break } @@ -202,6 +242,124 @@ final class DictationManager { } } + private func startWakeListeningIfNeeded() async { + guard preferences.enableSileroVAD, state == .idle, !store.isDictating else { + store.wakeState = preferences.enableSileroVAD ? 
.listening : .disabled + return + } + + guard await HealthCheck.liveMicPermission() == .authorized else { + store.wakeState = .disabled + return + } + + if !FileManager.default.isExecutableFile(atPath: preferences.vadRunnerPath) || + !FileManager.default.fileExists(atPath: preferences.vadModelPath) { + store.wakeState = .disabled + return + } + + store.wakeState = .listening + + do { + try await vadService.start( + runnerPath: preferences.vadRunnerPath, + modelPath: preferences.vadModelPath, + threshold: Float(preferences.vadThreshold), + hangoverMs: Int(preferences.vadHangoverMilliseconds) + ) { [weak self] event in + guard let self else { return } + Task { @MainActor in + await self.handleVadEvent(event) + } + } + } catch { + log.error("Failed to start VAD: \(error.localizedDescription)") + store.wakeState = .disabled + } + } + + private func handleVadEvent(_ event: VadService.Event) async { + switch event { + case .ready: + store.wakeState = .listening + case let .speechDetected(preRollSamples): + guard state == .idle else { return } + await beginWakePhraseCheck(preRollSamples: preRollSamples) + case .silenceDetected: + log.warning("VAD detected microphone silence — stopping wake listening") + await vadService.stop() + store.wakeState = .disabled + store.currentError = .microphoneSilent + case .stopped: + if state == .idle && preferences.enableSileroVAD { + store.wakeState = .listening + } + case let .error(message): + log.error("VAD error: \(message)") + store.wakeState = .disabled + } + } + + private func beginWakePhraseCheck(preRollSamples: [Float]) async { + await vadService.stop() + wakeCheckTask?.cancel() + + targetApp = NSWorkspace.shared.frontmostApplication + wakeTriggeredForCurrentSession = false + dictationStartedAt = .now + state = .listening + store.wakeState = .checkingPhrase + + await store.startDictation(initialSamples: preRollSamples) + + if !preferences.enableWakePhrase { + store.wakeState = .active + showPanel() + startSilenceMonitor() + 
return + } + + let requiredPhrase = store.normalizeWakePhrase(preferences.wakePhrase) + let keywords = Self.wakeKeywords(from: requiredPhrase) + let checkDurationNs = UInt64(preferences.wakeCheckSeconds * 1_000_000_000) + let deadline = DispatchTime.now().uptimeNanoseconds + checkDurationNs + + wakeCheckTask = Task { @MainActor [weak self] in + guard let self else { return } + while !Task.isCancelled && self.state == .listening { + let normalized = self.store.normalizeWakePhrase(self.store.dictationText) + if !keywords.isEmpty && keywords.allSatisfy({ normalized.contains($0) }) { + self.wakeTriggeredForCurrentSession = true + self.store.stripLeadingWakePhrase(self.preferences.wakePhrase) + self.store.wakeState = .active + self.showPanel() + self.startSilenceMonitor() + return + } + if DispatchTime.now().uptimeNanoseconds >= deadline { + log.info("Wake phrase not matched within \(self.preferences.wakeCheckSeconds)s — returning to idle") + _ = await self.store.stopDictation() + self.state = .idle + self.store.wakeState = .listening + await self.startWakeListeningIfNeeded() + return + } + try? await Task.sleep(for: .milliseconds(100)) + } + } + } + + private static let fillerWords: Set = [ + "hey", "hi", "hello", "ok", "okay", "yo", "oh", "ah", "um", "uh", + ] + + private static func wakeKeywords(from normalizedPhrase: String) -> [String] { + let words = normalizedPhrase.split(separator: " ").map(String.init) + let significant = words.filter { !fillerWords.contains($0) } + return significant.isEmpty ? words : significant + } + // MARK: - Panel private func showPanel() { diff --git a/voxtral_realtime/macos/VoxtralRealtime/Services/ReplacementStore.swift b/voxtral_realtime/macos/VoxtralRealtime/Services/ReplacementStore.swift new file mode 100644 index 000000000..2aad85fe5 --- /dev/null +++ b/voxtral_realtime/macos/VoxtralRealtime/Services/ReplacementStore.swift @@ -0,0 +1,77 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
import Foundation

/// Observable, main-actor-bound store for `ReplacementEntry` values,
/// persisted as a JSON array at `fileURL`.
///
/// Every mutating call writes the whole list back to disk. On first load the
/// store also migrates a legacy `dictionary.json` sitting next to `fileURL`,
/// and otherwise seeds a small set of default entries.
@MainActor @Observable
final class ReplacementStore {
    /// Current entries, newest first (`add` inserts at index 0).
    var entries: [ReplacementEntry] = []

    private let fileURL: URL

    /// - Parameter fileURL: Location of the JSON file backing this store.
    init(fileURL: URL = PersistencePaths.replacementsURL) {
        self.fileURL = fileURL
        load()
    }

    /// Inserts `entry` at the front of the list and persists.
    func add(_ entry: ReplacementEntry) {
        entries.insert(entry, at: 0)
        save()
    }

    /// Replaces the stored entry with the same `id`; no-op when it is unknown.
    func update(_ entry: ReplacementEntry) {
        guard let index = entries.firstIndex(where: { $0.id == entry.id }) else { return }
        entries[index] = entry
        save()
    }

    /// Removes every entry whose `id` matches `entry.id` and persists.
    func delete(_ entry: ReplacementEntry) {
        entries.removeAll { $0.id == entry.id }
        save()
    }

    /// Flips `isEnabled` on the entry with the given `id`; no-op when it is unknown.
    func toggleEnabled(for id: UUID) {
        guard let index = entries.firstIndex(where: { $0.id == id }) else { return }
        entries[index].isEnabled.toggle()
        save()
    }

    /// Populates `entries` from, in order of preference: the store file, the
    /// legacy `dictionary.json` next to it, or the built-in defaults.
    private func load() {
        let legacyURL = fileURL.deletingLastPathComponent().appendingPathComponent("dictionary.json")

        if let decoded = Self.readEntries(at: fileURL) {
            entries = decoded
            return
        }

        if let decoded = Self.readEntries(at: legacyURL) {
            entries = decoded
            // Drop the legacy file only once the migrated copy is safely on
            // disk; otherwise a failed write would lose the user's entries.
            if save() {
                try? FileManager.default.removeItem(at: legacyURL)
            }
            return
        }

        entries = Self.defaultEntries
        // Seed the file only when none exists. An existing file that could not
        // be decoded is left in place for inspection rather than overwritten
        // during load.
        if !entries.isEmpty && !FileManager.default.fileExists(atPath: fileURL.path) {
            save()
        }
    }

    /// Reads and decodes an entry list from `url`.
    /// - Returns: The decoded entries, or `nil` when the file is missing or
    ///   cannot be read or decoded. The latter case is logged.
    private static func readEntries(at url: URL) -> [ReplacementEntry]? {
        guard FileManager.default.fileExists(atPath: url.path) else { return nil }
        do {
            let data = try Data(contentsOf: url)
            return try JSONDecoder().decode([ReplacementEntry].self, from: data)
        } catch {
            NSLog("ReplacementStore: could not read %@: %@", url.path, error.localizedDescription)
            return nil
        }
    }

    /// Writes `entries` to `fileURL` atomically, creating the parent
    /// directory if needed.
    /// - Returns: `true` on success. Failures are logged, never thrown.
    @discardableResult
    private func save() -> Bool {
        do {
            let data = try JSONEncoder().encode(entries)
            try FileManager.default.createDirectory(
                at: fileURL.deletingLastPathComponent(),
                withIntermediateDirectories: true
            )
            try data.write(to: fileURL, options: .atomic)
            return true
        } catch {
            NSLog("ReplacementStore: could not save %@: %@", fileURL.path, error.localizedDescription)
            return false
        }
    }

    /// Entries used when neither a store file nor a legacy file yields data.
    private static let defaultEntries: [ReplacementEntry] = [
        ReplacementEntry(trigger: "mtia", replacement: "MTIA"),
        ReplacementEntry(trigger: "mvai", replacement: "MVAI"),
        ReplacementEntry(trigger: "executorch", replacement: "ExecuTorch", requiresWordBoundary: false),
    ]
}
+ */
+
+import Foundation
+
+/// Observable, main-actor-bound store for the user's snippets.
+///
+/// The list is read from `fileURL` when the store is created and written back
+/// to the same file after every mutation.
+@MainActor @Observable
+final class SnippetStore {
+    /// All known snippets. `add(_:)` inserts new snippets at the front.
+    var snippets: [Snippet] = []
+
+    /// JSON file that backs `snippets`.
+    private let fileURL: URL
+
+    /// Creates the store and immediately loads its contents from disk.
+    /// - Parameter fileURL: Location of the JSON file. Defaults to
+    ///   `PersistencePaths.snippetsURL`.
+    init(fileURL: URL = PersistencePaths.snippetsURL) {
+        self.fileURL = fileURL
+        load()
+    }
+
+    /// Inserts `snippet` at the front of the list and persists the change.
+    func add(_ snippet: Snippet) {
+        snippets.insert(snippet, at: 0)
+        save()
+    }
+
+    /// Replaces the stored snippet that has the same `id` as `snippet`.
+    /// Does nothing when no snippet with that `id` exists.
+    func update(_ snippet: Snippet) {
+        guard let index = snippets.firstIndex(where: { $0.id == snippet.id }) else { return }
+        snippets[index] = snippet
+        save()
+    }
+
+    /// Removes every snippet whose `id` matches `snippet.id` and persists the change.
+    func delete(_ snippet: Snippet) {
+        snippets.removeAll { $0.id == snippet.id }
+        save()
+    }
+
+    /// Flips `isEnabled` on the snippet with the given `id`, if it exists.
+    func toggleEnabled(for id: UUID) {
+        guard let index = snippets.firstIndex(where: { $0.id == id }) else { return }
+        snippets[index].isEnabled.toggle()
+        save()
+    }
+
+    /// Records `date` as the last-used time of the snippet with `snippetID`.
+    /// - Parameters:
+    ///   - snippetID: Identifier of the snippet that was used.
+    ///   - date: Timestamp to store; defaults to the current time.
+    func markUsed(_ snippetID: UUID, at date: Date = .now) {
+        guard let index = snippets.firstIndex(where: { $0.id == snippetID }) else { return }
+        snippets[index].lastUsedAt = date
+        save()
+    }
+
+    /// Populates `snippets` from `fileURL`.
+    ///
+    /// - If the file cannot be read (e.g. it does not exist yet), the built-in
+    ///   defaults are installed and persisted.
+    /// - If the file can be read but not decoded, it is first moved aside so
+    ///   the user's data is not silently overwritten by the defaults.
+    private func load() {
+        guard let data = try? Data(contentsOf: fileURL) else {
+            installDefaults()
+            return
+        }
+
+        do {
+            snippets = try JSONDecoder().decode([Snippet].self, from: data)
+        } catch {
+            NSLog("SnippetStore: cannot decode %@: %@", fileURL.path, String(describing: error))
+            preserveUndecodableFile()
+            installDefaults()
+        }
+    }
+
+    /// Installs the built-in snippets and writes them to disk when there are any.
+    private func installDefaults() {
+        snippets = Self.defaultSnippets
+        if !snippets.isEmpty {
+            save()
+        }
+    }
+
+    /// Moves an undecodable store file to `<fileURL>.corrupt` (best effort),
+    /// replacing any backup left by an earlier failure.
+    private func preserveUndecodableFile() {
+        let fileManager = FileManager.default
+        let backupURL = fileURL.appendingPathExtension("corrupt")
+        try? fileManager.removeItem(at: backupURL)
+        do {
+            try fileManager.moveItem(at: fileURL, to: backupURL)
+        } catch {
+            NSLog("SnippetStore: cannot back up %@: %@", fileURL.path, String(describing: error))
+        }
+    }
+
+    /// Writes `snippets` to `fileURL` atomically. Failures are logged rather
+    /// than thrown so that a persistence problem never interrupts the UI.
+    private func save() {
+        do {
+            let data = try JSONEncoder().encode(snippets)
+            try data.write(to: fileURL, options: .atomic)
+        } catch {
+            NSLog("SnippetStore: cannot save %@: %@", fileURL.path, String(describing: error))
+        }
+    }
+
+    /// Snippets installed when no usable store file is found.
+    // NOTE(review): the "Email Signature" default embeds a personal name;
+    // confirm this is intended for every user before shipping.
+    private static let defaultSnippets: [Snippet] = [
+        Snippet(
+            name: "Daily Standup",
+            trigger: "daily standup",
+            content: "Yesterday:\n- \n\nToday:\n- \n\nBlockers:\n- "
+        ),
+        Snippet(
+            name: "Email Signature",
+            trigger: "email signature",
+            content: "Best,\nYounghan"
+        ),
+    ]
+}
diff --git a/voxtral_realtime/macos/VoxtralRealtime/Services/TextPipeline.swift b/voxtral_realtime/macos/VoxtralRealtime/Services/TextPipeline.swift
new file mode 100644
index 000000000..b69259119
--- /dev/null
+++ b/voxtral_realtime/macos/VoxtralRealtime/Services/TextPipeline.swift
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) Meta Platforms, Inc.
and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +import Foundation + +struct TextProcessingResult: Sendable, Equatable { + let rawText: String + let outputText: String + let tags: [String] + let usedSnippetIDs: [UUID] + let skippedSnippetExpansion: Bool + + var transformed: Bool { + rawText != outputText + } +} + +@MainActor +final class TextPipeline { + enum Context: Sendable { + case dictation + case sessionSave + case wakePhraseCheck + } + + private let replacementStore: ReplacementStore + private let snippetStore: SnippetStore + + init(replacementStore: ReplacementStore, snippetStore: SnippetStore) { + self.replacementStore = replacementStore + self.snippetStore = snippetStore + } + + func process(_ text: String, context: Context) -> TextProcessingResult { + let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines) + guard !trimmed.isEmpty else { + return TextProcessingResult( + rawText: text, + outputText: "", + tags: [], + usedSnippetIDs: [], + skippedSnippetExpansion: false + ) + } + + let literalPrefix = "literal " + let literalCommand = trimmed.lowercased().hasPrefix(literalPrefix) + let baseText: String + if literalCommand { + baseText = String(trimmed.dropFirst(literalPrefix.count)) + } else { + baseText = trimmed + } + + let replacementsApplied = applyReplacements(to: baseText) + let snippetResolution = resolveSnippet(in: replacementsApplied, allowExpansion: context == .dictation && !literalCommand) + let styleApplied = applyStyle(to: snippetResolution.text) + + var tags: [String] = [] + if replacementsApplied != baseText { + tags.append("replacement") + } + if !snippetResolution.usedSnippetIDs.isEmpty { + tags.append("snippet") + } + if literalCommand { + tags.append("literal") + } + + return TextProcessingResult( + rawText: text, + outputText: styleApplied, + tags: tags, + usedSnippetIDs: 
snippetResolution.usedSnippetIDs, + skippedSnippetExpansion: literalCommand + ) + } + + func normalizeForWakePhrase(_ text: String) -> String { + let processed = process(text, context: .wakePhraseCheck).outputText + let folded = processed + .folding(options: [.diacriticInsensitive, .caseInsensitive], locale: .current) + .replacingOccurrences(of: #"[^a-z0-9\s]"#, with: " ", options: .regularExpression) + return folded + .replacingOccurrences(of: #"\s+"#, with: " ", options: .regularExpression) + .trimmingCharacters(in: .whitespacesAndNewlines) + } + + private func applyReplacements(to text: String) -> String { + replacementStore.entries + .filter(\.isEnabled) + .sorted { $0.trigger.count > $1.trigger.count } + .reduce(text) { partial, entry in + replace(entry: entry, in: partial) + } + } + + private func replace(entry: ReplacementEntry, in text: String) -> String { + guard !entry.trigger.isEmpty else { return text } + + let escaped = NSRegularExpression.escapedPattern(for: entry.trigger) + let pattern = entry.requiresWordBoundary ? #"\b\#(escaped)\b"# : escaped + let options: NSRegularExpression.Options = entry.isCaseSensitive ? [] : [.caseInsensitive] + + guard let regex = try? 
NSRegularExpression(pattern: pattern, options: options) else { + return text + } + + let range = NSRange(text.startIndex..., in: text) + let matches = regex.matches(in: text, options: [], range: range) + guard !matches.isEmpty else { return text } + + var output = text + for match in matches.reversed() { + guard let matchRange = Range(match.range, in: output) else { continue } + let original = String(output[matchRange]) + let replacement = preserveCaseIfNeeded(original: original, replacement: entry.replacement) + output.replaceSubrange(matchRange, with: replacement) + } + return output + } + + private func preserveCaseIfNeeded(original: String, replacement: String) -> String { + if original == original.uppercased() { + return replacement.uppercased() + } + if original == original.lowercased() { + return replacement + } + if let first = original.first, String(first) == String(first).uppercased() { + return replacement.prefix(1).uppercased() + replacement.dropFirst() + } + return replacement + } + + private func resolveSnippet(in text: String, allowExpansion: Bool) -> (text: String, usedSnippetIDs: [UUID]) { + guard allowExpansion else { + return (text, []) + } + + let normalized = text.trimmingCharacters(in: .whitespacesAndNewlines) + let lowered = normalized.lowercased() + let commandPrefixes = ["insert snippet ", "snippet ", "template "] + + for prefix in commandPrefixes where lowered.hasPrefix(prefix) { + let requestedTrigger = String(normalized.dropFirst(prefix.count)).trimmingCharacters(in: .whitespacesAndNewlines) + if let snippet = snippetStore.snippets.first(where: { + $0.isEnabled && $0.trigger.compare(requestedTrigger, options: .caseInsensitive) == .orderedSame + }) { + snippetStore.markUsed(snippet.id) + return (snippet.content, [snippet.id]) + } + } + + return (text, []) + } + + private func applyStyle(to text: String) -> String { + // Style is intentionally a no-op in v1. 
Keep the stage so later presets + // can plug into the same transformation pipeline. + text + } +} diff --git a/voxtral_realtime/macos/VoxtralRealtime/Services/VadService.swift b/voxtral_realtime/macos/VoxtralRealtime/Services/VadService.swift new file mode 100644 index 000000000..59106fb4a --- /dev/null +++ b/voxtral_realtime/macos/VoxtralRealtime/Services/VadService.swift @@ -0,0 +1,231 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +import AVFoundation +import Foundation +import os + +private let vadLog = Logger(subsystem: "org.pytorch.executorch.VoxtralRealtime", category: "VadService") + +actor VadService { + enum Event: Sendable { + case ready + case speechDetected(preRollSamples: [Float]) + case silenceDetected + case stopped + case error(String) + } + + private var process: Process? + private var stdinPipe: Pipe? + private var engine: AVAudioEngine? + private var recentSamples: [Float] = [] + private var byteBuffer = Data() + private var consecutiveSpeechFrames = 0 + private var armed = true + private var eventHandler: (@Sendable (Event) -> Void)? 
+
+    // Bookkeeping for the one-shot "microphone is silent" check done in `write`.
+    private var totalSamplesWritten: Int = 0
+    private var peakRms: Float = 0
+    private var silenceCheckFired = false
+    private static let silenceCheckSamples = 16_000 * 2 // 2s at 16kHz
+    private static let silenceRmsThreshold: Float = 1e-6
+
+    // Hangover: number of below-threshold frames tolerated before the
+    // consecutive-speech counter is reset.
+    private var hangoverFramesRemaining = 0
+    private var hangoverFramesMax = 0
+    // NOTE(review): presumably the duration of one VAD frame emitted by the
+    // runner; confirm against the runner's frame size.
+    private static let frameDurationMs = 32
+
+    /// Launches the VAD runner process and starts streaming microphone audio to it.
+    ///
+    /// Any previous session is stopped first and all per-session state is reset.
+    ///
+    /// - Parameters:
+    ///   - runnerPath: File-system path of the runner executable.
+    ///   - modelPath: Value passed to the runner as `--model_path`.
+    ///   - threshold: Probability at or above which a frame counts as speech.
+    ///   - hangoverMs: Hangover time in milliseconds, converted to whole frames.
+    ///   - eventHandler: Callback that receives every `Event` of this session.
+    /// - Throws: The error from `Process.run()` or from starting audio capture.
+    ///   In both cases the partially started session is torn down first.
+    func start(
+        runnerPath: String,
+        modelPath: String,
+        threshold: Float,
+        hangoverMs: Int,
+        eventHandler: @escaping @Sendable (Event) -> Void
+    ) async throws {
+        await stop()
+
+        self.eventHandler = eventHandler
+        armed = true
+        recentSamples = []
+        byteBuffer = Data()
+        consecutiveSpeechFrames = 0
+        hangoverFramesRemaining = 0
+        hangoverFramesMax = max(0, hangoverMs / Self.frameDurationMs)
+        totalSamplesWritten = 0
+        peakRms = 0
+        silenceCheckFired = false
+
+        let stdoutPipe = Pipe()
+        let stdinPipe = Pipe()
+        self.stdinPipe = stdinPipe
+
+        let process = Process()
+        process.executableURL = URL(fileURLWithPath: runnerPath)
+        process.arguments = [
+            "--model_path", modelPath
+        ]
+        process.standardInput = stdinPipe
+        process.standardOutput = stdoutPipe
+        // Nothing reads the runner's stderr. An undrained Pipe would block the
+        // runner once its buffer fills, so discard the output instead.
+        process.standardError = FileHandle.nullDevice
+        self.process = process
+
+        process.terminationHandler = { [weak self] _ in
+            Task {
+                await self?.emit(.stopped)
+            }
+        }
+
+        do {
+            try process.run()
+        } catch {
+            // Do not leave a never-launched process and open pipes behind.
+            await stop()
+            throw error
+        }
+
+        // Lines go through a single stream with a single consumer so they are
+        // handled in the order the runner printed them; one unstructured Task
+        // per line gives no ordering guarantee for the frame counters.
+        let (lines, lineContinuation) = AsyncStream.makeStream(of: String.self)
+        Task { [weak self] in
+            for await line in lines {
+                await self?.handleOutputLine(line, threshold: threshold)
+            }
+        }
+
+        DispatchQueue.global(qos: .userInitiated).async {
+            let handle = stdoutPipe.fileHandleForReading
+            // Bytes received after the last newline; a read may end mid-line.
+            var pending = Data()
+
+            func deliver(_ bytes: Data) {
+                guard let text = String(data: bytes, encoding: .utf8) else { return }
+                for line in text.split(whereSeparator: \.isNewline) {
+                    lineContinuation.yield(String(line))
+                }
+            }
+
+            while true {
+                let chunk = handle.availableData
+                if chunk.isEmpty { break }
+                pending.append(chunk)
+                while let newlineIndex = pending.firstIndex(of: 0x0A) {
+                    let lineBytes = pending.subdata(in: pending.startIndex..<newlineIndex)
+                    pending.removeSubrange(pending.startIndex...newlineIndex)
+                    deliver(lineBytes)
+                }
+            }
+
+            // EOF: flush a final line that was not newline-terminated.
+            if !pending.isEmpty {
+                deliver(pending)
+            }
+            lineContinuation.finish()
+        }
+
+        do {
+            try await startAudioCapture()
+        } catch {
+            // The runner is already running; terminate it before reporting.
+            await stop()
+            throw error
+        }
+        vadLog.info("Silero VAD started")
+    }
+
+    func stop() async {
+        engine?.inputNode.removeTap(onBus: 0)
+        engine?.stop()
+        engine = nil
+
+        stdinPipe?.fileHandleForWriting.closeFile()
+        stdinPipe = nil
+
+        if let process, process.isRunning {
+ process.terminate() + } + process = nil + } + + private func startAudioCapture() async throws { + guard let handle = stdinPipe?.fileHandleForWriting else { + throw RunnerError.launchFailed(description: "VAD stdin not available") + } + + let engine = AVAudioEngine() + let inputNode = engine.inputNode + let hwFormat = inputNode.outputFormat(forBus: 0) + guard hwFormat.sampleRate > 0, hwFormat.channelCount > 0 else { + throw RunnerError.microphoneNotAvailable + } + + let targetFormat = AVAudioFormat( + commonFormat: .pcmFormatFloat32, + sampleRate: 16_000, + channels: 1, + interleaved: false + )! + guard let converter = AVAudioConverter(from: hwFormat, to: targetFormat) else { + throw RunnerError.launchFailed(description: "Cannot create VAD audio converter") + } + + let sampleRateRatio = 16_000.0 / hwFormat.sampleRate + inputNode.installTap(onBus: 0, bufferSize: 4096, format: hwFormat) { [weak self] buffer, _ in + guard let self else { return } + let capacity = AVAudioFrameCount(Double(buffer.frameLength) * sampleRateRatio) + 1 + guard let converted = AVAudioPCMBuffer(pcmFormat: targetFormat, frameCapacity: capacity) else { + return + } + + var consumed = false + var error: NSError? + converter.convert(to: converted, error: &error) { _, outStatus in + if !consumed { + consumed = true + outStatus.pointee = .haveData + return buffer + } + outStatus.pointee = .noDataNow + return nil + } + + guard error == nil, + converted.frameLength > 0, + let channelData = converted.floatChannelData + else { return } + + let frameCount = Int(converted.frameLength) + let samples = Array(UnsafeBufferPointer(start: channelData[0], count: frameCount)) + Task { + await self.appendRecent(samples) + try? 
await self.write(samples: samples, to: handle) + } + } + + try engine.start() + self.engine = engine + } + + private func write(samples: [Float], to handle: FileHandle) throws { + guard !samples.isEmpty else { return } + let data = samples.withUnsafeBufferPointer { Data(buffer: $0) } + try handle.write(contentsOf: data) + + let sumSq = samples.reduce(Float(0)) { $0 + $1 * $1 } + let rms = (sumSq / Float(samples.count)).squareRoot() + peakRms = max(peakRms, rms) + totalSamplesWritten += samples.count + + if !silenceCheckFired && totalSamplesWritten >= Self.silenceCheckSamples { + silenceCheckFired = true + if peakRms < Self.silenceRmsThreshold { + vadLog.warning("Microphone producing silence after \(self.totalSamplesWritten) samples (peak RMS: \(self.peakRms))") + emit(.silenceDetected) + } + } + } + + private func appendRecent(_ samples: [Float]) { + recentSamples.append(contentsOf: samples) + let maxSamples = 16_000 * 2 + if recentSamples.count > maxSamples { + recentSamples.removeFirst(recentSamples.count - maxSamples) + } + } + + private func handleOutputLine(_ line: String, threshold: Float) { + if line == "READY" { + emit(.ready) + return + } + + let parts = line.split(separator: " ") + guard parts.count == 3, parts[0] == "PROB", let probability = Float(parts[2]) else { + return + } + + if probability >= threshold { + consecutiveSpeechFrames += 1 + hangoverFramesRemaining = hangoverFramesMax + } else if hangoverFramesRemaining > 0 { + hangoverFramesRemaining -= 1 + } else { + consecutiveSpeechFrames = 0 + } + + if armed && consecutiveSpeechFrames >= 2 { + armed = false + emit(.speechDetected(preRollSamples: recentSamples)) + } + } + + private func emit(_ event: Event) { + eventHandler?(event) + } +} diff --git a/voxtral_realtime/macos/VoxtralRealtime/Utilities/PersistencePaths.swift b/voxtral_realtime/macos/VoxtralRealtime/Utilities/PersistencePaths.swift new file mode 100644 index 000000000..9e7a5cff2 --- /dev/null +++ 
b/voxtral_realtime/macos/VoxtralRealtime/Utilities/PersistencePaths.swift
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+import Foundation
+
+/// File-system locations of the app's persisted JSON files.
+enum PersistencePaths {
+    /// The `VoxtralRealtime` folder inside the user's Application Support
+    /// directory.
+    ///
+    /// The folder is created (best effort) every time this property is read, so
+    /// callers can use the returned URL without creating it themselves.
+    static var appSupportDirectory: URL {
+        let fileManager = FileManager.default
+        // `urls(for:in:)` returns an array. Fall back to the conventional
+        // ~/Library/Application Support location instead of force-unwrapping,
+        // so an empty result cannot crash the app.
+        let appSupport = fileManager.urls(for: .applicationSupportDirectory, in: .userDomainMask).first
+            ?? fileManager.homeDirectoryForCurrentUser
+                .appendingPathComponent("Library/Application Support", isDirectory: true)
+        let directory = appSupport.appendingPathComponent("VoxtralRealtime", isDirectory: true)
+        // Creation errors are deliberately ignored (best effort).
+        try? fileManager.createDirectory(at: directory, withIntermediateDirectories: true)
+        return directory
+    }
+
+    /// `sessions.json` inside `appSupportDirectory`.
+    static var sessionsURL: URL {
+        appSupportDirectory.appendingPathComponent("sessions.json")
+    }
+
+    /// `replacements.json` inside `appSupportDirectory`.
+    static var replacementsURL: URL {
+        appSupportDirectory.appendingPathComponent("replacements.json")
+    }
+
+    /// `snippets.json` inside `appSupportDirectory`.
+    static var snippetsURL: URL {
+        appSupportDirectory.appendingPathComponent("snippets.json")
+    }
+}
diff --git a/voxtral_realtime/macos/VoxtralRealtime/Utilities/RunnerError.swift b/voxtral_realtime/macos/VoxtralRealtime/Utilities/RunnerError.swift
index f5cf8ac52..78f258c44 100644
--- a/voxtral_realtime/macos/VoxtralRealtime/Utilities/RunnerError.swift
+++ b/voxtral_realtime/macos/VoxtralRealtime/Utilities/RunnerError.swift
@@ -13,6 +13,7 @@ enum RunnerError: Error, Sendable {
     case modelMissing(file: String)
     case microphonePermissionDenied
     case microphoneNotAvailable
+    case microphoneSilent
     case runnerCrashed(exitCode: Int32, stderr: String)
     case transcriptionInterrupted(partial: String)
     case launchFailed(description: String)
@@ -29,6 +30,8 @@ extension RunnerError: LocalizedError {
             "Microphone access denied. Enable it in System Settings → Privacy & Security → Microphone, then quit and relaunch the app."
         case .microphoneNotAvailable:
             "No audio input available.
Check that your microphone is connected, enable it in System Settings → Privacy & Security → Microphone, then quit and relaunch the app." + case .microphoneSilent: + "Microphone is producing silence. Toggle the app's microphone permission off and on in System Settings → Privacy & Security → Microphone, then quit and relaunch the app." case .runnerCrashed(let code, let stderr): "Runner exited with code \(code): \(stderr)" case .transcriptionInterrupted: diff --git a/voxtral_realtime/macos/VoxtralRealtime/Utilities/SessionExportFormat.swift b/voxtral_realtime/macos/VoxtralRealtime/Utilities/SessionExportFormat.swift new file mode 100644 index 000000000..9232f7ea1 --- /dev/null +++ b/voxtral_realtime/macos/VoxtralRealtime/Utilities/SessionExportFormat.swift @@ -0,0 +1,88 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +import Foundation +import UniformTypeIdentifiers + +enum SessionExportFormat: String, CaseIterable, Sendable { + case text + case json + case srt + + var fileExtension: String { + switch self { + case .text: + return "txt" + case .json: + return "json" + case .srt: + return "srt" + } + } + + var contentType: UTType { + switch self { + case .text: + return .plainText + case .json: + return .json + case .srt: + return UTType(filenameExtension: "srt") ?? 
.plainText
+        }
+    }
+
+    /// Renders `session` as a string in this export format.
+    ///
+    /// - `.text`: the transcript, unchanged.
+    /// - `.json`: a pretty-printed object with sorted keys and ISO 8601 dates,
+    ///   or `{}` if encoding fails.
+    /// - `.srt`: a single cue that starts at zero and ends at the session
+    ///   duration (at least one second), containing the transcript.
+    ///
+    /// - Parameter session: The session to export.
+    /// - Returns: The file contents for this format.
+    func render(_ session: Session) -> String {
+        switch self {
+        case .text:
+            return session.transcript
+        case .json:
+            let payload = ExportPayload(
+                title: session.displayTitle,
+                date: session.date,
+                source: session.source,
+                transcript: session.transcript,
+                rawTranscript: session.rawTranscript,
+                duration: session.duration,
+                tags: session.tags,
+                wakeTriggered: session.wakeTriggered
+            )
+            let encoder = JSONEncoder()
+            encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
+            encoder.dateEncodingStrategy = .iso8601
+            let data = (try? encoder.encode(payload)) ?? Data("{}".utf8)
+            return String(decoding: data, as: UTF8.self)
+        case .srt:
+            // `max` does not sanitise NaN, so check finiteness explicitly.
+            let end = session.duration.isFinite ? max(session.duration, 1) : 1
+            // A blank line terminates an SRT cue. Drop empty lines so a
+            // multi-paragraph transcript stays inside the single cue.
+            let cueText = session.transcript
+                .split(whereSeparator: \.isNewline)
+                .map { $0.trimmingCharacters(in: .whitespaces) }
+                .filter { !$0.isEmpty }
+                .joined(separator: "\n")
+            return """
+            1
+            00:00:00,000 --> \(srtTimestamp(end))
+            \(cueText)
+            """
+        }
+    }
+
+    /// Formats `interval` (seconds) as an SRT timestamp, `HH:MM:SS,mmm`.
+    ///
+    /// Non-finite, out-of-range and negative intervals are formatted as zero
+    /// instead of trapping in the `Double` to `Int` conversion.
+    private func srtTimestamp(_ interval: TimeInterval) -> String {
+        let roundedMilliseconds = (interval * 1000).rounded()
+        // `Int(exactly:)` yields nil for NaN, infinity and values outside Int.
+        let totalMilliseconds = max(Int(exactly: roundedMilliseconds) ?? 0, 0)
+        let hours = totalMilliseconds / 3_600_000
+        let minutes = (totalMilliseconds / 60_000) % 60
+        let seconds = (totalMilliseconds / 1_000) % 60
+        let milliseconds = totalMilliseconds % 1_000
+        return String(format: "%02d:%02d:%02d,%03d", hours, minutes, seconds, milliseconds)
+    }
+
+    private struct ExportPayload: Codable {
+        let title: String
+        let date: Date
+        let source: SessionSource
+        let transcript: String
+        let rawTranscript: String?
+ let duration: TimeInterval + let tags: [String] + let wakeTriggered: Bool + } +} diff --git a/voxtral_realtime/macos/VoxtralRealtime/Views/ContentView.swift b/voxtral_realtime/macos/VoxtralRealtime/Views/ContentView.swift index 18740d878..24300e119 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Views/ContentView.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Views/ContentView.swift @@ -11,12 +11,11 @@ import SwiftUI struct ContentView: View { @Environment(TranscriptStore.self) private var store @State private var columnVisibility: NavigationSplitViewVisibility = .doubleColumn + @State private var activePage: SidebarPage = .home var body: some View { - @Bindable var store = store - NavigationSplitView(columnVisibility: $columnVisibility) { - SidebarView() + SidebarView(activePage: $activePage) .navigationSplitViewColumnWidth(min: 180, ideal: 220, max: 320) } detail: { detailContent @@ -31,6 +30,16 @@ struct ContentView: View { } } .animation(.easeInOut(duration: 0.25), value: store.currentError != nil) + .onChange(of: activePage) { _, newPage in + if case .session(let id) = newPage { + store.selectedSessionID = id + } + } + .onChange(of: store.selectedSessionID) { _, newID in + if let newID { + activePage = .session(newID) + } + } .task { await store.runHealthCheck() } @@ -38,6 +47,29 @@ struct ContentView: View { @ViewBuilder private var detailContent: some View { + switch activePage { + case .replacements: + ReplacementManagementView() + .padding() + .navigationTitle("Replacements") + case .snippets: + SnippetManagementView() + .padding() + .navigationTitle("Snippets") + case .home: + homeContent + case .session(let id): + if let session = store.sessions.first(where: { $0.id == id }) { + TranscriptView(text: session.transcript, isLive: false) + .navigationTitle(session.displayTitle) + } else { + homeContent + } + } + } + + @ViewBuilder + private var homeContent: some View { if store.healthResult?.allGood == false && !store.hasActiveSession && 
store.modelState == .unloaded { SetupGuideView() } else if store.hasActiveSession { @@ -48,10 +80,6 @@ struct ContentView: View { audioLevel: store.audioLevel, statusMessage: store.statusMessage ) - } else if let id = store.selectedSessionID, - let session = store.sessions.first(where: { $0.id == id }) { - TranscriptView(text: session.transcript, isLive: false) - .navigationTitle(session.displayTitle) } else { WelcomeView() } diff --git a/voxtral_realtime/macos/VoxtralRealtime/Views/DictationOverlayView.swift b/voxtral_realtime/macos/VoxtralRealtime/Views/DictationOverlayView.swift index 831099e0b..43bb3be64 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Views/DictationOverlayView.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Views/DictationOverlayView.swift @@ -31,13 +31,26 @@ struct DictationOverlayView: View { return min(CGFloat(lines) * 18, 200) } + private var statusText: String { + switch store.wakeState { + case .disabled, .active: + return "Listening..." + case .listening: + return "Wake listening..." + case .speechDetected: + return "Speech detected..." + case .checkingPhrase: + return "Checking wake phrase..." + } + } + var body: some View { VStack(spacing: 10) { AudioLevelView(level: store.audioLevel, barCount: 20) .frame(height: 36) if store.dictationText.isEmpty { - Text("Listening...") + Text(statusText) .font(.callout) .foregroundStyle(.secondary) } else { @@ -57,7 +70,7 @@ struct DictationOverlayView: View { } } - Text("⌃Space to finish") + Text(store.wakeState == .checkingPhrase ? 
"Say your wake phrase to continue" : "⌃Space to finish") .font(.caption2) .foregroundStyle(.tertiary) } diff --git a/voxtral_realtime/macos/VoxtralRealtime/Views/ReplacementManagementView.swift b/voxtral_realtime/macos/VoxtralRealtime/Views/ReplacementManagementView.swift new file mode 100644 index 000000000..d5a9fe29c --- /dev/null +++ b/voxtral_realtime/macos/VoxtralRealtime/Views/ReplacementManagementView.swift @@ -0,0 +1,153 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +import SwiftUI + +struct ReplacementManagementView: View { + @Environment(ReplacementStore.self) private var replacementStore + @State private var searchText = "" + @State private var editingEntry = ReplacementEntry() + @State private var editingEntryID: UUID? + @State private var isPresentingEditor = false + + var body: some View { + VStack(alignment: .leading, spacing: 12) { + HStack { + TextField("Search terms", text: $searchText) + .textFieldStyle(.roundedBorder) + Button("Add") { + editingEntry = ReplacementEntry() + editingEntryID = nil + isPresentingEditor = true + } + .buttonStyle(.borderedProminent) + } + + if filteredEntries.isEmpty { + ContentUnavailableView( + "No Replacements", + systemImage: "arrow.2.squarepath", + description: Text("Add names, acronyms, and product terms you want corrected automatically.") + ) + } else { + List { + ForEach(filteredEntries) { entry in + HStack(alignment: .top, spacing: 12) { + Toggle("", isOn: binding(for: entry.id)) + .labelsHidden() + VStack(alignment: .leading, spacing: 4) { + Text(entry.replacement) + .font(.headline) + Text("Trigger: \(entry.trigger)") + .font(.caption) + .foregroundStyle(.secondary) + if !entry.notes.isEmpty { + Text(entry.notes) + .font(.caption) + .foregroundStyle(.tertiary) + } + } + Spacer() + Button("Edit") { + editingEntry = entry + 
editingEntryID = entry.id + isPresentingEditor = true + } + .buttonStyle(.borderless) + } + .contextMenu { + Button("Edit") { + editingEntry = entry + editingEntryID = entry.id + isPresentingEditor = true + } + Button(entry.isEnabled ? "Disable" : "Enable") { + replacementStore.toggleEnabled(for: entry.id) + } + Divider() + Button("Delete", role: .destructive) { + replacementStore.delete(entry) + } + } + } + } + .listStyle(.inset) + } + } + .sheet(isPresented: $isPresentingEditor) { + ReplacementEntryEditor(entry: editingEntry) { entry in + if editingEntryID == nil { + replacementStore.add(entry) + } else { + replacementStore.update(entry) + } + isPresentingEditor = false + } onCancel: { + isPresentingEditor = false + } + .padding(20) + .frame(width: 420) + } + } + + private var filteredEntries: [ReplacementEntry] { + guard !searchText.isEmpty else { return replacementStore.entries } + return replacementStore.entries.filter { + $0.trigger.localizedCaseInsensitiveContains(searchText) || + $0.replacement.localizedCaseInsensitiveContains(searchText) || + $0.notes.localizedCaseInsensitiveContains(searchText) + } + } + + private func binding(for id: UUID) -> Binding { + Binding( + get: { + replacementStore.entries.first(where: { $0.id == id })?.isEnabled ?? false + }, + set: { _ in + replacementStore.toggleEnabled(for: id) + } + ) + } +} + +private struct ReplacementEntryEditor: View { + @State var entry: ReplacementEntry + let onSave: (ReplacementEntry) -> Void + let onCancel: () -> Void + + var body: some View { + VStack(alignment: .leading, spacing: 16) { + Text(entry.trigger.isEmpty ? 
"Add Replacement" : "Edit Replacement") + .font(.headline) + + TextField("Trigger phrase", text: $entry.trigger) + .textFieldStyle(.roundedBorder) + TextField("Replacement", text: $entry.replacement) + .textFieldStyle(.roundedBorder) + TextField("Notes (optional)", text: $entry.notes) + .textFieldStyle(.roundedBorder) + + Toggle("Case sensitive", isOn: $entry.isCaseSensitive) + Toggle("Require word boundary", isOn: $entry.requiresWordBoundary) + Toggle("Enabled", isOn: $entry.isEnabled) + + HStack { + Spacer() + Button("Cancel", role: .cancel) { + onCancel() + } + Button("Save") { + onSave(entry) + } + .keyboardShortcut(.defaultAction) + .disabled(entry.trigger.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty || entry.replacement.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty) + } + } + } +} diff --git a/voxtral_realtime/macos/VoxtralRealtime/Views/SettingsView.swift b/voxtral_realtime/macos/VoxtralRealtime/Views/SettingsView.swift index 5a49cb900..152dbf01a 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Views/SettingsView.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Views/SettingsView.swift @@ -40,6 +40,7 @@ struct SettingsView: View { fileStatus("model-metal-int4.pte", path: preferences.modelPath) fileStatus("tekken.json", path: preferences.tokenizerPath) fileStatus("preprocessor.pte", path: preferences.preprocessorPath) + fileStatus("silero_vad.pte", path: preferences.vadModelPath) } } } @@ -85,12 +86,77 @@ struct SettingsView: View { .background(.quaternary, in: RoundedRectangle(cornerRadius: 4)) } } + + Section("Wake") { + Toggle("Enable Silero VAD", isOn: $prefs.enableSileroVAD) + Toggle("Require wake phrase", isOn: $prefs.enableWakePhrase) + .disabled(!prefs.enableSileroVAD) + + LabeledContent("VAD runner") { + HStack { + TextField("Path to silero_vad_stream_runner", text: $prefs.vadRunnerPath) + .textFieldStyle(.roundedBorder) + browseButton(for: $prefs.vadRunnerPath) + } + } + + LabeledContent("VAD model") { + HStack { + 
TextField("Path to silero_vad.pte", text: $prefs.vadModelPath) + .textFieldStyle(.roundedBorder) + browseButton(for: $prefs.vadModelPath) + } + } + + LabeledContent("Wake phrase") { + HStack(spacing: 4) { + Text("Hey") + .foregroundStyle(.secondary) + TextField("torch", text: $prefs.wakeKeyword) + .textFieldStyle(.roundedBorder) + .frame(width: 140) + } + } + + LabeledContent("Speech threshold") { + VStack(alignment: .trailing, spacing: 4) { + Slider(value: $prefs.vadThreshold, in: 0.3...0.9, step: 0.05) + .frame(width: 200) + Text(String(format: "%.2f probability", prefs.vadThreshold)) + .font(.caption) + .foregroundStyle(.secondary) + .monospacedDigit() + } + } + + LabeledContent("Hangover") { + VStack(alignment: .trailing, spacing: 4) { + Slider(value: $prefs.vadHangoverMilliseconds, in: 160...800, step: 40) + .frame(width: 200) + Text("\(Int(prefs.vadHangoverMilliseconds)) ms") + .font(.caption) + .foregroundStyle(.secondary) + .monospacedDigit() + } + } + + LabeledContent("Wake check window") { + VStack(alignment: .trailing, spacing: 4) { + Slider(value: $prefs.wakeCheckSeconds, in: 1.0...4.0, step: 0.5) + .frame(width: 200) + Text(String(format: "%.1f s", prefs.wakeCheckSeconds)) + .font(.caption) + .foregroundStyle(.secondary) + .monospacedDigit() + } + } + } } .formStyle(.grouped) .padding() .tabItem { Label("Dictation", systemImage: "mic.badge.plus") } } - .frame(width: 500, height: 320) + .frame(width: 720, height: 520) } private func browseButton(for binding: Binding, directory: Bool = false) -> some View { diff --git a/voxtral_realtime/macos/VoxtralRealtime/Views/SidebarView.swift b/voxtral_realtime/macos/VoxtralRealtime/Views/SidebarView.swift index 915246311..f9e91a1fc 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Views/SidebarView.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Views/SidebarView.swift @@ -8,39 +8,67 @@ import SwiftUI +enum SidebarPage: Hashable { + case home + case replacements + case snippets + case session(UUID) +} + 
struct SidebarView: View { @Environment(TranscriptStore.self) private var store + @Binding var activePage: SidebarPage @State private var searchText = "" @State private var renamingSessionID: UUID? @State private var renameText = "" var body: some View { - @Bindable var store = store + List(selection: $activePage) { + Section { + Label("Home", systemImage: "house") + .tag(SidebarPage.home) + Label("Replacements", systemImage: "arrow.2.squarepath") + .tag(SidebarPage.replacements) + Label("Snippets", systemImage: "text.append") + .tag(SidebarPage.snippets) + } - List(selection: $store.selectedSessionID) { if store.hasActiveSession { liveRow } - Section("History") { - ForEach(filteredSessions) { session in - sessionRow(session) - .tag(session.id) - .contextMenu { sessionContextMenu(session) } + if !pinnedSessions.isEmpty { + Section("Pinned") { + ForEach(pinnedSessions) { session in + sessionRow(session) + .tag(SidebarPage.session(session.id)) + .contextMenu { sessionContextMenu(session) } + } + } + } + + if !recentDictations.isEmpty { + Section("Recent Dictations") { + ForEach(recentDictations) { session in + sessionRow(session) + .tag(SidebarPage.session(session.id)) + .contextMenu { sessionContextMenu(session) } + } + } + } + + ForEach(historySections) { section in + Section(section.title) { + ForEach(section.sessions) { session in + sessionRow(session) + .tag(SidebarPage.session(session.id)) + .contextMenu { sessionContextMenu(session) } + } } } } .listStyle(.sidebar) .searchable(text: $searchText, placement: .sidebar, prompt: "Search history") - .overlay { - if store.sessions.isEmpty && !store.hasActiveSession { - ContentUnavailableView( - "No History", - systemImage: "waveform", - description: Text("Transcriptions will appear here") - ) - } - } .sheet(item: renamingBinding) { session in RenameSheet(title: renameText) { newTitle in store.renameSession(session, to: newTitle) @@ -61,11 +89,49 @@ struct SidebarView: View { ) } - private var filteredSessions: 
[Session] { + private struct SessionSection: Identifiable { + let id = UUID() + let title: String + let sessions: [Session] + } + + private var visibleSessions: [Session] { if searchText.isEmpty { return store.sessions } return store.sessions.filter { $0.transcript.localizedCaseInsensitiveContains(searchText) || - $0.title.localizedCaseInsensitiveContains(searchText) + $0.title.localizedCaseInsensitiveContains(searchText) || + ($0.rawTranscript?.localizedCaseInsensitiveContains(searchText) ?? false) || + $0.tags.joined(separator: " ").localizedCaseInsensitiveContains(searchText) + } + } + + private var pinnedSessions: [Session] { + visibleSessions.filter(\.pinned) + } + + private var recentDictations: [Session] { + visibleSessions + .filter { !$0.pinned && $0.source == .dictation } + .prefix(5) + .map { $0 } + } + + private var historySections: [SessionSection] { + let hiddenIDs = Set(pinnedSessions.map(\.id) + recentDictations.map(\.id)) + let remainder = visibleSessions.filter { !hiddenIDs.contains($0.id) } + let calendar = Calendar.current + let grouped = Dictionary(grouping: remainder) { session -> String in + if calendar.isDateInToday(session.date) { + return "Today" + } + if calendar.isDateInYesterday(session.date) { + return "Yesterday" + } + return "Earlier" + } + return ["Today", "Yesterday", "Earlier"].compactMap { key in + guard let sessions = grouped[key], !sessions.isEmpty else { return nil } + return SessionSection(title: key, sessions: sessions) } } @@ -94,11 +160,20 @@ struct SidebarView: View { } private func sessionRow(_ session: Session) -> some View { - VStack(alignment: .leading, spacing: 4) { - Text(session.displayTitle) - .font(.headline) - .lineLimit(1) - Text(session.transcript.prefix(80).description) + VStack(alignment: .leading, spacing: 6) { + HStack(spacing: 6) { + if session.pinned { + Image(systemName: "pin.fill") + .font(.caption2) + .foregroundStyle(.yellow) + } + Text(session.displayTitle) + .font(.headline) + .lineLimit(1) + 
Spacer(minLength: 0) + sourceBadge(for: session) + } + Text(session.previewText.prefix(100).description) .font(.caption) .foregroundStyle(.secondary) .lineLimit(2) @@ -106,6 +181,12 @@ struct SidebarView: View { Text(session.date, format: .dateTime.month(.abbreviated).day().hour().minute()) Text("·") Text(formattedDuration(session.duration)) + if session.wakeTriggered { + Text("wake") + } + ForEach(session.tags.prefix(2), id: \.self) { tag in + Text(tag) + } } .font(.caption2) .foregroundStyle(.tertiary) @@ -113,8 +194,22 @@ struct SidebarView: View { .padding(.vertical, 2) } + private func sourceBadge(for session: Session) -> some View { + Text(session.source == .dictation ? "Dictation" : "Transcript") + .font(.caption2.weight(.medium)) + .padding(.horizontal, 6) + .padding(.vertical, 2) + .background( + (session.source == .dictation ? Color.accentColor : Color.secondary).opacity(0.12), + in: Capsule() + ) + } + @ViewBuilder private func sessionContextMenu(_ session: Session) -> some View { + Button(session.pinned ? "Unpin" : "Pin") { + store.togglePinned(session) + } Button("Rename...") { renameText = session.title renamingSessionID = session.id @@ -123,6 +218,13 @@ struct SidebarView: View { NSPasteboard.general.clearContents() NSPasteboard.general.setString(session.transcript, forType: .string) } + Menu("Export") { + ForEach(SessionExportFormat.allCases, id: \.rawValue) { format in + Button(format.fileExtension.uppercased()) { + store.exportSession(session, format: format) + } + } + } Divider() Button("Delete", role: .destructive) { store.deleteSession(session) diff --git a/voxtral_realtime/macos/VoxtralRealtime/Views/SnippetManagementView.swift b/voxtral_realtime/macos/VoxtralRealtime/Views/SnippetManagementView.swift new file mode 100644 index 000000000..9e9ad0c18 --- /dev/null +++ b/voxtral_realtime/macos/VoxtralRealtime/Views/SnippetManagementView.swift @@ -0,0 +1,177 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +import SwiftUI + +struct SnippetManagementView: View { + @Environment(SnippetStore.self) private var snippetStore + @State private var searchText = "" + @State private var editingSnippet = Snippet() + @State private var editingSnippetID: UUID? + @State private var isPresentingEditor = false + + var body: some View { + VStack(alignment: .leading, spacing: 12) { + HStack { + TextField("Search snippets", text: $searchText) + .textFieldStyle(.roundedBorder) + Button("Add") { + editingSnippet = Snippet() + editingSnippetID = nil + isPresentingEditor = true + } + .buttonStyle(.borderedProminent) + } + + if filteredSnippets.isEmpty { + ContentUnavailableView( + "No Snippets", + systemImage: "text.append", + description: Text("Create reusable templates and trigger them with commands like 'insert snippet daily standup'.") + ) + } else { + List { + ForEach(filteredSnippets) { snippet in + HStack(alignment: .top, spacing: 12) { + Toggle("", isOn: binding(for: snippet.id)) + .labelsHidden() + VStack(alignment: .leading, spacing: 4) { + HStack { + Text(snippet.name) + .font(.headline) + if let lastUsedAt = snippet.lastUsedAt { + Text(lastUsedAt, format: .relative(presentation: .named)) + .font(.caption2) + .foregroundStyle(.tertiary) + } + } + Text("Say: insert snippet \(snippet.trigger)") + .font(.caption) + .foregroundStyle(.secondary) + Text(snippet.content) + .font(.caption) + .foregroundStyle(.tertiary) + .lineLimit(3) + if !snippet.notes.isEmpty { + Text(snippet.notes) + .font(.caption) + .foregroundStyle(.tertiary) + } + } + Spacer() + Button("Edit") { + editingSnippet = snippet + editingSnippetID = snippet.id + isPresentingEditor = true + } + .buttonStyle(.borderless) + } + .contextMenu { + Button("Edit") { + editingSnippet = snippet + editingSnippetID = snippet.id + isPresentingEditor = true + } + Button(snippet.isEnabled ? 
"Disable" : "Enable") { + snippetStore.toggleEnabled(for: snippet.id) + } + Divider() + Button("Delete", role: .destructive) { + snippetStore.delete(snippet) + } + } + } + } + .listStyle(.inset) + } + } + .sheet(isPresented: $isPresentingEditor) { + SnippetEditor(snippet: editingSnippet) { snippet in + if editingSnippetID == nil { + snippetStore.add(snippet) + } else { + snippetStore.update(snippet) + } + isPresentingEditor = false + } onCancel: { + isPresentingEditor = false + } + .padding(20) + .frame(width: 480, height: 420) + } + } + + private var filteredSnippets: [Snippet] { + guard !searchText.isEmpty else { return snippetStore.snippets } + return snippetStore.snippets.filter { + $0.name.localizedCaseInsensitiveContains(searchText) || + $0.trigger.localizedCaseInsensitiveContains(searchText) || + $0.content.localizedCaseInsensitiveContains(searchText) || + $0.notes.localizedCaseInsensitiveContains(searchText) + } + } + + private func binding(for id: UUID) -> Binding { + Binding( + get: { + snippetStore.snippets.first(where: { $0.id == id })?.isEnabled ?? false + }, + set: { _ in + snippetStore.toggleEnabled(for: id) + } + ) + } +} + +private struct SnippetEditor: View { + @State var snippet: Snippet + let onSave: (Snippet) -> Void + let onCancel: () -> Void + + var body: some View { + VStack(alignment: .leading, spacing: 16) { + Text(snippet.name.isEmpty ? 
"Add Snippet" : "Edit Snippet") + .font(.headline) + + TextField("Display name", text: $snippet.name) + .textFieldStyle(.roundedBorder) + TextField("Trigger phrase", text: $snippet.trigger) + .textFieldStyle(.roundedBorder) + + VStack(alignment: .leading, spacing: 6) { + Text("Content") + .font(.caption) + .foregroundStyle(.secondary) + TextEditor(text: $snippet.content) + .font(.body.monospaced()) + .frame(minHeight: 180) + .overlay(RoundedRectangle(cornerRadius: 8).stroke(.quaternary)) + } + + TextField("Notes (optional)", text: $snippet.notes) + .textFieldStyle(.roundedBorder) + Toggle("Enabled", isOn: $snippet.isEnabled) + + HStack { + Spacer() + Button("Cancel", role: .cancel) { + onCancel() + } + Button("Save") { + onSave(snippet) + } + .keyboardShortcut(.defaultAction) + .disabled( + snippet.name.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty || + snippet.trigger.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty || + snippet.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty + ) + } + } + } +} diff --git a/voxtral_realtime/macos/VoxtralRealtime/Views/WelcomeView.swift b/voxtral_realtime/macos/VoxtralRealtime/Views/WelcomeView.swift index 4661049a1..547be9170 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Views/WelcomeView.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Views/WelcomeView.swift @@ -41,9 +41,13 @@ struct WelcomeView: View { } shortcutHints + + if !store.recentDictationSessions.isEmpty { + recentDictationsSection + } } .padding(40) - .frame(maxWidth: 480) + .frame(maxWidth: 560) } @ViewBuilder @@ -101,6 +105,28 @@ struct WelcomeView: View { .padding(.top, 8) } + private var recentDictationsSection: some View { + VStack(alignment: .leading, spacing: 10) { + Divider() + Text("Recent Dictations") + .font(.headline) + ForEach(store.recentDictationSessions.prefix(3)) { session in + VStack(alignment: .leading, spacing: 4) { + Text(session.displayTitle) + .font(.caption.weight(.semibold)) + 
.foregroundStyle(.secondary) + Text(session.previewText.prefix(120).description) + .font(.callout) + .lineLimit(2) + } + .frame(maxWidth: .infinity, alignment: .leading) + .padding(12) + .background(.background.secondary, in: RoundedRectangle(cornerRadius: 12)) + } + } + .frame(maxWidth: .infinity, alignment: .leading) + } + private func shortcutBadge(_ shortcut: String, label: String) -> some View { VStack(spacing: 4) { Text(shortcut) diff --git a/voxtral_realtime/macos/VoxtralRealtime/VoxtralRealtimeApp.swift b/voxtral_realtime/macos/VoxtralRealtime/VoxtralRealtimeApp.swift index 7c950dffb..53cc97524 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/VoxtralRealtimeApp.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/VoxtralRealtimeApp.swift @@ -12,13 +12,20 @@ import SwiftUI @main struct VoxtralRealtimeApp: App { @State private var preferences = Preferences() + @State private var replacementStore: ReplacementStore + @State private var snippetStore: SnippetStore @State private var store: TranscriptStore @State private var dictation: DictationManager init() { let prefs = Preferences() - let s = TranscriptStore(preferences: prefs) + let replacements = ReplacementStore() + let snippets = SnippetStore() + let pipeline = TextPipeline(replacementStore: replacements, snippetStore: snippets) + let s = TranscriptStore(preferences: prefs, textPipeline: pipeline) _preferences = State(initialValue: prefs) + _replacementStore = State(initialValue: replacements) + _snippetStore = State(initialValue: snippets) _store = State(initialValue: s) _dictation = State(initialValue: DictationManager(store: s, preferences: prefs)) } @@ -28,6 +35,8 @@ struct VoxtralRealtimeApp: App { ContentView() .environment(store) .environment(preferences) + .environment(replacementStore) + .environment(snippetStore) .frame(minWidth: 600, minHeight: 400) .task { _ = await AVCaptureDevice.requestAccess(for: .audio) diff --git 
a/voxtral_realtime/macos/VoxtralRealtimeTests/SessionCompatibilityTests.swift b/voxtral_realtime/macos/VoxtralRealtimeTests/SessionCompatibilityTests.swift new file mode 100644 index 000000000..376e78d81 --- /dev/null +++ b/voxtral_realtime/macos/VoxtralRealtimeTests/SessionCompatibilityTests.swift @@ -0,0 +1,62 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +import Foundation +import Testing + +struct SessionCompatibilityTests { + @Test + func decodesLegacySessionsWithoutNewMetadata() throws { + let json = """ + { + "id": "6BDF20D0-6E25-43EB-81A4-34748EF304F6", + "date": "2026-03-24T10:30:00Z", + "title": "Legacy Session", + "transcript": "hello world", + "duration": 12.5 + } + """ + + let decoder = JSONDecoder() + decoder.dateDecodingStrategy = .iso8601 + + let session = try decoder.decode(Session.self, from: Data(json.utf8)) + + #expect(session.source == .transcription) + #expect(session.rawTranscript == nil) + #expect(session.tags.isEmpty) + #expect(session.usedSnippetIDs.isEmpty) + #expect(!session.wakeTriggered) + #expect(!session.pinned) + #expect(session.transcript == "hello world") + } + + @Test + func exportFormatsIncludeRichSessionMetadata() throws { + let session = Session( + id: UUID(uuidString: "6BDF20D0-6E25-43EB-81A4-34748EF304F6")!, + date: Date(timeIntervalSince1970: 1_742_814_600), + title: "Wake Dictation", + transcript: "hello world", + duration: 12.5, + source: .dictation, + rawTranscript: "hey torch hello world", + tags: ["replacement", "snippet"], + wakeTriggered: true + ) + + let json = SessionExportFormat.json.render(session) + let srt = SessionExportFormat.srt.render(session) + + #expect(json.contains("\"wakeTriggered\" : true")) + #expect(json.contains("\"rawTranscript\" : \"hey torch hello world\"")) + #expect(json.contains("\"tags\" : [")) + 
#expect(srt.contains("00:00:00,000 --> 00:00:12,500")) + #expect(srt.contains("hello world")) + } +} diff --git a/voxtral_realtime/macos/VoxtralRealtimeTests/TextPipelineTests.swift b/voxtral_realtime/macos/VoxtralRealtimeTests/TextPipelineTests.swift new file mode 100644 index 000000000..d0e03987c --- /dev/null +++ b/voxtral_realtime/macos/VoxtralRealtimeTests/TextPipelineTests.swift @@ -0,0 +1,88 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +import Foundation +import Testing + +@MainActor +struct TextPipelineTests { + @Test + func replacementsApplyLongestMatchFirst() { + let sandbox = makeSandbox() + let replacementStore = ReplacementStore(fileURL: sandbox.appendingPathComponent("replacements.json")) + replacementStore.entries = [ + ReplacementEntry(trigger: "young", replacement: "Young"), + ReplacementEntry(trigger: "young han", replacement: "Younghan"), + ReplacementEntry(trigger: "mtia", replacement: "MTIA"), + ] + let snippetStore = SnippetStore(fileURL: sandbox.appendingPathComponent("snippets.json")) + snippetStore.snippets = [] + let pipeline = TextPipeline(replacementStore: replacementStore, snippetStore: snippetStore) + + let result = pipeline.process("young han joined mtia", context: .dictation) + + #expect(result.outputText == "Younghan joined MTIA") + #expect(result.tags.contains("replacement")) + } + + @Test + func snippetExpandsOnlyFromExplicitCommand() { + let sandbox = makeSandbox() + let replacementStore = ReplacementStore(fileURL: sandbox.appendingPathComponent("replacements.json")) + replacementStore.entries = [] + let snippetStore = SnippetStore(fileURL: sandbox.appendingPathComponent("snippets.json")) + let standup = Snippet(name: "Standup", trigger: "daily standup", content: "Yesterday:\n- ") + snippetStore.snippets = [standup] + let pipeline = 
TextPipeline(replacementStore: replacementStore, snippetStore: snippetStore) + + let expanded = pipeline.process("insert snippet daily standup", context: .dictation) + let untouched = pipeline.process("daily standup", context: .dictation) + + #expect(expanded.outputText == "Yesterday:\n- ") + #expect(expanded.usedSnippetIDs == [standup.id]) + #expect(untouched.outputText == "daily standup") + } + + @Test + func literalPrefixSkipsSnippetExpansion() { + let sandbox = makeSandbox() + let replacementStore = ReplacementStore(fileURL: sandbox.appendingPathComponent("replacements.json")) + replacementStore.entries = [] + let snippetStore = SnippetStore(fileURL: sandbox.appendingPathComponent("snippets.json")) + snippetStore.snippets = [Snippet(name: "Standup", trigger: "daily standup", content: "template")] + let pipeline = TextPipeline(replacementStore: replacementStore, snippetStore: snippetStore) + + let result = pipeline.process("literal insert snippet daily standup", context: .dictation) + + #expect(result.outputText == "insert snippet daily standup") + #expect(result.skippedSnippetExpansion) + #expect(result.usedSnippetIDs.isEmpty) + } + + @Test + func wakePhraseNormalizationUsesReplacementsAndStripsNoise() { + let sandbox = makeSandbox() + let replacementStore = ReplacementStore(fileURL: sandbox.appendingPathComponent("replacements.json")) + replacementStore.entries = [ + ReplacementEntry(trigger: "torchhh", replacement: "torch"), + ] + let snippetStore = SnippetStore(fileURL: sandbox.appendingPathComponent("snippets.json")) + snippetStore.snippets = [] + let pipeline = TextPipeline(replacementStore: replacementStore, snippetStore: snippetStore) + + let normalized = pipeline.normalizeForWakePhrase(" Hey, torchhh!! ") + + #expect(normalized == "hey torch") + } + + private func makeSandbox() -> URL { + let directory = FileManager.default.temporaryDirectory.appendingPathComponent(UUID().uuidString, isDirectory: true) + try? 
FileManager.default.createDirectory(at: directory, withIntermediateDirectories: true) + return directory + } +} diff --git a/voxtral_realtime/macos/project.yml b/voxtral_realtime/macos/project.yml index de289c635..22a9ea6f1 100644 --- a/voxtral_realtime/macos/project.yml +++ b/voxtral_realtime/macos/project.yml @@ -13,6 +13,17 @@ settings: MACOSX_DEPLOYMENT_TARGET: "14.0" ENABLE_HARDENED_RUNTIME: YES +schemes: + VoxtralRealtime: + build: + targets: + VoxtralRealtime: all + VoxtralRealtimeTests: [test] + test: + gatherCoverageData: true + targets: + - VoxtralRealtimeTests + targets: VoxtralRealtime: type: application @@ -44,6 +55,7 @@ targets: ET_PATH="${EXECUTORCH_PATH:-${HOME}/executorch}" RUNNER_SRC="${ET_PATH}/cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner" + VAD_RUNNER_SRC="${ET_PATH}/cmake-out/examples/models/silero_vad/silero_vad_stream_runner" MODEL_DIR="${MODEL_DIR:-${HOME}/voxtral_realtime_quant_metal}" LIBOMP_SRC="/opt/homebrew/opt/libomp/lib/libomp.dylib" LIBCXX_SRC="/usr/lib/libc++.1.dylib" @@ -92,5 +104,27 @@ targets: copy_if_newer "${MODEL_DIR}/model-metal-int4.pte" "${DEST}/model-metal-int4.pte" copy_if_newer "${MODEL_DIR}/preprocessor.pte" "${DEST}/preprocessor.pte" copy_if_newer "${MODEL_DIR}/tekken.json" "${DEST}/tekken.json" + copy_if_newer "${MODEL_DIR}/silero_vad.pte" "${DEST}/silero_vad.pte" + + # Optional Silero VAD helper + copy_if_newer "${VAD_RUNNER_SRC}" "${DEST}/silero_vad_stream_runner" + chmod +x "${DEST}/silero_vad_stream_runner" 2>/dev/null || true name: Bundle Runner & Model Artifacts basedOnDependencyAnalysis: false + VoxtralRealtimeTests: + type: bundle.unit-test + platform: macOS + sources: + - path: VoxtralRealtime/Models/ReplacementEntry.swift + - path: VoxtralRealtime/Models/Session.swift + - path: VoxtralRealtime/Models/Snippet.swift + - path: VoxtralRealtime/Utilities/PersistencePaths.swift + - path: VoxtralRealtime/Utilities/SessionExportFormat.swift + - path: 
VoxtralRealtime/Services/ReplacementStore.swift + - path: VoxtralRealtime/Services/SnippetStore.swift + - path: VoxtralRealtime/Services/TextPipeline.swift + - path: VoxtralRealtimeTests + settings: + base: + PRODUCT_BUNDLE_IDENTIFIER: org.pytorch.executorch.VoxtralRealtimeTests + GENERATE_INFOPLIST_FILE: YES From a64526cd5ceb854f94a53a983e5bb31572f351c4 Mon Sep 17 00:00:00 2001 From: Young Han Date: Wed, 25 Mar 2026 14:20:32 -0700 Subject: [PATCH 03/23] Add Wake settings page to sidebar with toggle shortcut Move wake/VAD settings from the Settings window into a dedicated sidebar page with live status indicator. Add Ctrl+Shift+W shortcut to toggle voice wake on/off from the menu bar. Made-with: Cursor --- .../VoxtralRealtime.xcodeproj/project.pbxproj | 4 + .../Services/DictationManager.swift | 11 ++ .../VoxtralRealtime/Views/ContentView.swift | 4 + .../VoxtralRealtime/Views/SettingsView.swift | 64 ------- .../VoxtralRealtime/Views/SidebarView.swift | 3 + .../Views/WakeSettingsView.swift | 162 ++++++++++++++++++ .../VoxtralRealtime/VoxtralRealtimeApp.swift | 12 ++ 7 files changed, 196 insertions(+), 64 deletions(-) create mode 100644 voxtral_realtime/macos/VoxtralRealtime/Views/WakeSettingsView.swift diff --git a/voxtral_realtime/macos/VoxtralRealtime.xcodeproj/project.pbxproj b/voxtral_realtime/macos/VoxtralRealtime.xcodeproj/project.pbxproj index b87106d18..55a9a9432 100644 --- a/voxtral_realtime/macos/VoxtralRealtime.xcodeproj/project.pbxproj +++ b/voxtral_realtime/macos/VoxtralRealtime.xcodeproj/project.pbxproj @@ -27,6 +27,7 @@ 710AF905B60EB23C212AAC30 /* HealthCheck.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6CAB3C82C8E7393E06E6EE6E /* HealthCheck.swift */; }; 75FD2E801951086DFE9E28E3 /* AudioLevelView.swift in Sources */ = {isa = PBXBuildFile; fileRef = B5282DE2DFDC7D992AC7D81D /* AudioLevelView.swift */; }; 7CF7B1783B1948A4CA22BBB9 /* SessionCompatibilityTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5521AE4951FF78020CF91D18 /* 
SessionCompatibilityTests.swift */; }; + 7E6D25D0306CA07BAE482CD4 /* WakeSettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = E559087816AF48C30A5FD6B5 /* WakeSettingsView.swift */; }; 7F4D25A854D63B9F528C56A0 /* ReplacementEntry.swift in Sources */ = {isa = PBXBuildFile; fileRef = FF25A732CF8FA1A7A982A8D8 /* ReplacementEntry.swift */; }; 83C2211EB3926FCE3AE55BA8 /* TranscriptView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 9539C4044A50A585F04687CA /* TranscriptView.swift */; }; 8E1D5D4563D1FEA413FD49C6 /* PersistencePaths.swift in Sources */ = {isa = PBXBuildFile; fileRef = AB52D50C17A7A729A04F559D /* PersistencePaths.swift */; }; @@ -89,6 +90,7 @@ D2245F740610F505B2D2905E /* WelcomeView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = WelcomeView.swift; sourceTree = ""; }; E0D0245890DF77D480572858 /* Session.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Session.swift; sourceTree = ""; }; E4B018A732C7D539547855A3 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; + E559087816AF48C30A5FD6B5 /* WakeSettingsView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = WakeSettingsView.swift; sourceTree = ""; }; FF25A732CF8FA1A7A982A8D8 /* ReplacementEntry.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ReplacementEntry.swift; sourceTree = ""; }; /* End PBXFileReference section */ @@ -142,6 +144,7 @@ 68D216F7ED4F81E147F2CFEE /* SidebarView.swift */, A16CFDDF3005489D5CEFD58B /* SnippetManagementView.swift */, 9539C4044A50A585F04687CA /* TranscriptView.swift */, + E559087816AF48C30A5FD6B5 /* WakeSettingsView.swift */, D2245F740610F505B2D2905E /* WelcomeView.swift */, ); path = Views; @@ -355,6 +358,7 @@ 83C2211EB3926FCE3AE55BA8 /* TranscriptView.swift in Sources */, B5A4C34832F186322489706E /* VadService.swift in Sources */, 
B4086DFD7477802852F31D0E /* VoxtralRealtimeApp.swift in Sources */, + 7E6D25D0306CA07BAE482CD4 /* WakeSettingsView.swift in Sources */, EE2CF6B261CC691736E9BBDE /* WakeState.swift in Sources */, 2C0D9096EB59C859E877427A /* WelcomeView.swift in Sources */, ); diff --git a/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift b/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift index b6fe20134..9fa0b0a1e 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift @@ -105,6 +105,17 @@ final class DictationManager { ) } + // MARK: - Wake Control + + func restartWakeListening() async { + await startWakeListeningIfNeeded() + } + + func stopWakeListening() async { + await vadService.stop() + store.wakeState = .disabled + } + // MARK: - Toggle func toggle() async { diff --git a/voxtral_realtime/macos/VoxtralRealtime/Views/ContentView.swift b/voxtral_realtime/macos/VoxtralRealtime/Views/ContentView.swift index 24300e119..327d5a3d3 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Views/ContentView.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Views/ContentView.swift @@ -56,6 +56,10 @@ struct ContentView: View { SnippetManagementView() .padding() .navigationTitle("Snippets") + case .wake: + WakeSettingsView() + .padding() + .navigationTitle("Wake") case .home: homeContent case .session(let id): diff --git a/voxtral_realtime/macos/VoxtralRealtime/Views/SettingsView.swift b/voxtral_realtime/macos/VoxtralRealtime/Views/SettingsView.swift index 152dbf01a..d94116007 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Views/SettingsView.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Views/SettingsView.swift @@ -87,70 +87,6 @@ struct SettingsView: View { } } - Section("Wake") { - Toggle("Enable Silero VAD", isOn: $prefs.enableSileroVAD) - Toggle("Require wake phrase", isOn: $prefs.enableWakePhrase) - .disabled(!prefs.enableSileroVAD) - 
- LabeledContent("VAD runner") { - HStack { - TextField("Path to silero_vad_stream_runner", text: $prefs.vadRunnerPath) - .textFieldStyle(.roundedBorder) - browseButton(for: $prefs.vadRunnerPath) - } - } - - LabeledContent("VAD model") { - HStack { - TextField("Path to silero_vad.pte", text: $prefs.vadModelPath) - .textFieldStyle(.roundedBorder) - browseButton(for: $prefs.vadModelPath) - } - } - - LabeledContent("Wake phrase") { - HStack(spacing: 4) { - Text("Hey") - .foregroundStyle(.secondary) - TextField("torch", text: $prefs.wakeKeyword) - .textFieldStyle(.roundedBorder) - .frame(width: 140) - } - } - - LabeledContent("Speech threshold") { - VStack(alignment: .trailing, spacing: 4) { - Slider(value: $prefs.vadThreshold, in: 0.3...0.9, step: 0.05) - .frame(width: 200) - Text(String(format: "%.2f probability", prefs.vadThreshold)) - .font(.caption) - .foregroundStyle(.secondary) - .monospacedDigit() - } - } - - LabeledContent("Hangover") { - VStack(alignment: .trailing, spacing: 4) { - Slider(value: $prefs.vadHangoverMilliseconds, in: 160...800, step: 40) - .frame(width: 200) - Text("\(Int(prefs.vadHangoverMilliseconds)) ms") - .font(.caption) - .foregroundStyle(.secondary) - .monospacedDigit() - } - } - - LabeledContent("Wake check window") { - VStack(alignment: .trailing, spacing: 4) { - Slider(value: $prefs.wakeCheckSeconds, in: 1.0...4.0, step: 0.5) - .frame(width: 200) - Text(String(format: "%.1f s", prefs.wakeCheckSeconds)) - .font(.caption) - .foregroundStyle(.secondary) - .monospacedDigit() - } - } - } } .formStyle(.grouped) .padding() diff --git a/voxtral_realtime/macos/VoxtralRealtime/Views/SidebarView.swift b/voxtral_realtime/macos/VoxtralRealtime/Views/SidebarView.swift index f9e91a1fc..38d4d6d63 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Views/SidebarView.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Views/SidebarView.swift @@ -12,6 +12,7 @@ enum SidebarPage: Hashable { case home case replacements case snippets + case wake case 
session(UUID) } @@ -31,6 +32,8 @@ struct SidebarView: View { .tag(SidebarPage.replacements) Label("Snippets", systemImage: "text.append") .tag(SidebarPage.snippets) + Label("Wake", systemImage: "ear.and.waveform") + .tag(SidebarPage.wake) } if store.hasActiveSession { diff --git a/voxtral_realtime/macos/VoxtralRealtime/Views/WakeSettingsView.swift b/voxtral_realtime/macos/VoxtralRealtime/Views/WakeSettingsView.swift new file mode 100644 index 000000000..84770369b --- /dev/null +++ b/voxtral_realtime/macos/VoxtralRealtime/Views/WakeSettingsView.swift @@ -0,0 +1,162 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +import SwiftUI + +struct WakeSettingsView: View { + @Environment(Preferences.self) private var preferences + @Environment(TranscriptStore.self) private var store + + var body: some View { + @Bindable var prefs = preferences + + Form { + Section { + HStack(spacing: 12) { + Image(systemName: prefs.enableSileroVAD ? "ear.and.waveform" : "ear.badge.waveform") + .font(.title) + .foregroundStyle(prefs.enableSileroVAD ? .green : .secondary) + .frame(width: 40) + VStack(alignment: .leading, spacing: 2) { + Text(prefs.enableSileroVAD ? "Wake is active" : "Wake is off") + .font(.headline) + Text(prefs.enableSileroVAD + ? 
"Say \"Hey \(prefs.wakeKeyword)\" to start dictation" + : "Enable to start dictation hands-free") + .font(.caption) + .foregroundStyle(.secondary) + } + Spacer() + Toggle("", isOn: $prefs.enableSileroVAD) + .labelsHidden() + .toggleStyle(.switch) + } + .padding(.vertical, 4) + + Text("⌃⇧W to toggle") + .font(.caption2) + .foregroundStyle(.tertiary) + } + + Section("Wake Phrase") { + Toggle("Require wake phrase", isOn: $prefs.enableWakePhrase) + .disabled(!prefs.enableSileroVAD) + + LabeledContent("Keyword") { + HStack(spacing: 4) { + Text("Hey") + .foregroundStyle(.secondary) + TextField("torch", text: $prefs.wakeKeyword) + .textFieldStyle(.roundedBorder) + .frame(width: 140) + } + } + + LabeledContent("Check window") { + VStack(alignment: .trailing, spacing: 4) { + Slider(value: $prefs.wakeCheckSeconds, in: 1.0...4.0, step: 0.5) + .frame(width: 200) + Text(String(format: "%.1f s", prefs.wakeCheckSeconds)) + .font(.caption) + .foregroundStyle(.secondary) + .monospacedDigit() + } + } + } + + Section("Detection") { + LabeledContent("Speech threshold") { + VStack(alignment: .trailing, spacing: 4) { + Slider(value: $prefs.vadThreshold, in: 0.3...0.9, step: 0.05) + .frame(width: 200) + Text(String(format: "%.2f probability", prefs.vadThreshold)) + .font(.caption) + .foregroundStyle(.secondary) + .monospacedDigit() + } + } + + LabeledContent("Hangover") { + VStack(alignment: .trailing, spacing: 4) { + Slider(value: $prefs.vadHangoverMilliseconds, in: 160...800, step: 40) + .frame(width: 200) + Text("\(Int(prefs.vadHangoverMilliseconds)) ms") + .font(.caption) + .foregroundStyle(.secondary) + .monospacedDigit() + } + } + } + + Section("Paths") { + LabeledContent("VAD runner") { + HStack { + TextField("silero_vad_stream_runner", text: $prefs.vadRunnerPath) + .textFieldStyle(.roundedBorder) + browseButton(for: $prefs.vadRunnerPath) + } + } + + LabeledContent("VAD model") { + HStack { + TextField("silero_vad.pte", text: $prefs.vadModelPath) + .textFieldStyle(.roundedBorder) 
+ browseButton(for: $prefs.vadModelPath) + } + } + } + + if let wakeState = store.wakeState as WakeState? { + Section("Status") { + HStack(spacing: 8) { + Circle() + .fill(statusColor(for: wakeState)) + .frame(width: 8, height: 8) + Text(statusLabel(for: wakeState)) + .font(.callout) + .foregroundStyle(.secondary) + } + } + } + } + .formStyle(.grouped) + } + + private func browseButton(for binding: Binding) -> some View { + Button("Browse...") { + let panel = NSOpenPanel() + panel.canChooseFiles = true + panel.canChooseDirectories = false + panel.allowsMultipleSelection = false + if panel.runModal() == .OK, let url = panel.url { + binding.wrappedValue = url.path(percentEncoded: false) + } + } + .controlSize(.small) + } + + private func statusColor(for state: WakeState) -> Color { + switch state { + case .disabled: .secondary + case .listening: .green + case .speechDetected: .orange + case .checkingPhrase: .yellow + case .active: .blue + } + } + + private func statusLabel(for state: WakeState) -> String { + switch state { + case .disabled: "Disabled" + case .listening: "Listening for speech..." + case .speechDetected: "Speech detected" + case .checkingPhrase: "Checking wake phrase..." + case .active: "Active dictation" + } + } +} diff --git a/voxtral_realtime/macos/VoxtralRealtime/VoxtralRealtimeApp.swift b/voxtral_realtime/macos/VoxtralRealtime/VoxtralRealtimeApp.swift index 53cc97524..ecd58ec5b 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/VoxtralRealtimeApp.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/VoxtralRealtimeApp.swift @@ -109,6 +109,18 @@ struct VoxtralRealtimeApp: App { Task { await dictation.toggle() } } .disabled(!store.isModelReady && store.modelState != .unloaded) + + Divider() + + Button(preferences.enableSileroVAD ? 
"Disable Voice Wake" : "Enable Voice Wake") { + preferences.enableSileroVAD.toggle() + if preferences.enableSileroVAD { + Task { await dictation.restartWakeListening() } + } else { + Task { await dictation.stopWakeListening() } + } + } + .keyboardShortcut("W", modifiers: [.control, .shift]) } } From 548465d1a4375eddecf6990c71a19ed5abae8f39 Mon Sep 17 00:00:00 2001 From: Young Han Date: Wed, 25 Mar 2026 14:22:22 -0700 Subject: [PATCH 04/23] Move all settings into sidebar, remove Settings window MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Consolidate General and Dictation settings into a single Settings page in the sidebar. The app no longer has a separate Settings window — everything is accessible from Home, Replacements, Snippets, Wake, and Settings in the left nav. Made-with: Cursor --- .../VoxtralRealtime/Views/ContentView.swift | 4 + .../VoxtralRealtime/Views/SettingsView.swift | 118 +++++++++--------- .../VoxtralRealtime/Views/SidebarView.swift | 3 + .../VoxtralRealtime/VoxtralRealtimeApp.swift | 4 - 4 files changed, 64 insertions(+), 65 deletions(-) diff --git a/voxtral_realtime/macos/VoxtralRealtime/Views/ContentView.swift b/voxtral_realtime/macos/VoxtralRealtime/Views/ContentView.swift index 327d5a3d3..3b2313322 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Views/ContentView.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Views/ContentView.swift @@ -60,6 +60,10 @@ struct ContentView: View { WakeSettingsView() .padding() .navigationTitle("Wake") + case .settings: + SettingsView() + .padding() + .navigationTitle("Settings") case .home: homeContent case .session(let id): diff --git a/voxtral_realtime/macos/VoxtralRealtime/Views/SettingsView.swift b/voxtral_realtime/macos/VoxtralRealtime/Views/SettingsView.swift index d94116007..21ecf9fd1 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Views/SettingsView.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Views/SettingsView.swift @@ -14,85 +14,81 @@ 
struct SettingsView: View { var body: some View { @Bindable var prefs = preferences - TabView { - Form { - Section("Runner") { - LabeledContent("Binary path") { - HStack { - TextField("Path to voxtral_realtime_runner", text: $prefs.runnerPath) - .textFieldStyle(.roundedBorder) - browseButton(for: $prefs.runnerPath) - } + Form { + Section("Runner") { + LabeledContent("Binary path") { + HStack { + TextField("Path to voxtral_realtime_runner", text: $prefs.runnerPath) + .textFieldStyle(.roundedBorder) + browseButton(for: $prefs.runnerPath) } } + } - Section("Model") { - LabeledContent("Model directory") { - HStack { - TextField("Path to model artifacts", text: $prefs.modelDirectory) - .textFieldStyle(.roundedBorder) - browseButton(for: $prefs.modelDirectory, directory: true) - } + Section("Model") { + LabeledContent("Model directory") { + HStack { + TextField("Path to model artifacts", text: $prefs.modelDirectory) + .textFieldStyle(.roundedBorder) + browseButton(for: $prefs.modelDirectory, directory: true) } + } - LabeledContent("Files") { - VStack(alignment: .leading, spacing: 4) { - fileStatus("model-metal-int4.pte", path: preferences.modelPath) - fileStatus("tekken.json", path: preferences.tokenizerPath) - fileStatus("preprocessor.pte", path: preferences.preprocessorPath) - fileStatus("silero_vad.pte", path: preferences.vadModelPath) - } + LabeledContent("Files") { + VStack(alignment: .leading, spacing: 4) { + fileStatus("model-metal-int4.pte", path: preferences.modelPath) + fileStatus("tekken.json", path: preferences.tokenizerPath) + fileStatus("preprocessor.pte", path: preferences.preprocessorPath) + fileStatus("silero_vad.pte", path: preferences.vadModelPath) } } } - .formStyle(.grouped) - .padding() - .tabItem { Label("General", systemImage: "gear") } - Form { - Section("Silence Detection") { - LabeledContent("Silence threshold") { - VStack(alignment: .trailing, spacing: 4) { - Slider(value: $prefs.silenceThreshold, in: 0.005...0.1, step: 0.005) - .frame(width: 
200) - Text(String(format: "%.3f RMS", preferences.silenceThreshold)) - .font(.caption) - .foregroundStyle(.secondary) - .monospacedDigit() - } + Section("Silence Detection") { + LabeledContent("Silence threshold") { + VStack(alignment: .trailing, spacing: 4) { + Slider(value: $prefs.silenceThreshold, in: 0.005...0.1, step: 0.005) + .frame(width: 200) + Text(String(format: "%.3f RMS", preferences.silenceThreshold)) + .font(.caption) + .foregroundStyle(.secondary) + .monospacedDigit() } + } - LabeledContent("Auto-stop delay") { - VStack(alignment: .trailing, spacing: 4) { - Slider(value: $prefs.silenceTimeout, in: 0.5...5.0, step: 0.5) - .frame(width: 200) - Text(String(format: "%.1fs after silence", preferences.silenceTimeout)) - .font(.caption) - .foregroundStyle(.secondary) - .monospacedDigit() - } + LabeledContent("Auto-stop delay") { + VStack(alignment: .trailing, spacing: 4) { + Slider(value: $prefs.silenceTimeout, in: 0.5...5.0, step: 0.5) + .frame(width: 200) + Text(String(format: "%.1fs after silence", preferences.silenceTimeout)) + .font(.caption) + .foregroundStyle(.secondary) + .monospacedDigit() } - - Text("Lower threshold = more sensitive (stops on softer sounds). Higher = only stops in true silence. Adjust based on your environment.") - .font(.caption) - .foregroundStyle(.tertiary) } - Section("Shortcut") { - LabeledContent("Dictation hotkey") { - Text("⌃Space") - .padding(.horizontal, 8) - .padding(.vertical, 4) - .background(.quaternary, in: RoundedRectangle(cornerRadius: 4)) - } + Text("Lower threshold = more sensitive (stops on softer sounds). Higher = only stops in true silence. 
Adjust based on your environment.") + .font(.caption) + .foregroundStyle(.tertiary) + } + + Section("Shortcut") { + LabeledContent("Dictation hotkey") { + Text("⌃Space") + .padding(.horizontal, 8) + .padding(.vertical, 4) + .background(.quaternary, in: RoundedRectangle(cornerRadius: 4)) } + LabeledContent("Toggle wake") { + Text("⌃⇧W") + .padding(.horizontal, 8) + .padding(.vertical, 4) + .background(.quaternary, in: RoundedRectangle(cornerRadius: 4)) + } } - .formStyle(.grouped) - .padding() - .tabItem { Label("Dictation", systemImage: "mic.badge.plus") } } - .frame(width: 720, height: 520) + .formStyle(.grouped) } private func browseButton(for binding: Binding, directory: Bool = false) -> some View { diff --git a/voxtral_realtime/macos/VoxtralRealtime/Views/SidebarView.swift b/voxtral_realtime/macos/VoxtralRealtime/Views/SidebarView.swift index 38d4d6d63..09b499f56 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Views/SidebarView.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Views/SidebarView.swift @@ -13,6 +13,7 @@ enum SidebarPage: Hashable { case replacements case snippets case wake + case settings case session(UUID) } @@ -34,6 +35,8 @@ struct SidebarView: View { .tag(SidebarPage.snippets) Label("Wake", systemImage: "ear.and.waveform") .tag(SidebarPage.wake) + Label("Settings", systemImage: "gear") + .tag(SidebarPage.settings) } if store.hasActiveSession { diff --git a/voxtral_realtime/macos/VoxtralRealtime/VoxtralRealtimeApp.swift b/voxtral_realtime/macos/VoxtralRealtime/VoxtralRealtimeApp.swift index ecd58ec5b..9b114e34a 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/VoxtralRealtimeApp.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/VoxtralRealtimeApp.swift @@ -124,10 +124,6 @@ struct VoxtralRealtimeApp: App { } } - Settings { - SettingsView() - .environment(preferences) - } } private var selectedSessionTranscript: String { From 33ad4dab42bb7fe4e0f223ecc3c12f24c57c0ebe Mon Sep 17 00:00:00 2001 From: Young Han Date: Wed, 25 Mar 2026 14:27:45 
-0700 Subject: [PATCH 05/23] Skip redundant mic permission prompt on VAD-triggered dictation When VadService detects speech and triggers dictation, the mic is already proven working. Skip the AVCaptureDevice permission check in startDictation to avoid re-prompting the user on every wake trigger after a fresh build. Made-with: Cursor --- .../macos/VoxtralRealtime/Models/TranscriptStore.swift | 8 +++++--- .../macos/VoxtralRealtime/Services/DictationManager.swift | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/voxtral_realtime/macos/VoxtralRealtime/Models/TranscriptStore.swift b/voxtral_realtime/macos/VoxtralRealtime/Models/TranscriptStore.swift index 2922239f1..bbefb1491 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Models/TranscriptStore.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Models/TranscriptStore.swift @@ -262,11 +262,13 @@ final class TranscriptStore { // MARK: - Dictation - func startDictation(initialSamples: [Float] = []) async { + func startDictation(initialSamples: [Float] = [], skipMicCheck: Bool = false) async { guard !isDictating else { return } - let micOK = await checkMicPermissionLive() - guard micOK else { return } + if !skipMicCheck { + let micOK = await checkMicPermissionLive() + guard micOK else { return } + } if modelState != .ready { await ensureRunnerLaunched() diff --git a/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift b/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift index 9fa0b0a1e..11ded1b55 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift @@ -322,7 +322,7 @@ final class DictationManager { state = .listening store.wakeState = .checkingPhrase - await store.startDictation(initialSamples: preRollSamples) + await store.startDictation(initialSamples: preRollSamples, skipMicCheck: true) if !preferences.enableWakePhrase { store.wakeState = .active From 
8c4df93f07adae61e832628ee772d7f34de0b71c Mon Sep 17 00:00:00 2001 From: Young Han Date: Thu, 26 Mar 2026 10:12:11 -0700 Subject: [PATCH 06/23] Preload Voxtral model before starting wake listening The 2-second wake check window was expiring before Voxtral could produce any transcription because the model wasn't loaded yet. Now startWakeListeningIfNeeded preloads the model so it's ready instantly when VAD detects speech. Made-with: Cursor --- .../macos/VoxtralRealtime/Services/DictationManager.swift | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift b/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift index 11ded1b55..0727eb3dc 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift @@ -270,6 +270,13 @@ final class DictationManager { return } + if !store.isModelReady { + await store.preloadModel() + while store.modelState == .loading { + try? await Task.sleep(for: .milliseconds(100)) + } + } + store.wakeState = .listening do { From a5e5a3b8c660ded2ecd84b519571e5e3f81f3e6f Mon Sep 17 00:00:00 2001 From: Young Han Date: Thu, 26 Mar 2026 10:16:39 -0700 Subject: [PATCH 07/23] Fix runner launch blocked by mic permission after fresh build ensureRunnerLaunched required healthResult.allGood which includes AVCaptureDevice mic authorization. After a fresh build, macOS resets this to notDetermined, blocking the runner from launching even though the binary and model files are present. Split HealthCheck.Result into filesReady (binary + model files) and allGood (files + mic). The runner only needs filesReady to launch. Also auto-run health check if it hasn't been run yet. 
Made-with: Cursor --- .../VoxtralRealtime/Models/TranscriptStore.swift | 12 ++++++++++-- .../macos/VoxtralRealtime/Services/HealthCheck.swift | 6 +++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/voxtral_realtime/macos/VoxtralRealtime/Models/TranscriptStore.swift b/voxtral_realtime/macos/VoxtralRealtime/Models/TranscriptStore.swift index bbefb1491..d18ae78ff 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Models/TranscriptStore.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Models/TranscriptStore.swift @@ -187,11 +187,19 @@ final class TranscriptStore { private func ensureRunnerLaunched() async { guard await !runner.isRunnerAlive else { return } - guard healthResult?.allGood == true else { + if healthResult == nil { + healthResult = await HealthCheck.run( + runnerPath: preferences.runnerPath, + modelPath: preferences.modelPath, + tokenizerPath: preferences.tokenizerPath, + preprocessorPath: preferences.preprocessorPath + ) + } + + guard healthResult?.filesReady == true else { currentError = healthResult.flatMap { result in if !result.runnerAvailable { return .binaryNotFound(path: preferences.runnerPath) } if let missing = result.missingFiles.first { return .modelMissing(file: missing) } - if result.micPermission != .authorized { return .microphonePermissionDenied } return nil } return diff --git a/voxtral_realtime/macos/VoxtralRealtime/Services/HealthCheck.swift b/voxtral_realtime/macos/VoxtralRealtime/Services/HealthCheck.swift index d2af865f7..de72aabac 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Services/HealthCheck.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Services/HealthCheck.swift @@ -17,7 +17,11 @@ struct HealthCheck: Sendable { var micPermission: MicPermission var allGood: Bool { - runnerAvailable && modelAvailable && preprocessorAvailable && tokenizerAvailable && micPermission == .authorized + filesReady && micPermission == .authorized + } + + var filesReady: Bool { + runnerAvailable && modelAvailable && 
preprocessorAvailable && tokenizerAvailable } var missingFiles: [String] { From 060cc76ba60d8c21a57f2135e8a046624bb9172b Mon Sep 17 00:00:00 2001 From: Young Han Date: Thu, 26 Mar 2026 10:28:42 -0700 Subject: [PATCH 08/23] Increase wake check window default to 4s Voxtral's audio encoder needs 1-2s to produce the first token. With a 2s check window, short utterances like "hey torch" expired before any transcription appeared, requiring the user to say the keyword twice. Increase default to 4s and slider range to 2-8s. Made-with: Cursor --- voxtral_realtime/macos/VoxtralRealtime/Models/Preferences.swift | 2 +- .../macos/VoxtralRealtime/Views/WakeSettingsView.swift | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/voxtral_realtime/macos/VoxtralRealtime/Models/Preferences.swift b/voxtral_realtime/macos/VoxtralRealtime/Models/Preferences.swift index ccb80f8c4..512324c14 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Models/Preferences.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Models/Preferences.swift @@ -128,7 +128,7 @@ final class Preferences { let words = $0.lowercased().split(separator: " ") return words.count > 1 ? words.dropFirst().joined(separator: " ") : words.first.map(String.init) } ?? "torch" - self.wakeCheckSeconds = defaults.object(forKey: "wakeCheckSeconds") as? Double ?? 2.0 + self.wakeCheckSeconds = defaults.object(forKey: "wakeCheckSeconds") as? Double ?? 
4.0 if !FileManager.default.fileExists(atPath: resolvedVadModelPath) { let probePaths = [ diff --git a/voxtral_realtime/macos/VoxtralRealtime/Views/WakeSettingsView.swift b/voxtral_realtime/macos/VoxtralRealtime/Views/WakeSettingsView.swift index 84770369b..05b73a4bc 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Views/WakeSettingsView.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Views/WakeSettingsView.swift @@ -59,7 +59,7 @@ struct WakeSettingsView: View { LabeledContent("Check window") { VStack(alignment: .trailing, spacing: 4) { - Slider(value: $prefs.wakeCheckSeconds, in: 1.0...4.0, step: 0.5) + Slider(value: $prefs.wakeCheckSeconds, in: 2.0...8.0, step: 0.5) .frame(width: 200) Text(String(format: "%.1f s", prefs.wakeCheckSeconds)) .font(.caption) From 005ecdcebfb582cff20eee735794ca6ca1765f12 Mon Sep 17 00:00:00 2001 From: Young Han Date: Thu, 26 Mar 2026 11:31:47 -0700 Subject: [PATCH 09/23] Accumulate full speech segments before wake phrase check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of triggering on the first 2 frames of speech (64ms) and stopping VAD, keep VAD running through the entire utterance and fire only when speech ends. The complete audio segment — including "hey" through "torch" — is then fed to Voxtral at once. VAD never stops during wake checking, so there is no restart gap between attempts. If the segment doesn't contain the wake keyword, VAD is already listening for the next utterance. This lets users say "hey torch" naturally as a single phrase without needing to make noise first to "wake up" the VAD. 
Made-with: Cursor --- .../Services/DictationManager.swift | 39 +++++----- .../VoxtralRealtime/Services/VadService.swift | 76 ++++++++++++------- 2 files changed, 69 insertions(+), 46 deletions(-) diff --git a/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift b/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift index 0727eb3dc..a25e0d1cd 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift @@ -301,9 +301,9 @@ final class DictationManager { switch event { case .ready: store.wakeState = .listening - case let .speechDetected(preRollSamples): + case let .speechSegment(samples): guard state == .idle else { return } - await beginWakePhraseCheck(preRollSamples: preRollSamples) + await checkWakeSegment(samples: samples) case .silenceDetected: log.warning("VAD detected microphone silence — stopping wake listening") await vadService.stop() @@ -319,48 +319,51 @@ final class DictationManager { } } - private func beginWakePhraseCheck(preRollSamples: [Float]) async { - await vadService.stop() + private func checkWakeSegment(samples: [Float]) async { wakeCheckTask?.cancel() - targetApp = NSWorkspace.shared.frontmostApplication - wakeTriggeredForCurrentSession = false - dictationStartedAt = .now - state = .listening - store.wakeState = .checkingPhrase - - await store.startDictation(initialSamples: preRollSamples, skipMicCheck: true) - if !preferences.enableWakePhrase { + await vadService.stop() + targetApp = NSWorkspace.shared.frontmostApplication + wakeTriggeredForCurrentSession = false + dictationStartedAt = .now + state = .listening store.wakeState = .active showPanel() + await store.startDictation(initialSamples: samples, skipMicCheck: true) startSilenceMonitor() return } - let requiredPhrase = store.normalizeWakePhrase(preferences.wakePhrase) - let keywords = Self.wakeKeywords(from: requiredPhrase) + store.wakeState = 
.checkingPhrase + + await store.startDictation(initialSamples: samples, skipMicCheck: true) + + let keywords = Self.wakeKeywords(from: store.normalizeWakePhrase(preferences.wakePhrase)) let checkDurationNs = UInt64(preferences.wakeCheckSeconds * 1_000_000_000) let deadline = DispatchTime.now().uptimeNanoseconds + checkDurationNs wakeCheckTask = Task { @MainActor [weak self] in guard let self else { return } - while !Task.isCancelled && self.state == .listening { + while !Task.isCancelled { let normalized = self.store.normalizeWakePhrase(self.store.dictationText) if !keywords.isEmpty && keywords.allSatisfy({ normalized.contains($0) }) { + await self.vadService.stop() + self.targetApp = NSWorkspace.shared.frontmostApplication self.wakeTriggeredForCurrentSession = true + self.dictationStartedAt = .now + self.state = .listening self.store.stripLeadingWakePhrase(self.preferences.wakePhrase) self.store.wakeState = .active self.showPanel() self.startSilenceMonitor() + log.info("Wake keyword matched — dictation active") return } if DispatchTime.now().uptimeNanoseconds >= deadline { - log.info("Wake phrase not matched within \(self.preferences.wakeCheckSeconds)s — returning to idle") _ = await self.store.stopDictation() - self.state = .idle self.store.wakeState = .listening - await self.startWakeListeningIfNeeded() + log.info("Speech segment did not contain wake keyword — continuing to listen") return } try? 
await Task.sleep(for: .milliseconds(100)) diff --git a/voxtral_realtime/macos/VoxtralRealtime/Services/VadService.swift b/voxtral_realtime/macos/VoxtralRealtime/Services/VadService.swift index 59106fb4a..282f72b9b 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Services/VadService.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Services/VadService.swift @@ -15,7 +15,7 @@ private let vadLog = Logger(subsystem: "org.pytorch.executorch.VoxtralRealtime", actor VadService { enum Event: Sendable { case ready - case speechDetected(preRollSamples: [Float]) + case speechSegment(samples: [Float]) case silenceDetected case stopped case error(String) @@ -25,21 +25,25 @@ actor VadService { private var stdinPipe: Pipe? private var engine: AVAudioEngine? private var recentSamples: [Float] = [] - private var byteBuffer = Data() - private var consecutiveSpeechFrames = 0 - private var armed = true private var eventHandler: (@Sendable (Event) -> Void)? private var totalSamplesWritten: Int = 0 private var peakRms: Float = 0 private var silenceCheckFired = false - private static let silenceCheckSamples = 16_000 * 2 // 2s at 16kHz + private static let silenceCheckSamples = 16_000 * 2 private static let silenceRmsThreshold: Float = 1e-6 - private var hangoverFramesRemaining = 0 private var hangoverFramesMax = 0 private static let frameDurationMs = 32 + private var inSpeech = false + private var hangoverFramesRemaining = 0 + private var speechSamples: [Float] = [] + private var preRollSamples: [Float] = [] + private var speechFrameCount = 0 + private static let minSpeechFrames = 3 + private static let preRollBufferSize = 16_000 / 2 // 0.5s at 16kHz + func start( runnerPath: String, modelPath: String, @@ -50,15 +54,16 @@ actor VadService { await stop() self.eventHandler = eventHandler - armed = true recentSamples = [] - byteBuffer = Data() - consecutiveSpeechFrames = 0 - hangoverFramesRemaining = 0 hangoverFramesMax = max(0, hangoverMs / Self.frameDurationMs) totalSamplesWritten = 0 
peakRms = 0 silenceCheckFired = false + inSpeech = false + hangoverFramesRemaining = 0 + speechSamples = [] + preRollSamples = [] + speechFrameCount = 0 let stdoutPipe = Pipe() let stdinPipe = Pipe() @@ -163,7 +168,7 @@ actor VadService { let frameCount = Int(converted.frameLength) let samples = Array(UnsafeBufferPointer(start: channelData[0], count: frameCount)) Task { - await self.appendRecent(samples) + await self.bufferSamples(samples) try? await self.write(samples: samples, to: handle) } } @@ -172,6 +177,17 @@ actor VadService { self.engine = engine } + private func bufferSamples(_ samples: [Float]) { + if inSpeech { + speechSamples.append(contentsOf: samples) + } else { + preRollSamples.append(contentsOf: samples) + if preRollSamples.count > Self.preRollBufferSize { + preRollSamples.removeFirst(preRollSamples.count - Self.preRollBufferSize) + } + } + } + private func write(samples: [Float], to handle: FileHandle) throws { guard !samples.isEmpty else { return } let data = samples.withUnsafeBufferPointer { Data(buffer: $0) } @@ -191,14 +207,6 @@ actor VadService { } } - private func appendRecent(_ samples: [Float]) { - recentSamples.append(contentsOf: samples) - let maxSamples = 16_000 * 2 - if recentSamples.count > maxSamples { - recentSamples.removeFirst(recentSamples.count - maxSamples) - } - } - private func handleOutputLine(_ line: String, threshold: Float) { if line == "READY" { emit(.ready) @@ -211,17 +219,29 @@ actor VadService { } if probability >= threshold { - consecutiveSpeechFrames += 1 + speechFrameCount += 1 hangoverFramesRemaining = hangoverFramesMax - } else if hangoverFramesRemaining > 0 { - hangoverFramesRemaining -= 1 - } else { - consecutiveSpeechFrames = 0 - } - if armed && consecutiveSpeechFrames >= 2 { - armed = false - emit(.speechDetected(preRollSamples: recentSamples)) + if !inSpeech && speechFrameCount >= Self.minSpeechFrames { + inSpeech = true + speechSamples = preRollSamples + vadLog.info("Speech segment started") + } + } else if 
inSpeech { + if hangoverFramesRemaining > 0 { + hangoverFramesRemaining -= 1 + } else { + let segment = speechSamples + inSpeech = false + speechSamples = [] + speechFrameCount = 0 + preRollSamples = [] + + vadLog.info("Speech segment ended (\(segment.count) samples, \(String(format: "%.2f", Double(segment.count) / 16000))s)") + emit(.speechSegment(samples: segment)) + } + } else { + speechFrameCount = 0 } } From 9f6354f84f541e7979b46f2f6b40ab655a26d4c2 Mon Sep 17 00:00:00 2001 From: Young Han Date: Thu, 26 Mar 2026 11:37:40 -0700 Subject: [PATCH 10/23] Always restart VAD after dictation ends stopAndPaste had early returns for empty text that skipped the startWakeListeningIfNeeded call. Use defer to guarantee VAD restarts regardless of which code path exits the method. Made-with: Cursor --- .../macos/VoxtralRealtime/Services/DictationManager.swift | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift b/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift index a25e0d1cd..7dca73d62 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift @@ -182,6 +182,11 @@ final class DictationManager { dismissPanel() log.info("Dictation stopped, text length: \(rawText.count)") + defer { + store.wakeState = preferences.enableSileroVAD ? .listening : .disabled + Task { await startWakeListeningIfNeeded() } + } + guard !rawText.isEmpty else { return } let result = store.processDictationText(rawText) @@ -210,8 +215,6 @@ final class DictationManager { duration: duration, wakeTriggered: wakeTriggeredForCurrentSession ) - store.wakeState = preferences.enableSileroVAD ? 
.listening : .disabled - await startWakeListeningIfNeeded() } // MARK: - Silence Detection From b8b9a41a70ca4f3c79cca0c2a5417c1ab16af9e0 Mon Sep 17 00:00:00 2001 From: Young Han Date: Thu, 26 Mar 2026 11:44:21 -0700 Subject: [PATCH 11/23] Remove Recent Dictations section and strip trailing wake punctuation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Remove "Recent Dictations" from sidebar and welcome page — all sessions are already visible in the time-grouped history. 2. Fix stray "." appearing at start of dictation text. Voxtral outputs tokens incrementally, so "." from "Torch." can arrive after stripLeadingWakePhrase already ran. Add a 200ms settle delay then strip any leading punctuation/whitespace that leaked. Made-with: Cursor --- .../Models/TranscriptStore.swift | 9 ++++--- .../Services/DictationManager.swift | 2 ++ .../VoxtralRealtime/Views/SidebarView.swift | 19 +------------- .../VoxtralRealtime/Views/WelcomeView.swift | 26 ------------------- 4 files changed, 9 insertions(+), 47 deletions(-) diff --git a/voxtral_realtime/macos/VoxtralRealtime/Models/TranscriptStore.swift b/voxtral_realtime/macos/VoxtralRealtime/Models/TranscriptStore.swift index d18ae78ff..fde6127dc 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Models/TranscriptStore.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Models/TranscriptStore.swift @@ -43,9 +43,6 @@ final class TranscriptStore { var isPaused: Bool { sessionState == .paused } var isLoading: Bool { sessionState == .loading } var isModelReady: Bool { modelState == .ready } - var recentDictationSessions: [Session] { - sessions.filter { $0.source == .dictation }.prefix(5).map { $0 } - } private let runner = RunnerBridge() private let preferences: Preferences @@ -359,6 +356,12 @@ final class TranscriptStore { textPipeline.normalizeForWakePhrase(text) } + func stripLeadingPunctuation() { + dictationText = dictationText.drop(while: { + $0.isPunctuation || $0.isWhitespace || 
$0.isNewline + }).description + } + func stripLeadingWakePhrase(_ wakePhrase: String) { guard !dictationText.isEmpty, !wakePhrase.isEmpty else { return } diff --git a/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift b/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift index 7dca73d62..9fa2d24ee 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift @@ -357,6 +357,8 @@ final class DictationManager { self.dictationStartedAt = .now self.state = .listening self.store.stripLeadingWakePhrase(self.preferences.wakePhrase) + try? await Task.sleep(for: .milliseconds(200)) + self.store.stripLeadingPunctuation() self.store.wakeState = .active self.showPanel() self.startSilenceMonitor() diff --git a/voxtral_realtime/macos/VoxtralRealtime/Views/SidebarView.swift b/voxtral_realtime/macos/VoxtralRealtime/Views/SidebarView.swift index 09b499f56..4c987da9e 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Views/SidebarView.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Views/SidebarView.swift @@ -53,16 +53,6 @@ struct SidebarView: View { } } - if !recentDictations.isEmpty { - Section("Recent Dictations") { - ForEach(recentDictations) { session in - sessionRow(session) - .tag(SidebarPage.session(session.id)) - .contextMenu { sessionContextMenu(session) } - } - } - } - ForEach(historySections) { section in Section(section.title) { ForEach(section.sessions) { session in @@ -115,15 +105,8 @@ struct SidebarView: View { visibleSessions.filter(\.pinned) } - private var recentDictations: [Session] { - visibleSessions - .filter { !$0.pinned && $0.source == .dictation } - .prefix(5) - .map { $0 } - } - private var historySections: [SessionSection] { - let hiddenIDs = Set(pinnedSessions.map(\.id) + recentDictations.map(\.id)) + let hiddenIDs = Set(pinnedSessions.map(\.id)) let remainder = visibleSessions.filter { !hiddenIDs.contains($0.id) } 
let calendar = Calendar.current let grouped = Dictionary(grouping: remainder) { session -> String in diff --git a/voxtral_realtime/macos/VoxtralRealtime/Views/WelcomeView.swift b/voxtral_realtime/macos/VoxtralRealtime/Views/WelcomeView.swift index 547be9170..963dd0e79 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Views/WelcomeView.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Views/WelcomeView.swift @@ -41,10 +41,6 @@ struct WelcomeView: View { } shortcutHints - - if !store.recentDictationSessions.isEmpty { - recentDictationsSection - } } .padding(40) .frame(maxWidth: 560) @@ -105,28 +101,6 @@ struct WelcomeView: View { .padding(.top, 8) } - private var recentDictationsSection: some View { - VStack(alignment: .leading, spacing: 10) { - Divider() - Text("Recent Dictations") - .font(.headline) - ForEach(store.recentDictationSessions.prefix(3)) { session in - VStack(alignment: .leading, spacing: 4) { - Text(session.displayTitle) - .font(.caption.weight(.semibold)) - .foregroundStyle(.secondary) - Text(session.previewText.prefix(120).description) - .font(.callout) - .lineLimit(2) - } - .frame(maxWidth: .infinity, alignment: .leading) - .padding(12) - .background(.background.secondary, in: RoundedRectangle(cornerRadius: 12)) - } - } - .frame(maxWidth: .infinity, alignment: .leading) - } - private func shortcutBadge(_ shortcut: String, label: String) -> some View { VStack(spacing: 4) { Text(shortcut) From 8bc6e3a913c2346061cc389cec1a6f023044e232 Mon Sep 17 00:00:00 2001 From: Young Han Date: Thu, 26 Mar 2026 11:53:03 -0700 Subject: [PATCH 12/23] Stop saving dictation sessions to history Dictation is a transient paste-and-forget flow. Only in-app transcription sessions are saved to history. 
Made-with: Cursor --- .../macos/VoxtralRealtime/Services/DictationManager.swift | 5 ----- 1 file changed, 5 deletions(-) diff --git a/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift b/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift index 9fa2d24ee..3e24b6ba9 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift @@ -210,11 +210,6 @@ final class DictationManager { _ = Self.checkAccessibility(prompt: true) } - store.saveDictationSession( - result: result, - duration: duration, - wakeTriggered: wakeTriggeredForCurrentSession - ) } // MARK: - Silence Detection From b10ec9778735eea5243669595b41bfab364f79c3 Mon Sep 17 00:00:00 2001 From: Young Han Date: Thu, 26 Mar 2026 11:57:37 -0700 Subject: [PATCH 13/23] Fix wake keyword placeholder text Made-with: Cursor --- .../macos/VoxtralRealtime/Views/WakeSettingsView.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/voxtral_realtime/macos/VoxtralRealtime/Views/WakeSettingsView.swift b/voxtral_realtime/macos/VoxtralRealtime/Views/WakeSettingsView.swift index 05b73a4bc..551d6a039 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Views/WakeSettingsView.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Views/WakeSettingsView.swift @@ -51,7 +51,7 @@ struct WakeSettingsView: View { HStack(spacing: 4) { Text("Hey") .foregroundStyle(.secondary) - TextField("torch", text: $prefs.wakeKeyword) + TextField("", text: $prefs.wakeKeyword) .textFieldStyle(.roundedBorder) .frame(width: 140) } From 09dd1f51039cb4c09841605963f4f9b21f3c7f78 Mon Sep 17 00:00:00 2001 From: Young Han Date: Thu, 26 Mar 2026 12:07:01 -0700 Subject: [PATCH 14/23] Add Excitorch -> ExecuTorch default replacement Made-with: Cursor --- .../macos/VoxtralRealtime/Services/ReplacementStore.swift | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/voxtral_realtime/macos/VoxtralRealtime/Services/ReplacementStore.swift b/voxtral_realtime/macos/VoxtralRealtime/Services/ReplacementStore.swift index 2aad85fe5..f98a3b558 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Services/ReplacementStore.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Services/ReplacementStore.swift @@ -73,5 +73,7 @@ final class ReplacementStore { ReplacementEntry(trigger: "mtia", replacement: "MTIA"), ReplacementEntry(trigger: "mvai", replacement: "MVAI"), ReplacementEntry(trigger: "executorch", replacement: "ExecuTorch", requiresWordBoundary: false), + ReplacementEntry(trigger: "excitorch", replacement: "ExecuTorch"), + ReplacementEntry(trigger: "execute torch", replacement: "ExecuTorch"), ] } From 2b7fc3c66b1c824aa29f2032473a5ef55ab3b0c6 Mon Sep 17 00:00:00 2001 From: Young Han Date: Thu, 26 Mar 2026 12:11:25 -0700 Subject: [PATCH 15/23] Add padding to snippet editor sheet Made-with: Cursor --- .../macos/VoxtralRealtime/Views/SnippetManagementView.swift | 1 + 1 file changed, 1 insertion(+) diff --git a/voxtral_realtime/macos/VoxtralRealtime/Views/SnippetManagementView.swift b/voxtral_realtime/macos/VoxtralRealtime/Views/SnippetManagementView.swift index 9e9ad0c18..5f4c5f612 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Views/SnippetManagementView.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Views/SnippetManagementView.swift @@ -173,5 +173,6 @@ private struct SnippetEditor: View { ) } } + .padding(24) } } From b43614cd639f84d21557e6fdff0e6c765e4dc851 Mon Sep 17 00:00:00 2001 From: Young Han Date: Thu, 26 Mar 2026 12:19:52 -0700 Subject: [PATCH 16/23] Fix snippet expansion: strip ASR punctuation and reorder pipeline Two issues prevented snippet triggers from matching: 1. Voxtral adds trailing punctuation (e.g., "Snippet email signature.") which made the trigger comparison fail. Now strip punctuation from both the lowered command and the extracted trigger before matching. 2. 
Replacements ran before snippet resolution, so a replacement could corrupt a command prefix or trigger word. Now snippets are checked first on the raw text. If a snippet matches, skip replacements on the expanded content. Made-with: Cursor --- .../VoxtralRealtime/Services/TextPipeline.swift | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/voxtral_realtime/macos/VoxtralRealtime/Services/TextPipeline.swift b/voxtral_realtime/macos/VoxtralRealtime/Services/TextPipeline.swift index b69259119..3da96dc52 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Services/TextPipeline.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Services/TextPipeline.swift @@ -57,12 +57,15 @@ final class TextPipeline { baseText = trimmed } - let replacementsApplied = applyReplacements(to: baseText) - let snippetResolution = resolveSnippet(in: replacementsApplied, allowExpansion: context == .dictation && !literalCommand) - let styleApplied = applyStyle(to: snippetResolution.text) + let snippetResolution = resolveSnippet(in: baseText, allowExpansion: context == .dictation && !literalCommand) + let afterSnippets = snippetResolution.text + let replacementsApplied = snippetResolution.usedSnippetIDs.isEmpty + ? 
applyReplacements(to: afterSnippets) + : afterSnippets + let styleApplied = applyStyle(to: replacementsApplied) var tags: [String] = [] - if replacementsApplied != baseText { + if replacementsApplied != afterSnippets { tags.append("replacement") } if !snippetResolution.usedSnippetIDs.isEmpty { @@ -145,10 +148,13 @@ final class TextPipeline { let normalized = text.trimmingCharacters(in: .whitespacesAndNewlines) let lowered = normalized.lowercased() + .trimmingCharacters(in: .punctuationCharacters) let commandPrefixes = ["insert snippet ", "snippet ", "template "] for prefix in commandPrefixes where lowered.hasPrefix(prefix) { - let requestedTrigger = String(normalized.dropFirst(prefix.count)).trimmingCharacters(in: .whitespacesAndNewlines) + let requestedTrigger = String(lowered.dropFirst(prefix.count)) + .trimmingCharacters(in: .whitespacesAndNewlines) + .trimmingCharacters(in: .punctuationCharacters) if let snippet = snippetStore.snippets.first(where: { $0.isEnabled && $0.trigger.compare(requestedTrigger, options: .caseInsensitive) == .orderedSame }) { From 85b1a59c6a86de03e27a8dd8e4b554bbb81250d9 Mon Sep 17 00:00:00 2001 From: Young Han Date: Thu, 26 Mar 2026 12:32:19 -0700 Subject: [PATCH 17/23] Fix editor sheets: wrong title on edit, missing padding Both Snippet and Replacement editors used @State to check if the entry name/trigger was empty to decide the title. Since @State captures the initial value, editing an existing entry could still show "Add" if SwiftUI reused the view. Pass an explicit isEditing flag instead. Also move padding inside the editor body and remove the conflicting outer .padding(20) on the sheet content. 
Made-with: Cursor --- .../VoxtralRealtime/Views/ReplacementManagementView.swift | 7 ++++--- .../VoxtralRealtime/Views/SnippetManagementView.swift | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/voxtral_realtime/macos/VoxtralRealtime/Views/ReplacementManagementView.swift b/voxtral_realtime/macos/VoxtralRealtime/Views/ReplacementManagementView.swift index d5a9fe29c..03b5861a3 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Views/ReplacementManagementView.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Views/ReplacementManagementView.swift @@ -80,7 +80,7 @@ struct ReplacementManagementView: View { } } .sheet(isPresented: $isPresentingEditor) { - ReplacementEntryEditor(entry: editingEntry) { entry in + ReplacementEntryEditor(entry: editingEntry, isEditing: editingEntryID != nil) { entry in if editingEntryID == nil { replacementStore.add(entry) } else { @@ -90,7 +90,6 @@ struct ReplacementManagementView: View { } onCancel: { isPresentingEditor = false } - .padding(20) .frame(width: 420) } } @@ -118,12 +117,13 @@ struct ReplacementManagementView: View { private struct ReplacementEntryEditor: View { @State var entry: ReplacementEntry + let isEditing: Bool let onSave: (ReplacementEntry) -> Void let onCancel: () -> Void var body: some View { VStack(alignment: .leading, spacing: 16) { - Text(entry.trigger.isEmpty ? "Add Replacement" : "Edit Replacement") + Text(isEditing ? 
"Edit Replacement" : "Add Replacement") .font(.headline) TextField("Trigger phrase", text: $entry.trigger) @@ -149,5 +149,6 @@ private struct ReplacementEntryEditor: View { .disabled(entry.trigger.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty || entry.replacement.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty) } } + .padding(24) } } diff --git a/voxtral_realtime/macos/VoxtralRealtime/Views/SnippetManagementView.swift b/voxtral_realtime/macos/VoxtralRealtime/Views/SnippetManagementView.swift index 5f4c5f612..0507d320c 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Views/SnippetManagementView.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Views/SnippetManagementView.swift @@ -91,7 +91,7 @@ struct SnippetManagementView: View { } } .sheet(isPresented: $isPresentingEditor) { - SnippetEditor(snippet: editingSnippet) { snippet in + SnippetEditor(snippet: editingSnippet, isEditing: editingSnippetID != nil) { snippet in if editingSnippetID == nil { snippetStore.add(snippet) } else { @@ -101,8 +101,7 @@ struct SnippetManagementView: View { } onCancel: { isPresentingEditor = false } - .padding(20) - .frame(width: 480, height: 420) + .frame(width: 480, height: 460) } } @@ -130,12 +129,13 @@ struct SnippetManagementView: View { private struct SnippetEditor: View { @State var snippet: Snippet + let isEditing: Bool let onSave: (Snippet) -> Void let onCancel: () -> Void var body: some View { VStack(alignment: .leading, spacing: 16) { - Text(snippet.name.isEmpty ? "Add Snippet" : "Edit Snippet") + Text(isEditing ? "Edit Snippet" : "Add Snippet") .font(.headline) TextField("Display name", text: $snippet.name) From b62a7cdb3ead6b8b4c14af2012b8816c182e26e6 Mon Sep 17 00:00:00 2001 From: Young Han Date: Thu, 26 Mar 2026 13:43:48 -0700 Subject: [PATCH 18/23] Fix edit button showing Add sheet on first click sheet(isPresented:) captures closure values at build time, not at presentation time. 
The first click sets editingSnippetID but the sheet evaluates the old nil value, showing "Add" instead of "Edit". Switch to sheet(item:) with an Identifiable wrapper that carries the snippet and isEditing flag, guaranteeing the sheet is created fresh with the correct data each time. Made-with: Cursor --- .../Views/ReplacementManagementView.swift | 36 +++++++++---------- .../Views/SnippetManagementView.swift | 36 +++++++++---------- 2 files changed, 34 insertions(+), 38 deletions(-) diff --git a/voxtral_realtime/macos/VoxtralRealtime/Views/ReplacementManagementView.swift b/voxtral_realtime/macos/VoxtralRealtime/Views/ReplacementManagementView.swift index 03b5861a3..8cf40184b 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Views/ReplacementManagementView.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Views/ReplacementManagementView.swift @@ -8,12 +8,16 @@ import SwiftUI +private struct ReplacementEditorItem: Identifiable { + let id = UUID() + let entry: ReplacementEntry + let isEditing: Bool +} + struct ReplacementManagementView: View { @Environment(ReplacementStore.self) private var replacementStore @State private var searchText = "" - @State private var editingEntry = ReplacementEntry() - @State private var editingEntryID: UUID? - @State private var isPresentingEditor = false + @State private var editorItem: ReplacementEditorItem? 
var body: some View { VStack(alignment: .leading, spacing: 12) { @@ -21,9 +25,7 @@ struct ReplacementManagementView: View { TextField("Search terms", text: $searchText) .textFieldStyle(.roundedBorder) Button("Add") { - editingEntry = ReplacementEntry() - editingEntryID = nil - isPresentingEditor = true + editorItem = ReplacementEditorItem(entry: ReplacementEntry(), isEditing: false) } .buttonStyle(.borderedProminent) } @@ -54,17 +56,13 @@ struct ReplacementManagementView: View { } Spacer() Button("Edit") { - editingEntry = entry - editingEntryID = entry.id - isPresentingEditor = true + editorItem = ReplacementEditorItem(entry: entry, isEditing: true) } .buttonStyle(.borderless) } .contextMenu { Button("Edit") { - editingEntry = entry - editingEntryID = entry.id - isPresentingEditor = true + editorItem = ReplacementEditorItem(entry: entry, isEditing: true) } Button(entry.isEnabled ? "Disable" : "Enable") { replacementStore.toggleEnabled(for: entry.id) @@ -79,16 +77,16 @@ struct ReplacementManagementView: View { .listStyle(.inset) } } - .sheet(isPresented: $isPresentingEditor) { - ReplacementEntryEditor(entry: editingEntry, isEditing: editingEntryID != nil) { entry in - if editingEntryID == nil { - replacementStore.add(entry) - } else { + .sheet(item: $editorItem) { item in + ReplacementEntryEditor(entry: item.entry, isEditing: item.isEditing) { entry in + if item.isEditing { replacementStore.update(entry) + } else { + replacementStore.add(entry) } - isPresentingEditor = false + editorItem = nil } onCancel: { - isPresentingEditor = false + editorItem = nil } .frame(width: 420) } diff --git a/voxtral_realtime/macos/VoxtralRealtime/Views/SnippetManagementView.swift b/voxtral_realtime/macos/VoxtralRealtime/Views/SnippetManagementView.swift index 0507d320c..bcaafdd94 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Views/SnippetManagementView.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Views/SnippetManagementView.swift @@ -8,12 +8,16 @@ import SwiftUI +private 
struct SnippetEditorItem: Identifiable { + let id = UUID() + let snippet: Snippet + let isEditing: Bool +} + struct SnippetManagementView: View { @Environment(SnippetStore.self) private var snippetStore @State private var searchText = "" - @State private var editingSnippet = Snippet() - @State private var editingSnippetID: UUID? - @State private var isPresentingEditor = false + @State private var editorItem: SnippetEditorItem? var body: some View { VStack(alignment: .leading, spacing: 12) { @@ -21,9 +25,7 @@ struct SnippetManagementView: View { TextField("Search snippets", text: $searchText) .textFieldStyle(.roundedBorder) Button("Add") { - editingSnippet = Snippet() - editingSnippetID = nil - isPresentingEditor = true + editorItem = SnippetEditorItem(snippet: Snippet(), isEditing: false) } .buttonStyle(.borderedProminent) } @@ -65,17 +67,13 @@ struct SnippetManagementView: View { } Spacer() Button("Edit") { - editingSnippet = snippet - editingSnippetID = snippet.id - isPresentingEditor = true + editorItem = SnippetEditorItem(snippet: snippet, isEditing: true) } .buttonStyle(.borderless) } .contextMenu { Button("Edit") { - editingSnippet = snippet - editingSnippetID = snippet.id - isPresentingEditor = true + editorItem = SnippetEditorItem(snippet: snippet, isEditing: true) } Button(snippet.isEnabled ? 
"Disable" : "Enable") { snippetStore.toggleEnabled(for: snippet.id) @@ -90,16 +88,16 @@ struct SnippetManagementView: View { .listStyle(.inset) } } - .sheet(isPresented: $isPresentingEditor) { - SnippetEditor(snippet: editingSnippet, isEditing: editingSnippetID != nil) { snippet in - if editingSnippetID == nil { - snippetStore.add(snippet) - } else { + .sheet(item: $editorItem) { item in + SnippetEditor(snippet: item.snippet, isEditing: item.isEditing) { snippet in + if item.isEditing { snippetStore.update(snippet) + } else { + snippetStore.add(snippet) } - isPresentingEditor = false + editorItem = nil } onCancel: { - isPresentingEditor = false + editorItem = nil } .frame(width: 480, height: 460) } From f0a0ddd0411a991ce0aef3007b86ad86a01c10a8 Mon Sep 17 00:00:00 2001 From: Young Han Date: Thu, 26 Mar 2026 13:44:59 -0700 Subject: [PATCH 19/23] Allow snippet expansion by direct trigger match without prefix Made-with: Cursor --- .../macos/VoxtralRealtime/Services/TextPipeline.swift | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/voxtral_realtime/macos/VoxtralRealtime/Services/TextPipeline.swift b/voxtral_realtime/macos/VoxtralRealtime/Services/TextPipeline.swift index 3da96dc52..e37ddf28f 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Services/TextPipeline.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Services/TextPipeline.swift @@ -163,6 +163,15 @@ final class TextPipeline { } } + let directMatch = lowered + .trimmingCharacters(in: .punctuationCharacters) + if let snippet = snippetStore.snippets.first(where: { + $0.isEnabled && $0.trigger.compare(directMatch, options: .caseInsensitive) == .orderedSame + }) { + snippetStore.markUsed(snippet.id) + return (snippet.content, [snippet.id]) + } + return (text, []) } From d841430094b2c278db3a99939c91ba422ce9d03b Mon Sep 17 00:00:00 2001 From: Young Han Date: Thu, 26 Mar 2026 13:46:59 -0700 Subject: [PATCH 20/23] Simplify snippet matching to direct trigger only Made-with: Cursor --- 
.../Services/TextPipeline.swift | 21 +++---------------- 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/voxtral_realtime/macos/VoxtralRealtime/Services/TextPipeline.swift b/voxtral_realtime/macos/VoxtralRealtime/Services/TextPipeline.swift index e37ddf28f..b28883ae1 100644 --- a/voxtral_realtime/macos/VoxtralRealtime/Services/TextPipeline.swift +++ b/voxtral_realtime/macos/VoxtralRealtime/Services/TextPipeline.swift @@ -146,27 +146,12 @@ final class TextPipeline { return (text, []) } - let normalized = text.trimmingCharacters(in: .whitespacesAndNewlines) - let lowered = normalized.lowercased() + let trigger = text.trimmingCharacters(in: .whitespacesAndNewlines) + .lowercased() .trimmingCharacters(in: .punctuationCharacters) - let commandPrefixes = ["insert snippet ", "snippet ", "template "] - - for prefix in commandPrefixes where lowered.hasPrefix(prefix) { - let requestedTrigger = String(lowered.dropFirst(prefix.count)) - .trimmingCharacters(in: .whitespacesAndNewlines) - .trimmingCharacters(in: .punctuationCharacters) - if let snippet = snippetStore.snippets.first(where: { - $0.isEnabled && $0.trigger.compare(requestedTrigger, options: .caseInsensitive) == .orderedSame - }) { - snippetStore.markUsed(snippet.id) - return (snippet.content, [snippet.id]) - } - } - let directMatch = lowered - .trimmingCharacters(in: .punctuationCharacters) if let snippet = snippetStore.snippets.first(where: { - $0.isEnabled && $0.trigger.compare(directMatch, options: .caseInsensitive) == .orderedSame + $0.isEnabled && $0.trigger.compare(trigger, options: .caseInsensitive) == .orderedSame }) { snippetStore.markUsed(snippet.id) return (snippet.content, [snippet.id]) From f3acd804438e38c0e9acedda5eaf03d07c8fb40d Mon Sep 17 00:00:00 2001 From: Young Han Date: Thu, 26 Mar 2026 13:57:55 -0700 Subject: [PATCH 21/23] Update README with replacements, snippets, wake, and sidebar docs Made-with: Cursor --- voxtral_realtime/macos/README.md | 108 +++++++++++++++++++++++++++---- 1 
file changed, 95 insertions(+), 13 deletions(-) diff --git a/voxtral_realtime/macos/README.md b/voxtral_realtime/macos/README.md index b85858b8b..2e2a19dfe 100644 --- a/voxtral_realtime/macos/README.md +++ b/voxtral_realtime/macos/README.md @@ -8,10 +8,14 @@ https://github.com/user-attachments/assets/6d6089fc-5feb-458b-a60b-08379855976a - **Live transcription** — real-time token streaming with audio waveform visualization - **System-wide dictation** — press `Ctrl+Space` in any app to transcribe speech and auto-paste the result +- **"Hey torch" voice wake** — hands-free dictation via Silero VAD speech detection and wake phrase matching +- **Text replacements** — auto-correct names, acronyms, and domain terms after transcription +- **Snippets** — say a trigger phrase (e.g., "email signature") to paste pre-written templates - **Model preloading** — load the model once, transcribe instantly across sessions - **Pause / resume** — pause and resume within the same session without losing context -- **Session history** — searchable history with rename, copy, and delete -- **Silence detection** — dictation auto-stops after 2 seconds of silence +- **Session history** — searchable history with pinning, recency grouping, rename, copy, and multi-format export (.txt, .json, .srt) +- **Silence detection** — dictation auto-stops after a configurable silence timeout +- **Sidebar navigation** — Home, Replacements, Snippets, Wake, and Settings pages - **Self-contained DMG** — runner binary, model weights, and runtime libraries all bundled ## Download @@ -40,9 +44,39 @@ https://github.com/user-attachments/assets/6d6089fc-5feb-458b-a60b-08379855976a 2. Focus any text field in any app (Notes, Slack, browser, etc.) 3. Press **`Ctrl+Space`** — a floating overlay appears with a waveform 4. Speak — live transcribed text appears in the overlay -5. Press **`Ctrl+Space`** again to stop, or wait for 2 seconds of silence +5. Press **`Ctrl+Space`** again to stop, or wait for silence auto-stop 6.
The transcribed text is automatically pasted into the focused text field +### Voice wake ("Hey torch") + +1. Open the **Wake** page in the sidebar and enable it (or press `Ctrl+Shift+W`) +2. Say **"Hey torch"** — Silero VAD detects your speech, Voxtral checks for the wake keyword +3. If matched, the dictation panel appears and you can start speaking +4. Dictation auto-pastes when you stop speaking, then wake listening resumes + +The wake keyword is configurable in the Wake settings (default: "torch"). The app accumulates the full speech segment before checking, so you can say "hey torch" naturally as one phrase. + +### Replacements + +Add text replacements in the **Replacements** page. Each entry has a trigger and replacement — when the trigger appears in transcribed text, it's automatically replaced. + +Examples: +- `mtia` → `MTIA` +- `executorch` → `ExecuTorch` +- `execute torch` → `ExecuTorch` + +Supports case-preserving matching and word boundary options. + +### Snippets + +Add voice-triggered templates in the **Snippets** page. When your entire dictation matches a snippet trigger, the template content is pasted instead. + +Example: say **"email signature"** to paste: +``` +Best, +Younghan +``` + ### Keyboard shortcuts | Shortcut | Action | @@ -53,7 +87,7 @@ https://github.com/user-attachments/assets/6d6089fc-5feb-458b-a60b-08379855976a | `Cmd+Shift+C` | Copy transcript | | `Cmd+Shift+U` | Unload model | | `Ctrl+Space` | Toggle system-wide dictation | -| `Cmd+,` | Settings | +| `Ctrl+Shift+W` | Toggle voice wake on/off | --- @@ -129,7 +163,19 @@ The runner binary will be at: ${EXECUTORCH_PATH}/cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner ``` -#### 4. Install Python packages +#### 4. Build the Silero VAD stream runner (for voice wake) + +```bash +cd ${EXECUTORCH_PATH} +make silero-vad-cpu +``` + +This builds `silero_vad_stream_runner` at: +``` +${EXECUTORCH_PATH}/cmake-out/examples/models/silero_vad/silero_vad_stream_runner +``` + +#### 5. 
Install Python packages ```bash pip install huggingface_hub sounddevice @@ -138,7 +184,7 @@ pip install huggingface_hub sounddevice - `huggingface_hub` — to download model artifacts from HuggingFace - `sounddevice` — for the CLI mic streaming test script -#### 5. Download model artifacts +#### 6. Download model artifacts ```bash export LOCAL_FOLDER="$HOME/voxtral_realtime_quant_metal" @@ -152,7 +198,13 @@ This downloads three files (~6.2 GB total): HuggingFace repo: [`mistralai/Voxtral-Mini-4B-Realtime-2602-ExecuTorch`](https://huggingface.co/mistral-labs/Voxtral-Mini-4B-Realtime-2602-ExecuTorch) -#### 6. Test with CLI (optional) +Download the Silero VAD model: + +```bash +hf download younghan-meta/Silero-VAD-ExecuTorch-XNNPACK --local-dir ~/silero_vad_xnnpack +``` + +#### 7. Test with CLI (optional) Verify the runner works before building the app: @@ -170,7 +222,7 @@ cd ${LOCAL_FOLDER} && chmod +x stream_audio.py --mic ``` -#### 7. Build the app and create DMG +#### 8. Build the app and create DMG ```bash cd voxtral_realtime/macos @@ -198,6 +250,7 @@ VoxtralRealtimeApp ├── TranscriptStore (@Observable, @MainActor) │ ├── SessionState: idle → loading → transcribing ⇆ paused → idle │ ├── ModelState: unloaded → loading → ready +│ ├── TextPipeline: snippets → replacements → style (no-op) │ └── RunnerBridge (actor) │ ├── Process (voxtral_realtime_runner) │ │ ├── stdin ← raw 16kHz mono f32le PCM @@ -207,9 +260,21 @@ VoxtralRealtimeApp │ └── AVAudioEngine → format conversion → pipe ├── DictationManager (@Observable, @MainActor) │ ├── Global hotkey (Carbon RegisterEventHotKey) +│ ├── VadService (actor) +│ │ ├── Process (silero_vad_stream_runner) +│ │ │ ├── stdin ← 16kHz mono f32le PCM +│ │ │ └── stdout → PROB