1+ const _ = require ( 'lodash' )
2+ const { createClient } = require ( '@deepgram/sdk' )
3+ const { PassThrough } = require ( 'stream' )
4+ const EventEmitter = require ( 'events' )
5+ const axios = require ( 'axios' )
6+ const debug = require ( 'debug' ) ( 'botium-speech-processing-deepgram-stt' )
7+
8+ const { deepgramOptions } = require ( '../utils' )
9+
// Language codes this provider recognises. Used both to filter tokens scraped
// from the documentation page and as the static fallback list.
const DEEPGRAM_KNOWN_LANGUAGES = [
  'af', 'ar', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'en-AU', 'en-GB', 'en-IN', 'en-NZ', 'en-US',
  'es', 'es-419', 'et', 'fa', 'fi', 'fr', 'fr-CA', 'he', 'hi', 'hr', 'hu', 'id', 'it', 'ja',
  'ko', 'lt', 'lv', 'ms', 'nl', 'no', 'pl', 'pt', 'pt-BR', 'pt-PT', 'ro', 'ru', 'sk', 'sl', 'sr', 'sv',
  'sw', 'ta', 'th', 'tr', 'uk', 'ur', 'vi', 'zh', 'zh-CN', 'zh-TW'
]

// Value of connection.getReadyState() for which audio may be sent.
const DEEPGRAM_SOCKET_OPEN = 1

/**
 * Speech-to-text adapter built on the Deepgram SDK client (`createClient`).
 *
 * Offers a language listing, a streaming session (`stt_OpenStream`) and a
 * one-shot transcription of an audio buffer (`stt`).
 */
class DeepgramSTT {
  /**
   * Scrapes the Deepgram documentation page for language codes.
   *
   * The page is scanned for tokens shaped like `xx`, `xx-YY` or `xx-999`, and
   * only tokens present in DEEPGRAM_KNOWN_LANGUAGES are kept. Because plain
   * two-letter words on the page (e.g. "no", "it", "id") also match, the
   * result is a heuristic, not an authoritative list.
   *
   * @returns {Promise<string[]|null>} sorted unique codes, or null when the
   *   request fails or nothing was recognised
   */
  async _fetchLanguagesFromDocs () {
    try {
      const response = await axios.get('https://developers.deepgram.com/docs/models-languages-overview', {
        timeout: 5000,
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
      })

      // Only scan textual payloads; anything else cannot contain language codes.
      const html = typeof response.data === 'string' ? response.data : ''
      const known = new Set(DEEPGRAM_KNOWN_LANGUAGES)
      const found = new Set()

      // The region part is either two upper-case letters (en-US) or a
      // three-digit UN M.49 area code (es-419).
      const languagePattern = /\b([a-z]{2}(?:-(?:[A-Z]{2}|\d{3}))?)\b/g
      let match
      while ((match = languagePattern.exec(html)) !== null) {
        if (known.has(match[1])) {
          found.add(match[1])
        }
      }

      const languageArray = Array.from(found).sort()
      debug(`Fetched ${languageArray.length} languages from Deepgram STT documentation`)

      return languageArray.length > 0 ? languageArray : null
    } catch (err) {
      // Best effort: the caller falls back to the static list.
      debug(`Failed to fetch languages from documentation: ${err.message}`)
      return null
    }
  }

  /**
   * Lists supported language codes.
   *
   * @param {object} req incoming request (not used here)
   * @returns {Promise<string[]>} codes found in the documentation, or the
   *   static list when the documentation lookup yields nothing
   */
  async languages (req) {
    const docLanguages = await this._fetchLanguagesFromDocs()
    if (docLanguages && docLanguages.length > 0) {
      return docLanguages
    }

    debug('Using fallback static language list')
    // Return a copy so callers cannot mutate the shared list.
    return DEEPGRAM_KNOWN_LANGUAGES.slice()
  }

  /**
   * Merges configuration overrides into `target`, in increasing priority:
   * JSON from BOTIUM_SPEECH_DEEPGRAM_CONFIG, then `req.body.deepgram.config`.
   *
   * @param {object} req incoming request
   * @param {object} target option object to extend (mutated and returned)
   * @returns {object} `target`
   * @throws {Error} when BOTIUM_SPEECH_DEEPGRAM_CONFIG is not valid JSON
   */
  _applyConfigOverrides (req, target) {
    if (process.env.BOTIUM_SPEECH_DEEPGRAM_CONFIG) {
      try {
        Object.assign(target, JSON.parse(process.env.BOTIUM_SPEECH_DEEPGRAM_CONFIG))
      } catch (err) {
        throw new Error(`Deepgram config in BOTIUM_SPEECH_DEEPGRAM_CONFIG invalid: ${err.message}`)
      }
    }

    if (req && req.body && req.body.deepgram && req.body.deepgram.config) {
      Object.assign(target, req.body.deepgram.config)
    }
    return target
  }

  /**
   * Opens a live transcription session.
   *
   * @param {object} req incoming request; passed to `deepgramOptions` and
   *   inspected for `body.deepgram.config`
   * @param {object} params
   * @param {string} params.language language code sent to Deepgram
   * @returns {Promise<object>} handle with:
   *   - `events`: EventEmitter emitting `data` (transcript or error events)
   *     and `close`
   *   - `write(buffer)`: sends audio; chunks written before the socket is
   *     open are queued and flushed once it opens
   *   - `end()`: finishes the stream (deferred until queued audio is flushed)
   *   - `close()`: finishes the stream and releases connection and history
   *   - `triggerHistoryEmit()`: re-emits every recorded `data` event
   * @throws {Error} when no API key is configured, the env config is invalid,
   *   or the connection cannot be set up
   */
  async stt_OpenStream (req, { language }) {
    const options = deepgramOptions(req)
    if (!options.apiKey) {
      throw new Error('Deepgram API key not configured')
    }

    const deepgram = createClient(options.apiKey)

    const streamOptions = this._applyConfigOverrides(req, {
      model: 'general',
      language: language,
      smart_format: true,
      punctuate: true,
      interim_results: true,
      utterance_end_ms: 1000,
      vad_events: true
    })

    const events = new EventEmitter()
    let eventHistory = []
    let connection = null
    // Audio written before the socket is open; null once flushed or closed.
    let pendingAudio = []
    // Set when end() is called while audio is still queued.
    let finishAfterFlush = false

    const publish = (event) => {
      events.emit('data', event)
      if (eventHistory) {
        eventHistory.push(event)
      }
    }

    const flushPendingAudio = () => {
      if (!pendingAudio || !connection) return
      const queued = pendingAudio
      pendingAudio = null
      for (const chunk of queued) {
        connection.send(chunk)
      }
    }

    try {
      connection = deepgram.listen.live(streamOptions)

      connection.on('open', () => {
        debug('Deepgram WebSocket opened')
        flushPendingAudio()
        if (finishAfterFlush && connection) {
          finishAfterFlush = false
          connection.finish()
        }
      })

      connection.on('Results', (data) => {
        // Tolerate payloads without channel/alternatives instead of throwing
        // inside the event handler.
        const alternatives = data && data.channel && data.channel.alternatives
        const result = alternatives && alternatives[0]
        if (!result || !result.transcript) return

        const event = {
          status: 'ok',
          text: result.transcript,
          final: data.is_final || false,
          debug: data
        }

        // A start offset of 0 is valid, so test for numbers rather than truthiness.
        if (Number.isFinite(data.start) && Number.isFinite(data.duration)) {
          event.start = _.round(data.start, 3)
          event.end = _.round(data.start + data.duration, 3)
        }

        publish(event)
      })

      connection.on('UtteranceEnd', () => {
        debug('Deepgram utterance end detected')
      })

      connection.on('error', (err) => {
        publish({
          status: 'error',
          err: `Deepgram STT failed: ${(err && err.message) || err}`
        })
      })

      connection.on('close', () => {
        debug('Deepgram WebSocket closed')
        // Nothing can be delivered any more; drop queued audio.
        pendingAudio = null
        events.emit('close')
      })
    } catch (err) {
      debug(err)
      throw new Error(`Deepgram STT streaming setup failed: ${err.message}`)
    }

    return {
      events,
      write: (buffer) => {
        if (!connection) return
        if (connection.getReadyState() === DEEPGRAM_SOCKET_OPEN) {
          // Keep ordering if the socket opened before the 'open' event fired.
          flushPendingAudio()
          connection.send(buffer)
        } else if (pendingAudio) {
          pendingAudio.push(buffer)
        }
      },
      end: () => {
        if (!connection) return
        if (connection.getReadyState() === DEEPGRAM_SOCKET_OPEN) {
          flushPendingAudio()
          connection.finish()
        } else if (pendingAudio && pendingAudio.length > 0) {
          // Finishing now would discard the queued audio; wait for 'open'.
          finishAfterFlush = true
        } else {
          connection.finish()
        }
      },
      close: () => {
        pendingAudio = null
        finishAfterFlush = false
        if (connection) {
          connection.finish()
          connection = null
        }
        eventHistory = null
      },
      triggerHistoryEmit: () => {
        // History is released by close(); emit nothing in that case.
        for (const eh of eventHistory || []) {
          events.emit('data', eh)
        }
      }
    }
  }

  /**
   * Transcribes a complete audio buffer.
   *
   * @param {object} req incoming request; passed to `deepgramOptions` and
   *   inspected for `body.deepgram.config`
   * @param {object} params
   * @param {string} params.language language code sent to Deepgram
   * @param {Buffer} params.buffer audio data
   * @param {string} [params.hint] optional term passed as Deepgram `search`
   * @returns {Promise<{text: string, debug: object}>} transcript of the first
   *   alternative of the first channel ('' when absent) and the raw response
   * @throws {Error} when no API key is configured, the env config is invalid,
   *   or the Deepgram call fails or reports an error
   */
  async stt (req, { language, buffer, hint }) {
    const options = deepgramOptions(req)
    if (!options.apiKey) {
      throw new Error('Deepgram API key not configured')
    }

    const deepgram = createClient(options.apiKey)

    const transcribeOptions = {
      model: 'general',
      language: language,
      smart_format: true,
      punctuate: true
    }

    if (hint && hint.length > 0) {
      transcribeOptions.search = [hint]
    }

    this._applyConfigOverrides(req, transcribeOptions)

    try {
      debug(`Calling Deepgram API with options: ${JSON.stringify(transcribeOptions)}`)

      const response = await deepgram.listen.prerecorded.transcribeFile(
        buffer,
        transcribeOptions
      )

      debug(`Deepgram response: ${JSON.stringify(response, null, 2)}`)

      // The client resolves with { result, error }: API failures are returned
      // in `error` rather than thrown, so surface them explicitly.
      if (response && response.error) {
        throw response.error
      }

      // Accept both the { result } envelope and a bare payload.
      const payload = (response && response.result) || response
      const channels = payload && payload.results && payload.results.channels
      const firstChannel = channels && channels[0]
      const firstAlternative = firstChannel && firstChannel.alternatives && firstChannel.alternatives[0]

      return {
        text: (firstAlternative && firstAlternative.transcript) || '',
        debug: response
      }
    } catch (err) {
      debug(err)
      throw new Error(`Deepgram STT failed: ${err.message || err}`)
    }
  }
}
252+
// Expose the Deepgram speech-to-text provider class as this module's export.
module.exports = DeepgramSTT
0 commit comments