Skip to content

Commit 45abbb1

Browse files
authored
Merge pull request #37 from codeforequity-at/deepgram-stt
Deepgram stt
2 parents 8dad544 + a45c9d9 commit 45abbb1

File tree

7 files changed

+459
-9
lines changed

7 files changed

+459
-9
lines changed

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,10 @@ For the major cloud providers there are additional docker-compose files. If usin
5757

5858
> docker-compose -f docker-compose-azure.yml up -d
5959

60+
For Deepgram, add your API key to the file *docker-compose-deepgram.yml* and start the services:
61+
62+
> docker-compose -f docker-compose-deepgram.yml up -d
63+
6064
### Optional: Build Docker Images
6165

6266
You can optionally build your own docker images (if you made any changes in this repository, for instance to download the latest version of a model). Clone or download this repository and run docker-compose:

docker-compose-deepgram.yml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Docker Compose setup for running the Botium speech frontend with Deepgram as
# the speech-to-text provider. Add your Deepgram API key below before starting:
#   docker-compose -f docker-compose-deepgram.yml up -d
version: '3'
services:
  # Reverse proxy in front of the frontend service, configured by ./nginx.conf.
  nginx:
    image: nginx
    restart: always
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf
    ports:
      # Quoted so the HOST:CONTAINER mapping is always read as a string.
      - "80:80"
  frontend:
    image: botium/botium-speech-frontend:latest
    restart: always
    environment:
      BOTIUM_API_TOKENS:
      # NOTE(review): TTS still points at google although this file configures no
      # Google credentials - confirm this is intended.
      BOTIUM_SPEECH_PROVIDER_TTS: google
      BOTIUM_SPEECH_PROVIDER_STT: deepgram
      # Deepgram API key - fill in before starting the services.
      BOTIUM_SPEECH_DEEPGRAM_API_KEY:
    volumes:
      - "./frontend/resources:/app/resources"

frontend/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
"@aws-sdk/client-transcribe": "^3.775.0",
1919
"@aws-sdk/client-transcribe-streaming": "^3.775.0",
2020
"@aws-sdk/util-endpoints": "^3.775.0",
21+
"@deepgram/sdk": "^3.10.1",
2122
"@google-cloud/speech": "^7.0.1",
2223
"@google-cloud/storage": "^7.15.2",
2324
"@google-cloud/text-to-speech": "^6.0.1",

frontend/src/routes.js

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@ const ttsEngines = {
3939
google: new (require('./tts/google'))(),
4040
ibm: new (require('./tts/ibm'))(),
4141
azure: new (require('./tts/azure'))(),
42-
polly: new (require('./tts/polly'))()
42+
polly: new (require('./tts/polly'))(),
43+
deepgram: new (require('./tts/deepgram'))()
4344
// marytts: new (require('./tts/marytts'))(),
4445
// picotts: new (require('./tts/picotts'))()
4546
}
@@ -48,7 +49,8 @@ const sttEngines = {
4849
// kaldi: new (require('./stt/kaldi'))(),
4950
ibm: new (require('./stt/ibm'))(),
5051
azure: new (require('./stt/azure'))(),
51-
awstranscribe: new (require('./stt/awstranscribe'))()
52+
awstranscribe: new (require('./stt/awstranscribe'))(),
53+
deepgram: new (require('./stt/deepgram'))()
5254
}
5355

5456
const multerMemoryStorage = multer.memoryStorage()
@@ -144,7 +146,7 @@ const router = express.Router()
144146
* required: false
145147
* schema:
146148
* type: string
147-
* enum: [google, ibm, azure, awstranscribe]
149+
* enum: [google, ibm, azure, awstranscribe, deepgram]
148150
* responses:
149151
* 200:
150152
* description: List of supported STT languages
@@ -196,7 +198,7 @@ const router = express.Router()
196198
* required: false
197199
* schema:
198200
* type: string
199-
* enum: [google, ibm, azure, awstranscribe]
201+
* enum: [google, ibm, azure, awstranscribe, deepgram]
200202
* - name: cache
201203
* description: Use result cache (default Y)
202204
* in: query
@@ -299,7 +301,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
299301
* required: false
300302
* schema:
301303
* type: string
302-
* enum: [google, ibm, azure, polly]
304+
* enum: [google, ibm, azure, polly, deepgram]
303305
* responses:
304306
* 200:
305307
* description: List of supported voices
@@ -340,7 +342,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
340342
* required: false
341343
* schema:
342344
* type: string
343-
* enum: [google, ibm, azure, polly]
345+
* enum: [google, ibm, azure, polly, deepgram]
344346
* responses:
345347
* 200:
346348
* description: List of supported TTS languages
@@ -395,7 +397,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
395397
* required: false
396398
* schema:
397399
* type: string
398-
* enum: [google, ibm, azure, polly]
400+
* enum: [google, ibm, azure, polly, deepgram]
399401
* - name: cache
400402
* description: Use result cache (default Y)
401403
* in: query
@@ -783,7 +785,7 @@ const wssStreams = {}
783785
* required: false
784786
* schema:
785787
* type: string
786-
* enum: [google, ibm, azure, awstranscribe]
788+
* enum: [google, ibm, azure, awstranscribe, deepgram]
787789
* responses:
788790
* 200:
789791
* description: Websocket Url to stream the audio to, and the uri to check status and end the stream

frontend/src/stt/deepgram.js

Lines changed: 253 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,253 @@
1+
const _ = require('lodash')
2+
const { createClient } = require('@deepgram/sdk')
3+
const { PassThrough } = require('stream')
4+
const EventEmitter = require('events')
5+
const axios = require('axios')
6+
const debug = require('debug')('botium-speech-processing-deepgram-stt')
7+
8+
const { deepgramOptions } = require('../utils')
9+
10+
// Language codes this adapter is prepared to report. Used as the allow-list when
// scanning the Deepgram documentation and as the static fallback when the
// documentation cannot be fetched or yields nothing.
const DEEPGRAM_KNOWN_LANGUAGES = [
  'af', 'ar', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'en-AU', 'en-GB', 'en-IN', 'en-NZ', 'en-US',
  'es', 'es-419', 'et', 'fa', 'fi', 'fr', 'fr-CA', 'he', 'hi', 'hr', 'hu', 'id', 'it', 'ja',
  'ko', 'lt', 'lv', 'ms', 'nl', 'no', 'pl', 'pt', 'pt-BR', 'pt-PT', 'ro', 'ru', 'sk', 'sl', 'sr', 'sv',
  'sw', 'ta', 'th', 'tr', 'uk', 'ur', 'vi', 'zh', 'zh-CN', 'zh-TW'
]

// Matches "xx", "xx-YY" and numeric-region codes such as "es-419".
const DEEPGRAM_LANGUAGE_PATTERN = /\b[a-z]{2}(?:-(?:[A-Z]{2}|\d{3}))?\b/g

/**
 * Speech-to-text adapter for Deepgram, offering a language listing, one-shot
 * transcription of an audio buffer and live (streaming) transcription.
 */
class DeepgramSTT {
  /**
   * Scans the Deepgram "models & languages" documentation page for language codes.
   *
   * This is a heuristic: every token in the page that looks like a language code
   * AND is part of DEEPGRAM_KNOWN_LANGUAGES is reported, so short markup tokens
   * (e.g. "id", "tr") can produce false positives.
   *
   * @returns {Promise<string[]|null>} sorted, de-duplicated codes, or null when the
   *   page could not be fetched or contained no known code.
   */
  async _fetchLanguagesFromDocs () {
    try {
      const response = await axios.get('https://developers.deepgram.com/docs/models-languages-overview', {
        timeout: 5000,
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
      })

      // String.prototype.match with a /g pattern returns every match and leaves
      // no lastIndex state behind on the shared pattern.
      const candidates = String(response.data).match(DEEPGRAM_LANGUAGE_PATTERN) || []
      const known = new Set(DEEPGRAM_KNOWN_LANGUAGES)
      const found = Array.from(new Set(candidates.filter((code) => known.has(code)))).sort()

      debug(`Fetched ${found.length} languages from Deepgram STT documentation`)
      return found.length > 0 ? found : null
    } catch (err) {
      // Best effort only: the caller falls back to the static list.
      debug(`Failed to fetch languages from documentation: ${err.message}`)
      return null
    }
  }

  /**
   * Lists the supported language codes.
   *
   * @param {object} req - incoming request (not used here).
   * @returns {Promise<string[]>} codes found in the documentation, or the static
   *   known-language list when the documentation lookup fails.
   */
  async languages (req) {
    const docLanguages = await this._fetchLanguagesFromDocs()
    if (docLanguages && docLanguages.length > 0) {
      return docLanguages
    }

    // Fallback to the static list if documentation parsing fails
    debug('Using fallback static language list')
    return DEEPGRAM_KNOWN_LANGUAGES.slice().sort()
  }

  /**
   * Merges configuration overrides into the given Deepgram options (in place).
   * Precedence, lowest to highest: built-in defaults, the JSON in the
   * BOTIUM_SPEECH_DEEPGRAM_CONFIG environment variable, req.body.deepgram.config.
   *
   * @param {object} req - incoming request; may carry body.deepgram.config.
   * @param {object} options - options object to extend.
   * @returns {object} the same options object.
   * @throws {Error} when BOTIUM_SPEECH_DEEPGRAM_CONFIG is not valid JSON.
   */
  _applyConfigOverrides (req, options) {
    const envConfig = process.env.BOTIUM_SPEECH_DEEPGRAM_CONFIG
    if (envConfig) {
      let defaultConfig
      try {
        defaultConfig = JSON.parse(envConfig)
      } catch (err) {
        throw new Error(`Deepgram config in BOTIUM_SPEECH_DEEPGRAM_CONFIG invalid: ${err.message}`)
      }
      Object.assign(options, defaultConfig)
    }

    const requestConfig = req && req.body && req.body.deepgram && req.body.deepgram.config
    if (requestConfig) {
      Object.assign(options, requestConfig)
    }
    return options
  }

  /**
   * Opens a live transcription connection to Deepgram.
   *
   * @param {object} req - incoming request, passed to deepgramOptions() and
   *   inspected for body.deepgram.config overrides.
   * @param {object} params
   * @param {string} params.language - language code sent to Deepgram.
   * @returns {Promise<object>} handle with:
   *   - events: EventEmitter emitting 'data' ({ status, text, final, start, end, debug }
   *     or { status: 'error', err }) and 'close'
   *   - write(buffer): forwards audio to Deepgram
   *   - end(): asks Deepgram to finish the stream
   *   - close(): finishes the stream and releases the event history
   *   - triggerHistoryEmit(): re-emits every event recorded so far
   * @throws {Error} when no API key is configured, the env config is invalid, or
   *   the connection cannot be set up.
   */
  async stt_OpenStream (req, { language }) {
    const options = deepgramOptions(req)
    if (!options.apiKey) {
      throw new Error('Deepgram API key not configured')
    }

    const deepgram = createClient(options.apiKey)

    const streamOptions = this._applyConfigOverrides(req, {
      model: 'general',
      language: language,
      smart_format: true,
      punctuate: true,
      interim_results: true,
      utterance_end_ms: 1000,
      vad_events: true
    })

    const events = new EventEmitter()
    let eventHistory = []
    let connection = null

    // Emits an event to listeners and records it for later replay (until close()).
    const publish = (event) => {
      events.emit('data', event)
      if (eventHistory) {
        eventHistory.push(event)
      }
    }

    try {
      connection = deepgram.listen.live(streamOptions)

      connection.on('open', () => {
        debug('Deepgram WebSocket opened')
      })

      connection.on('Results', (data) => {
        // Guard every level: a malformed message must not throw inside the
        // connection's event dispatch.
        const alternatives = data && data.channel && data.channel.alternatives
        const result = alternatives && alternatives[0]
        if (!result || !result.transcript) {
          return
        }

        const event = {
          status: 'ok',
          text: result.transcript,
          final: data.is_final || false,
          debug: data
        }

        // Type check instead of truthiness: a start offset of 0 is valid.
        if (typeof data.start === 'number' && typeof data.duration === 'number') {
          event.start = _.round(data.start, 3)
          event.end = _.round(data.start + data.duration, 3)
        }

        publish(event)
      })

      connection.on('UtteranceEnd', () => {
        debug('Deepgram utterance end detected')
      })

      connection.on('error', (err) => {
        publish({
          status: 'error',
          err: `Deepgram STT failed: ${err.message || err}`
        })
      })

      connection.on('close', () => {
        debug('Deepgram WebSocket closed')
        events.emit('close')
      })
    } catch (err) {
      debug(err)
      throw new Error(`Deepgram STT streaming setup failed: ${err.message}`)
    }

    return {
      events,
      write: (buffer) => {
        // Only forwarded while the ready state is 1; audio written at any other
        // time (e.g. before the socket has opened) is silently dropped.
        if (connection && connection.getReadyState() === 1) {
          connection.send(buffer)
        }
      },
      end: () => {
        if (connection) {
          connection.finish()
        }
      },
      close: () => {
        if (connection) {
          connection.finish()
          connection = null
        }
        eventHistory = null
      },
      triggerHistoryEmit: () => {
        // The history is released by close(); replaying afterwards is a no-op.
        for (const eh of eventHistory || []) {
          events.emit('data', eh)
        }
      }
    }
  }

  /**
   * Transcribes a complete audio buffer with Deepgram.
   *
   * @param {object} req - incoming request, passed to deepgramOptions() and
   *   inspected for body.deepgram.config overrides.
   * @param {object} params
   * @param {string} params.language - language code sent to Deepgram.
   * @param {Buffer} params.buffer - audio data to transcribe.
   * @param {string} [params.hint] - optional term, sent as Deepgram "search" option.
   * @returns {Promise<{text: string, debug: object}>} transcript of the first
   *   alternative of the first channel ('' when absent) plus the raw SDK response.
   * @throws {Error} when no API key is configured, the env config is invalid, or
   *   the Deepgram call fails or reports an error.
   */
  async stt (req, { language, buffer, hint }) {
    const options = deepgramOptions(req)
    if (!options.apiKey) {
      throw new Error('Deepgram API key not configured')
    }

    const deepgram = createClient(options.apiKey)

    const transcribeOptions = {
      model: 'general',
      language: language,
      smart_format: true,
      punctuate: true
    }

    // Add search terms if hint is provided
    if (hint && hint.length > 0) {
      transcribeOptions.search = [hint]
    }

    this._applyConfigOverrides(req, transcribeOptions)

    try {
      debug(`Calling Deepgram API with options: ${JSON.stringify(transcribeOptions)}`)

      const response = await deepgram.listen.prerecorded.transcribeFile(
        buffer,
        transcribeOptions
      )

      debug(`Deepgram response: ${JSON.stringify(response, null, 2)}`)

      // The v3 SDK resolves to { result, error } instead of rejecting on API
      // errors; surface the error rather than returning an empty transcript.
      if (response && response.error) {
        throw response.error
      }

      // Accept both the wrapped ({ result }) and the bare payload shape.
      const payload = (response && response.result) || response
      const channels = payload && payload.results && payload.results.channels
      const channel = channels && channels[0]
      const alternative = channel && channel.alternatives && channel.alternatives[0]

      return {
        text: (alternative && alternative.transcript) || '',
        debug: response
      }
    } catch (err) {
      debug(err)
      throw new Error(`Deepgram STT failed: ${err.message || err}`)
    }
  }
}
252+
253+
module.exports = DeepgramSTT

0 commit comments

Comments
 (0)