Skip to content

Commit 42148df

Browse files
committed
deepgram stt support
1 parent 30381d9 commit 42148df

File tree

6 files changed

+248
-5
lines changed

6 files changed

+248
-5
lines changed

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,10 @@ For the major cloud providers there are additional docker-compose files. If usin
5757

5858
> docker-compose -f docker-compose-azure.yml up -d
5959

60+
For Deepgram, add your API key to the file *docker-compose-deepgram.yml* and start the services:
61+
62+
> docker-compose -f docker-compose-deepgram.yml up -d
63+
6064
### Optional: Build Docker Images
6165

6266
You can optionally build your own docker images (if you made any changes in this repository, for instance to download the latest version of a model). Clone or download this repository and run docker-compose:

docker-compose-deepgram.yml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
version: '3'
2+
services:
3+
nginx:
4+
image: nginx
5+
restart: always
6+
volumes:
7+
- ./nginx.conf:/etc/nginx/nginx.conf
8+
ports:
9+
- 80:80
10+
frontend:
11+
image: botium/botium-speech-frontend:latest
12+
restart: always
13+
environment:
14+
BOTIUM_API_TOKENS:
15+
BOTIUM_SPEECH_PROVIDER_TTS: google
16+
BOTIUM_SPEECH_PROVIDER_STT: deepgram
17+
BOTIUM_SPEECH_DEEPGRAM_API_KEY:
18+
volumes:
19+
- "./frontend/resources:/app/resources"

frontend/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
"@aws-sdk/client-transcribe": "^3.775.0",
1919
"@aws-sdk/client-transcribe-streaming": "^3.775.0",
2020
"@aws-sdk/util-endpoints": "^3.775.0",
21+
"@deepgram/sdk": "^3.10.1",
2122
"@google-cloud/speech": "^7.0.1",
2223
"@google-cloud/storage": "^7.15.2",
2324
"@google-cloud/text-to-speech": "^6.0.1",

frontend/src/routes.js

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,8 @@ const sttEngines = {
4848
// kaldi: new (require('./stt/kaldi'))(),
4949
ibm: new (require('./stt/ibm'))(),
5050
azure: new (require('./stt/azure'))(),
51-
awstranscribe: new (require('./stt/awstranscribe'))()
51+
awstranscribe: new (require('./stt/awstranscribe'))(),
52+
deepgram: new (require('./stt/deepgram'))()
5253
}
5354

5455
const multerMemoryStorage = multer.memoryStorage()
@@ -144,7 +145,7 @@ const router = express.Router()
144145
* required: false
145146
* schema:
146147
* type: string
147-
* enum: [google, ibm, azure, awstranscribe]
148+
* enum: [google, ibm, azure, awstranscribe, deepgram]
148149
* responses:
149150
* 200:
150151
* description: List of supported STT languages
@@ -196,7 +197,7 @@ const router = express.Router()
196197
* required: false
197198
* schema:
198199
* type: string
199-
* enum: [google, ibm, azure, awstranscribe]
200+
* enum: [google, ibm, azure, awstranscribe, deepgram]
200201
* - name: cache
201202
* description: Use result cache (default Y)
202203
* in: query
@@ -783,7 +784,7 @@ const wssStreams = {}
783784
* required: false
784785
* schema:
785786
* type: string
786-
* enum: [google, ibm, azure, awstranscribe]
787+
* enum: [google, ibm, azure, awstranscribe, deepgram]
787788
* responses:
788789
* 200:
789790
* description: Websocket Url to stream the audio to, and the uri to check status and end the stream

frontend/src/stt/deepgram.js

Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
const _ = require('lodash')
2+
const { createClient } = require('@deepgram/sdk')
3+
const { PassThrough } = require('stream')
4+
const EventEmitter = require('events')
5+
const debug = require('debug')('botium-speech-processing-deepgram-stt')
6+
7+
const { deepgramOptions } = require('../utils')
8+
9+
// WebSocket readyState values as reported by connection.getReadyState()
const DEEPGRAM_SOCKET_CONNECTING = 0
const DEEPGRAM_SOCKET_OPEN = 1

/**
 * Layers Deepgram option overrides on top of the given defaults.
 * Precedence (lowest to highest): defaults, JSON from the environment variable
 * BOTIUM_SPEECH_DEEPGRAM_CONFIG, then req.body.deepgram.config.
 *
 * @param {object} req - incoming request; only body.deepgram.config is read
 * @param {object} target - default options, mutated in place
 * @returns {object} the same target object with all overrides applied
 * @throws {Error} if BOTIUM_SPEECH_DEEPGRAM_CONFIG is not valid JSON
 */
const applyDeepgramConfig = (req, target) => {
  if (process.env.BOTIUM_SPEECH_DEEPGRAM_CONFIG) {
    try {
      Object.assign(target, JSON.parse(process.env.BOTIUM_SPEECH_DEEPGRAM_CONFIG))
    } catch (err) {
      throw new Error(`Deepgram config in BOTIUM_SPEECH_DEEPGRAM_CONFIG invalid: ${err.message}`)
    }
  }

  const requestConfig = _.get(req, 'body.deepgram.config')
  if (requestConfig) {
    Object.assign(target, requestConfig)
  }
  return target
}

/**
 * Speech-to-text engine backed by the Deepgram SDK.
 * Offers a one-shot transcription (stt) and a streaming transcription (stt_OpenStream).
 */
class DeepgramSTT {
  /**
   * Lists the language codes offered by this engine.
   * The list is hard-coded here (a subset of commonly used languages), not fetched from Deepgram.
   *
   * @param {object} req - incoming request (unused)
   * @returns {Promise<string[]>} language codes in sorted order
   */
  async languages (req) {
    return [
      'da', 'de', 'en', 'en-AU', 'en-GB', 'en-IN', 'en-NZ', 'en-US',
      'es', 'es-419', 'fi', 'fr', 'fr-CA', 'hi', 'id', 'it', 'ja',
      'ko', 'nl', 'no', 'pl', 'pt', 'pt-BR', 'pt-PT', 'ru', 'sv',
      'ta', 'th', 'tr', 'uk', 'zh', 'zh-CN', 'zh-TW'
    ].sort()
  }

  /**
   * Opens a live transcription connection and returns a handle to feed audio into it.
   *
   * Events emitted on the returned `events` emitter:
   *  - 'data' with { status: 'ok', text, final, debug, [start, end] } for each non-empty transcript
   *  - 'data' with { status: 'error', err } when the connection reports an error
   *  - 'close' when the connection is closed
   *
   * @param {object} req - incoming request (credentials and optional body.deepgram.config)
   * @param {object} params
   * @param {string} params.language - language code passed to Deepgram
   * @returns {Promise<{events: EventEmitter, write: Function, end: Function, close: Function, triggerHistoryEmit: Function}>}
   * @throws {Error} if no API key is available, the config is invalid, or the connection setup fails
   */
  async stt_OpenStream (req, { language }) {
    const options = deepgramOptions(req)
    if (!options.apiKey) {
      throw new Error('Deepgram API key not configured')
    }

    const deepgram = createClient(options.apiKey)

    const streamOptions = applyDeepgramConfig(req, {
      model: 'general',
      language,
      smart_format: true,
      punctuate: true,
      interim_results: true,
      utterance_end_ms: 1000,
      vad_events: true
    })

    const events = new EventEmitter()
    let eventHistory = []
    // Audio written before the socket is open; flushed in the 'open' handler
    let pendingBuffers = []
    let connection = null

    // Emits an event and records it for later replay (until close() drops the history)
    const publish = (event) => {
      events.emit('data', event)
      if (eventHistory) {
        eventHistory.push(event)
      }
    }

    try {
      connection = deepgram.listen.live(streamOptions)

      connection.on('open', () => {
        debug('Deepgram WebSocket opened')
        const queued = pendingBuffers
        pendingBuffers = []
        // connection is null if close() was called before the socket opened
        if (connection) {
          for (const queuedBuffer of queued) {
            connection.send(queuedBuffer)
          }
        }
      })

      connection.on('Results', (data) => {
        debug('Deepgram result: %j', data)
        // Payloads without channel/alternatives must not throw inside the event handler
        const result = _.get(data, 'channel.alternatives[0]')
        if (!result || !result.transcript) {
          return
        }

        const event = {
          status: 'ok',
          text: result.transcript,
          final: data.is_final || false,
          debug: data
        }

        // Numeric check instead of truthiness: a segment starting at 0 still has valid timing
        if (_.isFinite(data.start) && _.isFinite(data.duration)) {
          event.start = _.round(data.start, 3)
          event.end = _.round(data.start + data.duration, 3)
        }

        publish(event)
      })

      connection.on('UtteranceEnd', () => {
        debug('Deepgram utterance end detected')
      })

      connection.on('error', (err) => {
        publish({
          status: 'error',
          err: `Deepgram STT failed: ${err.message || err}`
        })
      })

      connection.on('close', () => {
        debug('Deepgram WebSocket closed')
        pendingBuffers = []
        events.emit('close')
      })
    } catch (err) {
      debug(err)
      throw new Error(`Deepgram STT streaming setup failed: ${err.message}`)
    }

    return {
      events,
      // Sends audio right away when the socket is open, queues it while still connecting,
      // and ignores it once the socket is closing/closed or the handle was closed.
      write: (buffer) => {
        if (!connection) {
          return
        }
        const state = connection.getReadyState()
        if (state === DEEPGRAM_SOCKET_OPEN) {
          connection.send(buffer)
        } else if (state === DEEPGRAM_SOCKET_CONNECTING) {
          pendingBuffers.push(buffer)
        }
      },
      // Signals end of audio; the handle stays usable for history replay
      end: () => {
        if (connection) {
          connection.finish()
        }
      },
      // Finishes the connection and releases the recorded history and queued audio
      close: () => {
        if (connection) {
          connection.finish()
          connection = null
        }
        pendingBuffers = []
        eventHistory = null
      },
      // Re-emits every recorded event; a no-op after close() has dropped the history
      triggerHistoryEmit: () => {
        for (const eh of eventHistory || []) {
          events.emit('data', eh)
        }
      }
    }
  }

  /**
   * Transcribes a complete audio buffer in a single call.
   *
   * @param {object} req - incoming request (credentials and optional body.deepgram.config)
   * @param {object} params
   * @param {string} params.language - language code passed to Deepgram
   * @param {Buffer} params.buffer - audio data handed to transcribeFile
   * @param {string} [params.hint] - when non-empty, passed to Deepgram as the single `search` term
   * @returns {Promise<{text: string, debug: object}>} first alternative of the first channel
   *   ('' when there is none) plus the raw SDK response
   * @throws {Error} if no API key is available, the config is invalid, or the Deepgram call fails
   */
  async stt (req, { language, buffer, hint }) {
    const options = deepgramOptions(req)
    if (!options.apiKey) {
      throw new Error('Deepgram API key not configured')
    }

    const deepgram = createClient(options.apiKey)

    const transcribeOptions = {
      model: 'general',
      language,
      smart_format: true,
      punctuate: true
    }

    if (hint && hint.length > 0) {
      transcribeOptions.search = [hint]
    }

    applyDeepgramConfig(req, transcribeOptions)

    try {
      debug(`Calling Deepgram API with options: ${JSON.stringify(transcribeOptions)}`)

      const response = await deepgram.listen.prerecorded.transcribeFile(
        buffer,
        transcribeOptions
      )

      debug(`Deepgram response: ${JSON.stringify(response, null, 2)}`)

      // The SDK resolves to a { result, error } envelope instead of rejecting on API errors
      if (response && response.error) {
        throw new Error(response.error.message || String(response.error))
      }

      // Unwrap the envelope; fall back to the bare response in case it is already unwrapped
      const payload = (response && response.result) || response
      const transcript = _.get(payload, 'results.channels[0].alternatives[0].transcript') || ''

      return {
        text: transcript,
        debug: response
      }
    } catch (err) {
      debug(err)
      throw new Error(`Deepgram STT failed: ${err.message || err}`)
    }
  }
}
207+
208+
module.exports = DeepgramSTT

frontend/src/utils.js

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,15 @@ const applyIfExists = (target, src, p) => {
170170
return Object.assign(target, _.get(src, p) || {})
171171
}
172172

173+
/**
 * Resolves the Deepgram credentials to use for a request.
 * A key supplied in the request body (body.deepgram.credentials.apiKey) takes
 * precedence over the BOTIUM_SPEECH_DEEPGRAM_API_KEY environment variable.
 *
 * @param {object} req - incoming request
 * @returns {{apiKey: string}} the resolved credentials
 * @throws {Error} when neither source yields a non-empty key
 */
const deepgramOptions = (req) => {
  const requestKey = _.get(req, 'body.deepgram.credentials.apiKey')
  const apiKey = requestKey || process.env.BOTIUM_SPEECH_DEEPGRAM_API_KEY

  if (!apiKey) {
    throw new Error('Deepgram API key not found')
  }
  return { apiKey }
}
}
181+
173182
module.exports = {
174183
asJson,
175184
enumValueToName,
@@ -184,5 +193,6 @@ module.exports = {
184193
applyExtraAzureSpeechConfig,
185194
getAzureErrorDetails,
186195
readBaseUrls,
187-
applyIfExists
196+
applyIfExists,
197+
deepgramOptions
188198
}

0 commit comments

Comments
 (0)