Skip to content

Commit a45c9d9

Browse files
committed
deepgram tts
1 parent 42148df commit a45c9d9

File tree

3 files changed

+220
-13
lines changed

3 files changed

+220
-13
lines changed

frontend/src/routes.js

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@ const ttsEngines = {
3939
google: new (require('./tts/google'))(),
4040
ibm: new (require('./tts/ibm'))(),
4141
azure: new (require('./tts/azure'))(),
42-
polly: new (require('./tts/polly'))()
42+
polly: new (require('./tts/polly'))(),
43+
deepgram: new (require('./tts/deepgram'))()
4344
// marytts: new (require('./tts/marytts'))(),
4445
// picotts: new (require('./tts/picotts'))()
4546
}
@@ -300,7 +301,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
300301
* required: false
301302
* schema:
302303
* type: string
303-
* enum: [google, ibm, azure, polly]
304+
* enum: [google, ibm, azure, polly, deepgram]
304305
* responses:
305306
* 200:
306307
* description: List of supported voices
@@ -341,7 +342,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
341342
* required: false
342343
* schema:
343344
* type: string
344-
* enum: [google, ibm, azure, polly]
345+
* enum: [google, ibm, azure, polly, deepgram]
345346
* responses:
346347
* 200:
347348
* description: List of supported TTS languages
@@ -396,7 +397,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
396397
* required: false
397398
* schema:
398399
* type: string
399-
* enum: [google, ibm, azure, polly]
400+
* enum: [google, ibm, azure, polly, deepgram]
400401
* - name: cache
401402
* description: Use result cache (default Y)
402403
* in: query

frontend/src/stt/deepgram.js

Lines changed: 54 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,66 @@ const _ = require('lodash')
22
const { createClient } = require('@deepgram/sdk')
33
const { PassThrough } = require('stream')
44
const EventEmitter = require('events')
5+
const axios = require('axios')
56
const debug = require('debug')('botium-speech-processing-deepgram-stt')
67

78
const { deepgramOptions } = require('../utils')
89

910
class DeepgramSTT {
11+
async _fetchLanguagesFromDocs() {
12+
try {
13+
// Fetch Deepgram STT documentation page
14+
const response = await axios.get('https://developers.deepgram.com/docs/models-languages-overview', {
15+
timeout: 5000,
16+
headers: {
17+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
18+
}
19+
})
20+
21+
const html = response.data
22+
const languages = new Set()
23+
24+
// Parse language codes from documentation
25+
// Look for patterns like language codes in tables or lists
26+
const languagePattern = /\b([a-z]{2}(?:-[A-Z]{2})?)\b/g
27+
let match
28+
29+
// Common language codes that Deepgram typically supports
30+
const commonLanguages = [
31+
'af', 'ar', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'en-AU', 'en-GB', 'en-IN', 'en-NZ', 'en-US',
32+
'es', 'es-419', 'et', 'fa', 'fi', 'fr', 'fr-CA', 'he', 'hi', 'hr', 'hu', 'id', 'it', 'ja',
33+
'ko', 'lt', 'lv', 'ms', 'nl', 'no', 'pl', 'pt', 'pt-BR', 'pt-PT', 'ro', 'ru', 'sk', 'sl', 'sr', 'sv',
34+
'sw', 'ta', 'th', 'tr', 'uk', 'ur', 'vi', 'zh', 'zh-CN', 'zh-TW'
35+
]
36+
37+
while ((match = languagePattern.exec(html)) !== null) {
38+
const lang = match[1]
39+
if (commonLanguages.includes(lang)) {
40+
languages.add(lang)
41+
}
42+
}
43+
44+
const languageArray = Array.from(languages).sort()
45+
debug(`Fetched ${languageArray.length} languages from Deepgram STT documentation`)
46+
47+
return languageArray.length > 0 ? languageArray : null
48+
49+
} catch (err) {
50+
debug(`Failed to fetch languages from documentation: ${err.message}`)
51+
return null
52+
}
53+
}
54+
1055
async languages (req) {
11-
// Deepgram supports a wide range of languages
12-
// This is a subset of commonly used languages
13-
return [
14-
'da', 'de', 'en', 'en-AU', 'en-GB', 'en-IN', 'en-NZ', 'en-US',
15-
'es', 'es-419', 'fi', 'fr', 'fr-CA', 'hi', 'id', 'it', 'ja',
16-
'ko', 'nl', 'no', 'pl', 'pt', 'pt-BR', 'pt-PT', 'ru', 'sv',
17-
'ta', 'th', 'tr', 'uk', 'zh', 'zh-CN', 'zh-TW'
18-
].sort()
56+
// Try to fetch from documentation first
57+
const docLanguages = await this._fetchLanguagesFromDocs()
58+
if (docLanguages && docLanguages.length > 0) {
59+
return docLanguages
60+
}
61+
62+
// Fallback to static list if documentation parsing fails
63+
debug('Using fallback static language list')
64+
return []
1965
}
2066

2167
async stt_OpenStream (req, { language }) {
@@ -63,7 +109,6 @@ class DeepgramSTT {
63109
})
64110

65111
connection.on('Results', (data) => {
66-
console.log(data)
67112
const result = data.channel.alternatives[0]
68113
if (result && result.transcript) {
69114
const event = {

frontend/src/tts/deepgram.js

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
const _ = require('lodash')
2+
const { createClient } = require('@deepgram/sdk')
3+
const axios = require('axios')
4+
const debug = require('debug')('botium-speech-processing-deepgram-tts')
5+
6+
const { deepgramOptions, ttsFilename } = require('../utils')
7+
8+
class DeepgramTTS {
  /**
   * Best-effort discovery of available Aura-2 voice models by scraping the
   * public Deepgram TTS documentation page.
   *
   * NOTE(review): HTML scraping is fragile — a docs redesign silently breaks
   * it; voices() keeps a static fallback list for exactly that case.
   *
   * @returns {Promise<Array<{name: string, gender: string, language: string}>|null>}
   *   unique voice descriptors, or null when the page cannot be fetched or
   *   yields no usable voice models
   */
  async _fetchVoicesFromDocs() {
    try {
      // Fetch Deepgram TTS documentation page
      const response = await axios.get('https://developers.deepgram.com/docs/tts-models', {
        timeout: 5000,
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
      })

      const html = response.data
      const voices = []

      // Voice model ids look like "aura-2-asteria-en" (Aura-2 generation only):
      // capture the voice name ("asteria") and the language code ("en")
      const voicePattern = /aura-2-([a-z]+)-([a-z]{2,3})/g

      // Gender lookup based on common name patterns; names not listed here
      // default to 'male'. Hoisted out of the match loop (loop-invariant).
      // NOTE(review): heuristic — confirm against the official voice list.
      const femaleNames = ['asteria', 'luna', 'stella', 'athena', 'hera', 'esperanza', 'ramona', 'margot', 'claire', 'liesel', 'greta', 'lucia', 'sofia', 'valentina', 'giulia', 'hina', 'yuki', 'yuna', 'soo', 'xiaoxiao', 'mei', 'nova', 'emma', 'klara', 'katya', 'natasha', 'zeynep', 'maya', 'astrid', 'ingrid', 'maja', 'aino', 'oksana', 'tereza', 'zsofia', 'elena', 'maria', 'ana', 'milica', 'jana', 'meta', 'ausra', 'liga', 'kadri', 'sarah', 'layla', 'siriporn', 'linh', 'sari', 'siti', 'priya', 'rashida', 'fatima', 'maryam', 'amara']

      let match
      while ((match = voicePattern.exec(html)) !== null) {
        const name = match[0] // full model id, e.g. "aura-2-asteria-en"
        const voiceName = match[1] // e.g. "asteria"
        const language = match[2] // e.g. "en"
        const gender = femaleNames.includes(voiceName) ? 'female' : 'male'
        voices.push({ name, gender, language })
      }

      // The same model id usually appears several times on the page
      const uniqueVoices = _.uniqBy(voices, 'name')

      // Keep only entries whose language part is a known ISO code — the
      // regex also matches lookalike tokens that are not real languages
      const validLanguages = [
        'ar', 'bg', 'bn', 'cs', 'da', 'de', 'el', 'en', 'es', 'et',
        'fa', 'fi', 'fr', 'he', 'hi', 'hr', 'hu', 'id', 'it', 'ja',
        'ko', 'lt', 'lv', 'ms', 'nl', 'no', 'pl', 'pt', 'ro', 'ru',
        'sk', 'sl', 'sr', 'sv', 'sw', 'ta', 'th', 'tr', 'uk', 'ur',
        'vi', 'zh'
      ]

      const filteredVoices = uniqueVoices.filter(voice =>
        validLanguages.includes(voice.language)
      )

      debug(`Fetched ${filteredVoices.length} voices with valid languages from Deepgram documentation`)
      return filteredVoices.length > 0 ? filteredVoices : null
    } catch (err) {
      debug(`Failed to fetch voices from documentation: ${err.message}`)
      return null
    }
  }

  /**
   * Lists available TTS voices. Tries the live documentation first and falls
   * back to a static list of English Aura-2 voices when scraping fails.
   *
   * @param {object} req - incoming HTTP request (unused, engine interface)
   * @returns {Promise<Array<{name: string, gender: string, language: string}>>}
   */
  async voices (req) {
    // Try to fetch from documentation first
    const docVoices = await this._fetchVoicesFromDocs()
    if (docVoices && docVoices.length > 0) {
      return docVoices
    }

    // Fallback to static list if documentation parsing fails
    debug('Using fallback static voice list (Aura-2 only)')
    return [
      // English voices (Aura-2)
      { name: 'aura-2-asteria-en', gender: 'female', language: 'en' },
      { name: 'aura-2-luna-en', gender: 'female', language: 'en' },
      { name: 'aura-2-stella-en', gender: 'female', language: 'en' },
      { name: 'aura-2-athena-en', gender: 'female', language: 'en' },
      { name: 'aura-2-hera-en', gender: 'female', language: 'en' },
      { name: 'aura-2-orion-en', gender: 'male', language: 'en' },
      { name: 'aura-2-arcas-en', gender: 'male', language: 'en' },
      { name: 'aura-2-perseus-en', gender: 'male', language: 'en' },
      { name: 'aura-2-angus-en', gender: 'male', language: 'en' },
      { name: 'aura-2-orpheus-en', gender: 'male', language: 'en' },
      { name: 'aura-2-helios-en', gender: 'male', language: 'en' },
      { name: 'aura-2-zeus-en', gender: 'male', language: 'en' }
    ]
  }

  /**
   * Lists the distinct language codes covered by the available voices.
   *
   * @param {object} req - incoming HTTP request, forwarded to voices()
   * @returns {Promise<string[]>} sorted unique language codes
   */
  async languages (req) {
    const voicesList = await this.voices(req)
    return _.uniq(voicesList.map(v => v.language)).sort()
  }

  /**
   * Synthesizes text to speech via the Deepgram Speak REST API.
   *
   * @param {object} req - incoming HTTP request; req.body.deepgram.config may
   *   override the speak options
   * @param {object} args
   * @param {string} args.language - requested language (unused; the voice
   *   model id already encodes the language)
   * @param {string} args.voice - Deepgram model id, defaults to 'aura-2-asteria-en'
   * @param {string} args.text - text to synthesize
   * @returns {Promise<{buffer: Buffer, name: string}>} audio payload and filename
   * @throws {Error} when the API key is missing, the env config is invalid,
   *   or the Deepgram request fails
   */
  async tts (req, { language, voice, text }) {
    const options = deepgramOptions(req)
    if (!options.apiKey) {
      throw new Error('Deepgram API key not configured')
    }

    const deepgram = createClient(options.apiKey)

    // linear16 @ 16 kHz — presumably Deepgram wraps REST linear16 output in
    // a WAV container by default, matching the .wav filename — TODO confirm
    const speakOptions = {
      model: voice || 'aura-2-asteria-en',
      encoding: 'linear16',
      sample_rate: 16000
    }

    // Apply default config from environment
    if (process.env.BOTIUM_SPEECH_DEEPGRAM_TTS_CONFIG) {
      try {
        const defaultConfig = JSON.parse(process.env.BOTIUM_SPEECH_DEEPGRAM_TTS_CONFIG)
        Object.assign(speakOptions, defaultConfig)
      } catch (err) {
        throw new Error(`Deepgram TTS config in BOTIUM_SPEECH_DEEPGRAM_TTS_CONFIG invalid: ${err.message}`)
      }
    }

    // Apply request-specific config (overrides the environment defaults)
    if (req.body && req.body.deepgram && req.body.deepgram.config) {
      Object.assign(speakOptions, req.body.deepgram.config)
    }

    try {
      debug(`Calling Deepgram TTS API with options: ${JSON.stringify(speakOptions)}`)

      const response = await deepgram.speak.request(
        { text },
        speakOptions
      )

      // Get the audio stream
      const stream = await response.getStream()
      if (!stream) {
        throw new Error('No audio stream received from Deepgram')
      }

      // Drain the stream into a single buffer
      const chunks = []
      for await (const chunk of stream) {
        chunks.push(chunk)
      }
      const buffer = Buffer.concat(chunks)

      debug(`Deepgram TTS response received, buffer size: ${buffer.length}`)

      return {
        buffer,
        name: `${ttsFilename(text)}.wav`
      }
    } catch (err) {
      debug(err)
      throw new Error(`Deepgram TTS failed: ${err.message || err}`)
    }
  }
}
160+
161+
module.exports = DeepgramTTS

0 commit comments

Comments
 (0)