Merge pull request #37 from codeforequity-at/deepgram-stt

murliwatz · web-flow · commit 45abbb198ae7 · 2025-07-24T05:31:24.000+02:00
Deepgram stt
diff --git a/README.md b/README.md
@@ -57,6 +57,10 @@ For the major cloud providers there are additional docker-compose files. If usin
 
     > docker-compose -f docker-compose-azure.yml up -d
 
+For Deepgram, add your API key to the file *docker-compose-deepgram.yml* and start the services:
+
+    > docker-compose -f docker-compose-deepgram.yml up -d
+
 ### Optional: Build Docker Images
 
 You can optionally build your own docker images (if you made any changes in this repository, for instance to download the latest version of a model). Clone or download this repository and run docker-compose:
diff --git a/docker-compose-deepgram.yml b/docker-compose-deepgram.yml
@@ -0,0 +1,19 @@
+# Compose setup running the speech frontend with Deepgram as the STT provider.
+# Start with: docker-compose -f docker-compose-deepgram.yml up -d
+version: '3'
+services:
+  # Reverse proxy using the nginx.conf from this directory, published on host port 80
+  nginx:
+    image: nginx
+    restart: always
+    volumes:
+      - ./nginx.conf:/etc/nginx/nginx.conf
+    ports:
+      - 80:80
+  # Speech frontend: TTS is served by google, STT by deepgram
+  frontend:
+    image: botium/botium-speech-frontend:latest
+    restart: always
+    environment:
+      BOTIUM_API_TOKENS: 
+      BOTIUM_SPEECH_PROVIDER_TTS: google
+      BOTIUM_SPEECH_PROVIDER_STT: deepgram
+      # Add your Deepgram API key here before starting the services
+      BOTIUM_SPEECH_DEEPGRAM_API_KEY:
+    volumes:
+      - "./frontend/resources:/app/resources"
diff --git a/frontend/package.json b/frontend/package.json
@@ -18,6 +18,7 @@
     "@aws-sdk/client-transcribe": "^3.775.0",
     "@aws-sdk/client-transcribe-streaming": "^3.775.0",
     "@aws-sdk/util-endpoints": "^3.775.0",
+    "@deepgram/sdk": "^3.10.1",
     "@google-cloud/speech": "^7.0.1",
     "@google-cloud/storage": "^7.15.2",
     "@google-cloud/text-to-speech": "^6.0.1",
diff --git a/frontend/src/routes.js b/frontend/src/routes.js
@@ -39,7 +39,8 @@ const ttsEngines = {
   google: new (require('./tts/google'))(),
   ibm: new (require('./tts/ibm'))(),
   azure: new (require('./tts/azure'))(),
-  polly: new (require('./tts/polly'))()
+  polly: new (require('./tts/polly'))(),
+  deepgram: new (require('./tts/deepgram'))()
   // marytts: new (require('./tts/marytts'))(),
   // picotts: new (require('./tts/picotts'))()
 }
@@ -48,7 +49,8 @@ const sttEngines = {
   // kaldi: new (require('./stt/kaldi'))(),
   ibm: new (require('./stt/ibm'))(),
   azure: new (require('./stt/azure'))(),
-  awstranscribe: new (require('./stt/awstranscribe'))()
+  awstranscribe: new (require('./stt/awstranscribe'))(),
+  deepgram: new (require('./stt/deepgram'))()
 }
 
 const multerMemoryStorage = multer.memoryStorage()
@@ -144,7 +146,7 @@ const router = express.Router()
  *         required: false
  *         schema:
  *           type: string
- *           enum: [google, ibm, azure, awstranscribe]
+ *           enum: [google, ibm, azure, awstranscribe, deepgram]
  *     responses:
  *       200:
  *         description: List of supported STT languages
@@ -196,7 +198,7 @@ const router = express.Router()
  *         required: false
  *         schema:
  *           type: string
- *           enum: [google, ibm, azure, awstranscribe]
+ *           enum: [google, ibm, azure, awstranscribe, deepgram]
  *       - name: cache
  *         description: Use result cache (default Y)
  *         in: query
@@ -299,7 +301,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
  *         required: false
  *         schema:
  *           type: string
- *           enum: [google, ibm, azure, polly]
+ *           enum: [google, ibm, azure, polly, deepgram]
  *     responses:
  *       200:
  *         description: List of supported voices
@@ -340,7 +342,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
  *         required: false
  *         schema:
  *           type: string
- *           enum: [google, ibm, azure, polly]
+ *           enum: [google, ibm, azure, polly, deepgram]
  *     responses:
  *       200:
  *         description: List of supported TTS languages
@@ -395,7 +397,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
  *         required: false
  *         schema:
  *           type: string
- *           enum: [google, ibm, azure, polly]
+ *           enum: [google, ibm, azure, polly, deepgram]
  *       - name: cache
  *         description: Use result cache (default Y)
  *         in: query
@@ -783,7 +785,7 @@ const wssStreams = {}
  *         required: false
  *         schema:
  *           type: string
- *           enum: [google, ibm, azure, awstranscribe]
+ *           enum: [google, ibm, azure, awstranscribe, deepgram]
  *     responses:
  *       200:
  *         description: Websocket Url to stream the audio to, and the uri to check status and end the stream
diff --git a/frontend/src/stt/deepgram.js b/frontend/src/stt/deepgram.js
@@ -0,0 +1,253 @@
+const _ = require('lodash')
+const { createClient } = require('@deepgram/sdk')
+const { PassThrough } = require('stream')
+const EventEmitter = require('events')
+const axios = require('axios')
+const debug = require('debug')('botium-speech-processing-deepgram-stt')
+
+const { deepgramOptions } = require('../utils')
+
+class DeepgramSTT {
+  async _fetchLanguagesFromDocs() {
+    try {
+      // Fetch Deepgram STT documentation page
+      const response = await axios.get('https://developers.deepgram.com/docs/models-languages-overview', {
+        timeout: 5000,
+        headers: {
+          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+        }
+      })
+      
+      const html = response.data
+      const languages = new Set()
+      
+      // Parse language codes from documentation
+      // Look for patterns like language codes in tables or lists
+      const languagePattern = /\b([a-z]{2}(?:-[A-Z]{2})?)\b/g
+      let match
+      
+      // Common language codes that Deepgram typically supports
+      const commonLanguages = [
+        'af', 'ar', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'en-AU', 'en-GB', 'en-IN', 'en-NZ', 'en-US',
+        'es', 'es-419', 'et', 'fa', 'fi', 'fr', 'fr-CA', 'he', 'hi', 'hr', 'hu', 'id', 'it', 'ja',
+        'ko', 'lt', 'lv', 'ms', 'nl', 'no', 'pl', 'pt', 'pt-BR', 'pt-PT', 'ro', 'ru', 'sk', 'sl', 'sr', 'sv',
+        'sw', 'ta', 'th', 'tr', 'uk', 'ur', 'vi', 'zh', 'zh-CN', 'zh-TW'
+      ]
+      
+      while ((match = languagePattern.exec(html)) !== null) {
+        const lang = match[1]
+        if (commonLanguages.includes(lang)) {
+          languages.add(lang)
+        }
+      }
+      
+      const languageArray = Array.from(languages).sort()
+      debug(`Fetched ${languageArray.length} languages from Deepgram STT documentation`)
+      
+      return languageArray.length > 0 ? languageArray : null
+      
+    } catch (err) {
+      debug(`Failed to fetch languages from documentation: ${err.message}`)
+      return null
+    }
+  }
+
+  async languages (req) {
+    // Try to fetch from documentation first
+    const docLanguages = await this._fetchLanguagesFromDocs()
+    if (docLanguages && docLanguages.length > 0) {
+      return docLanguages
+    }
+    
+    // Fallback to static list if documentation parsing fails
+    debug('Using fallback static language list')
+    return []
+  }
+
+  async stt_OpenStream (req, { language }) {
+    const options = deepgramOptions(req)
+    if (!options.apiKey) {
+      throw new Error('Deepgram API key not configured')
+    }
+
+    const deepgram = createClient(options.apiKey)
+    
+    const streamOptions = {
+      model: 'general',
+      language: language,
+      smart_format: true,
+      punctuate: true,
+      interim_results: true,
+      utterance_end_ms: 1000,
+      vad_events: true
+    }
+
+    // Apply default config from environment
+    if (process.env.BOTIUM_SPEECH_DEEPGRAM_CONFIG) {
+      try {
+        const defaultConfig = JSON.parse(process.env.BOTIUM_SPEECH_DEEPGRAM_CONFIG)
+        Object.assign(streamOptions, defaultConfig)
+      } catch (err) {
+        throw new Error(`Deepgram config in BOTIUM_SPEECH_DEEPGRAM_CONFIG invalid: ${err.message}`)
+      }
+    }
+
+    // Apply request-specific config
+    if (req.body && req.body.deepgram && req.body.deepgram.config) {
+      Object.assign(streamOptions, req.body.deepgram.config)
+    }
+
+    const events = new EventEmitter()
+    let eventHistory = []
+    let connection = null
+
+    try {
+      connection = deepgram.listen.live(streamOptions)
+      
+      connection.on('open', () => {
+        debug('Deepgram WebSocket opened')
+      })
+
+      connection.on('Results', (data) => {
+        const result = data.channel.alternatives[0]
+        if (result && result.transcript) {
+          const event = {
+            status: 'ok',
+            text: result.transcript,
+            final: data.is_final || false,
+            debug: data
+          }
+          
+          // Add timing information if available
+          if (data.start && data.duration) {
+            event.start = _.round(data.start, 3)
+            event.end = _.round(data.start + data.duration, 3)
+          }
+          
+          events.emit('data', event)
+          if (eventHistory) {
+            eventHistory.push(event)
+          }
+        }
+      })
+
+      connection.on('UtteranceEnd', (data) => {
+        debug('Deepgram utterance end detected')
+      })
+
+      connection.on('error', (err) => {
+        const event = {
+          status: 'error',
+          err: `Deepgram STT failed: ${err.message || err}`
+        }
+        events.emit('data', event)
+        if (eventHistory) {
+          eventHistory.push(event)
+        }
+      })
+
+      connection.on('close', () => {
+        debug('Deepgram WebSocket closed')
+        events.emit('close')
+      })
+
+    } catch (err) {
+      debug(err)
+      throw new Error(`Deepgram STT streaming setup failed: ${err.message}`)
+    }
+
+    return {
+      events,
+      write: (buffer) => {
+        if (connection && connection.getReadyState() === 1) {
+          connection.send(buffer)
+        }
+      },
+      end: () => {
+        if (connection) {
+          connection.finish()
+        }
+      },
+      close: () => {
+        if (connection) {
+          connection.finish()
+          connection = null
+        }
+        eventHistory = null
+      },
+      triggerHistoryEmit: () => {
+        for (const eh of eventHistory) {
+          events.emit('data', eh)
+        }
+      }
+    }
+  }
+
+  async stt (req, { language, buffer, hint }) {
+    const options = deepgramOptions(req)
+    if (!options.apiKey) {
+      throw new Error('Deepgram API key not configured')
+    }
+
+    const deepgram = createClient(options.apiKey)
+    
+    const transcribeOptions = {
+      model: 'general',
+      language: language,
+      smart_format: true,
+      punctuate: true
+    }
+
+    // Add search terms if hint is provided
+    if (hint && hint.length > 0) {
+      transcribeOptions.search = [hint]
+    }
+
+    // Apply default config from environment
+    if (process.env.BOTIUM_SPEECH_DEEPGRAM_CONFIG) {
+      try {
+        const defaultConfig = JSON.parse(process.env.BOTIUM_SPEECH_DEEPGRAM_CONFIG)
+        Object.assign(transcribeOptions, defaultConfig)
+      } catch (err) {
+        throw new Error(`Deepgram config in BOTIUM_SPEECH_DEEPGRAM_CONFIG invalid: ${err.message}`)
+      }
+    }
+
+    // Apply request-specific config
+    if (req.body && req.body.deepgram && req.body.deepgram.config) {
+      Object.assign(transcribeOptions, req.body.deepgram.config)
+    }
+
+    try {
+      debug(`Calling Deepgram API with options: ${JSON.stringify(transcribeOptions)}`)
+      
+      const response = await deepgram.listen.prerecorded.transcribeFile(
+        buffer,
+        transcribeOptions
+      )
+
+      debug(`Deepgram response: ${JSON.stringify(response, null, 2)}`)
+
+      if (response.results && response.results.channels && response.results.channels[0]) {
+        const channel = response.results.channels[0]
+        if (channel.alternatives && channel.alternatives[0]) {
+          const transcript = channel.alternatives[0].transcript || ''
+          return {
+            text: transcript,
+            debug: response
+          }
+        }
+      }
+
+      return {
+        text: '',
+        debug: response
+      }
+    } catch (err) {
+      debug(err)
+      throw new Error(`Deepgram STT failed: ${err.message || err}`)
+    }
+  }
+}
+
+module.exports = DeepgramSTT
diff --git a/frontend/src/tts/deepgram.js b/frontend/src/tts/deepgram.js
diff --git a/frontend/src/utils.js b/frontend/src/utils.js