deepgram stt support

murliwatz · murliwatz · commit 42148dfe1d9a · 2025-07-23T08:47:30.000+02:00
diff --git a/README.md b/README.md
@@ -57,6 +57,10 @@ For the major cloud providers there are additional docker-compose files. If usin
 
     > docker-compose -f docker-compose-azure.yml up -d
 
+For Deepgram, add your API key to the file *docker-compose-deepgram.yml* and start the services:
+
+    > docker-compose -f docker-compose-deepgram.yml up -d
+
 ### Optional: Build Docker Images
 
 You can optionally built your own docker images (if you made any changes in this repository, for instance to download the latest version of a model). Clone or download this repository and run docker-compose:
diff --git a/docker-compose-deepgram.yml b/docker-compose-deepgram.yml
@@ -0,0 +1,19 @@
+version: '3'
+services:
+  nginx:
+    image: nginx
+    restart: always
+    volumes:
+      - ./nginx.conf:/etc/nginx/nginx.conf
+    ports:
+      - 80:80
+  frontend:
+    image: botium/botium-speech-frontend:latest
+    restart: always
+    environment:
+      BOTIUM_API_TOKENS: 
+      BOTIUM_SPEECH_PROVIDER_TTS: google
+      BOTIUM_SPEECH_PROVIDER_STT: deepgram # presumably selects the 'deepgram' STT engine registered in frontend/src/routes.js - confirm
+      BOTIUM_SPEECH_DEEPGRAM_API_KEY: # put your Deepgram API key here; used when a request carries no key of its own
+    volumes:
+      - "./frontend/resources:/app/resources"
diff --git a/frontend/package.json b/frontend/package.json
@@ -18,6 +18,7 @@
     "@aws-sdk/client-transcribe": "^3.775.0",
     "@aws-sdk/client-transcribe-streaming": "^3.775.0",
     "@aws-sdk/util-endpoints": "^3.775.0",
+    "@deepgram/sdk": "^3.10.1",
     "@google-cloud/speech": "^7.0.1",
     "@google-cloud/storage": "^7.15.2",
     "@google-cloud/text-to-speech": "^6.0.1",
diff --git a/frontend/src/routes.js b/frontend/src/routes.js
@@ -48,7 +48,8 @@ const sttEngines = {
   // kaldi: new (require('./stt/kaldi'))(),
   ibm: new (require('./stt/ibm'))(),
   azure: new (require('./stt/azure'))(),
-  awstranscribe: new (require('./stt/awstranscribe'))()
+  awstranscribe: new (require('./stt/awstranscribe'))(),
+  deepgram: new (require('./stt/deepgram'))()
 }
 
 const multerMemoryStorage = multer.memoryStorage()
@@ -144,7 +145,7 @@ const router = express.Router()
  *         required: false
  *         schema:
  *           type: string
- *           enum: [google, ibm, azure, awstranscribe]
+ *           enum: [google, ibm, azure, awstranscribe, deepgram]
  *     responses:
  *       200:
  *         description: List of supported STT languages
@@ -196,7 +197,7 @@ const router = express.Router()
  *         required: false
  *         schema:
  *           type: string
- *           enum: [google, ibm, azure, awstranscribe]
+ *           enum: [google, ibm, azure, awstranscribe, deepgram]
  *       - name: cache
  *         description: Use result cache (default Y)
  *         in: query
@@ -783,7 +784,7 @@ const wssStreams = {}
  *         required: false
  *         schema:
  *           type: string
- *           enum: [google, ibm, azure, awstranscribe]
+ *           enum: [google, ibm, azure, awstranscribe, deepgram]
  *     responses:
  *       200:
  *         description: Websocket Url to stream the audio to, and the uri to check status and end the stream
diff --git a/frontend/src/stt/deepgram.js b/frontend/src/stt/deepgram.js
@@ -0,0 +1,222 @@
+const _ = require('lodash')
+const { createClient } = require('@deepgram/sdk')
+const { PassThrough } = require('stream')
+const EventEmitter = require('events')
+const debug = require('debug')('botium-speech-processing-deepgram-stt')
+
+const { deepgramOptions } = require('../utils')
+
+/**
+ * Creates a Deepgram SDK client using the API key resolved for this request.
+ * @param {object} req - Request object handed to deepgramOptions
+ * @returns {object} Client created by createClient
+ * @throws {Error} If no API key is available
+ */
+const createDeepgramClient = (req) => {
+  const { apiKey } = deepgramOptions(req)
+  if (!apiKey) {
+    throw new Error('Deepgram API key not configured')
+  }
+  return createClient(apiKey)
+}
+
+/**
+ * Builds the option object sent to Deepgram. Later sources override earlier ones:
+ * the given defaults, the JSON in BOTIUM_SPEECH_DEEPGRAM_CONFIG, req.body.deepgram.config.
+ * @param {object} req - Request object, may carry body.deepgram.config
+ * @param {object} defaults - Built-in options of the calling method
+ * @returns {object} New merged option object
+ * @throws {Error} If BOTIUM_SPEECH_DEEPGRAM_CONFIG is not valid JSON
+ */
+const buildDeepgramConfig = (req, defaults) => {
+  const config = Object.assign({}, defaults)
+  if (process.env.BOTIUM_SPEECH_DEEPGRAM_CONFIG) {
+    try {
+      Object.assign(config, JSON.parse(process.env.BOTIUM_SPEECH_DEEPGRAM_CONFIG))
+    } catch (err) {
+      throw new Error(`Deepgram config in BOTIUM_SPEECH_DEEPGRAM_CONFIG invalid: ${err.message}`)
+    }
+  }
+  return Object.assign(config, _.get(req, 'body.deepgram.config') || {})
+}
+
+/**
+ * Speech-to-text engine backed by the Deepgram SDK (file transcription and live streaming).
+ */
+class DeepgramSTT {
+  /**
+   * Lists language codes offered for Deepgram STT.
+   * @param {object} req - Unused
+   * @returns {Promise<string[]>} Sorted, hard-coded subset of language codes
+   */
+  async languages (req) {
+    return [
+      'da', 'de', 'en', 'en-AU', 'en-GB', 'en-IN', 'en-NZ', 'en-US',
+      'es', 'es-419', 'fi', 'fr', 'fr-CA', 'hi', 'id', 'it', 'ja',
+      'ko', 'nl', 'no', 'pl', 'pt', 'pt-BR', 'pt-PT', 'ru', 'sv',
+      'ta', 'th', 'tr', 'uk', 'zh', 'zh-CN', 'zh-TW'
+    ].sort()
+  }
+
+  /**
+   * Opens a live transcription connection to Deepgram.
+   * @param {object} req - Request object (credentials and optional body.deepgram.config)
+   * @param {object} params
+   * @param {string} params.language - Language code passed to Deepgram
+   * @returns {Promise<object>} Handle with `events` (EventEmitter emitting 'data' and 'close'),
+   *   `write(buffer)`, `end()`, `close()` and `triggerHistoryEmit()`
+   * @throws {Error} If the API key or config is invalid, or the connection setup fails
+   */
+  async stt_OpenStream (req, { language }) {
+    const deepgram = createDeepgramClient(req)
+    const streamOptions = buildDeepgramConfig(req, {
+      model: 'general',
+      language,
+      smart_format: true,
+      punctuate: true,
+      interim_results: true,
+      utterance_end_ms: 1000,
+      vad_events: true
+    })
+
+    const events = new EventEmitter()
+    let eventHistory = []
+    let connection = null
+
+    // Emit to listeners and keep a copy for triggerHistoryEmit (history is dropped on close)
+    const publish = (event) => {
+      events.emit('data', event)
+      if (eventHistory) {
+        eventHistory.push(event)
+      }
+    }
+
+    try {
+      connection = deepgram.listen.live(streamOptions)
+
+      connection.on('open', () => {
+        debug('Deepgram WebSocket opened')
+      })
+
+      connection.on('Results', (data) => {
+        debug('Deepgram result: %o', data)
+        // Messages without channel/alternatives or with an empty transcript are skipped
+        const transcript = _.get(data, 'channel.alternatives[0].transcript')
+        if (!transcript) return
+
+        const event = {
+          status: 'ok',
+          text: transcript,
+          final: data.is_final || false,
+          debug: data
+        }
+        // Numeric check instead of truthiness: a start offset of 0 is valid timing
+        if (_.isNumber(data.start) && _.isNumber(data.duration)) {
+          event.start = _.round(data.start, 3)
+          event.end = _.round(data.start + data.duration, 3)
+        }
+        publish(event)
+      })
+
+      connection.on('UtteranceEnd', () => {
+        debug('Deepgram utterance end detected')
+      })
+
+      connection.on('error', (err) => {
+        publish({
+          status: 'error',
+          err: `Deepgram STT failed: ${err.message || err}`
+        })
+      })
+
+      connection.on('close', () => {
+        debug('Deepgram WebSocket closed')
+        events.emit('close')
+      })
+    } catch (err) {
+      debug(err)
+      throw new Error(`Deepgram STT streaming setup failed: ${err.message}`)
+    }
+
+    return {
+      events,
+      write: (buffer) => {
+        // Audio is only forwarded while getReadyState() reports 1 (WebSocket OPEN)
+        if (connection && connection.getReadyState() === 1) {
+          connection.send(buffer)
+        }
+      },
+      end: () => {
+        if (connection) {
+          connection.finish()
+        }
+      },
+      close: () => {
+        if (connection) {
+          connection.finish()
+          connection = null
+        }
+        eventHistory = null
+      },
+      triggerHistoryEmit: () => {
+        // eventHistory is null after close(); replay nothing instead of throwing
+        for (const eh of eventHistory || []) {
+          events.emit('data', eh)
+        }
+      }
+    }
+  }
+
+  /**
+   * Transcribes a complete audio buffer with Deepgram.
+   * @param {object} req - Request object (credentials and optional body.deepgram.config)
+   * @param {object} params
+   * @param {string} params.language - Language code passed to Deepgram
+   * @param {*} params.buffer - Audio data handed to transcribeFile
+   * @param {string} [params.hint] - Optional term sent as Deepgram `search` option
+   * @returns {Promise<{text: string, debug: object}>} Transcript ('' if none) and raw result
+   * @throws {Error} If the API key or config is invalid, or Deepgram reports an error
+   */
+  async stt (req, { language, buffer, hint }) {
+    const deepgram = createDeepgramClient(req)
+
+    const defaults = {
+      model: 'general',
+      language,
+      smart_format: true,
+      punctuate: true
+    }
+    if (hint && hint.length > 0) {
+      defaults.search = [hint]
+    }
+    const transcribeOptions = buildDeepgramConfig(req, defaults)
+
+    try {
+      debug(`Calling Deepgram API with options: ${JSON.stringify(transcribeOptions)}`)
+
+      const response = await deepgram.listen.prerecorded.transcribeFile(
+        buffer,
+        transcribeOptions
+      )
+
+      debug(`Deepgram response: ${JSON.stringify(response, null, 2)}`)
+
+      // SDK v3 resolves to { result, error }: API errors are returned rather than thrown
+      if (response && response.error) {
+        throw response.error
+      }
+      // Fall back to the response itself in case the payload is not wrapped
+      const result = (response && response.result) || response
+
+      return {
+        text: _.get(result, 'results.channels[0].alternatives[0].transcript') || '',
+        debug: result
+      }
+    } catch (err) {
+      debug(err)
+      throw new Error(`Deepgram STT failed: ${err.message || err}`)
+    }
+  }
+}
+
+module.exports = DeepgramSTT
diff --git a/frontend/src/utils.js b/frontend/src/utils.js
@@ -170,6 +170,15 @@ const applyIfExists = (target, src, p) => {
   return Object.assign(target, _.get(src, p) || {})
 }
 
+const deepgramOptions = (req) => {
+  // A key supplied in the request body wins over BOTIUM_SPEECH_DEEPGRAM_API_KEY
+  const requestKey = _.get(req, 'body.deepgram.credentials.apiKey')
+  const apiKey = requestKey || process.env.BOTIUM_SPEECH_DEEPGRAM_API_KEY
+  if (!apiKey) throw new Error('Deepgram API key not found')
+
+  return { apiKey }
+}
+
 module.exports = {
   asJson,
   enumValueToName,
@@ -184,5 +193,6 @@ module.exports = {
   applyExtraAzureSpeechConfig,
   getAzureErrorDetails,
   readBaseUrls,
-  applyIfExists
+  applyIfExists,
+  deepgramOptions
 }