codeforequity-at
diff --git a/‎Makefile‎
Lines changed: 4 additions & 14 deletions b/‎Makefile‎
Lines changed: 4 additions & 14 deletions
diff --git a/‎README.md‎
Lines changed: 4 additions & 0 deletions b/‎README.md‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎docker-compose-deepgram.yml‎
Lines changed: 19 additions & 0 deletions b/‎docker-compose-deepgram.yml‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎frontend/package.json‎
Lines changed: 1 addition & 0 deletions b/‎frontend/package.json‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎frontend/src/routes.js‎
Lines changed: 162 additions & 12 deletions b/‎frontend/src/routes.js‎
Lines changed: 162 additions & 12 deletions
@@ -1,20 +1,10 @@
-TAG_COMMIT := $(shell git rev-list --abbrev-commit --tags --max-count=1)
-VERSION := $(shell git describe --abbrev=0 --tags ${TAG_COMMIT} 2>/dev/null || true)
-
-docker_build_dev:
-	docker build -t botium.speech:develop frontend
-
-docker_publish_dev:
-	docker tag botium.speech:develop ${AWS_REGISTRY_HOSTNAME}/botium.speech:develop
-	docker push ${AWS_REGISTRY_HOSTNAME}/botium.speech:develop
-
 docker_build:
-	docker build -t botium.speech:$(VERSION) frontend
+	docker build -t botium.speech:${SPEECH_VERSION} frontend
 
 docker_publish:
-	docker tag botium.speech:$(VERSION) ${AWS_REGISTRY_HOSTNAME}/botium.speech:$(VERSION)
-	docker push ${AWS_REGISTRY_HOSTNAME}/botium.speech:$(VERSION)
+	docker tag botium.speech:${SPEECH_VERSION} ${AWS_REGISTRY_HOSTNAME}/botium.speech:${SPEECH_VERSION}
+	docker push ${AWS_REGISTRY_HOSTNAME}/botium.speech:${SPEECH_VERSION}
 
-develop: docker_build_dev docker_publish_dev
+develop: docker_build docker_publish
 
 release: docker_build docker_publish
@@ -57,6 +57,10 @@ For the major cloud providers there are additional docker-compose files. If usin
 
     > docker-compose -f docker-compose-azure.yml up -d
 
+For Deepgram, add your API key to the file *docker-compose-deepgram.yml* and start the services:
+
+    > docker-compose -f docker-compose-deepgram.yml up -d
+
 ### Optional: Build Docker Images
 
 You can optionally built your own docker images (if you made any changes in this repository, for instance to download the latest version of a model). Clone or download this repository and run docker-compose:
 
@@ -0,0 +1,19 @@
+version: '3'
+services:
+  nginx:
+    image: nginx
+    restart: always
+    volumes:
+      - ./nginx.conf:/etc/nginx/nginx.conf
+    ports:
+      - 80:80
+  frontend:
+    image: botium/botium-speech-frontend:latest
+    restart: always
+    environment:
+      BOTIUM_API_TOKENS: 
+      BOTIUM_SPEECH_PROVIDER_TTS: google
+      BOTIUM_SPEECH_PROVIDER_STT: deepgram
+      BOTIUM_SPEECH_DEEPGRAM_API_KEY:
+    volumes:
+      - "./frontend/resources:/app/resources"
@@ -18,6 +18,7 @@
     "@aws-sdk/client-transcribe": "^3.775.0",
     "@aws-sdk/client-transcribe-streaming": "^3.775.0",
     "@aws-sdk/util-endpoints": "^3.775.0",
+    "@deepgram/sdk": "^3.10.1",
     "@google-cloud/speech": "^7.0.1",
     "@google-cloud/storage": "^7.15.2",
     "@google-cloud/text-to-speech": "^6.0.1",
 
@@ -39,7 +39,8 @@ const ttsEngines = {
   google: new (require('./tts/google'))(),
   ibm: new (require('./tts/ibm'))(),
   azure: new (require('./tts/azure'))(),
-  polly: new (require('./tts/polly'))()
+  polly: new (require('./tts/polly'))(),
+  deepgram: new (require('./tts/deepgram'))()
   // marytts: new (require('./tts/marytts'))(),
   // picotts: new (require('./tts/picotts'))()
 }
@@ -48,7 +49,8 @@ const sttEngines = {
   // kaldi: new (require('./stt/kaldi'))(),
   ibm: new (require('./stt/ibm'))(),
   azure: new (require('./stt/azure'))(),
-  awstranscribe: new (require('./stt/awstranscribe'))()
+  awstranscribe: new (require('./stt/awstranscribe'))(),
+  deepgram: new (require('./stt/deepgram'))()
 }
 
 const multerMemoryStorage = multer.memoryStorage()
@@ -144,7 +146,7 @@ const router = express.Router()
  *         required: false
  *         schema:
  *           type: string
- *           enum: [google, ibm, azure, awstranscribe]
+ *           enum: [google, ibm, azure, awstranscribe, deepgram]
  *     responses:
  *       200:
  *         description: List of supported STT languages
@@ -196,7 +198,7 @@ const router = express.Router()
  *         required: false
  *         schema:
  *           type: string
- *           enum: [google, ibm, azure, awstranscribe]
+ *           enum: [google, ibm, azure, awstranscribe, deepgram]
  *       - name: cache
  *         description: Use result cache (default Y)
  *         in: query
@@ -299,7 +301,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
  *         required: false
  *         schema:
  *           type: string
- *           enum: [google, ibm, azure, polly]
+ *           enum: [google, ibm, azure, polly, deepgram]
  *     responses:
  *       200:
  *         description: List of supported voices
@@ -340,7 +342,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
  *         required: false
  *         schema:
  *           type: string
- *           enum: [google, ibm, azure, polly]
+ *           enum: [google, ibm, azure, polly, deepgram]
  *     responses:
  *       200:
  *         description: List of supported TTS languages
@@ -395,7 +397,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
  *         required: false
  *         schema:
  *           type: string
- *           enum: [google, ibm, azure, polly]
+ *           enum: [google, ibm, azure, polly, deepgram]
  *       - name: cache
  *         description: Use result cache (default Y)
  *         in: query
@@ -484,6 +486,140 @@ router.post('/api/stt/:language', async (req, res, next) => {
   }
 }))
 
+/**
+ * @swagger
+ * /api/ttsstream/{language}:
+ *   post:
+ *     description: Open a Websocket stream for converting text stream to audio
+ *     security:
+ *       - ApiKeyAuth: []
+ *     produces:
+ *       - application/json
+ *     parameters:
+ *       - name: language
+ *         description: Language code (as returned from ttslanguages endpoint)
+ *         in: path
+ *         required: true
+ *         schema:
+ *           type: string
+ *       - name: tts
+ *         description: Text-to-speech backend
+ *         in: query
+ *         required: false
+ *         schema:
+ *           type: string
+ *           enum: [google, azure, polly, ibm, deepgram]
+ *       - name: voice
+ *         description: Voice name (as returned from ttsvoices endpoint)
+ *         in: query
+ *         required: false
+ *         schema:
+ *           type: string
+ *     responses:
+ *       200:
+ *         description: Websocket Url to stream the text to, and the uri to check status and end the stream
+ *         schema:
+ *           properties:
+ *             wsUri:
+ *               type: string
+ *             statusUri:
+ *               type: string
+ *             endUri:
+ *               type: string
+ */
+;[router.get.bind(router), router.post.bind(router)].forEach(m => m('/api/ttsstream/:language', async (req, res, next) => {
+  try {
+    const tts = ttsEngines[(req.query.tts && sanitize(req.query.tts)) || process.env.BOTIUM_SPEECH_PROVIDER_TTS]
+
+    if (!tts.tts_OpenStream) {
+      return next(new Error(`TTS provider ${(req.query.tts && sanitize(req.query.tts)) || process.env.BOTIUM_SPEECH_PROVIDER_TTS} does not support streaming`))
+    }
+
+    const streamId = uuidv1()
+    const stream = await tts.tts_OpenStream(req, { 
+      language: req.params.language,
+      voice: req.query.voice 
+    })
+    stream.events.on('close', () => delete wssStreams[streamId])
+    stream.dateTimeStart = new Date()
+    stream.type = 'tts'
+    wssStreams[streamId] = stream
+
+    const baseUrls = readBaseUrls(req)
+    res.json({
+      wsUri: `${baseUrls.wsUri}/${streamId}`,
+      statusUri: `${baseUrls.baseUri}/api/ttsstatus/${streamId}`,
+      endUri: `${baseUrls.baseUri}/api/ttsend/${streamId}`
+    }).end()
+  } catch (err) {
+    return next(err)
+  }
+}))
+
+/**
+ * @swagger
+ * /api/ttsstatus/{streamId}:
+ *   get:
+ *     description: Check a Websocket stream for converting text stream to audio
+ *     security:
+ *       - ApiKeyAuth: []
+ *     produces:
+ *       - application/json
+ *     parameters:
+ *       - name: streamId
+ *         description: Stream Id (as returned from ttsstream endpoint)
+ *         in: path
+ *         required: true
+ *         schema:
+ *           type: string
+ *     responses:
+ *       200:
+ *         description: Websocket stream ok
+ *       404:
+ *         description: Websocket stream not available
+ */
+;[router.get.bind(router), router.post.bind(router)].forEach(m => m('/api/ttsstatus/:streamId', async (req, res, next) => {
+  const stream = wssStreams[req.params.streamId]
+  if (stream) {
+    const streamDuration = ((new Date() - stream.dateTimeStart) / 1000).toFixed(3)
+    res.status(200).json({ status: 'OK', streamId: req.params.streamId, streamDuration })
+  } else {
+    res.status(404).json({ status: 'NOTFOUND', streamId: req.params.streamId })
+  }
+}))
+
+/**
+ * @swagger
+ * /api/ttsend/{streamId}:
+ *   get:
+ *     description: Close a Websocket stream for converting text stream to audio
+ *     security:
+ *       - ApiKeyAuth: []
+ *     produces:
+ *       - application/json
+ *     parameters:
+ *       - name: streamId
+ *         description: Stream Id (as returned from ttsstream endpoint)
+ *         in: path
+ *         required: true
+ *         schema:
+ *           type: string
+ *     responses:
+ *       200:
+ *         description: Websocket stream closed
+ */
+;[router.get.bind(router), router.post.bind(router)].forEach(m => m('/api/ttsend/:streamId', async (req, res, next) => {
+  const stream = wssStreams[req.params.streamId]
+  if (stream) {
+    try {
+      stream.end()
+    } catch (err) {
+      return next(err)
+    }
+  }
+  res.end()
+}))
+
 /**
  * @swagger
  * /api/audio/info:
@@ -783,7 +919,7 @@ const wssStreams = {}
  *         required: false
  *         schema:
  *           type: string
- *           enum: [google, ibm, azure, awstranscribe]
+ *           enum: [google, ibm, azure, awstranscribe, deepgram]
  *     responses:
  *       200:
  *         description: Websocket Url to stream the audio to, and the uri to check status and end the stream
@@ -804,6 +940,7 @@ const wssStreams = {}
     const stream = await stt.stt_OpenStream(req, { language: req.params.language })
     stream.events.on('close', () => delete wssStreams[streamId])
     stream.dateTimeStart = new Date()
+    stream.type = 'stt'
     wssStreams[streamId] = stream
 
     const baseUrls = readBaseUrls(req)
@@ -891,15 +1028,28 @@ const wssUpgrade = (req, socket, head) => {
     stream.events.on('data', async (data) => {
       if (data.err) debug(data)
       data.streamDuration = ((new Date() - stream.dateTimeStart) / 1000).toFixed(3)
-      ws.send(JSON.stringify(data))
+      
+      // For TTS streams, send audio data as binary, for STT send JSON
+      if (stream.type === 'tts' && data.buffer) {
+        ws.send(data.buffer)
+      } else {
+        ws.send(JSON.stringify(data))
+      }
     })
     stream.events.on('close', () => {
       ws.close()
       wss1.close()
     })
     ws.on('message', (data) => {
-      if (Buffer.isBuffer(data)) {
-        stream.write(data)
+      if (stream.type === 'tts') {
+        // TTS streams expect text messages
+        const textData = Buffer.isBuffer(data) ? data.toString('utf8') : data.toString()
+        stream.write(textData)
+      } else {
+        // STT streams expect audio buffers
+        if (Buffer.isBuffer(data)) {
+          stream.write(data)
+        }
       }
     })
     ws.on('close', () => {
@@ -915,7 +1065,7 @@ const wssUpgrade = (req, socket, head) => {
 }
 
 module.exports = {
-  skipSecurityCheck: (req) => (req.url.startsWith('/api/sttstatus/') || req.url.startsWith('/api/sttend/')),
+  skipSecurityCheck: (req) => (req.url.startsWith('/api/sttstatus/') || req.url.startsWith('/api/sttend/') || req.url.startsWith('/api/ttsstatus/') || req.url.startsWith('/api/ttsend/')),
   router,
   wssUpgrade
 }