Skip to content

Commit 1cc534e

Browse files
Merge pull request #39 from codeforequity-at/develop
Develop
2 parents 30381d9 + b1d83bb commit 1cc534e

File tree

12 files changed

+1710
-41
lines changed

12 files changed

+1710
-41
lines changed

Makefile

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,10 @@
1-
TAG_COMMIT := $(shell git rev-list --abbrev-commit --tags --max-count=1)
2-
VERSION := $(shell git describe --abbrev=0 --tags ${TAG_COMMIT} 2>/dev/null || true)
3-
4-
docker_build_dev:
5-
docker build -t botium.speech:develop frontend
6-
7-
docker_publish_dev:
8-
docker tag botium.speech:develop ${AWS_REGISTRY_HOSTNAME}/botium.speech:develop
9-
docker push ${AWS_REGISTRY_HOSTNAME}/botium.speech:develop
10-
111
docker_build:
12-
docker build -t botium.speech:$(VERSION) frontend
2+
docker build -t botium.speech:${SPEECH_VERSION} frontend
133

144
docker_publish:
15-
docker tag botium.speech:$(VERSION) ${AWS_REGISTRY_HOSTNAME}/botium.speech:$(VERSION)
16-
docker push ${AWS_REGISTRY_HOSTNAME}/botium.speech:$(VERSION)
5+
docker tag botium.speech:${SPEECH_VERSION} ${AWS_REGISTRY_HOSTNAME}/botium.speech:${SPEECH_VERSION}
6+
docker push ${AWS_REGISTRY_HOSTNAME}/botium.speech:${SPEECH_VERSION}
177

18-
develop: docker_build_dev docker_publish_dev
8+
develop: docker_build docker_publish
199

2010
release: docker_build docker_publish

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,10 @@ For the major cloud providers there are additional docker-compose files. If usin
5757

5858
> docker-compose -f docker-compose-azure.yml up -d
5959

60+
For Deepgram, add your API key to the file *docker-compose-deepgram.yml* and start the services:
61+
62+
> docker-compose -f docker-compose-deepgram.yml up -d
63+
6064
### Optional: Build Docker Images
6165

6266
You can optionally build your own docker images (if you made any changes in this repository, for instance to download the latest version of a model). Clone or download this repository and run docker-compose:

docker-compose-deepgram.yml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
version: '3'
2+
services:
3+
nginx:
4+
image: nginx
5+
restart: always
6+
volumes:
7+
- ./nginx.conf:/etc/nginx/nginx.conf
8+
ports:
9+
- 80:80
10+
frontend:
11+
image: botium/botium-speech-frontend:latest
12+
restart: always
13+
environment:
14+
BOTIUM_API_TOKENS:
15+
BOTIUM_SPEECH_PROVIDER_TTS: google
16+
BOTIUM_SPEECH_PROVIDER_STT: deepgram
17+
BOTIUM_SPEECH_DEEPGRAM_API_KEY:
18+
volumes:
19+
- "./frontend/resources:/app/resources"

frontend/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
"@aws-sdk/client-transcribe": "^3.775.0",
1919
"@aws-sdk/client-transcribe-streaming": "^3.775.0",
2020
"@aws-sdk/util-endpoints": "^3.775.0",
21+
"@deepgram/sdk": "^3.10.1",
2122
"@google-cloud/speech": "^7.0.1",
2223
"@google-cloud/storage": "^7.15.2",
2324
"@google-cloud/text-to-speech": "^6.0.1",

frontend/src/routes.js

Lines changed: 162 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@ const ttsEngines = {
3939
google: new (require('./tts/google'))(),
4040
ibm: new (require('./tts/ibm'))(),
4141
azure: new (require('./tts/azure'))(),
42-
polly: new (require('./tts/polly'))()
42+
polly: new (require('./tts/polly'))(),
43+
deepgram: new (require('./tts/deepgram'))()
4344
// marytts: new (require('./tts/marytts'))(),
4445
// picotts: new (require('./tts/picotts'))()
4546
}
@@ -48,7 +49,8 @@ const sttEngines = {
4849
// kaldi: new (require('./stt/kaldi'))(),
4950
ibm: new (require('./stt/ibm'))(),
5051
azure: new (require('./stt/azure'))(),
51-
awstranscribe: new (require('./stt/awstranscribe'))()
52+
awstranscribe: new (require('./stt/awstranscribe'))(),
53+
deepgram: new (require('./stt/deepgram'))()
5254
}
5355

5456
const multerMemoryStorage = multer.memoryStorage()
@@ -144,7 +146,7 @@ const router = express.Router()
144146
* required: false
145147
* schema:
146148
* type: string
147-
* enum: [google, ibm, azure, awstranscribe]
149+
* enum: [google, ibm, azure, awstranscribe, deepgram]
148150
* responses:
149151
* 200:
150152
* description: List of supported STT languages
@@ -196,7 +198,7 @@ const router = express.Router()
196198
* required: false
197199
* schema:
198200
* type: string
199-
* enum: [google, ibm, azure, awstranscribe]
201+
* enum: [google, ibm, azure, awstranscribe, deepgram]
200202
* - name: cache
201203
* description: Use result cache (default Y)
202204
* in: query
@@ -299,7 +301,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
299301
* required: false
300302
* schema:
301303
* type: string
302-
* enum: [google, ibm, azure, polly]
304+
* enum: [google, ibm, azure, polly, deepgram]
303305
* responses:
304306
* 200:
305307
* description: List of supported voices
@@ -340,7 +342,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
340342
* required: false
341343
* schema:
342344
* type: string
343-
* enum: [google, ibm, azure, polly]
345+
* enum: [google, ibm, azure, polly, deepgram]
344346
* responses:
345347
* 200:
346348
* description: List of supported TTS languages
@@ -395,7 +397,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
395397
* required: false
396398
* schema:
397399
* type: string
398-
* enum: [google, ibm, azure, polly]
400+
* enum: [google, ibm, azure, polly, deepgram]
399401
* - name: cache
400402
* description: Use result cache (default Y)
401403
* in: query
@@ -484,6 +486,140 @@ router.post('/api/stt/:language', async (req, res, next) => {
484486
}
485487
}))
486488

489+
/**
490+
* @swagger
491+
* /api/ttsstream/{language}:
492+
* post:
493+
* description: Open a Websocket stream for converting text stream to audio
494+
* security:
495+
* - ApiKeyAuth: []
496+
* produces:
497+
* - application/json
498+
* parameters:
499+
* - name: language
500+
* description: Language code (as returned from ttslanguages endpoint)
501+
* in: path
502+
* required: true
503+
* schema:
504+
* type: string
505+
* - name: tts
506+
* description: Text-to-speech backend
507+
* in: query
508+
* required: false
509+
* schema:
510+
* type: string
511+
* enum: [google, azure, polly, ibm, deepgram]
512+
* - name: voice
513+
* description: Voice name (as returned from ttsvoices endpoint)
514+
* in: query
515+
* required: false
516+
* schema:
517+
* type: string
518+
* responses:
519+
* 200:
520+
* description: Websocket Url to stream the text to, and the uri to check status and end the stream
521+
* schema:
522+
* properties:
523+
* wsUri:
524+
* type: string
525+
* statusUri:
526+
* type: string
527+
* endUri:
528+
* type: string
529+
*/
530+
;[router.get.bind(router), router.post.bind(router)].forEach(m => m('/api/ttsstream/:language', async (req, res, next) => {
531+
try {
532+
const tts = ttsEngines[(req.query.tts && sanitize(req.query.tts)) || process.env.BOTIUM_SPEECH_PROVIDER_TTS]
533+
534+
if (!tts.tts_OpenStream) {
535+
return next(new Error(`TTS provider ${(req.query.tts && sanitize(req.query.tts)) || process.env.BOTIUM_SPEECH_PROVIDER_TTS} does not support streaming`))
536+
}
537+
538+
const streamId = uuidv1()
539+
const stream = await tts.tts_OpenStream(req, {
540+
language: req.params.language,
541+
voice: req.query.voice
542+
})
543+
stream.events.on('close', () => delete wssStreams[streamId])
544+
stream.dateTimeStart = new Date()
545+
stream.type = 'tts'
546+
wssStreams[streamId] = stream
547+
548+
const baseUrls = readBaseUrls(req)
549+
res.json({
550+
wsUri: `${baseUrls.wsUri}/${streamId}`,
551+
statusUri: `${baseUrls.baseUri}/api/ttsstatus/${streamId}`,
552+
endUri: `${baseUrls.baseUri}/api/ttsend/${streamId}`
553+
}).end()
554+
} catch (err) {
555+
return next(err)
556+
}
557+
}))
558+
559+
/**
560+
* @swagger
561+
* /api/ttsstatus/{streamId}:
562+
* get:
563+
* description: Check a Websocket stream for converting text stream to audio
564+
* security:
565+
* - ApiKeyAuth: []
566+
* produces:
567+
* - application/json
568+
* parameters:
569+
* - name: streamId
570+
* description: Stream Id (as returned from ttsstream endpoint)
571+
* in: path
572+
* required: true
573+
* schema:
574+
* type: string
575+
* responses:
576+
* 200:
577+
* description: Websocket stream ok
578+
* 404:
579+
* description: Websocket stream not available
580+
*/
581+
;[router.get.bind(router), router.post.bind(router)].forEach(m => m('/api/ttsstatus/:streamId', async (req, res, next) => {
582+
const stream = wssStreams[req.params.streamId]
583+
if (stream) {
584+
const streamDuration = ((new Date() - stream.dateTimeStart) / 1000).toFixed(3)
585+
res.status(200).json({ status: 'OK', streamId: req.params.streamId, streamDuration })
586+
} else {
587+
res.status(404).json({ status: 'NOTFOUND', streamId: req.params.streamId })
588+
}
589+
}))
590+
591+
/**
592+
* @swagger
593+
* /api/ttsend/{streamId}:
594+
* get:
595+
* description: Close a Websocket stream for converting text stream to audio
596+
* security:
597+
* - ApiKeyAuth: []
598+
* produces:
599+
* - application/json
600+
* parameters:
601+
* - name: streamId
602+
* description: Stream Id (as returned from ttsstream endpoint)
603+
* in: path
604+
* required: true
605+
* schema:
606+
* type: string
607+
* responses:
608+
* 200:
609+
* description: Websocket stream closed
610+
*/
611+
;[router.get.bind(router), router.post.bind(router)].forEach(m => m('/api/ttsend/:streamId', async (req, res, next) => {
612+
const stream = wssStreams[req.params.streamId]
613+
if (stream) {
614+
try {
615+
stream.end()
616+
} catch (err) {
617+
return next(err)
618+
}
619+
}
620+
res.end()
621+
}))
622+
487623
/**
488624
* @swagger
489625
* /api/audio/info:
@@ -783,7 +919,7 @@ const wssStreams = {}
783919
* required: false
784920
* schema:
785921
* type: string
786-
* enum: [google, ibm, azure, awstranscribe]
922+
* enum: [google, ibm, azure, awstranscribe, deepgram]
787923
* responses:
788924
* 200:
789925
* description: Websocket Url to stream the audio to, and the uri to check status and end the stream
@@ -804,6 +940,7 @@ const wssStreams = {}
804940
const stream = await stt.stt_OpenStream(req, { language: req.params.language })
805941
stream.events.on('close', () => delete wssStreams[streamId])
806942
stream.dateTimeStart = new Date()
943+
stream.type = 'stt'
807944
wssStreams[streamId] = stream
808945

809946
const baseUrls = readBaseUrls(req)
@@ -891,15 +1028,28 @@ const wssUpgrade = (req, socket, head) => {
8911028
stream.events.on('data', async (data) => {
8921029
if (data.err) debug(data)
8931030
data.streamDuration = ((new Date() - stream.dateTimeStart) / 1000).toFixed(3)
894-
ws.send(JSON.stringify(data))
1031+
1032+
// For TTS streams, send audio data as binary, for STT send JSON
1033+
if (stream.type === 'tts' && data.buffer) {
1034+
ws.send(data.buffer)
1035+
} else {
1036+
ws.send(JSON.stringify(data))
1037+
}
8951038
})
8961039
stream.events.on('close', () => {
8971040
ws.close()
8981041
wss1.close()
8991042
})
9001043
ws.on('message', (data) => {
901-
if (Buffer.isBuffer(data)) {
902-
stream.write(data)
1044+
if (stream.type === 'tts') {
1045+
// TTS streams expect text messages
1046+
const textData = Buffer.isBuffer(data) ? data.toString('utf8') : data.toString()
1047+
stream.write(textData)
1048+
} else {
1049+
// STT streams expect audio buffers
1050+
if (Buffer.isBuffer(data)) {
1051+
stream.write(data)
1052+
}
9031053
}
9041054
})
9051055
ws.on('close', () => {
@@ -915,7 +1065,7 @@ const wssUpgrade = (req, socket, head) => {
9151065
}
9161066

9171067
module.exports = {
918-
skipSecurityCheck: (req) => (req.url.startsWith('/api/sttstatus/') || req.url.startsWith('/api/sttend/')),
1068+
skipSecurityCheck: (req) => (req.url.startsWith('/api/sttstatus/') || req.url.startsWith('/api/sttend/') || req.url.startsWith('/api/ttsstatus/') || req.url.startsWith('/api/ttsend/')),
9191069
router,
9201070
wssUpgrade
9211071
}

0 commit comments

Comments
 (0)