@@ -39,7 +39,8 @@ const ttsEngines = {
3939 google : new ( require ( './tts/google' ) ) ( ) ,
4040 ibm : new ( require ( './tts/ibm' ) ) ( ) ,
4141 azure : new ( require ( './tts/azure' ) ) ( ) ,
42- polly : new ( require ( './tts/polly' ) ) ( )
42+ polly : new ( require ( './tts/polly' ) ) ( ) ,
43+ deepgram : new ( require ( './tts/deepgram' ) ) ( )
4344 // marytts: new (require('./tts/marytts'))(),
4445 // picotts: new (require('./tts/picotts'))()
4546}
@@ -48,7 +49,8 @@ const sttEngines = {
4849 // kaldi: new (require('./stt/kaldi'))(),
4950 ibm : new ( require ( './stt/ibm' ) ) ( ) ,
5051 azure : new ( require ( './stt/azure' ) ) ( ) ,
51- awstranscribe : new ( require ( './stt/awstranscribe' ) ) ( )
52+ awstranscribe : new ( require ( './stt/awstranscribe' ) ) ( ) ,
53+ deepgram : new ( require ( './stt/deepgram' ) ) ( )
5254}
5355
5456const multerMemoryStorage = multer . memoryStorage ( )
@@ -144,7 +146,7 @@ const router = express.Router()
144146 * required: false
145147 * schema:
146148 * type: string
147- * enum: [google, ibm, azure, awstranscribe]
149+ * enum: [google, ibm, azure, awstranscribe, deepgram ]
148150 * responses:
149151 * 200:
150152 * description: List of supported STT languages
@@ -196,7 +198,7 @@ const router = express.Router()
196198 * required: false
197199 * schema:
198200 * type: string
199- * enum: [google, ibm, azure, awstranscribe]
201+ * enum: [google, ibm, azure, awstranscribe, deepgram ]
200202 * - name: cache
201203 * description: Use result cache (default Y)
202204 * in: query
@@ -299,7 +301,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
299301 * required: false
300302 * schema:
301303 * type: string
302- * enum: [google, ibm, azure, polly]
304+ * enum: [google, ibm, azure, polly, deepgram ]
303305 * responses:
304306 * 200:
305307 * description: List of supported voices
@@ -340,7 +342,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
340342 * required: false
341343 * schema:
342344 * type: string
343- * enum: [google, ibm, azure, polly]
345+ * enum: [google, ibm, azure, polly, deepgram ]
344346 * responses:
345347 * 200:
346348 * description: List of supported TTS languages
@@ -395,7 +397,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
395397 * required: false
396398 * schema:
397399 * type: string
398- * enum: [google, ibm, azure, polly]
400+ * enum: [google, ibm, azure, polly, deepgram ]
399401 * - name: cache
400402 * description: Use result cache (default Y)
401403 * in: query
@@ -484,6 +486,140 @@ router.post('/api/stt/:language', async (req, res, next) => {
484486 }
485487} ) )
486488
489+ /**
490+ * @swagger
491+ * /api/ttsstream/{language}:
492+ * post:
493+ * description: Open a WebSocket stream for converting a text stream to audio
494+ * security:
495+ * - ApiKeyAuth: []
496+ * produces:
497+ * - application/json
498+ * parameters:
499+ * - name: language
500+ * description: Language code (as returned from ttslanguages endpoint)
501+ * in: path
502+ * required: true
503+ * schema:
504+ * type: string
505+ * - name: tts
506+ * description: Text-to-speech backend
507+ * in: query
508+ * required: false
509+ * schema:
510+ * type: string
511+ * enum: [google, azure, polly, ibm, deepgram]
512+ * - name: voice
513+ * description: Voice name (as returned from ttsvoices endpoint)
514+ * in: query
515+ * required: false
516+ * schema:
517+ * type: string
518+ * responses:
519+ * 200:
520+ * description: WebSocket URL to stream the text to, and the URIs to check the status of and to end the stream
521+ * schema:
522+ * properties:
523+ * wsUri:
524+ * type: string
525+ * statusUri:
526+ * type: string
527+ * endUri:
528+ * type: string
529+ */
// Route: GET|POST /api/ttsstream/:language
// Opens a text-to-speech stream on the selected provider, registers it in
// wssStreams under a fresh id and answers with the WebSocket URI to send text
// to, plus the status and end URIs for that stream.
//   req.params.language - language code handed to the provider
//   req.query.tts       - optional provider name; falls back to BOTIUM_SPEECH_PROVIDER_TTS
//   req.query.voice     - optional voice name handed to the provider
// Every failure is forwarded to the Express error handler via next(err).
;[router.get.bind(router), router.post.bind(router)].forEach(m => m('/api/ttsstream/:language', async (req, res, next) => {
  try {
    // Resolve the provider name once so the lookup and the error messages always agree.
    const ttsName = (req.query.tts && sanitize(req.query.tts)) || process.env.BOTIUM_SPEECH_PROVIDER_TTS
    // Own-property lookup only: names such as "constructor" must not resolve to
    // members inherited from Object.prototype.
    const tts = Object.prototype.hasOwnProperty.call(ttsEngines, ttsName) ? ttsEngines[ttsName] : null

    // Unknown or unconfigured provider: report it explicitly instead of failing
    // with a TypeError on the property access below.
    if (!tts) {
      return next(new Error(`TTS provider ${ttsName} is not available`))
    }
    if (!tts.tts_OpenStream) {
      return next(new Error(`TTS provider ${ttsName} does not support streaming`))
    }

    const streamId = uuidv1()
    const stream = await tts.tts_OpenStream(req, {
      language: req.params.language,
      voice: req.query.voice
    })
    // Drop the registry entry as soon as the provider reports the stream closed.
    stream.events.on('close', () => delete wssStreams[streamId])
    // Start time is used by the status endpoint to report the stream duration.
    stream.dateTimeStart = new Date()
    // Marks the stream as text-in/audio-out for code that checks stream.type.
    stream.type = 'tts'
    wssStreams[streamId] = stream

    const baseUrls = readBaseUrls(req)
    res.json({
      wsUri: `${baseUrls.wsUri}/${streamId}`,
      statusUri: `${baseUrls.baseUri}/api/ttsstatus/${streamId}`,
      endUri: `${baseUrls.baseUri}/api/ttsend/${streamId}`
    }).end()
  } catch (err) {
    return next(err)
  }
}))
558+
559+ /**
560+ * @swagger
561+ * /api/ttsstatus/{streamId}:
562+ * get:
563+ * description: Check the status of a WebSocket stream that converts a text stream to audio
564+ * security:
565+ * - ApiKeyAuth: []
566+ * produces:
567+ * - application/json
568+ * parameters:
569+ * - name: streamId
570+ * description: Stream Id (as returned from ttsstream endpoint)
571+ * in: path
572+ * required: true
573+ * schema:
574+ * type: string
575+ * responses:
576+ * 200:
577+ * description: Websocket stream ok
578+ * 404:
579+ * description: Websocket stream not available
580+ */
// Route: GET|POST /api/ttsstatus/:streamId
// Reports whether a stream with the given id is currently registered.
//   200 { status: 'OK', streamId, streamDuration }  - streamDuration is the number of
//       seconds since the stream was opened, formatted with three decimals
//   404 { status: 'NOTFOUND', streamId }            - no such stream
;[router.get.bind(router), router.post.bind(router)].forEach(m => m('/api/ttsstatus/:streamId', (req, res) => {
  const { streamId } = req.params
  // The id comes straight from the URL, so only own properties of the registry
  // count as streams; inherited names like "constructor" must yield 404.
  const stream = Object.prototype.hasOwnProperty.call(wssStreams, streamId) ? wssStreams[streamId] : null

  if (!stream) {
    res.status(404).json({ status: 'NOTFOUND', streamId })
    return
  }

  const streamDuration = ((new Date() - stream.dateTimeStart) / 1000).toFixed(3)
  res.status(200).json({ status: 'OK', streamId, streamDuration })
}))
590+
591+ /**
592+ * @swagger
593+ * /api/ttsend/{streamId}:
594+ * get:
595+ * description: Close a WebSocket stream that converts a text stream to audio
596+ * security:
597+ * - ApiKeyAuth: []
598+ * produces:
599+ * - application/json
600+ * parameters:
601+ * - name: streamId
602+ * description: Stream Id (as returned from ttsstream endpoint)
603+ * in: path
604+ * required: true
605+ * schema:
606+ * type: string
607+ * responses:
608+ * 200:
609+ * description: Websocket stream closed
610+ */
// Route: GET|POST /api/ttsend/:streamId
// Ends the stream registered under the given id, if there is one, and answers
// with an empty response. An unknown id is not an error: the response is the
// same empty 200. An exception thrown by stream.end() is forwarded to the
// Express error handler.
;[router.get.bind(router), router.post.bind(router)].forEach(m => m('/api/ttsend/:streamId', (req, res, next) => {
  const { streamId } = req.params
  // The id comes straight from the URL, so only own properties of the registry
  // count as streams; this keeps inherited names like "constructor" from being
  // treated as a stream and failing on .end().
  if (Object.prototype.hasOwnProperty.call(wssStreams, streamId)) {
    try {
      wssStreams[streamId].end()
    } catch (err) {
      return next(err)
    }
  }
  res.end()
}))
622+
487623/**
488624 * @swagger
489625 * /api/audio/info:
@@ -783,7 +919,7 @@ const wssStreams = {}
783919 * required: false
784920 * schema:
785921 * type: string
786- * enum: [google, ibm, azure, awstranscribe]
922+ * enum: [google, ibm, azure, awstranscribe, deepgram ]
787923 * responses:
788924 * 200:
789925 * description: Websocket Url to stream the audio to, and the uri to check status and end the stream
@@ -804,6 +940,7 @@ const wssStreams = {}
804940 const stream = await stt . stt_OpenStream ( req , { language : req . params . language } )
805941 stream . events . on ( 'close' , ( ) => delete wssStreams [ streamId ] )
806942 stream . dateTimeStart = new Date ( )
943+ stream . type = 'stt'
807944 wssStreams [ streamId ] = stream
808945
809946 const baseUrls = readBaseUrls ( req )
@@ -891,15 +1028,28 @@ const wssUpgrade = (req, socket, head) => {
8911028 stream . events . on ( 'data' , async ( data ) => {
8921029 if ( data . err ) debug ( data )
8931030 data . streamDuration = ( ( new Date ( ) - stream . dateTimeStart ) / 1000 ) . toFixed ( 3 )
894- ws . send ( JSON . stringify ( data ) )
1031+
1032+ // For TTS streams, send audio data as binary, for STT send JSON
1033+ if ( stream . type === 'tts' && data . buffer ) {
1034+ ws . send ( data . buffer )
1035+ } else {
1036+ ws . send ( JSON . stringify ( data ) )
1037+ }
8951038 } )
8961039 stream . events . on ( 'close' , ( ) => {
8971040 ws . close ( )
8981041 wss1 . close ( )
8991042 } )
9001043 ws . on ( 'message' , ( data ) => {
901- if ( Buffer . isBuffer ( data ) ) {
902- stream . write ( data )
1044+ if ( stream . type === 'tts' ) {
1045+ // TTS streams expect text messages
1046+ const textData = Buffer . isBuffer ( data ) ? data . toString ( 'utf8' ) : data . toString ( )
1047+ stream . write ( textData )
1048+ } else {
1049+ // STT streams expect audio buffers
1050+ if ( Buffer . isBuffer ( data ) ) {
1051+ stream . write ( data )
1052+ }
9031053 }
9041054 } )
9051055 ws . on ( 'close' , ( ) => {
@@ -915,7 +1065,7 @@ const wssUpgrade = (req, socket, head) => {
9151065}
9161066
9171067module . exports = {
918- skipSecurityCheck : ( req ) => ( req . url . startsWith ( '/api/sttstatus/' ) || req . url . startsWith ( '/api/sttend/' ) ) ,
1068+ skipSecurityCheck : ( req ) => ( req . url . startsWith ( '/api/sttstatus/' ) || req . url . startsWith ( '/api/sttend/' ) || req . url . startsWith ( '/api/ttsstatus/' ) || req . url . startsWith ( '/api/ttsend/' ) ) ,
9191069 router,
9201070 wssUpgrade
9211071}
0 commit comments