@@ -6,6 +6,40 @@ const debug = require('debug')('botium-speech-processing-azure-tts')
66
77const { azureSpeechConfig, applyExtraAzureSpeechConfig, getAzureErrorDetails, ttsFilename } = require ( '../utils' )
88
/**
 * Build the canonical 44-byte RIFF/WAVE header for uncompressed (format 1) PCM audio.
 *
 * The two size fields in a WAV header are unsigned 32-bit integers. When the
 * requested payload length does not fit (for example `Infinity` for a live
 * stream whose final length is not known yet), the sizes are capped at the
 * largest representable value instead of letting the buffer write throw.
 *
 * @param {number} pcmLength - Length of the PCM payload in bytes.
 * @param {number} [sampleRate=16000] - Samples per second.
 * @param {number} [channels=1] - Number of interleaved channels.
 * @param {number} [bitsPerSample=16] - Bit depth of a single sample.
 * @returns {Buffer} A 44-byte buffer holding the header only (no audio data).
 */
const createWavHeader = (pcmLength, sampleRate = 16000, channels = 1, bitsPerSample = 16) => {
  const HEADER_SIZE = 44
  const MAX_UINT32 = 0xFFFFFFFF
  // ChunkSize counts everything after the first 8 bytes ("RIFF" + the size field itself)
  const RIFF_OVERHEAD = HEADER_SIZE - 8

  // A sample always occupies whole bytes, so round the container size up
  // (identical to bitsPerSample / 8 for the usual 8/16/24/32 bit depths)
  const bytesPerSample = Math.ceil(bitsPerSample / 8)
  const blockAlign = channels * bytesPerSample
  const byteRate = sampleRate * blockAlign

  // Cap the payload size so that ChunkSize (RIFF_OVERHEAD + dataSize) still fits into 32 bits
  const dataSize = Math.min(pcmLength, MAX_UINT32 - RIFF_OVERHEAD)
  const fileSize = RIFF_OVERHEAD + dataSize

  const header = Buffer.alloc(HEADER_SIZE)
  header.write('RIFF', 0) // ChunkID
  header.writeUInt32LE(fileSize, 4) // ChunkSize
  header.write('WAVE', 8) // Format
  header.write('fmt ', 12) // Subchunk1ID
  header.writeUInt32LE(16, 16) // Subchunk1Size (PCM)
  header.writeUInt16LE(1, 20) // AudioFormat (PCM)
  header.writeUInt16LE(channels, 22) // NumChannels
  header.writeUInt32LE(sampleRate, 24) // SampleRate
  header.writeUInt32LE(byteRate, 28) // ByteRate
  header.writeUInt16LE(blockAlign, 32) // BlockAlign
  header.writeUInt16LE(bitsPerSample, 34) // BitsPerSample
  header.write('data', 36) // Subchunk2ID
  header.writeUInt32LE(dataSize, 40) // Subchunk2Size

  return header
}
34+
/**
 * Return the raw PCM payload of a WAV buffer.
 *
 * The RIFF chunks are walked to locate the `data` chunk, so headers that are
 * not exactly 44 bytes long (extended `fmt ` chunk, `LIST`/`fact` chunks in
 * front of the audio) are handled as well. Buffers that do not start with a
 * RIFF header are returned unchanged.
 *
 * @param {Buffer} wavBuffer - WAV file content or raw PCM data.
 * @returns {Buffer} View on the PCM bytes (shares memory with `wavBuffer`),
 *   or `wavBuffer` itself when it is not recognised as RIFF.
 */
const extractPcmFromWav = (wavBuffer) => {
  const CANONICAL_HEADER_SIZE = 44
  const RIFF_PREAMBLE_SIZE = 12 // 'RIFF' + ChunkSize + 'WAVE'
  const CHUNK_HEADER_SIZE = 8 // chunk id + chunk size

  // ">=" so that a header-only file yields an empty payload instead of the header bytes
  const isRiff = wavBuffer.length >= CANONICAL_HEADER_SIZE &&
    wavBuffer.toString('ascii', 0, 4) === 'RIFF'
  if (!isRiff) {
    return wavBuffer // Already PCM or unknown format
  }

  if (wavBuffer.toString('ascii', 8, 12) === 'WAVE') {
    let offset = RIFF_PREAMBLE_SIZE
    while (offset + CHUNK_HEADER_SIZE <= wavBuffer.length) {
      const chunkId = wavBuffer.toString('ascii', offset, offset + 4)
      const chunkSize = wavBuffer.readUInt32LE(offset + 4)
      const bodyStart = offset + CHUNK_HEADER_SIZE

      if (chunkId === 'data') {
        // Streaming encoders write 0 or a placeholder as size - then everything that follows is audio
        const declaredEnd = bodyStart + chunkSize
        const bodyEnd = chunkSize > 0 && declaredEnd <= wavBuffer.length ? declaredEnd : wavBuffer.length
        return wavBuffer.subarray(bodyStart, bodyEnd)
      }

      // Chunk bodies are padded to an even number of bytes
      offset = bodyStart + chunkSize + (chunkSize % 2)
    }
  }

  // No data chunk found: fall back to skipping the canonical header size
  return wavBuffer.subarray(CANONICAL_HEADER_SIZE)
}
42+
943const genderMap = {
1044 Male : 'male' ,
1145 Female : 'female'
@@ -84,6 +118,8 @@ class AzureTTS {
84118 let isStreamClosed = false
85119 let synthesizer = null
86120 let textBuffer = ''
121+ let totalPcmLength = 0 // Track total PCM length for WAV header
122+ let headerSent = false // Track if WAV header was sent
87123
88124 const triggerHistoryEmit = ( ) => {
89125 history . forEach ( data => events . emit ( 'data' , data ) )
@@ -130,17 +166,44 @@ class AzureTTS {
130166 debug ( `Azure TTS result for sentence: ${ text . substring ( 0 , 50 ) } ...` )
131167
132168 if ( result . reason === ResultReason . SynthesizingAudioCompleted ) {
133- const audioData = {
169+ // Extract PCM from WAV
170+ const wavBuffer = Buffer . from ( result . audioData )
171+ const pcmData = extractPcmFromWav ( wavBuffer )
172+
173+ debug ( `Received PCM chunk: ${ pcmData . length } bytes (from WAV: ${ wavBuffer . length } bytes)` )
174+
175+ // Send WAV header once at the beginning
176+ if ( ! headerSent ) {
177+ const placeholderSize = 0xFFFFFFFF - 44 // Max size minus header
178+ const wavHeader = createWavHeader ( placeholderSize )
179+
180+ const headerData = {
181+ status : 'ok' ,
182+ buffer : wavHeader ,
183+ final : false ,
184+ debug : { message : 'WAV header' , audioLength : wavHeader . length }
185+ }
186+ history . push ( headerData )
187+ events . emit ( 'data' , headerData )
188+ headerSent = true
189+ debug ( 'Sent WAV header (44 bytes)' )
190+ }
191+
192+ // Send raw PCM data
193+ totalPcmLength += pcmData . length
194+ const pcmChunkData = {
134195 status : 'ok' ,
135- buffer : Buffer . from ( result . audioData ) ,
196+ buffer : pcmData ,
136197 final : false ,
137198 debug : {
138- audioLength : result . audioData . byteLength ,
199+ message : 'PCM chunk' ,
200+ audioLength : pcmData . length ,
201+ totalPcmSoFar : totalPcmLength ,
139202 sentence : text . substring ( 0 , 100 ) + ( text . length > 100 ? '...' : '' )
140203 }
141204 }
142- history . push ( audioData )
143- events . emit ( 'data' , audioData )
205+ history . push ( pcmChunkData )
206+ events . emit ( 'data' , pcmChunkData )
144207 } else if ( result . reason === ResultReason . Canceled ) {
145208 const errorData = {
146209 status : 'error' ,
@@ -192,17 +255,18 @@ class AzureTTS {
192255
193256 // Signal end of stream
194257 setTimeout ( ( ) => {
195- if ( ! isStreamClosed ) {
196- const endData = {
197- status : 'ok' ,
198- text : '' ,
199- final : true ,
200- debug : { message : 'Stream ended' }
258+ const endData = {
259+ status : 'ok' ,
260+ text : '' ,
261+ final : true ,
262+ debug : {
263+ message : 'Stream ended' ,
264+ totalPcmLength : totalPcmLength
201265 }
202- history . push ( endData )
203- events . emit ( 'data' , endData )
204- close ( )
205266 }
267+ history . push ( endData )
268+ events . emit ( 'data' , endData )
269+ close ( )
206270 } , 500 ) // Give time for final synthesis to complete
207271 }
208272
@@ -232,26 +296,28 @@ class AzureTTS {
232296 // Setup synthesizer event handlers for streaming
233297 synthesizer . synthesisStarted = ( sender , event ) => {
234298 debug ( 'Azure TTS synthesis started' )
235- const startData = {
236- status : 'ok' ,
237- text : '' ,
238- final : false ,
239- debug : { message : 'Synthesis started' , sessionId : event . sessionId }
240- }
241- history . push ( startData )
242- events . emit ( 'data' , startData )
299+ // Only log, don't send metadata
243300 }
244301
245302 synthesizer . synthesizing = ( sender , event ) => {
246303 debug ( `Azure TTS synthesizing: ${ event . result . audioData . byteLength } bytes` )
247304
248305 if ( event . result . audioData . byteLength > 0 ) {
306+ // Extract PCM from WAV chunk and add proper header
307+ const wavBuffer = Buffer . from ( event . result . audioData )
308+ const pcmData = extractPcmFromWav ( wavBuffer )
309+ const newWavHeader = createWavHeader ( pcmData . length )
310+ const finalWavChunk = Buffer . concat ( [ newWavHeader , pcmData ] )
311+
312+ debug ( `Synthesizing chunk: WAV ${ wavBuffer . length } -> PCM ${ pcmData . length } -> WAV ${ finalWavChunk . length } ` )
313+
249314 const audioData = {
250315 status : 'ok' ,
251- buffer : Buffer . from ( event . result . audioData ) ,
316+ buffer : finalWavChunk ,
252317 final : false ,
253318 debug : {
254- audioLength : event . result . audioData . byteLength ,
319+ audioLength : finalWavChunk . length ,
320+ pcmLength : pcmData . length ,
255321 partial : true
256322 }
257323 }
@@ -262,31 +328,12 @@ class AzureTTS {
262328
263329 synthesizer . synthesisCompleted = ( sender , event ) => {
264330 debug ( 'Azure TTS synthesis completed for chunk' )
265-
266- const completeData = {
267- status : 'ok' ,
268- text : '' ,
269- final : false ,
270- debug : {
271- message : 'Chunk synthesis completed' ,
272- resultId : event . result . resultId
273- }
274- }
275- history . push ( completeData )
276- events . emit ( 'data' , completeData )
331+ // Only log, don't send metadata
277332 }
278333
279334 synthesizer . SynthesisCanceled = ( sender , event ) => {
280335 debug ( `Azure TTS synthesis canceled: ${ event . reason } ` )
281-
282- const cancelData = {
283- status : 'error' ,
284- text : '' ,
285- final : true ,
286- err : `Synthesis canceled: ${ event . reason } `
287- }
288- history . push ( cancelData )
289- events . emit ( 'data' , cancelData )
336+ // Only log errors, don't send metadata
290337 }
291338
292339 const openData = {
0 commit comments