Skip to content

Commit cded7eb

Browse files
committed
tts streaming
1 parent 78126b3 commit cded7eb

File tree

4 files changed

+373
-56
lines changed

4 files changed

+373
-56
lines changed

frontend/src/test-tts-streaming.js

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
#!/usr/bin/env node
2+
3+
const WebSocket = require('ws')
4+
const axios = require('axios')
5+
const fs = require('fs')
6+
7+
// Configuration
8+
const BASE_URL = 'http://127.0.0.1:56000'
9+
const LANGUAGE = 'en'
10+
const TTS_PROVIDER = 'azure'
11+
const VOICE = 'en-US-JennyNeural'
12+
13+
const TEST_TEXT = "Hello, this is a test of TTS streaming. This text will be sent in chunks to demonstrate real-time synthesis."
14+
15+
/**
 * End-to-end smoke test for the TTS streaming endpoint.
 *
 * Flow:
 *   1. POST to `/api/ttsstream/<language>` to open a streaming session; the
 *      response is expected to carry `wsUri` and `endUri`.
 *   2. Connect to `wsUri` and send TEST_TEXT a few words at a time.
 *   3. Collect the WAV header and PCM chunks pushed back over the socket.
 *   4. Call `endUri`, close the socket, and write the combined audio to
 *      `tts-test-<provider>-<timestamp>.wav` in the current directory.
 *
 * The returned promise resolves as soon as the WebSocket handlers are
 * registered; the rest of the test runs from event and timer callbacks.
 * Failures are logged and flagged through `process.exitCode`.
 *
 * @returns {Promise<void>}
 */
async function testTTSStreaming() {
  console.log('🚀 Testing TTS Streaming')
  console.log(`Provider: ${TTS_PROVIDER}, Language: ${LANGUAGE}, Voice: ${VOICE}`)

  const WAV_HEADER_SIZE = 44
  const WORDS_PER_CHUNK = 3

  // True when the buffer starts with the RIFF magic bytes.
  const startsWithRiff = (buf) => buf.toString('ascii', 0, 4) === 'RIFF'

  // Parse a payload as JSON and return it only when it is an object.
  // Returning null for scalars keeps a PCM chunk that happens to be
  // JSON-parseable (e.g. the bytes "12") from being dropped as metadata.
  const parseJsonObject = (raw) => {
    try {
      const parsed = JSON.parse(raw.toString())
      return parsed !== null && typeof parsed === 'object' ? parsed : null
    } catch (e) {
      return null
    }
  }

  try {
    // 1. Open streaming session
    console.log('📡 Opening stream...')
    // Encode every dynamic URL part so voices/providers containing reserved
    // characters cannot corrupt the request.
    const query = new URLSearchParams({ tts: TTS_PROVIDER, voice: VOICE })
    const openUrl = `${BASE_URL}/api/ttsstream/${encodeURIComponent(LANGUAGE)}?${query}`
    const response = await axios.post(openUrl)
    const { wsUri, endUri } = response.data
    console.log('✅ Stream opened:', wsUri)

    // 2. Connect WebSocket
    const ws = new WebSocket(wsUri)
    let wavHeader = null
    const pcmChunks = []

    ws.on('open', () => {
      console.log('🔌 WebSocket connected')
      console.log('📋 Event: WebSocket opened')

      // Send text in chunks
      const words = TEST_TEXT.split(' ')
      let i = 0
      const sendChunk = () => {
        // The socket may have been closed (server error, network drop) while
        // a timer was pending; stop instead of writing to a dead socket.
        if (ws.readyState !== WebSocket.OPEN) {
          console.log('⚠️ WebSocket no longer open, stopping text transmission')
          return
        }

        if (i < words.length) {
          const chunk = words.slice(i, i + WORDS_PER_CHUNK).join(' ') + ' '
          console.log(`📤 Sending: "${chunk.trim()}"`)
          ws.send(chunk)
          i += WORDS_PER_CHUNK
          setTimeout(sendChunk, 1000)
          return
        }

        console.log('📤 All text sent, ending stream...')
        // End the stream and close WebSocket
        setTimeout(async () => {
          try {
            await axios.get(endUri)
            console.log('📡 Stream ended via API')
          } catch (err) {
            console.log('⚠️ End API call failed:', err.message)
          }

          // Force close WebSocket after a delay to receive final audio
          setTimeout(() => {
            if (ws.readyState === WebSocket.OPEN) {
              console.log('🔌 Closing WebSocket...')
              ws.close()
            }
          }, 2000)
        }, 1000)
      }
      setTimeout(sendChunk, 500)
    })

    ws.on('message', (data) => {
      // Text frames delivered as strings: status/metadata only.
      if (!Buffer.isBuffer(data)) {
        const msg = parseJsonObject(data)
        if (msg) {
          console.log(`📋 Status: ${msg.status}, Final: ${msg.final}`)
          console.log('📋 Event: JSON message received')
          if (msg.debug) {
            console.log(`   Debug:`, msg.debug)
          }
        } else {
          console.log(`📋 Raw message: ${data.toString()}`)
          console.log('📋 Event: Raw message received')
        }
        return
      }

      // Check if it's a WAV header or PCM data
      if (data.length === WAV_HEADER_SIZE && startsWithRiff(data)) {
        // It's a WAV header
        wavHeader = data
        console.log(`📄 WAV header received: ${data.length} bytes`)
        console.log('📋 Event: WAV header received')
        return
      }

      if (data.length > WAV_HEADER_SIZE && startsWithRiff(data)) {
        // It's a complete WAV file (fallback for providers that still send complete WAVs)
        const header = data.subarray(0, WAV_HEADER_SIZE)
        const pcm = data.subarray(WAV_HEADER_SIZE)
        if (!wavHeader) wavHeader = header
        pcmChunks.push(pcm)
        console.log(`🔊 Complete WAV: ${data.length} bytes (header: 44, PCM: ${pcm.length})`)
        console.log('📋 Event: Complete WAV received')
        return
      }

      // It's raw PCM data or JSON metadata
      const msg = parseJsonObject(data)
      if (msg) {
        console.log(`📋 Metadata: ${msg.status}, Final: ${msg.final}`)
        console.log('📋 Event: Metadata received')
        if (msg.debug) {
          console.log(`   Debug:`, msg.debug)
        }
        return
      }

      // It's raw PCM data
      pcmChunks.push(data)
      console.log(`🔊 PCM chunk: ${data.length} bytes`)
      console.log('📋 Event: PCM chunk received')
    })

    ws.on('close', (code, reason) => {
      console.log(`🔌 WebSocket closed: code=${code}, reason=${reason}`)
      console.log('📋 Event: WebSocket closed')
      if (wavHeader && pcmChunks.length > 0) {
        console.log(`🔧 Combining WAV header + ${pcmChunks.length} PCM chunks...`)

        // Combine all PCM data
        const combinedPcm = Buffer.concat(pcmChunks)

        // Fix WAV header with actual PCM length (copy so the received
        // header buffer is left untouched)
        const correctedHeader = Buffer.from(wavHeader)
        const actualDataSize = combinedPcm.length
        const actualFileSize = 36 + actualDataSize

        correctedHeader.writeUInt32LE(actualFileSize, 4) // Fix ChunkSize
        correctedHeader.writeUInt32LE(actualDataSize, 40) // Fix Subchunk2Size

        const finalWav = Buffer.concat([correctedHeader, combinedPcm])

        const filename = `tts-test-${TTS_PROVIDER}-${Date.now()}.wav`
        try {
          fs.writeFileSync(filename, finalWav)
          console.log(`💾 WAV file saved: ${filename}`)
          console.log(`   Header: 44 bytes, PCM: ${combinedPcm.length} bytes, Total: ${finalWav.length} bytes`)
          console.log('✅ Audio should be playable now')
        } catch (err) {
          // A write failure inside an event handler would otherwise surface
          // as an uncaught exception.
          console.error('❌ Failed to save WAV file:', err.message)
          process.exitCode = 1
        }
      } else if (wavHeader) {
        console.log('⚠️ WAV header received but no PCM chunks')
      } else if (pcmChunks.length > 0) {
        console.log('⚠️ PCM chunks received but no WAV header')
      } else {
        console.log('⚠️ No audio data received')
      }
      console.log('✅ Test completed')
    })

    ws.on('error', (err) => {
      console.error('❌ WebSocket error:', err.message)
      console.log('📋 Event: WebSocket error')
      process.exitCode = 1
    })
  } catch (error) {
    console.error('❌ Test failed:', error.response?.data || error.message)
    // Let shells and CI detect the failure.
    process.exitCode = 1
  }
}
160+
161+
testTTSStreaming()

frontend/src/tts/azure.js

Lines changed: 92 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,40 @@ const debug = require('debug')('botium-speech-processing-azure-tts')
66

77
const { azureSpeechConfig, applyExtraAzureSpeechConfig, getAzureErrorDetails, ttsFilename } = require('../utils')
88

9+
/**
 * Create a canonical 44-byte RIFF/WAVE header for uncompressed PCM audio.
 *
 * The RIFF size fields are unsigned 32-bit integers. Lengths that do not fit
 * (including `Infinity` for "unknown / still streaming") are clamped to
 * 0xFFFFFFFF instead of raising a RangeError, so callers can request a
 * placeholder header without computing a magic maximum themselves.
 *
 * @param {number} pcmLength - Size of the PCM payload in bytes.
 * @param {number} [sampleRate=16000] - Samples per second.
 * @param {number} [channels=1] - Number of audio channels.
 * @param {number} [bitsPerSample=16] - Bits per sample.
 * @returns {Buffer} A new 44-byte buffer holding the WAV header.
 */
const createWavHeader = (pcmLength, sampleRate = 16000, channels = 1, bitsPerSample = 16) => {
  const UINT32_MAX = 0xFFFFFFFF
  const header = Buffer.alloc(44)
  const bytesPerSample = bitsPerSample / 8
  const byteRate = sampleRate * channels * bytesPerSample
  const blockAlign = channels * bytesPerSample
  // Clamp both size fields so oversized or unknown lengths stay writable.
  const dataSize = Math.min(pcmLength, UINT32_MAX)
  // ChunkSize counts everything after the first 8 bytes: 36 header bytes + data.
  const fileSize = Math.min(36 + dataSize, UINT32_MAX)

  header.write('RIFF', 0, 'ascii') // ChunkID
  header.writeUInt32LE(fileSize, 4) // ChunkSize
  header.write('WAVE', 8, 'ascii') // Format
  header.write('fmt ', 12, 'ascii') // Subchunk1ID
  header.writeUInt32LE(16, 16) // Subchunk1Size (PCM)
  header.writeUInt16LE(1, 20) // AudioFormat (PCM)
  header.writeUInt16LE(channels, 22) // NumChannels
  header.writeUInt32LE(sampleRate, 24) // SampleRate
  header.writeUInt32LE(byteRate, 28) // ByteRate
  header.writeUInt16LE(blockAlign, 32) // BlockAlign
  header.writeUInt16LE(bitsPerSample, 34) // BitsPerSample
  header.write('data', 36, 'ascii') // Subchunk2ID
  header.writeUInt32LE(dataSize, 40) // Subchunk2Size

  return header
}
34+
35+
/**
 * Extract the raw PCM payload from a buffer that may be a RIFF/WAVE file.
 *
 * When the buffer starts with a RIFF/WAVE preamble, the RIFF chunk list is
 * walked to find the `data` chunk and everything after its 8-byte chunk
 * header is returned. This handles headers that carry extra chunks (e.g.
 * `LIST`) and header-only buffers, which yield an empty payload rather than
 * leaking header bytes into the audio. The declared data size is ignored
 * because streaming producers often write a placeholder there.
 *
 * If no `data` chunk can be located, a RIFF buffer longer than 44 bytes falls
 * back to skipping the canonical 44-byte header. Anything else is returned
 * unchanged.
 *
 * @param {Buffer} wavBuffer - WAV file bytes or raw PCM bytes.
 * @returns {Buffer} A view on the PCM bytes (no copy), or `wavBuffer` itself
 *   when it is not recognised as WAV.
 */
const extractPcmFromWav = (wavBuffer) => {
  const CANONICAL_HEADER_SIZE = 44
  const RIFF_PREAMBLE_SIZE = 12 // 'RIFF' + uint32 size + 'WAVE'
  const CHUNK_HEADER_SIZE = 8 // 4-byte id + uint32 size

  if (wavBuffer.toString('ascii', 0, 4) !== 'RIFF') {
    return wavBuffer // Already PCM or unknown format
  }

  if (wavBuffer.length >= RIFF_PREAMBLE_SIZE && wavBuffer.toString('ascii', 8, 12) === 'WAVE') {
    let offset = RIFF_PREAMBLE_SIZE
    while (offset + CHUNK_HEADER_SIZE <= wavBuffer.length) {
      const chunkId = wavBuffer.toString('ascii', offset, offset + 4)
      if (chunkId === 'data') {
        return wavBuffer.subarray(offset + CHUNK_HEADER_SIZE)
      }
      const chunkSize = wavBuffer.readUInt32LE(offset + 4)
      // RIFF chunks are word-aligned: odd sizes are followed by a pad byte.
      offset += CHUNK_HEADER_SIZE + chunkSize + (chunkSize % 2)
    }
  }

  // No data chunk found: keep the historical "skip 44 bytes" behaviour.
  if (wavBuffer.length > CANONICAL_HEADER_SIZE) {
    return wavBuffer.subarray(CANONICAL_HEADER_SIZE)
  }
  return wavBuffer
}
42+
943
const genderMap = {
1044
Male: 'male',
1145
Female: 'female'
@@ -84,6 +118,8 @@ class AzureTTS {
84118
let isStreamClosed = false
85119
let synthesizer = null
86120
let textBuffer = ''
121+
let totalPcmLength = 0 // Track total PCM length for WAV header
122+
let headerSent = false // Track if WAV header was sent
87123

88124
const triggerHistoryEmit = () => {
89125
history.forEach(data => events.emit('data', data))
@@ -130,17 +166,44 @@ class AzureTTS {
130166
debug(`Azure TTS result for sentence: ${text.substring(0, 50)}...`)
131167

132168
if (result.reason === ResultReason.SynthesizingAudioCompleted) {
133-
const audioData = {
169+
// Extract PCM from WAV
170+
const wavBuffer = Buffer.from(result.audioData)
171+
const pcmData = extractPcmFromWav(wavBuffer)
172+
173+
debug(`Received PCM chunk: ${pcmData.length} bytes (from WAV: ${wavBuffer.length} bytes)`)
174+
175+
// Send WAV header once at the beginning
176+
if (!headerSent) {
177+
const placeholderSize = 0xFFFFFFFF - 44 // Max size minus header
178+
const wavHeader = createWavHeader(placeholderSize)
179+
180+
const headerData = {
181+
status: 'ok',
182+
buffer: wavHeader,
183+
final: false,
184+
debug: { message: 'WAV header', audioLength: wavHeader.length }
185+
}
186+
history.push(headerData)
187+
events.emit('data', headerData)
188+
headerSent = true
189+
debug('Sent WAV header (44 bytes)')
190+
}
191+
192+
// Send raw PCM data
193+
totalPcmLength += pcmData.length
194+
const pcmChunkData = {
134195
status: 'ok',
135-
buffer: Buffer.from(result.audioData),
196+
buffer: pcmData,
136197
final: false,
137198
debug: {
138-
audioLength: result.audioData.byteLength,
199+
message: 'PCM chunk',
200+
audioLength: pcmData.length,
201+
totalPcmSoFar: totalPcmLength,
139202
sentence: text.substring(0, 100) + (text.length > 100 ? '...' : '')
140203
}
141204
}
142-
history.push(audioData)
143-
events.emit('data', audioData)
205+
history.push(pcmChunkData)
206+
events.emit('data', pcmChunkData)
144207
} else if (result.reason === ResultReason.Canceled) {
145208
const errorData = {
146209
status: 'error',
@@ -192,17 +255,18 @@ class AzureTTS {
192255

193256
// Signal end of stream
194257
setTimeout(() => {
195-
if (!isStreamClosed) {
196-
const endData = {
197-
status: 'ok',
198-
text: '',
199-
final: true,
200-
debug: { message: 'Stream ended' }
258+
const endData = {
259+
status: 'ok',
260+
text: '',
261+
final: true,
262+
debug: {
263+
message: 'Stream ended',
264+
totalPcmLength: totalPcmLength
201265
}
202-
history.push(endData)
203-
events.emit('data', endData)
204-
close()
205266
}
267+
history.push(endData)
268+
events.emit('data', endData)
269+
close()
206270
}, 500) // Give time for final synthesis to complete
207271
}
208272

@@ -232,26 +296,28 @@ class AzureTTS {
232296
// Setup synthesizer event handlers for streaming
233297
synthesizer.synthesisStarted = (sender, event) => {
234298
debug('Azure TTS synthesis started')
235-
const startData = {
236-
status: 'ok',
237-
text: '',
238-
final: false,
239-
debug: { message: 'Synthesis started', sessionId: event.sessionId }
240-
}
241-
history.push(startData)
242-
events.emit('data', startData)
299+
// Only log, don't send metadata
243300
}
244301

245302
synthesizer.synthesizing = (sender, event) => {
246303
debug(`Azure TTS synthesizing: ${event.result.audioData.byteLength} bytes`)
247304

248305
if (event.result.audioData.byteLength > 0) {
306+
// Extract PCM from WAV chunk and add proper header
307+
const wavBuffer = Buffer.from(event.result.audioData)
308+
const pcmData = extractPcmFromWav(wavBuffer)
309+
const newWavHeader = createWavHeader(pcmData.length)
310+
const finalWavChunk = Buffer.concat([newWavHeader, pcmData])
311+
312+
debug(`Synthesizing chunk: WAV ${wavBuffer.length} -> PCM ${pcmData.length} -> WAV ${finalWavChunk.length}`)
313+
249314
const audioData = {
250315
status: 'ok',
251-
buffer: Buffer.from(event.result.audioData),
316+
buffer: finalWavChunk,
252317
final: false,
253318
debug: {
254-
audioLength: event.result.audioData.byteLength,
319+
audioLength: finalWavChunk.length,
320+
pcmLength: pcmData.length,
255321
partial: true
256322
}
257323
}
@@ -262,31 +328,12 @@ class AzureTTS {
262328

263329
synthesizer.synthesisCompleted = (sender, event) => {
264330
debug('Azure TTS synthesis completed for chunk')
265-
266-
const completeData = {
267-
status: 'ok',
268-
text: '',
269-
final: false,
270-
debug: {
271-
message: 'Chunk synthesis completed',
272-
resultId: event.result.resultId
273-
}
274-
}
275-
history.push(completeData)
276-
events.emit('data', completeData)
331+
// Only log, don't send metadata
277332
}
278333

279334
synthesizer.SynthesisCanceled = (sender, event) => {
280335
debug(`Azure TTS synthesis canceled: ${event.reason}`)
281-
282-
const cancelData = {
283-
status: 'error',
284-
text: '',
285-
final: true,
286-
err: `Synthesis canceled: ${event.reason}`
287-
}
288-
history.push(cancelData)
289-
events.emit('data', cancelData)
336+
// Only log errors, don't send metadata
290337
}
291338

292339
const openData = {

0 commit comments

Comments
 (0)