
Commit eda5976

feat(stt): add new sad_module param to recognize functions
1 parent 3660b3a commit eda5976

4 files changed (+52, -7 lines)

lib/recognize-stream.ts

Lines changed: 3 additions & 1 deletion
@@ -106,6 +106,7 @@ class RecognizeStream extends Duplex {
    * @param {number} [options.speechDetectorSensitivity] - The sensitivity of speech activity detection that the service is to perform
    * @param {number} [options.backgroundAudioSuppression] - The level to which the service is to suppress background audio based on its volume to prevent it from being transcribed as speech
    * @param {boolean} [params.lowLatency] - If `true` for next-generation `Multimedia` and `Telephony` models that support low latency, directs the service to produce results even more quickly than it usually does
+   * @param {number} [params.sadModule] - Detects speech boundaries within the audio stream with better performance, improved noise suppression, faster responsiveness, and increased accuracy.
    * @constructor
    */
   constructor(options: RecognizeStream.Options) {
@@ -182,7 +183,8 @@ class RecognizeStream extends Duplex {
       'split_transcript_at_phrase_end',
       'speech_detector_sensitivity',
       'background_audio_suppression',
-      'low_latency'
+      'low_latency',
+      'sad_module'
     ];
     const openingMessage = processUserParameters(options, openingMessageParamsAllowed);
     openingMessage.action = 'start';

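For context, here is a minimal sketch (illustrative, not part of this commit) of how the new option could be used over the WebSocket interface, assuming the SDK's `recognizeUsingWebSocket()` entry point and IAM authentication; the stream above forwards the camelCase `sadModule` option as the `sad_module` field of the opening message. The API key, service URL, audio file, and model name below are placeholders.

// Illustrative only: exercise the new WebSocket option end to end.
// Assumes IAM authentication; apikey, service URL, audio file, and model are placeholders.
import * as fs from 'fs';
import SpeechToTextV1 = require('ibm-watson/speech-to-text/v1');
import { IamAuthenticator } from 'ibm-watson/auth';

const speechToText = new SpeechToTextV1({
  authenticator: new IamAuthenticator({ apikey: '<apikey>' }),
  serviceUrl: '<service-url>',
});

const recognizeStream = speechToText.recognizeUsingWebSocket({
  contentType: 'audio/wav',
  model: 'en-US_Telephony',
  objectMode: true, // emit parsed result objects instead of raw text
  sadModule: 2,     // forwarded as `sad_module` in the WebSocket opening message
});

fs.createReadStream('audio.wav').pipe(recognizeStream);
recognizeStream.on('data', (event) => console.log(JSON.stringify(event)));
recognizeStream.on('error', (err) => console.error(err));
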
speech-to-text/v1-generated.ts

Lines changed: 41 additions & 5 deletions
@@ -339,8 +339,9 @@ class SpeechToTextV1 extends BaseService {
   * @param {boolean} [params.speechBeginEvent] - If `true`, the service returns a response object `SpeechActivity`
   * which contains the time when a speech activity is detected in the stream. This can be used both in standard and low
   * latency mode. This feature enables client applications to know that some words/speech has been detected and the
-  * service is in the process of decoding. This can be used in lieu of interim results in standard mode. See [Using
-  * speech recognition
+  * service is in the process of decoding. This can be used in lieu of interim results in standard mode. Use
+  * `sad_module: 2` to increase accuracy and performance in detecting speech boundaries within the audio stream. See
+  * [Using speech recognition
   * parameters](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-service-features#features-parameters).
   * @param {string} [params.languageCustomizationId] - The customization ID (GUID) of a custom language model that is
   * to be used with the recognition request. The base model of the specified custom language model must match the model
@@ -508,6 +509,13 @@ class SpeechToTextV1 extends BaseService {
   * sensitivity](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-sensitivity)
   * and [Language model
   * support](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-support).
+  * @param {number} [params.sadModule] - Detects speech boundaries within the audio stream with better performance,
+  * improved noise suppression, faster responsiveness, and increased accuracy.
+  *
+  * Specify `sad_module: 2`
+  *
+  * See [Speech Activity Detection
+  * (SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad).
   * @param {number} [params.backgroundAudioSuppression] - The level to which the service is to suppress background
   * audio based on its volume to prevent it from being transcribed as speech. Use the parameter to suppress side
   * conversations or background noise.
@@ -561,7 +569,7 @@ class SpeechToTextV1 extends BaseService {
  ): Promise<SpeechToTextV1.Response<SpeechToTextV1.SpeechRecognitionResults>> {
    const _params = { ...params };
    const _requiredParams = ['audio'];
-   const _validParams = ['audio', 'contentType', 'model', 'speechBeginEvent', 'languageCustomizationId', 'acousticCustomizationId', 'baseModelVersion', 'customizationWeight', 'inactivityTimeout', 'keywords', 'keywordsThreshold', 'maxAlternatives', 'wordAlternativesThreshold', 'wordConfidence', 'timestamps', 'profanityFilter', 'smartFormatting', 'smartFormattingVersion', 'speakerLabels', 'grammarName', 'redaction', 'audioMetrics', 'endOfPhraseSilenceTime', 'splitTranscriptAtPhraseEnd', 'speechDetectorSensitivity', 'backgroundAudioSuppression', 'lowLatency', 'characterInsertionBias', 'signal', 'headers'];
+   const _validParams = ['audio', 'contentType', 'model', 'speechBeginEvent', 'languageCustomizationId', 'acousticCustomizationId', 'baseModelVersion', 'customizationWeight', 'inactivityTimeout', 'keywords', 'keywordsThreshold', 'maxAlternatives', 'wordAlternativesThreshold', 'wordConfidence', 'timestamps', 'profanityFilter', 'smartFormatting', 'smartFormattingVersion', 'speakerLabels', 'grammarName', 'redaction', 'audioMetrics', 'endOfPhraseSilenceTime', 'splitTranscriptAtPhraseEnd', 'speechDetectorSensitivity', 'sadModule', 'backgroundAudioSuppression', 'lowLatency', 'characterInsertionBias', 'signal', 'headers'];
    const _validationErrors = validateParams(_params, _requiredParams, _validParams);
    if (_validationErrors) {
      return Promise.reject(_validationErrors);
@@ -592,6 +600,7 @@ class SpeechToTextV1 extends BaseService {
      'end_of_phrase_silence_time': _params.endOfPhraseSilenceTime,
      'split_transcript_at_phrase_end': _params.splitTranscriptAtPhraseEnd,
      'speech_detector_sensitivity': _params.speechDetectorSensitivity,
+     'sad_module': _params.sadModule,
      'background_audio_suppression': _params.backgroundAudioSuppression,
      'low_latency': _params.lowLatency,
      'character_insertion_bias': _params.characterInsertionBias,
@@ -1116,6 +1125,13 @@ class SpeechToTextV1 extends BaseService {
   * sensitivity](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-sensitivity)
   * and [Language model
   * support](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-support).
+  * @param {number} [params.sadModule] - Detects speech boundaries within the audio stream with better performance,
+  * improved noise suppression, faster responsiveness, and increased accuracy.
+  *
+  * Specify `sad_module: 2`
+  *
+  * See [Speech Activity Detection
+  * (SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad).
   * @param {number} [params.backgroundAudioSuppression] - The level to which the service is to suppress background
   * audio based on its volume to prevent it from being transcribed as speech. Use the parameter to suppress side
   * conversations or background noise.
@@ -1169,7 +1185,7 @@ class SpeechToTextV1 extends BaseService {
  ): Promise<SpeechToTextV1.Response<SpeechToTextV1.RecognitionJob>> {
    const _params = { ...params };
    const _requiredParams = ['audio'];
-   const _validParams = ['audio', 'contentType', 'model', 'callbackUrl', 'events', 'userToken', 'resultsTtl', 'languageCustomizationId', 'acousticCustomizationId', 'baseModelVersion', 'customizationWeight', 'inactivityTimeout', 'keywords', 'keywordsThreshold', 'maxAlternatives', 'wordAlternativesThreshold', 'wordConfidence', 'timestamps', 'profanityFilter', 'smartFormatting', 'smartFormattingVersion', 'speakerLabels', 'grammarName', 'redaction', 'processingMetrics', 'processingMetricsInterval', 'audioMetrics', 'endOfPhraseSilenceTime', 'splitTranscriptAtPhraseEnd', 'speechDetectorSensitivity', 'backgroundAudioSuppression', 'lowLatency', 'characterInsertionBias', 'signal', 'headers'];
+   const _validParams = ['audio', 'contentType', 'model', 'callbackUrl', 'events', 'userToken', 'resultsTtl', 'languageCustomizationId', 'acousticCustomizationId', 'baseModelVersion', 'customizationWeight', 'inactivityTimeout', 'keywords', 'keywordsThreshold', 'maxAlternatives', 'wordAlternativesThreshold', 'wordConfidence', 'timestamps', 'profanityFilter', 'smartFormatting', 'smartFormattingVersion', 'speakerLabels', 'grammarName', 'redaction', 'processingMetrics', 'processingMetricsInterval', 'audioMetrics', 'endOfPhraseSilenceTime', 'splitTranscriptAtPhraseEnd', 'speechDetectorSensitivity', 'sadModule', 'backgroundAudioSuppression', 'lowLatency', 'characterInsertionBias', 'signal', 'headers'];
    const _validationErrors = validateParams(_params, _requiredParams, _validParams);
    if (_validationErrors) {
      return Promise.reject(_validationErrors);
@@ -1205,6 +1221,7 @@ class SpeechToTextV1 extends BaseService {
      'end_of_phrase_silence_time': _params.endOfPhraseSilenceTime,
      'split_transcript_at_phrase_end': _params.splitTranscriptAtPhraseEnd,
      'speech_detector_sensitivity': _params.speechDetectorSensitivity,
+     'sad_module': _params.sadModule,
      'background_audio_suppression': _params.backgroundAudioSuppression,
      'low_latency': _params.lowLatency,
      'character_insertion_bias': _params.characterInsertionBias,
@@ -4334,7 +4351,8 @@ namespace SpeechToTextV1 {
    /** If `true`, the service returns a response object `SpeechActivity` which contains the time when a speech
     * activity is detected in the stream. This can be used both in standard and low latency mode. This feature enables
     * client applications to know that some words/speech has been detected and the service is in the process of
-    * decoding. This can be used in lieu of interim results in standard mode. See [Using speech recognition
+    * decoding. This can be used in lieu of interim results in standard mode. Use `sad_module: 2` to increase accuracy
+    * and performance in detecting speech boundaries within the audio stream. See [Using speech recognition
     * parameters](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-service-features#features-parameters).
     */
    speechBeginEvent?: boolean;
@@ -4541,6 +4559,15 @@ namespace SpeechToTextV1 {
     * support](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-support).
     */
    speechDetectorSensitivity?: number;
+   /** Detects speech boundaries within the audio stream with better performance, improved noise suppression,
+    * faster responsiveness, and increased accuracy.
+    *
+    * Specify `sad_module: 2`
+    *
+    * See [Speech Activity Detection
+    * (SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad).
+    */
+   sadModule?: number;
    /** The level to which the service is to suppress background audio based on its volume to prevent it from being
     * transcribed as speech. Use the parameter to suppress side conversations or background noise.
     *
@@ -5009,6 +5036,15 @@ namespace SpeechToTextV1 {
     * support](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-support).
     */
    speechDetectorSensitivity?: number;
+   /** Detects speech boundaries within the audio stream with better performance, improved noise suppression,
+    * faster responsiveness, and increased accuracy.
+    *
+    * Specify `sad_module: 2`
+    *
+    * See [Speech Activity Detection
+    * (SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad).
+    */
+   sadModule?: number;
    /** The level to which the service is to suppress background audio based on its volume to prevent it from being
     * transcribed as speech. Use the parameter to suppress side conversations or background noise.
     *

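A companion sketch (again illustrative, not part of the diff) of the HTTP path: `sadModule` passes the `_validParams` check above and is serialized as the `sad_module` query parameter. The API key, service URL, audio file, and model below are placeholders; `createJob()` accepts the same parameter for asynchronous recognition, as the second `_validParams` change shows.

// Illustrative only: the synchronous HTTP request with the new parameter.
// apikey, service URL, audio file, and model are placeholders.
import * as fs from 'fs';
import SpeechToTextV1 = require('ibm-watson/speech-to-text/v1');
import { IamAuthenticator } from 'ibm-watson/auth';

const speechToText = new SpeechToTextV1({
  authenticator: new IamAuthenticator({ apikey: '<apikey>' }),
  serviceUrl: '<service-url>',
});

speechToText
  .recognize({
    audio: fs.createReadStream('audio.flac'),
    contentType: 'audio/flac',
    model: 'en-US_Multimedia',
    sadModule: 2, // sent as the `sad_module` query parameter
  })
  .then((response) => console.log(JSON.stringify(response.result, null, 2)))
  .catch((err) => console.error(err));
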
speech-to-text/v1.ts

Lines changed: 1 addition & 0 deletions
@@ -286,6 +286,7 @@ namespace SpeechToTextV1 {
    speechDetectorSensitivity?: number;
    backgroundAudioSuppression?: number;
    characterInsertionBias?: number;
+   sadModule?: number;
  }
 }

test/unit/speech-to-text.v1.test.js

Lines changed: 7 additions & 1 deletion
@@ -1,5 +1,5 @@
 /**
- * (C) Copyright IBM Corp. 2018, 2024.
+ * (C) Copyright IBM Corp. 2025.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -295,6 +295,7 @@ describe('SpeechToTextV1', () => {
      const endOfPhraseSilenceTime = 0.8;
      const splitTranscriptAtPhraseEnd = false;
      const speechDetectorSensitivity = 0.5;
+     const sadModule = 1;
      const backgroundAudioSuppression = 0.0;
      const lowLatency = false;
      const characterInsertionBias = 0.0;
@@ -324,6 +325,7 @@ describe('SpeechToTextV1', () => {
        endOfPhraseSilenceTime,
        splitTranscriptAtPhraseEnd,
        speechDetectorSensitivity,
+       sadModule,
        backgroundAudioSuppression,
        lowLatency,
        characterInsertionBias,
@@ -368,6 +370,7 @@ describe('SpeechToTextV1', () => {
      expect(mockRequestOptions.qs.end_of_phrase_silence_time).toEqual(endOfPhraseSilenceTime);
      expect(mockRequestOptions.qs.split_transcript_at_phrase_end).toEqual(splitTranscriptAtPhraseEnd);
      expect(mockRequestOptions.qs.speech_detector_sensitivity).toEqual(speechDetectorSensitivity);
+     expect(mockRequestOptions.qs.sad_module).toEqual(sadModule);
      expect(mockRequestOptions.qs.background_audio_suppression).toEqual(backgroundAudioSuppression);
      expect(mockRequestOptions.qs.low_latency).toEqual(lowLatency);
      expect(mockRequestOptions.qs.character_insertion_bias).toEqual(characterInsertionBias);
@@ -636,6 +639,7 @@ describe('SpeechToTextV1', () => {
      const endOfPhraseSilenceTime = 0.8;
      const splitTranscriptAtPhraseEnd = false;
      const speechDetectorSensitivity = 0.5;
+     const sadModule = 1;
      const backgroundAudioSuppression = 0.0;
      const lowLatency = false;
      const characterInsertionBias = 0.0;
@@ -670,6 +674,7 @@ describe('SpeechToTextV1', () => {
        endOfPhraseSilenceTime,
        splitTranscriptAtPhraseEnd,
        speechDetectorSensitivity,
+       sadModule,
        backgroundAudioSuppression,
        lowLatency,
        characterInsertionBias,
@@ -719,6 +724,7 @@ describe('SpeechToTextV1', () => {
      expect(mockRequestOptions.qs.end_of_phrase_silence_time).toEqual(endOfPhraseSilenceTime);
      expect(mockRequestOptions.qs.split_transcript_at_phrase_end).toEqual(splitTranscriptAtPhraseEnd);
      expect(mockRequestOptions.qs.speech_detector_sensitivity).toEqual(speechDetectorSensitivity);
+     expect(mockRequestOptions.qs.sad_module).toEqual(sadModule);
      expect(mockRequestOptions.qs.background_audio_suppression).toEqual(backgroundAudioSuppression);
      expect(mockRequestOptions.qs.low_latency).toEqual(lowLatency);
      expect(mockRequestOptions.qs.character_insertion_bias).toEqual(characterInsertionBias);
