
Commit eda5976

feat(stt): add new sad_module param to recognize functions
1 parent 3660b3a commit eda5976

4 files changed (+52, -7 lines)

lib/recognize-stream.ts

Lines changed: 3 additions & 1 deletion
@@ -106,6 +106,7 @@ class RecognizeStream extends Duplex {
    * @param {number} [options.speechDetectorSensitivity] - The sensitivity of speech activity detection that the service is to perform
    * @param {number} [options.backgroundAudioSuppression] - The level to which the service is to suppress background audio based on its volume to prevent it from being transcribed as speech
    * @param {boolean} [params.lowLatency] - If `true` for next-generation `Multimedia` and `Telephony` models that support low latency, directs the service to produce results even more quickly than it usually does
+   * @param {number} [params.sadModule] - Detects speech boundaries within the audio stream with better performance, improved noise suppression, faster responsiveness, and increased accuracy.
    * @constructor
    */
   constructor(options: RecognizeStream.Options) {
@@ -182,7 +183,8 @@ class RecognizeStream extends Duplex {
       'split_transcript_at_phrase_end',
       'speech_detector_sensitivity',
       'background_audio_suppression',
-      'low_latency'
+      'low_latency',
+      'sad_module'
     ];
     const openingMessage = processUserParameters(options, openingMessageParamsAllowed);
     openingMessage.action = 'start';

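For context, here is a minimal sketch (illustrative, not part of this commit) of how the new option could be used over the WebSocket interface, assuming the SDK's `recognizeUsingWebSocket()` entry point and IAM authentication; the stream above forwards the camelCase `sadModule` option as the `sad_module` field of the opening message. The API key, service URL, audio file, and model name below are placeholders.

// Illustrative only: exercise the new WebSocket option end to end.
// Assumes IAM authentication; apikey, service URL, audio file, and model are placeholders.
import * as fs from 'fs';
import SpeechToTextV1 = require('ibm-watson/speech-to-text/v1');
import { IamAuthenticator } from 'ibm-watson/auth';

const speechToText = new SpeechToTextV1({
  authenticator: new IamAuthenticator({ apikey: '<apikey>' }),
  serviceUrl: '<service-url>',
});

const recognizeStream = speechToText.recognizeUsingWebSocket({
  contentType: 'audio/wav',
  model: 'en-US_Telephony',
  objectMode: true, // emit parsed result objects instead of raw text
  sadModule: 2,     // forwarded as `sad_module` in the WebSocket opening message
});

fs.createReadStream('audio.wav').pipe(recognizeStream);
recognizeStream.on('data', (event) => console.log(JSON.stringify(event)));
recognizeStream.on('error', (err) => console.error(err));
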
speech-to-text/v1-generated.ts

Lines changed: 41 additions & 5 deletions
@@ -339,8 +339,9 @@ class SpeechToTextV1 extends BaseService {
   * @param {boolean} [params.speechBeginEvent] - If `true`, the service returns a response object `SpeechActivity`
   * which contains the time when a speech activity is detected in the stream. This can be used both in standard and low
   * latency mode. This feature enables client applications to know that some words/speech has been detected and the
-  * service is in the process of decoding. This can be used in lieu of interim results in standard mode. See [Using
-  * speech recognition
+  * service is in the process of decoding. This can be used in lieu of interim results in standard mode. Use
+  * `sad_module: 2` to increase accuracy and performance in detecting speech boundaries within the audio stream. See
+  * [Using speech recognition
   * parameters](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-service-features#features-parameters).
   * @param {string} [params.languageCustomizationId] - The customization ID (GUID) of a custom language model that is
   * to be used with the recognition request. The base model of the specified custom language model must match the model
@@ -508,6 +509,13 @@ class SpeechToTextV1 extends BaseService {
   * sensitivity](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-sensitivity)
   * and [Language model
   * support](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-support).
+  * @param {number} [params.sadModule] - Detects speech boundaries within the audio stream with better performance,
+  * improved noise suppression, faster responsiveness, and increased accuracy.
+  *
+  * Specify `sad_module: 2`
+  *
+  * See [Speech Activity Detection
+  * (SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad).
   * @param {number} [params.backgroundAudioSuppression] - The level to which the service is to suppress background
   * audio based on its volume to prevent it from being transcribed as speech. Use the parameter to suppress side
   * conversations or background noise.
@@ -561,7 +569,7 @@ class SpeechToTextV1 extends BaseService {
  ): Promise<SpeechToTextV1.Response<SpeechToTextV1.SpeechRecognitionResults>> {
    const _params = { ...params };
    const _requiredParams = ['audio'];
-   const _validParams = ['audio', 'contentType', 'model', 'speechBeginEvent', 'languageCustomizationId', 'acousticCustomizationId', 'baseModelVersion', 'customizationWeight', 'inactivityTimeout', 'keywords', 'keywordsThreshold', 'maxAlternatives', 'wordAlternativesThreshold', 'wordConfidence', 'timestamps', 'profanityFilter', 'smartFormatting', 'smartFormattingVersion', 'speakerLabels', 'grammarName', 'redaction', 'audioMetrics', 'endOfPhraseSilenceTime', 'splitTranscriptAtPhraseEnd', 'speechDetectorSensitivity', 'backgroundAudioSuppression', 'lowLatency', 'characterInsertionBias', 'signal', 'headers'];
+   const _validParams = ['audio', 'contentType', 'model', 'speechBeginEvent', 'languageCustomizationId', 'acousticCustomizationId', 'baseModelVersion', 'customizationWeight', 'inactivityTimeout', 'keywords', 'keywordsThreshold', 'maxAlternatives', 'wordAlternativesThreshold', 'wordConfidence', 'timestamps', 'profanityFilter', 'smartFormatting', 'smartFormattingVersion', 'speakerLabels', 'grammarName', 'redaction', 'audioMetrics', 'endOfPhraseSilenceTime', 'splitTranscriptAtPhraseEnd', 'speechDetectorSensitivity', 'sadModule', 'backgroundAudioSuppression', 'lowLatency', 'characterInsertionBias', 'signal', 'headers'];
    const _validationErrors = validateParams(_params, _requiredParams, _validParams);
    if (_validationErrors) {
      return Promise.reject(_validationErrors);
@@ -592,6 +600,7 @@ class SpeechToTextV1 extends BaseService {
      'end_of_phrase_silence_time': _params.endOfPhraseSilenceTime,
      'split_transcript_at_phrase_end': _params.splitTranscriptAtPhraseEnd,
      'speech_detector_sensitivity': _params.speechDetectorSensitivity,
+     'sad_module': _params.sadModule,
      'background_audio_suppression': _params.backgroundAudioSuppression,
      'low_latency': _params.lowLatency,
      'character_insertion_bias': _params.characterInsertionBias,
@@ -1116,6 +1125,13 @@ class SpeechToTextV1 extends BaseService {
   * sensitivity](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-sensitivity)
   * and [Language model
   * support](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-support).
+  * @param {number} [params.sadModule] - Detects speech boundaries within the audio stream with better performance,
+  * improved noise suppression, faster responsiveness, and increased accuracy.
+  *
+  * Specify `sad_module: 2`
+  *
+  * See [Speech Activity Detection
+  * (SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad).
   * @param {number} [params.backgroundAudioSuppression] - The level to which the service is to suppress background
   * audio based on its volume to prevent it from being transcribed as speech. Use the parameter to suppress side
   * conversations or background noise.
@@ -1169,7 +1185,7 @@ class SpeechToTextV1 extends BaseService {
  ): Promise<SpeechToTextV1.Response<SpeechToTextV1.RecognitionJob>> {
    const _params = { ...params };
    const _requiredParams = ['audio'];
-   const _validParams = ['audio', 'contentType', 'model', 'callbackUrl', 'events', 'userToken', 'resultsTtl', 'languageCustomizationId', 'acousticCustomizationId', 'baseModelVersion', 'customizationWeight', 'inactivityTimeout', 'keywords', 'keywordsThreshold', 'maxAlternatives', 'wordAlternativesThreshold', 'wordConfidence', 'timestamps', 'profanityFilter', 'smartFormatting', 'smartFormattingVersion', 'speakerLabels', 'grammarName', 'redaction', 'processingMetrics', 'processingMetricsInterval', 'audioMetrics', 'endOfPhraseSilenceTime', 'splitTranscriptAtPhraseEnd', 'speechDetectorSensitivity', 'backgroundAudioSuppression', 'lowLatency', 'characterInsertionBias', 'signal', 'headers'];
+   const _validParams = ['audio', 'contentType', 'model', 'callbackUrl', 'events', 'userToken', 'resultsTtl', 'languageCustomizationId', 'acousticCustomizationId', 'baseModelVersion', 'customizationWeight', 'inactivityTimeout', 'keywords', 'keywordsThreshold', 'maxAlternatives', 'wordAlternativesThreshold', 'wordConfidence', 'timestamps', 'profanityFilter', 'smartFormatting', 'smartFormattingVersion', 'speakerLabels', 'grammarName', 'redaction', 'processingMetrics', 'processingMetricsInterval', 'audioMetrics', 'endOfPhraseSilenceTime', 'splitTranscriptAtPhraseEnd', 'speechDetectorSensitivity', 'sadModule', 'backgroundAudioSuppression', 'lowLatency', 'characterInsertionBias', 'signal', 'headers'];
    const _validationErrors = validateParams(_params, _requiredParams, _validParams);
    if (_validationErrors) {
      return Promise.reject(_validationErrors);
@@ -1205,6 +1221,7 @@ class SpeechToTextV1 extends BaseService {
      'end_of_phrase_silence_time': _params.endOfPhraseSilenceTime,
      'split_transcript_at_phrase_end': _params.splitTranscriptAtPhraseEnd,
      'speech_detector_sensitivity': _params.speechDetectorSensitivity,
+     'sad_module': _params.sadModule,
      'background_audio_suppression': _params.backgroundAudioSuppression,
      'low_latency': _params.lowLatency,
      'character_insertion_bias': _params.characterInsertionBias,
@@ -4334,7 +4351,8 @@ namespace SpeechToTextV1 {
    /** If `true`, the service returns a response object `SpeechActivity` which contains the time when a speech
     * activity is detected in the stream. This can be used both in standard and low latency mode. This feature enables
     * client applications to know that some words/speech has been detected and the service is in the process of
-    * decoding. This can be used in lieu of interim results in standard mode. See [Using speech recognition
+    * decoding. This can be used in lieu of interim results in standard mode. Use `sad_module: 2` to increase accuracy
+    * and performance in detecting speech boundaries within the audio stream. See [Using speech recognition
     * parameters](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-service-features#features-parameters).
     */
    speechBeginEvent?: boolean;
@@ -4541,6 +4559,15 @@ namespace SpeechToTextV1 {
     * support](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-support).
     */
    speechDetectorSensitivity?: number;
+   /** Detects speech boundaries within the audio stream with better performance, improved noise suppression,
+    * faster responsiveness, and increased accuracy.
+    *
+    * Specify `sad_module: 2`
+    *
+    * See [Speech Activity Detection
+    * (SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad).
+    */
+   sadModule?: number;
    /** The level to which the service is to suppress background audio based on its volume to prevent it from being
     * transcribed as speech. Use the parameter to suppress side conversations or background noise.
     *
@@ -5009,6 +5036,15 @@ namespace SpeechToTextV1 {
     * support](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-support).
     */
    speechDetectorSensitivity?: number;
+   /** Detects speech boundaries within the audio stream with better performance, improved noise suppression,
+    * faster responsiveness, and increased accuracy.
+    *
+    * Specify `sad_module: 2`
+    *
+    * See [Speech Activity Detection
+    * (SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad).
+    */
+   sadModule?: number;
    /** The level to which the service is to suppress background audio based on its volume to prevent it from being
     * transcribed as speech. Use the parameter to suppress side conversations or background noise.
     *

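A companion sketch (again illustrative, not part of the diff) of the HTTP path: `sadModule` passes the `_validParams` check above and is serialized as the `sad_module` query parameter. The API key, service URL, audio file, and model below are placeholders; `createJob()` accepts the same parameter for asynchronous recognition, as the second `_validParams` change shows.

// Illustrative only: the synchronous HTTP request with the new parameter.
// apikey, service URL, audio file, and model are placeholders.
import * as fs from 'fs';
import SpeechToTextV1 = require('ibm-watson/speech-to-text/v1');
import { IamAuthenticator } from 'ibm-watson/auth';

const speechToText = new SpeechToTextV1({
  authenticator: new IamAuthenticator({ apikey: '<apikey>' }),
  serviceUrl: '<service-url>',
});

speechToText
  .recognize({
    audio: fs.createReadStream('audio.flac'),
    contentType: 'audio/flac',
    model: 'en-US_Multimedia',
    sadModule: 2, // sent as the `sad_module` query parameter
  })
  .then((response) => console.log(JSON.stringify(response.result, null, 2)))
  .catch((err) => console.error(err));
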
speech-to-text/v1.ts

Lines changed: 1 addition & 0 deletions
@@ -286,6 +286,7 @@ namespace SpeechToTextV1 {
    speechDetectorSensitivity?: number;
    backgroundAudioSuppression?: number;
    characterInsertionBias?: number;
+   sadModule?: number;
  }
 }

test/unit/speech-to-text.v1.test.js

Lines changed: 7 additions & 1 deletion
@@ -1,5 +1,5 @@
 /**
- * (C) Copyright IBM Corp. 2018, 2024.
+ * (C) Copyright IBM Corp. 2025.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -295,6 +295,7 @@ describe('SpeechToTextV1', () => {
      const endOfPhraseSilenceTime = 0.8;
      const splitTranscriptAtPhraseEnd = false;
      const speechDetectorSensitivity = 0.5;
+     const sadModule = 1;
      const backgroundAudioSuppression = 0.0;
      const lowLatency = false;
      const characterInsertionBias = 0.0;
@@ -324,6 +325,7 @@ describe('SpeechToTextV1', () => {
        endOfPhraseSilenceTime,
        splitTranscriptAtPhraseEnd,
        speechDetectorSensitivity,
+       sadModule,
        backgroundAudioSuppression,
        lowLatency,
        characterInsertionBias,
@@ -368,6 +370,7 @@ describe('SpeechToTextV1', () => {
      expect(mockRequestOptions.qs.end_of_phrase_silence_time).toEqual(endOfPhraseSilenceTime);
      expect(mockRequestOptions.qs.split_transcript_at_phrase_end).toEqual(splitTranscriptAtPhraseEnd);
      expect(mockRequestOptions.qs.speech_detector_sensitivity).toEqual(speechDetectorSensitivity);
+     expect(mockRequestOptions.qs.sad_module).toEqual(sadModule);
      expect(mockRequestOptions.qs.background_audio_suppression).toEqual(backgroundAudioSuppression);
      expect(mockRequestOptions.qs.low_latency).toEqual(lowLatency);
      expect(mockRequestOptions.qs.character_insertion_bias).toEqual(characterInsertionBias);
@@ -636,6 +639,7 @@ describe('SpeechToTextV1', () => {
      const endOfPhraseSilenceTime = 0.8;
      const splitTranscriptAtPhraseEnd = false;
      const speechDetectorSensitivity = 0.5;
+     const sadModule = 1;
      const backgroundAudioSuppression = 0.0;
      const lowLatency = false;
      const characterInsertionBias = 0.0;
@@ -670,6 +674,7 @@ describe('SpeechToTextV1', () => {
        endOfPhraseSilenceTime,
        splitTranscriptAtPhraseEnd,
        speechDetectorSensitivity,
+       sadModule,
        backgroundAudioSuppression,
        lowLatency,
        characterInsertionBias,
@@ -719,6 +724,7 @@ describe('SpeechToTextV1', () => {
      expect(mockRequestOptions.qs.end_of_phrase_silence_time).toEqual(endOfPhraseSilenceTime);
      expect(mockRequestOptions.qs.split_transcript_at_phrase_end).toEqual(splitTranscriptAtPhraseEnd);
      expect(mockRequestOptions.qs.speech_detector_sensitivity).toEqual(speechDetectorSensitivity);
+     expect(mockRequestOptions.qs.sad_module).toEqual(sadModule);
      expect(mockRequestOptions.qs.background_audio_suppression).toEqual(backgroundAudioSuppression);
      expect(mockRequestOptions.qs.low_latency).toEqual(lowLatency);
      expect(mockRequestOptions.qs.character_insertion_bias).toEqual(characterInsertionBias);
