Skip to content

Commit 34d305e

Browse files
[speech-to-text] WebSockets support for speech to text
1 parent cc5bc8d commit 34d305e

File tree

8 files changed

+526
-19
lines changed

8 files changed

+526
-19
lines changed

pom.xml

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
<?xml version="1.0" encoding="UTF-8"?>
2-
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
2+
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3+
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
34
<modelVersion>4.0.0</modelVersion>
45
<groupId>com.ibm.watson.developer_cloud</groupId>
56
<version>2.5.1-SNAPSHOT</version>
@@ -20,7 +21,7 @@
2021
<dependency>
2122
<groupId>com.squareup.okhttp</groupId>
2223
<artifactId>okhttp</artifactId>
23-
<version>2.7.0</version>
24+
<version>2.7.2</version>
2425
</dependency>
2526
<dependency>
2627
<groupId>com.google.code.gson</groupId>
@@ -35,9 +36,14 @@
3536
<dependency>
3637
<groupId>junit</groupId>
3738
<artifactId>junit</artifactId>
38-
<version>4.11</version>
39+
<version>4.12</version>
3940
<scope>test</scope>
4041
</dependency>
42+
<dependency>
43+
<groupId>com.neovisionaries</groupId>
44+
<artifactId>nv-websocket-client</artifactId>
45+
<version>1.19</version>
46+
</dependency>
4147
<!-- mockserver -->
4248
<dependency>
4349
<groupId>org.mock-server</groupId>

src/main/java/com/ibm/watson/developer_cloud/http/HttpHeaders.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,4 +179,6 @@ public interface HttpHeaders {
179179
*/
180180
public static final String WWW_AUTHENTICATE = "WWW-Authenticate";
181181

182+
/** The Authorization token header. */
183+
public static final String X_WATSON_AUTHORIZATION_TOKEN = "X-Watson-Authorization-Token";
182184
}
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
/**
2+
* Copyright 2015 IBM Corp. All Rights Reserved.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
5+
* in compliance with the License. You may obtain a copy of the License at
6+
*
7+
* http://www.apache.org/licenses/LICENSE-2.0
8+
*
9+
* Unless required by applicable law or agreed to in writing, software distributed under the License
10+
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11+
* or implied. See the License for the specific language governing permissions and limitations under
12+
* the License.
13+
*/
14+
package com.ibm.watson.developer_cloud.speech_to_text.v1;
15+
16+
import com.ibm.watson.developer_cloud.speech_to_text.v1.model.SpeechResults;
17+
18+
19+
20+
/**
21+
* The delegate that receives callbacks from a WebSocket recognize request.
22+
*/
23+
public interface RecognizeDelegate {
24+
25+
/**
26+
* On message.
27+
*
28+
* @param speechResults the speech results
29+
* @param fin if results are final
30+
*/
31+
public void onMessage(SpeechResults speechResults, boolean fin);
32+
33+
/**
34+
* On connected.
35+
*/
36+
public void onConnected();
37+
38+
/**
39+
* On error.
40+
*
41+
* @param e the exception that occurred
42+
*/
43+
public void onError(Exception e);
44+
45+
/**
46+
* On disconnected.
47+
*/
48+
public void onDisconnected();
49+
}

src/main/java/com/ibm/watson/developer_cloud/speech_to_text/v1/RecognizeOptions.java

Lines changed: 143 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@
1313
*/
1414
package com.ibm.watson.developer_cloud.speech_to_text.v1;
1515

16+
import java.util.List;
17+
18+
import com.google.gson.annotations.SerializedName;
1619
import com.ibm.watson.developer_cloud.speech_to_text.v1.model.SpeechSession;
1720

1821

@@ -22,15 +25,27 @@
2225
*/
2326
public class RecognizeOptions {
2427

28+
@SerializedName("content-type")
29+
private String contentType;
2530
private Boolean continuous;
2631
private Integer inactivityTimeout;
32+
33+
@SerializedName("interim_results")
34+
private Boolean interimResults;
35+
private List<String> keywords;
36+
37+
@SerializedName("keywords_threshold")
38+
private Double keywordsThreshold;
2739
private Integer maxAlternatives;
2840
private String model;
2941
private String sessionId;
30-
3142
private Boolean timestamps;
32-
private Boolean wordConfidence;
3343

44+
@SerializedName("word_alternatives_threshold")
45+
private Double wordAlternativesThreshold;
46+
47+
@SerializedName("word_confidence")
48+
private Boolean wordConfidence;
3449

3550
/**
3651
* If true, multiple final results that represent multiple consecutive phrases separated by pauses
@@ -44,6 +59,15 @@ public RecognizeOptions continuous(Boolean continuous) {
4459
return this;
4560
}
4661

62+
/**
63+
* Gets the content type.
64+
*
65+
* @return the contentType
66+
*/
67+
public String getContentType() {
68+
return contentType;
69+
}
70+
4771
/**
4872
* Gets the continuous.
4973
*
@@ -62,6 +86,33 @@ public Integer getInactivityTimeout() {
6286
return inactivityTimeout;
6387
}
6488

89+
/**
90+
* Gets the interim results.
91+
*
92+
* @return the interimResults
93+
*/
94+
public Boolean getInterimResults() {
95+
return interimResults;
96+
}
97+
98+
/**
99+
* Gets the keywords.
100+
*
101+
* @return the keywords
102+
*/
103+
public List<String> getKeywords() {
104+
return keywords;
105+
}
106+
107+
/**
108+
* Gets the keywords threshold.
109+
*
110+
* @return the keywordsThreshold
111+
*/
112+
public Double getKeywordsThreshold() {
113+
return keywordsThreshold;
114+
}
115+
65116
/**
66117
* Gets the max alternatives.
67118
*
@@ -98,7 +149,14 @@ public Boolean getTimestamps() {
98149
return timestamps;
99150
}
100151

101-
152+
/**
153+
* Gets the word alternatives threshold.
154+
*
155+
* @return the wordAlternativesThreshold
156+
*/
157+
public Double getWordAlternativesThreshold() {
158+
return wordAlternativesThreshold;
159+
}
102160

103161
/**
104162
* Gets the word confidence.
@@ -121,7 +179,50 @@ public RecognizeOptions inactivityTimeout(Integer inactivityTimeout) {
121179
}
122180

123181
/**
124-
* Maximum number of alternative transcripts returned
182+
* If true, the service sends interim results for the transcription. Otherwise, the recognition
183+
* ends after the first "end of speech" is detected. The default is false.
184+
*
185+
* @param interimResults the interim results
186+
* @return the recognize options
187+
*/
188+
public RecognizeOptions interimResults(Boolean interimResults) {
189+
this.interimResults = interimResults;
190+
return this;
191+
}
192+
193+
/**
194+
* Specifies an array of keyword strings to be matched in the input audio. By default, the service
195+
* does no keyword spotting.
196+
*
197+
*
198+
* @param keywords the keywords
199+
* @return the recognize options
200+
*/
201+
public RecognizeOptions keywords(List<String> keywords) {
202+
this.keywords = keywords;
203+
return this;
204+
}
205+
206+
207+
208+
/**
209+
* Specifies a minimum level of confidence that the service must have to report a matching keyword
210+
* in the input audio. Specify a probability value between 0 and 1 inclusive. A match must have at
211+
* least the specified confidence to be returned. Omit the parameter or specify a value of null
212+
* (the default) to spot no keywords. If you specify a valid threshold, you must also specify at
213+
* least one keyword.
214+
*
215+
*
216+
* @param keywordsThreshold the keywords threshold
217+
* @return the recognize options
218+
*/
219+
public RecognizeOptions keywordsThreshold(Double keywordsThreshold) {
220+
this.keywordsThreshold = keywordsThreshold;
221+
return this;
222+
}
223+
224+
/**
225+
* Maximum number of alternative transcripts returned.
125226
*
126227
* @param maxAlternatives the max alternatives
127228
* @return the recognize options
@@ -132,7 +233,7 @@ public RecognizeOptions maxAlternatives(Integer maxAlternatives) {
132233
}
133234

134235
/**
135-
* Sets the model name used for the recognition
236+
* Sets the model name used for the recognition.
136237
*
137238
* @param model the model
138239
* @return the recognize options
@@ -142,6 +243,17 @@ public RecognizeOptions model(String model) {
142243
return this;
143244
}
144245

246+
/**
247+
* Sets the session id.
248+
*
249+
* @param session the {@link SpeechSession}
250+
* @return the recognize options
251+
*/
252+
public RecognizeOptions session(SpeechSession session) {
253+
this.sessionId = session.getSessionId();
254+
return this;
255+
}
256+
145257
/**
146258
* Sets session id.
147259
*
@@ -154,29 +266,46 @@ public RecognizeOptions sessionId(String sessionId) {
154266
}
155267

156268
/**
157-
* Sets the session id.
269+
* If true, time alignment for each word is returned.
158270
*
159-
* @param session the {@link SpeechSession}
271+
* @param timestamps the timestamps
160272
* @return the recognize options
161273
*/
162-
public RecognizeOptions session(SpeechSession session) {
163-
this.sessionId = session.getSessionId();
274+
public RecognizeOptions timestamps(Boolean timestamps) {
275+
this.timestamps = timestamps;
164276
return this;
165277
}
166278

167279
/**
168-
* If true, time alignment for each word is returned
280+
* Specifies a minimum level of confidence that the service must have to report a hypothesis for a
281+
* word from the input audio. Specify a probability value between 0 and 1 inclusive. A hypothesis
282+
* must have at least the specified confidence to be returned as a word alternative. Omit the
283+
* parameter or specify a value of null (the default) to return no word alternatives.
169284
*
170-
* @param timestamps the timestamps
285+
*
286+
*
287+
* @param wordAlternativesThreshold the word alternatives threshold
171288
* @return the recognize options
172289
*/
173-
public RecognizeOptions timestamps(Boolean timestamps) {
174-
this.timestamps = timestamps;
290+
public RecognizeOptions wordAlternativesThreshold(Double wordAlternativesThreshold) {
291+
this.wordAlternativesThreshold = wordAlternativesThreshold;
175292
return this;
176293
}
177294

178295
/**
179-
* If true, confidence measure per word is returned if available
296+
* Sets the content type.
297+
*
298+
* @param contentType the content type
299+
* @return the recognize options
300+
*/
301+
public RecognizeOptions contentType(String contentType) {
302+
this.contentType = contentType;
303+
return this;
304+
}
305+
306+
307+
/**
308+
* If true, confidence measure per word is returned if available.
180309
*
181310
* @param wordConfidence the word confidence
182311
* @return the recognize options

src/main/java/com/ibm/watson/developer_cloud/speech_to_text/v1/SpeechToText.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
package com.ibm.watson.developer_cloud.speech_to_text.v1;
1515

1616
import java.io.File;
17+
import java.io.InputStream;
1718
import java.util.List;
1819

1920
import com.google.gson.JsonObject;
@@ -262,4 +263,16 @@ public SpeechResults recognize(File audio, String contentType, RecognizeOptions
262263
requestBuilder.withBody(RequestBody.create(MediaType.parse(contentType), audio));
263264
return executeRequest(requestBuilder.build(), SpeechResults.class);
264265
}
266+
267+
/**
268+
* Recognizes using WebSockets.
269+
*
270+
* @param audio the audio
271+
* @param options the options
272+
* @param delegate the delegate
273+
*/
274+
public void recognizeWS(InputStream audio, RecognizeOptions options, RecognizeDelegate delegate) {
275+
WebSocketClient webSocket = new WebSocketClient();
276+
webSocket.recognize(audio, options, delegate);
277+
}
265278
}

0 commit comments

Comments
 (0)