23
23
import com .google .cloud .speech .v1beta1 .RecognitionConfig .AudioEncoding ;
24
24
import com .google .cloud .speech .v1beta1 .SpeechGrpc ;
25
25
import com .google .cloud .speech .v1beta1 .StreamingRecognitionConfig ;
26
+ import com .google .cloud .speech .v1beta1 .StreamingRecognitionResult ;
26
27
import com .google .cloud .speech .v1beta1 .StreamingRecognizeRequest ;
27
28
import com .google .cloud .speech .v1beta1 .StreamingRecognizeResponse ;
28
29
import com .google .protobuf .ByteString ;
29
- import com .google .protobuf .TextFormat ;
30
30
31
31
import io .grpc .ManagedChannel ;
32
32
import io .grpc .ManagedChannelBuilder ;
44
44
import org .apache .log4j .Logger ;
45
45
import org .apache .log4j .SimpleLayout ;
46
46
47
- import java .io .File ;
48
- import java .io .FileInputStream ;
49
47
import java .io .IOException ;
50
48
import java .util .Arrays ;
51
49
import java .util .List ;
52
50
import java .util .concurrent .CountDownLatch ;
53
51
import java .util .concurrent .Executors ;
54
52
import java .util .concurrent .TimeUnit ;
53
+ import javax .sound .sampled .AudioFormat ;
54
+ import javax .sound .sampled .AudioSystem ;
55
+ import javax .sound .sampled .DataLine ;
56
+ import javax .sound .sampled .LineUnavailableException ;
57
+ import javax .sound .sampled .TargetDataLine ;
55
58
56
59
57
60
/**
 * Client that sends streaming audio to Speech.Recognize and returns streaming transcript.
 */
public class StreamingRecognizeClient {

  private static final Logger logger = Logger.getLogger(StreamingRecognizeClient.class.getName());

  // gRPC channel to the Speech service, and the async stub built on top of it.
  private final ManagedChannel channel;
  private final SpeechGrpc.SpeechStub speechClient;

  // OAuth2 scope required to call the Cloud Speech API.
  private static final List<String> OAUTH2_SCOPES =
      Arrays.asList("https://www.googleapis.com/auth/cloud-platform");

  static final int BYTES_PER_SAMPLE = 2; // bytes per sample for LINEAR16

  // Audio sample rate in Hz, supplied by the caller.
  private final int samplingRate;
  final int bytesPerBuffer; // buffer size in bytes

  // Used for testing
  protected TargetDataLine mockDataLine = null;
77
80
/**
78
81
* Construct client connecting to Cloud Speech server at {@code host:port}.
79
82
*/
80
- public StreamingRecognizeClient (ManagedChannel channel , String file , int samplingRate )
83
+ public StreamingRecognizeClient (ManagedChannel channel , int samplingRate )
81
84
throws IOException {
82
- this .file = file ;
83
85
this .samplingRate = samplingRate ;
84
86
this .channel = channel ;
87
+ this .bytesPerBuffer = samplingRate * BYTES_PER_SAMPLE / 10 ; // 100 ms
85
88
86
89
speechClient = SpeechGrpc .newStub (channel );
87
90
88
91
// Send log4j logs to Console
89
92
// If you are going to run this on GCE, you might wish to integrate with
90
- // google-cloud-java logging. See:
93
+ // google-cloud-java logging. See:
91
94
// https://github.com/GoogleCloudPlatform/google-cloud-java/blob/master/README.md#stackdriver-logging-alpha
92
-
93
95
ConsoleAppender appender = new ConsoleAppender (new SimpleLayout (), SYSTEM_OUT );
94
96
logger .addAppender (appender );
95
97
}
@@ -109,19 +111,73 @@ static ManagedChannel createChannel(String host, int port) throws IOException {
109
111
return channel ;
110
112
}
111
113
114
+ /**
115
+ * Return a Line to the audio input device.
116
+ */
117
+ private TargetDataLine getAudioInputLine () {
118
+ // For testing
119
+ if (null != mockDataLine ) {
120
+ return mockDataLine ;
121
+ }
122
+
123
+ AudioFormat format = new AudioFormat (samplingRate , BYTES_PER_SAMPLE * 8 , 1 , true , false );
124
+ DataLine .Info info = new DataLine .Info (TargetDataLine .class , format );
125
+ if (!AudioSystem .isLineSupported (info )) {
126
+ throw new RuntimeException (String .format (
127
+ "Device doesn't support LINEAR16 mono raw audio format at {}Hz" , samplingRate ));
128
+ }
129
+ try {
130
+ TargetDataLine line = (TargetDataLine ) AudioSystem .getLine (info );
131
+ // Make sure the line buffer doesn't overflow while we're filling this thread's buffer.
132
+ line .open (format , bytesPerBuffer * 5 );
133
+ return line ;
134
+ } catch (LineUnavailableException e ) {
135
+ throw new RuntimeException (e );
136
+ }
137
+ }
138
+
112
139
/** Send streaming recognize requests to server. */
113
140
public void recognize () throws InterruptedException , IOException {
114
141
final CountDownLatch finishLatch = new CountDownLatch (1 );
115
142
StreamObserver <StreamingRecognizeResponse > responseObserver =
116
143
new StreamObserver <StreamingRecognizeResponse >() {
144
+ private int sentenceLength = 1 ;
145
+ /**
146
+ * Prints the transcription results. Interim results are overwritten by subsequent
147
+ * results, until a final one is returned, at which point we start a new line.
148
+ *
149
+ * Flags the program to exit when it hears "exit".
150
+ */
117
151
@ Override
118
152
public void onNext (StreamingRecognizeResponse response ) {
119
- logger .info ("Received response: " + TextFormat .printToString (response ));
153
+ List <StreamingRecognitionResult > results = response .getResultsList ();
154
+ if (results .size () < 1 ) {
155
+ return ;
156
+ }
157
+
158
+ StreamingRecognitionResult result = results .get (0 );
159
+ String transcript = result .getAlternatives (0 ).getTranscript ();
160
+
161
+ // Print interim results with a line feed, so subsequent transcriptions will overwrite
162
+ // it. Final result will print a newline.
163
+ String format = "%-" + this .sentenceLength + 's' ;
164
+ if (result .getIsFinal ()) {
165
+ format += '\n' ;
166
+ this .sentenceLength = 1 ;
167
+
168
+ if (transcript .toLowerCase ().indexOf ("exit" ) >= 0 ) {
169
+ finishLatch .countDown ();
170
+ }
171
+ } else {
172
+ format += '\r' ;
173
+ this .sentenceLength = transcript .length ();
174
+ }
175
+ System .out .print (String .format (format , transcript ));
120
176
}
121
177
122
178
@ Override
123
179
public void onError (Throwable error ) {
124
- logger .log (Level .WARN , "recognize failed: {0}" , error );
180
+ logger .log (Level .ERROR , "recognize failed: {0}" , error );
125
181
finishLatch .countDown ();
126
182
}
127
183
@@ -146,33 +202,28 @@ public void onCompleted() {
146
202
StreamingRecognitionConfig .newBuilder ()
147
203
.setConfig (config )
148
204
.setInterimResults (true )
149
- .setSingleUtterance (true )
205
+ .setSingleUtterance (false )
150
206
.build ();
151
207
152
208
StreamingRecognizeRequest initial =
153
209
StreamingRecognizeRequest .newBuilder ().setStreamingConfig (streamingConfig ).build ();
154
210
requestObserver .onNext (initial );
155
211
156
- // Open audio file. Read and send sequential buffers of audio as additional RecognizeRequests.
157
- FileInputStream in = new FileInputStream (new File (file ));
158
- // For LINEAR16 at 16000 Hz sample rate, 3200 bytes corresponds to 100 milliseconds of audio.
159
- byte [] buffer = new byte [BYTES_PER_BUFFER ];
212
+ // Get a Line to the audio input device.
213
+ TargetDataLine in = getAudioInputLine ();
214
+ byte [] buffer = new byte [bytesPerBuffer ];
160
215
int bytesRead ;
161
- int totalBytes = 0 ;
162
- int samplesPerBuffer = BYTES_PER_BUFFER / BYTES_PER_SAMPLE ;
163
- int samplesPerMillis = samplingRate / 1000 ;
164
216
165
- while ((bytesRead = in .read (buffer )) != -1 ) {
166
- totalBytes += bytesRead ;
217
+ in .start ();
218
+ // Read and send sequential buffers of audio as additional RecognizeRequests.
219
+ while (finishLatch .getCount () > 0
220
+ && (bytesRead = in .read (buffer , 0 , buffer .length )) != -1 ) {
167
221
StreamingRecognizeRequest request =
168
222
StreamingRecognizeRequest .newBuilder ()
169
223
.setAudioContent (ByteString .copyFrom (buffer , 0 , bytesRead ))
170
224
.build ();
171
225
requestObserver .onNext (request );
172
- // To simulate real-time audio, sleep after sending each audio buffer.
173
- Thread .sleep (samplesPerBuffer / samplesPerMillis );
174
226
}
175
- logger .info ("Sent " + totalBytes + " bytes from audio file: " + file );
176
227
} catch (RuntimeException e ) {
177
228
// Cancel RPC.
178
229
requestObserver .onError (e );
@@ -187,21 +238,13 @@ public void onCompleted() {
187
238
188
239
public static void main (String [] args ) throws Exception {
189
240
190
- String audioFile = "" ;
191
- String host = "speech.googleapis.com" ;
192
- Integer port = 443 ;
193
- Integer sampling = 16000 ;
241
+ String host = null ;
242
+ Integer port = null ;
243
+ Integer sampling = null ;
194
244
195
245
CommandLineParser parser = new DefaultParser ();
196
246
197
247
Options options = new Options ();
198
- options .addOption (
199
- Option .builder ()
200
- .longOpt ("file" )
201
- .desc ("path to audio file" )
202
- .hasArg ()
203
- .argName ("FILE_PATH" )
204
- .build ());
205
248
options .addOption (
206
249
Option .builder ()
207
250
.longOpt ("host" )
@@ -226,31 +269,14 @@ public static void main(String[] args) throws Exception {
226
269
227
270
try {
228
271
CommandLine line = parser .parse (options , args );
229
- if (line .hasOption ("file" )) {
230
- audioFile = line .getOptionValue ("file" );
231
- } else {
232
- System .err .println ("An Audio file must be specified (e.g. /foo/baz.raw)." );
233
- System .exit (1 );
234
- }
235
-
236
- if (line .hasOption ("host" )) {
237
- host = line .getOptionValue ("host" );
238
- } else {
239
- System .err .println ("An API enpoint must be specified (typically speech.googleapis.com)." );
240
- System .exit (1 );
241
- }
242
272
243
- if (line .hasOption ("port" )) {
244
- port = Integer .parseInt (line .getOptionValue ("port" ));
245
- } else {
246
- System .err .println ("An SSL port must be specified (typically 443)." );
247
- System .exit (1 );
248
- }
273
+ host = line .getOptionValue ("host" , "speech.googleapis.com" );
274
+ port = Integer .parseInt (line .getOptionValue ("port" , "443" ));
249
275
250
276
if (line .hasOption ("sampling" )) {
251
277
sampling = Integer .parseInt (line .getOptionValue ("sampling" ));
252
278
} else {
253
- System .err .println ("An Audio sampling rate must be specified." );
279
+ System .err .println ("An Audio sampling rate (--sampling) must be specified. (e.g. 16000) " );
254
280
System .exit (1 );
255
281
}
256
282
} catch (ParseException exp ) {
@@ -259,7 +285,7 @@ public static void main(String[] args) throws Exception {
259
285
}
260
286
261
287
ManagedChannel channel = createChannel (host , port );
262
- StreamingRecognizeClient client = new StreamingRecognizeClient (channel , audioFile , sampling );
288
+ StreamingRecognizeClient client = new StreamingRecognizeClient (channel , sampling );
263
289
try {
264
290
client .recognize ();
265
291
} finally {
0 commit comments