Skip to content

Commit c129c99

Browse files
author
Jerjou Cheng
committed
Working draft of continuous listening.
1 parent b7d2b50 commit c129c99

File tree

1 file changed

+263
-0
lines changed

1 file changed

+263
-0
lines changed
Lines changed: 263 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,263 @@
1+
#!/usr/bin/python
2+
# Copyright (C) 2016 Google Inc.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
"""Sample that streams audio to the Google Cloud Speech API via GRPC."""
16+
17+
from __future__ import division
18+
19+
import contextlib
20+
import functools
21+
import re
22+
import signal
23+
import sys
24+
import time
25+
26+
import google.auth
27+
import google.auth.transport.grpc
28+
import google.auth.transport.requests
29+
import grpc
30+
from google.cloud.grpc.speech.v1beta1 import cloud_speech_pb2
31+
from google.rpc import code_pb2
32+
from grpc.framework.interfaces.face import face
33+
import pyaudio
34+
from six.moves import queue
35+
36+
# Seconds to allow you to shut up
37+
WRAP_IT_UP_SECS = 55
38+
39+
# Audio recording parameters
40+
RATE = 16000
41+
CHUNK = int(RATE / 10) # 100ms
42+
43+
# The Speech API has a streaming limit of 60 seconds of audio*, so keep the
44+
# connection alive for that long, plus some more to give the API time to figure
45+
# out the transcription.
46+
# * https://g.co/cloud/speech/limits#content
47+
DEADLINE_SECS = 60 * 3 + 5
48+
SPEECH_SCOPE = 'https://www.googleapis.com/auth/cloud-platform'
49+
50+
51+
def make_channel(host, port):
    """Builds a secure gRPC channel authorized with credentials from the
    environment (Application Default Credentials).

    Args:
        host: Hostname of the gRPC service.
        port: TCP port to connect to.

    Returns:
        A secure, authorized grpc channel to `host`:`port`.
    """
    # Application Default Credentials resolve from the environment
    # (GOOGLE_APPLICATION_CREDENTIALS, gcloud config, GCE metadata, ...).
    credentials, _ = google.auth.default(scopes=[SPEECH_SCOPE])

    # The credentials object may need to refresh its access token over HTTP.
    request = google.auth.transport.requests.Request()

    return google.auth.transport.grpc.secure_authorized_channel(
        credentials, request, '{}:{}'.format(host, port))
62+
63+
64+
def _audio_data_generator(buff):
65+
"""A generator that yields all available data in the given buffer.
66+
67+
Args:
68+
buff - a Queue object, where each element is a chunk of data.
69+
Yields:
70+
A chunk of data that is the aggregate of all chunks of data in `buff`.
71+
The function will block until at least one data chunk is available.
72+
"""
73+
stop = False
74+
while not stop:
75+
# Use a blocking get() to ensure there's at least one chunk of data.
76+
data = [buff.get()]
77+
78+
# Now consume whatever other data's still buffered.
79+
while True:
80+
try:
81+
data.append(buff.get(block=False))
82+
except queue.Empty:
83+
break
84+
85+
# `None` in the buffer signals that the audio stream is closed. Yield
86+
# the final bit of the buffer and exit the loop.
87+
if None in data:
88+
stop = True
89+
data.remove(None)
90+
91+
yield b''.join(data)
92+
93+
94+
def _fill_buffer(buff, in_data, frame_count, time_info, status_flags):
    """PyAudio stream callback: queue each incoming audio chunk into `buff`.

    The first element of the returned tuple (output audio) is None because
    this is an input-only stream; paContinue tells PyAudio to keep going.
    """
    buff.put(in_data)
    return (None, pyaudio.paContinue)
98+
99+
100+
# [START audio_stream]
101+
@contextlib.contextmanager
def record_audio(rate, chunk):
    """Context manager yielding a queue that fills with microphone audio.

    Opens a mono, 16-bit PyAudio input stream sampled at `rate` Hz that
    reads `chunk` frames at a time. Audio is delivered on a background
    callback thread, so the caller may block (e.g. on network requests)
    without overflowing the input device's buffer. On exit, the stream is
    closed and a `None` sentinel is enqueued so that
    `_audio_data_generator` terminates.
    """
    # Thread-safe handoff between the PyAudio callback and the consumer.
    buff = queue.Queue()

    pa = pyaudio.PyAudio()
    stream = pa.open(
        format=pyaudio.paInt16,
        # The API currently only supports 1-channel (mono) audio
        # https://goo.gl/z757pE
        channels=1,
        rate=rate,
        input=True,
        frames_per_buffer=chunk,
        # Run the audio stream asynchronously to fill the buffer object.
        # This is necessary so that the input device's buffer doesn't
        # overflow while the calling thread makes network requests, etc.
        stream_callback=functools.partial(_fill_buffer, buff),
    )

    yield buff

    stream.stop_stream()
    stream.close()
    # Signal the _audio_data_generator to finish
    buff.put(None)
    pa.terminate()
127+
# [END audio_stream]
128+
129+
130+
def request_stream(data_stream, rate, interim_results=True):
    """Generate `StreamingRecognizeRequest`s for a streaming recognition.

    The first request carries only the stream configuration; every
    subsequent request carries one chunk of raw audio from `data_stream`.

    Args:
        data_stream: A generator that yields raw audio data to send.
        rate: The sampling rate in hertz.
        interim_results: Whether to return intermediate results, before the
            transcription is finalized.
    """
    # The server must receive the stream's metadata before any audio, so
    # the configuration request always goes out first.
    streaming_config = cloud_speech_pb2.StreamingRecognitionConfig(
        interim_results=interim_results,
        config=cloud_speech_pb2.RecognitionConfig(
            # There are a bunch of config options you can specify. See
            # https://goo.gl/KPZn97 for the full list.
            encoding='LINEAR16',   # raw 16-bit signed LE samples
            sample_rate=rate,      # the rate in hertz
            # See http://g.co/cloud/speech/docs/languages
            # for a list of supported languages.
            language_code='en-US',  # a BCP-47 language tag
        ),
    )
    yield cloud_speech_pb2.StreamingRecognizeRequest(
        streaming_config=streaming_config)

    # Subsequent requests can all just have the content
    for chunk in data_stream:
        yield cloud_speech_pb2.StreamingRecognizeRequest(audio_content=chunk)
162+
163+
164+
def listen_print_loop(recognize_stream, wrap_it_up_secs, max_recog_secs=60):
    """Iterates through server responses and prints them.

    The recognize_stream passed is a generator that will block until a
    response is provided by the server. When the transcription response
    comes, print it.

    In this case, responses are provided for interim results as well. If the
    response is an interim one, print a line feed at the end of it, to allow
    the next result to overwrite it, until the response is a final one. For
    the final one, print a newline to preserve the finalized transcription.

    Args:
        recognize_stream: Iterator of StreamingRecognizeResponse messages.
        wrap_it_up_secs: Seconds before the per-stream recognition limit at
            which to start looking for a pause in speech, so the stream can
            be ended cleanly and a fresh one started.
        max_recog_secs: The API's per-stream recognition limit, in seconds.

    Returns:
        True if the stream approached its time budget and the caller should
        open a fresh one; False if the user said an exit keyword; None if
        the response stream ended on its own.

    Raises:
        RuntimeError: If the server returns an error response.
    """
    # Past this moment, end the stream at the next pause in speech so a new
    # stream can be started before the API's hard limit is hit.
    time_to_switch = time.time() + max_recog_secs - wrap_it_up_secs
    wrap_it_up = False
    num_chars_printed = 0
    for resp in recognize_stream:
        if resp.error.code != code_pb2.OK:
            raise RuntimeError('Server error: ' + resp.error.message)

        if not resp.results:
            # BUGFIX: compare enum values with ==, not `is`. Protobuf enum
            # values are plain ints, and identity comparison only happened
            # to work via CPython's small-int interning.
            if resp.endpointer_type == resp.END_OF_SPEECH and (
                    time.time() > time_to_switch):
                wrap_it_up = True
                resp = next(recognize_stream)
                if not resp.results:
                    return True
            else:
                continue

        # Display the top transcription
        result = resp.results[0]
        transcript = result.alternatives[0].transcript

        # If the previous result was longer than this one, we need to print
        # some extra spaces to overwrite the previous result
        overwrite_chars = ' ' * max(0, num_chars_printed - len(transcript))

        # Display interim results, but with a carriage return at the end of
        # the line, so subsequent lines will overwrite them.
        if not result.is_final:
            sys.stdout.write(transcript + overwrite_chars + '\r')
            sys.stdout.flush()

            num_chars_printed = len(transcript)

        else:
            print(transcript + overwrite_chars)

            # Exit recognition if any of the transcribed phrases could be
            # one of our keywords.
            if re.search(r'\b(exit|quit)\b', transcript, re.I):
                print('Exiting..')
                return False

            num_chars_printed = 0

        if wrap_it_up:
            return True
222+
223+
224+
def main():
    """Streams microphone audio to the Speech API until the user quits.

    Each recognition stream is limited to roughly 60 seconds of audio, so a
    new stream is opened whenever listen_print_loop reports that the current
    one is approaching its deadline.
    """
    service = cloud_speech_pb2.SpeechStub(
        make_channel('speech.googleapis.com', 443))

    # BUGFIX: a mutable one-element list, shared with the signal handler.
    # With a plain boolean, the assignment inside handle_interrupt would
    # only create a new local variable and never update main()'s flag.
    keep_going = [True]

    # For streaming audio from the microphone, there are three threads.
    # First, a thread that collects audio data as it comes in
    with record_audio(RATE, CHUNK) as buff:
        # Second, a thread that sends requests with that data
        requests = request_stream(_audio_data_generator(buff), RATE)
        # Third, a thread that listens for transcription responses
        recognize_stream = service.StreamingRecognize(
            requests, DEADLINE_SECS)

        # Exit things cleanly on interrupt
        def handle_interrupt(*_):
            keep_going[0] = False
            recognize_stream.cancel()
        signal.signal(signal.SIGINT, handle_interrupt)

        # Now, put the transcription responses to use.
        while keep_going[0]:
            print('==== Continuing... ====')
            keep_going[0] = False
            try:
                keep_going[0] = listen_print_loop(
                    recognize_stream, WRAP_IT_UP_SECS)

                recognize_stream.cancel()
                next(recognize_stream)
            # BUGFIX: `except grpc.RpcError, e:` is Python-2-only syntax
            # (a SyntaxError under Python 3) and `e` was never used.
            # Cancelling the call surfaces here as an RpcError.
            except grpc.RpcError:
                if keep_going[0]:
                    requests = request_stream(
                        _audio_data_generator(buff), RATE)
                    recognize_stream = service.StreamingRecognize(
                        requests, DEADLINE_SECS)
260+
261+
262+
# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)