Skip to content

Commit c129c99

Browse files
author
Jerjou Cheng
committed
Working draft of continuous listening.
1 parent b7d2b50 commit c129c99

File tree

1 file changed

+263
-0
lines changed

1 file changed

+263
-0
lines changed
Lines changed: 263 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,263 @@
1+
#!/usr/bin/python
2+
# Copyright (C) 2016 Google Inc.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
"""Sample that streams audio to the Google Cloud Speech API via GRPC."""
16+
17+
from __future__ import division
18+
19+
import contextlib
20+
import functools
21+
import re
22+
import signal
23+
import sys
24+
import time
25+
26+
import google.auth
27+
import google.auth.transport.grpc
28+
import google.auth.transport.requests
29+
import grpc
30+
from google.cloud.grpc.speech.v1beta1 import cloud_speech_pb2
31+
from google.rpc import code_pb2
32+
from grpc.framework.interfaces.face import face
33+
import pyaudio
34+
from six.moves import queue
35+
36+
# Seconds to allow you to shut up
37+
WRAP_IT_UP_SECS = 55
38+
39+
# Audio recording parameters
40+
RATE = 16000
41+
CHUNK = int(RATE / 10) # 100ms
42+
43+
# The Speech API has a streaming limit of 60 seconds of audio*, so keep the
44+
# connection alive for that long, plus some more to give the API time to figure
45+
# out the transcription.
46+
# * https://g.co/cloud/speech/limits#content
47+
DEADLINE_SECS = 60 * 3 + 5
48+
SPEECH_SCOPE = 'https://www.googleapis.com/auth/cloud-platform'
49+
50+
51+
def make_channel(host, port):
    """Builds a secure gRPC channel authorized with credentials from the
    environment (Application Default Credentials).

    Args:
        host: Hostname of the gRPC service.
        port: TCP port to connect to.

    Returns:
        A secure, authorized grpc channel to `host`:`port`.
    """
    # Application Default Credentials resolve from the environment
    # (GOOGLE_APPLICATION_CREDENTIALS, gcloud config, GCE metadata, ...).
    credentials, _ = google.auth.default(scopes=[SPEECH_SCOPE])

    # The credentials object may need to refresh its access token over HTTP.
    request = google.auth.transport.requests.Request()

    return google.auth.transport.grpc.secure_authorized_channel(
        credentials, request, '{}:{}'.format(host, port))
62+
63+
64+
def _audio_data_generator(buff):
65+
"""A generator that yields all available data in the given buffer.
66+
67+
Args:
68+
buff - a Queue object, where each element is a chunk of data.
69+
Yields:
70+
A chunk of data that is the aggregate of all chunks of data in `buff`.
71+
The function will block until at least one data chunk is available.
72+
"""
73+
stop = False
74+
while not stop:
75+
# Use a blocking get() to ensure there's at least one chunk of data.
76+
data = [buff.get()]
77+
78+
# Now consume whatever other data's still buffered.
79+
while True:
80+
try:
81+
data.append(buff.get(block=False))
82+
except queue.Empty:
83+
break
84+
85+
# `None` in the buffer signals that the audio stream is closed. Yield
86+
# the final bit of the buffer and exit the loop.
87+
if None in data:
88+
stop = True
89+
data.remove(None)
90+
91+
yield b''.join(data)
92+
93+
94+
def _fill_buffer(buff, in_data, frame_count, time_info, status_flags):
    """PyAudio stream callback: queue each incoming audio chunk into `buff`.

    The first element of the returned tuple (output audio) is None because
    this is an input-only stream; paContinue tells PyAudio to keep going.
    """
    buff.put(in_data)
    return (None, pyaudio.paContinue)
98+
99+
100+
# [START audio_stream]
101+
@contextlib.contextmanager
def record_audio(rate, chunk):
    """Context manager yielding a queue that fills with microphone audio.

    Opens a mono, 16-bit PyAudio input stream sampled at `rate` Hz that
    reads `chunk` frames at a time. Audio is delivered on a background
    callback thread, so the caller may block (e.g. on network requests)
    without overflowing the input device's buffer. On exit, the stream is
    closed and a `None` sentinel is enqueued so that
    `_audio_data_generator` terminates.
    """
    # Thread-safe handoff between the PyAudio callback and the consumer.
    buff = queue.Queue()

    pa = pyaudio.PyAudio()
    stream = pa.open(
        format=pyaudio.paInt16,
        # The API currently only supports 1-channel (mono) audio
        # https://goo.gl/z757pE
        channels=1,
        rate=rate,
        input=True,
        frames_per_buffer=chunk,
        # Run the audio stream asynchronously to fill the buffer object.
        # This is necessary so that the input device's buffer doesn't
        # overflow while the calling thread makes network requests, etc.
        stream_callback=functools.partial(_fill_buffer, buff),
    )

    yield buff

    stream.stop_stream()
    stream.close()
    # Signal the _audio_data_generator to finish
    buff.put(None)
    pa.terminate()
127+
# [END audio_stream]
128+
129+
130+
def request_stream(data_stream, rate, interim_results=True):
    """Generate `StreamingRecognizeRequest`s for a streaming recognition.

    The first request carries only the stream configuration; every
    subsequent request carries one chunk of raw audio from `data_stream`.

    Args:
        data_stream: A generator that yields raw audio data to send.
        rate: The sampling rate in hertz.
        interim_results: Whether to return intermediate results, before the
            transcription is finalized.
    """
    # The server must receive the stream's metadata before any audio, so
    # the configuration request always goes out first.
    streaming_config = cloud_speech_pb2.StreamingRecognitionConfig(
        interim_results=interim_results,
        config=cloud_speech_pb2.RecognitionConfig(
            # There are a bunch of config options you can specify. See
            # https://goo.gl/KPZn97 for the full list.
            encoding='LINEAR16',   # raw 16-bit signed LE samples
            sample_rate=rate,      # the rate in hertz
            # See http://g.co/cloud/speech/docs/languages
            # for a list of supported languages.
            language_code='en-US',  # a BCP-47 language tag
        ),
    )
    yield cloud_speech_pb2.StreamingRecognizeRequest(
        streaming_config=streaming_config)

    # Subsequent requests can all just have the content
    for chunk in data_stream:
        yield cloud_speech_pb2.StreamingRecognizeRequest(audio_content=chunk)
162+
163+
164+
def listen_print_loop(recognize_stream, wrap_it_up_secs, max_recog_secs=60):
    """Iterates through server responses and prints them.

    The recognize_stream passed is a generator that will block until a
    response is provided by the server. When the transcription response
    comes, print it.

    In this case, responses are provided for interim results as well. If the
    response is an interim one, print a line feed at the end of it, to allow
    the next result to overwrite it, until the response is a final one. For
    the final one, print a newline to preserve the finalized transcription.

    Args:
        recognize_stream: Iterator of StreamingRecognizeResponse messages.
        wrap_it_up_secs: Seconds before the per-stream recognition limit at
            which to start looking for a pause in speech, so the stream can
            be ended cleanly and a fresh one started.
        max_recog_secs: The API's per-stream recognition limit, in seconds.

    Returns:
        True if the stream approached its time budget and the caller should
        open a fresh one; False if the user said an exit keyword; None if
        the response stream ended on its own.

    Raises:
        RuntimeError: If the server returns an error response.
    """
    # Past this moment, end the stream at the next pause in speech so a new
    # stream can be started before the API's hard limit is hit.
    time_to_switch = time.time() + max_recog_secs - wrap_it_up_secs
    wrap_it_up = False
    num_chars_printed = 0
    for resp in recognize_stream:
        if resp.error.code != code_pb2.OK:
            raise RuntimeError('Server error: ' + resp.error.message)

        if not resp.results:
            # BUGFIX: compare enum values with ==, not `is`. Protobuf enum
            # values are plain ints, and identity comparison only happened
            # to work via CPython's small-int interning.
            if resp.endpointer_type == resp.END_OF_SPEECH and (
                    time.time() > time_to_switch):
                wrap_it_up = True
                resp = next(recognize_stream)
                if not resp.results:
                    return True
            else:
                continue

        # Display the top transcription
        result = resp.results[0]
        transcript = result.alternatives[0].transcript

        # If the previous result was longer than this one, we need to print
        # some extra spaces to overwrite the previous result
        overwrite_chars = ' ' * max(0, num_chars_printed - len(transcript))

        # Display interim results, but with a carriage return at the end of
        # the line, so subsequent lines will overwrite them.
        if not result.is_final:
            sys.stdout.write(transcript + overwrite_chars + '\r')
            sys.stdout.flush()

            num_chars_printed = len(transcript)

        else:
            print(transcript + overwrite_chars)

            # Exit recognition if any of the transcribed phrases could be
            # one of our keywords.
            if re.search(r'\b(exit|quit)\b', transcript, re.I):
                print('Exiting..')
                return False

            num_chars_printed = 0

        if wrap_it_up:
            return True
222+
223+
224+
def main():
    """Streams microphone audio to the Speech API until the user quits.

    Each recognition stream is limited to roughly 60 seconds of audio, so a
    new stream is opened whenever listen_print_loop reports that the current
    one is approaching its deadline.
    """
    service = cloud_speech_pb2.SpeechStub(
        make_channel('speech.googleapis.com', 443))

    # BUGFIX: a mutable one-element list, shared with the signal handler.
    # With a plain boolean, the assignment inside handle_interrupt would
    # only create a new local variable and never update main()'s flag.
    keep_going = [True]

    # For streaming audio from the microphone, there are three threads.
    # First, a thread that collects audio data as it comes in
    with record_audio(RATE, CHUNK) as buff:
        # Second, a thread that sends requests with that data
        requests = request_stream(_audio_data_generator(buff), RATE)
        # Third, a thread that listens for transcription responses
        recognize_stream = service.StreamingRecognize(
            requests, DEADLINE_SECS)

        # Exit things cleanly on interrupt
        def handle_interrupt(*_):
            keep_going[0] = False
            recognize_stream.cancel()
        signal.signal(signal.SIGINT, handle_interrupt)

        # Now, put the transcription responses to use.
        while keep_going[0]:
            print('==== Continuing... ====')
            keep_going[0] = False
            try:
                keep_going[0] = listen_print_loop(
                    recognize_stream, WRAP_IT_UP_SECS)

                recognize_stream.cancel()
                next(recognize_stream)
            # BUGFIX: `except grpc.RpcError, e:` is Python-2-only syntax
            # (a SyntaxError under Python 3) and `e` was never used.
            # Cancelling the call surfaces here as an RpcError.
            except grpc.RpcError:
                if keep_going[0]:
                    requests = request_stream(
                        _audio_data_generator(buff), RATE)
                    recognize_stream = service.StreamingRecognize(
                        requests, DEADLINE_SECS)
260+
261+
262+
# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)