Merge pull request #343 from GoogleCloudPlatform/speech-streaming

jerjou · jerjou · commit d5cffb7be24d · 2016-05-10T14:00:47.000-07:00
Fix speech streaming sample & test
diff --git a/speech/api/README.md b/speech/api/README.md
@@ -52,6 +52,16 @@ for more information.
     $ pip install -r requirements-speech_streaming.txt
     ```
 
+    The sample uses the [PyAudio][pyaudio] library to stream audio from your
+    computer's microphone.  PyAudio depends on [PortAudio][portaudio], which may
+    need to be compiled when you install PyAudio. If you run into compilation
+    issues that mention PortAudio, you may have to [install some
+    dependencies][pyaudio-install].
+
+[pyaudio]: https://people.csail.mit.edu/hubert/pyaudio/
+[portaudio]: http://www.portaudio.com/
+[pyaudio-install]: https://people.csail.mit.edu/hubert/pyaudio/#downloads
+
 ## Run the example
 
 * To run the `speech_rest.py` sample:
diff --git a/speech/api/speech_streaming.py b/speech/api/speech_streaming.py
@@ -1,6 +1,7 @@
 #!/usr/bin/python
 
 import contextlib
+import re
 import threading
 
 from gcloud.credentials import get_credentials
@@ -70,16 +71,27 @@ def request_stream(stop_audio, channels=CHANNELS, rate=RATE, chunk=CHUNK):
         # The initial request must contain metadata about the stream, so the
         # server knows how to interpret it.
         metadata = InitialRecognizeRequest(
-            encoding='LINEAR16', sample_rate=rate)
-        audio_request = AudioRequest(content=audio_stream.read(chunk))
+            encoding='LINEAR16', sample_rate=rate,
+            # Note that setting interim_results to True means that you'll
+            # likely get multiple results for the same bit of audio, as the
+            # system re-interprets audio in the context of subsequent audio.
+            # However, this will give us quick results without having to tell
+            # the server when to finalize a piece of audio.
+            interim_results=True, continuous=False,
+        )
+        data = audio_stream.read(chunk)
+        audio_request = AudioRequest(content=data)
 
         yield RecognizeRequest(
             initial_request=metadata,
             audio_request=audio_request)
 
         while not stop_audio.is_set():
+            data = audio_stream.read(chunk)
+            if not data:
+                raise StopIteration()
             # Subsequent requests can all just have the content
-            audio_request = AudioRequest(content=audio_stream.read(chunk))
+            audio_request = AudioRequest(content=data)
 
             yield RecognizeRequest(audio_request=audio_request)
 
@@ -95,8 +107,7 @@ def listen_print_loop(recognize_stream):
 
         # Exit recognition if any of the transcribed phrases could be
         # one of our keywords.
-        if any(alt.confidence > .5 and
-               (alt.transcript.strip() in ('exit', 'quit'))
+        if any(re.search(r'\b(exit|quit)\b', alt.transcript)
                for result in resp.results
                for alt in result.alternatives):
             print('Exiting..')
diff --git a/speech/api/speech_streaming_test.py b/speech/api/speech_streaming_test.py
@@ -15,8 +15,8 @@
 import io
 import re
 import sys
+import time
 
-from gcp.testing.flaky import flaky
 import pytest
 
 import speech_streaming
@@ -39,6 +39,9 @@ def __call__(self, *args):
         return self
 
     def read(self, num_frames):
+        # Approximate realtime by sleeping for the appropriate time for the
+        # requested number of frames
+        time.sleep(num_frames / float(speech_streaming.RATE))
         # audio is 16-bit samples, whereas python byte is 8-bit
         num_bytes = 2 * num_frames
         chunk = self.audio_file.read(num_bytes) or self.silence.read(num_bytes)
@@ -54,7 +57,6 @@ def mock_audio_stream(channels, rate, chunk):
     return mock_audio_stream
 
 
-@flaky
 @pytest.mark.skipif(
         sys.version_info >= (3, 0),
         reason=("grpc doesn't yet support python3 "