syntax = "proto3";

package google.cloud.speech.v1beta1;

option java_multiple_files = true;
option java_outer_classname = "SpeechProto";
option java_package = "com.google.cloud.speech.v1beta1";

import "google/api/annotations.proto";
import "google/longrunning/operations.proto";
import "google/rpc/status.proto";


// Service that implements Google Cloud Speech API.
service Speech {
  // Perform synchronous speech recognition: receive results after all audio
  // has been sent and processed.
  rpc SyncRecognize(SyncRecognizeRequest) returns (SyncRecognizeResponse) {
    option (google.api.http) =
        { post: "/v1beta1/speech:syncrecognize" body: "*" };
  }

  // Perform asynchronous speech recognition: receive results via the
  // google.longrunning.Operations interface. `Operation.response` returns
  // `AsyncRecognizeResponse`.
  rpc AsyncRecognize(AsyncRecognizeRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) =
        { post: "/v1beta1/speech:asyncrecognize" body: "*" };
  }

  // Perform bidirectional streaming speech recognition: receive results while
  // sending audio. This method is only available via the gRPC API (not REST).
  rpc StreamingRecognize(stream StreamingRecognizeRequest)
      returns (stream StreamingRecognizeResponse);
}
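
// For illustration only, a minimal `SyncRecognize` call over the REST binding
// above might look like the following sketch. The `speech.googleapis.com`
// host and all field values are assumptions, not part of this file; note that
// the proto3 JSON mapping encodes `bytes` fields as base64 and uses
// lowerCamelCase field names.
//
//   POST https://speech.googleapis.com/v1beta1/speech:syncrecognize
//   {
//     "config": {
//       "encoding": "LINEAR16",
//       "sampleRate": 16000,
//       "languageCode": "en-US"
//     },
//     "audio": {
//       "content": "<base64-encoded audio bytes>"
//     }
//   }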

// `SyncRecognizeRequest` is the top-level message sent by the client for
// the `SyncRecognize` method.
message SyncRecognizeRequest {
  // [Required] The `config` message provides information to the recognizer
  // that specifies how to process the request.
  RecognitionConfig config = 1;

  // [Required] The audio data to be recognized.
  RecognitionAudio audio = 2;
}
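
// The same kind of request in proto text format (a sketch; the bucket and
// object names are hypothetical), referencing audio stored in Google Cloud
// Storage instead of sending inline bytes:
//
//   config {
//     encoding: LINEAR16
//     sample_rate: 16000
//   }
//   audio {
//     uri: "gs://my-bucket/my-audio.raw"
//   }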

// `AsyncRecognizeRequest` is the top-level message sent by the client for
// the `AsyncRecognize` method.
message AsyncRecognizeRequest {
  // [Required] The `config` message provides information to the recognizer
  // that specifies how to process the request.
  RecognitionConfig config = 1;

  // [Required] The audio data to be recognized.
  RecognitionAudio audio = 2;
}

// `StreamingRecognizeRequest` is the top-level message sent by the client for
// the `StreamingRecognize` method. Multiple `StreamingRecognizeRequest`
// messages are sent. The first message must contain a `streaming_config`
// message and must not contain `audio` data. All subsequent messages must
// contain `audio` data and must not contain a `streaming_config` message.
message StreamingRecognizeRequest {
  oneof streaming_request {
    // The `streaming_config` message provides information to the recognizer
    // that specifies how to process the request.
    //
    // The first `StreamingRecognizeRequest` message must contain a
    // `streaming_config` message.
    StreamingRecognitionConfig streaming_config = 1;

    // The audio data to be recognized. Sequential chunks of audio data are
    // sent in sequential `StreamingRecognizeRequest` messages. The first
    // `StreamingRecognizeRequest` message must not contain `audio_content`
    // data and all subsequent `StreamingRecognizeRequest` messages must
    // contain `audio_content` data. The audio bytes must be encoded as
    // specified in `RecognitionConfig`. Note: as with all bytes fields,
    // protocol buffers use a pure binary representation (not base64).
    bytes audio_content = 2 [ctype = CORD];
  }
}
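
// To sketch the ordering rule above: a streaming session is one configuration
// message followed by audio-only messages (values are illustrative; chunk
// sizes are up to the client):
//
//   message 1: streaming_config {
//                config { encoding: LINEAR16 sample_rate: 16000 }
//              }
//   message 2: audio_content: <first chunk of raw LINEAR16 bytes>
//   message 3: audio_content: <next chunk>
//   ...        further `audio_content` messages until the audio is exhausted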

// The `StreamingRecognitionConfig` message provides information to the
// recognizer that specifies how to process the request.
message StreamingRecognitionConfig {
  // [Required] The `config` message provides information to the recognizer
  // that specifies how to process the request.
  RecognitionConfig config = 1;

  // [Optional] If `false` or omitted, the recognizer will perform continuous
  // recognition (continuing to process audio even if the user pauses speaking)
  // until the client closes the output stream (gRPC API) or the maximum time
  // limit has been reached. Multiple `StreamingRecognitionResult`s with the
  // `is_final` flag set to `true` may be returned.
  //
  // If `true`, the recognizer will detect a single spoken utterance. When it
  // detects that the user has paused or stopped speaking, it will return an
  // `END_OF_UTTERANCE` event and cease recognition. It will return no more
  // than one `StreamingRecognitionResult` with the `is_final` flag set to
  // `true`.
  bool single_utterance = 2;

  // [Optional] If `true`, interim results (tentative hypotheses) may be
  // returned as they become available (these interim results are indicated
  // with the `is_final=false` flag).
  // If `false` or omitted, only `is_final=true` result(s) are returned.
  bool interim_results = 3;
}
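
// For example, a hypothetical configuration requesting interim hypotheses for
// a single utterance (proto text format, assumed values):
//
//   streaming_config {
//     config { encoding: LINEAR16 sample_rate: 16000 }
//     single_utterance: true
//     interim_results: true
//   }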

// The `RecognitionConfig` message provides information to the recognizer
// that specifies how to process the request.
message RecognitionConfig {
  // Audio encoding of the data sent in the audio message. All encodings
  // support only 1 channel (mono) audio. Only `FLAC` includes a header that
  // describes the bytes of audio that follow the header. The other encodings
  // are raw audio bytes with no header.
  //
  // For best results, the audio source should be captured and transmitted
  // using a lossless encoding (`FLAC` or `LINEAR16`). Recognition accuracy
  // may be reduced if lossy codecs (such as AMR, AMR_WB, and MULAW) are used
  // to capture or transmit the audio, particularly if background noise is
  // present.
  enum AudioEncoding {
    // Not specified. Will return [google.rpc.Code.INVALID_ARGUMENT][].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples.
    // This is the only encoding that may be used by `AsyncRecognize`.
    LINEAR16 = 1;

    // This is the recommended encoding for `SyncRecognize` and
    // `StreamingRecognize` because it uses lossless compression; therefore
    // recognition accuracy is not compromised by a lossy codec.
    //
    // The stream FLAC (Free Lossless Audio Codec) encoding is specified at:
    // http://flac.sourceforge.net/documentation.html.
    // Only 16-bit samples are supported.
    // Not all fields in STREAMINFO are supported.
    FLAC = 2;

    // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
    MULAW = 3;

    // Adaptive Multi-Rate Narrowband codec. `sample_rate` must be 8000 Hz.
    AMR = 4;

    // Adaptive Multi-Rate Wideband codec. `sample_rate` must be 16000 Hz.
    AMR_WB = 5;
  }

  // [Required] Encoding of audio data sent in all `RecognitionAudio` messages.
  AudioEncoding encoding = 1;

  // [Required] Sample rate in Hertz of the audio data sent in all
  // `RecognitionAudio` messages. Valid values are: 8000-48000.
  // 16000 is optimal. For best results, set the sampling rate of the audio
  // source to 16000 Hz. If that's not possible, use the native sample rate of
  // the audio source (instead of re-sampling).
  int32 sample_rate = 2;

  // [Optional] The language of the supplied audio as a BCP-47 language tag.
  // Example: "en-GB" https://www.rfc-editor.org/rfc/bcp/bcp47.txt
  // If omitted, defaults to "en-US". See
  // [Language Support](/speech/docs/best-practices#language_support) for
  // a list of the currently supported language codes.
  string language_code = 3;

  // [Optional] Maximum number of recognition hypotheses to be returned.
  // Specifically, the maximum number of `SpeechRecognitionAlternative`
  // messages within each `SpeechRecognitionResult`.
  // The server may return fewer than `max_alternatives`.
  // Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of
  // `1`. If omitted, defaults to `1`.
  int32 max_alternatives = 4;

  // [Optional] If set to `true`, the server will attempt to filter out
  // profanities, replacing all but the initial character in each filtered word
  // with asterisks, e.g. "f***". If set to `false` or omitted, profanities
  // won't be filtered out.
  bool profanity_filter = 5;

  // [Optional] A means to provide context to assist the speech recognition.
  SpeechContext speech_context = 6;
}
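
// A fuller `RecognitionConfig` example in proto text format (illustrative
// only; every value here is an assumption), including a `SpeechContext` with
// phrase hints:
//
//   config {
//     encoding: FLAC
//     sample_rate: 16000
//     language_code: "en-US"
//     max_alternatives: 3
//     profanity_filter: true
//     speech_context {
//       phrases: "Cloud Speech API"
//       phrases: "endpointer"
//     }
//   }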

// Provides "hints" to the speech recognizer to favor specific words and
// phrases in the results.
message SpeechContext {
  // [Optional] A list of up to 50 phrases of up to 100 characters each,
  // providing word and phrase "hints" to the recognizer so that it is more
  // likely to recognize them.
  repeated string phrases = 1;
}

// Contains audio data in the encoding specified in the `RecognitionConfig`.
// Either `content` or `uri` must be supplied. Supplying both or neither
// returns [google.rpc.Code.INVALID_ARGUMENT][].
message RecognitionAudio {
  oneof audio_source {
    // The audio data bytes encoded as specified in
    // `RecognitionConfig`. Note: as with all bytes fields, protocol buffers
    // use a pure binary representation, whereas JSON representations use
    // base64.
    bytes content = 1 [ctype = CORD];

    // URI that points to a file that contains audio data bytes as specified in
    // `RecognitionConfig`. Currently, only Google Cloud Storage URIs are
    // supported, which must be specified in the following format:
    // `gs://bucket_name/object_name` (other URI formats return
    // [google.rpc.Code.INVALID_ARGUMENT][]). For more information, see
    // [Request URIs](/storage/docs/reference-uris).
    string uri = 2;
  }
}

// `SyncRecognizeResponse` is the only message returned to the client by
// `SyncRecognize`. It contains the result as zero or more
// sequential `SpeechRecognitionResult` messages.
message SyncRecognizeResponse {
  // [Output-only] Sequential list of transcription results corresponding to
  // sequential portions of audio.
  repeated SpeechRecognitionResult results = 2;
}

// `AsyncRecognizeResponse` is the only message returned to the client by
// `AsyncRecognize`. It contains the result as zero or more
// sequential `SpeechRecognitionResult` messages.
message AsyncRecognizeResponse {
  // [Output-only] Sequential list of transcription results corresponding to
  // sequential portions of audio.
  repeated SpeechRecognitionResult results = 2;
}

// `StreamingRecognizeResponse` is the only message returned to the client by
// `StreamingRecognize`. The result is streamed back to the client as zero or
// more sequential `StreamingRecognizeResponse` messages.
message StreamingRecognizeResponse {
  // Indicates the type of endpointer event.
  enum EndpointerType {
    // No endpointer event specified.
    ENDPOINTER_EVENT_UNSPECIFIED = 0;

    // Speech has been detected in the audio stream.
    START_OF_SPEECH = 1;

    // Speech has ceased to be detected in the audio stream.
    END_OF_SPEECH = 2;

    // The end of the audio stream has been reached, and it is being processed.
    END_OF_AUDIO = 3;

    // This event is only sent when `single_utterance` is `true`. It indicates
    // that the server has detected the end of the user's speech utterance and
    // expects no additional speech. Therefore, the server will not process
    // additional audio. The client should stop sending additional audio data.
    END_OF_UTTERANCE = 4;
  }

  // [Output-only] If set, returns a [google.rpc.Status][] message that
  // specifies the error for the operation.
  google.rpc.Status error = 1;

  // [Output-only] This repeated list contains zero or more results that
  // correspond to consecutive portions of the audio currently being processed.
  // It contains zero or one `is_final=true` result (the newly settled portion),
  // followed by zero or more `is_final=false` results.
  repeated StreamingRecognitionResult results = 2;

  // [Output-only] Indicates the lowest index in the `results` array that has
  // changed. The repeated `StreamingRecognitionResult` results overwrite past
  // results at this index and higher.
  int32 result_index = 3;

  // [Output-only] Indicates the type of endpointer event.
  EndpointerType endpointer_type = 4;
}
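
// To illustrate how `results`, `result_index`, and `is_final` interact, a
// hypothetical series of responses while the user says "to be or not to be"
// might look like this (transcripts, ordering, and stability values are
// invented for illustration):
//
//   1. endpointer_type: START_OF_SPEECH
//   2. results { alternatives { transcript: "to be" } stability: 0.9 }
//      result_index: 0
//   3. results { alternatives { transcript: "to be or not to" }
//                stability: 0.9 }
//      result_index: 0
//   4. results { alternatives { transcript: "to be or not to be" }
//                is_final: true }
//      result_index: 0
//
// Each response overwrites earlier results at `result_index` and above; once
// response 4 arrives with `is_final: true`, that portion of the transcript is
// settled and will not change.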

// A speech recognition result corresponding to a portion of the audio that is
// currently being processed.
// TODO(gshires): add a comment describing the various repeated interim and
// alternative results fields.
message StreamingRecognitionResult {
  // [Output-only] May contain one or more recognition hypotheses (up to the
  // maximum specified in `max_alternatives`).
  repeated SpeechRecognitionAlternative alternatives = 1;

  // [Output-only] If `false`, this `StreamingRecognitionResult` represents an
  // interim result that may change. If `true`, this is the final time the
  // speech service will return this particular `StreamingRecognitionResult`;
  // the recognizer will not return any further hypotheses for this portion of
  // the transcript and corresponding audio.
  bool is_final = 2;

  // [Output-only] An estimate of the probability that the recognizer will not
  // change its guess about this interim result. Values range from 0.0
  // (completely unstable) to 1.0 (completely stable). Note that this is not
  // the same as `confidence`, which estimates the probability that a
  // recognition result is correct.
  // This field is only provided for interim results (`is_final=false`).
  // The default of 0.0 is a sentinel value indicating stability was not set.
  float stability = 3;
}
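
// To make the `stability` / `confidence` distinction concrete (all values
// hypothetical): an interim result might carry
//
//   alternatives { transcript: "to be or not" }  stability: 0.8
//
// meaning the recognizer is unlikely to revise that text, while saying nothing
// about whether it is correct; the eventual final result might carry
//
//   alternatives { transcript: "to be or not to be" confidence: 0.87 }
//   is_final: true
//
// where `confidence` estimates the correctness of the top hypothesis.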

// A speech recognition result corresponding to a portion of the audio.
message SpeechRecognitionResult {
  // [Output-only] May contain one or more recognition hypotheses (up to the
  // maximum specified in `max_alternatives`).
  repeated SpeechRecognitionAlternative alternatives = 1;
}

// Alternative hypotheses (a.k.a. n-best list).
message SpeechRecognitionAlternative {
  // [Output-only] Transcript text representing the words that the user spoke.
  string transcript = 1;

  // [Output-only] The confidence estimate between 0.0 and 1.0. A higher number
  // means the system is more confident that the recognition is correct.
  // This field is typically provided only for the top hypothesis, and only for
  // `is_final=true` results.
  // The default of 0.0 is a sentinel value indicating confidence was not set.
  float confidence = 2;
}