// DO NOT CHANGE THIS FILE!
// This proto is copied from
// http://google3/speech/soda/chrome/extended_soda_api.proto. That is the
// source of truth, and any changes should be submitted and approved there
// before being copied here.
syntax = "proto2";
package speech.soda.chrome;
// Optimize generated output for Lite, since it's going to be running on
// end-user devices.
option optimize_for = LITE_RUNTIME;
option java_multiple_files = true;
// Next ID to use: 17
message ExtendedSodaConfigMsg {
// Number of channels in RAW audio that will be provided to SODA.
optional int32 channel_count = 1;
// Sample rate, in Hz.
optional int32 sample_rate = 2;
// Maximum size of buffer to use in PipeStream. By default, is 0, which means
// unlimited.
optional int32 max_buffer_bytes = 4 [default = 0];
// If set to true, forces the audio provider to simulate real-time audio
// provision. This only makes sense during testing, to simulate real-time
// audio delivery from a large chunk of audio.
// This throttles the audio provided to SODA to at most real-time speed, which
// means more accurate endpointer behavior, but is unsuitable for execution in
// real production environments. Set with caution!
optional bool simulate_realtime_testonly = 5 [default = false];
// Config file location for the language pack.
optional string config_file_location = 3 [deprecated = true];
// API key used for call verification.
optional string api_key = 6;
// Directory of the language pack to use.
optional string language_pack_directory = 7;
enum RecognitionMode {
UNKNOWN = 0;
// Intended for voice input for keyboard usage.
IME = 1;
// Intended to caption a stream of audio.
CAPTION = 2;
}
// What kind of recognition to execute here. Impacts model usage.
optional RecognitionMode recognition_mode = 8 [default = IME];
// No impact in current code.
optional bool reset_on_final_result = 9 [default = true, deprecated = true];
// Whether to populate the timing_metrics field on Recognition and Endpoint
// events.
optional bool include_timing_metrics = 10 [default = true];
// Whether or not to request lang id events.
optional bool enable_lang_id = 11 [default = false];
// Whether to enable formatting and punctuation in the recognition results.
optional bool enable_formatting = 12 [default = true];
// Whether to enable speaker change detection in the recognition results.
optional bool enable_speaker_change_detection = 13 [default = false];
// Whether to asynchronously return speech/google3 logs through the
// callbacks.
optional bool include_logging = 14 [default = false];
// Multilang configuration.
// In order to enable multilang, ExtendedSodaConfigMsg.language_pack_directory
// must correctly specify the LP for the primary locale.
optional MultilangConfig multilang_config = 15;
// Whether to mask offensive words in recognition results; if false, they are
// left as-is.
optional bool mask_offensive_words = 16 [default = false];
}
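// A minimal ExtendedSodaConfigMsg example in text format for live captioning.
// This is an illustrative sketch only: the sample rate, channel count, and
// language pack path below are assumed values, not requirements.
//
//   channel_count: 1
//   sample_rate: 16000
//   recognition_mode: CAPTION
//   language_pack_directory: "/opt/soda/lp/en-US"  # hypothetical path
//   enable_formatting: true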
// Next ID: 3
message MultilangConfig {
// The locale-to-LP mapping for multilang code switching. Locales are in BCP47
// format, e.g. `en-US`. The set of keys are the locales that will support
// multilang code switching. There should only be one locale per language
// (e.g. only es-ES, not both es-US and es-ES). The primary LP specified in
// ExtendedSodaConfigMsg.language_pack_directory does not need to be included.
map<string, string> multilang_language_pack_directory = 1;
// Rewind the audio buffer if SODA decides to switch the recognizer when a
// new language is detected. Language detection has a delay between when the
// spoken language changes and when the new language is detected; rewinding
// the audio buffer tries to cover this gap. See b/218705498 for additional
// details.
}
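// A MultilangConfig sketch in text format, assuming hypothetical language
// pack paths. The primary locale's LP comes from language_pack_directory and
// is omitted here, per the comment above.
//
//   multilang_language_pack_directory {
//     key: "fr-FR"
//     value: "/opt/soda/lp/fr-FR"  # hypothetical path
//   }
//   multilang_language_pack_directory {
//     key: "de-DE"
//     value: "/opt/soda/lp/de-DE"  # hypothetical path
//   }
//   rewind_when_switching_language: true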
// Next id: 5
message TimingMetrics {
// Epoch time of first audio buffer of main query that is fed into ASR.
// This is the wall time read from the system clock when the first audio
// buffer is received by the terse processor.
optional int64 audio_start_epoch_usec = 1;
// Start time in audio time from start of SODA session.
// This time measures the amount of audio input into SODA.
optional int64 audio_start_time_usec = 2;
// Elapsed wall time usec since first frame.
optional int64 elapsed_wall_time_usec = 3;
// Elapsed processed audio usec from first frame after preamble.
optional int64 event_end_time_usec = 4;
}
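// An illustrative TimingMetrics example (all values assumed) to show the
// wall-time vs. audio-time distinction: the first audio buffer arrives at
// epoch 1,700,000,000,000,000 usec, the event fires after 3 s of wall time,
// but only 2 s of audio have been fed into SODA by then.
//
//   audio_start_epoch_usec: 1700000000000000
//   audio_start_time_usec: 0
//   elapsed_wall_time_usec: 3000000
//   event_end_time_usec: 2000000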
// Next id: 3
message HypothesisPart {
// Typically 1 item for a word/piece of text. If formatting is enabled, the
// raw text will be the second item.
repeated string text = 1;
// Offset in ms of the beginning of this part of the hypothesis from this
// event's audio_start_time_usec.
optional int64 alignment_ms = 2;
}
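// A HypothesisPart sketch in text format, assuming formatting is enabled so
// the raw text appears as the second item (the word and offset are
// illustrative):
//
//   text: "Hello,"  # formatted text
//   text: "hello"   # raw text, second item because formatting is enabled
//   alignment_ms: 1200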
// Next id: 6
message SodaRecognitionResult {
// Hypotheses from recognition, in order of probability. We don't get the
// probability from SODA, so the only guarantee is that the first is the
// "best".
repeated string hypothesis = 1;
enum ResultType {
UNKNOWN = 0;
// Partial result of a speech segment so far.
PARTIAL = 1;
// Final result for this segment.
FINAL = 2;
// Prefetch is only sent for likely query strings. This won't happen for
// non-query mode SODA, but we add it here for completeness.
PREFETCH = 3;
}
// What kind of result set this is.
optional ResultType result_type = 2;
enum FinalResultEndpointReason {
ENDPOINT_UNKNOWN = 0;
// End of speech from endpointer.
ENDPOINT_END_OF_SPEECH = 1;
// End of utterance from endpointer.
ENDPOINT_END_OF_UTTERANCE = 2;
// No more audio.
ENDPOINT_END_OF_AUDIO = 3;
// Final was generated because a hotword was detected.
ENDPOINT_ASR_RESET_BY_HOTWORD = 4;
// ASR was reset via the external API.
ENDPOINT_ASR_RESET_EXTERNAL = 5;
// Final recognition result was generated due to an error in ASR.
ENDPOINT_ASR_ERROR = 6;
}
// If this is a final result, why the recognition was marked final.
optional FinalResultEndpointReason endpoint_reason = 3;
// Timing information for the event.
optional TimingMetrics timing_metrics = 4;
// For final results. The first hypothesis split into word level timing.
repeated HypothesisPart hypothesis_part = 5;
}
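// A sketch of a final caption-mode SodaRecognitionResult in text format. The
// hypothesis strings and alignments are illustrative assumptions, not real
// SODA output.
//
//   hypothesis: "Hello world"
//   result_type: FINAL
//   endpoint_reason: ENDPOINT_END_OF_SPEECH
//   hypothesis_part { text: "Hello" text: "hello" alignment_ms: 0 }
//   hypothesis_part { text: "world" text: "world" alignment_ms: 480 }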
// Next id: 3
message SodaEndpointEvent {
// What endpoint type we're referring to here.
enum EndpointType {
// A start-of-speech moment has been detected at this time. Audio currently
// contains speech.
START_OF_SPEECH = 0;
// End of speech has been detected by the endpointer; audio does not contain
// speech right now.
END_OF_SPEECH = 1;
// End of Audio due to an end-of-mic data event.
END_OF_AUDIO = 2;
// End of Utterance detected from the endpointer. Not used in
// Caption/Transcription.
END_OF_UTTERANCE = 3;
UNKNOWN = 4;
}
optional EndpointType endpoint_type = 1 [default = UNKNOWN];
// Timing information for the event.
optional TimingMetrics timing_metrics = 2;
}
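// An illustrative SodaEndpointEvent in text format: a typical speech segment
// produces a START_OF_SPEECH event followed later by one like this (the
// timing value is assumed):
//
//   endpoint_type: END_OF_SPEECH
//   timing_metrics { elapsed_wall_time_usec: 2500000 }  # assumed value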
message SodaAudioLevelInfo {
// Low-pass filtered RMS in range 0..1.
optional float rms = 1;
// Speech likelihood score in range 0..1.
optional float audio_level = 2;
// Amount of audio seen from start of SODA session until an audio level event.
// This value is only set when audio_level is set.
optional int64 audio_time_usec = 3;
}
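// An illustrative SodaAudioLevelInfo in text format (values assumed): a loud,
// speech-like buffer seen 1 s into the session.
//
//   rms: 0.4
//   audio_level: 0.9
//   audio_time_usec: 1000000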
message SodaLangIdEvent {
enum AsrSwitchResult {
// No switch is attempted when:
// 1. Multilang is disabled.
// 2. top_language_confidence did not meet the sensitivity threshold.
// 3. The top detected language is the same as the one currently being
// transcribed.
// 4. The LangId results were jittery and this event was ignored for being
// too short.
DEFAULT_NO_SWITCH = 0;
// ASR successfully switched to this locale.
SWITCH_SUCCEEDED = 1;
// ASR attempted to switch to this locale, but could not load the provided
// LP.
SWITCH_FAILED = 2;
// ASR did not attempt to switch because no LP was provided for the locale,
// but top_language_confidence met the sensitivity threshold.
SWITCH_SKIPPED_NO_LP = 3;
}
// Locale, e.g. "en-us" or "af-za"
optional string language = 1;
// Confidence level, equal to the internal LangId confidence enum value.
optional int32 confidence_level = 2;
// Flag indicating whether ASR successfully switched to this locale.
optional AsrSwitchResult asr_switch_result = 3;
}
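// A SodaLangIdEvent sketch in text format for a detection that triggered a
// successful recognizer switch (the locale and confidence are illustrative):
//
//   language: "fr-fr"
//   confidence_level: 3  # internal enum value; assumed for illustration
//   asr_switch_result: SWITCH_SUCCEEDED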
message SodaResponse {
enum SodaMessageType {
UNKNOWN = 0;
RECOGNITION = 1;
STOP = 2;
SHUTDOWN = 3;
START = 4;
ENDPOINT = 5;
AUDIO_LEVEL = 6;
LANGID = 7;
LOGS_ONLY_ARTIFICIAL_MESSAGE = 8;
}
optional SodaMessageType soda_type = 1 [default = UNKNOWN];
// Set when soda_type is RECOGNITION.
optional SodaRecognitionResult recognition_result = 2;
// Set when soda_type is ENDPOINT.
optional SodaEndpointEvent endpoint_event = 3;
// Set when soda_type is AUDIO_LEVEL.
optional SodaAudioLevelInfo audio_level_info = 4;
// Set when soda_type is LANGID.
optional SodaLangIdEvent langid_event = 5;
// Asynchronously built-up list of log lines since the last SodaResponse.
repeated string log_lines = 6;
}
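// A sketch of a SodaResponse in text format: soda_type selects which payload
// field is expected to be set, as the comments above describe. The hypothesis
// text is an illustrative assumption.
//
//   soda_type: RECOGNITION
//   recognition_result {
//     hypothesis: "Hello wor"
//     result_type: PARTIAL
//   }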