chromium/chrome/services/speech/soda/proto/soda_api.proto

// DO NOT CHANGE THIS FILE!
// This proto is copied from
// http://google3/speech/soda/chrome/extended_soda_api.proto, which is the
// source of truth. Any changes should be submitted and approved there before
// being copied into here.

// proto2 syntax: the messages below declare explicit `[default = ...]` field
// defaults, which proto3 does not allow.
syntax = "proto2";

package speech.soda.chrome;

// Optimize generated output for Lite, since it's going to be running on
// end-user devices.
option optimize_for = LITE_RUNTIME;
// Generate a separate Java source file for each top-level message/enum instead
// of nesting them all inside a single outer class.
option java_multiple_files = true;

// Configuration for SODA: describes the raw audio that will be provided, the
// language pack to load, the recognition mode, and which optional features and
// events are enabled.
//
// Next ID to use: 17
message ExtendedSodaConfigMsg {
  // Number of channels in RAW audio that will be provided to SODA.
  optional int32 channel_count = 1;
  // Sample rate of the provided audio, in Hz.
  optional int32 sample_rate = 2;

  // Maximum size, in bytes, of the buffer to use in PipeStream. Defaults to 0,
  // which means unlimited.
  optional int32 max_buffer_bytes = 4 [default = 0];

  // If set to true, forces the audio provider to simulate realtime audio
  // provision. This only makes sense during testing, to simulate realtime audio
  // being provided from a big chunk of audio.
  // This slows down audio provided to SODA to a maximum of real-time, which
  // means more accurate endpointer behavior, but is unsuitable for execution in
  // real production environments. Set with caution! Defaults to false.
  optional bool simulate_realtime_testonly = 5 [default = false];

  // Config file location for the language pack. Deprecated.
  // NOTE(review): language_pack_directory (field 7) looks like the
  // replacement -- confirm against the upstream proto.
  // (Declared out of numeric order; the field number, not the declaration
  // position, is what identifies the field on the wire.)
  optional string config_file_location = 3 [deprecated = true];

  // API key used for call verification.
  optional string api_key = 6;

  // Directory of the language pack to use.
  optional string language_pack_directory = 7;

  // The kinds of recognition that can be requested via recognition_mode.
  enum RecognitionMode {
    // Recognition mode is unknown. This is the zero value of the enum, but it
    // is not the declared default of recognition_mode.
    UNKNOWN = 0;

    // Intended for voice input for keyboard usage.
    IME = 1;

    // Intended to caption a stream of audio.
    CAPTION = 2;
  }
  // What kind of recognition to execute here. Impacts model usage.
  // Defaults to IME when unset.
  optional RecognitionMode recognition_mode = 8 [default = IME];

  // Deprecated. No impact in current code.
  optional bool reset_on_final_result = 9 [default = true, deprecated = true];

  // Whether to populate the timing_metrics field on Recognition and Endpoint
  // events (SodaRecognitionResult.timing_metrics and
  // SodaEndpointEvent.timing_metrics). Defaults to true.
  optional bool include_timing_metrics = 10 [default = true];

  // Whether or not to request lang id events. Defaults to false.
  optional bool enable_lang_id = 11 [default = false];

  // Whether to enable formatting and punctuation in the recognition results.
  // Defaults to true.
  optional bool enable_formatting = 12 [default = true];

  // Whether to enable speaker change detection in the recognition results.
  // Defaults to false.
  optional bool enable_speaker_change_detection = 13 [default = false];

  // Whether to return, in an asynchronous manner, speech/google3 logs from here
  // and back through the callbacks. Defaults to false.
  // NOTE(review): these logs are presumably what populates
  // SodaResponse.log_lines -- confirm.
  optional bool include_logging = 14 [default = false];

  // Multilang configuration.
  // In order to enable multilang, ExtendedSodaConfigMsg.language_pack_directory
  // must correctly specify the LP (language pack) for the primary locale.
  optional MultilangConfig multilang_config = 15;

  // Whether to mask (true) or leave as-is (false, the default) offensive words
  // in recognition.
  optional bool mask_offensive_words = 16 [default = false];
}

// Settings for multilang recognition, in which SODA may switch the recognizer
// when a new spoken language is detected.
//
// Next ID: 3
message MultilangConfig {
  // The locale-to-LP mapping for multilang code switching: each key is a
  // locale in BCP47 format (e.g. `en-US`) and each value is the language pack
  // directory for that locale. The set of keys are the locales that will
  // support multilang code switching. There should only be one locale per
  // language (e.g. only es-ES, not both es-US and es-ES). The primary LP
  // specified in OnDeviceAsrConfig.language_pack_directory does not need to be
  // included.
  // NOTE(review): OnDeviceAsrConfig is not defined in this file; this
  // presumably corresponds to ExtendedSodaConfigMsg.language_pack_directory
  // here -- confirm against the upstream proto.
  map<string, string> multilang_language_pack_directory = 1;

  // Rewind the audio buffer if SODA decides to switch the recognizer when a
  // new language is detected. Language detection lags behind the actual change
  // of spoken language; the audio buffer rewind tries to cover this gap.
  // See b/218705498 for additional details.
  // No explicit default is declared, so an unset value reads as false.
  optional bool rewind_when_switching_language = 2;
}

// Timing information attached to recognition and endpoint events (see
// SodaRecognitionResult.timing_metrics and SodaEndpointEvent.timing_metrics).
// All fields are expressed in microseconds (usec).
//
// Next id: 5
message TimingMetrics {
  // Epoch time of first audio buffer of main query that is fed into ASR.
  // This is the wall time read from the system clock when the first audio
  // buffer is received by the terse processor.
  optional int64 audio_start_epoch_usec = 1;

  // Start time, in audio time, measured from the start of the SODA session.
  // This time measures the amount of audio input into SODA.
  optional int64 audio_start_time_usec = 2;

  // Elapsed wall time, in usec, since the first frame.
  optional int64 elapsed_wall_time_usec = 3;

  // Elapsed processed audio, in usec, from the first frame after preamble.
  optional int64 event_end_time_usec = 4;
}

// One part (typically a word or piece of text) of a recognition hypothesis,
// together with its time alignment.
//
// Next item: 3.
message HypothesisPart {
  // Typically 1 item for a word/piece of text. If formatting is enabled, the
  // raw text will be the second item.
  repeated string text = 1;
  // Offset, in milliseconds, of the beginning of this part of the hypothesis
  // from this event's audio_start_time_usec. Note the unit difference: this
  // offset is in ms, while the TimingMetrics fields are in usec.
  optional int64 alignment_ms = 2;
}

// A recognition result: the recognized hypotheses, whether they are partial or
// final, and (for final results) why recognition ended and word-level timing.
//
// Next id: 6
message SodaRecognitionResult {
  // Hypotheses from recognition, in order of probability. We don't get the
  // probability from SODA, so the only given is that the first is the "best".
  repeated string hypothesis = 1;

  // The kinds of result set that can be reported in result_type.
  enum ResultType {
    UNKNOWN = 0;
    // Partial result of a speech segment so far.
    PARTIAL = 1;
    // Final result for this segment.
    FINAL = 2;
    // Prefetch is only sent for likely query strings. This won't happen for
    // non-query mode SODA, but we add here for completeness.
    PREFETCH = 3;
  }

  // What kind of result set this is.
  optional ResultType result_type = 2;

  // The reasons a recognition result can be marked final.
  enum FinalResultEndpointReason {
    ENDPOINT_UNKNOWN = 0;
    // End of speech from endpointer.
    ENDPOINT_END_OF_SPEECH = 1;
    // End of utterance from endpointer.
    ENDPOINT_END_OF_UTTERANCE = 2;
    // No more audio.
    ENDPOINT_END_OF_AUDIO = 3;
    // Final was generated because a hotword was detected.
    ENDPOINT_ASR_RESET_BY_HOTWORD = 4;
    // ASR was reset via the external API.
    ENDPOINT_ASR_RESET_EXTERNAL = 5;
    // Final recognition result was generated due to an error in ASR.
    ENDPOINT_ASR_ERROR = 6;
  }
  // If this is a final result, why the recognition was marked final.
  optional FinalResultEndpointReason endpoint_reason = 3;

  // Timing information for the event.
  optional TimingMetrics timing_metrics = 4;

  // For final results: the first hypothesis, split into parts with word-level
  // timing.
  repeated HypothesisPart hypothesis_part = 5;
}

// An endpointer event: reports a start/end of speech, utterance or audio,
// together with its timing.
//
// Next id: 3
message SodaEndpointEvent {
  // What endpoint type we're referring to here.
  enum EndpointType {
    // A start-of-speech moment has been detected at this time. Audio currently
    // contains speech.
    START_OF_SPEECH = 0;

    // End of speech has been detected by the endpointer, audio does not contain
    // speech right now.
    END_OF_SPEECH = 1;

    // End of Audio due to an end-of-mic data event.
    END_OF_AUDIO = 2;

    // End of Utterance detected from the endpointer. Not used in
    // Caption/Transcription.
    END_OF_UTTERANCE = 3;

    // Endpoint type is unknown. Unlike the other enums in this file, UNKNOWN is
    // not the zero value here; endpoint_type selects it through an explicit
    // default instead.
    UNKNOWN = 4;
  }

  // The type of endpoint being reported. Explicitly defaults to UNKNOWN, so an
  // unset field does not read as START_OF_SPEECH (the enum's zero value).
  optional EndpointType endpoint_type = 1 [default = UNKNOWN];

  // Timing information for the event.
  optional TimingMetrics timing_metrics = 2;
}

// Audio level information. Carried in SodaResponse.audio_level_info when the
// response type is AUDIO_LEVEL.
message SodaAudioLevelInfo {
  // Low-pass filtered RMS in range 0..1.
  optional float rms = 1;

  // Speech likelihood score in range 0..1.
  optional float audio_level = 2;

  // Amount of audio seen from start of SODA session until an audio level event,
  // in microseconds. This value is only set when audio_level is set.
  optional int64 audio_time_usec = 3;
}

// A language identification (LangId) event: the detected locale, the detection
// confidence, and the outcome of any resulting ASR language switch. Carried in
// SodaResponse.langid_event when the response type is LANGID.
message SodaLangIdEvent {
  // Outcome of switching ASR to the detected locale.
  enum AsrSwitchResult {
    // No switch is attempted when:
    // 1. Multilang is disabled.
    // 2. top_language_confidence did not meet the sensitivity threshold.
    // 3. the top detected language is the same as the one currently being
    // transcribed.
    // 4. The LangId results were jittery and this event was ignored for being
    // too short.
    DEFAULT_NO_SWITCH = 0;
    // ASR successfully switched to this locale.
    SWITCH_SUCCEEDED = 1;
    // ASR attempted to switch to this locale, but could not load the provided
    // LP.
    SWITCH_FAILED = 2;
    // ASR did not attempt to switch because no LP was provided for the locale,
    // but top_language_confidence met the sensitivity threshold.
    SWITCH_SKIPPED_NO_LP = 3;
  }

  // Locale, e.g. "en-us" or "af-za".
  optional string language = 1;

  // Equal to the internal enum from langid confidence.
  optional int32 confidence_level = 2;

  // Outcome of the ASR switch to this locale: whether a switch was attempted
  // and, if so, whether it succeeded. See AsrSwitchResult for the possible
  // values.
  optional AsrSwitchResult asr_switch_result = 3;
}

// Top-level response message from SODA. soda_type identifies the kind of
// message, and determines which of the payload fields below (if any) is set.
message SodaResponse {
  // The kinds of message a SodaResponse can carry.
  enum SodaMessageType {
    UNKNOWN = 0;
    RECOGNITION = 1;
    STOP = 2;
    SHUTDOWN = 3;
    START = 4;
    ENDPOINT = 5;
    AUDIO_LEVEL = 6;
    LANGID = 7;
    LOGS_ONLY_ARTIFICIAL_MESSAGE = 8;
  }

  // The type of this response. Defaults to UNKNOWN.
  optional SodaMessageType soda_type = 1 [default = UNKNOWN];

  // Set when type is RECOGNITION.
  optional SodaRecognitionResult recognition_result = 2;

  // Set when type is ENDPOINT.
  optional SodaEndpointEvent endpoint_event = 3;

  // Set when type is AUDIO_LEVEL.
  optional SodaAudioLevelInfo audio_level_info = 4;

  // Set when type is LANGID.
  optional SodaLangIdEvent langid_event = 5;

  // Asynchronously built up list of log strings since the last SodaResponse.
  repeated string log_lines = 6;
}