chromium/chromeos/services/machine_learning/public/mojom/soda.mojom

// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Datatypes and interfaces of the speech recognition API.

// NOTE: This mojom exists in two places and must be kept in sync:
//       Chromium:  //chromeos/services/machine_learning/public/mojom/
//       Chrome OS: src/platform2/ml/mojom/
//       Note: Other repos downstream of Chromium might also use this mojom.
// Example: A backwards-compatible mojom change (and corresponding
// implementation change) can be made in Chrome OS first, then replicated to the
// clients (Chromium, other downstream repos) later.
// Use //chromeos/services/machine_learning/public/mojom/roll_mojoms.sh to help
// replicate Chrome OS-side changes over to Chromium.
// Versions list:
// Version 0: Initial
// Version 1: Include HypothesisPart Info in Final result.
// Version 2: Include enable formatting in request config.
// Version 3: Include recognition mode in request config.
// Version 4: Include mask offensive words mode in request config.
// Version 5: Include speaker change detection in request config.
// Version 6: Include logging inclusion in request config.
// Version 7: Include multilang fields in request config, and langid responses.
// Version 8: Include leading_space field in HypothesisPart.
// Version 9: Include speaker diarization in request config and HypothesisPart,
//            and add LabelCorrectionEvent.
module chromeos.machine_learning.mojom;

import "mojo/public/mojom/base/time.mojom";

// Augments a bool to include an 'unknown' value.
[Stable, Extensible]
enum OptionalBool {
  [Default] kUnknown = 0,
  kFalse,
  kTrue,
};

[Stable, Extensible]
enum SodaRecognitionMode {
  [Default] kCaption,
  kIme,
};

[Stable]
struct SodaMultilangConfig {
  // Rewind the audio buffer (to the end of the previous final result) if SODA
  // decides to switch recognizers when a new language is detected. Language
  // detection lags behind the actual change in spoken language; rewinding the
  // audio buffer attempts to cover this gap.
  bool rewind_when_switching_language = true;
  // Locale-to-language-pack map for additional languages.
  // Key: lowercased locale, e.g. "en-us".
  // Value: directory of the language pack DLC to use.
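  // Example entry (illustrative only; the DLC path here is a hypothetical
  // placeholder, not a real install location):
  //   "fr-fr" -> "/path/to/fr-fr/language-pack-dlc"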
  map<string, string> locale_to_language_pack_map;
};

[Stable, Extensible]
enum SpeakerDiarizationMode {
  // Not specified.
  kDiarizationUnspecified,

  // Do not use speaker diarization. The default.
  [Default] kSpeakerDiarizationModeOffDefault,

  // Use speaker turn detection.
  kSpeakerChangeDetection,

  // Use full speaker diarization.
  kSpeakerLabelDetection,
};

// The configuration used to load Soda recognizer.
[Stable]
struct SodaConfig {
  // Number of channels of the audio that will be sent to the SODA recognizer.
  uint32 channel_count;
  // Sample rate of the audio that will be sent to the SODA recognizer.
  uint32 sample_rate;
  // The API key for the SODA library.
  string api_key;
  // Path to already-installed SODA library.
  string library_dlc_path;
  // Path to already-installed SODA language pack to use.
  string language_dlc_path;
  // Whether to enable automated punctuation. Defaults to true as this
  // is the default in the underlying protocol buffer.
  [MinVersion=2]
  OptionalBool enable_formatting = kTrue;
  // Which mode to execute in: IME or CAPTION. Defaults to CAPTION.
  [MinVersion=3]
  SodaRecognitionMode recognition_mode = kCaption;
  // Whether to mask offensive words in the captioning output.
  [MinVersion=4]
  bool mask_offensive_words = false;
  // Whether to detect speaker changes in the transcript from the model.
  [MinVersion=5]
  bool speaker_change_detection = false;
  // Print more logging output in the ML Service.
  [MinVersion=6]
  bool include_logging_output = false;

  [MinVersion=7]
  SodaMultilangConfig? multi_lang_config;

  [MinVersion=9]
  SpeakerDiarizationMode speaker_diarization_mode =
      kSpeakerDiarizationModeOffDefault;

  // When speaker_diarization_mode is set to kSpeakerLabelDetection, the
  // maximum number of speaker labels to assign.
  [MinVersion=9]
  uint32 max_speaker_count = 2;
};
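
// For illustration, a hedged sketch of how a C++ client might populate this
// struct through the standard generated mojo bindings. All field values below
// are placeholders, not recommended settings.
//
//   auto config = chromeos::machine_learning::mojom::SodaConfig::New();
//   config->channel_count = 1;
//   config->sample_rate = 16000;
//   config->api_key = "<api key>";
//   config->library_dlc_path = "<path to installed SODA library DLC>";
//   config->language_dlc_path = "<path to installed language pack DLC>";
//   config->recognition_mode =
//       chromeos::machine_learning::mojom::SodaRecognitionMode::kCaption;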

// From the endpointer: what kind of endpointer event to record.
[Stable, Extensible]
enum EndpointerType {
  // Speech detected.
  START_OF_SPEECH,
  // End of speech detected, but audio continues.
  END_OF_SPEECH,
  // Audio is terminated.
  END_OF_AUDIO,
  // Query is terminated.
  END_OF_UTTERANCE,
};

// Common information about the timing of reported SODA events.
[Stable]
struct TimingInfo {
  // Epoch time of the first audio buffer of the main query that is fed into
  // ASR. This is the wall time read from the system clock when the first audio
  // buffer is received by the terse processor.
  mojo_base.mojom.Time audio_start_epoch;

  // Start time in audio time from the start of the SODA session.
  // This time measures the amount of audio input into SODA.
  mojo_base.mojom.TimeDelta audio_start_time;

  // Elapsed wall time since the first frame.
  mojo_base.mojom.TimeDelta elapsed_wall_time;

  // Elapsed processed audio time from the first frame after the preamble.
  mojo_base.mojom.TimeDelta event_end_time;

  // On-device benchmark latency as defined in go/asr-latency-metrics.
  mojo_base.mojom.TimeDelta latency;

  // On-device counterpart of the E2E normalized latency as defined in
  // go/asr-latency-metrics. This metric is mainly for non-continuous
  // conversation.
  float normalized_latency;

  // Timing for each word as an offset from audio_start_time.
  array<mojo_base.mojom.TimeDelta> word_alignments;
};

// Start/end events.
[Stable]
struct EndpointerEvent {
  EndpointerType endpointer_type;
  TimingInfo? timing_event;
};

// Detail about a part of a hypothesis in a result. Only meaningful in the
// context of an array of such parts for a hypothesis.
[Stable]
struct HypothesisPartInResult {
  // Typically one item for a word/piece of text. If formatting is
  // enabled, the raw text is the second item.
  array<string> text;

  // Offset from the beginning of this part of the hypothesis from
  // audio_start_time in TimingInfo.
  mojo_base.mojom.TimeDelta alignment;

  // Whether there is a space before this part in the hypothesis. Only
  // populated when formatting is enabled.
  [MinVersion=8]
  bool? leading_space;

  // Whether this word was uttered by a different speaker than the previous
  // word. Populated in kSpeakerLabelDetection and kSpeakerChangeDetection
  // modes.
  [MinVersion=9]
  bool speaker_change = false;

  // The label of the speaker who uttered this word. Only populated in
  // kSpeakerLabelDetection mode.
  [MinVersion=9]
  string? speaker_label;
};

// A result _during_ a recognition. Could change at any time with the
// next partial or the final recognition for this chunk.
[Stable]
struct PartialResult {
  // Hypotheses so far; the first is the most likely, followed by the others.
  // Note: the relationship between the first and the other hypotheses is not
  // guaranteed in any way.
  array<string> partial_text;
  TimingInfo? timing_event;

  // If populated, this array contains the hypothesis parts for the
  // first partial text in the array of partial_text.
  [MinVersion=9]
  array<HypothesisPartInResult>? hypothesis_part;
};

[Stable, Extensible]
enum EndpointReason {
  // Default value, unknown reason.
  ENDPOINT_UNKNOWN,
  // Due to end_of_speech detection by endpointer.
  ENDPOINT_END_OF_SPEECH,
  // Due to end_of_utterance detection by endpointer.
  ENDPOINT_END_OF_UTTERANCE,
  // Due to the end of mic audio. This could be due to a mic event or SODA
  // being stopped.
  ENDPOINT_END_OF_AUDIO,
};

[Stable]
struct FinalResult {
  // Sorted in decreasing order of probability.
  array<string> final_hypotheses;
  EndpointReason endpoint_reason;
  TimingInfo? timing_event;

  // If populated, this array contains the hypothesis parts for the
  // first final hypothesis in the array of final_hypotheses.
  [MinVersion=1]
  array<HypothesisPartInResult>? hypothesis_part;
};

// Frequent event from the recognizer, emitted for almost every frame. Gives an
// indication of speech likelihood and audio level.
[Stable]
struct AudioLevelEvent {
  // RMS audio level, from the PowerEvaluator. Score is in [0, 1).
  float rms;
  // Speech likelihood score, from the TerseProcessor. Score is in [0, 1).
  float audio_level;
};

[Stable, Extensible]
enum AsrSwitchResult {
  // No switch is attempted when:
  // 1. Multilang is disabled.
  // 2. top_language_confidence did not meet the sensitivity threshold.
  // 3. The top detected language is the same as the one currently being
  //    transcribed.
  // 4. The LangId results were jittery and this event was ignored for being
  //    too short.
  [Default] DEFAULT_NO_SWITCH = 0,
  // ASR successfully switched to this locale.
  SWITCH_SUCCEEDED = 1,
  // ASR attempted to switch to this locale, but could not load the provided
  // LP.
  SWITCH_FAILED = 2,
  // ASR did not attempt to switch because no LP was provided for the locale,
  // but top_language_confidence met the sensitivity threshold.
  SWITCH_SKIPPED_NO_LP = 3,
};

[Stable]
struct LangIdEvent {
  string language;
  // Confidence level, exactly as per the input protocol buffer value.
  int32 confidence_level;
  // Converted enum value, exactly as per the input protocol buffer value.
  AsrSwitchResult asr_switch_result;
};

// Result with corrected speaker labels.
[Stable]
struct LabelCorrectionEvent {
  // The hypothesis parts with corrected speaker labels. A hypothesis part is
  // only included if its speaker label differs from the original streaming
  // label or from any of the previously corrected labels. This is designed to
  // minimize the amount of data that needs to be emitted during correction by
  // skipping parts whose labels have never changed.
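  // For illustration: if the streaming results labeled four words as
  // "speaker 1" and a later correction relabels only the third word as
  // "speaker 2", this array carries just that one hypothesis part.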
  array<HypothesisPartInResult> hypothesis_parts;
};

// This essentially mirrors the subset of SODA's SodaEvent proto we will
// support.
[Stable]
union SpeechRecognizerEvent {
  AudioLevelEvent audio_event;
  PartialResult partial_result;
  EndpointerEvent endpointer_event;
  FinalResult final_result;

  [MinVersion=7]
  LangIdEvent langid_event;

  [MinVersion=9]
  LabelCorrectionEvent label_correction_event;
};

// This interface is called by the SodaRecognizer. It is implemented by the
// client; SODA calls these methods as 'events', with appropriate details, when
// recognition occurs.
// Next ordinal: 3
[Stable]
interface SodaClient {
  // After SODA successfully starts / warms up, in case the client cares.
  OnStart@0();

  // After SODA successfully stops, in case the client cares.
  OnStop@1();

  // This is how the client receives actual recognized text as well as other
  // conclusions from the SODA model like "speech ended".
  OnSpeechRecognizerEvent@2(SpeechRecognizerEvent event);
};
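
// For illustration, a hedged sketch of how a C++ SodaClient implementation
// might dispatch on the SpeechRecognizerEvent union using the generated
// accessors (is_<field>() / get_<field>()). MySodaClient, HandlePartial and
// HandleFinal are hypothetical names used only in this example.
//
//   void MySodaClient::OnSpeechRecognizerEvent(
//       chromeos::machine_learning::mojom::SpeechRecognizerEventPtr event) {
//     if (event->is_partial_result()) {
//       HandlePartial(event->get_partial_result()->partial_text);
//     } else if (event->is_final_result()) {
//       HandleFinal(event->get_final_result()->final_hypotheses);
//     }
//     // Other event types (audio_event, endpointer_event, langid_event,
//     // label_correction_event) elided for brevity.
//   }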

// The mojom interface for performing the recognition of spoken text.
// Next ordinal: 4
[Stable]
interface SodaRecognizer {
  // Add audio for speech recognition.
  AddAudio@0(array<uint8> audio);

  // Instruct SODA to stop processing immediately. Stopping is
  // confirmed when SodaClient::OnStop is called back.
  Stop@1();

  // Instruct SODA to start processing. No-op if already
  // processing. If currently stopped, causes a SodaClient::OnStart callback.
  Start@2();

  // Instruct SODA to stop processing after all queued audio is
  // processed. Will eventually result in a SodaClient::OnStop, but only
  // after all audio currently in the queue is decoded.
  MarkDone@3();
};
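
// For illustration, the typical flow of a recognition session implied by the
// method comments above (a sketch, not a normative sequence):
//
//   Client                            SODA
//   Start()                      -->
//                                <--  SodaClient::OnStart()
//   AddAudio(chunk), repeated    -->
//                                <--  OnSpeechRecognizerEvent(audio_event /
//                                       endpointer_event / partial_result ...)
//                                <--  OnSpeechRecognizerEvent(final_result)
//   MarkDone()                   -->
//                                <--  SodaClient::OnStop(), once all queued
//                                       audio has been decoded.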