chromium/media/mojo/mojom/speech_recognition.mojom

// Copyright 2022 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

module media.mojom;

import "media/mojo/mojom/audio_data.mojom";
import "media/mojo/mojom/speech_recognition_audio_forwarder.mojom";
import "media/mojo/mojom/speech_recognizer.mojom";
import "mojo/public/mojom/base/time.mojom";
import "mojo/public/mojom/base/unguessable_token.mojom";
import "ui/gfx/geometry/mojom/geometry.mojom";

// Next MinVersion: 7

// Corresponds to the LangIdEvent.ConfidenceInterval defined in
// http://google3/speech/soda/public/soda_event.proto.
[Stable, Extensible]
enum ConfidenceLevel {
  [Default] kUnknown,
  kNotConfident,
  kConfident,
  kHighlyConfident,
};

// Corresponds to the LangIdEvent.AsrSwitchResult defined in
// http://google3/speech/soda/public/soda_event.proto.
[Stable, Extensible]
enum AsrSwitchResult {
  [Default] kDefaultNoSwitch,
  kSwitchSucceeded,
  kSwitchFailed,
  kSwitchSkipedNoLp,
};

// The main interface a client uses to interact with a speech
// recognition service process. For Live Caption, every renderer can own one
// or more Remote<SpeechRecognitionContext>, with the receiver bound through the
// BrowserInterfaceBroker. For the Web Speech API, the browser is the client and
// can own one or more Remote<SpeechRecognitionContext>. This is a stable
// interface that is used across the LaCrOS/Ash boundary.
[Stable]
interface SpeechRecognitionContext {
  // Bind a recognizer to the speech recognition service. Returns a flag
  // indicating whether multichannel audio is supported by the speech
  // recognition service.
  BindRecognizer@0(pending_receiver<SpeechRecognitionRecognizer> receiver,
                   pending_remote<SpeechRecognitionRecognizerClient> client,
                   SpeechRecognitionOptions options)
      => (bool is_multichannel_supported);

  // Bind the recognizer used by the Web Speech API.
  [MinVersion=6]
  BindWebSpeechRecognizer@1(
      pending_receiver<media.mojom.SpeechRecognitionSession> session_receiver,
      pending_remote<media.mojom.SpeechRecognitionSessionClient> session_client,
      pending_receiver<SpeechRecognitionAudioForwarder> audio_forwarder,
      int32 channel_count,
      int32 sample_rate,
      SpeechRecognitionOptions options,
      bool continuous);
};
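
// Illustrative sketch (not part of this mojom): a client might bind a
// recognizer through this interface roughly as follows, assuming the standard
// generated C++ bindings. How |context| itself gets bound (e.g. through the
// BrowserInterfaceBroker) is omitted here, and the variable names are
// hypothetical.
//
//   mojo::Remote<media::mojom::SpeechRecognitionContext> context;
//   // ... bind |context| to the speech recognition service process ...
//   mojo::Remote<media::mojom::SpeechRecognitionRecognizer> recognizer;
//   mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient>
//       client_remote;
//   // ... bind |client_remote| to a SpeechRecognitionRecognizerClient ...
//   context->BindRecognizer(
//       recognizer.BindNewPipeAndPassReceiver(), std::move(client_remote),
//       media::mojom::SpeechRecognitionOptions::New(),
//       base::BindOnce([](bool is_multichannel_supported) {
//         // React to whether the service supports multichannel audio.
//       }));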

// The interface used to pass raw audio from the renderer to the speech
// recognition service. The remote lives either in the renderer process (for web
// Live Caption) or the browser process (for CrOS features like system Live
// Caption and dictation) and the receiver lives in the speech recognition
// process.
[Stable]
interface SpeechRecognitionRecognizer {
  // Send an audio buffer to the speech recognition service. Recognition
  // events containing the transcription of the audio are returned to the
  // client via SpeechRecognitionRecognizerClient.
  SendAudioToSpeechRecognitionService@0(AudioDataS16 buffer);

  // Mark the audio stream as done. This informs the speech recognition client
  // to stop speech recognition after it finishes processing the audio it has
  // already received. This will eventually trigger the
  // SpeechRecognitionRecognizerClient::OnSpeechRecognitionStopped callback.
  MarkDone@1();

  // Notify the speech recognition recognizer that the language changed. Takes
  // in the locale string (e.g. "en-US").
  OnLanguageChanged@2(string language);

  // Notify the speech recognition recognizer that the mask offensive words
  // setting has changed.
  [MinVersion=2]
  OnMaskOffensiveWordsChanged@3(bool mask_offensive_words);
};
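
// Illustrative sketch (not part of this mojom), assuming the standard
// generated C++ bindings: audio is pushed buffer by buffer and the stream is
// closed with MarkDone(). |recognizer| is a bound
// Remote<SpeechRecognitionRecognizer>; how the buffer is filled is elided.
//
//   auto buffer = media::mojom::AudioDataS16::New();
//   // ... populate |buffer| with int16 PCM samples, channel count and
//   // sample rate (see audio_data.mojom) ...
//   recognizer->SendAudioToSpeechRecognitionService(std::move(buffer));
//   // ... repeat for subsequent buffers, then:
//   recognizer->MarkDone();  // Eventually triggers OnSpeechRecognitionStopped.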

// The interface used to return speech recognition events from the speech
// recognition service to the client that will display the results to the user.
// The remote lives in the speech recognition process and the receiver lives in
// the browser process.
[Stable]
interface SpeechRecognitionRecognizerClient {
  // Triggered by the speech recognition process on a speech recognition
  // event.
  //
  // Returns false if the client wants to halt speech recognition, e.g. in
  // response to user input or in the case of an error.
  OnSpeechRecognitionRecognitionEvent@0(SpeechRecognitionResult result)
      => (bool continue_recognition);

  // Called when speech recognition stops.
  OnSpeechRecognitionStopped@1();

  // Triggered by an error within the speech recognition service.
  OnSpeechRecognitionError@2();

  // Triggered by the speech recognition process on a language identification
  // event.
  OnLanguageIdentificationEvent@3(LanguageIdentificationEvent event);
};
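
// Illustrative sketch (not part of this mojom): a minimal client
// implementation under the standard generated C++ bindings. The class name is
// hypothetical.
//
//   class CaptionClient
//       : public media::mojom::SpeechRecognitionRecognizerClient {
//    public:
//     void OnSpeechRecognitionRecognitionEvent(
//         media::mojom::SpeechRecognitionResultPtr result,
//         OnSpeechRecognitionRecognitionEventCallback reply) override {
//       // Display |result->transcription|, then reply true to keep receiving
//       // events or false to halt recognition.
//       std::move(reply).Run(/*continue_recognition=*/true);
//     }
//     void OnSpeechRecognitionStopped() override {}
//     void OnSpeechRecognitionError() override {}
//     void OnLanguageIdentificationEvent(
//         media::mojom::LanguageIdentificationEventPtr event) override {}
//   };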

// A hypothesis part that provides timing information for a single word (or
// character) in the recognized speech.
[Stable]
struct HypothesisParts {
  // A section of the final transcription text. Either an entire word or a
  // single character (depending on the language) with adjacent punctuation.
  // There will usually only be one value here. If formatting is enabled in the
  // speech recognition, then the raw text will be included as the second
  // element.
  array<string> text;

  // Time offset from this event's |audio_start_time| defined below. We enforce
  // the following invariant: 0 <= hypothesis_part_offset < |audio_end_time -
  // audio_start_time|.
  mojo_base.mojom.TimeDelta hypothesis_part_offset;
};

// The timing information for the transcript.
[Stable]
struct TimingInformation {
  // Start time in audio time from the start of the SODA session.
  // This time measures the amount of audio input into SODA.
  mojo_base.mojom.TimeDelta audio_start_time;

  // Elapsed processed audio from first frame after preamble.
  mojo_base.mojom.TimeDelta audio_end_time;

  // The timing information for each word/letter in the transcription.
  // HypothesisPartsInResult was introduced in min version 1 in
  // chromeos/services/machine_learning/public/mojom/soda.mojom. Therefore, it
  // must be optional. Hypothesis parts may be a non-null optional containing a
  // zero-length vector if no words were spoken during the event's time span.
  array<HypothesisParts>? hypothesis_parts;
};
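
// Illustrative sketch (not part of this mojom): given the invariant on
// |hypothesis_part_offset| above, the absolute audio time of each word can be
// recovered as follows (function and variable names are hypothetical).
//
//   void LogWordTimings(const media::mojom::TimingInformation& timing) {
//     if (!timing.hypothesis_parts)
//       return;  // Per-word timing is optional and may be absent.
//     for (const auto& part : *timing.hypothesis_parts) {
//       base::TimeDelta word_time =
//           timing.audio_start_time + part->hypothesis_part_offset;
//       DVLOG(1) << (part->text.empty() ? "" : part->text[0]) << " at "
//                << word_time;
//     }
//   }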

// A speech recognition result created by the speech service and passed to the
// browser.
[Stable]
struct SpeechRecognitionResult {
  string transcription;

  // A flag indicating whether the result is final. If true, the result is
  // locked in and the next result returned will not overlap with the previous
  // final result.
  bool is_final;

  // Timing information for the current transcription. |timing_information| is
  // expected to be valid if:
  //   1. speech recognition is provided by |CrosSodaClient| and
  //   2. |is_final| is true.
  TimingInformation? timing_information;
};
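
// Illustrative sketch (not part of this mojom): a caption UI typically
// overwrites its pending text with each partial result and commits the text
// once |is_final| is true. Member names are hypothetical.
//
//   void OnResult(const media::mojom::SpeechRecognitionResult& result) {
//     if (result.is_final) {
//       committed_text_ += result.transcription;
//       pending_text_.clear();
//     } else {
//       pending_text_ = result.transcription;
//     }
//   }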

// A language identification event created by the speech recognition service
// and passed to the browser and renderer.
[Stable]
struct LanguageIdentificationEvent {
  // The locale of the language with the highest confidence.
  string language;

  // The confidence level of the identified language.
  ConfidenceLevel confidence_level;

  // If multilang is enabled, describes the actions Automatic Speech Recognition
  // took as a result of this event.
  [MinVersion=1]
  AsrSwitchResult? asr_switch_result;
};

// The interface used to notify the speech recognition client of events
// triggered by the browser. The remote lives in the browser process and the
// receiver lives either in the renderer process (for web Live Caption) or
// the browser process (for CrOS system Live Caption).
[Stable]
interface SpeechRecognitionBrowserObserver {
  // Notify the speech recognition client when speech recognition availability
  // changes.
  SpeechRecognitionAvailabilityChanged@0(
      bool is_speech_recognition_available);

  // Notify the speech recognition client when the speech recognition language
  // changes.
  SpeechRecognitionLanguageChanged@1(string language);

  // Notify the speech recognition client when the mask offensive words pref
  // changes.
  [MinVersion=2]
  SpeechRecognitionMaskOffensiveWordsChanged@2(bool mask_offensive_words);
};

// The user-facing source of recognized speech; typically a tab. The remote
// lives in the Ash browser process and is used to trigger behavior in lacros
// (like focusing the tab). The receiver lives in the lacros browser process.
[Stable]
interface SpeechRecognitionSurface {
  // "Activate" the surface - i.e. bring it to the front and focus it.
  Activate@0();

  // Fetch the bounds of the surface in screen coordinates. A nullopt is
  // returned if no bounds could be fetched.
  GetBounds@1() => (gfx.mojom.Rect? bounds);
};

// The OS-side observer of a lacros-side speech surface. Used to close or
// re-render a live caption bubble based on user interaction with the
// lacros-side surface. The remote lives in the lacros browser process, and the
// receiver lives in the Ash browser process.
[Stable]
interface SpeechRecognitionSurfaceClient {
  // Called when the user navigates away or refreshes the current tab. This
  // marks the end of a live caption "session", after which the caption bubble
  // can be shown even if it was explicitly dismissed by the user.
  OnSessionEnded@0();

  // Called when the user fullscreens or un-fullscreens the speech surface.
  OnFullscreenToggled@1();
};

// Static metadata about a remote speech surface. Used by the speech service
// client in Ash.
[Stable]
struct SpeechRecognitionSurfaceMetadata {
  // A unique identifier for the "session" (i.e. tab) of the surface. Used to
  // hide the caption bubble for all streams in a tab if the bubble is closed
  // once.
  mojo_base.mojom.UnguessableToken session_id;
};

// The interface between the speech recognition client and the browser.
// The remote lives in the renderer process and the receiver lives in the
// browser process. Not necessary for browser-side features (e.g. CrOS system
// Live Caption), which can access browser functionality directly.
[Stable]
interface SpeechRecognitionClientBrowserInterface {
  // Bind the speech recognition availability observer.
  BindSpeechRecognitionBrowserObserver@0(
      pending_remote<SpeechRecognitionBrowserObserver> observer);

  // Requests that a remote speech recognition client be instantiated and bound
  // in the Ash browser process. The instantiated client should use the surface
  // and surface client bindings to perform tasks (such as refocusing) that
  // require coordination with the current lacros tab.
  [MinVersion=1]
  BindRecognizerToRemoteClient@1(
      pending_receiver<SpeechRecognitionRecognizerClient> client,
      pending_receiver<SpeechRecognitionSurfaceClient> surface_client,
      pending_remote<SpeechRecognitionSurface> surface,
      SpeechRecognitionSurfaceMetadata metadata);
};

// Corresponds to ExtendedSodaConfigMsg.RecognitionMode in
// chrome/services/speech/soda/proto/soda_api.proto and
// SodaRecognitionMode in
// chromeos/services/machine_learning/public/mojom/soda.mojom.
[Stable, Extensible]
enum SpeechRecognitionMode {
  [Default] kUnknown,
  // Intended for voice input for keyboard usage.
  kIme,
  // Intended to caption a stream of audio.
  kCaption,
};

// Which Chrome/ChromeOS application is triggering the speech recognition
// session to start.
[Stable, Extensible]
enum RecognizerClientType {
  [Default] kUnknown,
  // Dictation on ChromeOS.
  kDictation,
  // LiveCaption on Chrome/ChromeOS.
  kLiveCaption,
  // Projector on ChromeOS.
  kProjector,
  // CastModerator on ChromeOS.
  kCastModerator,
};

// Options for speech recognition.
[Stable]
struct SpeechRecognitionOptions {
  // What kind of recognition to use.
  // In the case of web fallback (not for launch, used for development only),
  // this option will be ignored.
  SpeechRecognitionMode recognition_mode;

  // Whether to enable formatting and punctuation in the recognition results.
  bool enable_formatting;

  // The BCP-47 localized language code to use (e.g. "en-US").
  // TODO(crbug.com/40162502): Language needs to be required when multiple
  // languages are supported by SODA, so that each SpeechRecognitionRecognizer
  // can use its own language. Right now Language is only used by Projector
  // and Dictation via OnDeviceSpeechRecognizer in Chrome OS.
  string? language;

  // Whether the recognition is happening on-device or remotely on a server.
  [MinVersion=1]
  bool is_server_based;

  // Which client is requesting the speech recognition session.
  [MinVersion=1]
  RecognizerClientType recognizer_client_type;

  // When true, if the incoming audio buffer is zero for an extended period
  // (e.g. 10 seconds), audio won't be fed to the captioning model until nonzero
  // audio is received.
  // When false, even empty audio is captioned indefinitely.
  // Set to false if accurate TimingInfo relative to the start of captioning is
  // needed.
  [MinVersion=4]
  bool skip_continuously_empty_audio = false;

  // The optional experiment recognizer routing key for the current request.
  [MinVersion=5]
  string? experiment_recognizer_routing_key;

  // The channel count of the forwarded audio.
  [MinVersion=6]
  int32 channel_count;

  // The sample rate of the forwarded audio.
  [MinVersion=6]
  int32 sample_rate;
};
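
// Illustrative sketch (not part of this mojom): constructing options for an
// on-device Live Caption session under the standard generated C++ bindings.
// The chosen values are an example, not a recommendation.
//
//   auto options = media::mojom::SpeechRecognitionOptions::New();
//   options->recognition_mode =
//       media::mojom::SpeechRecognitionMode::kCaption;
//   options->enable_formatting = true;
//   options->language = "en-US";
//   options->is_server_based = false;
//   options->recognizer_client_type =
//       media::mojom::RecognizerClientType::kLiveCaption;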