// Copyright 2022 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
module media.mojom;
import "media/mojo/mojom/audio_data.mojom";
import "media/mojo/mojom/speech_recognition_audio_forwarder.mojom";
import "media/mojo/mojom/speech_recognizer.mojom";
import "mojo/public/mojom/base/time.mojom";
import "mojo/public/mojom/base/unguessable_token.mojom";
import "ui/gfx/geometry/mojom/geometry.mojom";
// Next MinVersion: 7
// How confident the speech recognition service is in a language identification
// result (see LanguageIdentificationEvent.confidence_level). Corresponds to the
// LangIdEvent.ConfidenceInterval defined in
// http://google3/speech/soda/public/soda_event.proto.
[Stable, Extensible]
enum ConfidenceLevel {
// Default value. Because the enum is [Extensible], values this version does
// not recognize are mapped to the [Default] enumerator on receipt.
[Default] kUnknown,
// The action Automatic Speech Recognition (ASR) took as a result of a language
// identification event (see LanguageIdentificationEvent.asr_switch_result).
// Corresponds to the LangIdEvent.AsrSwitchResult defined in
// http://google3/speech/soda/public/soda_event.proto.
[Stable, Extensible]
enum AsrSwitchResult {
// Default value. Because the enum is [Extensible], values this version does
// not recognize are mapped to the [Default] enumerator on receipt.
[Default] kDefaultNoSwitch,
// The main interface a client uses to interact with a speech
// recognition service process. For Live Caption, every renderer can own one
// or more Remote<SpeechRecognitionContext>, with the receiver bound through the
// BrowserInterfaceBroker. For the Web Speech API, the browser is the client and
// can own one or more Remote<SpeechRecognitionContext>. This is a stable
// interface that is used across the LaCrOS/Ash boundary.
interface SpeechRecognitionContext {
// Bind a recognizer to the speech recognition service.
//   |receiver|: the SpeechRecognitionRecognizer endpoint to bind.
//   |client|: the SpeechRecognitionRecognizerClient that is handed the
//             recognition events.
//   |options|: the SpeechRecognitionOptions for this recognizer.
// Replies with |is_multichannel_supported|, a flag indicating whether
// multichannel audio is supported by the speech recognition service.
BindRecognizer@0(pending_receiver<SpeechRecognitionRecognizer> receiver,
pending_remote<SpeechRecognitionRecognizerClient> client,
SpeechRecognitionOptions options)
=> (bool is_multichannel_supported);
// Bind the recognizer used by the Web Speech API.
// NOTE(review): the line that names this method and opens its parameter list
// is not visible in this view — confirm against the full file.
//   |session_receiver| / |session_client|: the SpeechRecognitionSession
//       endpoint to bind and its SpeechRecognitionSessionClient.
//   |audio_forwarder|: the SpeechRecognitionAudioForwarder endpoint to bind.
//   |channel_count| / |sample_rate|: presumably describe the audio sent
//       through |audio_forwarder| — TODO confirm.
//   |options|: the SpeechRecognitionOptions for this recognizer.
//   |continuous|: semantics are not documented here; presumably whether
//       recognition keeps running after a final result — TODO confirm.
pending_receiver<media.mojom.SpeechRecognitionSession> session_receiver,
pending_remote<media.mojom.SpeechRecognitionSessionClient> session_client,
pending_receiver<SpeechRecognitionAudioForwarder> audio_forwarder,
int32 channel_count,
int32 sample_rate,
SpeechRecognitionOptions options,
bool continuous);
// The interface used to pass raw audio from the renderer to the speech
// recognition service. The remote lives either in the renderer process (for web
// Live Caption) or the browser process (for CrOS features like system Live
// Caption and dictation) and the receiver lives in the speech recognition
// process.
interface SpeechRecognitionRecognizer {
// Send a buffer of audio (|buffer|, as AudioDataS16) to the speech recognition
// service. The speech recognition client will return the recognition events
// containing the transcribed audio back to the originating media.
// NOTE(review): this comment previously began "Initialize the speech
// recognition instance"; confirm whether the first call also performs
// initialization.
SendAudioToSpeechRecognitionService@0(AudioDataS16 buffer);
// Mark audio stream done. This informs the speech recognition client to stop
// speech recognition after it finishes processing the audio it has received
// already. This will eventually trigger the
// SpeechRecognitionRecognizerClient::OnSpeechRecognitionStopped callback.
// NOTE(review): the declaration this comment describes is not visible in this
// view, and no method with ordinal @1 is visible either — confirm against the
// full file.

// Notify the speech recognition recognizer that the language changed. Takes
// in the locale string (e.g. "en-US").
OnLanguageChanged@2(string language);
// Notify the speech recognition recognizer that the mask offensive words
// setting has changed. |mask_offensive_words| carries the new setting.
OnMaskOffensiveWordsChanged@3(bool mask_offensive_words);
// The interface used to return speech recognition events from the speech
// recognition service to the client that will display the results to the user.
// The remote lives in the speech recognition process and the receiver lives in
// the browser process.
interface SpeechRecognitionRecognizerClient {
// Triggered by speech recognition process on a speech recognition event;
// |result| carries the event's SpeechRecognitionResult.
// Replies with |continue_recognition|, which is false if the client wants to
// halt speech recognition e.g. in response to user input or in the case of an
// error.
OnSpeechRecognitionRecognitionEvent@0(SpeechRecognitionResult result)
=> (bool continue_recognition);
// Called when speech recognition stops.
// NOTE(review): the declaration this comment describes is not visible in this
// view — confirm against the full file.

// Triggered by an error within the speech recognition service.
// NOTE(review): the declaration this comment describes is not visible in this
// view (ordinals @1 and @2 do not appear) — confirm against the full file.

// Triggered by speech recognition process on a language identification event.
OnLanguageIdentificationEvent@3(LanguageIdentificationEvent event);
// A hypothesis part, which provides timing information for one word (or one
// character, depending on the language) in recognized speech.
struct HypothesisParts {
// A section of the final transcription text. Either an entire word or single
// character (depending on the language) with adjacent punctuation. There will
// usually only be one value here. If formatting is enabled in the speech
// recognition, then the raw text will be included as the second element.
array<string> text;
// Time offset from the |audio_start_time| of the enclosing TimingInformation.
// We enforce the following invariant:
// 0 <= hypothesis_part_offset < |audio_end_time - audio_start_time|.
mojo_base.mojom.TimeDelta hypothesis_part_offset;
// The timing information for the transcript.
struct TimingInformation {
// Start time in audio time from the start of the SODA session.
// This time measures the amount of audio input into SODA.
mojo_base.mojom.TimeDelta audio_start_time;
// Elapsed processed audio from first frame after preamble.
mojo_base.mojom.TimeDelta audio_end_time;
// The timing information for each word/letter in the transcription.
// HypothesisPartsInResult was introduced in min version 1 in
// chromeos/services/machine_learning/public/mojom/soda.mojom. Therefore, it
// must be optional. |hypothesis_parts| may be a non-null optional containing
// a zero-length array if no words were spoken during the event's time span.
array<HypothesisParts>? hypothesis_parts;
// A speech recognition result created by the speech service and passed to the
// browser.
struct SpeechRecognitionResult {
// The transcribed text carried by this result.
string transcription;
// A flag indicating whether the result is final. If true, the result is
// locked in and the next result returned will not overlap with the previous
// final result.
bool is_final;
// Timing information for the current transcription. The field is optional;
// |timing_information| is expected to be valid if:
//   1. speech recognition is provided by |CrosSodaClient| and
//   2. |is_final| is true.
TimingInformation? timing_information;
// A language identification event created by the speech recognition service
// and passed to the browser and renderer.
struct LanguageIdentificationEvent {
// The locale of the language with the highest confidence.
string language;
// The confidence level of the identification (see ConfidenceLevel).
ConfidenceLevel confidence_level;
// If multilang is enabled, describes the actions Automatic Speech Recognition
// took as a result of this event. Optional.
AsrSwitchResult? asr_switch_result;
// The interface used to notify the speech recognition client of events
// triggered by the browser. The remote lives in the browser process and the
// receiver lives either in the renderer process (for web Live Caption) or
// the browser process (for CrOS system Live Caption).
interface SpeechRecognitionBrowserObserver {
// Notify the speech recognition client when speech recognition availability
// changes. |is_speech_recognition_available| carries the new availability.
// NOTE(review): the line that names this method and opens its parameter list
// is not visible in this view — confirm against the full file.
bool is_speech_recognition_available);
// Notify the speech recognition client when the speech recognition language
// changes. |language| carries the new language.
SpeechRecognitionLanguageChanged@1(string language);
// Notify the speech recognition client when the mask offensive words pref
// changes. |mask_offensive_words| carries the new pref value.
SpeechRecognitionMaskOffensiveWordsChanged@2(bool mask_offensive_words);
// The user-facing source of recognized speech; typically a tab. The remote
// lives in the Ash browser process and is used to trigger behavior in lacros
// (like focusing the tab). The receiver lives in the lacros browser process.
interface SpeechRecognitionSurface {
// "Activate" the surface - i.e. bring it to the front and focus it.
// NOTE(review): the declaration this comment describes is not visible in this
// view (no method with ordinal @0 appears) — confirm against the full file.

// Fetch the bounds of the surface in screen coordinates. Replies with
// |bounds|, which is null (nullopt) if no bounds could be fetched.
GetBounds@1() => (gfx.mojom.Rect? bounds);
// The OS-side observer of a lacros-side speech surface. Used to close or
// re-render a live caption bubble based on user interaction with the
// lacros-side surface. The remote lives in the lacros browser process, and the
// receiver lives in the Ash browser process.
interface SpeechRecognitionSurfaceClient {
// Called when the user navigates away or refreshes the current tab. This
// comprises the end of a live caption "session", after which the caption
// bubble can be shown even if it was explicitly dismissed by the user.
// NOTE(review): the declaration this comment describes is not visible in this
// view — confirm against the full file.

// Called when the user fullscreens or un-fullscreens the speech surface.
// NOTE(review): the declaration this comment describes is not visible in this
// view — confirm against the full file.
// Static metadata about a remote speech surface. Used by the speech service
// client in Ash.
struct SpeechRecognitionSurfaceMetadata {
// A unique identifier for the "session" (i.e. tab) of the surface. It is used
// to hide the caption bubble for all streams in a tab once the bubble has been
// closed.
mojo_base.mojom.UnguessableToken session_id;
// The interface between the speech recognition client and the browser.
// The remote lives in the renderer process and the receiver lives in the
// browser process. Not necessary for browser-side features (e.g. CrOS system
// Live Caption), which can access browser functionality directly.
interface SpeechRecognitionClientBrowserInterface {
// Bind the speech recognition availability observer. |observer| is the
// SpeechRecognitionBrowserObserver remote to bind.
// NOTE(review): the line that names this method and opens its parameter list
// is not visible in this view — confirm against the full file.
pending_remote<SpeechRecognitionBrowserObserver> observer);
// Requests that a remote speech recognition client be instantiated and bound
// in the Ash browser process. The instantiated client should use the surface
// and surface client bindings to perform tasks (such as refocusing) that
// require coordination with the current lacros tab.
//   |client|: receiver for the SpeechRecognitionRecognizerClient to bind.
//   |surface_client|: receiver for the SpeechRecognitionSurfaceClient.
//   |surface|: remote to the SpeechRecognitionSurface.
//   |metadata|: static metadata about the surface.
// NOTE(review): the line that names this method and opens its parameter list
// is not visible in this view — confirm against the full file.
pending_receiver<SpeechRecognitionRecognizerClient> client,
pending_receiver<SpeechRecognitionSurfaceClient> surface_client,
pending_remote<SpeechRecognitionSurface> surface,
SpeechRecognitionSurfaceMetadata metadata);
// The kind of recognition to perform (see
// SpeechRecognitionOptions.recognition_mode). Corresponds to
// ExtendedSodaConfigMsg.RecognitionMode in
// chrome/services/speech/soda/proto/soda_api.proto and
// SodaRecognitionMode in
// chromeos/services/machine_learning/public/mojom/soda.mojom.
[Stable, Extensible]
enum SpeechRecognitionMode {
// Default value. Because the enum is [Extensible], values this version does
// not recognize are mapped to the [Default] enumerator on receipt.
[Default] kUnknown,
// Intended for voice input for keyboard usage.
// Intended to caption a stream of audio.
// NOTE(review): the enumerators the two comments above describe are not
// visible in this view — confirm against the full file.
// Which Chrome/ChromeOS application is triggering the speech recognition
// session to start (see SpeechRecognitionOptions.recognizer_client_type).
[Stable, Extensible]
enum RecognizerClientType {
// Default value. Because the enum is [Extensible], values this version does
// not recognize are mapped to the [Default] enumerator on receipt.
[Default] kUnknown,
// Dictation on ChromeOS.
// LiveCaption on Chrome/ChromeOS.
// Projector on ChromeOS.
// CastModerator on ChromeOS.
// NOTE(review): the enumerators the four comments above describe are not
// visible in this view — confirm against the full file.
// Options for speech recognition. Passed when binding a recognizer through
// SpeechRecognitionContext.
struct SpeechRecognitionOptions {
// What kind of recognition to use.
// In the case of web fallback (not for launch, used for development only),
// this option will be ignored.
SpeechRecognitionMode recognition_mode;
// Whether to enable formatting and punctuation in the recognition results.
bool enable_formatting;
// The BCP-47 localized language code to use (e.g. "en-US"). Optional.
// TODO(crbug.com/40162502): Language needs to be required when multiple
// languages are supported by SODA, so that each SpeechRecognitionRecognizer
// can use its own language. Right now Language is only used by Projector
// and Dictation via OnDeviceSpeechRecognizer in Chrome OS.
string? language;
// Whether the recognition is happening on-device or remotely on a server.
// NOTE(review): presumably true means server-based — confirm.
bool is_server_based;
// Which client is requesting the speech recognition session.
RecognizerClientType recognizer_client_type;
// When true, if the incoming audio buffer is zero for an extended period
// (e.g. 10 seconds), audio won't be fed to the captioning model until nonzero
// audio is received.
// When false (the default), even empty audio is captioned indefinitely.
// Set to false if accurate TimingInformation relative to the start of
// captioning is needed.
bool skip_continuously_empty_audio = false;
// The optional experiment recognizer routing key for current request.
string? experiment_recognizer_routing_key;
// The channel count of the forwarded audio.
int32 channel_count;
// The sample rate of the forwarded audio.
// NOTE(review): units are not stated here; presumably Hz — confirm.
int32 sample_rate;