// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Datatypes and interfaces of the speech recognition API.
// NOTE: This mojom exists in two places and must be kept in sync:
// Chromium: //chromeos/services/machine_learning/public/mojom/
// Chrome OS: src/platform2/ml/mojom/
// Note: Other repos downstream of Chromium might also use this mojom.
// Example: A backwards-compatible mojom change (and corresponding
// implementation change) can be made in Chrome OS first, then replicated to the
// clients (Chromium, other downstream repos) later.
// Use //chromeos/services/machine_learning/public/mojom/roll_mojoms.sh to help
// replicate Chrome OS-side changes over to Chromium.
// Versions list:
// Version 0: Initial
// Version 1: Include HypothesisPart Info in Final result.
// Version 2: Include enable formatting in request config.
// Version 3: Include recognition mode in request config.
// Version 4: Include mask offensive words mode in request config.
// Version 5: Include speaker change detection in request config.
// Version 6: Include logging inclusion in request config.
// Version 7: Include multilang fields in request config, and langid responses.
// Version 8: Include leading_space field in HypothesisPart.
// Version 9: Include speaker diarization in request config and HypothesisPart,
// and add LabelCorrectionEvent.
module chromeos.machine_learning.mojom;
import "mojo/public/mojom/base/time.mojom";
// Augments a bool to include an 'unknown' value.
[Stable, Extensible]
enum OptionalBool {
[Default] kUnknown = 0,
kFalse,
kTrue,
};
[Stable, Extensible]
enum SodaRecognitionMode {
[Default] kCaption,
kIme,
};
[Stable]
struct SodaMultilangConfig {
// Rewind the audio buffer (to the end of the previous final result) if SODA
// decides to switch the recognizer when a new language is detected. Language
// detection lags behind the moment the spoken language actually changes; the
// audio buffer rewind tries to cover this gap.
bool rewind_when_switching_language = true;
// Locale-to-language-pack map for additional languages.
// Key is a lowercased locale, e.g. "en-us".
// Value is the directory of the language pack DLC to use.
map<string, string> locale_to_language_pack_map;
};
[Stable, Extensible]
enum SpeakerDiarizationMode {
// Not specified.
kDiarizationUnspecified,
// Do not use speaker diarization. The default.
[Default] kSpeakerDiarizationModeOffDefault,
// Use speaker turn detection.
kSpeakerChangeDetection,
// Use full speaker diarization.
kSpeakerLabelDetection,
};
// The configuration used to load the SODA recognizer.
[Stable]
struct SodaConfig {
// Number of channels of the audio that will be sent to the SODA recognizer.
uint32 channel_count;
// Sample rate of the audio that will be sent to the SODA recognizer.
uint32 sample_rate;
// The API key for the SODA library.
string api_key;
// Path to already-installed SODA library.
string library_dlc_path;
// Path to already-installed SODA language pack to use.
string language_dlc_path;
// Whether to enable automated punctuation. Defaults to true as this
// is the default in the underlying protocol buffer.
[MinVersion=2]
OptionalBool enable_formatting = kTrue;
// Which mode to execute in: IME or CAPTION. Default CAPTION.
[MinVersion=3]
SodaRecognitionMode recognition_mode = kCaption;
// Whether to mask offensive words in this captioning.
[MinVersion=4]
bool mask_offensive_words = false;
// Whether to detect speaker changes in the transcript from the model.
[MinVersion=5]
bool speaker_change_detection = false;
// Print more logging output in ML Service.
[MinVersion=6]
bool include_logging_output = false;
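// Configuration for multilang speech recognition; see SodaMultilangConfig
// above.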
[MinVersion=7]
SodaMultilangConfig? multi_lang_config;
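// Speaker diarization mode to use; see SpeakerDiarizationMode above.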
[MinVersion=9]
SpeakerDiarizationMode speaker_diarization_mode =
kSpeakerDiarizationModeOffDefault;
// When diarization is set to kSpeakerLabelDetection, the maximum number of
// speaker labels to assign.
[MinVersion=9]
uint32 max_speaker_count = 2;
};
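// Illustrative sketch (not part of this API) of how a client might populate
// SodaConfig through the standard generated C++ struct bindings. The ::New()
// helper comes from the mojo bindings; the key and paths below are
// placeholders, and namespace qualifiers are elided after the first line for
// brevity.
//
//   auto config = chromeos::machine_learning::mojom::SodaConfig::New();
//   config->channel_count = 1;
//   config->sample_rate = 16000;
//   config->api_key = "<api key>";                           // placeholder
//   config->library_dlc_path = "<SODA library DLC path>";    // placeholder
//   config->language_dlc_path = "<language pack DLC path>";  // placeholder
//   config->recognition_mode = SodaRecognitionMode::kCaption;
//   config->enable_formatting = OptionalBool::kTrue;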
// From the endpointer: what kind of endpointer event to record.
[Stable, Extensible]
enum EndpointerType {
// Speech detected.
START_OF_SPEECH,
// End of speech detected, but audio continues.
END_OF_SPEECH,
// Audio is terminated.
END_OF_AUDIO,
// Query is terminated.
END_OF_UTTERANCE,
};
// Common information about the timing of reported SODA events.
[Stable]
struct TimingInfo {
// Epoch time of the first audio buffer of the main query that is fed into
// ASR. This is the wall time read from the system clock when the first audio
// buffer is received by the terse processor.
mojo_base.mojom.Time audio_start_epoch;
// Start time in audio time from the start of the SODA session.
// This time measures the amount of audio input into SODA.
mojo_base.mojom.TimeDelta audio_start_time;
// Elapsed wall time since the first frame.
mojo_base.mojom.TimeDelta elapsed_wall_time;
// Elapsed processed audio time from the first frame after the preamble.
mojo_base.mojom.TimeDelta event_end_time;
// On device benchmark latency as defined in go/asr-latency-metrics.
mojo_base.mojom.TimeDelta latency;
// On device counter part of E2E normalized latency as defined in
// go/asr-latency-metrics. This metric is mainly for non-continuous
// conversation.
float normalized_latency;
// Timing for each word as an offset from audio_start_time.
array<mojo_base.mojom.TimeDelta> word_alignments;
};
// Start/end events.
[Stable]
struct EndpointerEvent {
EndpointerType endpointer_type;
TimingInfo? timing_event;
};
// Detail about a part of a hypothesis in a result. Only meaningful in the
// context of an array of these parts for a hypothesis.
[Stable]
struct HypothesisPartInResult {
// Typically 1 item for a word/piece of text. If formatting is
// enabled, the raw text is the second item.
array<string> text;
// Offset of the beginning of this part of the hypothesis from
// audio_start_time in TimingInfo.
mojo_base.mojom.TimeDelta alignment;
// If there is a space before this part in the hypothesis. Only populated
// when formatting is enabled.
[MinVersion=8]
bool? leading_space;
// If this word was uttered by a different speaker than the previous word.
// Populated for kSpeakerLabelDetection and kSpeakerChangeDetection modes.
[MinVersion=9]
bool speaker_change = false;
// The label of the speaker who uttered this word. This will only be populated
// if in kSpeakerLabelDetection mode.
[MinVersion=9]
string? speaker_label;
};
// A result _during_ a recognition. Could change at any time with the
// next partial or the final recognition for this chunk.
[Stable]
struct PartialResult {
// Most likely hypothesis so far. The first is the most likely, followed by
// the others.
// Note: the relationship between the first and the other hypotheses is not
// guaranteed in any way.
array<string> partial_text;
TimingInfo? timing_event;
// If populated, this array contains the hypothesis parts for the
// first partial text in the array of partial_text.
[MinVersion=9]
array<HypothesisPartInResult>? hypothesis_part;
};
[Stable, Extensible]
enum EndpointReason {
// Default value, unknown reason.
ENDPOINT_UNKNOWN,
// Due to end_of_speech detection by endpointer.
ENDPOINT_END_OF_SPEECH,
// Due to end_of_utterance detection by endpointer.
ENDPOINT_END_OF_UTTERANCE,
// Due to the end of mic audio. This could be due to a mic event or SODA
// being stopped.
ENDPOINT_END_OF_AUDIO,
};
[Stable]
struct FinalResult {
// Sorted in decreasing order of probability.
array<string> final_hypotheses;
EndpointReason endpoint_reason;
TimingInfo? timing_event;
// If populated, this array contains the hypothesis parts for the
// first final hypothesis in the array of final_hypotheses.
[MinVersion=1]
array<HypothesisPartInResult>? hypothesis_part;
};
// Frequent event from the recognizer, emitted for almost every frame. Gives an
// indication of speechiness and audio level.
[Stable]
struct AudioLevelEvent {
// RMS audio level, from the PowerEvaluator. Score is in [0, 1).
float rms;
// Speech likelihood score, from the TerseProcessor. Score is in [0, 1).
float audio_level;
};
[Stable, Extensible]
enum AsrSwitchResult {
// No switch is attempted when:
// 1. Multilang is disabled.
// 2. top_language_confidence did not meet the sensitivity threshold.
// 3. The top detected language is the same as the one currently being
// transcribed.
// 4. The LangId results were jittery and this event was ignored for being
// too short.
[Default] DEFAULT_NO_SWITCH = 0,
// ASR successfully switched to this locale.
SWITCH_SUCCEEDED = 1,
// ASR attempted to switch to this locale, but could not load the provided
// LP.
SWITCH_FAILED = 2,
// ASR did not attempt to switch because no LP was provided for the locale,
// but top_language_confidence met the sensitivity threshold.
SWITCH_SKIPPED_NO_LP = 3,
};
[Stable]
struct LangIdEvent {
string language;
// Confidence level, exactly as per the input protocol buffer value.
int32 confidence_level;
// Converted enum value, exactly as per the input protocol buffer value.
AsrSwitchResult asr_switch_result;
};
// Result with corrected speaker labels.
[Stable]
struct LabelCorrectionEvent {
// The hypothesis parts with corrected speaker labels. A hypothesis part is
// only included if its speaker label differs from the original streaming
// label or from any of the previously corrected labels. This is designed to
// minimize the amount of data emitted during correction by skipping parts
// whose labels have never changed.
array<HypothesisPartInResult> hypothesis_parts;
};
// This essentially mirrors the subset of SODA's SodaEvent proto we will
// support.
[Stable]
union SpeechRecognizerEvent {
AudioLevelEvent audio_event;
PartialResult partial_result;
EndpointerEvent endpointer_event;
FinalResult final_result;
[MinVersion=7]
LangIdEvent langid_event;
[MinVersion=9]
LabelCorrectionEvent label_correction_event;
};
// This interface is implemented by the client and called by the
// SodaRecognizer: SODA invokes these methods as 'events' with appropriate
// details when recognition occurs.
// Next ordinal: 3
[Stable]
interface SodaClient {
// After SODA successfully starts / warms up, in case the client cares.
OnStart@0();
// After SODA successfully stops, in case the client cares.
OnStop@1();
// This is how the client receives actual recognized text as well as other
// conclusions from the SODA model like "speech ended".
OnSpeechRecognizerEvent@2(SpeechRecognizerEvent event);
};
// The mojom interface for performing the recognition of spoken text.
// Next ordinal: 4
[Stable]
interface SodaRecognizer {
// Add audio for speech recognition.
AddAudio@0(array<uint8> audio);
// Instruct SODA to stop processing immediately. Stopping is
// confirmed when SodaClient::OnStop is called back.
Stop@1();
// Instruct SODA to start processing. No-op if already
// processing. When stopped, causes a SodaClient::OnStart callback.
Start@2();
// Instruct SODA to stop processing after all queued audio is
// processed. Will eventually result in a SodaClient::OnStop, but only
// after all audio currently in the queue is decoded.
MarkDone@3();
};
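// Illustrative (non-normative) C++-style sketch of a typical session, assuming
// a bound mojo::Remote<SodaRecognizer> named `recognizer` and a client
// implementing SodaClient; `audio_chunk` is a placeholder for raw audio bytes
// matching the channel_count and sample_rate given in SodaConfig:
//
//   recognizer->Start();                // SodaClient::OnStart() fires.
//   recognizer->AddAudio(audio_chunk);  // Repeat as audio arrives; partial and
//                                       // final results are delivered via
//                                       // SodaClient::OnSpeechRecognizerEvent().
//   recognizer->MarkDone();             // SodaClient::OnStop() fires once all
//                                       // queued audio has been decoded.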