// Copyright 2021 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
module ash.enhanced_network_tts.mojom;
// The character-count limit that the |TtsRequest.utterance| documentation
// refers to.
// NOTE(review): presumably a request exceeding this limit fails with
// |TtsRequestError.kOverLength|; confirm against the receiver implementation.
const uint16 kEnhancedNetworkTtsMaxCharacterSize = 1000;
// The ways in which a Text-to-Speech request can fail. A value of this enum
// is reported to the caller through the |error_code| member of |TtsResponse|.
enum TtsRequestError {
// The utterance is empty.
kEmptyUtterance,
// The utterance is over the length limit.
// NOTE(review): presumably the limit is
// |kEnhancedNetworkTtsMaxCharacterSize|; confirm against the receiver.
kOverLength,
// The server is down and we did not receive a valid server response.
kServerError,
// The received data is not in the expected format.
kReceivedUnexpectedData,
// The request was superseded by a later request.
kRequestOverride,
};
// A request for fetching Text-to-Speech data. Passed as the argument of
// |EnhancedNetworkTts.GetAudioData|.
struct TtsRequest {
// The utterance to speak. The length of the utterance should be less than
// |kEnhancedNetworkTtsMaxCharacterSize|, and the utterance should not be
// empty.
// NOTE(review): the |EnhancedNetworkTts| documentation describes long
// utterances being chopped into smaller pieces by the receiver; confirm
// whether this limit applies to the whole utterance or to each piece.
string utterance;
// The speech rate scale factor with 1.0 being unmodified rate. Supported
// values are between 0.3 and 4.0, in steps of 0.1.
float rate;
// The specific voice to use. Optional (nullable).
string? voice;
// Language to use for speech synthesis. The voice and lang should be
// specified together to be valid (e.g., voice "aua-wavenet" with lang "en"
// for Australian English). Optional (nullable).
string? lang;
};
// The timing information for a single word in the fetched Text-to-Speech
// data. |TtsData.time_info| carries one entry per word.
struct TimingInfo {
// The text of the word.
string text;
// The character offset of the word, measured from the start of the utterance
// in the |TtsRequest|.
uint32 text_offset;
// The time when the playback of the utterance reaches the word.
// NOTE(review): carried as a string; the format and unit are not specified
// in this file. TODO: document them.
string time_offset;
// The audio duration of the word.
// NOTE(review): carried as a string; the format and unit are not specified
// in this file. TODO: document them.
string duration;
};
// The Text-to-Speech data fetched from Google's ReadAloud Text-to-Speech
// server. An |AudioDataObserver| may receive several |TtsData| for a single
// |TtsRequest| that contains a long utterance; the final one is marked with
// |last_data|.
struct TtsData {
// The audio data representing the synthesized speech. The audio data is
// encoded as mp3 with a sample rate of 44.1kHz.
array<uint8> audio;
// The timing information for each word in the audio data.
array<TimingInfo> time_info;
// Whether this is the last |TtsData| for the input |TtsRequest|.
bool last_data;
};
// The response to a |TtsRequest|, delivered through
// |AudioDataObserver.OnAudioDataReceived|. As a union, it holds exactly one
// of the members below.
union TtsResponse {
// Set when the request failed; identifies the reason.
TtsRequestError error_code;
// Set when synthesized audio (and its word timing) is available.
TtsData data;
};
// The interface to fetch enhanced voices generated from Google Text-to-Speech
// servers. The remote is a JS extension (i.e., Enhanced Network Tts
// Extension), and the receiver is implemented in c/b/a/enhanced_network_tts.
// The JS extension acts as a Text-to-Speech engine, and is currently used by
// Select-to-speak only. When a Select-to-speak user selects text to speak
// with Enhanced Network Tts voices, the text will be sent to chrome.tts.speak.
// The JS extension receives the text by listening to
// chrome.ttsEngine.onSpeakWithAudioStream. Then, the extension uses this mojom
// interface to fetch synthesized audio data. The text from Select-to-speak is
// limited to 32768 chars, and the JS extension wraps the text along with other
// TTS options into a |TtsRequest|.
// NOTE(review): 32768 chars is larger than
// |kEnhancedNetworkTtsMaxCharacterSize|; confirm how the two limits interact.
interface EnhancedNetworkTts {
// The method to fetch the audio data generated by a Google network-based
// API. When a |request| contains a long utterance, the receiver chops the
// utterance into several smaller text pieces, which will be sent to Google
// servers one by one. For each received response from Google servers, the
// receiver sends a |TtsResponse| to |observer|. The next text piece will
// only be sent out when we have finished downloading and processing the
// response from the prior one. Closing the message pipe for the |observer|
// will cancel the |TtsRequest|.
//
// The reply carries a |pending_receiver<AudioDataObserver>|: the caller binds
// it to its own |AudioDataObserver| implementation in order to receive the
// |TtsResponse| messages for this |request|.
// TODO(crbug.com/1240278): Use mojo data pipe for data communication.
GetAudioData(TtsRequest request)
=> (pending_receiver<AudioDataObserver> observer);
};
// The interface to process the |TtsResponse| generated by |EnhancedNetworkTts|.
// It is implemented by whichever side binds the
// |pending_receiver<AudioDataObserver>| returned in the reply of
// |EnhancedNetworkTts.GetAudioData|.
interface AudioDataObserver {
// The method to process the |TtsResponse|. It may be invoked several times
// for a single |TtsRequest| with a long utterance; |response| holds either an
// error code or a chunk of audio data.
OnAudioDataReceived(TtsResponse response);
};