chromium/chromeos/ash/components/enhanced_network_tts/mojom/enhanced_network_tts.mojom

// Copyright 2021 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

module ash.enhanced_network_tts.mojom;

// The length limit referenced by the documentation of |TtsRequest.utterance|.
// NOTE(review): presumably counted in characters (per the constant's name) and
// presumably the threshold behind |TtsRequestError.kOverLength| -- the
// enforcement lives in the receiver, not in this file; confirm there.
const uint16 kEnhancedNetworkTtsMaxCharacterSize = 1000;

// The ways in which a Text-to-Speech request can fail. An error is delivered
// to the caller as the |error_code| member of a |TtsResponse|.
//
// No explicit values are assigned, so each entry takes its value from its
// position in the list; add new entries at the end to keep existing values
// stable.
enum TtsRequestError {
  // The utterance is empty.
  kEmptyUtterance,
  // The utterance is over the length limit (see
  // |kEnhancedNetworkTtsMaxCharacterSize|).
  kOverLength,
  // The server is down and we did not receive a valid server response.
  kServerError,
  // A response was received, but its data is not in the expected format.
  kReceivedUnexpectedData,
  // The request is overridden by a later request.
  kRequestOverride,
};

// A request for fetching Text-to-Speech data. Passed to
// |EnhancedNetworkTts.GetAudioData|.
struct TtsRequest {
  // The utterance to speak. The length of the utterance should be less than
  // |kEnhancedNetworkTtsMaxCharacterSize|, and the utterance should not be
  // empty. |TtsRequestError| has dedicated values for both violations
  // (|kOverLength| and |kEmptyUtterance|).
  // NOTE(review): whether the limit is inclusive or exclusive is decided by
  // the receiver -- confirm before relying on the exact boundary.
  string utterance;

  // The speech rate scale factor with 1.0 being unmodified rate. Supported
  // values are between 0.3 and 4.0, in steps of 0.1.
  float rate;

  // The specific voice to use. Optional (nullable): omit it to leave the
  // voice unspecified.
  string? voice;

  // Language to use for speech synthesis. The voice and lang should be
  // specified together to be valid (e.g., voice "aua-wavenet" with lang "en"
  // for Australian English). Optional (nullable).
  string? lang;
};

// The timing information for a single word in the fetched Text-to-Speech
// data. |TtsData.time_info| carries one entry per word.
struct TimingInfo {
  // The text of the word.
  string text;

  // The character offset of the word, measured from the start of the
  // utterance in the |TtsRequest|.
  uint32 text_offset;

  // The time when the playback of the utterance reaches the word.
  // NOTE(review): carried as a string; the format and unit are not specified
  // in this file -- confirm against the producer before parsing.
  string time_offset;

  // The audio duration of the word.
  // NOTE(review): carried as a string, presumably in the same format as
  // |time_offset| -- confirm against the producer before parsing.
  string duration;
};

// The Text-to-Speech data fetched from Google's ReadAloud Text-to-Speech
// server. An |AudioDataObserver| may receive several TtsData for a single
// |TtsRequest| that contains a long utterance; |last_data| marks the final
// one.
struct TtsData {
  // The audio data representing the synthesized speech, as raw bytes. The
  // audio data is encoded as mp3 with a sample rate of 44.1kHz.
  array<uint8> audio;

  // The timing information for each word in the audio data, one |TimingInfo|
  // per word.
  array<TimingInfo> time_info;

  // Whether this is the last |TtsData| for the input |TtsRequest|.
  bool last_data;
};

// The response to a |TtsRequest|. Being a union, a response holds exactly one
// of the members below: either an error or a piece of synthesized data.
union TtsResponse {
  // Set when the request failed; identifies the reason.
  TtsRequestError error_code;

  // Set when audio was fetched successfully; holds the audio and its word
  // timings.
  TtsData data;
};

// The interface to fetch enhanced voices generated from Google text-to-speech
// servers. The remote is a JS extension (i.e., Enhanced Network Tts
// Extension), and the receiver is implemented in c/b/a/enhanced_network_tts.
//
// The JS extension acts as a Text-to-speech engine, and is currently used by
// Select-to-speak only. The flow is:
//   1. A Select-to-speak user selects text to speak with Enhanced Network Tts
//      voices, and the text is sent to chrome.tts.speak.
//   2. The JS extension receives the text by listening to
//      chrome.ttsEngine.onSpeakWithAudioStream.
//   3. The extension uses this mojom interface to fetch synthesized audio
//      data.
// The text from Select-to-speak is limited to 32768 chars, and the JS
// extension wraps the text along with other TTS options into a |TtsRequest|.
interface EnhancedNetworkTts {
  // The method to fetch the audio data generated by a Google network-based
  // API. The reply hands the caller a pending receiver for an
  // |AudioDataObserver|; the caller binds it and then receives the results
  // through |AudioDataObserver.OnAudioDataReceived|.
  //
  // When a |request| contains a long utterance, the receiver chops the
  // utterance into several smaller text pieces, which will be sent to Google
  // servers one by one. For each received response from Google servers, the
  // receiver sends a |TtsResponse| to |observer|. The next text piece will
  // only be sent out when we have finished downloading and processing the
  // response from prior one. Closing the message pipe for the |observer| will
  // cancel the |TtsRequest|.
  //
  // NOTE(review): how |kEnhancedNetworkTtsMaxCharacterSize| relates to the
  // chopping described above (whole-utterance limit vs. per-piece limit) is
  // not visible in this file -- confirm against the receiver.
  // TODO(crbug.com/1240278): Use mojo data pipe for data communication.
  GetAudioData(TtsRequest request)
      => (pending_receiver<AudioDataObserver> observer);
};

// The interface to process the |TtsResponse| generated by |EnhancedNetworkTts|.
// The receiver end is handed out in the reply of
// |EnhancedNetworkTts.GetAudioData|.
interface AudioDataObserver {
  // The method to process the |TtsResponse|. May be called several times for
  // a single |TtsRequest| that contains a long utterance. |response| holds
  // either a |TtsData| (whose |last_data| flags the final piece) or a
  // |TtsRequestError|.
  OnAudioDataReceived(TtsResponse response);
};