chromium/chromeos/services/tts/public/mojom/tts_service.mojom

// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

module chromeos.tts.mojom;

import "media/mojo/mojom/audio_stream_factory.mojom";
import "sandbox/policy/mojom/context.mojom";
import "sandbox/policy/mojom/sandbox.mojom";

// Audio parameters used for PlaybackTtsStream.
//
// Passed to TtsService.BindPlaybackTtsStream as the caller's desired
// parameters, and returned from it as the actual parameters for the output
// device.
struct AudioParameters {
  // Sample rate of the audio stream (presumably in Hz -- TODO confirm).
  int32 sample_rate;

  // Size of an audio buffer (presumably in frames -- TODO confirm).
  int32 buffer_size;
};

// The main interface to TTS engines on Chrome OS. TtsService lives in a
// tts-sandboxed process. TtsEngineExtensionObserver, the other end of this
// interface, in the browser process, brokers a connection between TtsService
// and two possible engine types, [Google|Playback]TtsStream.
//
// The interface-level RequireContext and the per-method AllowedContext
// attributes all name the same context, sandbox.mojom.Context.kBrowser.
[ServiceSandbox=sandbox.mojom.Sandbox.kTts,
 RequireContext=sandbox.mojom.Context.kBrowser]
interface TtsService {
  // Binds a GoogleTtsStream received by this service.
  // The remote lives in the Google tts component extension.
  //
  // |receiver| is the GoogleTtsStream endpoint to bind. |stream_factory| is
  // the audio stream factory handed to the service (presumably used to create
  // the output audio stream -- TODO confirm).
  [AllowedContext=sandbox.mojom.Context.kBrowser]
  BindGoogleTtsStream(
      pending_receiver<GoogleTtsStream> receiver,
      pending_remote<media.mojom.AudioStreamFactory> stream_factory);

  // Binds a PlaybackTtsStream received by this service.
  // The remote lives in the Ash Chrome browser process.
  //
  // The caller can request a specific |sample_rate| and |buffer_size| through
  // the nullable |desired_audio_parameters|; it may be omitted (null). The
  // actual audio parameters for the output device are returned in
  // |audio_parameters|.
  [AllowedContext=sandbox.mojom.Context.kBrowser]
  BindPlaybackTtsStream(
      pending_receiver<PlaybackTtsStream> receiver,
      pending_remote<media.mojom.AudioStreamFactory> stream_factory,
      AudioParameters? desired_audio_parameters) => (
          AudioParameters audio_parameters);
};

// Interface for the Google component TTS engine to control
// the TtsService's production of audio. There is only ever one TtsStream
// owned by the TtsService.
//
// The component extension sets up the stream's voice by doing:
// InstallVoice("voice", data)
// InstallVoice("other_voice", other_data)
// SelectVoice("other_voice")
//
// After speaking using the stream (see below), the component extension can do:
// SelectVoice("voice")
// to change voices.
//
// The component extension calls the following two methods repeatedly and
// optionally observes events.
//
// Speak(<a proto containing text "Hello there.">)
// Speak(<a proto containing text "Testing 1, 2, 3.">)
// Stop()
//
// Note that the component extension may call Stop() early, if the TTS api
// wants to, for example, stop speech.
interface GoogleTtsStream {
  // Forwards and installs the voice named |voice_name|, whose data is encoded
  // in |voice_bytes|. |success| reports the outcome of the installation.
  InstallVoice(string voice_name, array<uint8> voice_bytes)
      => (bool success);

  // Selects a voice for streaming given a |voice_name|. |success| reports the
  // outcome of the selection.
  SelectVoice(string voice_name) => (bool success);

  // Speak text described by a serialized proto.speech.tts.Text proto with the
  // speaker params described by a serialized proto.speech.tts.SpeakerParams
  // proto. The call will fail if no speaker name is given and the voice model
  // is a multi-speaker model.
  //
  // The reply carries a TtsEventObserver receiver, which the caller can bind
  // to observe events for the spoken text.
  Speak(array<uint8> text_jspb, array<uint8> speaker_params_jspb)
      => (pending_receiver<TtsEventObserver> event_observer);

  // Stop speaking the currently speaking text, if any.
  Stop();

  // Sets the volume of the tts playback (0.0 to 1.0).
  SetVolume(float volume);

  // Pauses tts playback. Safe to call repeatedly (no-op for subsequent calls).
  Pause();

  // Resumes tts playback. Safe to call repeatedly (no-op for subsequent calls).
  Resume();
};

// Interface for a tts engine to control the TtsService's production of audio
// for engines like Espeak, which send raw audio data.
//
// The remote is in the Ash Chrome browser process; the receiver is the tts
// service utility process. The chrome.ttsEngine api uses this interface
// internally to produce audio output, but it is never directly exposed to the
// extension's public js bindings.
//
// Example usage:
// Play() - starts playback of an utterance like 'hello world'
// SendAudioBuffer(<first 1024 frames of audio>, 0, false)
// SendAudioBuffer(<1024 more frames of audio>, -1, false)
// ...
// SendAudioBuffer(<1024 more frames of audio>, 4, false)
// ...
// SendAudioBuffer(<1024 more frames of audio>, 11, true)
// Stop()
//
// Note: Espeak-ng is currently the only remote utilizing this interface. It is
// a private component extension to Chrome OS. Its sources can be found here:
// https://chromium.googlesource.com/chromiumos/third_party/espeak-ng
interface PlaybackTtsStream {
  // Start playback of audio.
  //
  // The reply carries a TtsEventObserver receiver, which the caller can bind
  // to observe events for the audio being played.
  Play()
      => (pending_receiver<TtsEventObserver> event_observer);

  // Send audio data to the tts service; expected to be called after Play,
  // Resume and before Stop, Pause.
  //
  // |char_index| annotates the |frames_buffer| as the character index within
  // the text being spoken. This is passed along in
  // |TtsEventObserver.OnTimepoint| at the moment when the buffer is being
  // played. (The example above also passes -1; presumably that marks a buffer
  // with no associated character index -- TODO confirm.)
  //
  // |last_buffer| indicates whether or not this is the last buffer of a
  // particular tts utterance string.
  SendAudioBuffer(
      array<float> frames_buffer, int32 char_index, bool last_buffer);

  // Stops on-going audio playback.
  Stop();

  // Sets volume of audio playback (0.0 to 1.0).
  SetVolume(float volume);

  // Pauses audio playback.
  Pause();

  // Resumes audio playback.
  Resume();
};

// Returned to callers of GoogleTtsStream.Speak() and
// PlaybackTtsStream.Play(). It receives notable events pertaining to the text
// spoken.
interface TtsEventObserver {
  // The Speak()/Play() request started speech playback.
  OnStart();

  // The Speak()/Play() request is playing text at |char_index| approximately
  // at the current time.
  OnTimepoint(int32 char_index);

  // The Speak()/Play() request ended speech playback.
  OnEnd();

  // The Speak()/Play() request encountered an error.
  OnError();
};