// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
module chromeos.tts.mojom;
import "media/mojo/mojom/audio_stream_factory.mojom";
import "sandbox/policy/mojom/context.mojom";
import "sandbox/policy/mojom/sandbox.mojom";
// Audio parameters used for PlaybackTtsStream.
struct AudioParameters {
  // Sample rate of the audio. Presumably expressed in Hz -- TODO confirm
  // against the code that consumes this struct.
  int32 sample_rate;

  // Size of an audio buffer. Presumably expressed in frames -- TODO confirm
  // against the code that consumes this struct.
  int32 buffer_size;
};
// The main interface to TTS engines on Chrome OS. TtsService lives in a
// tts-sandboxed process. TtsEngineExtensionObserver, the other end of this
// interface, in the browser process, brokers a connection between TtsService
// and two possible engine types, [Google|Playback]TtsStream.
[ServiceSandbox=sandbox.mojom.Sandbox.kTts,
 RequireContext=sandbox.mojom.Context.kBrowser]
interface TtsService {
  // Binds a GoogleTtsStream |receiver| handed to this service.
  // The remote end lives in the Google tts component extension.
  //
  // |stream_factory| is a remote to a media.mojom.AudioStreamFactory;
  // presumably the service uses it to create its audio output stream --
  // confirm against the implementation.
  [AllowedContext=sandbox.mojom.Context.kBrowser]
  BindGoogleTtsStream(
      pending_receiver<GoogleTtsStream> receiver,
      pending_remote<media.mojom.AudioStreamFactory> stream_factory);

  // Binds a PlaybackTtsStream |receiver| handed to this service.
  // The remote end lives in the Ash Chrome browser process.
  //
  // |desired_audio_parameters| is optional; when supplied, it lets the caller
  // request a specific |sample_rate| and |buffer_size|. The reply carries the
  // actual audio parameters for the output device in |audio_parameters|.
  //
  // |stream_factory| is a remote to a media.mojom.AudioStreamFactory;
  // presumably the service uses it to create its audio output stream --
  // confirm against the implementation.
  [AllowedContext=sandbox.mojom.Context.kBrowser]
  BindPlaybackTtsStream(
      pending_receiver<PlaybackTtsStream> receiver,
      pending_remote<media.mojom.AudioStreamFactory> stream_factory,
      AudioParameters? desired_audio_parameters) => (
          AudioParameters audio_parameters);
};
// Interface for the Google component TTS engine to control
// the TtsService's production of audio. There is only ever one TtsStream
// owned by the TtsService.
//
// The component extension sets up the stream's voice by doing:
// InstallVoice(data, "voice")
// InstallVoice(other_data, "other_voice")
// SelectVoice("other_voice")
//
// After speaking using the stream (see below), the component extension can do:
// SelectVoice("voice")
// to change voices.
//
// The component extension calls the following two methods repeatedly and
// optionally observes events.
//
// Speak(<a proto containing text "Hello there.">)
// Speak(<a proto containing text "Testing 1, 2, 3.">)
// Stop()
//
// Note that the component extension may call Stop() early, if the TTS api
// wants to, for example, stop speech.
interface GoogleTtsStream {
  // Forwards the voice data in |voice_bytes| and installs it under the name
  // |voice_name|. The reply reports the outcome in |success|.
  InstallVoice(string voice_name, array<uint8> voice_bytes) => (bool success);

  // Picks the voice identified by |voice_name| as the one used for streaming.
  // The reply reports the outcome in |success|.
  SelectVoice(string voice_name) => (bool success);

  // Speaks the text given by |text_jspb|, a serialized proto.speech.tts.Text
  // proto, using the speaker params given by |speaker_params_jspb|, a
  // serialized proto.speech.tts.SpeakerParams proto. The call fails when the
  // voice model is a multi-speaker model and no speaker name is given.
  //
  // The reply returns |event_observer|, a pending receiver for a
  // TtsEventObserver.
  Speak(array<uint8> text_jspb, array<uint8> speaker_params_jspb) => (
      pending_receiver<TtsEventObserver> event_observer);

  // Stops whatever text is currently being spoken, if any.
  Stop();

  // Sets the tts playback volume; |volume| ranges from 0.0 to 1.0.
  SetVolume(float volume);

  // Pauses tts playback. Repeated calls are safe: calls after the first are
  // no-ops.
  Pause();

  // Resumes tts playback. Repeated calls are safe: calls after the first are
  // no-ops.
  Resume();
};
// Interface for a tts engine to control the TtsService's production of audio
// for engines like Espeak, which send raw audio data.
//
// The remote is in the Ash Chrome browser process; the receiver is the tts
// service utility process. The chrome.ttsEngine api uses this interface
// internally to produce audio output, but it is never directly exposed to the
// extension's public js bindings.
//
// Example usage:
// Play() - starts playback of an utterance like 'hello world'
// SendAudioBuffer(<first 1024 frames of audio>, 0, false)
// SendAudioBuffer(<1024 more frames of audio>, -1, false)
// ...
// SendAudioBuffer(<1024 more frames of audio>, 4, false)
// ...
// SendAudioBuffer(<1024 more frames of audio>, 11, true)
// Stop()
//
// Note: Espeak-ng is currently the only remote utilizing this interface. It is
// a private component extension to Chrome OS. Its sources can be found here:
// https://chromium.googlesource.com/chromiumos/third_party/espeak-ng
interface PlaybackTtsStream {
  // Starts audio playback. The reply returns |event_observer|, a pending
  // receiver for a TtsEventObserver.
  Play() => (pending_receiver<TtsEventObserver> event_observer);

  // Sends audio data to the tts service. Callers are expected to invoke this
  // after Play or Resume, and before Stop or Pause.
  //
  // |frames_buffer| holds the audio frames to play.
  //
  // |char_index| tags |frames_buffer| with the character index, within the
  // text being spoken, that the buffer corresponds to. The index is passed
  // along in |TtsEventObserver.OnTimepoint| at the moment the buffer is
  // played.
  //
  // |last_buffer| tells the service whether this buffer is the final one for
  // a particular tts utterance string.
  SendAudioBuffer(
      array<float> frames_buffer,
      int32 char_index,
      bool last_buffer);

  // Stops any audio playback that is in progress.
  Stop();

  // Sets the audio playback volume; |volume| ranges from 0.0 to 1.0.
  SetVolume(float volume);

  // Pauses audio playback.
  Pause();

  // Resumes audio playback.
  Resume();
};
// Returned to callers of GoogleTtsStream.Speak() and
// PlaybackTtsStream.Play(). It receives notable events pertaining to the text
// spoken.
interface TtsEventObserver {
  // Speech playback requested through GoogleTtsStream.Speak or
  // PlaybackTtsStream.Play has started.
  OnStart();

  // The text at |char_index| is being played at approximately the current
  // time.
  OnTimepoint(int32 char_index);

  // Speech playback has ended.
  OnEnd();

  // An error was encountered during speech playback.
  OnError();
};