// Copyright 2012 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#import "content/browser/speech/tts_mac.h"
#import <AVFAudio/AVFAudio.h>
#import <AppKit/AppKit.h>
#include <objc/runtime.h>
#include <algorithm>
#include <string>
#include "base/apple/foundation_util.h"
#include "base/functional/bind.h"
#include "base/memory/raw_ptr.h"
#include "base/no_destructor.h"
#include "base/strings/sys_string_conversions.h"
#include "base/values.h"
#include "content/public/browser/tts_controller.h"
namespace {
// Passed as the |char_length| argument of speech events that have no
// associated text range (see the delegate callbacks further down).
constexpr int kNoLength = -1;
// Passed as the |error_message| argument of speech events that carry no error.
constexpr char kNoError[] = "";
// Returns a mutable reference to the function-local static vector that caches
// the voice list. The initializer lambda yields an empty vector; Voices()
// treats an empty vector as "needs to be (re)built".
std::vector<content::VoiceData>& VoicesRef() {
static base::NoDestructor<std::vector<content::VoiceData>> voices([]() {
usingBlock:^(NSNotification* notification) {
// The user might have switched to Settings or some other app
// to change voices or locale settings. Avoid a stale cache by
// forcing a rebuild of the voices vector after the app
// becomes active.
return std::vector<content::VoiceData>();
return *voices;
// Returns the AVSpeechSynthesisVoice whose identifier equals
// NSSpeechSynthesizer.defaultVoice. The caller in Voices() checks the result
// for nil before using it.
AVSpeechSynthesisVoice* GetSystemDefaultVoice() {
// This should be
// [AVSpeechSynthesisVoice voiceWithLanguage:nil]
// but that has a bug (https://crbug.com/1484940#c9, FB13197951). In short,
// while passing nil to -[AVSpeechSynthesisVoice voiceWithLanguage:] does
// indeed return "the default voice for the system’s language and region",
// that's not necessarily the voice that the user selected in System Settings
// > Accessibility > Spoken Content, and that user voice selection is the only
// one that matters. There does not appear to be an AVSpeechSynthesis API that
// returns that user choice, so use the deprecated NSSpeechSynthesizer API,
// which behaves correctly.
// The pragmas below silence the deprecation warning for that one call only.
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
NSString* default_voice_identifier = NSSpeechSynthesizer.defaultVoice;
#pragma clang diagnostic pop
return [AVSpeechSynthesisVoice voiceWithIdentifier:default_voice_identifier];
// Returns the cached list of voices, building it first if the cache obtained
// from VoicesRef() is empty.
//
// When building, the system voices are sorted by name, the system default
// voice (if any) is moved to the front, and any voice whose name is shared by
// more than one voice gets a " (<localized language>)" suffix appended to its
// name for disambiguation.
std::vector<content::VoiceData>& Voices() {
std::vector<content::VoiceData>& voices = VoicesRef();
// A non-empty cache is returned as-is.
if (!voices.empty()) {
return voices;
// Mutable copy so that the default voice can be repositioned below.
NSMutableArray* av_speech_voices =
[[AVSpeechSynthesisVoice.speechVoices sortedArrayUsingDescriptors:@[
[NSSortDescriptor sortDescriptorWithKey:@"name" ascending:YES]
]] mutableCopy];
AVSpeechSynthesisVoice* default_voice = GetSystemDefaultVoice();
if (default_voice) {
// Remove-then-insert moves the default voice to index 0 without duplicating
// it.
[av_speech_voices removeObject:default_voice];
[av_speech_voices insertObject:default_voice atIndex:0];
// For the case of multiple voices with the same name but of a different
// language, the old API (NSSpeechSynthesizer) would append locale information
// to the names, while this current API does not. Because returning a bunch of
// voices with the same name isn't helpful, count how often each name is used,
// so that later on, locale information can be appended if necessary for
// disambiguation.
NSMutableDictionary<NSString*, NSNumber*>* name_counts =
[NSMutableDictionary dictionary];
// First pass: tally how many voices use each name.
for (AVSpeechSynthesisVoice* av_speech_voice in av_speech_voices) {
NSString* voice_name = av_speech_voice.name;
if (!voice_name) {
// AVSpeechSynthesisVoice.name is not a nullable property, but there are
// crashes (https://crbug.com/1459235) where -setObject:forKeyedSubscript:
// is being passed a nil key, and the only place that happens in this
// function is below.
if (NSNumber* count = name_counts[voice_name]) {
name_counts[voice_name] = @(count.intValue + 1);
} else {
name_counts[voice_name] = @1;
// Second pass: fill in one content::VoiceData entry per usable voice.
for (AVSpeechSynthesisVoice* av_speech_voice in av_speech_voices) {
NSString* voice_name = av_speech_voice.name;
if (!voice_name) {
// AVSpeechSynthesisVoice.name is not a nullable property, but there are
// crashes (https://crbug.com/1459235) where it seems like it's returning
// nil. Without a name, a voice is useless, so skip it.
content::VoiceData& data = voices.back();
// A count above one means the name alone is ambiguous.
if (name_counts[voice_name].intValue > 1) {
// The language property on a voice is a BCP 47 code (i.e. "en-US") while
// an NSLocale locale identifier isn't (i.e. "en_US"). However, using the
// BCP 47 code as if it were a locale identifier works just fine (tested
// back to 10.15).
NSString* localized_language = [NSLocale.autoupdatingCurrentLocale
voice_name = [NSString
stringWithFormat:@"%@ (%@)", voice_name, localized_language];
data.native = true;
data.native_voice_identifier =
data.name = base::SysNSStringToUTF8(voice_name);
data.lang = base::SysNSStringToUTF8(av_speech_voice.language);
return voices;
// Creates an AVSpeechUtterance for |utterance_string| and tags it with
// |utterance_id| as an associated object keyed by @selector(identifier), so
// that GetUtteranceId() can later recover the id from the utterance object.
AVSpeechUtterance* MakeUtterance(int utterance_id,
const std::string& utterance_string) {
AVSpeechUtterance* utterance = [AVSpeechUtterance
objc_setAssociatedObject(utterance, @selector(identifier), @(utterance_id),
return utterance;
// Returns the utterance id that MakeUtterance() attached to |utterance|, or
// TtsPlatformImplMac::kInvalidUtteranceId when no NSNumber is found under the
// @selector(identifier) association key.
int GetUtteranceId(AVSpeechUtterance* utterance) {
// Same key as the objc_setAssociatedObject() call in MakeUtterance().
NSNumber* identifier = base::apple::ObjCCast<NSNumber>(
objc_getAssociatedObject(utterance, @selector(identifier)));
if (identifier) {
return identifier.intValue;
return TtsPlatformImplMac::kInvalidUtteranceId;
} // namespace
// static
// Supplies the platform TTS implementation by forwarding to
// TtsPlatformImplMac::GetInstance().
content::TtsPlatformImpl* content::TtsPlatformImpl::GetInstance() {
return TtsPlatformImplMac::GetInstance();
// Defaulted destructor; no custom teardown is performed here. Note that the
// instance handed out by GetInstance() is held in a base::NoDestructor.
TtsPlatformImplMac::~TtsPlatformImplMac() = default;
// Unconditionally reports the platform implementation as supported.
bool TtsPlatformImplMac::PlatformImplSupported() {
return true;
// Unconditionally reports the platform implementation as initialized.
bool TtsPlatformImplMac::PlatformImplInitialized() {
return true;
// Entry point for speaking |utterance|. The text is first handed off for SSML
// parsing; ProcessSpeech() is bound as the continuation and receives the
// remaining arguments (|utterance_id|, |lang|, |voice|, |params|,
// |on_speak_finished|) plus the parsed text as its final parameter.
void TtsPlatformImplMac::Speak(
int utterance_id,
const std::string& utterance,
const std::string& lang,
const content::VoiceData& voice,
const content::UtteranceContinuousParameters& params,
base::OnceCallback<void(bool)> on_speak_finished) {
// Parse SSML and process speech. TODO(crbug.com/40273591):
// AVSpeechUtterance has an initializer -initWithSSMLRepresentation:. Should
// that be used instead?
// NOTE(review): base::Unretained(this) is presumably safe because the
// instance returned by GetInstance() lives in a base::NoDestructor; confirm
// that no other instances are created.
utterance, base::BindOnce(&TtsPlatformImplMac::ProcessSpeech,
base::Unretained(this), utterance_id, lang,
voice, params, std::move(on_speak_finished)));
// Continuation of Speak(), invoked with the parsed utterance text.
//
// Records the text and id of the new utterance, clears the paused state,
// builds an AVSpeechUtterance, applies voice/rate/pitch/volume, and queues it
// on the synthesizer. Rate, pitch and volume are only applied when the
// corresponding field of |params| is non-negative.
//
// Rate mapping (Web Speech API value -> AVSpeechUtterance value):
//   rate < 1:  rate * AVSpeechUtteranceDefaultSpeechRate
//   rate >= 1: AVSpeechUtteranceDefaultSpeechRate +
//              (min(rate - 1, 3) / 3) * kAVSpeechRateHeadroom
void TtsPlatformImplMac::ProcessSpeech(
int utterance_id,
const std::string& lang,
const content::VoiceData& voice,
const content::UtteranceContinuousParameters& params,
base::OnceCallback<void(bool)> on_speak_finished,
const std::string& parsed_utterance) {
// State consulted later by OnSpeechEvent(), Pause() and Resume().
utterance_ = parsed_utterance;
paused_ = false;
utterance_id_ = utterance_id;
AVSpeechUtterance* speech_utterance =
MakeUtterance(utterance_id, parsed_utterance);
if (!speech_utterance) {
speech_utterance.voice = [AVSpeechSynthesisVoice
if (params.rate >= 0.0) {
// The two relevant APIs have different ranges:
// - Web Speech API is [.1, 10] with default 1
// - AVSpeechSynthesizer is [0, 1] with default .5
// Speeds in the Web Speech API other than 1 (the default rate) are meant to
// be multiples of the default speaking rate.
// The mapping of AVSpeechSynthesizer speeds was done experimentally, using
// the fourth paragraph of _A Tale of Two Cities_. With the "Samantha"
// voice, AVSpeechUtteranceDefaultSpeechRate takes about 80s to read the
// paragraph, while AVSpeechUtteranceMaximumSpeechRate takes about 20s.
// Therefore, map
// 1 → AVSpeechUtteranceDefaultSpeechRate
// 4 → AVSpeechUtteranceMaximumSpeechRate
// and cap anything higher.
// References:
// https://developer.mozilla.org/en-US/docs/Web/API/SpeechSynthesisUtterance/rate
// https://github.com/WebKit/WebKit/blob/main/Source/WebCore/platform/cocoa/PlatformSpeechSynthesizerCocoa.mm
// ^ This is the WebKit implementation. It appears to have a bug in
// scaling, where a Web Speech API rate of 2 is scaled to
// AVSpeechUtteranceMaximumSpeechRate and the value passed to the
// AVSpeechSynthesizer goes up from there. A bug was filed about this:
// https://bugs.webkit.org/show_bug.cgi?id=258587
float rate = params.rate;
if (rate < 1) {
// If a slower than normal rate is requested, scale the default speech
// rate down proportionally.
rate *= AVSpeechUtteranceDefaultSpeechRate;
} else {
// Scale the AVSpeech rate headroom proportionally to match the excess
// above 1 in the Speech API, capping at a Web Speech API value of 4.
const float kWebSpeechDefault = 1;
const float kWebSpeechMaxSupported = 4;
const float kAVSpeechRateHeadroom = AVSpeechUtteranceMaximumSpeechRate -
// How far the requested rate is above the Web Speech default, limited to the
// supported span (kWebSpeechMaxSupported - kWebSpeechDefault).
const float excess = rate - kWebSpeechDefault;
const float capped_excess =
std::min(excess, (kWebSpeechMaxSupported - kWebSpeechDefault));
// Fraction of the headroom to use: capped excess divided by the supported
// span, so it cannot exceed 1.
const float headroom_proportion =
capped_excess / (kWebSpeechMaxSupported - kWebSpeechDefault);
rate = AVSpeechUtteranceDefaultSpeechRate +
headroom_proportion * kAVSpeechRateHeadroom;
speech_utterance.rate = rate;
// Pitch and volume are passed through unscaled.
if (params.pitch >= 0.0) {
speech_utterance.pitchMultiplier = params.pitch;
if (params.volume >= 0.0) {
speech_utterance.volume = params.volume;
[speech_synthesizer_ speakUtterance:speech_utterance];
// Asks the synthesizer to stop immediately (AVSpeechBoundaryImmediate), clears
// the paused flag, and always returns true.
bool TtsPlatformImplMac::StopSpeaking() {
[speech_synthesizer_ stopSpeakingAtBoundary:AVSpeechBoundaryImmediate];
paused_ = false;
return true;
// Pauses speech immediately if it is not already paused, marks the state as
// paused, and reports a TTS_EVENT_PAUSE for the current utterance at
// |last_char_index_| (the index most recently recorded by OnSpeechEvent()).
// Does nothing when already paused.
void TtsPlatformImplMac::Pause() {
if (!paused_) {
[speech_synthesizer_ pauseSpeakingAtBoundary:AVSpeechBoundaryImmediate];
paused_ = true;
utterance_id_, content::TTS_EVENT_PAUSE, last_char_index_, kNoLength,
// Resumes speech if it is currently paused, clears the paused flag, and
// reports a TTS_EVENT_RESUME for the current utterance at |last_char_index_|
// (the index most recently recorded by OnSpeechEvent()). Does nothing when not
// paused.
void TtsPlatformImplMac::Resume() {
if (paused_) {
[speech_synthesizer_ continueSpeaking];
paused_ = false;
utterance_id_, content::TTS_EVENT_RESUME, last_char_index_, kNoLength,
// Returns the synthesizer's |speaking| property.
bool TtsPlatformImplMac::IsSpeaking() {
return speech_synthesizer_.speaking;
// Copies the (possibly freshly built) cached voice list from Voices() into
// |*outVoices|, replacing its previous contents.
void TtsPlatformImplMac::GetVoices(std::vector<content::VoiceData>* outVoices) {
*outVoices = Voices();
// Handles a speech event reported for |utterance_id| (the delegate callbacks
// call this). Events are checked against the current |utterance_id_|. For
// TTS_EVENT_END the reported |char_index| is replaced by the length of the
// stored utterance text. The event is then forwarded and |char_index| is
// remembered in |last_char_index_| for later Pause()/Resume() events.
void TtsPlatformImplMac::OnSpeechEvent(int utterance_id,
content::TtsEventType event_type,
int char_index,
int char_length,
const std::string& error_message) {
// Don't send events from an utterance that's already completed.
if (utterance_id != utterance_id_) {
// An END event always points just past the last character of the utterance,
// regardless of the index the caller supplied.
if (event_type == content::TTS_EVENT_END) {
char_index = utterance_.size();
utterance_id_, event_type, char_index, char_length, error_message);
last_char_index_ = char_index;
: speech_synthesizer_([[AVSpeechSynthesizer alloc] init]),
delegate_([[ChromeTtsDelegate alloc] initWithPlatformImplMac:this]) {
// The delegate holds a pointer back to this object and relays the
// synthesizer's callbacks to OnSpeechEvent().
speech_synthesizer_.delegate = delegate_;
// static
// Returns a pointer to the single TtsPlatformImplMac, which is a function-local
// static held in a base::NoDestructor and constructed on first use.
TtsPlatformImplMac* TtsPlatformImplMac::GetInstance() {
static base::NoDestructor<TtsPlatformImplMac> tts_platform;
return tts_platform.get();
// static
// Test-only accessor that exposes the file-local VoicesRef() cache, giving
// callers a mutable reference to the cached voice vector.
std::vector<content::VoiceData>& TtsPlatformImplMac::VoicesRefForTesting() {
return VoicesRef();
// Delegate object installed on the AVSpeechSynthesizer. Its callbacks forward
// to TtsPlatformImplMac::OnSpeechEvent() through |_ttsImplMac|.
@implementation ChromeTtsDelegate {
raw_ptr<TtsPlatformImplMac> _ttsImplMac; // weak.
// Initializes the delegate and stores |ttsImplMac| as the (non-owning) target
// that speech events are forwarded to.
- (id)initWithPlatformImplMac:(TtsPlatformImplMac*)ttsImplMac {
if ((self = [super init])) {
_ttsImplMac = ttsImplMac;
return self;
// Delegate callback for the start of |utterance|: reports a TTS_EVENT_START at
// character index 0 with no length and no error.
- (void)speechSynthesizer:(AVSpeechSynthesizer*)synthesizer
didStartSpeechUtterance:(AVSpeechUtterance*)utterance {
content::TTS_EVENT_START, /*char_index=*/0,
kNoLength, kNoError);
// Delegate callback for the end of |utterance|: reports a TTS_EVENT_END for the
// id recovered from the utterance via GetUtteranceId().
- (void)speechSynthesizer:(AVSpeechSynthesizer*)synthesizer
didFinishSpeechUtterance:(AVSpeechUtterance*)utterance {
// The char_index of 0 is a placeholder: OnSpeechEvent() overwrites it with the
// utterance length for TTS_EVENT_END.
_ttsImplMac->OnSpeechEvent(GetUtteranceId(utterance), content::TTS_EVENT_END,
/*char_index=*/0, kNoLength, kNoError);
- (void)speechSynthesizer:(AVSpeechSynthesizer*)synthesizer
utterance:(AVSpeechUtterance*)utterance {
// Ignore bogus ranges. The Mac speech synthesizer is a bit buggy and
// occasionally returns a number way out of range.
if (characterRange.location > utterance.speechString.length ||
characterRange.length == 0) {
_ttsImplMac->OnSpeechEvent(GetUtteranceId(utterance), content::TTS_EVENT_WORD,
characterRange.location, characterRange.length,