chromium/content/browser/speech/tts_mac.mm

// Copyright 2012 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#import "content/browser/speech/tts_mac.h"

#import <AVFAudio/AVFAudio.h>
#import <AppKit/AppKit.h>
#include <objc/runtime.h>

#include <algorithm>
#include <string>

#include "base/apple/foundation_util.h"
#include "base/functional/bind.h"
#include "base/memory/raw_ptr.h"
#include "base/no_destructor.h"
#include "base/strings/sys_string_conversions.h"
#include "base/values.h"
#include "content/public/browser/tts_controller.h"

namespace {

constexpr int kNoLength = -1;
constexpr char kNoError[] = "";

std::vector<content::VoiceData>& VoicesRef() {
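  // The vector is initialized exactly once by an immediately-invoked lambda
  // whose real job is a one-time side effect: registering an observer that
  // clears the cache so that Voices() below rebuilds it on demand.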
  static base::NoDestructor<std::vector<content::VoiceData>> voices([]() {
    [NSNotificationCenter.defaultCenter
        addObserverForName:NSApplicationWillBecomeActiveNotification
                    object:nil
                     queue:nil
                usingBlock:^(NSNotification* notification) {
                  // The user might have switched to Settings or some other app
                  // to change voices or locale settings. Avoid a stale cache by
                  // forcing a rebuild of the voices vector after the app
                  // becomes active.
                  VoicesRef().clear();
                }];
    return std::vector<content::VoiceData>();
  }());

  return *voices;
}

AVSpeechSynthesisVoice* GetSystemDefaultVoice() {
  // This should be
  //
  //   [AVSpeechSynthesisVoice voiceWithLanguage:nil]
  //
  // but that has a bug (https://crbug.com/1484940#c9, FB13197951). In short,
  // while passing nil to -[AVSpeechSynthesisVoice voiceWithLanguage:] does
  // indeed return "the default voice for the system’s language and region",
  // that's not necessarily the voice that the user selected in System Settings
  // > Accessibility > Spoken Content, and that user voice selection is the only
  // one that matters. There does not appear to be an AVSpeechSynthesis API that
  // returns that user choice, so use the deprecated NSSpeechSynthesizer API,
  // which behaves correctly.
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
  NSString* default_voice_identifier = NSSpeechSynthesizer.defaultVoice;
#pragma clang diagnostic pop
  return [AVSpeechSynthesisVoice voiceWithIdentifier:default_voice_identifier];
}

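// Returns the cached voice list, rebuilding it from AVSpeechSynthesisVoice
// whenever the cache is empty (at startup, or after VoicesRef() cleared it on
// app activation).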
std::vector<content::VoiceData>& Voices() {
  std::vector<content::VoiceData>& voices = VoicesRef();
  if (!voices.empty()) {
    return voices;
  }

  NSMutableArray<AVSpeechSynthesisVoice*>* av_speech_voices =
      [[AVSpeechSynthesisVoice.speechVoices sortedArrayUsingDescriptors:@[
        [NSSortDescriptor sortDescriptorWithKey:@"name" ascending:YES]
      ]] mutableCopy];
  AVSpeechSynthesisVoice* default_voice = GetSystemDefaultVoice();
  if (default_voice) {
    [av_speech_voices removeObject:default_voice];
    [av_speech_voices insertObject:default_voice atIndex:0];
  }

  // When multiple voices share a name but differ in language, the old API
  // (NSSpeechSynthesizer) would append locale information to the names, while
  // the current API (AVSpeechSynthesisVoice) does not. Because returning a
  // bunch of voices with the same name isn't helpful, count how often each
  // name is used so that, later on, locale information can be appended for
  // disambiguation where necessary.
  NSMutableDictionary<NSString*, NSNumber*>* name_counts =
      [NSMutableDictionary dictionary];
  for (AVSpeechSynthesisVoice* av_speech_voice in av_speech_voices) {
    NSString* voice_name = av_speech_voice.name;
    if (!voice_name) {
      // AVSpeechSynthesisVoice.name is not a nullable property, but there are
      // crashes (https://crbug.com/1459235) where -setObject:forKeyedSubscript:
      // is passed a nil key; the only such call in this function is the
      // dictionary assignment just below, so skip nil names defensively.
      continue;
    }
    if (NSNumber* count = name_counts[voice_name]) {
      name_counts[voice_name] = @(count.intValue + 1);
    } else {
      name_counts[voice_name] = @1;
    }
  }

  voices.reserve(av_speech_voices.count);
  for (AVSpeechSynthesisVoice* av_speech_voice in av_speech_voices) {
    NSString* voice_name = av_speech_voice.name;
    if (!voice_name) {
      // AVSpeechSynthesisVoice.name is not a nullable property, but there are
      // crashes (https://crbug.com/1459235) where it appears to return nil.
      // Without a name, a voice is useless, so skip it.
      continue;
    }

    voices.emplace_back();
    content::VoiceData& data = voices.back();

    if (name_counts[voice_name].intValue > 1) {
      // The language property on a voice is a BCP 47 code (i.e. "en-US") while
      // an NSLocale locale identifier isn't (i.e. "en_US"). However, using the
      // BCP 47 code as if it were a locale identifier works just fine (tested
      // back to 10.15).
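      //
      // For example (assuming both an en-US and an fr-FR voice named "Eddy"
      // are installed), the two entries become "Eddy (English (United
      // States))" and "Eddy (French (France))", with the parenthetical
      // localized to the user's current locale.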
      NSString* localized_language = [NSLocale.autoupdatingCurrentLocale
          localizedStringForLocaleIdentifier:av_speech_voice.language];
      voice_name = [NSString
          stringWithFormat:@"%@ (%@)", voice_name, localized_language];
    }

    data.native = true;
    data.native_voice_identifier =
        base::SysNSStringToUTF8(av_speech_voice.identifier);
    data.name = base::SysNSStringToUTF8(voice_name);
    data.lang = base::SysNSStringToUTF8(av_speech_voice.language);

    data.events.insert(content::TTS_EVENT_START);
    data.events.insert(content::TTS_EVENT_END);
    data.events.insert(content::TTS_EVENT_WORD);
    data.events.insert(content::TTS_EVENT_PAUSE);
    data.events.insert(content::TTS_EVENT_RESUME);
  }

  return voices;
}

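// AVSpeechUtterance has no user-data property, so the Chromium utterance id
// is attached to the object as an Objective-C associated object;
// GetUtteranceId() reads it back when delegate callbacks later deliver the
// utterance. @selector(identifier) serves only as a unique, stable
// association key.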
AVSpeechUtterance* MakeUtterance(int utterance_id,
                                 const std::string& utterance_string) {
  AVSpeechUtterance* utterance = [AVSpeechUtterance
      speechUtteranceWithString:base::SysUTF8ToNSString(utterance_string)];
  objc_setAssociatedObject(utterance, @selector(identifier), @(utterance_id),
                           OBJC_ASSOCIATION_RETAIN);
  return utterance;
}

int GetUtteranceId(AVSpeechUtterance* utterance) {
  NSNumber* identifier = base::apple::ObjCCast<NSNumber>(
      objc_getAssociatedObject(utterance, @selector(identifier)));
  if (identifier) {
    return identifier.intValue;
  }
  return TtsPlatformImplMac::kInvalidUtteranceId;
}

}  // namespace

// static
content::TtsPlatformImpl* content::TtsPlatformImpl::GetInstance() {
  return TtsPlatformImplMac::GetInstance();
}

TtsPlatformImplMac::~TtsPlatformImplMac() = default;

bool TtsPlatformImplMac::PlatformImplSupported() {
  return true;
}

bool TtsPlatformImplMac::PlatformImplInitialized() {
  return true;
}

void TtsPlatformImplMac::Speak(
    int utterance_id,
    const std::string& utterance,
    const std::string& lang,
    const content::VoiceData& voice,
    const content::UtteranceContinuousParameters& params,
    base::OnceCallback<void(bool)> on_speak_finished) {
  // Strip SSML, then process the plain text. TODO(crbug.com/40273591):
  // AVSpeechUtterance has an initializer -initWithSSMLRepresentation:. Should
  // that be used instead?
  content::TtsController::GetInstance()->StripSSML(
      utterance, base::BindOnce(&TtsPlatformImplMac::ProcessSpeech,
                                base::Unretained(this), utterance_id, lang,
                                voice, params, std::move(on_speak_finished)));
}

void TtsPlatformImplMac::ProcessSpeech(
    int utterance_id,
    const std::string& lang,
    const content::VoiceData& voice,
    const content::UtteranceContinuousParameters& params,
    base::OnceCallback<void(bool)> on_speak_finished,
    const std::string& parsed_utterance) {
  utterance_ = parsed_utterance;
  paused_ = false;
  utterance_id_ = utterance_id;

  AVSpeechUtterance* speech_utterance =
      MakeUtterance(utterance_id, parsed_utterance);
  if (!speech_utterance) {
    std::move(on_speak_finished).Run(false);
    return;
  }

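  // -voiceWithIdentifier: returns nil for an identifier it does not
  // recognize; a nil voice makes AVSpeechSynthesizer fall back to the default
  // voice for the user's current language.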
  speech_utterance.voice = [AVSpeechSynthesisVoice
      voiceWithIdentifier:base::SysUTF8ToNSString(
                              voice.native_voice_identifier)];

  if (params.rate >= 0.0) {
    // The two relevant APIs have different ranges:
    // - Web Speech API is [.1, 10] with default 1
    // - AVSpeechSynthesizer is [0, 1] with default .5
    //
    // Speeds in the Web Speech API other than 1 (the default rate) are meant to
    // be multiples of the default speaking rate.
    //
    // The mapping of AVSpeechSynthesizer speeds was done experimentally, using
    // the fourth paragraph of _A Tale of Two Cities_. With the "Samantha"
    // voice, AVSpeechUtteranceDefaultSpeechRate takes about 80s to read the
    // paragraph, while AVSpeechUtteranceMaximumSpeechRate takes about 20s.
    // Therefore, map
    //
    // 1 → AVSpeechUtteranceDefaultSpeechRate
    // 4 → AVSpeechUtteranceMaximumSpeechRate
    //
    // and cap anything higher.
    //
    // References:
    //
    // https://developer.mozilla.org/en-US/docs/Web/API/SpeechSynthesisUtterance/rate
    // https://github.com/WebKit/WebKit/blob/main/Source/WebCore/platform/cocoa/PlatformSpeechSynthesizerCocoa.mm
    //  ^ This is the WebKit implementation. It appears to have a bug in
    //    scaling, where a Web Speech API rate of 2 is scaled to
    //    AVSpeechUtteranceMaximumSpeechRate and the value passed to the
    //    AVSpeechSynthesizer goes up from there. A bug was filed about this:
    //    https://bugs.webkit.org/show_bug.cgi?id=258587
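    //
    // Worked example: with AVSpeechUtteranceDefaultSpeechRate = 0.5 and
    // AVSpeechUtteranceMaximumSpeechRate = 1.0 (the "[0, 1] with default .5"
    // range noted above), a Web Speech API rate of 2.5 has an excess of 1.5
    // over the default, which is half of the supported excess of 3, so the
    // mapped rate is 0.5 + 0.5 * (1.0 - 0.5) = 0.75.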

    float rate = params.rate;
    if (rate < 1) {
      // If a slower than normal rate is requested, scale the default speech
      // rate down proportionally.
      rate *= AVSpeechUtteranceDefaultSpeechRate;
    } else {
      // Scale the AVSpeech rate headroom proportionally to the excess above 1
      // in the Web Speech API rate, capping at a Web Speech API value of 4.
      const float kWebSpeechDefault = 1;
      const float kWebSpeechMaxSupported = 4;
      const float kAVSpeechRateHeadroom = AVSpeechUtteranceMaximumSpeechRate -
                                          AVSpeechUtteranceDefaultSpeechRate;
      const float excess = rate - kWebSpeechDefault;
      const float capped_excess =
          std::min(excess, (kWebSpeechMaxSupported - kWebSpeechDefault));
      const float headroom_proportion =
          capped_excess / (kWebSpeechMaxSupported - kWebSpeechDefault);
      rate = AVSpeechUtteranceDefaultSpeechRate +
             headroom_proportion * kAVSpeechRateHeadroom;
    }

    speech_utterance.rate = rate;
  }

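  // The Web Speech API pitch ([0, 2] with default 1) maps directly onto
  // AVSpeechUtterance.pitchMultiplier, documented as [0.5, 2] with default 1.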
  if (params.pitch >= 0.0) {
    speech_utterance.pitchMultiplier = params.pitch;
  }

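  // Both the Web Speech API volume and AVSpeechUtterance.volume use [0, 1]
  // with a default of 1, so the value passes through unchanged.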
  if (params.volume >= 0.0) {
    speech_utterance.volume = params.volume;
  }

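  // -speakUtterance: enqueues the utterance and returns immediately; progress
  // and completion are reported through the ChromeTtsDelegate callbacks
  // below, so report success as soon as the utterance is accepted.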
  [speech_synthesizer_ speakUtterance:speech_utterance];
  std::move(on_speak_finished).Run(true);
}

bool TtsPlatformImplMac::StopSpeaking() {
  [speech_synthesizer_ stopSpeakingAtBoundary:AVSpeechBoundaryImmediate];
  paused_ = false;
  return true;
}

void TtsPlatformImplMac::Pause() {
  if (!paused_) {
    [speech_synthesizer_ pauseSpeakingAtBoundary:AVSpeechBoundaryImmediate];
    paused_ = true;
    content::TtsController::GetInstance()->OnTtsEvent(
        utterance_id_, content::TTS_EVENT_PAUSE, last_char_index_, kNoLength,
        kNoError);
  }
}

void TtsPlatformImplMac::Resume() {
  if (paused_) {
    [speech_synthesizer_ continueSpeaking];
    paused_ = false;
    content::TtsController::GetInstance()->OnTtsEvent(
        utterance_id_, content::TTS_EVENT_RESUME, last_char_index_, kNoLength,
        kNoError);
  }
}

bool TtsPlatformImplMac::IsSpeaking() {
  return speech_synthesizer_.speaking;
}

void TtsPlatformImplMac::GetVoices(
    std::vector<content::VoiceData>* out_voices) {
  *out_voices = Voices();
}

void TtsPlatformImplMac::OnSpeechEvent(int utterance_id,
                                       content::TtsEventType event_type,
                                       int char_index,
                                       int char_length,
                                       const std::string& error_message) {
  // Don't send events from an utterance that's already completed.
  if (utterance_id != utterance_id_) {
    return;
  }

  if (event_type == content::TTS_EVENT_END) {
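    // The didFinishSpeechUtterance: callback reports a char_index of 0;
    // rewrite it to the utterance length so listeners observe the final
    // position.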
    char_index = utterance_.size();
  }

  content::TtsController::GetInstance()->OnTtsEvent(
      utterance_id_, event_type, char_index, char_length, error_message);
  last_char_index_ = char_index;
}

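// A single AVSpeechSynthesizer and delegate are created once and reused for
// all utterances; the delegate forwards synthesizer callbacks back to this
// object as TTS events.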
TtsPlatformImplMac::TtsPlatformImplMac()
    : speech_synthesizer_([[AVSpeechSynthesizer alloc] init]),
      delegate_([[ChromeTtsDelegate alloc] initWithPlatformImplMac:this]) {
  speech_synthesizer_.delegate = delegate_;
}

// static
TtsPlatformImplMac* TtsPlatformImplMac::GetInstance() {
  static base::NoDestructor<TtsPlatformImplMac> tts_platform;
  return tts_platform.get();
}

// static
std::vector<content::VoiceData>& TtsPlatformImplMac::VoicesRefForTesting() {
  return VoicesRef();
}

@implementation ChromeTtsDelegate {
  raw_ptr<TtsPlatformImplMac> _ttsImplMac;  // weak.
}

- (id)initWithPlatformImplMac:(TtsPlatformImplMac*)ttsImplMac {
  if ((self = [super init])) {
    _ttsImplMac = ttsImplMac;
  }
  return self;
}

- (void)speechSynthesizer:(AVSpeechSynthesizer*)synthesizer
    didStartSpeechUtterance:(AVSpeechUtterance*)utterance {
  _ttsImplMac->OnSpeechEvent(GetUtteranceId(utterance),
                             content::TTS_EVENT_START, /*char_index=*/0,
                             kNoLength, kNoError);
}

- (void)speechSynthesizer:(AVSpeechSynthesizer*)synthesizer
    didFinishSpeechUtterance:(AVSpeechUtterance*)utterance {
  _ttsImplMac->OnSpeechEvent(GetUtteranceId(utterance), content::TTS_EVENT_END,
                             /*char_index=*/0, kNoLength, kNoError);
}

- (void)speechSynthesizer:(AVSpeechSynthesizer*)synthesizer
    willSpeakRangeOfSpeechString:(NSRange)characterRange
                       utterance:(AVSpeechUtterance*)utterance {
  // Ignore bogus ranges. The Mac speech synthesizer is a bit buggy and
  // occasionally reports a range that starts past the end of the string, or
  // an empty range.
  if (characterRange.location > utterance.speechString.length ||
      characterRange.length == 0) {
    return;
  }
  _ttsImplMac->OnSpeechEvent(GetUtteranceId(utterance), content::TTS_EVENT_WORD,
                             characterRange.location, characterRange.length,
                             kNoError);
}

@end