chromium/chrome/services/speech/cros_speech_recognition_recognizer_impl.cc

// Copyright 2021 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/services/speech/cros_speech_recognition_recognizer_impl.h"

#include <string>

#include "base/containers/flat_map.h"
#include "base/files/file_path.h"
#include "base/logging.h"
#include "chrome/services/speech/soda/cros_soda_client.h"
#include "components/soda/constants.h"
#include "components/soda/soda_installer.h"
#include "google_apis/google_api_keys.h"
#include "media/base/audio_buffer.h"
#include "media/base/audio_sample_types.h"
#include "media/base/audio_timestamp_helper.h"
#include "media/base/limits.h"
#include "media/base/media_switches.h"
#include "media/mojo/mojom/audio_data.mojom.h"
#include "media/mojo/mojom/media_types.mojom.h"
#include "mojo/public/cpp/bindings/self_owned_receiver.h"

namespace speech {

namespace {
// Message handed to mojo::ReportBadMessage() when a method in this file finds
// that cros_soda_client_ is null.
constexpr char kNoClientError[] = "No cros soda client.";

// Translates the media-layer recognition mode into the matching Chrome OS
// SODA recognition mode. kIme and kCaption map one-to-one; kUnknown is not
// expected here and is mapped to kCaption after hitting
// NOTREACHED_IN_MIGRATION().
chromeos::machine_learning::mojom::SodaRecognitionMode
GetSodaSpeechRecognitionMode(
    media::mojom::SpeechRecognitionMode recognition_mode) {
  using MediaMode = media::mojom::SpeechRecognitionMode;
  using SodaMode = chromeos::machine_learning::mojom::SodaRecognitionMode;

  switch (recognition_mode) {
    case MediaMode::kIme:
      return SodaMode::kIme;
    case MediaMode::kCaption:
      return SodaMode::kCaption;
    case MediaMode::kUnknown:
      // Chrome OS SODA has no "unknown" recognition type, so caption mode is
      // used as the default.
      NOTREACHED_IN_MIGRATION();
      return SodaMode::kCaption;
  }
}
}  // namespace

// Constructs a CrosSpeechRecognitionRecognizerImpl from the given arguments
// and binds it to |receiver| as a self-owned receiver, so the mojo binding
// holds the only owning pointer to the new instance.
void CrosSpeechRecognitionRecognizerImpl::Create(
    mojo::PendingReceiver<media::mojom::SpeechRecognitionRecognizer> receiver,
    mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient> remote,
    media::mojom::SpeechRecognitionOptionsPtr options,
    const base::FilePath& binary_path,
    const base::flat_map<std::string, base::FilePath>& config_paths,
    const std::string& primary_language_name,
    const bool mask_offensive_words) {
  auto recognizer = std::make_unique<CrosSpeechRecognitionRecognizerImpl>(
      std::move(remote), std::move(options), binary_path, config_paths,
      primary_language_name, mask_offensive_words);
  mojo::MakeSelfOwnedReceiver(std::move(recognizer), std::move(receiver));
}
// Defaulted destructor: no explicit teardown is performed here; each member is
// destroyed by its own destructor.
CrosSpeechRecognitionRecognizerImpl::~CrosSpeechRecognitionRecognizerImpl() =
    default;

// Forwards every argument to the SpeechRecognitionRecognizerImpl base class,
// records |binary_path| in binary_path_, and creates the CrosSodaClient used
// by the other methods in this file.
CrosSpeechRecognitionRecognizerImpl::CrosSpeechRecognitionRecognizerImpl(
    mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient> remote,
    media::mojom::SpeechRecognitionOptionsPtr options,
    const base::FilePath& binary_path,
    const base::flat_map<std::string, base::FilePath>& config_paths,
    const std::string& primary_language_name,
    const bool mask_offensive_words)
    : SpeechRecognitionRecognizerImpl(std::move(remote),
                                      std::move(options),
                                      binary_path,
                                      config_paths,
                                      primary_language_name,
                                      mask_offensive_words),
      binary_path_(binary_path) {
  // Only construction happens here; Reset() is called on the client later,
  // from SendAudioToSpeechRecognitionServiceInternal().
  cros_soda_client_ = std::make_unique<soda::CrosSodaClient>();
}

// Builds a SodaMultilangConfig whose locale_to_language_pack_map holds one
// entry per element of |config_paths|, except that:
//   * the entry for |primary_language_name| is left out, and
//   * entries whose key is not in |live_caption_languages| are left out
//     (each such skip is reported via VLOG(1)).
// The mapped value is the string form of the entry's file path.
chromeos::machine_learning::mojom::SodaMultilangConfigPtr
CrosSpeechRecognitionRecognizerImpl::AddLiveCaptionLanguagesToConfig(
    const std::string& primary_language_name,
    const base::flat_map<std::string, base::FilePath>& config_paths,
    const std::vector<std::string>& live_caption_languages) {
  auto multi_lang_config =
      chromeos::machine_learning::mojom::SodaMultilangConfig::New();

  for (const auto& [locale, language_pack_path] : config_paths) {
    // The primary language never goes into the multi-language map.
    if (locale == primary_language_name) {
      continue;
    }

    if (!base::Contains(live_caption_languages, locale)) {
      VLOG(1) << "Skipping multilang on captions of " << locale
              << " as it is not listed as a live caption language.";
      continue;
    }

    multi_lang_config->locale_to_language_pack_map[locale] =
        language_pack_path.value();
  }

  return multi_lang_config;
}

// Hands one audio buffer to the Chrome OS SODA client. If the client is not
// yet initialized, or the buffer's sample rate / channel count differs from
// what the client reports, the client is first Reset() with a freshly built
// SodaConfig. Returns early (after logging) when the payload size overflows
// or when there is no client.
void CrosSpeechRecognitionRecognizerImpl::
    SendAudioToSpeechRecognitionServiceInternal(
        media::mojom::AudioDataS16Ptr buffer) {
  const int audio_channels = buffer->channel_count;
  const int audio_sample_rate = buffer->sample_rate;

  // Payload length in bytes, computed with overflow checking.
  size_t payload_bytes = 0;
  const bool size_is_valid =
      base::CheckMul(buffer->data.size(), sizeof(buffer->data[0]))
          .AssignIfValid(&payload_bytes);
  if (!size_is_valid) {
    LOG(DFATAL) << "Size check invalid.";
    return;
  }

  if (!cros_soda_client_) {
    LOG(DFATAL) << "No soda client, stopping.";
    mojo::ReportBadMessage(kNoClientError);
    return;
  }

  // DidAudioPropertyChange() is only consulted once the client reports that
  // it is initialized.
  const bool must_reconfigure =
      !cros_soda_client_->IsInitialized() ||
      cros_soda_client_->DidAudioPropertyChange(audio_sample_rate,
                                                audio_channels);
  if (must_reconfigure) {
    auto soda_config = chromeos::machine_learning::mojom::SodaConfig::New();
    soda_config->channel_count = audio_channels;
    soda_config->sample_rate = audio_sample_rate;
    soda_config->api_key = google_apis::GetSodaAPIKey();
    // NOTE(review): operator[] yields a default (empty) path when the primary
    // language has no entry in the map -- confirm callers always provide one.
    soda_config->language_dlc_path =
        config_paths()[primary_language_name()].value();
    soda_config->library_dlc_path = binary_path_.value();
    soda_config->recognition_mode =
        GetSodaSpeechRecognitionMode(options_->recognition_mode);

    // Extra languages are only attached in caption mode, and only while the
    // multi-language live caption feature is enabled.
    const bool is_caption_mode =
        options_->recognition_mode ==
        media::mojom::SpeechRecognitionMode::kCaption;
    if (is_caption_mode &&
        base::FeatureList::IsEnabled(media::kLiveCaptionMultiLanguage)) {
      soda_config->multi_lang_config = AddLiveCaptionLanguagesToConfig(
          primary_language_name(), config_paths(),
          speech::SodaInstaller::GetInstance()
              ->GetLiveCaptionEnabledLanguages());
    }

    if (options_->enable_formatting) {
      soda_config->enable_formatting =
          chromeos::machine_learning::mojom::OptionalBool::kTrue;
    } else {
      soda_config->enable_formatting =
          chromeos::machine_learning::mojom::OptionalBool::kFalse;
    }

    cros_soda_client_->Reset(std::move(soda_config),
                             recognition_event_callback(),
                             speech_recognition_stopped_callback(),
                             language_identification_event_callback());
  }

  // The client takes the samples as a raw byte range.
  cros_soda_client_->AddAudio(reinterpret_cast<char*>(buffer->data.data()),
                              payload_bytes);
}

// Forwards MarkDone() to the SODA client. A missing client is logged and
// reported as a bad mojo message; a client that is not initialized is left
// untouched.
void CrosSpeechRecognitionRecognizerImpl::MarkDone() {
  if (!cros_soda_client_) {
    LOG(DFATAL) << "No soda client, stopping.";
    mojo::ReportBadMessage(kNoClientError);
    return;
  }

  // If speech recognition was stopped before the client could initialize,
  // there is nothing to forward.
  if (cros_soda_client_->IsInitialized()) {
    cros_soda_client_->MarkDone();
  }
}

}  // namespace speech