chromium/chrome/browser/speech/cros_speech_recognition_service.cc

// Copyright 2021 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/browser/speech/cros_speech_recognition_service.h"

#include <memory>
#include <string>

#include "ash/constants/ash_features.h"
#include "base/containers/flat_map.h"
#include "base/files/file_path.h"
#include "base/notimplemented.h"
#include "base/types/optional_util.h"
#include "chrome/services/speech/audio_source_fetcher_impl.h"
#include "chrome/services/speech/buildflags/buildflags.h"
#include "chrome/services/speech/cros_speech_recognition_recognizer_impl.h"
#include "components/live_caption/pref_names.h"
#include "components/soda/constants.h"
#include "components/soda/soda_installer.h"
#include "components/user_prefs/user_prefs.h"
#include "content/public/browser/browser_context.h"
#include "content/public/browser/browser_task_traits.h"
#include "content/public/browser/browser_thread.h"
#include "content/public/browser/storage_partition.h"
#include "media/base/media_switches.h"
#include "media/mojo/mojom/speech_recognition.mojom.h"
#include "media/mojo/mojom/speech_recognition_service.mojom.h"
#include "services/network/public/cpp/shared_url_loader_factory.h"

#if BUILDFLAG(ENABLE_SERVER_BASED_RECOGNITION_RECOGNIZER)
#include "chrome/services/speech/internal/server_based_recognition_recognizer.h"
#endif  // BUILDFLAG(ENABLE_SERVER_BASED_RECOGNITION_RECOGNIZER)

namespace speech {

namespace {

// Message passed to mojo::ReportBadMessage() when a bind request carries
// SpeechRecognitionOptions that this service rejects.
// NOTE(review): the identifier is misspelled ("Recogntion"); a rename has to
// update every use in this file at the same time.
constexpr char kInvalidSpeechRecogntionOptions[] =
    "Invalid SpeechRecognitionOptions provided";

// Fills in the SODA file locations reported by speech::SodaInstaller.
//
// `binary_path` is overwritten with the installer's SODA binary path.
// `config_paths` receives one entry per installed language, keyed by the
// language name and mapping to the path the installer reports for that
// language. An existing entry under the same key is overwritten; entries under
// other keys are left untouched.
void PopulateFilePaths(
    base::FilePath& binary_path,
    base::flat_map<std::string, base::FilePath>& config_paths) {
  speech::SodaInstaller* soda_installer = speech::SodaInstaller::GetInstance();

  binary_path = soda_installer->GetSodaBinaryPath();
  for (const auto& language_code : soda_installer->InstalledLanguages()) {
    // Resolve the name once: it is both the map key and the argument used to
    // look up the language's path.
    const auto& language_name = GetLanguageName(language_code);
    config_paths[language_name] =
        soda_installer->GetLanguagePath(language_name);
  }
}

}  // namespace

// Constructs the service for `context`, which is forwarded unchanged to the
// ChromeSpeechRecognitionService base class.
CrosSpeechRecognitionService::CrosSpeechRecognitionService(
    content::BrowserContext* context)
    : ChromeSpeechRecognitionService(context) {}

// No explicit teardown is performed here; members are destroyed implicitly.
CrosSpeechRecognitionService::~CrosSpeechRecognitionService() = default;

// Adds `receiver` to `speech_recognition_contexts_`, passing `this` as the
// implementation for the media::mojom::SpeechRecognitionContext interface.
void CrosSpeechRecognitionService::BindSpeechRecognitionContext(
    mojo::PendingReceiver<media::mojom::SpeechRecognitionContext> receiver) {
  speech_recognition_contexts_.Add(this, std::move(receiver));
}

// Adds `receiver` to `audio_source_speech_recognition_contexts_`, passing
// `this` as the implementation for the
// media::mojom::AudioSourceSpeechRecognitionContext interface.
void CrosSpeechRecognitionService::BindAudioSourceSpeechRecognitionContext(
    mojo::PendingReceiver<media::mojom::AudioSourceSpeechRecognitionContext>
        receiver) {
  audio_source_speech_recognition_contexts_.Add(this, std::move(receiver));
}

// Binds an on-device recognizer for Live Caption.
//
// A request that asks for server-based recognition, or whose client type is
// not kLiveCaption, is reported as a bad message and the function returns
// without running `callback`. Otherwise
// CrosSpeechRecognitionRecognizerImpl::Create() is invoked with `receiver`,
// `client`, `options`, the SODA paths and the resolved language, and
// `callback` is run with the result of
// CrosSpeechRecognitionRecognizerImpl::IsMultichannelSupported().
void CrosSpeechRecognitionService::BindRecognizer(
    mojo::PendingReceiver<media::mojom::SpeechRecognitionRecognizer> receiver,
    mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient> client,
    media::mojom::SpeechRecognitionOptionsPtr options,
    BindRecognizerCallback callback) {
  // This binding is used by LiveCaption and it can't be server based
  // recognition.
  const bool is_live_caption_client =
      options->recognizer_client_type ==
      media::mojom::RecognizerClientType::kLiveCaption;
  if (options->is_server_based || !is_live_caption_client) {
    mojo::ReportBadMessage(kInvalidSpeechRecogntionOptions);
    return;
  }

  // The options should have locale set, but if they don't, pull from the prefs.
  std::string primary_language;
  if (options->language.has_value()) {
    primary_language = *options->language;
  } else {
    primary_language = prefs::GetLiveCaptionLanguageCode(
        user_prefs::UserPrefs::Get(context()));
  }

  // Locations of the SODA binary and of each installed language pack.
  base::FilePath soda_binary;
  base::flat_map<std::string, base::FilePath> language_configs;
  PopulateFilePaths(soda_binary, language_configs);

  // TODO(crbug.com/40924425): Implement offensive word mask on ChromeOS so that
  // mask_offensive_words is not hard-coded.
  CrosSpeechRecognitionRecognizerImpl::Create(
      std::move(receiver), std::move(client), std::move(options), soda_binary,
      language_configs, primary_language, /* mask_offensive_words= */ false);
  std::move(callback).Run(
      CrosSpeechRecognitionRecognizerImpl::IsMultichannelSupported());
}

// Not implemented by this service: the body only invokes NOTIMPLEMENTED() and
// none of the arguments are used.
void CrosSpeechRecognitionService::BindWebSpeechRecognizer(
    mojo::PendingReceiver<media::mojom::SpeechRecognitionSession>
        session_receiver,
    mojo::PendingRemote<media::mojom::SpeechRecognitionSessionClient>
        session_client,
    mojo::PendingReceiver<media::mojom::SpeechRecognitionAudioForwarder>
        audio_forwarder,
    int channel_count,
    int sample_rate,
    media::mojom::SpeechRecognitionOptionsPtr options,
    bool continuous) {
  NOTIMPLEMENTED();
}

// Binds an AudioSourceFetcher backed by either the on-device or the
// server-based recognizer, as selected by `options->is_server_based`.
//
// In both cases the fetcher is created by a task posted to the browser IO
// thread, and `callback` is run right after posting: with
// CrosSpeechRecognitionRecognizerImpl::IsMultichannelSupported() for on-device
// recognition, and with false for server-based recognition.
//
// A server-based request is reported as a bad message, without running
// `callback`, when the server-based recognizer is compiled out or when
// ash::features::IsInternalServerSideSpeechRecognitionEnabled() is false.
void CrosSpeechRecognitionService::BindAudioSourceFetcher(
    mojo::PendingReceiver<media::mojom::AudioSourceFetcher> fetcher_receiver,
    mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient> client,
    media::mojom::SpeechRecognitionOptionsPtr options,
    BindRecognizerCallback callback) {
  if (!options->is_server_based) {
    // The options should have locale set, but if they don't, pull from the
    // prefs.
    std::string primary_language;
    if (options->language.has_value()) {
      primary_language = *options->language;
    } else {
      primary_language = prefs::GetLiveCaptionLanguageCode(
          user_prefs::UserPrefs::Get(context()));
    }

    // Locations of the SODA binary and of each installed language pack.
    base::FilePath soda_binary;
    base::flat_map<std::string, base::FilePath> language_configs;
    PopulateFilePaths(soda_binary, language_configs);

    // CrosSpeechRecognitionService runs on browser UI thread.
    // Create AudioSourceFetcher on browser IO thread to avoid UI jank.
    // Note that its CrosSpeechRecognitionRecognizer must also run
    // on the IO thread. If CrosSpeechRecognitionService is moved away from
    // browser UI thread, we can call AudioSourceFetcherImpl::Create directly.
    // NOTE(review): the WeakPtr is obtained from `weak_factory_` here but the
    // bound method runs on the IO thread -- confirm this is compatible with
    // base::WeakPtr's sequence requirements.
    // TODO(crbug.com/40924425): Implement offensive word mask on ChromeOS so
    // that mask_offensive_words is not hard-coded.
    content::GetIOThreadTaskRunner({})->PostTask(
        FROM_HERE,
        base::BindOnce(
            &CrosSpeechRecognitionService::
                CreateAudioSourceFetcherForOnDeviceRecognitionOnIOThread,
            weak_factory_.GetWeakPtr(), std::move(fetcher_receiver),
            std::move(client), std::move(options), soda_binary,
            language_configs, primary_language,
            /* mask_offensive_words= */ false));
    std::move(callback).Run(
        CrosSpeechRecognitionRecognizerImpl::IsMultichannelSupported());
    return;
  }
#if BUILDFLAG(ENABLE_SERVER_BASED_RECOGNITION_RECOGNIZER)
  if (!ash::features::IsInternalServerSideSpeechRecognitionEnabled()) {
    // A request is made for a service that has not been enabled.
    mojo::ReportBadMessage(kInvalidSpeechRecogntionOptions);
    return;
  }
  // Obtain the loader factory here so that it can be handed over to the IO
  // thread task.
  auto pending_loader_factory =
      context()
          ->GetDefaultStoragePartition()
          ->GetURLLoaderFactoryForBrowserProcessIOThread();
  content::GetIOThreadTaskRunner({})->PostTask(
      FROM_HERE,
      base::BindOnce(
          &CrosSpeechRecognitionService::
              CreateAudioSourceFetcherForServerBasedRecognitionOnIOThread,
          weak_factory_.GetWeakPtr(), std::move(fetcher_receiver),
          std::move(client), std::move(options),
          std::move(pending_loader_factory)));
  std::move(callback).Run(/*is_multichannel_supported=*/false);
#else
  // Server-based recognition is compiled out, so the request cannot be served.
  mojo::ReportBadMessage(kInvalidSpeechRecogntionOptions);
#endif  // BUILDFLAG(ENABLE_SERVER_BASED_RECOGNITION_RECOGNIZER)
}

// Creates an AudioSourceFetcherImpl for `fetcher_receiver` that is driven by a
// CrosSpeechRecognitionRecognizerImpl built from the remaining arguments.
//
// DCHECKs that it runs on the browser IO thread and that `options` does not
// request server-based recognition.
void CrosSpeechRecognitionService::
    CreateAudioSourceFetcherForOnDeviceRecognitionOnIOThread(
        mojo::PendingReceiver<media::mojom::AudioSourceFetcher>
            fetcher_receiver,
        mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient>
            client,
        media::mojom::SpeechRecognitionOptionsPtr options,
        const base::FilePath& binary_path,
        const base::flat_map<std::string, base::FilePath>& config_paths,
        const std::string& primary_language_name,
        const bool mask_offensive_words) {
  DCHECK_CURRENTLY_ON(content::BrowserThread::IO);
  DCHECK(!options->is_server_based);

  auto on_device_recognizer =
      std::make_unique<CrosSpeechRecognitionRecognizerImpl>(
          std::move(client), std::move(options), binary_path, config_paths,
          primary_language_name, mask_offensive_words);
  const bool is_multichannel_supported =
      CrosSpeechRecognitionRecognizerImpl::IsMultichannelSupported();

  AudioSourceFetcherImpl::Create(std::move(fetcher_receiver),
                                 std::move(on_device_recognizer),
                                 is_multichannel_supported,
                                 /*is_server_based=*/false);
}

// Creates an AudioSourceFetcherImpl for `fetcher_receiver` that is driven by a
// ServerBasedRecognitionRecognizer, which is given a SharedURLLoaderFactory
// built from `pending_loader_factory`.
//
// DCHECKs that it runs on the browser IO thread and that `options` requests
// server-based recognition. The body is empty when
// ENABLE_SERVER_BASED_RECOGNITION_RECOGNIZER is not set.
void CrosSpeechRecognitionService::
    CreateAudioSourceFetcherForServerBasedRecognitionOnIOThread(
        mojo::PendingReceiver<media::mojom::AudioSourceFetcher>
            fetcher_receiver,
        mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient>
            client,
        media::mojom::SpeechRecognitionOptionsPtr options,
        std::unique_ptr<network::PendingSharedURLLoaderFactory>
            pending_loader_factory) {
#if BUILDFLAG(ENABLE_SERVER_BASED_RECOGNITION_RECOGNIZER)
  DCHECK_CURRENTLY_ON(content::BrowserThread::IO);
  DCHECK(options->is_server_based);

  auto url_loader_factory = network::SharedURLLoaderFactory::Create(
      std::move(pending_loader_factory));
  auto server_recognizer = std::make_unique<ServerBasedRecognitionRecognizer>(
      std::move(client), std::move(options), std::move(url_loader_factory));

  AudioSourceFetcherImpl::Create(std::move(fetcher_receiver),
                                 std::move(server_recognizer),
                                 /*is_multichannel_supported=*/false,
                                 /*is_server_based=*/true);
#endif  // BUILDFLAG(ENABLE_SERVER_BASED_RECOGNITION_RECOGNIZER)
}

}  // namespace speech