chromium/chrome/browser/speech/speech_recognition_service_browsertest.cc

// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include <algorithm>
#include <string_view>

#include "base/files/file_path.h"
#include "base/files/file_util.h"
#include "base/files/scoped_temp_dir.h"
#include "base/notreached.h"
#include "base/path_service.h"
#include "base/sync_socket.h"
#include "base/test/metrics/histogram_tester.h"
#include "base/threading/thread_restrictions.h"
#include "base/timer/timer.h"
#include "build/build_config.h"
#include "chrome/browser/browser_process.h"
#include "chrome/browser/profiles/profile.h"
#include "chrome/browser/speech/chrome_speech_recognition_service.h"
#include "chrome/browser/ui/browser.h"
#include "chrome/common/pref_names.h"
#include "chrome/services/speech/soda/soda_test_paths.h"
#include "chrome/services/speech/speech_recognition_recognizer_impl.h"
#include "chrome/test/base/in_process_browser_test.h"
#include "components/metrics/content/subprocess_metrics_provider.h"
#include "components/prefs/pref_service.h"
#include "components/soda/constants.h"
#include "components/soda/pref_names.h"
#include "content/public/browser/audio_service.h"
#include "content/public/common/content_switches.h"
#include "content/public/test/browser_test.h"
#include "media/audio/audio_device_description.h"
#include "media/audio/wav_audio_handler.h"
#include "media/base/audio_bus.h"
#include "media/mojo/mojom/audio_data.mojom.h"
#include "media/mojo/mojom/audio_data_pipe.mojom.h"
#include "media/mojo/mojom/audio_input_stream.mojom.h"
#include "media/mojo/mojom/audio_stream_factory.mojom.h"
#include "media/mojo/mojom/media_types.mojom.h"
#include "media/mojo/mojom/speech_recognition_service.mojom.h"
#include "sandbox/policy/switches.h"
#include "services/audio/public/cpp/fake_stream_factory.h"
#include "testing/gmock/include/gmock/gmock.h"
#include "third_party/abseil-cpp/absl/utility/utility.h"

#if BUILDFLAG(IS_WIN)
#include <windows.h>
#else
#include <unistd.h>
#endif

using testing::StrictMock;

namespace speech {

constexpr int kExpectedChannelCount = 1;

// TODO: Should be a way to generate this, this seems way too brittle.
const size_t kShMemSize = 82240;

class MockStream : public media::mojom::AudioInputStream {
 public:
  MOCK_METHOD0(Record, void());
  MOCK_METHOD1(SetVolume, void(double));
};

class TestStreamFactory : public audio::FakeStreamFactory {
 public:
  TestStreamFactory() : stream_(), stream_receiver_(&stream_) {}
  ~TestStreamFactory() override = default;
  void CreateInputStream(
      mojo::PendingReceiver<media::mojom::AudioInputStream> stream_receiver,
      mojo::PendingRemote<media::mojom::AudioInputStreamClient> client,
      mojo::PendingRemote<media::mojom::AudioInputStreamObserver> observer,
      mojo::PendingRemote<media::mojom::AudioLog> log,
      const std::string& device_id,
      const media::AudioParameters& params,
      uint32_t shared_memory_count,
      bool enable_agc,
      base::ReadOnlySharedMemoryRegion key_press_count_buffer,
      media::mojom::AudioProcessingConfigPtr processing_config,
      CreateInputStreamCallback created_callback) override {
    device_id_ = device_id;
    params_ = params;
    if (stream_receiver_.is_bound())
      stream_receiver_.reset();
    stream_receiver_.Bind(std::move(stream_receiver));
    if (client_)
      client_.reset();
    // Keep the passed client alive to avoid binding errors.
    client_.Bind(std::move(client));
    base::SyncSocket socket1, socket2;
    base::SyncSocket::CreatePair(&socket1, &socket2);
    std::move(created_callback)
        .Run({std::in_place,
              base::ReadOnlySharedMemoryRegion::Create(kShMemSize).region,
              mojo::PlatformHandle(socket1.Take())},
             false /*initially muted*/, base::UnguessableToken::Create());
  }

  mojo::PendingRemote<media::mojom::AudioStreamFactory> MakeRemote() {
    return receiver_.BindNewPipeAndPassRemote();
  }

  void WaitToCreateInputStream() {
    if (stream_receiver_.is_bound())
      return;
    base::RepeatingTimer check_timer;
    check_timer.Start(FROM_HERE, base::Milliseconds(10), this,
                      &TestStreamFactory::OnTimer);
    runner_.Run();
  }

  StrictMock<MockStream> stream_;
  mojo::Remote<media::mojom::AudioInputStreamClient> client_;
  mojo::Receiver<media::mojom::AudioInputStream> stream_receiver_;
  std::string device_id_;
  std::optional<media::AudioParameters> params_;

 private:
  void OnTimer() {
    if (stream_receiver_.is_bound())
      runner_.Quit();
  }

  base::RunLoop runner_;
};

class SpeechRecognitionServiceTest
    : public InProcessBrowserTest,
      public media::mojom::SpeechRecognitionRecognizerClient {
 public:
  SpeechRecognitionServiceTest() = default;
  SpeechRecognitionServiceTest(const SpeechRecognitionServiceTest&) = delete;
  SpeechRecognitionServiceTest& operator=(const SpeechRecognitionServiceTest&) =
      delete;

  ~SpeechRecognitionServiceTest() override = default;

  // InProcessBrowserTest
  void SetUp() override;
  void TearDownOnMainThread() override;

  // media::mojom::SpeechRecognitionRecognizerClient
  void OnSpeechRecognitionRecognitionEvent(
      const media::SpeechRecognitionResult& result,
      OnSpeechRecognitionRecognitionEventCallback reply) override;
  void OnSpeechRecognitionStopped() override;
  void OnSpeechRecognitionError() override;
  void OnLanguageIdentificationEvent(
      media::mojom::LanguageIdentificationEventPtr event) override;

  // Disable the sandbox on Windows and MacOS as the sandboxes on those
  // platforms have not been configured yet.
#if BUILDFLAG(IS_WIN) || BUILDFLAG(IS_MAC)
  void SetUpCommandLine(base::CommandLine* command_line) override {
    // Required for the utility process to access the directory containing the
    // test files.
    command_line->AppendSwitch(sandbox::policy::switches::kNoSandbox);
  }
#endif

 protected:
  void CloseCaptionBubble() {
    is_client_requesting_speech_recognition_ = false;
  }
  void SetUpPrefs();
  void LaunchService();
  void LaunchServiceWithAudioSourceFetcher();
  void SendAudioChunk(const std::vector<int16_t>& audio_data,
                      media::WavAudioHandler* handler,
                      size_t kMaxChunkSize);

  // The root directory for test files.
  base::FilePath test_data_dir_;

  mojo::Remote<media::mojom::AudioSourceSpeechRecognitionContext>
      audio_source_speech_recognition_context_;
  mojo::Remote<media::mojom::SpeechRecognitionContext>
      speech_recognition_context_;

  mojo::Remote<media::mojom::SpeechRecognitionRecognizer>
      speech_recognition_recognizer_;

  mojo::Remote<media::mojom::AudioSourceFetcher> audio_source_fetcher_;

  mojo::Receiver<media::mojom::SpeechRecognitionRecognizerClient>
      speech_recognition_client_receiver_{this};

  std::vector<std::string> recognition_results_;

  std::unique_ptr<ChromeSpeechRecognitionService> service_;

  bool is_client_requesting_speech_recognition_ = true;
};

void SpeechRecognitionServiceTest::SetUp() {
  ASSERT_TRUE(
      base::PathService::Get(base::DIR_SRC_TEST_DATA_ROOT, &test_data_dir_));
  InProcessBrowserTest::SetUp();
}

void SpeechRecognitionServiceTest::TearDownOnMainThread() {
  // The ChromeSpeechRecognitionService must be destroyed on the main thread.
  service_.reset();
}

void SpeechRecognitionServiceTest::OnSpeechRecognitionRecognitionEvent(
    const media::SpeechRecognitionResult& result,
    OnSpeechRecognitionRecognitionEventCallback reply) {
  std::string transcription = result.transcription;
  // The language pack used by the MacOS builder is newer and has punctuation
  // enabled whereas the one used by the Linux builder does not.
  transcription.erase(
      std::remove(transcription.begin(), transcription.end(), ','),
      transcription.end());
  recognition_results_.push_back(std::move(transcription));
  std::move(reply).Run(is_client_requesting_speech_recognition_);
}

void SpeechRecognitionServiceTest::OnSpeechRecognitionStopped() {
  NOTREACHED_IN_MIGRATION();
}

void SpeechRecognitionServiceTest::OnSpeechRecognitionError() {
  NOTREACHED_IN_MIGRATION();
}

void SpeechRecognitionServiceTest::OnLanguageIdentificationEvent(
    media::mojom::LanguageIdentificationEventPtr event) {
  NOTREACHED_IN_MIGRATION();
}

void SpeechRecognitionServiceTest::SetUpPrefs() {
  base::FilePath soda_binary_path;
#if BUILDFLAG(IS_WIN) || BUILDFLAG(IS_MAC)
  soda_binary_path =
      test_data_dir_.Append(base::FilePath(soda::kSodaResourcePath))
          .Append(soda::kSodaTestBinaryRelativePath);
#else
  base::FilePath soda_test_binary_path =
      test_data_dir_.Append(base::FilePath(soda::kSodaResourcePath))
          .Append(soda::kSodaTestBinaryRelativePath);
  DVLOG(0) << "SODA test path: " << soda_test_binary_path.value().c_str();
  base::ScopedAllowBlockingForTesting allow_blocking;
  ASSERT_TRUE(base::PathExists(soda_test_binary_path));

  soda_binary_path = GetSodaTestBinaryPath();
  DVLOG(0) << "SODA binary path: " << soda_binary_path.value().c_str();
  ASSERT_TRUE(base::PathExists(soda_binary_path));
#endif
  g_browser_process->local_state()->SetFilePath(prefs::kSodaBinaryPath,
                                                soda_binary_path);
  g_browser_process->local_state()->SetFilePath(
      prefs::kSodaEnUsConfigPath,
      test_data_dir_.Append(base::FilePath(soda::kSodaResourcePath))
          .Append(soda::kSodaLanguagePackRelativePath));
}

void SpeechRecognitionServiceTest::LaunchService() {
  // Launch the Speech Recognition service.
  auto* browser_context =
      static_cast<content::BrowserContext*>(browser()->profile());
  service_ = std::make_unique<ChromeSpeechRecognitionService>(browser_context);

  service_->BindSpeechRecognitionContext(
      speech_recognition_context_.BindNewPipeAndPassReceiver());

  bool is_multichannel_supported = true;
  auto run_loop = std::make_unique<base::RunLoop>();
  // Bind the recognizer pipes used to send audio and receive results.
  speech_recognition_context_->BindRecognizer(
      speech_recognition_recognizer_.BindNewPipeAndPassReceiver(),
      speech_recognition_client_receiver_.BindNewPipeAndPassRemote(),
      media::mojom::SpeechRecognitionOptions::New(
          media::mojom::SpeechRecognitionMode::kCaption,
          /*enable_formatting=*/true, kUsEnglishLocale,
          /*is_server_based=*/false,
          media::mojom::RecognizerClientType::kLiveCaption),
      base::BindOnce(
          [](bool* p_is_multichannel_supported, base::RunLoop* run_loop,
             bool is_multichannel_supported) {
            *p_is_multichannel_supported = is_multichannel_supported;
            run_loop->Quit();
          },
          &is_multichannel_supported, run_loop.get()));

  run_loop->Run();
  ASSERT_FALSE(is_multichannel_supported);
}

void SpeechRecognitionServiceTest::LaunchServiceWithAudioSourceFetcher() {
  // Launch the Speech Recognition service.
  auto* browser_context =
      static_cast<content::BrowserContext*>(browser()->profile());
  service_ = std::make_unique<ChromeSpeechRecognitionService>(browser_context);

  service_->BindAudioSourceSpeechRecognitionContext(
      audio_source_speech_recognition_context_.BindNewPipeAndPassReceiver());

  bool is_multichannel_supported = true;
  auto run_loop = std::make_unique<base::RunLoop>();
  // Bind the recognizer pipes used to send audio and receive results.
  audio_source_speech_recognition_context_->BindAudioSourceFetcher(
      audio_source_fetcher_.BindNewPipeAndPassReceiver(),
      speech_recognition_client_receiver_.BindNewPipeAndPassRemote(),
      media::mojom::SpeechRecognitionOptions::New(
          media::mojom::SpeechRecognitionMode::kIme,
          /*enable_formatting=*/false, kUsEnglishLocale),
      base::BindOnce(
          [](bool* p_is_multichannel_supported, base::RunLoop* run_loop,
             bool is_multichannel_supported) {
            *p_is_multichannel_supported = is_multichannel_supported;
            run_loop->Quit();
          },
          &is_multichannel_supported, run_loop.get()));
  run_loop->Run();
  ASSERT_FALSE(is_multichannel_supported);
}

void SpeechRecognitionServiceTest::SendAudioChunk(
    const std::vector<int16_t>& audio_data,
    media::WavAudioHandler* handler,
    size_t kMaxChunkSize) {
  int chunk_start = 0;
  // Upload chunks of 1024 frames at a time.
  while (chunk_start < static_cast<int>(audio_data.size())) {
    int chunk_size = kMaxChunkSize < audio_data.size() - chunk_start
                         ? kMaxChunkSize
                         : audio_data.size() - chunk_start;

    auto signed_buffer = media::mojom::AudioDataS16::New();
    signed_buffer->channel_count = kExpectedChannelCount;
    signed_buffer->frame_count = chunk_size;
    signed_buffer->sample_rate = handler->GetSampleRate();
    for (int i = 0; i < chunk_size; i++) {
      signed_buffer->data.push_back(audio_data[chunk_start + i]);
    }

    speech_recognition_recognizer_->SendAudioToSpeechRecognitionService(
        std::move(signed_buffer));
    chunk_start += chunk_size;

    // Sleep for 20ms to simulate real-time audio. SODA requires audio
    // streaming in order to return events.
#if BUILDFLAG(IS_WIN)
    ::Sleep(20);
#else
    usleep(20000);
#endif
  }
}

IN_PROC_BROWSER_TEST_F(SpeechRecognitionServiceTest, RecognizePhrase) {
  base::HistogramTester histograms;
  SetUpPrefs();
  LaunchService();

  std::string buffer;
  auto audio_file =
      test_data_dir_.Append(base::FilePath(soda::kSodaResourcePath))
          .Append(base::FilePath(soda::kSodaTestAudioRelativePath));
  {
    base::ScopedAllowBlockingForTesting allow_blocking;
    ASSERT_TRUE(base::PathExists(audio_file));
    ASSERT_TRUE(base::ReadFileToString(audio_file, &buffer));
  }

  auto handler = media::WavAudioHandler::Create(buffer);
  ASSERT_TRUE(handler.get());
  ASSERT_EQ(handler->GetNumChannels(), kExpectedChannelCount);

  auto bus = media::AudioBus::Create(kExpectedChannelCount,
                                     handler->total_frames_for_testing());

  size_t bytes_written = 0u;
  ASSERT_TRUE(handler->CopyTo(bus.get(), &bytes_written));

  std::vector<int16_t> audio_data(bus->frames());
  bus->ToInterleaved<media::SignedInt16SampleTypeTraits>(bus->frames(),
                                                         audio_data.data());

  constexpr size_t kMaxChunkSize = 1024;
  constexpr int kReplayAudioCount = 2;
  for (int i = 0; i < kReplayAudioCount; i++) {
    SendAudioChunk(audio_data, handler.get(), kMaxChunkSize);
  }

  speech_recognition_recognizer_.reset();
  base::RunLoop().RunUntilIdle();

  // Sleep for 100ms to ensure SODA has returned real-time results.
#if BUILDFLAG(IS_WIN)
  ::Sleep(100);
#else
  usleep(100000);
#endif
  ASSERT_GT(static_cast<int>(recognition_results_.size()), kReplayAudioCount);
  ASSERT_EQ(recognition_results_.back(), "Hey Google Hey Google");

  metrics::SubprocessMetricsProvider::MergeHistogramDeltasForTesting();
  histograms.ExpectUniqueTimeSample(
      SpeechRecognitionRecognizerImpl::kCaptionBubbleVisibleHistogramName,
      base::Milliseconds(2520), 1);
  histograms.ExpectTotalCount(
      SpeechRecognitionRecognizerImpl::kCaptionBubbleHiddenHistogramName, 0);
}

IN_PROC_BROWSER_TEST_F(SpeechRecognitionServiceTest,
                       ClosingCaptionBubbleStopsRecognition) {
  base::HistogramTester histograms;
  SetUpPrefs();
  LaunchService();

  std::string buffer;
  auto audio_file =
      test_data_dir_.Append(base::FilePath(soda::kSodaResourcePath))
          .Append(base::FilePath(soda::kSodaTestAudioRelativePath));
  {
    base::ScopedAllowBlockingForTesting allow_blocking;
    ASSERT_TRUE(base::PathExists(audio_file));
    ASSERT_TRUE(base::ReadFileToString(audio_file, &buffer));
  }

  auto handler = media::WavAudioHandler::Create(buffer);
  ASSERT_TRUE(handler.get());
  ASSERT_EQ(handler->GetNumChannels(), kExpectedChannelCount);

  auto bus = media::AudioBus::Create(kExpectedChannelCount,
                                     handler->total_frames_for_testing());

  size_t bytes_written = 0u;
  ASSERT_TRUE(handler->CopyTo(bus.get(), &bytes_written));

  std::vector<int16_t> audio_data(bus->frames());
  bus->ToInterleaved<media::SignedInt16SampleTypeTraits>(bus->frames(),
                                                         audio_data.data());
  constexpr size_t kMaxChunkSize = 1024;

  // Send an audio chunk to the service. It will output "Hey Google". When the
  // client receives the result, it responds to the service with `success =
  // true`, informing the speech recognition service that it still wants
  // transcriptions.
  SendAudioChunk(audio_data, handler.get(), kMaxChunkSize);
  base::RunLoop().RunUntilIdle();

  // Close caption bubble. This means that the next time the client receives a
  // transcription, it will respond to the speech service with `success =
  // false`, informing the speech recognition service that it is no longer
  // requesting speech recognition.
  CloseCaptionBubble();

  // Send an audio chunk to the service. It will output "Hey Google". When the
  // client receives the result, it responds to the service with `success =
  // false`, informing the speech recognition service that it is no longer
  // requesting speech recognition.
  SendAudioChunk(audio_data, handler.get(), kMaxChunkSize);
  base::RunLoop().RunUntilIdle();

  // Send an audio chunk to the service. It does not get transcribed.
  SendAudioChunk(audio_data, handler.get(), kMaxChunkSize);

  speech_recognition_recognizer_.reset();
  base::RunLoop().RunUntilIdle();

  // Sleep for 100ms to ensure SODA has returned real-time results.
#if BUILDFLAG(IS_WIN)
  ::Sleep(100);
#else
  usleep(100000);
#endif
  ASSERT_GT(static_cast<int>(recognition_results_.size()), 3);
  ASSERT_EQ(recognition_results_.back(), "Hey Google Hey Google");

  metrics::SubprocessMetricsProvider::MergeHistogramDeltasForTesting();
  histograms.ExpectUniqueTimeSample(
      SpeechRecognitionRecognizerImpl::kCaptionBubbleVisibleHistogramName,
      base::Milliseconds(2520), 1);
  histograms.ExpectUniqueTimeSample(
      SpeechRecognitionRecognizerImpl::kCaptionBubbleHiddenHistogramName,
      base::Milliseconds(1260), 1);
}

IN_PROC_BROWSER_TEST_F(SpeechRecognitionServiceTest, CreateAudioSourceFetcher) {
  base::HistogramTester histograms;
  SetUpPrefs();
  LaunchServiceWithAudioSourceFetcher();

  // TODO(crbug.com/40753481): Check implementation / sandbox policy on Mac and
  // Windows.
#if BUILDFLAG(IS_CHROMEOS) || BUILDFLAG(IS_LINUX)
  // Check that Start begins audio recording.
  // TODO(crbug.com/40166991): Try to mock audio input, maybe with
  // TestStreamFactory::stream_, to test end-to-end.
  std::string device_id = media::AudioDeviceDescription::kDefaultDeviceId;
  media::AudioParameters params(media::AudioParameters::AUDIO_PCM_LOW_LATENCY,
                                media::ChannelLayoutConfig::Stereo(), 10000,
                                1000);

  // Create a fake stream factory.
  std::unique_ptr<StrictMock<TestStreamFactory>> stream_factory =
      std::make_unique<StrictMock<TestStreamFactory>>();
  EXPECT_CALL(stream_factory->stream_, Record());
  audio_source_fetcher_->Start(stream_factory->MakeRemote(), device_id, params);
  stream_factory->WaitToCreateInputStream();

  EXPECT_EQ(device_id, stream_factory->device_id_);
  ASSERT_TRUE(stream_factory->params_);
  EXPECT_TRUE(params.Equals(stream_factory->params_.value()));
#endif

  audio_source_fetcher_->Stop();
  base::RunLoop().RunUntilIdle();
}

IN_PROC_BROWSER_TEST_F(SpeechRecognitionServiceTest, CompromisedRenderer) {
  // Create temporary SODA files.
  SetUpPrefs();
  base::ScopedAllowBlockingForTesting allow_blocking;
  base::FilePath config_dir =
      GetSodaLanguagePacksDirectory()
          .AppendASCII(kUsEnglishLocale)
          .Append(FILE_PATH_LITERAL("1.1.1"))
          .Append(kSodaLanguagePackDirectoryRelativePath);
  base::CreateDirectory(config_dir);
  ASSERT_TRUE(base::PathExists(config_dir));
  base::FilePath config_file_path =
      config_dir.Append(FILE_PATH_LITERAL("config_file"));
  ASSERT_TRUE(base::WriteFile(config_file_path, std::string_view()));
  ASSERT_TRUE(base::PathExists(config_file_path));
  g_browser_process->local_state()->SetFilePath(prefs::kSodaEnUsConfigPath,
                                                config_file_path);

  // Launch the Speech Recognition service.
  auto* browser_context =
      static_cast<content::BrowserContext*>(browser()->profile());
  service_ = std::make_unique<ChromeSpeechRecognitionService>(browser_context);
  service_->BindSpeechRecognitionContext(
      speech_recognition_context_.BindNewPipeAndPassReceiver());

  // Bind the recognizer pipes used to send audio and receive results.
  auto run_loop = std::make_unique<base::RunLoop>();
  speech_recognition_context_->BindRecognizer(
      speech_recognition_recognizer_.BindNewPipeAndPassReceiver(),
      speech_recognition_client_receiver_.BindNewPipeAndPassRemote(),
      media::mojom::SpeechRecognitionOptions::New(
          media::mojom::SpeechRecognitionMode::kCaption,
          /*enable_formatting=*/true, kUsEnglishLocale,
          /*is_server_based=*/false,
          media::mojom::RecognizerClientType::kLiveCaption),
      base::BindOnce([](base::RunLoop* run_loop,
                        bool is_multichannel_supported) { run_loop->Quit(); },
                     run_loop.get()));
  run_loop->Run();

  // Simulate a compromised renderer by changing the language and immediately
  // resetting the recognizer and verify that the subsequent callbacks do not
  // cause any crashes.
  speech_recognition_recognizer_->OnLanguageChanged(kUsEnglishLocale);
  speech_recognition_recognizer_.reset();
  base::RunLoop().RunUntilIdle();
}

}  // namespace speech