// network_speech_recognition_engine_impl.cc

// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "content/browser/speech/network_speech_recognition_engine_impl.h"

#include <algorithm>
#include <memory>
#include <string_view>
#include <vector>

#include "base/functional/bind.h"
#include "base/metrics/histogram_functions.h"
#include "base/numerics/byte_conversions.h"
#include "base/numerics/safe_conversions.h"
#include "base/rand_util.h"
#include "base/strings/escape.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "base/time/time.h"
#include "components/speech/audio_buffer.h"
#include "content/public/browser/google_streaming_api.pb.h"
#include "google_apis/google_api_keys.h"
#include "media/base/audio_timestamp_helper.h"
#include "media/mojo/mojom/speech_recognition_error.mojom.h"
#include "media/mojo/mojom/speech_recognition_result.mojom.h"
#include "mojo/public/c/system/types.h"
#include "mojo/public/cpp/bindings/receiver_set.h"
#include "net/base/load_flags.h"
#include "net/traffic_annotation/network_traffic_annotation.h"
#include "services/network/public/cpp/shared_url_loader_factory.h"

namespace content {
namespace {

// String constants used to address the speech web service.
// |kWebServiceBaseUrl| can be overridden in tests through
// |web_service_base_url_for_tests|.
// NOTE(review): |kDownstreamUrl| and |kUpstreamUrl| look like the per-stream
// path fragments appended to the base URL — confirm at the use site.
const char kWebServiceBaseUrl[] = …;
const char kDownstreamUrl[] = …;
const char kUpstreamUrl[] = …;

// NOTE(review): presumably the name of the histogram that records the audio
// duration (base/metrics/histogram_functions.h is included) — confirm at the
// use site.
constexpr char kWebSpeechAudioDuration[] = …;

// Used to override |kWebServiceBaseUrl| when non-null, only set in tests.
// NOTE(review): presumably written by
// NetworkSpeechRecognitionEngineImpl::set_web_service_base_url_for_tests();
// as a raw pointer it does not own the string it points to.
const char* web_service_base_url_for_tests = …;

// Largest maxAlternatives value the engine will use. This matches the maximum
// maxAlternatives value supported by the server.
const uint32_t kMaxMaxAlternatives = …;

// Logging helper that takes a raw response string.
// TODO(hans): Remove this and other logging when we don't need it anymore.
void DumpResponse(const std::string& response) { … }

// Default values for the recognition configuration.
// NOTE(review): presumably consumed by the Config constructor's
// member-initializer list — confirm.
const int kDefaultConfigSampleRate = …;
const int kDefaultConfigBitsPerSample = …;
const uint32_t kDefaultMaxHypotheses = …;

}  // namespace

// Default-constructs a Config; all initial field values are set in the
// member-initializer list.
// NOTE(review): presumably seeded from the kDefault* constants in the
// anonymous namespace — confirm.
NetworkSpeechRecognitionEngineImpl::Config::Config()
    : … { … }

// Defined out of line; no custom teardown.
NetworkSpeechRecognitionEngineImpl::Config::~Config() = default;

// Out-of-line definitions of the class's static integer constants.
// NOTE(review): |kAudioPacketIntervalMs| is in milliseconds per its name, and
// the two kWebserviceStatus* values are presumably compared against the status
// reported in the web service response — confirm.
const int NetworkSpeechRecognitionEngineImpl::kAudioPacketIntervalMs = …;
const int NetworkSpeechRecognitionEngineImpl::kWebserviceStatusNoError = …;
const int NetworkSpeechRecognitionEngineImpl::kWebserviceStatusErrorNoMatch = …;

// Constructs the engine from the URL loader factory it should use and an
// accept-language string. |shared_url_loader_factory| is taken by value (a
// scoped_refptr); |accept_language| is taken by const reference.
// NOTE(review): presumably both are stored in members by the initializer list
// — confirm.
NetworkSpeechRecognitionEngineImpl::NetworkSpeechRecognitionEngineImpl(
    scoped_refptr<network::SharedURLLoaderFactory> shared_url_loader_factory,
    const std::string& accept_language)
    : … { … }

// User-provided (not defaulted) destructor.
NetworkSpeechRecognitionEngineImpl::~NetworkSpeechRecognitionEngineImpl() { … }

// Test-only hook taking the base URL that tests want the engine to use.
// NOTE(review): presumably stores the raw pointer in
// |web_service_base_url_for_tests| without copying, in which case the pointee
// must outlive every use — confirm.
void NetworkSpeechRecognitionEngineImpl::set_web_service_base_url_for_tests(
    const char* base_url_for_tests) { … }

// Supplies the recognition configuration to the engine.
// NOTE(review): presumably copies |config| into a member — confirm.
void NetworkSpeechRecognitionEngineImpl::SetConfig(const Config& config) { … }

// Const query; per its name, reports whether a recognition is pending.
// NOTE(review): presumably derived from the current FSM state — confirm.
bool NetworkSpeechRecognitionEngineImpl::IsRecognitionPending() const { … }

// Requests the start of a recognition.
// NOTE(review): presumably wraps a "start" event in FSMEventArgs and forwards
// it to DispatchEvent() — confirm.
void NetworkSpeechRecognitionEngineImpl::StartRecognition() { … }

// Requests the end of the current recognition.
// NOTE(review): presumably wraps an "end" event in FSMEventArgs and forwards
// it to DispatchEvent() — confirm.
void NetworkSpeechRecognitionEngineImpl::EndRecognition() { … }

// Hands one chunk of captured audio to the engine. |data| is taken by const
// reference.
// NOTE(review): presumably attaches |data| to an FSMEventArgs and forwards it
// to DispatchEvent() — confirm, including whether the chunk is retained beyond
// the call.
void NetworkSpeechRecognitionEngineImpl::TakeAudioChunk(
    const AudioChunk& data) { … }

// Signals that no further audio chunks will be supplied.
// NOTE(review): presumably forwarded to DispatchEvent() as an FSM event —
// confirm.
void NetworkSpeechRecognitionEngineImpl::AudioChunksEnded() { … }

// Completion callback for the upstream side, carrying a success flag and a
// response code.
// NOTE(review): |response_code| is presumably the HTTP status; how |success|
// and |response_code| map to FSM events is not visible here — confirm.
void NetworkSpeechRecognitionEngineImpl::OnUpstreamDataComplete(
    bool success,
    int response_code) { … }

// Callback delivering newly received downstream bytes. |new_response_data| is
// a non-owning std::string_view, so anything kept past this call must be
// copied.
// NOTE(review): presumably the bytes are buffered/decoded and each complete
// message is dispatched into the FSM — confirm.
void NetworkSpeechRecognitionEngineImpl::OnDownstreamDataReceived(
    std::string_view new_response_data) { … }

// Completion callback for the downstream side, carrying a success flag and a
// response code.
// NOTE(review): |response_code| is presumably the HTTP status; how |success|
// and |response_code| map to FSM events is not visible here — confirm.
void NetworkSpeechRecognitionEngineImpl::OnDownstreamDataComplete(
    bool success,
    int response_code) { … }

// Returns the audio chunk duration the engine wants, in milliseconds (per the
// name).
// NOTE(review): presumably returns |kAudioPacketIntervalMs| — confirm.
int NetworkSpeechRecognitionEngineImpl::GetDesiredAudioChunkDurationMs() const { … }

// -----------------------  Core FSM implementation ---------------------------

// Feeds one event into the state machine.
// NOTE(review): presumably this is where the same-thread and non-reentrancy
// guarantees promised to the FSM evolution functions are enforced, and where
// the state returned by ExecuteTransitionAndGetNextState() is stored —
// confirm.
void NetworkSpeechRecognitionEngineImpl::DispatchEvent(
    const FSMEventArgs& event_args) { … }

// Performs the transition for |event_args| and returns the resulting FSMState.
// NOTE(review): presumably selects one of the FSM evolution functions defined
// below based on the current state and the event — confirm.
NetworkSpeechRecognitionEngineImpl::FSMState
NetworkSpeechRecognitionEngineImpl::ExecuteTransitionAndGetNextState(
    const FSMEventArgs& event_args) { … }

// ----------- Contract for all the FSM evolution functions below -------------
//  - Are guaranteed to be executed on the same thread (IO, except for tests);
//  - Are guaranteed not to be reentrant (with themselves or with each other);
//  - event_args members are guaranteed to be stable during the call.

// FSM evolution function (see the contract above); returns an FSMState. The
// event arguments are not used (the parameter is unnamed).
// NOTE(review): judging by the name, opens both the upstream and the
// downstream connection — confirm.
NetworkSpeechRecognitionEngineImpl::FSMState
NetworkSpeechRecognitionEngineImpl::ConnectBothStreams(const FSMEventArgs&) { … }

// FSM evolution function (see the contract above); returns an FSMState.
// NOTE(review): presumably sends the audio carried by |event_args| on the
// upstream connection, likely through UploadAudioChunk() — confirm.
NetworkSpeechRecognitionEngineImpl::FSMState
NetworkSpeechRecognitionEngineImpl::TransmitAudioUpstream(
    const FSMEventArgs& event_args) { … }

// FSM evolution function (see the contract above); returns an FSMState.
// NOTE(review): presumably parses the downstream response carried by
// |event_args| (google_streaming_api.pb.h is included) and reports results or
// errors — confirm.
NetworkSpeechRecognitionEngineImpl::FSMState
NetworkSpeechRecognitionEngineImpl::ProcessDownstreamResponse(
    const FSMEventArgs& event_args) { … }

// FSM evolution function (see the contract above); returns an FSMState.
// NOTE(review): per the name, reports a "no match" error when the session
// produced no results; the exact condition and the fallback path are not
// visible here — confirm.
NetworkSpeechRecognitionEngineImpl::FSMState
NetworkSpeechRecognitionEngineImpl::RaiseNoMatchErrorIfGotNoResults(
    const FSMEventArgs& event_args) { … }

// FSM evolution function (see the contract above); returns an FSMState. The
// event arguments are not used (the parameter is unnamed).
// NOTE(review): per the name, finishes the upstream side and then waits for
// the remaining downstream results — confirm.
NetworkSpeechRecognitionEngineImpl::FSMState
NetworkSpeechRecognitionEngineImpl::CloseUpstreamAndWaitForResults(
    const FSMEventArgs&) { … }

// FSM evolution function (see the contract above); returns an FSMState. The
// event arguments are not used (the parameter is unnamed).
// NOTE(review): per the name, tears down the downstream connection — confirm.
NetworkSpeechRecognitionEngineImpl::FSMState
NetworkSpeechRecognitionEngineImpl::CloseDownstream(const FSMEventArgs&) { … }

// FSM evolution function (see the contract above); returns an FSMState. The
// event arguments are not used (the parameter is unnamed).
// NOTE(review): presumably delegates to Abort() with a "no error" code so that
// nothing is reported to the client — confirm.
NetworkSpeechRecognitionEngineImpl::FSMState
NetworkSpeechRecognitionEngineImpl::AbortSilently(const FSMEventArgs&) { … }

// FSM evolution function (see the contract above); returns an FSMState. The
// event arguments are not used (the parameter is unnamed).
// NOTE(review): presumably delegates to Abort() with a fixed error code —
// confirm which one.
NetworkSpeechRecognitionEngineImpl::FSMState
NetworkSpeechRecognitionEngineImpl::AbortWithError(const FSMEventArgs&) { … }

// Abort helper that takes the error code directly instead of FSMEventArgs and
// returns an FSMState.
// NOTE(review): presumably the shared implementation behind AbortSilently()
// and AbortWithError(); which resources it releases and how |error_code| is
// reported are not visible here — confirm.
NetworkSpeechRecognitionEngineImpl::FSMState
NetworkSpeechRecognitionEngineImpl::Abort(
    media::mojom::SpeechRecognitionErrorCode error_code) { … }

// FSM evolution function (see the contract above); returns an FSMState. The
// event arguments are not used (the parameter is unnamed).
// NOTE(review): presumably returns the current state unchanged — confirm.
NetworkSpeechRecognitionEngineImpl::FSMState
NetworkSpeechRecognitionEngineImpl::DoNothing(const FSMEventArgs&) { … }

// FSM evolution function (see the contract above); returns an FSMState.
// NOTE(review): presumably the handler for state/event combinations that are
// not expected to occur — confirm whether it asserts and which state it
// returns.
NetworkSpeechRecognitionEngineImpl::FSMState
NetworkSpeechRecognitionEngineImpl::NotFeasible(
    const FSMEventArgs& event_args) { … }

// Returns the accepted-languages string by value; does not modify the engine
// (const).
// NOTE(review): presumably combines the configured language with the
// |accept_language| passed to the constructor — confirm.
std::string NetworkSpeechRecognitionEngineImpl::GetAcceptedLanguages() const { … }

// Returns a request key as a string; does not modify the engine (const).
// NOTE(review): base/rand_util.h is included, so the key is presumably random
// — confirm.
// TODO(primiano): Is there any utility in the codebase that already does this?
std::string NetworkSpeechRecognitionEngineImpl::GenerateRequestKey() const { … }

// Uploads one chunk of data.
//   data:     the bytes to send, taken by const reference.
//   type:     the FrameType of this chunk.
//   is_final: per the name, marks the last chunk of the upload.
// NOTE(review): the framing applied for |type| and the handling of an empty
// |data| are not visible here — confirm.
void NetworkSpeechRecognitionEngineImpl::UploadAudioChunk(
    const std::string& data,
    FrameType type,
    bool is_final) { … }

// Builds the argument bundle for the FSM event |event_value|; all members are
// initialized in the member-initializer list.
// NOTE(review): the initial values of the members other than the event are
// not visible here — confirm.
NetworkSpeechRecognitionEngineImpl::FSMEventArgs::FSMEventArgs(
    FSMEvent event_value)
    : … { … }

// Defined out of line; no custom teardown.
NetworkSpeechRecognitionEngineImpl::FSMEventArgs::~FSMEventArgs() = default;

}  // namespace content
// chromium/content/browser/speech/network_speech_recognition_engine_impl.cc