// speech_recognizer_impl.cc

// Copyright 2013 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifdef UNSAFE_BUFFERS_BUILD
// TODO(crbug.com/342213636): Remove this and spanify to fix the errors.
#pragma allow_unsafe_buffers
#endif

#include "content/browser/speech/speech_recognizer_impl.h"

#include <stdint.h>

#include <algorithm>
#include <memory>

#include "base/functional/bind.h"
#include "base/time/time.h"
#include "build/build_config.h"
#include "components/speech/audio_buffer.h"
#include "content/browser/browser_main_loop.h"
#include "content/browser/media/media_internals.h"
#include "content/public/browser/audio_service.h"
#include "content/public/browser/browser_task_traits.h"
#include "content/public/browser/browser_thread.h"
#include "content/public/browser/speech_recognition_audio_forwarder_config.h"
#include "content/public/browser/speech_recognition_event_listener.h"
#include "media/audio/audio_system.h"
#include "media/base/audio_bus.h"
#include "media/base/audio_converter.h"
#include "media/base/audio_parameters.h"
#include "media/mojo/mojom/audio_logging.mojom.h"
#include "services/audio/public/cpp/device_factory.h"

#if BUILDFLAG(IS_WIN)
#include "media/audio/win/core_audio_util_win.h"
#endif

AudioBus;
AudioConverter;
AudioGlitchInfo;
AudioParameters;
ChannelLayout;

namespace content {

// Private class which encapsulates the audio converter and the
// AudioConverter::InputCallback. It handles resampling, buffering and
// channel mixing between input and output parameters.
class SpeechRecognizerImpl::OnDataConverter
    : public media::AudioConverter::InputCallback { … };

namespace {

// The following constants are related to the volume level indicator shown in
// the UI for recorded audio.
// Multiplier used when new volume is greater than previous level.
const float kUpSmoothingFactor = …;
// Multiplier used when new volume is less than previous level.
const float kDownSmoothingFactor = …;
// RMS dB value of a maximum (unclipped) sine wave for int16_t samples.
const float kAudioMeterMaxDb = …;
// This value corresponds to RMS dB for int16_t with 6 most-significant-bits =
// 0.
// Values lower than this will display as empty level-meter.
const float kAudioMeterMinDb = …;
const float kAudioMeterDbRange = …;

// Maximum level to draw to display unclipped meter. (1.0f displays clipping.)
const float kAudioMeterRangeMaxUnclipped = …;

// Returns true if more than 5% of the samples are at min or max value.
bool DetectClipping(const AudioChunk& chunk) { … }

}  // namespace

// Definitions of the static, test-only audio system and capturer source
// pointers.
// NOTE(review): presumably installed through SetAudioEnvironmentForTesting()
// and consulted by GetAudioSystem() / GetAudioCapturerSource(); the bodies are
// not visible here, so confirm against the full source.
media::AudioSystem* SpeechRecognizerImpl::audio_system_for_tests_ = …;
media::AudioCapturerSource*
    SpeechRecognizerImpl::audio_capturer_source_for_tests_ = …;

// SpeechRecognizerImpl::OnDataConverter implementation

// Constructs a converter for the given |input_params| / |output_params| pair.
// NOTE(review): the initializer list and body are elided in this view; which
// members are set up from the two parameter sets should be confirmed against
// the full source.
SpeechRecognizerImpl::OnDataConverter::OnDataConverter(
    const AudioParameters& input_params,
    const AudioParameters& output_params)
    : … { … }

SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() { … }

// Takes one AudioBus of audio in |data| and returns a ref-counted AudioChunk.
// NOTE(review): presumably |data| is in the converter's input format and the
// returned chunk is in its output format — confirm against the elided body.
scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert(
    const AudioBus* data) { … }

// NOTE(review): OnDataConverter derives from
// media::AudioConverter::InputCallback, so this is presumably the override
// through which the converter pulls input audio into |dest|. The meaning of
// the returned double and how |frames_delayed| / |glitch_info| are used are
// not visible here — confirm against media/base/audio_converter.h.
double SpeechRecognizerImpl::OnDataConverter::ProvideInput(
    AudioBus* dest,
    uint32_t frames_delayed,
    const AudioGlitchInfo& glitch_info) { … }

// SpeechRecognizerImpl implementation

// Constructs a recognizer for the session identified by |session_id|.
//   |listener|               receiver of speech recognition events (raw,
//                            non-owning pointer).
//   |audio_system|           audio system to use (raw, non-owning pointer).
//   |continuous|, |provisional_results|
//                            recognition mode flags.
//   |engine|                 recognition engine; ownership is transferred in
//                            via std::unique_ptr.
//   |audio_forwarder_config| optional audio forwarder configuration.
// NOTE(review): the initializer list and body are elided in this view; how
// each argument is stored or validated should be confirmed against the full
// source.
SpeechRecognizerImpl::SpeechRecognizerImpl(
    SpeechRecognitionEventListener* listener,
    media::AudioSystem* audio_system,
    int session_id,
    bool continuous,
    bool provisional_results,
    std::unique_ptr<SpeechRecognitionEngine> engine,
    std::optional<SpeechRecognitionAudioForwarderConfig> audio_forwarder_config)
    : … { … }

// -------  Methods that trigger Finite State Machine (FSM) events ------------

// NOTE: All external events and requests should be enqueued (PostTask), even
// if they come from the same (IO) thread, in order to preserve the causal
// ordering between events and avoid interleaved event processing due to
// synchronous callbacks.

// Requests that recognition start, using the audio device named by
// |device_id|.
// NOTE(review): per the section comment, this is expected to enqueue an FSM
// event rather than act synchronously — body elided, confirm.
void SpeechRecognizerImpl::StartRecognition(const std::string& device_id) { … }

void SpeechRecognizerImpl::AbortRecognition() { … }

void SpeechRecognizerImpl::StopAudioCapture() { … }

bool SpeechRecognizerImpl::IsActive() const { … }

bool SpeechRecognizerImpl::IsCapturingAudio() const { … }

const SpeechRecognitionEngine&
SpeechRecognizerImpl::recognition_engine() const { … }

SpeechRecognizerImpl::~SpeechRecognizerImpl() { … }

// Receives one bus of captured audio in |data|, together with its capture
// timestamp, glitch information, volume and key-press flag.
// NOTE(review): the parameter list looks like an audio capture callback
// (presumably media::AudioCapturerSource::CaptureCallback) — confirm against
// the header. Which thread this runs on and which of the arguments are
// actually consumed is not visible here.
void SpeechRecognizerImpl::Capture(const AudioBus* data,
                                   base::TimeTicks audio_capture_time,
                                   const AudioGlitchInfo& glitch_info,
                                   double volume,
                                   bool key_pressed) { … }

// Receives an audio capture error, described by a
// media::AudioCapturerSource::ErrorCode |code| and a |message| string.
// NOTE(review): presumably forwarded into the FSM as an error event — body
// elided, confirm.
void SpeechRecognizerImpl::OnCaptureError(
    media::AudioCapturerSource::ErrorCode code,
    const std::string& message) { … }

// Accepts a mojo AudioDataS16 |buffer|; ownership of the buffer is passed in
// by value.
// NOTE(review): judging by the name this is the entry point for audio
// supplied by a renderer instead of a local capturer — confirm against the
// elided body and the audio forwarder configuration.
void SpeechRecognizerImpl::AddAudioFromRenderer(
    media::mojom::AudioDataS16Ptr buffer) { … }

void SpeechRecognizerImpl::OnSpeechRecognitionEngineResults(
    const std::vector<media::mojom::WebSpeechRecognitionResultPtr>& results) { … }

void SpeechRecognizerImpl::OnSpeechRecognitionEngineEndOfUtterance() { … }

void SpeechRecognizerImpl::OnSpeechRecognitionEngineError(
    const media::mojom::SpeechRecognitionError& error) { … }

// -----------------------  Core FSM implementation ---------------------------
// TODO(primiano): After the changes in the media package (r129173), this class
// slightly violates the SpeechRecognitionEventListener interface contract. In
// particular, it is not true anymore that this class can be freed after the
// OnRecognitionEnd event, since the audio_capturer_source_->Stop() asynchronous
// call can be still in progress after the end event. Currently, it does not
// represent a problem for the browser itself, since refcounting protects us
// against such race conditions. However, we should fix this in the next CLs.

// ----------- Contract for all the FSM evolution functions below -------------
//  - Are guaranteed to be executed in the IO thread;
//  - Are guaranteed to be not reentrant (themselves and each other);
//  - event_args members are guaranteed to be stable during the call;
//  - The class won't be freed in the meanwhile due to callbacks;

// TODO(primiano): the audio pipeline is currently serial. However, the
// clipper->endpointer->vumeter chain and the sr_engine could be parallelized.
// We should profile the execution to see whether it would be worthwhile.
// Processes one FSM event described by |event_args|.
// NOTE(review): body elided; presumably this is where the FSM transition
// functions defined below are selected and invoked — confirm.
void SpeechRecognizerImpl::DispatchEvent(const FSMEventArgs& event_args) { … }

void SpeechRecognizerImpl::ProcessAudioPipeline(
    const FSMEventArgs& event_args) { … }

void SpeechRecognizerImpl::OnAudioParametersReceived(
    const std::optional<media::AudioParameters>& params) { … }

SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::PrepareRecognition(
    const FSMEventArgs&) { … }

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) { … }

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) { … }

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) { … }

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) { … }

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) { … }

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::StopCaptureAndWaitForResult(const FSMEventArgs&) { … }

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::AbortSilently(const FSMEventArgs& event_args) { … }

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::AbortWithError(const FSMEventArgs& event_args) { … }

// Aborts recognition with the given |error| and returns the next FSM state.
// NOTE(review): presumably the shared implementation behind AbortSilently()
// and AbortWithError() — body elided, confirm.
SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort(
    const media::mojom::SpeechRecognitionError& error) { … }

SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::ProcessIntermediateResult(
    const FSMEventArgs& event_args) { … }

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) { … }

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const { … }

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::NotFeasible(const FSMEventArgs& event_args) { … }

void SpeechRecognizerImpl::CloseAudioCapturerSource() { … }

int SpeechRecognizerImpl::GetElapsedTimeMs() const { … }

// Updates the signal and noise levels from the |rms| value of the latest
// audio and whether clipping was detected in it.
// NOTE(review): |rms| is a float taken by const reference; pass-by-value
// would be the conventional choice, but the signature is fixed by the
// declaration in the header. How the levels are smoothed and reported is not
// visible here.
void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms,
                                                  bool clip_detected) { … }

// Test hook taking an |audio_system| and an |audio_capturer_source| as raw,
// non-owning pointers.
// NOTE(review): presumably stores them in the static audio_system_for_tests_ /
// audio_capturer_source_for_tests_ members — body elided, confirm.
void SpeechRecognizerImpl::SetAudioEnvironmentForTesting(
    media::AudioSystem* audio_system,
    media::AudioCapturerSource* audio_capturer_source) { … }

media::AudioSystem* SpeechRecognizerImpl::GetAudioSystem() { … }

void SpeechRecognizerImpl::CreateAudioCapturerSource() { … }

media::AudioCapturerSource* SpeechRecognizerImpl::GetAudioCapturerSource() { … }

}  // namespace content
// chromium/content/browser/speech/speech_recognizer_impl.cc