vad_core.c | Explore in Territory

/*
 *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "common_audio/vad/vad_core.h"

#include "rtc_base/sanitizer.h"
#include "common_audio/signal_processing/include/signal_processing_library.h"
#include "common_audio/vad/vad_filterbank.h"
#include "common_audio/vad/vad_gmm.h"
#include "common_audio/vad/vad_sp.h"

// Spectrum weighting: one weight per channel (`kNumChannels` entries).
static const int16_t kSpectrumWeight[kNumChannels] = …;
// Scalar update constants. The trailing tag on each line gives the fixed-point
// format of the value.
static const int16_t kNoiseUpdateConst = …; // Q15
static const int16_t kSpeechUpdateConst = …; // Q15
static const int16_t kBackEta = …; // Q8
// Minimum difference between the two models, Q5 (one entry per channel).
static const int16_t kMinimumDifference[kNumChannels] = …;
// Upper limit of mean value for speech model, Q7 (one entry per channel).
static const int16_t kMaximumSpeech[kNumChannels] = …;
// Lower limit of the mean value, one entry per Gaussian (`kNumGaussians`
// entries).
// NOTE(review): the Q format is not stated for this table; the neighbouring
// mean limits are Q7 - confirm.
static const int16_t kMinimumMean[kNumGaussians] = …;
// Upper limit of mean value for noise model, Q7 (one entry per channel).
static const int16_t kMaximumNoise[kNumChannels] = …;
// Start values for the Gaussian models, Q7.
// Every table below holds `kTableSize` entries.
// NOTE(review): the comments describe two Gaussians for six channels, so
// `kTableSize` is presumably 2 * 6; the element ordering (by Gaussian or by
// channel) is not visible here - confirm against vad_core.h.
//
// Weights for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataWeights[kTableSize] = …;
// Weights for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataWeights[kTableSize] = …;
// Means for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataMeans[kTableSize] = …;
// Means for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataMeans[kTableSize] = …;
// Standard deviations for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataStds[kTableSize] = …;
// Standard deviations for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataStds[kTableSize] = …;

// Constants used in GmmProbability().
//
// Maximum number of counted speech (VAD = 1) frames in a row.
static const int16_t kMaxSpeechFrames = …;
// Minimum standard deviation for both speech and noise.
static const int16_t kMinStd = …;

// Constants used in WebRtcVad_InitCore().
//
// Default aggressiveness mode.
static const short kDefaultMode = …;
// NOTE(review): the purpose of this value is not documented here; presumably
// it is the marker recorded in the instance once initialization has
// completed - confirm in WebRtcVad_InitCore().
static const int kInitCheck = …;

// Constants used in WebRtcVad_set_mode_core().
//
// Thresholds for different frame lengths (10 ms, 20 ms and 30 ms). Every table
// has three entries, one per supported frame length.
// NOTE(review): presumably index 0 is 10 ms, 1 is 20 ms and 2 is 30 ms, in the
// order listed above - confirm against the code that indexes these tables.
//
// There is one group of four tables (two overhang limits, a local threshold
// and a global threshold) for each aggressiveness mode 0..3.
//
// Mode 0, Quality.
static const int16_t kOverHangMax1Q[3] = …;
static const int16_t kOverHangMax2Q[3] = …;
static const int16_t kLocalThresholdQ[3] = …;
static const int16_t kGlobalThresholdQ[3] = …;
// Mode 1, Low bitrate.
static const int16_t kOverHangMax1LBR[3] = …;
static const int16_t kOverHangMax2LBR[3] = …;
static const int16_t kLocalThresholdLBR[3] = …;
static const int16_t kGlobalThresholdLBR[3] = …;
// Mode 2, Aggressive.
static const int16_t kOverHangMax1AGG[3] = …;
static const int16_t kOverHangMax2AGG[3] = …;
static const int16_t kLocalThresholdAGG[3] = …;
static const int16_t kGlobalThresholdAGG[3] = …;
// Mode 3, Very aggressive.
static const int16_t kOverHangMax1VAG[3] = …;
static const int16_t kOverHangMax2VAG[3] = …;
static const int16_t kLocalThresholdVAG[3] = …;
static const int16_t kGlobalThresholdVAG[3] = …;

// Calculates the weighted average with respect to the number of Gaussians.
// `offset` is added to the `data` values before averaging; since `data` is an
// in/out argument, the caller sees the updated values afterwards.
//
// - data     [i/o] : Data to average. Updated in place with `offset`.
// - offset   [i]   : An offset added to `data`.
// - weights  [i]   : Weights used for averaging (read-only).
//
// returns          : The weighted average.
//
// NOTE(review): how many elements of `data` / `weights` are visited, and with
// what stride, is not stated here - confirm before passing shorter buffers.
static int32_t WeightedAverage(int16_t* data, int16_t offset,
                               const int16_t* weights) { … }

// An s16 x s32 -> s32 multiplication that's allowed to overflow. (It's still
// undefined behavior, so not a good idea; this just makes UBSan ignore the
// violation, so that our old code can continue to do what it's always been
// doing.)
//
// - a  [i] : 16-bit signed factor.
// - b  [i] : 32-bit signed factor.
//
// returns  : The product as a 32-bit signed value; not meaningful when the
//            mathematical result does not fit in 32 bits.
//
// The RTC_NO_SANITIZE("signed-integer-overflow") annotation only silences the
// sanitizer report for this function; it does not make the overflow defined.
static inline int32_t RTC_NO_SANITIZE("signed-integer-overflow")
    OverflowingMulS16ByS32ToS32(int16_t a, int32_t b) { … }

// Calculates the probabilities for both speech and background noise using
// Gaussian Mixture Models (GMM). A hypothesis test is performed to decide
// which type of signal is most probable.
//
// - self           [i/o] : Pointer to VAD instance.
// - features       [i]   : Feature vector of length `kNumChannels`
//                          = log10(energy in frequency band).
// - total_power    [i]   : Total power in audio frame.
// - frame_length   [i]   : Number of input samples.
//
// - returns              : The VAD decision (0 - noise, 1 - speech).
//
// Uses the file-level constants `kMaxSpeechFrames` and `kMinStd`.
// NOTE(review): `features` is declared non-const although documented as
// input-only; whether it is written to is not visible here - confirm.
static int16_t GmmProbability(VadInstT* self, int16_t* features,
                              int16_t total_power, size_t frame_length) { … }

// Initializes the VAD and sets the aggressiveness mode to its default value
// (`kDefaultMode`).
//
// - self  [i/o] : Pointer to the VAD instance to initialize.
//
// NOTE(review): the return convention (presumably 0 on success and a negative
// value on failure) and the handling of a NULL `self` are not visible here -
// confirm against vad_core.h.
int WebRtcVad_InitCore(VadInstT* self) { … }

// Sets the aggressiveness mode of the VAD.
//
// - self  [i/o] : Pointer to VAD instance.
// - mode  [i]   : Aggressiveness mode. The threshold tables in this file are
//                 defined for 0 (Quality), 1 (Low bitrate), 2 (Aggressive)
//                 and 3 (Very aggressive).
//
// NOTE(review): the return convention and the behavior for a `mode` outside
// 0..3 are not visible here - confirm against vad_core.h.
int WebRtcVad_set_mode_core(VadInstT* self, int mode) { … }

// Calculate the VAD decision by first extracting feature values and then
// calculating the probability for both speech and background noise.
//
// One entry point is provided per input sampling rate; this one is named for
// 48 kHz input.
//
// - inst          [i/o] : Pointer to VAD instance.
// - speech_frame  [i]   : Input audio samples (read-only).
// - frame_length  [i]   : Presumably the number of samples in `speech_frame`.
//
// NOTE(review): the return value is presumably the VAD decision; the exact
// convention is not visible here - confirm against vad_core.h.

int WebRtcVad_CalcVad48khz(VadInstT* inst, const int16_t* speech_frame,
                           size_t frame_length) { … }

// Calculates the VAD decision for a frame; this entry point is named for
// 32 kHz input.
//
// - inst          [i/o] : Pointer to VAD instance.
// - speech_frame  [i]   : Input audio samples (read-only).
// - frame_length  [i]   : Presumably the number of samples in `speech_frame`.
//
// NOTE(review): the return value is presumably the VAD decision; the exact
// convention is not visible here - confirm against vad_core.h.
int WebRtcVad_CalcVad32khz(VadInstT* inst, const int16_t* speech_frame,
                           size_t frame_length)
{ … }

// Calculates the VAD decision for a frame; this entry point is named for
// 16 kHz input.
//
// - inst          [i/o] : Pointer to VAD instance.
// - speech_frame  [i]   : Input audio samples (read-only).
// - frame_length  [i]   : Presumably the number of samples in `speech_frame`.
//
// NOTE(review): the return value is presumably the VAD decision; the exact
// convention is not visible here - confirm against vad_core.h.
int WebRtcVad_CalcVad16khz(VadInstT* inst, const int16_t* speech_frame,
                           size_t frame_length)
{ … }

// Calculates the VAD decision for a frame; this entry point is named for
// 8 kHz input.
//
// - inst          [i/o] : Pointer to VAD instance.
// - speech_frame  [i]   : Input audio samples (read-only).
// - frame_length  [i]   : Presumably the number of samples in `speech_frame`.
//
// NOTE(review): the return value is presumably the VAD decision; the exact
// convention is not visible here - confirm against vad_core.h.
int WebRtcVad_CalcVad8khz(VadInstT* inst, const int16_t* speech_frame,
                          size_t frame_length)
{ … }
chromium/third_party/webrtc/common_audio/vad/vad_core.c