/* * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "common_audio/vad/vad_core.h" #include "rtc_base/sanitizer.h" #include "common_audio/signal_processing/include/signal_processing_library.h" #include "common_audio/vad/vad_filterbank.h" #include "common_audio/vad/vad_gmm.h" #include "common_audio/vad/vad_sp.h" // Spectrum weighting, one weight per channel (array length kNumChannels). static const int16_t kSpectrumWeight[kNumChannels] = …; static const int16_t kNoiseUpdateConst = …; // Q15 static const int16_t kSpeechUpdateConst = …; // Q15 static const int16_t kBackEta = …; // Q8 // Minimum difference between the two models, Q5 (one entry per channel). static const int16_t kMinimumDifference[kNumChannels] = …; // Upper limit of the mean value for the speech model, Q7 (one entry per channel). static const int16_t kMaximumSpeech[kNumChannels] = …; // Lower limit of the mean value, one entry per Gaussian (array length kNumGaussians). static const int16_t kMinimumMean[kNumGaussians] = …; // Upper limit of the mean value for the noise model, Q7 (one entry per channel). static const int16_t kMaximumNoise[kNumChannels] = …; // Start values for the Gaussian models, Q7 // Weights for the two Gaussians for the six channels (noise) static const int16_t kNoiseDataWeights[kTableSize] = …; // Weights for the two Gaussians for the six channels (speech) static const int16_t kSpeechDataWeights[kTableSize] = …; // Means for the two Gaussians for the six channels (noise) static const int16_t kNoiseDataMeans[kTableSize] = …; // Means for the two Gaussians for the six channels (speech) static const int16_t kSpeechDataMeans[kTableSize] = …; // Standard deviations for the two Gaussians for the six channels (noise) static const int16_t kNoiseDataStds[kTableSize] = …; // Standard deviations for the two Gaussians for the six channels (speech) static const int16_t 
kSpeechDataStds[kTableSize] = …; // Constants used in GmmProbability(). // // Maximum number of counted speech (VAD = 1) frames in a row. static const int16_t kMaxSpeechFrames = …; // Minimum standard deviation for both speech and noise. static const int16_t kMinStd = …; // Constants used in WebRtcVad_InitCore(). // Default aggressiveness mode. static const short kDefaultMode = …; // NOTE(review): kInitCheck carries no comment of its own; its value and exact role are not visible here - confirm against WebRtcVad_InitCore(). static const int kInitCheck = …; // Constants used in WebRtcVad_set_mode_core(). // // Thresholds for the different frame lengths (10 ms, 20 ms and 30 ms); each // table below has three entries, presumably one per frame length in that // order - TODO confirm. // // Mode 0, Quality. static const int16_t kOverHangMax1Q[3] = …; static const int16_t kOverHangMax2Q[3] = …; static const int16_t kLocalThresholdQ[3] = …; static const int16_t kGlobalThresholdQ[3] = …; // Mode 1, Low bitrate. static const int16_t kOverHangMax1LBR[3] = …; static const int16_t kOverHangMax2LBR[3] = …; static const int16_t kLocalThresholdLBR[3] = …; static const int16_t kGlobalThresholdLBR[3] = …; // Mode 2, Aggressive. static const int16_t kOverHangMax1AGG[3] = …; static const int16_t kOverHangMax2AGG[3] = …; static const int16_t kLocalThresholdAGG[3] = …; static const int16_t kGlobalThresholdAGG[3] = …; // Mode 3, Very aggressive. static const int16_t kOverHangMax1VAG[3] = …; static const int16_t kOverHangMax2VAG[3] = …; static const int16_t kLocalThresholdVAG[3] = …; static const int16_t kGlobalThresholdVAG[3] = …; // Calculates the weighted average with respect to the number of Gaussians. // `data` is an in/out argument: it is updated with `offset` before averaging, // so the caller sees the modified values. // // - data [i/o] : Data to average. // - offset [i] : An offset added to `data`. // - weights [i] : Weights used for averaging. // // returns : The weighted average. static int32_t WeightedAverage(int16_t* data, int16_t offset, const int16_t* weights) { … } // An s16 x s32 -> s32 multiplication that's allowed to overflow. 
(It's still // undefined behavior, so not a good idea; this just makes UBSan ignore the // violation, so that our old code can continue to do what it's always been // doing.) static inline int32_t RTC_NO_SANITIZE("signed-integer-overflow") OverflowingMulS16ByS32ToS32(int16_t a, int32_t b) { … } // Calculates the probabilities for both speech and background noise using // Gaussian Mixture Models (GMM). A hypothesis test is performed to decide which // type of signal is more probable. // // - self [i/o] : Pointer to VAD instance. // - features [i] : Feature vector of length `kNumChannels` // = log10(energy in frequency band). // - total_power [i] : Total power in audio frame. // - frame_length [i] : Number of input samples. // // - returns : The VAD decision (0 - noise, 1 - speech). static int16_t GmmProbability(VadInstT* self, int16_t* features, int16_t total_power, size_t frame_length) { … } // Initializes the VAD and sets the aggressiveness mode to its default value. // // - self [i/o] : Pointer to VAD instance. // // NOTE(review): the meaning of the int return value is not visible in this // view - confirm against the header. int WebRtcVad_InitCore(VadInstT* self) { … } // Sets the aggressiveness mode. // // - self [i/o] : Pointer to VAD instance. // - mode [i] : Aggressiveness mode. The threshold tables in this file are // labelled for modes 0 (Quality), 1 (Low bitrate), // 2 (Aggressive) and 3 (Very aggressive). // // NOTE(review): handling of other `mode` values and the meaning of the return // value are not visible in this view - confirm. int WebRtcVad_set_mode_core(VadInstT* self, int mode) { … } // Calculates the VAD decision by first extracting feature values and then // calculating the probability for both speech and background noise. // // - inst [i/o] : Pointer to VAD instance. // - speech_frame [i] : Input audio samples. // - frame_length [i] : Length of `speech_frame`, presumably in samples - TODO confirm. // // NOTE(review): only the 48 kHz entry point carried a comment originally; the // 32, 16 and 8 kHz entry points below have the same signature and presumably // the same contract at their respective sample rates - confirm. int WebRtcVad_CalcVad48khz(VadInstT* inst, const int16_t* speech_frame, size_t frame_length) { … } int WebRtcVad_CalcVad32khz(VadInstT* inst, const int16_t* speech_frame, size_t frame_length) { … } int WebRtcVad_CalcVad16khz(VadInstT* inst, const int16_t* speech_frame, size_t frame_length) { … } int WebRtcVad_CalcVad8khz(VadInstT* inst, const int16_t* speech_frame, size_t frame_length) { … }