// Copyright 2012 The Chromium Authors // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_ // and r4_ will move after the first load): // // |----------------|-----------------------------------------|----------------| // // request_frames_ // <---------------------------------------------------------> // r0_ (during first load) // // kernel_size_ / 2 kernel_size_ / 2 kernel_size_ / 2 kernel_size_ / 2 // <---------------> <---------------> <---------------> <---------------> // r1_ r2_ r3_ r4_ // // block_size_ == r4_ - r2_ // <---------------------------------------> // // request_frames_ // <------------------ ... -----------------> // r0_ (during second load) // // On the second request r0_ slides to the right by kernel_size_ / 2 and r3_, // r4_ and block_size_ are reinitialized via step (3) in the algorithm below. // // These new regions remain constant until a Flush() occurs. While complicated, // this allows us to reduce jitter by always requesting the same amount from the // provided callback. // // The algorithm: // // 1) Allocate input_buffer of size: request_frames_ + kernel_size_; this // ensures // there's enough room to read request_frames_ from the callback into region // r0_ (which will move between the first and subsequent passes). // // 2) Let r1_, r2_ each represent half the kernel centered around r0_: // // r0_ = input_buffer_ + kernel_size_ / 2 // r1_ = input_buffer_ // r2_ = r0_ // // r0_ is always request_frames_ in size. r1_, r2_ are kernel_size_ / 2 in // size. r1_ must be zero initialized to avoid convolution with garbage (see // step (5) for why). // // 3) Let r3_, r4_ each represent half the kernel right aligned with the end of // r0_ and choose block_size_ as the distance in frames between r4_ and r2_: // // r3_ = r0_ + request_frames_ - kernel_size_ // r4_ = r0_ + request_frames_ - kernel_size_ / 2 // block_size_ = r4_ - r2_ = request_frames_ - kernel_size_ / 2 // // 4) Consume request_frames_ frames into r0_. // // 5) Position kernel centered at start of r2_ and generate output frames until // the kernel is centered at the start of r4_ or we've finished generating // all the output frames. // // 6) Wrap left over data from the r3_ to r1_ and r4_ to r2_. // // 7) If we're on the second load, in order to avoid overwriting the frames we // just wrapped from r4_ we need to slide r0_ to the right by the size of // r4_, which is kernel_size_ / 2: // // r0_ = r0_ + kernel_size_ / 2 = input_buffer_ + kernel_size_ // // r3_, r4_, and block_size_ then need to be reinitialized, so goto (3). // // 8) Else, if we're not on the second load, goto (4). // // Note: we're glossing over how the sub-sample handling works with // |virtual_source_idx_|, etc. #ifdef UNSAFE_BUFFERS_BUILD // TODO(crbug.com/40285824): Remove this and convert code to safer constructs. #pragma allow_unsafe_buffers #endif #include "media/base/sinc_resampler.h" #include <limits> #include <numbers> #include "base/check_op.h" #include "base/cpu.h" #include "base/trace_event/trace_event.h" #include "build/build_config.h" #include "cc/base/math_util.h" #if defined(ARCH_CPU_X86_FAMILY) #include <immintrin.h> // Including these headers directly should generally be avoided. Since // Chrome is compiled with -msse3 (the minimal requirement), we include the // headers directly to make the intrinsics available. #include <avx2intrin.h> #include <avxintrin.h> #include <fmaintrin.h> #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) #include <arm_neon.h> #endif namespace media { static double SincScaleFactor(double io_ratio, int kernel_size) { … } // If we know the minimum architecture at compile time, avoid CPU detection. void SincResampler::InitializeCPUSpecificFeatures() { … } static int CalculateChunkSize(int block_size_, double io_ratio) { … } // Static int SincResampler::KernelSizeFromRequestFrames(int request_frames) { … } SincResampler::SincResampler(double io_sample_rate_ratio, int request_frames, const ReadCB read_cb) : … { … } SincResampler::~SincResampler() = default; void SincResampler::UpdateRegions(bool second_load) { … } void SincResampler::InitializeKernel() { … } void SincResampler::SetRatio(double io_sample_rate_ratio) { … } void SincResampler::Resample(int frames, float* destination) { … } void SincResampler::PrimeWithSilence() { … } void SincResampler::Flush() { … } int SincResampler::GetMaxInputFramesRequested( int output_frames_requested) const { … } double SincResampler::BufferedFrames() const { … } int SincResampler::KernelSize() const { … } float SincResampler::Convolve_C(const int kernel_size, const float* input_ptr, const float* k1, const float* k2, double kernel_interpolation_factor) { … } #if defined(ARCH_CPU_X86_FAMILY) float SincResampler::Convolve_SSE(const int kernel_size, const float* input_ptr, const float* k1, const float* k2, double kernel_interpolation_factor) { … } __attribute__((target("avx2,fma"))) float SincResampler::Convolve_AVX2( const int kernel_size, const float* input_ptr, const float* k1, const float* k2, double kernel_interpolation_factor) { … } #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) float SincResampler::Convolve_NEON(const int kernel_size, const float* input_ptr, const float* k1, const float* k2, double kernel_interpolation_factor) { float32x4_t m_input; float32x4_t m_sums1 = vmovq_n_f32(0); float32x4_t m_sums2 = vmovq_n_f32(0); const float* upper = input_ptr + kernel_size; for (; input_ptr < upper;) { m_input = vld1q_f32(input_ptr); input_ptr += 4; m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1)); k1 += 4; m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2)); k2 += 4; } // Linearly interpolate the two "convolutions". m_sums1 = vmlaq_f32( vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)), m_sums2, vmovq_n_f32(kernel_interpolation_factor)); // Sum components together. float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1)); return vget_lane_f32(vpadd_f32(m_half, m_half), 0); } #endif } // namespace media