sinc_resampler.cc | Explore in Territory

// Copyright 2012 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_
// and r4_ will move after the first load):
//
// |----------------|-----------------------------------------|----------------|
//
//                                        request_frames_
//                   <--------------------------------------------------------->
//                                    r0_ (during first load)
//
//  kernel_size_ / 2  kernel_size_ / 2        kernel_size_ / 2  kernel_size_ / 2
// <---------------> <--------------->       <---------------> <--------------->
//        r1_               r2_                     r3_               r4_
//
//                             block_size_ == r4_ - r2_
//                   <--------------------------------------->
//
//                                                  request_frames_
//                                    <------------------ ... ----------------->
//                                               r0_ (during second load)
//
// On the second request r0_ slides to the right by kernel_size_ / 2 and r3_,
// r4_ and block_size_ are reinitialized via step (3) in the algorithm below.
//
// These new regions remain constant until a Flush() occurs.  While complicated,
// this allows us to reduce jitter by always requesting the same amount from the
// provided callback.
//
// The algorithm:
//
// 1) Allocate input_buffer_ of size: request_frames_ + kernel_size_; this
//    ensures there's enough room to read request_frames_ from the callback
//    into region r0_ (which will move between the first and subsequent
//    passes).
//
// 2) Let r1_, r2_ each represent half the kernel centered around r0_:
//
//        r0_ = input_buffer_ + kernel_size_ / 2
//        r1_ = input_buffer_
//        r2_ = r0_
//
//    r0_ is always request_frames_ in size.  r1_, r2_ are kernel_size_ / 2 in
//    size.  r1_ must be zero initialized to avoid convolution with garbage (see
//    step (5) for why).
//
// 3) Let r3_, r4_ each represent half the kernel right aligned with the end of
//    r0_ and choose block_size_ as the distance in frames between r4_ and r2_:
//
//        r3_ = r0_ + request_frames_ - kernel_size_
//        r4_ = r0_ + request_frames_ - kernel_size_ / 2
//        block_size_ = r4_ - r2_
//                    = request_frames_ - kernel_size_ / 2  (first load)
//                    = request_frames_                     (later loads)
//
// 4) Consume request_frames_ frames into r0_.
//
// 5) Position kernel centered at start of r2_ and generate output frames until
//    the kernel is centered at the start of r4_ or we've finished generating
//    all the output frames.
//
// 6) Wrap leftover data from r3_ to r1_ and from r4_ to r2_.
//
// 7) If we're on the second load, in order to avoid overwriting the frames we
//    just wrapped from r4_ we need to slide r0_ to the right by the size of
//    r4_, which is kernel_size_ / 2:
//
//        r0_ = r0_ + kernel_size_ / 2 = input_buffer_ + kernel_size_
//
//    r3_, r4_, and block_size_ then need to be reinitialized, so goto (3).
//
// 8) Else, if we're not on the second load, goto (4).
//
// Note: we're glossing over how the sub-sample handling works with
// |virtual_source_idx_|, etc.

#ifdef UNSAFE_BUFFERS_BUILD
// TODO(crbug.com/40285824): Remove this and convert code to safer constructs.
#pragma allow_unsafe_buffers
#endif

#include "media/base/sinc_resampler.h"

#include <limits>
#include <numbers>

#include "base/check_op.h"
#include "base/cpu.h"
#include "base/trace_event/trace_event.h"
#include "build/build_config.h"
#include "cc/base/math_util.h"

#if defined(ARCH_CPU_X86_FAMILY)
#include <immintrin.h>
// Including these headers directly should generally be avoided. Since
// Chrome is compiled with -msse3 (the minimal requirement), we include the
// headers directly to make the intrinsics available.
#include <avx2intrin.h>
#include <avxintrin.h>
#include <fmaintrin.h>
#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
#include <arm_neon.h>
#endif

namespace media {

static double SincScaleFactor(double io_ratio, int kernel_size) { … }

// If we know the minimum architecture at compile time, avoid CPU detection.
void SincResampler::InitializeCPUSpecificFeatures() { … }

static int CalculateChunkSize(int block_size_, double io_ratio) { … }

// Static
int SincResampler::KernelSizeFromRequestFrames(int request_frames) { … }

SincResampler::SincResampler(double io_sample_rate_ratio,
                             int request_frames,
                             const ReadCB read_cb)
    : … { … }

SincResampler::~SincResampler() = default;

void SincResampler::UpdateRegions(bool second_load) { … }

void SincResampler::InitializeKernel() { … }

void SincResampler::SetRatio(double io_sample_rate_ratio) { … }

void SincResampler::Resample(int frames, float* destination) { … }

void SincResampler::PrimeWithSilence() { … }

void SincResampler::Flush() { … }

int SincResampler::GetMaxInputFramesRequested(
    int output_frames_requested) const { … }

double SincResampler::BufferedFrames() const { … }

int SincResampler::KernelSize() const { … }

float SincResampler::Convolve_C(const int kernel_size,
                                const float* input_ptr,
                                const float* k1,
                                const float* k2,
                                double kernel_interpolation_factor) { … }

#if defined(ARCH_CPU_X86_FAMILY)
float SincResampler::Convolve_SSE(const int kernel_size,
                                  const float* input_ptr,
                                  const float* k1,
                                  const float* k2,
                                  double kernel_interpolation_factor) { … }

__attribute__((target("avx2,fma"))) float SincResampler::Convolve_AVX2(
    const int kernel_size,
    const float* input_ptr,
    const float* k1,
    const float* k2,
    double kernel_interpolation_factor) { … }
#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
// NEON variant of the convolution: multiplies the input samples starting at
// |input_ptr| against the two kernels |k1| and |k2| in parallel, then returns
// the linear blend of the two sums weighted by |kernel_interpolation_factor|
// (0 selects the |k1| result, 1 selects the |k2| result).
//
// NOTE(review): data is consumed in whole groups of four floats, so this
// presumably relies on |kernel_size| being a multiple of 4 -- confirm against
// the caller.
float SincResampler::Convolve_NEON(const int kernel_size,
                                   const float* input_ptr,
                                   const float* k1,
                                   const float* k2,
                                   double kernel_interpolation_factor) {
  // Four-lane running sums of input * kernel, one accumulator per kernel.
  float32x4_t acc_k1 = vdupq_n_f32(0);
  float32x4_t acc_k2 = vdupq_n_f32(0);

  // Each pass loads four input samples once and reuses them for both kernels.
  const float* const input_end = input_ptr + kernel_size;
  while (input_ptr < input_end) {
    const float32x4_t samples = vld1q_f32(input_ptr);
    acc_k1 = vmlaq_f32(acc_k1, samples, vld1q_f32(k1));
    acc_k2 = vmlaq_f32(acc_k2, samples, vld1q_f32(k2));
    input_ptr += 4;
    k1 += 4;
    k2 += 4;
  }

  // Lane-wise linear interpolation between the two accumulators:
  //   acc_k1 * (1 - factor) + acc_k2 * factor
  const float32x4_t weight_k1 = vdupq_n_f32(1.0 - kernel_interpolation_factor);
  const float32x4_t weight_k2 = vdupq_n_f32(kernel_interpolation_factor);
  const float32x4_t blended =
      vmlaq_f32(vmulq_f32(acc_k1, weight_k1), acc_k2, weight_k2);

  // Horizontal sum: fold the upper two lanes onto the lower two, then add the
  // remaining pair together and read the result out of lane 0.
  const float32x2_t folded =
      vadd_f32(vget_high_f32(blended), vget_low_f32(blended));
  const float32x2_t total = vpadd_f32(folded, folded);
  return vget_lane_f32(total, 0);
}
#endif

}  // namespace media
chromium/media/base/sinc_resampler.cc