chromium/third_party/blink/renderer/modules/webaudio/cpu/arm/oscillator_kernel_neon.cc

// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifdef UNSAFE_BUFFERS_BUILD
// TODO(crbug.com/351564777): Remove this and convert code to safer constructs.
#pragma allow_unsafe_buffers
#endif

#include "build/build_config.h"
#include "third_party/blink/renderer/modules/webaudio/oscillator_handler.h"
#include "third_party/blink/renderer/modules/webaudio/periodic_wave.h"

#if defined(CPU_ARM_NEON)
#include <arm_neon.h>
#endif

namespace blink {

#if defined(CPU_ARM_NEON)
namespace {

float32x4_t WrapVirtualIndexVector(float32x4_t x,
                                   float32x4_t wave_size,
                                   float32x4_t inv_wave_size) {
  // r = x/wave_size, f = truncate(r), truncating towards 0
  const float32x4_t r = vmulq_f32(x, inv_wave_size);
  int32x4_t f = vcvtq_s32_f32(r);

  // vcltq_f32 returns returns all 0xfffffff (-1) if a < b and if if not.
  const uint32x4_t cmp = vcltq_f32(r, vcvtq_f32_s32(f));
  f = vaddq_s32(f, vreinterpretq_s32_u32(cmp));

  return vsubq_f32(x, vmulq_f32(vcvtq_f32_s32(f), wave_size));
}

ALWAYS_INLINE double WrapVirtualIndex(double virtual_index,
                                      unsigned periodic_wave_size,
                                      double inv_periodic_wave_size) {
  return virtual_index -
         floor(virtual_index * inv_periodic_wave_size) * periodic_wave_size;
}

}  // namespace

std::tuple<int, double> OscillatorHandler::ProcessKRateVector(
    int n,
    float* dest_p,
    double virtual_read_index,
    float frequency,
    float rate_scale) const {
  const unsigned periodic_wave_size = periodic_wave_->PeriodicWaveSize();
  const double inv_periodic_wave_size = 1.0 / periodic_wave_size;

  float* higher_wave_data = nullptr;
  float* lower_wave_data = nullptr;
  float table_interpolation_factor = 0;
  const float incr = frequency * rate_scale;
  DCHECK_GE(incr, kInterpolate2Point);

  periodic_wave_->WaveDataForFundamentalFrequency(
      frequency, lower_wave_data, higher_wave_data, table_interpolation_factor);

  const float32x4_t v_wave_size = vdupq_n_f32(periodic_wave_size);
  const float32x4_t v_inv_wave_size = vdupq_n_f32(1.0f / periodic_wave_size);

  const uint32x4_t v_read_mask = vdupq_n_u32(periodic_wave_size - 1);
  const uint32x4_t v_one = vdupq_n_u32(1);

  const float32x4_t v_table_factor = vdupq_n_f32(table_interpolation_factor);

  const float32x4_t v_incr = vdupq_n_f32(4 * incr);

  float virtual_read_index_flt = virtual_read_index;
  float32x4_t v_virt_index = {
      virtual_read_index_flt + 0 * incr, virtual_read_index_flt + 1 * incr,
      virtual_read_index_flt + 2 * incr, virtual_read_index_flt + 3 * incr};

  // Temporary arrsys to hold the read indices so we can access them
  // individually to get the samples needed for interpolation.
  uint32_t r0[4] __attribute__((aligned(16)));
  uint32_t r1[4] __attribute__((aligned(16)));

  // Temporary arrays where we can gather up the wave data we need for
  // interpolation.  Align these for best efficiency on older CPUs where aligned
  // access is much faster than unaliged.  TODO(rtoy): Is there a faster way to
  // do this?
  float sample1_lower[4] __attribute__((aligned(16)));
  float sample2_lower[4] __attribute__((aligned(16)));
  float sample1_higher[4] __attribute__((aligned(16)));
  float sample2_higher[4] __attribute__((aligned(16)));

  // It's possible that adding the incr above exceeded the bounds, so wrap them
  // if needed.
  v_virt_index =
      WrapVirtualIndexVector(v_virt_index, v_wave_size, v_inv_wave_size);

  int k = 0;
  int n_loops = n / 4;

  for (int loop = 0; loop < n_loops; ++loop, k += 4) {
    // Compute indices for the samples and contain within the valid range.
    const uint32x4_t read_index_0 =
        vandq_u32(vcvtq_u32_f32(v_virt_index), v_read_mask);
    const uint32x4_t read_index_1 =
        vandq_u32(vaddq_u32(read_index_0, v_one), v_read_mask);

    // Extract the components of the indices so we can get the samples
    // associated with the lower and higher wave data.
    vst1q_u32(r0, read_index_0);
    vst1q_u32(r1, read_index_1);

    for (int m = 0; m < 4; ++m) {
      sample1_lower[m] = lower_wave_data[r0[m]];
      sample2_lower[m] = lower_wave_data[r1[m]];
      sample1_higher[m] = higher_wave_data[r0[m]];
      sample2_higher[m] = higher_wave_data[r1[m]];
    }

    const float32x4_t s1_low = vld1q_f32(sample1_lower);
    const float32x4_t s2_low = vld1q_f32(sample2_lower);
    const float32x4_t s1_high = vld1q_f32(sample1_higher);
    const float32x4_t s2_high = vld1q_f32(sample2_higher);

    const float32x4_t interpolation_factor =
        vsubq_f32(v_virt_index, vcvtq_f32_u32(read_index_0));
    const float32x4_t sample_higher = vaddq_f32(
        s1_high, vmulq_f32(interpolation_factor, vsubq_f32(s2_high, s1_high)));
    const float32x4_t sample_lower = vaddq_f32(
        s1_low, vmulq_f32(interpolation_factor, vsubq_f32(s2_low, s1_low)));
    const float32x4_t sample = vaddq_f32(
        sample_higher,
        vmulq_f32(v_table_factor, vsubq_f32(sample_lower, sample_higher)));

    vst1q_f32(dest_p + k, sample);

    // Increment virtual read index and wrap virtualReadIndex into the range
    // 0 -> periodicWaveSize.
    v_virt_index = vaddq_f32(v_virt_index, v_incr);
    v_virt_index =
        WrapVirtualIndexVector(v_virt_index, v_wave_size, v_inv_wave_size);
  }

  // There's a bit of round-off above, so update the index more accurately so at
  // least the next render starts over with a more accurate value.
  virtual_read_index += k * incr;
  virtual_read_index -=
      std::floor(virtual_read_index * inv_periodic_wave_size) *
      periodic_wave_size;

  return std::make_tuple(k, virtual_read_index);
}

double OscillatorHandler::ProcessARateVectorKernel(
    float* destination,
    double virtual_read_index,
    const float* phase_increments,
    unsigned periodic_wave_size,
    const float* const lower_wave_data[4],
    const float* const higher_wave_data[4],
    const float table_interpolation_factor[4]) const {
  // See the scalar version in oscillator_node.cc for the basic algorithm.
  double inv_periodic_wave_size = 1.0 / periodic_wave_size;
  unsigned read_index_mask = periodic_wave_size - 1;

  // Accumulate the phase increments so we can set up the virtual read index
  // vector appropriately.  This must be a double to preserve accuracy and
  // to match the scalar version.
  double incr_sum[4];
  incr_sum[0] = phase_increments[0];
  for (int m = 1; m < 4; ++m) {
    incr_sum[m] = incr_sum[m - 1] + phase_increments[m];
  }

  // It's really important for accuracy that we use doubles instead of
  // floats for the virtual_read_index.  Without this, we can only get some
  // 30-50 dB in the sweep tests instead of 100+ dB.
  //
  // Arm NEON doesn't have float64x2_t so we have to do this.  (Aarch64 has
  // float64x2_t.)
  double virt_index[4];
  virt_index[0] = virtual_read_index;
  virt_index[1] = WrapVirtualIndex(virtual_read_index + incr_sum[0],
                                   periodic_wave_size, inv_periodic_wave_size);
  virt_index[2] = WrapVirtualIndex(virtual_read_index + incr_sum[1],
                                   periodic_wave_size, inv_periodic_wave_size);
  virt_index[3] = WrapVirtualIndex(virtual_read_index + incr_sum[2],
                                   periodic_wave_size, inv_periodic_wave_size);

  // The virtual indices we're working with now.
  const float32x4_t v_virt_index = {
      static_cast<float>(virt_index[0]), static_cast<float>(virt_index[1]),
      static_cast<float>(virt_index[2]), static_cast<float>(virt_index[3])};

  // Convert virtual index to actual index into wave data, wrap the index
  // around if needed.
  const uint32x4_t v_read0 =
      vandq_u32(vcvtq_u32_f32(v_virt_index), vdupq_n_u32(read_index_mask));

  // v_read1 = v_read0 + 1, but wrap the index around, if needed.
  const uint32x4_t v_read1 = vandq_u32(vaddq_u32(v_read0, vdupq_n_u32(1)),
                                       vdupq_n_u32(read_index_mask));

  float sample1_lower[4] __attribute__((aligned(16)));
  float sample2_lower[4] __attribute__((aligned(16)));
  float sample1_higher[4] __attribute__((aligned(16)));
  float sample2_higher[4] __attribute__((aligned(16)));

  uint32_t read0[4] __attribute__((aligned(16)));
  uint32_t read1[4] __attribute__((aligned(16)));

  vst1q_u32(read0, v_read0);
  vst1q_u32(read1, v_read1);

  // Read the samples from the wave tables
  for (int m = 0; m < 4; ++m) {
    DCHECK_LT(read0[m], periodic_wave_size);
    DCHECK_LT(read1[m], periodic_wave_size);

    sample1_lower[m] = lower_wave_data[m][read0[m]];
    sample2_lower[m] = lower_wave_data[m][read1[m]];
    sample1_higher[m] = higher_wave_data[m][read0[m]];
    sample2_higher[m] = higher_wave_data[m][read1[m]];
  }

  // Compute factor for linear interpolation within a wave table.
  const float32x4_t v_factor = vsubq_f32(v_virt_index, vcvtq_f32_u32(v_read0));

  // Linearly interpolate between samples from the higher wave table.
  const float32x4_t sample_higher = vmlaq_f32(
      vld1q_f32(sample1_higher), v_factor,
      vsubq_f32(vld1q_f32(sample2_higher), vld1q_f32(sample1_higher)));

  // Linearly interpolate between samples from the lower wave table.
  const float32x4_t sample_lower =
      vmlaq_f32(vld1q_f32(sample1_lower), v_factor,
                vsubq_f32(vld1q_f32(sample2_lower), vld1q_f32(sample1_lower)));

  // Linearly interpolate between wave tables to get the desired
  // output samples.
  const float32x4_t sample =
      vmlaq_f32(sample_higher, vld1q_f32(table_interpolation_factor),
                vsubq_f32(sample_lower, sample_higher));

  vst1q_f32(destination, sample);

  // Update the virtual_read_index appropriately and return it for the
  // next call.
  virtual_read_index =
      WrapVirtualIndex(virtual_read_index + incr_sum[3], periodic_wave_size,
                       inv_periodic_wave_size);

  return virtual_read_index;
}
#endif

}  // namespace blink