chromium/third_party/tflite/src/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h

/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_

#include <algorithm>
#include <type_traits>

#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
#include "tensorflow/lite/kernels/internal/types.h"

#ifdef __AVX2__
#include <immintrin.h>
#endif

namespace tflite {
namespace optimized_ops {
namespace depthwise_conv {

// Implementation of quantized DepthwiseConv

template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
struct QuantizedDepthwiseConvKernel {};
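
// Each specialization below handles one fixed (input depth, depth multiplier)
// pair; kFixedInputDepth == 0 means the input depth is a runtime value and the
// kernel loops over it. kAllowStrided indicates whether input pixels may be
// non-contiguous, in which case input_ptr advances by input_ptr_increment
// between output pixels. Run() adds input_offset/filter_offset to the raw
// uint8 values and accumulates the int32 products into acc_buffer_ptr, which
// holds (input depth * depth multiplier) accumulators per output pixel.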

#ifdef USE_NEON
template <>
struct QuantizedDepthwiseConvKernel<true, 8, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const uint8_t* filter_ptr,
                  int16_t filter_offset, int32_t* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8x2_t filter_u8;
    filter_u8.val[0] = vld1_u8(filter_ptr);
    filter_u8.val[1] = vld1_u8(filter_ptr + 8);
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])),
                            vdupq_n_s16(filter_offset));
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4x2_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
        acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
      }
      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += input_ptr_increment;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
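      // vzipq_s16(x, x) interleaves x with itself, i.e. it maps
      // {x0, x1, ..., x7} to {x0, x0, x1, x1, x2, x2, x3, x3} and
      // {x4, x4, x5, x5, x6, x6, x7, x7}, so each input value appears once
      // per depth-multiplier channel.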
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[0].val[i] = vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]),
                                  vget_low_s16(input_dup2.val[i]));
        acc[1].val[i] = vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]),
                                  vget_high_s16(input_dup2.val[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
        vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 8, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const uint8_t* filter_ptr,
                  int16_t filter_offset, int32_t* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));

    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8[2];
      for (int i = 0; i < 2; i++) {
        input_u8[i] = vld1_u8(input_ptr + 8 * i);
      }
      input_ptr += 16;
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
      }
      for (int i = 0; i < 2; i++) {
        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
      }
      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0]));
      acc[1] =
          vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0]));
      acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1]));
      acc[3] =
          vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1]));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle 1 output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[2];
      acc[0] = vld1q_s32(acc_buffer_ptr);
      acc[1] = vld1q_s32(acc_buffer_ptr + 4);

      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input));
      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input));
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc[0]);
      vst1q_s32(acc_buffer_ptr + 4, acc[1]);
      acc_buffer_ptr += 8;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 4, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const uint8_t* filter_ptr,
                  int16_t filter_offset, int32_t* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));

    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(filter),
                                   vget_low_s16(input_dup2.val[i]));
        acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(filter),
                                   vget_high_s16(input_dup2.val[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x4x2_t input_dup2 = vzip_s16(input, input);
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]);
      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 2, 8> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const uint8_t* filter_ptr,
                  int16_t filter_offset, int32_t* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
      const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
      filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
    }
    int outp = 0;
    // Handle two output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[8];
      for (int i = 0; i < 8; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Multiply-accumulate.
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
      acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2);
      acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2);
      acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3);
      acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3);
      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 8; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 32;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_ptr += 2;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);

      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 2, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const uint8_t* filter_ptr,
                  int16_t filter_offset, int32_t* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);

      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_ptr += 2;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x4_t input_dup2 = vzip_s16(input, input).val[0];
      // Multiply-accumulate
      acc = vmlal_s16(acc, filter, input_dup2);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 2, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const uint8_t* filter_ptr,
                  int16_t filter_offset, int32_t* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8[2];
      for (int i = 0; i < 2; i++) {
        input_u8[i] = vld1_u8(input_ptr + 8 * i);
      }
      input_ptr += 16;
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
      }
      for (int i = 0; i < 2; i++) {
        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
      }

      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0]));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0]));
      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1]));
      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1]));
      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input));
      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc = vmlal_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer.
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
    // Handle 1 output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer.
      int32x2_t acc = vld1_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_ptr += 2;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
      // Store the accumulators back to acc_buffer.
      vst1_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 1, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const uint8_t* filter_ptr,
                  int16_t filter_offset, int32_t* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x2_t acc = vld1_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      const uint32_t input = *input_ptr++ + input_offset;

      // Multiply-accumulate
      acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input));
      // Store the accumulators back to acc_buffer
      vst1_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 1, 4> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const uint8_t* filter_ptr,
                  int16_t filter_offset, int32_t* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[8];
      for (int i = 0; i < 8; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0);
      acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1);
      acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2);
      acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3);
      acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0);
      acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1);
      acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2);
      acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3);

      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 8; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 32;
    }
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], filter, input, 0);
      acc[1] = vmlal_lane_s16(acc[1], filter, input, 1);
      acc[2] = vmlal_lane_s16(acc[2], filter, input, 2);
      acc[3] = vmlal_lane_s16(acc[3], filter, input, 3);

      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      const uint32_t input = *input_ptr++ + input_offset;

      // Multiply-accumulate
      acc = vmlal_n_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 4, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const uint8_t* filter_ptr,
                  int16_t filter_offset, int32_t* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        const uint8x8_t input_u8 = vld1_u8(input_ptr + 8 * i);
        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
        input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      }
      input_ptr += 16;
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[2 * i + 0] =
            vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i]));
        acc[2 * i + 1] =
            vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc;
      acc = vld1q_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Multiply-accumulate
      acc = vmlal_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 4, 4> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const uint8_t* filter_ptr,
                  int16_t filter_offset, int32_t* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
      const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
      filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
    }

    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[8];
      for (int i = 0; i < 8; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]),
                              vget_low_s16(input), 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]),
                              vget_low_s16(input), 1);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]),
                              vget_low_s16(input), 2);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]),
                              vget_low_s16(input), 3);
      acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]),
                              vget_high_s16(input), 0);
      acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]),
                              vget_high_s16(input), 1);
      acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]),
                              vget_high_s16(input), 2);
      acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]),
                              vget_high_s16(input), 3);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 8; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 32;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 0, 3> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const uint8_t* filter_ptr,
                  int16_t filter_offset, int32_t* acc_buffer_ptr) {
    // We will have to duplicate bytes in a NEON register, 3-fold.
    // We will do that by register-level table-look-up using VTBL instructions.
    // Here we prepare the registers containing the table-lookup indices.
    static const uint8_t dup3_indices_array[3][8] = {{0, 0, 0, 1, 1, 1, 2, 2},
                                                     {2, 3, 3, 3, 4, 4, 4, 5},
                                                     {5, 5, 6, 6, 6, 7, 7, 7}};
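    // E.g. applying dup3_indices_array[0] as a VTBL index vector to input
    // bytes {b0, ..., b7} yields {b0, b0, b0, b1, b1, b1, b2, b2}.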
    uint8x8_t dup3_indices[3];
    for (int i = 0; i < 3; i++) {
      dup3_indices[i] = vld1_u8(dup3_indices_array[i]);
    }

    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const uint8_t* local_filter_ptr = filter_ptr;
      const uint8_t* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 8 input channels at a time.
      for (; ic <= input_depth - 8; ic += 8) {
        // Load the filters, add filter_offset.
        int16x8_t filter[3];
        uint8x8x3_t filter_u8;
        filter_u8.val[0] = vld1_u8(local_filter_ptr);
        filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
        filter_u8.val[2] = vld1_u8(local_filter_ptr + 16);
        local_filter_ptr += 24;
        for (int i = 0; i < 3; i++) {
          const int16x8_t filter_s16 =
              vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
          filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
        }
        // Load the inputs, duplicate 3-fold, add input_offset.
        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
        local_input_ptr += 8;

        uint8x8_t input_u8_dup3[3];
        for (int i = 0; i < 3; i++) {
          input_u8_dup3[i] = vtbl1_u8(input_u8, dup3_indices[i]);
        }
        int16x8_t input_dup3[3];
        for (int i = 0; i < 3; i++) {
          const int16x8_t input_s16_dup3 =
              vreinterpretq_s16_u16(vmovl_u8(input_u8_dup3[i]));
          input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset));
        }
        // Load the accumulators from acc_buffer
        int32x4x3_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
          acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
          acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16);
        }
        // Multiply-accumulate
        for (int j = 0; j < 3; j++) {
          acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]),
                                    vget_low_s16(filter[j]));
          acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]),
                                    vget_high_s16(filter[j]));
        }
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 2; i++) {
          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
          vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
          vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]);
        }
        acc_buffer_ptr += 24;
      }
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++) {
        const int16_t input_val = *local_input_ptr++ + input_offset;
        for (int i = 0; i < 3; i++) {
          const int16_t filter_val = local_filter_ptr[i] + filter_offset;
          *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
        }
        local_filter_ptr += 3;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 0, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const uint8_t* filter_ptr,
                  int16_t filter_offset, int32_t* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const uint8_t* local_filter_ptr = filter_ptr;
      const uint8_t* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 8 input channels at a time.
      for (; ic <= input_depth - 8; ic += 8) {
        // Load the filters, add filter_offset.
        int16x8_t filter[2];
        uint8x8x2_t filter_u8;
        filter_u8.val[0] = vld1_u8(local_filter_ptr);
        filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
        local_filter_ptr += 16;
        for (int i = 0; i < 2; i++) {
          const int16x8_t filter_s16 =
              vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
          filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
        }
        // Load the inputs, add input_offset, duplicate 2-fold.
        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
        local_input_ptr += 8;
        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
        const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
        const int16x8x2_t input_dup2 = vzipq_s16(input, input);
        // Load the accumulators from acc_buffer.
        int32x4x2_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
          acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
        }
        // Multiply-accumulate.
        for (int j = 0; j < 2; j++) {
          acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]),
                                    vget_low_s16(input_dup2.val[j]));
          acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]),
                                    vget_high_s16(input_dup2.val[j]));
        }
        // Store the accumulators back to acc_buffer.
        for (int i = 0; i < 2; i++) {
          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
          vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
        }
        acc_buffer_ptr += 16;
      }
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++) {
        // Load the inputs.
        const int16_t input_val = *local_input_ptr++ + input_offset;
        for (int i = 0; i < 2; i++) {
          const int16_t filter_val = local_filter_ptr[i] + filter_offset;
          *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
        }
        local_filter_ptr += 2;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 0, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const uint8_t* filter_ptr,
                  int16_t filter_offset, int32_t* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const uint8_t* local_filter_ptr = filter_ptr;
      const uint8_t* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 16 input channels at a time.
      for (; ic <= input_depth - 16; ic += 16) {
#ifdef __AVX2__
        // Load the filters, add filter_offset.
        __m128i filter_u8_0 = _mm_loadl_epi64(
            reinterpret_cast<const __m128i*>(local_filter_ptr + 8 * 0));
        __m128i filter_u8_1 = _mm_loadl_epi64(
            reinterpret_cast<const __m128i*>(local_filter_ptr + 8 * 1));
        local_filter_ptr += 16;
        __m256i filter_0 = _mm256_cvtepu8_epi32(filter_u8_0);
        __m256i filter_1 = _mm256_cvtepu8_epi32(filter_u8_1);
        __m256i filter_offset_vec = _mm256_set1_epi32(filter_offset);
        filter_0 = _mm256_add_epi32(filter_0, filter_offset_vec);
        filter_1 = _mm256_add_epi32(filter_1, filter_offset_vec);
        // Load the inputs, add input_offset.
        __m128i input_u8_0 = _mm_loadl_epi64(
            reinterpret_cast<const __m128i*>(local_input_ptr + 8 * 0));
        __m128i input_u8_1 = _mm_loadl_epi64(
            reinterpret_cast<const __m128i*>(local_input_ptr + 8 * 1));
        local_input_ptr += 16;
        __m256i input_0 = _mm256_cvtepu8_epi32(input_u8_0);
        __m256i input_1 = _mm256_cvtepu8_epi32(input_u8_1);
        __m256i input_offset_vec = _mm256_set1_epi32(input_offset);
        input_0 = _mm256_add_epi32(input_0, input_offset_vec);
        input_1 = _mm256_add_epi32(input_1, input_offset_vec);
        // Load the accumulators from acc_buffer
        __m256i acc_0 = _mm256_loadu_si256(
            reinterpret_cast<const __m256i*>(acc_buffer_ptr + 8 * 0));
        __m256i acc_1 = _mm256_loadu_si256(
            reinterpret_cast<const __m256i*>(acc_buffer_ptr + 8 * 1));
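        // Multiply-accumulate.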
        acc_0 = _mm256_add_epi32(acc_0, _mm256_mullo_epi32(input_0, filter_0));
        acc_1 = _mm256_add_epi32(acc_1, _mm256_mullo_epi32(input_1, filter_1));
        // Store the accumulators back to acc_buffer
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(acc_buffer_ptr + 8 * 0),
                            acc_0);
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(acc_buffer_ptr + 8 * 1),
                            acc_1);
        acc_buffer_ptr += 16;
#else
        // Load the filters, add filter_offset.
        uint8x8_t filter_u8_0 = vld1_u8(local_filter_ptr + 8 * 0);
        uint8x8_t filter_u8_1 = vld1_u8(local_filter_ptr + 8 * 1);
        local_filter_ptr += 16;
        int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
        int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
        filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
        filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
        // Load the inputs, add input_offset.
        uint8x8_t input_u8_0 = vld1_u8(local_input_ptr + 8 * 0);
        uint8x8_t input_u8_1 = vld1_u8(local_input_ptr + 8 * 1);
        local_input_ptr += 16;
        int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
        int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
        input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
        input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
        // Load the accumulators from acc_buffer
        int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
        int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
        int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
        int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
        acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), vget_low_s16(filter_0));
        acc_1 =
            vmlal_s16(acc_1, vget_high_s16(input_0), vget_high_s16(filter_0));
        acc_2 = vmlal_s16(acc_2, vget_low_s16(input_1), vget_low_s16(filter_1));
        acc_3 =
            vmlal_s16(acc_3, vget_high_s16(input_1), vget_high_s16(filter_1));
        // Store the accumulators back to acc_buffer
        vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
        vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
        vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
        vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
        acc_buffer_ptr += 16;
#endif
      }
      // Handle 8 input channels at a time.
      for (; ic <= input_depth - 8; ic += 8) {
        // Load the filters, add filter_offset.
        const uint8x8_t filter_u8 = vld1_u8(local_filter_ptr);
        local_filter_ptr += 8;
        const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
        const int16x8_t filter =
            vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
        // Load the inputs, add input_offset.
        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
        local_input_ptr += 8;
        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
        const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
        // Load the accumulators from acc_buffer
        int32x4_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate
        acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
        acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 2; i++) {
          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 8;
      }
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++) {
        const int16_t input_val = *local_input_ptr++ + input_offset;
        const int16_t filter_val = *local_filter_ptr++ + filter_offset;
        *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 16, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const uint8_t* filter_ptr,
                  int16_t filter_offset, int32_t* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8[2];
    for (int i = 0; i < 2; i++) {
      filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
    }
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
    }
    for (int i = 0; i < 2; i++) {
      filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs, add input_offset.
      uint8x8_t input_u8[2];
      for (int i = 0; i < 2; i++) {
        input_u8[i] = vld1_u8(input_ptr + 8 * i);
      }
      input_ptr += input_ptr_increment;
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
      }
      for (int i = 0; i < 2; i++) {
        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
      }
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]),
                                   vget_low_s16(filter[i]));
        acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]),
                                   vget_high_s16(filter[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 8, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const uint8_t* filter_ptr,
                  int16_t filter_offset, int32_t* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Load the accumulators from acc_buffer
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
      acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 1, 16> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const uint8_t* filter_ptr,
                  int16_t filter_offset, int32_t* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8[2];
    for (int i = 0; i < 2; i++) {
      filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
    }
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
    }
    for (int i = 0; i < 2; i++) {
      filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      uint8_t input_u8 = *input_ptr;
      input_ptr += input_ptr_increment;
      int16_t input = static_cast<int16_t>(input_u8 + input_offset);
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
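      // (vmlal_n_s16 multiplies every lane of the filter half by the scalar
      // input value.)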
      for (int i = 0; i < 2; i++) {
        acc[2 * i + 0] =
            vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input);
        acc[2 * i + 1] =
            vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 1, 32> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const uint8_t* filter_ptr,
                  int16_t filter_offset, int32_t* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
    uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
    uint8x8_t filter_u8_2 = vld1_u8(filter_ptr + 8 * 2);
    uint8x8_t filter_u8_3 = vld1_u8(filter_ptr + 8 * 3);
    int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
    int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
    int16x8_t filter_2 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_2));
    int16x8_t filter_3 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_3));
    filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
    filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
    filter_2 = vaddq_s16(filter_2, vdupq_n_s16(filter_offset));
    filter_3 = vaddq_s16(filter_3, vdupq_n_s16(filter_offset));
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      uint8_t input_u8 = *input_ptr;
      input_ptr += input_ptr_increment;
      int16_t input = static_cast<int16_t>(input_u8 + input_offset);
      // Load the accumulators from acc_buffer
      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
      int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
      int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
      int32x4_t acc_5 = vld1q_s32(acc_buffer_ptr + 4 * 5);
      int32x4_t acc_6 = vld1q_s32(acc_buffer_ptr + 4 * 6);
      int32x4_t acc_7 = vld1q_s32(acc_buffer_ptr + 4 * 7);
      // Multiply-accumulate
      acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
      acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
      acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
      acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
      acc_4 = vmlal_n_s16(acc_4, vget_low_s16(filter_2), input);
      acc_5 = vmlal_n_s16(acc_5, vget_high_s16(filter_2), input);
      acc_6 = vmlal_n_s16(acc_6, vget_low_s16(filter_3), input);
      acc_7 = vmlal_n_s16(acc_7, vget_high_s16(filter_3), input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
      vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
      vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
      vst1q_s32(acc_buffer_ptr + 4 * 5, acc_5);
      vst1q_s32(acc_buffer_ptr + 4 * 6, acc_6);
      vst1q_s32(acc_buffer_ptr + 4 * 7, acc_7);
      acc_buffer_ptr += 32;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 1, 20> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const uint8_t* filter_ptr,
                  int16_t filter_offset, int32_t* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    // NEON wants to load 8 bytes at a time, but 20 is not divisible by 8.
    // We load the first 16 bytes into filter_u8_{0,1} as usual.
    // Then we load the last 8 bytes into filter_u8_x (x for 'extra').
    // This is redundant: the first 4 bytes of filter_u8_x are the same
    // as the last 4 bytes of filter_u8_1.
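    // Only the high half of filter_x (filter bytes 16..19) is used below.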
    uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
    uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
    uint8x8_t filter_u8_x = vld1_u8(filter_ptr + 8 * 1 + 4);
    int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
    int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
    int16x8_t filter_x = vreinterpretq_s16_u16(vmovl_u8(filter_u8_x));
    filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
    filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
    filter_x = vaddq_s16(filter_x, vdupq_n_s16(filter_offset));
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      uint8_t input_u8 = *input_ptr;
      input_ptr += input_ptr_increment;
      int16_t input = static_cast<int16_t>(input_u8 + input_offset);
      // Load the accumulators from acc_buffer
      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
      int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
      int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
      // Multiply-accumulate
      acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
      acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
      acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
      acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
      acc_4 = vmlal_n_s16(acc_4, vget_high_s16(filter_x), input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
      vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
      vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
      acc_buffer_ptr += 20;
    }
  }
};
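
// Illustrative only: a plain scalar sketch of the per-output-pixel update that
// the <true, 1, 20> kernel above performs with NEON. With input depth 1 and
// depth multiplier 20, a single input value fans out to all 20 output channels
// of that pixel. The function name ScalarDepthMultiplier20Sketch is
// hypothetical and is not used by the kernels in this file.
inline void ScalarDepthMultiplier20Sketch(uint8_t input_u8,
                                          int16_t input_offset,
                                          const uint8_t* filter_ptr,
                                          int16_t filter_offset,
                                          int32_t* acc_buffer_ptr) {
  const int32_t input = static_cast<int32_t>(input_u8) + input_offset;
  for (int c = 0; c < 20; c++) {
    // Same accumulation as the vmlal_n_s16 lines above, one channel at a time.
    acc_buffer_ptr[c] +=
        (static_cast<int32_t>(filter_ptr[c]) + filter_offset) * input;
  }
}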

template <>
struct QuantizedDepthwiseConvKernel<true, 1, 8> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const uint8_t* filter_ptr,
                  int16_t filter_offset, int32_t* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
    const int16x8_t filter = vaddq_s16(
        vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset));
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      uint8_t input_u8 = *input_ptr;
      input_ptr += input_ptr_increment;
      int16_t input = static_cast<int16_t>(input_u8 + input_offset);
      // Load the accumulators from acc_buffer
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input);
      acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 2, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const uint8_t* filter_ptr,
                  int16_t filter_offset, int32_t* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;

    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
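      // Each input pixel has 2 uint8 channels here, so a single 16-bit load
      // (reinterpreting the pointer as uint16_t) picks up both channels of one
      // pixel at once.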
      uint16x4_t input_u16 = vdup_n_u16(0);
      input_u16 = vset_lane_u16(
          (reinterpret_cast<const uint16_t*>(input_ptr))[0], input_u16, 0);
      input_ptr += input_ptr_increment;
      input_u16 = vset_lane_u16(
          (reinterpret_cast<const uint16_t*>(input_ptr))[0], input_u16, 1);
      input_ptr += input_ptr_increment;
      const int16x4_t input_s16 = vreinterpret_s16_u16(
          vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16))));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc = vmlal_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer.
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }

    // Handle 1 output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer.
      int32x2_t acc = vld1_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_ptr += input_ptr_increment;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
      // Store the accumulators back to acc_buffer.
      vst1_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 4, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const uint8_t* filter_ptr,
                  int16_t filter_offset, int32_t* acc_buffer_ptr) {
    if (num_output_pixels <= 0) {
      return;
    }

    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;

    // Handle one output pixel at a time until the second-to-last pixel. We
    // stop there because the vector load below reads eight input bytes while
    // only four are used, so on the last pixel it would read past the end of
    // the input.
    for (; outp < num_output_pixels - 1; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc;
      acc = vld1q_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += input_ptr_increment;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Multiply-accumulate
      acc = vmlal_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }

    // Handle the last output pixel.
    // Load the accumulators from acc_buffer
    int32x4_t acc;
    acc = vld1q_s32(acc_buffer_ptr);

    // Load the inputs, add input_offset.
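    // Load lane by lane rather than with vld1_u8 so this final pixel does not
    // read past the end of the input buffer.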
    uint8x8_t input_u8 = vdup_n_u8(0);
    input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
    input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
    input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
    input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
    const int16x4_t input_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
    const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
    // Multiply-accumulate
    acc = vmlal_s16(acc, filter, input);
    // Store the accumulators back to acc_buffer
    vst1q_s32(acc_buffer_ptr, acc);
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 12, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const uint8_t* filter_ptr,
                  int16_t filter_offset, int32_t* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
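    // 12 is not divisible by 8, so the second load starts at offset 4 and
    // overlaps the first by 4 bytes; only its high half (filter values 8..11)
    // is used below, as filter_2.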
    uint8x8_t filter_u8_0 = vld1_u8(filter_ptr);
    uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 4);
    int16x8_t filter_s16_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
    int16x8_t filter_s16_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
    filter_s16_0 = vaddq_s16(filter_s16_0, vdupq_n_s16(filter_offset));
    filter_s16_1 = vaddq_s16(filter_s16_1, vdupq_n_s16(filter_offset));
    int16x4_t filter_0 = vget_low_s16(filter_s16_0);
    int16x4_t filter_1 = vget_high_s16(filter_s16_0);
    int16x4_t filter_2 = vget_high_s16(filter_s16_1);

    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs, add input_offset.
      uint8x8_t input_u8_0 = vld1_u8(input_ptr);
      uint8x8_t input_u8_1 = vld1_u8(input_ptr + 4);
      input_ptr += input_ptr_increment;
      int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
      int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
      input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
      input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));

      // Load the accumulators from acc_buffer
      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);

      // Multiply-accumulate
      acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), filter_0);
      acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), filter_1);
      acc_2 = vmlal_s16(acc_2, vget_high_s16(input_1), filter_2);

      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);

      acc_buffer_ptr += 12;
    }
  }
};
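
// Illustrative only: a scalar sketch of the per-output-pixel update performed
// by the <false, 12, 1> kernel above. With depth_multiplier == 1, each of the
// 12 input channels is multiplied by its own filter value. The function name
// ScalarInputDepth12Sketch is hypothetical and is not used by the kernels in
// this file.
inline void ScalarInputDepth12Sketch(const uint8_t* input_ptr,
                                     int16_t input_offset,
                                     const uint8_t* filter_ptr,
                                     int16_t filter_offset,
                                     int32_t* acc_buffer_ptr) {
  for (int c = 0; c < 12; c++) {
    // Same accumulation as the three vmlal_s16 lines above, channel by channel.
    acc_buffer_ptr[c] +=
        (static_cast<int32_t>(filter_ptr[c]) + filter_offset) *
        (static_cast<int32_t>(input_ptr[c]) + input_offset);
  }
}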
#endif

// Accumulates the effect of one row of the filter over a segment of one row
// of the output, reading the corresponding row of the input.
template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
void QuantizedDepthwiseConvAccumRow(
    int stride, int dilation_factor, int input_depth, int input_width,
    const uint8_t* input_data, int16_t input_offset, int pad_width,
    int depth_multiplier, int filter_width, const uint8_t* filter_data,
    int16_t filter_offset, int out_x_buffer_start, int out_x_buffer_end,
    int output_depth, int32_t* acc_buffer) {}

// Generic fallback for QuantizedDepthwiseConvAccumRow: portable and
// non-templatized.
inline void QuantizedDepthwiseConvAccumRowGeneric(
    int stride, int dilation_factor, int input_depth, int input_width,
    const uint8_t* input_data, int16_t input_offset, int pad_width,
    int depth_multiplier, int filter_width, const uint8_t* filter_data,
    int16_t filter_offset, int out_x_buffer_start, int out_x_buffer_end,
    int output_depth, int32_t* acc_buffer) {}
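
// Illustrative only: a plain scalar sketch of the accumulation that both
// routines above perform conceptually. It walks one filter row over a segment
// of one output row, accumulating into acc_buffer laid out as
// [out_x - out_x_buffer_start][output_channel]. The function name
// QuantizedDepthwiseConvAccumRowSketch is hypothetical; the sketch omits the
// template specializations and NEON vectorization of the real kernels.
inline void QuantizedDepthwiseConvAccumRowSketch(
    int stride, int dilation_factor, int input_depth, int input_width,
    const uint8_t* input_data, int16_t input_offset, int pad_width,
    int depth_multiplier, int filter_width, const uint8_t* filter_data,
    int16_t filter_offset, int out_x_buffer_start, int out_x_buffer_end,
    int output_depth, int32_t* acc_buffer) {
  for (int out_x = out_x_buffer_start; out_x < out_x_buffer_end; out_x++) {
    int32_t* acc_ptr =
        acc_buffer + (out_x - out_x_buffer_start) * output_depth;
    for (int filter_x = 0; filter_x < filter_width; filter_x++) {
      const int in_x = out_x * stride - pad_width + filter_x * dilation_factor;
      if (in_x < 0 || in_x >= input_width) {
        // Outside the image: zero padding contributes nothing.
        continue;
      }
      const uint8_t* input_ptr = input_data + in_x * input_depth;
      const uint8_t* filter_ptr = filter_data + filter_x * output_depth;
      for (int ic = 0; ic < input_depth; ic++) {
        const int32_t input_val =
            static_cast<int32_t>(input_ptr[ic]) + input_offset;
        for (int m = 0; m < depth_multiplier; m++) {
          const int oc = ic * depth_multiplier + m;
          const int32_t filter_val =
              static_cast<int32_t>(filter_ptr[oc]) + filter_offset;
          acc_ptr[oc] += filter_val * input_val;
        }
      }
    }
  }
}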

// Initializes the accumulator buffer with bias values.
inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
                                       const int32_t* bias_data,
                                       int32_t* acc_buffer) {}
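
// Illustrative only: a minimal sketch of what initializing the accumulator
// buffer means, assuming the same [pixel][channel] layout used by the kernels
// above: replicate the per-channel bias across every output pixel. The
// function name InitAccBufferSketch is hypothetical and is not used elsewhere
// in this file.
inline void InitAccBufferSketch(int num_output_pixels, int output_depth,
                                const int32_t* bias_data,
                                int32_t* acc_buffer) {
  for (int p = 0; p < num_output_pixels; p++) {
    for (int c = 0; c < output_depth; c++) {
      acc_buffer[p * output_depth + c] = bias_data[c];
    }
  }
}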

inline void DepthwiseConvGeneral(
    const DepthwiseParams& params, const RuntimeShape& input_shape,
    const uint8_t* input_data, const RuntimeShape& filter_shape,
    const uint8_t* filter_data, const RuntimeShape& bias_shape,
    const int32_t* bias_data, const RuntimeShape& output_shape,
    uint8_t* output_data, int thread_start, int thread_end, int thread_dim) {}

}  // namespace depthwise_conv

template <DepthwiseConvOutputRounding kOutputRounding>
inline void DepthwiseConvWithRounding(
    const DepthwiseParams& params, const RuntimeShape& input_shape,
    const uint8_t* input_data, const RuntimeShape& filter_shape,
    const uint8_t* filter_data, const RuntimeShape& bias_shape,
    const int32_t* bias_data, const RuntimeShape& output_shape,
    uint8_t* output_data, const CpuFlags& cpu_flags, int thread_start,
    int thread_end, int thread_dim) {}

inline void DepthwiseConvImpl(
    const DepthwiseParams& params, const RuntimeShape& input_shape,
    const uint8_t* input_data, const RuntimeShape& filter_shape,
    const uint8_t* filter_data, const RuntimeShape& bias_shape,
    const int32_t* bias_data, const RuntimeShape& output_shape,
    uint8_t* output_data, const CpuFlags& cpu_flags, int thread_start,
    int thread_end, int thread_dim) {}

}  // namespace optimized_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_