chromium/third_party/tflite/src/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h

/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_H_

#include <string.h>

#include <algorithm>
#include <vector>

#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/kernels/cpu_backend_context.h"
#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h"
#include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_3x3_filter.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {
namespace optimized_integer_ops {
namespace depthwise_conv {

// Implementation of quantized (int8, per-channel) DepthwiseConv.

template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
struct QuantizedDepthwiseConvKernel {};
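
// Each specialization below implements the same contract: Run() accumulates
// one segment of multiply-adds into acc_buffer_ptr, which holds
// input_depth * depth_multiplier int32 accumulators per output pixel. As an
// orientation aid, a minimal scalar sketch of that contract (hedged: for the
// non-strided kernels, input_ptr_increment equals input_depth and the input
// is read contiguously):
//
//   for (int outp = 0; outp < num_output_pixels; ++outp) {
//     const int8_t* in = input_ptr + outp * input_ptr_increment;
//     for (int ic = 0; ic < input_depth; ++ic) {
//       const int16_t input_val = in[ic] + input_offset;
//       for (int m = 0; m < depth_multiplier; ++m) {
//         acc_buffer_ptr[(outp * input_depth + ic) * depth_multiplier + m] +=
//             static_cast<int32_t>(filter_ptr[ic * depth_multiplier + m]) *
//             input_val;
//       }
//     }
//   }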

#ifdef USE_NEON
template <>
struct QuantizedDepthwiseConvKernel<true, 8, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const int8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const int8_t* filter_ptr,
                  int32_t* acc_buffer_ptr) {
    // Load the filters.
    int8x8x2_t filter_s8;
    filter_s8.val[0] = vld1_s8(filter_ptr);
    filter_s8.val[1] = vld1_s8(filter_ptr + 8);
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vmovl_s8(filter_s8.val[i]);
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4x2_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
        acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
      }
      // Load the inputs, add input_offset.
      const int8x8_t input_s8 = vld1_s8(input_ptr);
      input_ptr += input_ptr_increment;
      const int16x8_t input_s16 = vmovl_s8(input_s8);
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
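      // input_dup2.val[0] = {i0,i0,i1,i1,i2,i2,i3,i3} and
      // input_dup2.val[1] = {i4,i4,i5,i5,i6,i6,i7,i7}: each input value is
      // repeated once per depth_multiplier=2 accumulator lane.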
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[0].val[i] = vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]),
                                  vget_low_s16(input_dup2.val[i]));
        acc[1].val[i] = vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]),
                                  vget_high_s16(input_dup2.val[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
        vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 8, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const int8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const int8_t* filter_ptr,
                  int32_t* acc_buffer_ptr) {
    // Load the filters.
    const int8x8_t filter_s8 = vld1_s8(filter_ptr);
    const int16x8_t filter = vmovl_s8(filter_s8);

    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      int8x8_t input_s8[2];
      for (int i = 0; i < 2; i++) {
        input_s8[i] = vld1_s8(input_ptr + 8 * i);
      }
      input_ptr += 16;
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vmovl_s8(input_s8[i]);
      }
      for (int i = 0; i < 2; i++) {
        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
      }
      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0]));
      acc[1] =
          vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0]));
      acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1]));
      acc[3] =
          vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1]));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle 1 output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[2];
      acc[0] = vld1q_s32(acc_buffer_ptr);
      acc[1] = vld1q_s32(acc_buffer_ptr + 4);

      // Load the inputs, add input_offset.
      const int8x8_t input_s8 = vld1_s8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vmovl_s8(input_s8);
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input));
      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input));
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc[0]);
      vst1q_s32(acc_buffer_ptr + 4, acc[1]);
      acc_buffer_ptr += 8;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 4, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const int8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const int8_t* filter_ptr,
                  int32_t* acc_buffer_ptr) {
    // Load the filters.
    const int8x8_t filter_s8 = vld1_s8(filter_ptr);
    const int16x8_t filter = vmovl_s8(filter_s8);

    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      const int8x8_t input_s8 = vld1_s8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vmovl_s8(input_s8);
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(filter),
                                   vget_low_s16(input_dup2.val[i]));
        acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(filter),
                                   vget_high_s16(input_dup2.val[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      int8x8_t input_s8 = vdup_n_s8(0);
      input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
      input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
      input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
      input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x4x2_t input_dup2 = vzip_s16(input, input);
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]);
      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 2, 8> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const int8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const int8_t* filter_ptr,
                  int32_t* acc_buffer_ptr) {
    // Load the filters.
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      const int8x8_t filter_s8 = vld1_s8(filter_ptr + 8 * i);
      filter[i] = vmovl_s8(filter_s8);
    }
    int outp = 0;
    // Handle two output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[8];
      for (int i = 0; i < 8; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      int8x8_t input_s8 = vdup_n_s8(0);
      input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
      input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
      input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
      input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Multiply-accumulate.
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
      acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2);
      acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2);
      acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3);
      acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3);
      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 8; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 32;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      int8x8_t input_s8 = vdup_n_s8(0);
      input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
      input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
      input_ptr += 2;
      const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);

      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 2, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const int8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const int8_t* filter_ptr,
                  int32_t* acc_buffer_ptr) {
    // Load the filters.
    int8x8_t filter_s8 = vdup_n_s8(0);
    filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
    filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
    filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2);
    filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3);
    const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));

    int outp = 0;
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      const int8x8_t input_s8 = vld1_s8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vmovl_s8(input_s8);
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);

      int8x8_t input_s8 = vdup_n_s8(0);
      input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
      input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
      input_ptr += 2;
      const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x4_t input_dup2 = vzip_s16(input, input).val[0];
      // Multiply-accumulate
      acc = vmlal_s16(acc, filter, input_dup2);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 2, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const int8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const int8_t* filter_ptr,
                  int32_t* acc_buffer_ptr) {
    // Load the filters.
    int8x8_t filter_s8 = vdup_n_s8(0);
    filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
    filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
    filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 2);
    filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 3);
    const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));

    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      int8x8_t input_s8[2];
      for (int i = 0; i < 2; i++) {
        input_s8[i] = vld1_s8(input_ptr + 8 * i);
      }
      input_ptr += 16;
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vmovl_s8(input_s8[i]);
      }
      for (int i = 0; i < 2; i++) {
        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
      }

      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0]));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0]));
      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1]));
      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1]));
      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      const int8x8_t input_s8 = vld1_s8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vmovl_s8(input_s8);
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input));
      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
      int8x8_t input_s8 = vdup_n_s8(0);
      input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
      input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
      input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
      input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc = vmlal_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer.
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
    // Handle 1 output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer.
      int32x2_t acc = vld1_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
      int8x8_t input_s8 = vdup_n_s8(0);
      input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
      input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
      input_ptr += 2;
      const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
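      // vmlal_s16 has no 2-lane accumulator variant, so widen acc to 4 lanes
      // with vcombine; the high two lanes compute throwaway values that
      // vget_low_s32 discards.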
      acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
      // Store the accumulators back to acc_buffer.
      vst1_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 1, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const int8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const int8_t* filter_ptr,
                  int32_t* acc_buffer_ptr) {
    // Load the filters.
    int8x8_t filter_s8 = vdup_n_s8(0);
    filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
    filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
    filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 2);
    filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 3);
    const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));

    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      const int8x8_t input_s8 = vld1_s8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vmovl_s8(input_s8);
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x2_t acc = vld1_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      const int16_t input = *input_ptr++ + input_offset;

      // Multiply-accumulate
      acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input));
      // Store the accumulators back to acc_buffer
      vst1_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 1, 4> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const int8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const int8_t* filter_ptr,
                  int32_t* acc_buffer_ptr) {
    // Load the filters.
    int8x8_t filter_s8 = vdup_n_s8(0);
    filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
    filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
    filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2);
    filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3);
    const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));

    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[8];
      for (int i = 0; i < 8; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      int8x8_t input_s8 = vld1_s8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vmovl_s8(input_s8);
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0);
      acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1);
      acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2);
      acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3);
      acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0);
      acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1);
      acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2);
      acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3);

      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 8; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 32;
    }
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      int8x8_t input_s8 = vdup_n_s8(0);
      input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
      input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
      input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
      input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], filter, input, 0);
      acc[1] = vmlal_lane_s16(acc[1], filter, input, 1);
      acc[2] = vmlal_lane_s16(acc[2], filter, input, 2);
      acc[3] = vmlal_lane_s16(acc[3], filter, input, 3);

      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      const int16_t input = *input_ptr++ + input_offset;

      // Multiply-accumulate
      acc = vmlal_n_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 4, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const int8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const int8_t* filter_ptr,
                  int32_t* acc_buffer_ptr) {
    // Load the filters.
    int8x8_t filter_s8 = vdup_n_s8(0);
    filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
    filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
    filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2);
    filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3);
    const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));

    int outp = 0;
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        const int8x8_t input_s8 = vld1_s8(input_ptr + 8 * i);
        const int16x8_t input_s16 = vmovl_s8(input_s8);
        input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      }
      input_ptr += 16;
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[2 * i + 0] =
            vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i]));
        acc[2 * i + 1] =
            vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc;
      acc = vld1q_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      int8x8_t input_s8 = vdup_n_s8(0);
      input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
      input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
      input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
      input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Multiply-accumulate
      acc = vmlal_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 4, 4> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const int8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const int8_t* filter_ptr,
                  int32_t* acc_buffer_ptr) {
    // Load the filters.
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      const int8x8_t filter_s8 = vld1_s8(filter_ptr + 8 * i);
      filter[i] = vmovl_s8(filter_s8);
    }

    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[8];
      for (int i = 0; i < 8; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      int8x8_t input_s8 = vld1_s8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vmovl_s8(input_s8);
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]),
                              vget_low_s16(input), 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]),
                              vget_low_s16(input), 1);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]),
                              vget_low_s16(input), 2);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]),
                              vget_low_s16(input), 3);
      acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]),
                              vget_high_s16(input), 0);
      acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]),
                              vget_high_s16(input), 1);
      acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]),
                              vget_high_s16(input), 2);
      acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]),
                              vget_high_s16(input), 3);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 8; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 32;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      int8x8_t input_s8 = vdup_n_s8(0);
      input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
      input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
      input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
      input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 0, 3> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const int8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const int8_t* filter_ptr,
                  int32_t* acc_buffer_ptr) {
    // We will have to duplicate bytes in a NEON register, 3-fold.
    // We will do that by register-level table-look-up using VTBL instructions.
    // Here we prepare the registers containing the table-lookup indices.
    static const int8_t dup3_indices_array[3][8] = {{0, 0, 0, 1, 1, 1, 2, 2},
                                                    {2, 3, 3, 3, 4, 4, 4, 5},
                                                    {5, 5, 6, 6, 6, 7, 7, 7}};
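    // For example, input bytes {a,b,c,d,e,f,g,h} get expanded by the three
    // lookups into {a,a,a,b,b,b,c,c}, {c,d,d,d,e,e,e,f} and
    // {f,f,g,g,g,h,h,h}: each byte repeated 3-fold across 24 lanes.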
    int8x8_t dup3_indices[3];
    for (int i = 0; i < 3; i++) {
      dup3_indices[i] = vld1_s8(dup3_indices_array[i]);
    }

    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const int8_t* local_filter_ptr = filter_ptr;
      const int8_t* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 8 input channels at a time.
      for (; ic <= input_depth - 8; ic += 8) {
        // Load the filters.
        int16x8_t filter[3];
        int8x8x3_t filter_s8;
        filter_s8.val[0] = vld1_s8(local_filter_ptr);
        filter_s8.val[1] = vld1_s8(local_filter_ptr + 8);
        filter_s8.val[2] = vld1_s8(local_filter_ptr + 16);
        local_filter_ptr += 24;
        for (int i = 0; i < 3; i++) {
          filter[i] = vmovl_s8(filter_s8.val[i]);
        }
        // Load the inputs, duplicate 3-fold, add input_offset.
        const int8x8_t input_s8 = vld1_s8(local_input_ptr);
        local_input_ptr += 8;

        int8x8_t input_s8_dup3[3];
        for (int i = 0; i < 3; i++) {
          input_s8_dup3[i] = vtbl1_s8(input_s8, dup3_indices[i]);
        }
        int16x8_t input_dup3[3];
        for (int i = 0; i < 3; i++) {
          const int16x8_t input_s16_dup3 = vmovl_s8(input_s8_dup3[i]);
          input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset));
        }
        // Load the accumulators from acc_buffer
        int32x4x3_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
          acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
          acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16);
        }
        // Multiply-accumulate
        for (int j = 0; j < 3; j++) {
          acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]),
                                    vget_low_s16(filter[j]));
          acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]),
                                    vget_high_s16(filter[j]));
        }
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 2; i++) {
          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
          vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
          vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]);
        }
        acc_buffer_ptr += 24;
      }
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++) {
        const int16_t input_val = *local_input_ptr++ + input_offset;
        for (int i = 0; i < 3; i++) {
          *acc_buffer_ptr++ +=
              static_cast<int32_t>(local_filter_ptr[i]) * input_val;
        }
        local_filter_ptr += 3;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 0, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const int8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const int8_t* filter_ptr,
                  int32_t* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const int8_t* local_filter_ptr = filter_ptr;
      const int8_t* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 8 input channels at a time.
      for (; ic <= input_depth - 8; ic += 8) {
        // Load the filters.
        int16x8_t filter[2];
        int8x8x2_t filter_s8;
        filter_s8.val[0] = vld1_s8(local_filter_ptr);
        filter_s8.val[1] = vld1_s8(local_filter_ptr + 8);
        local_filter_ptr += 16;
        for (int i = 0; i < 2; i++) {
          filter[i] = vmovl_s8(filter_s8.val[i]);
        }
        // Load the inputs, add input_offset, duplicate 2-fold.
        const int8x8_t input_s8 = vld1_s8(local_input_ptr);
        local_input_ptr += 8;
        const int16x8_t input_s16 = vmovl_s8(input_s8);
        const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
        const int16x8x2_t input_dup2 = vzipq_s16(input, input);
        // Load the accumulators from acc_buffer.
        int32x4x2_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
          acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
        }
        // Multiply-accumulate.
        for (int j = 0; j < 2; j++) {
          acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]),
                                    vget_low_s16(input_dup2.val[j]));
          acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]),
                                    vget_high_s16(input_dup2.val[j]));
        }
        // Store the accumulators back to acc_buffer.
        for (int i = 0; i < 2; i++) {
          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
          vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
        }
        acc_buffer_ptr += 16;
      }
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++) {
        // Load the inputs.
        const int16_t input_val = *local_input_ptr++ + input_offset;
        for (int i = 0; i < 2; i++) {
          *acc_buffer_ptr++ +=
              static_cast<int32_t>(local_filter_ptr[i]) * input_val;
        }
        local_filter_ptr += 2;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 0, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const int8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const int8_t* filter_ptr,
                  int32_t* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const int8_t* local_filter_ptr = filter_ptr;
      const int8_t* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 16 input channels at a time.
      for (; ic <= input_depth - 16; ic += 16) {
        // Load the filters.
        int8x8_t filter_s8_0 = vld1_s8(local_filter_ptr + 8 * 0);
        int8x8_t filter_s8_1 = vld1_s8(local_filter_ptr + 8 * 1);
        local_filter_ptr += 16;
        int16x8_t filter_0 = vmovl_s8(filter_s8_0);
        int16x8_t filter_1 = vmovl_s8(filter_s8_1);
        // Load the inputs, add input_offset.
        int8x8_t input_s8_0 = vld1_s8(local_input_ptr + 8 * 0);
        int8x8_t input_s8_1 = vld1_s8(local_input_ptr + 8 * 1);
        local_input_ptr += 16;
        int16x8_t input_0 = vmovl_s8(input_s8_0);
        int16x8_t input_1 = vmovl_s8(input_s8_1);
        input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
        input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
        // Load the accumulators from acc_buffer
        int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
        int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
        int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
        int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
        acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), vget_low_s16(filter_0));
        acc_1 =
            vmlal_s16(acc_1, vget_high_s16(input_0), vget_high_s16(filter_0));
        acc_2 = vmlal_s16(acc_2, vget_low_s16(input_1), vget_low_s16(filter_1));
        acc_3 =
            vmlal_s16(acc_3, vget_high_s16(input_1), vget_high_s16(filter_1));
        // Store the accumulators back to acc_buffer
        vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
        vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
        vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
        vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
        acc_buffer_ptr += 16;
      }
      // Handle 8 input channels at a time.
      for (; ic <= input_depth - 8; ic += 8) {
        // Load the filters.
        const int8x8_t filter_s8 = vld1_s8(local_filter_ptr);
        local_filter_ptr += 8;
        const int16x8_t filter = vmovl_s8(filter_s8);
        // Load the inputs, add input_offset.
        const int8x8_t input_s8 = vld1_s8(local_input_ptr);
        local_input_ptr += 8;
        const int16x8_t input_s16 = vmovl_s8(input_s8);
        const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
        // Load the accumulators from acc_buffer
        int32x4_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate
        acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
        acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 2; i++) {
          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 8;
      }
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++) {
        const int16_t input_val = *local_input_ptr++ + input_offset;
        const int16_t filter_val = *local_filter_ptr++;
        *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 16, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const int8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const int8_t* filter_ptr,
                  int32_t* acc_buffer_ptr) {
    // Load the filters.
    int8x8_t filter_s8[2];
    for (int i = 0; i < 2; i++) {
      filter_s8[i] = vld1_s8(filter_ptr + 8 * i);
    }
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vmovl_s8(filter_s8[i]);
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs, add input_offset.
      int8x8_t input_s8[2];
      for (int i = 0; i < 2; i++) {
        input_s8[i] = vld1_s8(input_ptr + 8 * i);
      }
      input_ptr += input_ptr_increment;
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vmovl_s8(input_s8[i]);
      }
      for (int i = 0; i < 2; i++) {
        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
      }
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]),
                                   vget_low_s16(filter[i]));
        acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]),
                                   vget_high_s16(filter[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 8, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const int8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const int8_t* filter_ptr,
                  int32_t* acc_buffer_ptr) {
    // Load the filters.
    const int8x8_t filter_s8 = vld1_s8(filter_ptr);
    const int16x8_t filter = vmovl_s8(filter_s8);
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs, add input_offset.
      const int8x8_t input_s8 = vld1_s8(input_ptr);
      const int16x8_t input_s16 = vmovl_s8(input_s8);
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Load the accumulators from acc_buffer
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
      acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 1, 16> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const int8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const int8_t* filter_ptr,
                  int32_t* acc_buffer_ptr) {
    // Load the filters.
    int8x8_t filter_s8[2];
    for (int i = 0; i < 2; i++) {
      filter_s8[i] = vld1_s8(filter_ptr + 8 * i);
    }
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vmovl_s8(filter_s8[i]);
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      int8_t input_s8 = *input_ptr;
      input_ptr += input_ptr_increment;
      int16_t input = static_cast<int16_t>(input_s8 + input_offset);
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[2 * i + 0] =
            vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input);
        acc[2 * i + 1] =
            vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 1, 32> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const int8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const int8_t* filter_ptr,
                  int32_t* acc_buffer_ptr) {
    // Load the filters.
    int8x8_t filter_s8_0 = vld1_s8(filter_ptr + 8 * 0);
    int8x8_t filter_s8_1 = vld1_s8(filter_ptr + 8 * 1);
    int8x8_t filter_s8_2 = vld1_s8(filter_ptr + 8 * 2);
    int8x8_t filter_s8_3 = vld1_s8(filter_ptr + 8 * 3);
    int16x8_t filter_0 = vmovl_s8(filter_s8_0);
    int16x8_t filter_1 = vmovl_s8(filter_s8_1);
    int16x8_t filter_2 = vmovl_s8(filter_s8_2);
    int16x8_t filter_3 = vmovl_s8(filter_s8_3);
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      int8_t input_s8 = *input_ptr;
      input_ptr += input_ptr_increment;
      int16_t input = static_cast<int16_t>(input_s8 + input_offset);
      // Load the accumulators from acc_buffer
      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
      int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
      int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
      int32x4_t acc_5 = vld1q_s32(acc_buffer_ptr + 4 * 5);
      int32x4_t acc_6 = vld1q_s32(acc_buffer_ptr + 4 * 6);
      int32x4_t acc_7 = vld1q_s32(acc_buffer_ptr + 4 * 7);
      // Multiply-accumulate
      acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
      acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
      acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
      acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
      acc_4 = vmlal_n_s16(acc_4, vget_low_s16(filter_2), input);
      acc_5 = vmlal_n_s16(acc_5, vget_high_s16(filter_2), input);
      acc_6 = vmlal_n_s16(acc_6, vget_low_s16(filter_3), input);
      acc_7 = vmlal_n_s16(acc_7, vget_high_s16(filter_3), input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
      vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
      vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
      vst1q_s32(acc_buffer_ptr + 4 * 5, acc_5);
      vst1q_s32(acc_buffer_ptr + 4 * 6, acc_6);
      vst1q_s32(acc_buffer_ptr + 4 * 7, acc_7);
      acc_buffer_ptr += 32;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 1, 20> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const int8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const int8_t* filter_ptr,
                  int32_t* acc_buffer_ptr) {
    // Load the filters.
    // NEON wants to load 8 bytes at a time, but 20 is not divisible by 8.
    // We load the first 16 bytes into filter_s8_{0,1} as usual.
    // Then we load the last 8 bytes (bytes 12..19) into filter_s8_x
    // (x for 'extra'). This overlaps: the first 4 bytes of filter_s8_x
    // duplicate the last 4 bytes of filter_s8_1, so only the high half of
    // filter_x (channels 16..19) is consumed below.
    int8x8_t filter_s8_0 = vld1_s8(filter_ptr + 8 * 0);
    int8x8_t filter_s8_1 = vld1_s8(filter_ptr + 8 * 1);
    int8x8_t filter_s8_x = vld1_s8(filter_ptr + 8 * 1 + 4);
    int16x8_t filter_0 = vmovl_s8(filter_s8_0);
    int16x8_t filter_1 = vmovl_s8(filter_s8_1);
    int16x8_t filter_x = vmovl_s8(filter_s8_x);
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      int8_t input_s8 = *input_ptr;
      input_ptr += input_ptr_increment;
      int16_t input = static_cast<int16_t>(input_s8 + input_offset);
      // Load the accumulators from acc_buffer
      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
      int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
      int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
      // Multiply-accumulate
      acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
      acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
      acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
      acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
      acc_4 = vmlal_n_s16(acc_4, vget_high_s16(filter_x), input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
      vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
      vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
      acc_buffer_ptr += 20;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 1, 8> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const int8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const int8_t* filter_ptr,
                  int32_t* acc_buffer_ptr) {
    // Load the filters.
    const int8x8_t filter_s8 = vld1_s8(filter_ptr);
    const int16x8_t filter = vmovl_s8(filter_s8);
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      int8_t input_s8 = *input_ptr;
      input_ptr += input_ptr_increment;
      int16_t input = static_cast<int16_t>(input_s8 + input_offset);
      // Load the accumulators from acc_buffer
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input);
      acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 2, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const int8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const int8_t* filter_ptr,
                  int32_t* acc_buffer_ptr) {
    // Load the filters.
    int8x8_t filter_s8 = vdup_n_s8(0);
    filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
    filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
    filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 2);
    filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 3);
    const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));

    int outp = 0;

    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
      int16x4_t input_s16 = vdup_n_s16(0);
      input_s16 = vset_lane_s16(
          (reinterpret_cast<const int16_t*>(input_ptr))[0], input_s16, 0);
      input_ptr += input_ptr_increment;
      input_s16 = vset_lane_s16(
          (reinterpret_cast<const int16_t*>(input_ptr))[0], input_s16, 1);
      input_ptr += input_ptr_increment;
      input_s16 = vget_low_s16(vmovl_s8(vreinterpret_s8_s16(input_s16)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc = vmlal_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer.
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }

    // Handle 1 output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer.
      int32x2_t acc = vld1_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
      int8x8_t input_s8 = vdup_n_s8(0);
      input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
      input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
      input_ptr += input_ptr_increment;
      const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
      // Store the accumulators back to acc_buffer.
      vst1_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 4, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const int8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const int8_t* filter_ptr,
                  int32_t* acc_buffer_ptr) {
    if (num_output_pixels <= 0) {
      return;
    }

    // Load the filters.
    int8x8_t filter_s8 = vdup_n_s8(0);
    filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
    filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
    filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2);
    filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3);
    const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));

    int outp = 0;

    // Handle one output pixel at a time until the second-to-last pixel, since
    // each iteration loads eight input bytes while consuming only four; a
    // full 8-byte load at the last pixel could read past the end of the
    // input buffer.
    for (; outp < num_output_pixels - 1; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc;
      acc = vld1q_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      int8x8_t input_s8 = vld1_s8(input_ptr);
      input_ptr += input_ptr_increment;
      const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Multiply-accumulate
      acc = vmlal_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }

    // Handle the last output pixel.
    // Load the accumulators from acc_buffer
    int32x4_t acc;
    acc = vld1q_s32(acc_buffer_ptr);

    // Load the inputs, add input_offset.
    int8x8_t input_s8 = vdup_n_s8(0);
    input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
    input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
    input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
    input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
    const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
    const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
    // Multiply-accumulate
    acc = vmlal_s16(acc, filter, input);
    // Store the accumulators back to acc_buffer
    vst1q_s32(acc_buffer_ptr, acc);
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 12, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const int8_t* input_ptr, int16_t input_offset,
                  int input_ptr_increment, const int8_t* filter_ptr,
                  int32_t* acc_buffer_ptr) {
    // Load the filters.
    int8x8_t filter_s8_0 = vld1_s8(filter_ptr);
    int8x8_t filter_s8_1 = vld1_s8(filter_ptr + 4);
    int16x8_t filter_s16_0 = vmovl_s8(filter_s8_0);
    int16x8_t filter_s16_1 = vmovl_s8(filter_s8_1);
    int16x4_t filter_0 = vget_low_s16(filter_s16_0);
    int16x4_t filter_1 = vget_high_s16(filter_s16_0);
    int16x4_t filter_2 = vget_high_s16(filter_s16_1);

    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs, add input_offset.
      int8x8_t input_s8_0 = vld1_s8(input_ptr);
      int8x8_t input_s8_1 = vld1_s8(input_ptr + 4);
      input_ptr += input_ptr_increment;
      int16x8_t input_0 = vmovl_s8(input_s8_0);
      int16x8_t input_1 = vmovl_s8(input_s8_1);
      input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
      input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));

      // Load the accumulators from acc_buffer
      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);

      // Multiply-accumulate
      acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), filter_0);
      acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), filter_1);
      acc_2 = vmlal_s16(acc_2, vget_high_s16(input_1), filter_2);

      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);

      acc_buffer_ptr += 12;
    }
  }
};
#endif  // USE_NEON

// Accumulates the effect of one row of the filter on a segment of one row of
// the output, reading the corresponding row of the input.
template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
void QuantizedDepthwiseConvAccumRow(
    int stride, int dilation_factor, int input_depth, int input_width,
    const int8_t* input_data, int16_t input_offset, int pad_width,
    int depth_multiplier, int filter_width, const int8_t* filter_data,
    int out_x_buffer_start, int out_x_buffer_end, int output_depth,
    int32_t* acc_buffer) {}

// Generic fallback of QuantizedDepthwiseConvAccumRow: portable,
// non-templatized.
inline void QuantizedDepthwiseConvAccumRowGeneric(
    int stride, int dilation_factor, int input_depth, int input_width,
    const int8_t* input_data, int16_t input_offset, int pad_width,
    int depth_multiplier, int filter_width, const int8_t* filter_data,
    int out_x_buffer_start, int out_x_buffer_end, int output_depth,
    int32_t* acc_buffer) {}
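
// As an orientation aid, a minimal scalar sketch of what the generic fallback
// computes (hedged: upstream additionally clamps the out_x loop bounds so
// that in_x stays within [0, input_width) instead of testing per iteration):
//
//   const int output_depth = input_depth * depth_multiplier;
//   for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
//     for (int out_x = out_x_buffer_start; out_x < out_x_buffer_end;
//          ++out_x) {
//       const int in_x =
//           out_x * stride - pad_width + filter_x * dilation_factor;
//       if (in_x < 0 || in_x >= input_width) continue;
//       const int8_t* in = input_data + in_x * input_depth;
//       const int8_t* flt = filter_data + filter_x * output_depth;
//       int32_t* acc =
//           acc_buffer + (out_x - out_x_buffer_start) * output_depth;
//       for (int ic = 0; ic < input_depth; ++ic) {
//         const int16_t input_val = *in++ + input_offset;
//         for (int m = 0; m < depth_multiplier; ++m) {
//           *acc++ += static_cast<int32_t>(*flt++) * input_val;
//         }
//       }
//     }
//   }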

// Initializes the accumulator buffer with bias values.
inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
                                       const int32_t* bias_data,
                                       int32_t* acc_buffer) {}
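
// A minimal sketch of the accumulator initialization (hedged: upstream also
// special-cases small output_depth values for speed): replicate the
// per-channel biases once per output pixel, so the kernels above accumulate
// on top of the bias.
//
//   for (int i = 0; i < num_output_pixels; ++i) {
//     memcpy(acc_buffer + i * output_depth, bias_data,
//            sizeof(acc_buffer[0]) * output_depth);
//   }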

inline void DepthwiseConvGeneral(
    const DepthwiseParams& params, const int32_t* output_multiplier,
    const int32_t* output_shift, const RuntimeShape& input_shape,
    const int8_t* input_data, const RuntimeShape& filter_shape,
    const int8_t* filter_data, const RuntimeShape& bias_shape,
    const int32_t* bias_data, const RuntimeShape& output_shape,
    int8_t* output_data, int thread_start, int thread_end, int thread_dim) {}

}  // namespace depthwise_conv

template <DepthwiseConvOutputRounding kOutputRounding>
inline void DepthwiseConvWithRounding(
    const DepthwiseParams& params, const int32_t* output_multiplier,
    const int32_t* output_shift, const RuntimeShape& input_shape,
    const int8_t* input_data, const RuntimeShape& filter_shape,
    const int8_t* filter_data, const RuntimeShape& bias_shape,
    const int32_t* bias_data, const RuntimeShape& output_shape,
    int8_t* output_data, int thread_start, int thread_end, int thread_dim,
    const CpuBackendContext& cpu_backend_context) {}

inline void DepthwiseConvImpl(
    const DepthwiseParams& params, const int32_t* output_multiplier,
    const int32_t* output_shift, const RuntimeShape& input_shape,
    const int8_t* input_data, const RuntimeShape& filter_shape,
    const int8_t* filter_data, const RuntimeShape& bias_shape,
    const int32_t* bias_data, const RuntimeShape& output_shape,
    int8_t* output_data, int thread_start, int thread_end, int thread_dim,
    const CpuBackendContext& cpu_backend_context) {}

template <typename T, typename TS>
struct DepthwiseConvWorkerTask : cpu_backend_threadpool::Task {};

inline int HowManyConvThreads(const RuntimeShape& output_shape,
                              const RuntimeShape& filter_shape,
                              int thread_dim) {}

inline void DepthwiseConvPerChannel(
    const DepthwiseParams& params, const int32_t* output_multiplier,
    const int32_t* output_shift, const RuntimeShape& input_shape,
    const int8_t* input_data, const RuntimeShape& filter_shape,
    const int8_t* filter_data, const RuntimeShape& bias_shape,
    const int32_t* bias_data, const RuntimeShape& output_shape,
    int8_t* output_data, CpuBackendContext* cpu_backend_context) {}
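
// A hedged usage sketch (variable names are illustrative, not part of this
// header): callers supply per-output-channel quantized multipliers and shifts
// alongside the usual NHWC-shaped int8 tensors.
//
//   // std::vector<int32_t> output_multiplier(output_depth);
//   // std::vector<int32_t> output_shift(output_depth);
//   optimized_integer_ops::DepthwiseConvPerChannel(
//       params, output_multiplier.data(), output_shift.data(), input_shape,
//       input_data, filter_shape, filter_data, bias_shape, bias_data,
//       output_shape, output_data, &cpu_backend_context);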

}  // namespace optimized_integer_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_H_