#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_H_
#include <string.h>

#include <algorithm>
#include <vector>

#include "ruy/profiler/instrumentation.h"
#include "tensorflow/lite/kernels/cpu_backend_context.h"
#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h"
#include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_3x3_filter.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace optimized_integer_ops {
namespace depthwise_conv {
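// Implementation of quantized DepthwiseConv for per-channel int8.
//
// Each kernel specialization below accumulates, for one filter point, the
// int8 x int8 products over a run of consecutive output pixels into the
// int32 acc_buffer. The template parameters select the variant:
//   kAllowStrided         - input_ptr_increment may differ from the input
//                           depth (stride or dilation greater than 1);
//   kFixedInputDepth      - compile-time input depth (0 means any depth);
//   kFixedDepthMultiplier - compile-time depth multiplier.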
template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
struct QuantizedDepthwiseConvKernel { … };
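// A matching specialization is selected from the runtime shape; conceptually
// (an illustrative sketch only, not the literal dispatch code):
//
//   if (input_depth == 8 && depth_multiplier == 2) {
//     row_accum_func = QuantizedDepthwiseConvAccumRow<true, 8, 2>;
//   }
//
// with QuantizedDepthwiseConvAccumRowGeneric as the fallback.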
#ifdef USE_NEON
template <>
struct QuantizedDepthwiseConvKernel<true, 8, 2> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const int8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const int8_t* filter_ptr,
int32_t* acc_buffer_ptr) {
    // Load the filters and sign-extend int8 -> int16.
    int8x8x2_t filter_s8;
    filter_s8.val[0] = vld1_s8(filter_ptr);
    filter_s8.val[1] = vld1_s8(filter_ptr + 8);
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vmovl_s8(filter_s8.val[i]);
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer.
      int32x4x2_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
        acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
      }
      // Load the inputs, add input_offset.
      const int8x8_t input_s8 = vld1_s8(input_ptr);
      input_ptr += input_ptr_increment;
      const int16x8_t input_s16 = vmovl_s8(input_s8);
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold, to match depth_multiplier == 2.
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate.
      for (int i = 0; i < 2; i++) {
        acc[0].val[i] = vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]),
                                  vget_low_s16(input_dup2.val[i]));
        acc[1].val[i] = vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]),
                                  vget_high_s16(input_dup2.val[i]));
      }
      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
        vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
      }
      acc_buffer_ptr += 16;
    }
}
};
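// input_depth 8, depth_multiplier 1: one vld1_s8 covers a whole pixel, so
// the main loop consumes two pixels (16 int8 values) per iteration.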
template <>
struct QuantizedDepthwiseConvKernel<false, 8, 1> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const int8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const int8_t* filter_ptr,
int32_t* acc_buffer_ptr) {
const int8x8_t filter_s8 = vld1_s8(filter_ptr);
const int16x8_t filter = vmovl_s8(filter_s8);
int outp = 0;
for (; outp <= num_output_pixels - 2; outp += 2) {
int32x4_t acc[4];
for (int i = 0; i < 4; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
int8x8_t input_s8[2];
for (int i = 0; i < 2; i++) {
input_s8[i] = vld1_s8(input_ptr + 8 * i);
}
input_ptr += 16;
int16x8_t input[2];
for (int i = 0; i < 2; i++) {
input[i] = vmovl_s8(input_s8[i]);
}
for (int i = 0; i < 2; i++) {
input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
}
acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0]));
acc[1] =
vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0]));
acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1]));
acc[3] =
vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1]));
for (int i = 0; i < 4; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 16;
}
for (; outp < num_output_pixels; outp++) {
int32x4_t acc[2];
acc[0] = vld1q_s32(acc_buffer_ptr);
acc[1] = vld1q_s32(acc_buffer_ptr + 4);
const int8x8_t input_s8 = vld1_s8(input_ptr);
input_ptr += 8;
const int16x8_t input_s16 = vmovl_s8(input_s8);
const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input));
acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input));
vst1q_s32(acc_buffer_ptr, acc[0]);
vst1q_s32(acc_buffer_ptr + 4, acc[1]);
acc_buffer_ptr += 8;
}
}
};
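// input_depth 4, depth_multiplier 2: vzipq_s16(input, input) interleaves a
// vector with itself, duplicating each input channel once so it lines up
// with the 2x-replicated output channels.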
template <>
struct QuantizedDepthwiseConvKernel<false, 4, 2> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const int8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const int8_t* filter_ptr,
int32_t* acc_buffer_ptr) {
const int8x8_t filter_s8 = vld1_s8(filter_ptr);
const int16x8_t filter = vmovl_s8(filter_s8);
int outp = 0;
for (; outp <= num_output_pixels - 2; outp += 2) {
int32x4_t acc[4];
for (int i = 0; i < 4; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
const int8x8_t input_s8 = vld1_s8(input_ptr);
input_ptr += 8;
const int16x8_t input_s16 = vmovl_s8(input_s8);
const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
const int16x8x2_t input_dup2 = vzipq_s16(input, input);
for (int i = 0; i < 2; i++) {
acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(filter),
vget_low_s16(input_dup2.val[i]));
acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(filter),
vget_high_s16(input_dup2.val[i]));
}
for (int i = 0; i < 4; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 16;
}
for (; outp < num_output_pixels; outp++) {
int32x4_t acc[2];
for (int i = 0; i < 2; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
int8x8_t input_s8 = vdup_n_s8(0);
input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
input_ptr += 4;
const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
const int16x4x2_t input_dup2 = vzip_s16(input, input);
acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]);
acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]);
for (int i = 0; i < 2; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 8;
}
}
};
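// input_depth 2, depth_multiplier 8: each of the two input channels is
// broadcast against eight filter taps with vmlal_lane_s16.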
template <>
struct QuantizedDepthwiseConvKernel<false, 2, 8> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const int8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const int8_t* filter_ptr,
int32_t* acc_buffer_ptr) {
int16x8_t filter[2];
for (int i = 0; i < 2; i++) {
const int8x8_t filter_s8 = vld1_s8(filter_ptr + 8 * i);
filter[i] = vmovl_s8(filter_s8);
}
int outp = 0;
for (; outp <= num_output_pixels - 2; outp += 2) {
int32x4_t acc[8];
for (int i = 0; i < 8; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
int8x8_t input_s8 = vdup_n_s8(0);
input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
input_ptr += 4;
const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2);
acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2);
acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3);
acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3);
for (int i = 0; i < 8; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 32;
}
for (; outp < num_output_pixels; outp++) {
int32x4_t acc[4];
for (int i = 0; i < 4; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
int8x8_t input_s8 = vdup_n_s8(0);
input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
input_ptr += 2;
const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
for (int i = 0; i < 4; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 16;
}
}
};
template <>
struct QuantizedDepthwiseConvKernel<false, 2, 2> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const int8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const int8_t* filter_ptr,
int32_t* acc_buffer_ptr) {
int8x8_t filter_s8 = vdup_n_s8(0);
filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2);
filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3);
const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));
int outp = 0;
for (; outp <= num_output_pixels - 4; outp += 4) {
int32x4_t acc[4];
for (int i = 0; i < 4; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
const int8x8_t input_s8 = vld1_s8(input_ptr);
input_ptr += 8;
const int16x8_t input_s16 = vmovl_s8(input_s8);
const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
const int16x8x2_t input_dup2 = vzipq_s16(input, input);
acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
for (int i = 0; i < 4; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 16;
}
for (; outp < num_output_pixels; outp++) {
int32x4_t acc = vld1q_s32(acc_buffer_ptr);
int8x8_t input_s8 = vdup_n_s8(0);
input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
input_ptr += 2;
const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
const int16x4_t input_dup2 = vzip_s16(input, input).val[0];
acc = vmlal_s16(acc, filter, input_dup2);
vst1q_s32(acc_buffer_ptr, acc);
acc_buffer_ptr += 4;
}
}
};
template <>
struct QuantizedDepthwiseConvKernel<false, 2, 1> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const int8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const int8_t* filter_ptr,
int32_t* acc_buffer_ptr) {
int8x8_t filter_s8 = vdup_n_s8(0);
filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 2);
filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 3);
const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));
int outp = 0;
for (; outp <= num_output_pixels - 8; outp += 8) {
int32x4_t acc[4];
for (int i = 0; i < 4; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
int8x8_t input_s8[2];
for (int i = 0; i < 2; i++) {
input_s8[i] = vld1_s8(input_ptr + 8 * i);
}
input_ptr += 16;
int16x8_t input[2];
for (int i = 0; i < 2; i++) {
input[i] = vmovl_s8(input_s8[i]);
}
for (int i = 0; i < 2; i++) {
input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
}
acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0]));
acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0]));
acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1]));
acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1]));
for (int i = 0; i < 4; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 16;
}
for (; outp <= num_output_pixels - 4; outp += 4) {
int32x4_t acc[2];
for (int i = 0; i < 2; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
const int8x8_t input_s8 = vld1_s8(input_ptr);
input_ptr += 8;
const int16x8_t input_s16 = vmovl_s8(input_s8);
const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input));
acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input));
for (int i = 0; i < 2; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 8;
}
for (; outp <= num_output_pixels - 2; outp += 2) {
int32x4_t acc = vld1q_s32(acc_buffer_ptr);
int8x8_t input_s8 = vdup_n_s8(0);
input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
input_ptr += 4;
const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
acc = vmlal_s16(acc, filter, input);
vst1q_s32(acc_buffer_ptr, acc);
acc_buffer_ptr += 4;
}
for (; outp < num_output_pixels; outp++) {
int32x2_t acc = vld1_s32(acc_buffer_ptr);
int8x8_t input_s8 = vdup_n_s8(0);
input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
input_ptr += 2;
const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
vst1_s32(acc_buffer_ptr, acc);
acc_buffer_ptr += 2;
}
}
};
template <>
struct QuantizedDepthwiseConvKernel<false, 1, 2> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const int8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const int8_t* filter_ptr,
int32_t* acc_buffer_ptr) {
int8x8_t filter_s8 = vdup_n_s8(0);
filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 2);
filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 3);
const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));
int outp = 0;
for (; outp <= num_output_pixels - 8; outp += 8) {
int32x4_t acc[4];
for (int i = 0; i < 4; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
const int8x8_t input_s8 = vld1_s8(input_ptr);
input_ptr += 8;
const int16x8_t input_s16 = vmovl_s8(input_s8);
const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
const int16x8x2_t input_dup2 = vzipq_s16(input, input);
acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
for (int i = 0; i < 4; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 16;
}
for (; outp < num_output_pixels; outp++) {
int32x2_t acc = vld1_s32(acc_buffer_ptr);
      // Load the input scalar and add the offset; int16 is wide enough.
      const int16_t input = *input_ptr++ + input_offset;
acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input));
vst1_s32(acc_buffer_ptr, acc);
acc_buffer_ptr += 2;
}
}
};
template <>
struct QuantizedDepthwiseConvKernel<false, 1, 4> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const int8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const int8_t* filter_ptr,
int32_t* acc_buffer_ptr) {
int8x8_t filter_s8 = vdup_n_s8(0);
filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2);
filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3);
const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));
int outp = 0;
for (; outp <= num_output_pixels - 8; outp += 8) {
int32x4_t acc[8];
for (int i = 0; i < 8; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
int8x8_t input_s8 = vld1_s8(input_ptr);
input_ptr += 8;
const int16x8_t input_s16 = vmovl_s8(input_s8);
const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0);
acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1);
acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2);
acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3);
acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0);
acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1);
acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2);
acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3);
for (int i = 0; i < 8; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 32;
}
for (; outp <= num_output_pixels - 4; outp += 4) {
int32x4_t acc[4];
for (int i = 0; i < 4; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
int8x8_t input_s8 = vdup_n_s8(0);
input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
input_ptr += 4;
const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
acc[0] = vmlal_lane_s16(acc[0], filter, input, 0);
acc[1] = vmlal_lane_s16(acc[1], filter, input, 1);
acc[2] = vmlal_lane_s16(acc[2], filter, input, 2);
acc[3] = vmlal_lane_s16(acc[3], filter, input, 3);
for (int i = 0; i < 4; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 16;
}
for (; outp < num_output_pixels; outp++) {
int32x4_t acc = vld1q_s32(acc_buffer_ptr);
      // Load the input scalar and add the offset; int16 is wide enough.
      const int16_t input = *input_ptr++ + input_offset;
acc = vmlal_n_s16(acc, filter, input);
vst1q_s32(acc_buffer_ptr, acc);
acc_buffer_ptr += 4;
}
}
};
template <>
struct QuantizedDepthwiseConvKernel<false, 4, 1> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const int8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const int8_t* filter_ptr,
int32_t* acc_buffer_ptr) {
int8x8_t filter_s8 = vdup_n_s8(0);
filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2);
filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3);
const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));
int outp = 0;
for (; outp <= num_output_pixels - 4; outp += 4) {
int32x4_t acc[4];
for (int i = 0; i < 4; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
int16x8_t input[2];
for (int i = 0; i < 2; i++) {
const int8x8_t input_s8 = vld1_s8(input_ptr + 8 * i);
const int16x8_t input_s16 = vmovl_s8(input_s8);
input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
}
input_ptr += 16;
for (int i = 0; i < 2; i++) {
acc[2 * i + 0] =
vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i]));
acc[2 * i + 1] =
vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i]));
}
for (int i = 0; i < 4; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 16;
}
for (; outp < num_output_pixels; outp++) {
int32x4_t acc;
acc = vld1q_s32(acc_buffer_ptr);
int8x8_t input_s8 = vdup_n_s8(0);
input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
input_ptr += 4;
const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
acc = vmlal_s16(acc, filter, input);
vst1q_s32(acc_buffer_ptr, acc);
acc_buffer_ptr += 4;
}
}
};
template <>
struct QuantizedDepthwiseConvKernel<false, 4, 4> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const int8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const int8_t* filter_ptr,
int32_t* acc_buffer_ptr) {
int16x8_t filter[2];
for (int i = 0; i < 2; i++) {
const int8x8_t filter_s8 = vld1_s8(filter_ptr + 8 * i);
filter[i] = vmovl_s8(filter_s8);
}
int outp = 0;
for (; outp <= num_output_pixels - 2; outp += 2) {
int32x4_t acc[8];
for (int i = 0; i < 8; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
int8x8_t input_s8 = vld1_s8(input_ptr);
input_ptr += 8;
const int16x8_t input_s16 = vmovl_s8(input_s8);
const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]),
vget_low_s16(input), 0);
acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]),
vget_low_s16(input), 1);
acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]),
vget_low_s16(input), 2);
acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]),
vget_low_s16(input), 3);
acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]),
vget_high_s16(input), 0);
acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]),
vget_high_s16(input), 1);
acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]),
vget_high_s16(input), 2);
acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]),
vget_high_s16(input), 3);
for (int i = 0; i < 8; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 32;
}
for (; outp < num_output_pixels; outp++) {
int32x4_t acc[4];
for (int i = 0; i < 4; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
int8x8_t input_s8 = vdup_n_s8(0);
input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
input_ptr += 4;
const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1);
acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2);
acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3);
for (int i = 0; i < 4; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 16;
}
}
};
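// Kernels with kFixedInputDepth == 0 loop over the runtime input_depth: a
// vectorized loop handles 8 channels at a time and a scalar loop handles
// the remainder. For depth_multiplier 3, vtbl1_s8 with the index vectors
// below expands 8 input channels into the 24-lane pattern
// 0,0,0,1,1,1,...,7,7,7 expected by the output layout.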
template <>
struct QuantizedDepthwiseConvKernel<true, 0, 3> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const int8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const int8_t* filter_ptr,
int32_t* acc_buffer_ptr) {
static const int8_t dup3_indices_array[3][8] = {{0, 0, 0, 1, 1, 1, 2, 2},
{2, 3, 3, 3, 4, 4, 4, 5},
{5, 5, 6, 6, 6, 7, 7, 7}};
int8x8_t dup3_indices[3];
for (int i = 0; i < 3; i++) {
dup3_indices[i] = vld1_s8(dup3_indices_array[i]);
}
for (int outp = 0; outp < num_output_pixels; outp++) {
const int8_t* local_filter_ptr = filter_ptr;
const int8_t* local_input_ptr = input_ptr;
int ic = 0;
for (; ic <= input_depth - 8; ic += 8) {
int16x8_t filter[3];
int8x8x3_t filter_s8;
filter_s8.val[0] = vld1_s8(local_filter_ptr);
filter_s8.val[1] = vld1_s8(local_filter_ptr + 8);
filter_s8.val[2] = vld1_s8(local_filter_ptr + 16);
local_filter_ptr += 24;
for (int i = 0; i < 3; i++) {
filter[i] = vmovl_s8(filter_s8.val[i]);
}
const int8x8_t input_s8 = vld1_s8(local_input_ptr);
local_input_ptr += 8;
int8x8_t input_s8_dup3[3];
for (int i = 0; i < 3; i++) {
input_s8_dup3[i] = vtbl1_s8(input_s8, dup3_indices[i]);
}
int16x8_t input_dup3[3];
for (int i = 0; i < 3; i++) {
const int16x8_t input_s16_dup3 = vmovl_s8(input_s8_dup3[i]);
input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset));
}
int32x4x3_t acc[2];
for (int i = 0; i < 2; i++) {
acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16);
}
for (int j = 0; j < 3; j++) {
acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]),
vget_low_s16(filter[j]));
acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]),
vget_high_s16(filter[j]));
}
for (int i = 0; i < 2; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]);
}
acc_buffer_ptr += 24;
}
for (; ic < input_depth; ic++) {
const int16_t input_val = *local_input_ptr++ + input_offset;
for (int i = 0; i < 3; i++) {
*acc_buffer_ptr++ +=
static_cast<int32_t>(local_filter_ptr[i]) * input_val;
}
local_filter_ptr += 3;
}
input_ptr += input_ptr_increment;
}
}
};
template <>
struct QuantizedDepthwiseConvKernel<true, 0, 2> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const int8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const int8_t* filter_ptr,
int32_t* acc_buffer_ptr) {
for (int outp = 0; outp < num_output_pixels; outp++) {
const int8_t* local_filter_ptr = filter_ptr;
const int8_t* local_input_ptr = input_ptr;
int ic = 0;
for (; ic <= input_depth - 8; ic += 8) {
int16x8_t filter[2];
int8x8x2_t filter_s8;
filter_s8.val[0] = vld1_s8(local_filter_ptr);
filter_s8.val[1] = vld1_s8(local_filter_ptr + 8);
local_filter_ptr += 16;
for (int i = 0; i < 2; i++) {
filter[i] = vmovl_s8(filter_s8.val[i]);
}
const int8x8_t input_s8 = vld1_s8(local_input_ptr);
local_input_ptr += 8;
const int16x8_t input_s16 = vmovl_s8(input_s8);
const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
const int16x8x2_t input_dup2 = vzipq_s16(input, input);
int32x4x2_t acc[2];
for (int i = 0; i < 2; i++) {
acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
}
for (int j = 0; j < 2; j++) {
acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]),
vget_low_s16(input_dup2.val[j]));
acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]),
vget_high_s16(input_dup2.val[j]));
}
for (int i = 0; i < 2; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
}
acc_buffer_ptr += 16;
}
for (; ic < input_depth; ic++) {
const int16_t input_val = *local_input_ptr++ + input_offset;
for (int i = 0; i < 2; i++) {
*acc_buffer_ptr++ +=
static_cast<int32_t>(local_filter_ptr[i]) * input_val;
}
local_filter_ptr += 2;
}
input_ptr += input_ptr_increment;
}
}
};
template <>
struct QuantizedDepthwiseConvKernel<true, 0, 1> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const int8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const int8_t* filter_ptr,
int32_t* acc_buffer_ptr) {
for (int outp = 0; outp < num_output_pixels; outp++) {
const int8_t* local_filter_ptr = filter_ptr;
const int8_t* local_input_ptr = input_ptr;
int ic = 0;
for (; ic <= input_depth - 16; ic += 16) {
int8x8_t filter_s8_0 = vld1_s8(local_filter_ptr + 8 * 0);
int8x8_t filter_s8_1 = vld1_s8(local_filter_ptr + 8 * 1);
local_filter_ptr += 16;
int16x8_t filter_0 = vmovl_s8(filter_s8_0);
int16x8_t filter_1 = vmovl_s8(filter_s8_1);
int8x8_t input_s8_0 = vld1_s8(local_input_ptr + 8 * 0);
int8x8_t input_s8_1 = vld1_s8(local_input_ptr + 8 * 1);
local_input_ptr += 16;
int16x8_t input_0 = vmovl_s8(input_s8_0);
int16x8_t input_1 = vmovl_s8(input_s8_1);
input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), vget_low_s16(filter_0));
acc_1 =
vmlal_s16(acc_1, vget_high_s16(input_0), vget_high_s16(filter_0));
acc_2 = vmlal_s16(acc_2, vget_low_s16(input_1), vget_low_s16(filter_1));
acc_3 =
vmlal_s16(acc_3, vget_high_s16(input_1), vget_high_s16(filter_1));
vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
acc_buffer_ptr += 16;
}
for (; ic <= input_depth - 8; ic += 8) {
const int8x8_t filter_s8 = vld1_s8(local_filter_ptr);
local_filter_ptr += 8;
const int16x8_t filter = vmovl_s8(filter_s8);
const int8x8_t input_s8 = vld1_s8(local_input_ptr);
local_input_ptr += 8;
const int16x8_t input_s16 = vmovl_s8(input_s8);
const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
int32x4_t acc[2];
for (int i = 0; i < 2; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
for (int i = 0; i < 2; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 8;
}
for (; ic < input_depth; ic++) {
const int16_t input_val = *local_input_ptr++ + input_offset;
const int16_t filter_val = *local_filter_ptr++;
*acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
}
input_ptr += input_ptr_increment;
}
}
};
template <>
struct QuantizedDepthwiseConvKernel<true, 16, 1> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const int8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const int8_t* filter_ptr,
int32_t* acc_buffer_ptr) {
int8x8_t filter_s8[2];
for (int i = 0; i < 2; i++) {
filter_s8[i] = vld1_s8(filter_ptr + 8 * i);
}
int16x8_t filter[2];
for (int i = 0; i < 2; i++) {
filter[i] = vmovl_s8(filter_s8[i]);
}
for (int outp = 0; outp < num_output_pixels; outp++) {
int8x8_t input_s8[2];
for (int i = 0; i < 2; i++) {
input_s8[i] = vld1_s8(input_ptr + 8 * i);
}
input_ptr += input_ptr_increment;
int16x8_t input[2];
for (int i = 0; i < 2; i++) {
input[i] = vmovl_s8(input_s8[i]);
}
for (int i = 0; i < 2; i++) {
input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
}
int32x4_t acc[4];
for (int i = 0; i < 4; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
for (int i = 0; i < 2; i++) {
acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]),
vget_low_s16(filter[i]));
acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]),
vget_high_s16(filter[i]));
}
for (int i = 0; i < 4; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 16;
}
}
};
template <>
struct QuantizedDepthwiseConvKernel<true, 8, 1> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const int8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const int8_t* filter_ptr,
int32_t* acc_buffer_ptr) {
const int8x8_t filter_s8 = vld1_s8(filter_ptr);
const int16x8_t filter = vmovl_s8(filter_s8);
for (int outp = 0; outp < num_output_pixels; outp++) {
const int8x8_t input_s8 = vld1_s8(input_ptr);
const int16x8_t input_s16 = vmovl_s8(input_s8);
const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
int32x4_t acc[2];
for (int i = 0; i < 2; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
for (int i = 0; i < 2; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 8;
input_ptr += input_ptr_increment;
}
}
};
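// For input_depth 1 the per-pixel input is a single scalar; vmlal_n_s16
// broadcasts it against the widened filter taps.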
template <>
struct QuantizedDepthwiseConvKernel<true, 1, 16> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const int8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const int8_t* filter_ptr,
int32_t* acc_buffer_ptr) {
int8x8_t filter_s8[2];
for (int i = 0; i < 2; i++) {
filter_s8[i] = vld1_s8(filter_ptr + 8 * i);
}
int16x8_t filter[2];
for (int i = 0; i < 2; i++) {
filter[i] = vmovl_s8(filter_s8[i]);
}
for (int outp = 0; outp < num_output_pixels; outp++) {
int8_t input_s8 = *input_ptr;
input_ptr += input_ptr_increment;
int16_t input = static_cast<int16_t>(input_s8 + input_offset);
int32x4_t acc[4];
for (int i = 0; i < 4; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
for (int i = 0; i < 2; i++) {
acc[2 * i + 0] =
vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input);
acc[2 * i + 1] =
vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input);
}
for (int i = 0; i < 4; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 16;
}
}
};
template <>
struct QuantizedDepthwiseConvKernel<true, 1, 32> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const int8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const int8_t* filter_ptr,
int32_t* acc_buffer_ptr) {
int8x8_t filter_s8_0 = vld1_s8(filter_ptr + 8 * 0);
int8x8_t filter_s8_1 = vld1_s8(filter_ptr + 8 * 1);
int8x8_t filter_s8_2 = vld1_s8(filter_ptr + 8 * 2);
int8x8_t filter_s8_3 = vld1_s8(filter_ptr + 8 * 3);
int16x8_t filter_0 = vmovl_s8(filter_s8_0);
int16x8_t filter_1 = vmovl_s8(filter_s8_1);
int16x8_t filter_2 = vmovl_s8(filter_s8_2);
int16x8_t filter_3 = vmovl_s8(filter_s8_3);
for (int outp = 0; outp < num_output_pixels; outp++) {
int8_t input_s8 = *input_ptr;
input_ptr += input_ptr_increment;
int16_t input = static_cast<int16_t>(input_s8 + input_offset);
int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
int32x4_t acc_5 = vld1q_s32(acc_buffer_ptr + 4 * 5);
int32x4_t acc_6 = vld1q_s32(acc_buffer_ptr + 4 * 6);
int32x4_t acc_7 = vld1q_s32(acc_buffer_ptr + 4 * 7);
acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
acc_4 = vmlal_n_s16(acc_4, vget_low_s16(filter_2), input);
acc_5 = vmlal_n_s16(acc_5, vget_high_s16(filter_2), input);
acc_6 = vmlal_n_s16(acc_6, vget_low_s16(filter_3), input);
acc_7 = vmlal_n_s16(acc_7, vget_high_s16(filter_3), input);
vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
vst1q_s32(acc_buffer_ptr + 4 * 5, acc_5);
vst1q_s32(acc_buffer_ptr + 4 * 6, acc_6);
vst1q_s32(acc_buffer_ptr + 4 * 7, acc_7);
acc_buffer_ptr += 32;
}
}
};
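// 20 is not a multiple of 8, so the third filter load starts at
// filter_ptr + 12 and overlaps the second load; only its high half
// (taps 16..19) is used.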
template <>
struct QuantizedDepthwiseConvKernel<true, 1, 20> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const int8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const int8_t* filter_ptr,
int32_t* acc_buffer_ptr) {
int8x8_t filter_s8_0 = vld1_s8(filter_ptr + 8 * 0);
int8x8_t filter_s8_1 = vld1_s8(filter_ptr + 8 * 1);
int8x8_t filter_s8_x = vld1_s8(filter_ptr + 8 * 1 + 4);
int16x8_t filter_0 = vmovl_s8(filter_s8_0);
int16x8_t filter_1 = vmovl_s8(filter_s8_1);
int16x8_t filter_x = vmovl_s8(filter_s8_x);
for (int outp = 0; outp < num_output_pixels; outp++) {
int8_t input_s8 = *input_ptr;
input_ptr += input_ptr_increment;
int16_t input = static_cast<int16_t>(input_s8 + input_offset);
int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
acc_4 = vmlal_n_s16(acc_4, vget_high_s16(filter_x), input);
vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
acc_buffer_ptr += 20;
}
}
};
template <>
struct QuantizedDepthwiseConvKernel<true, 1, 8> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const int8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const int8_t* filter_ptr,
int32_t* acc_buffer_ptr) {
const int8x8_t filter_s8 = vld1_s8(filter_ptr);
const int16x8_t filter = vmovl_s8(filter_s8);
for (int outp = 0; outp < num_output_pixels; outp++) {
int8_t input_s8 = *input_ptr;
input_ptr += input_ptr_increment;
int16_t input = static_cast<int16_t>(input_s8 + input_offset);
int32x4_t acc[2];
for (int i = 0; i < 2; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input);
acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input);
for (int i = 0; i < 2; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 8;
}
}
};
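// input_depth 2 with strides allowed: the two-pixel loop loads both int8
// channels of a pixel at once by reinterpreting the pointer as int16_t (a
// 2-byte load that may be unaligned), packing two pixels into lanes 0 and 1
// before widening.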
template <>
struct QuantizedDepthwiseConvKernel<true, 2, 1> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const int8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const int8_t* filter_ptr,
int32_t* acc_buffer_ptr) {
int8x8_t filter_s8 = vdup_n_s8(0);
filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 2);
filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 3);
const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));
int outp = 0;
for (; outp <= num_output_pixels - 2; outp += 2) {
int32x4_t acc = vld1q_s32(acc_buffer_ptr);
int16x4_t input_s16 = vdup_n_s16(0);
input_s16 = vset_lane_s16(
(reinterpret_cast<const int16_t*>(input_ptr))[0], input_s16, 0);
input_ptr += input_ptr_increment;
input_s16 = vset_lane_s16(
(reinterpret_cast<const int16_t*>(input_ptr))[0], input_s16, 1);
input_ptr += input_ptr_increment;
input_s16 = vget_low_s16(vmovl_s8(vreinterpret_s8_s16(input_s16)));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
acc = vmlal_s16(acc, filter, input);
vst1q_s32(acc_buffer_ptr, acc);
acc_buffer_ptr += 4;
}
for (; outp < num_output_pixels; outp++) {
int32x2_t acc = vld1_s32(acc_buffer_ptr);
int8x8_t input_s8 = vdup_n_s8(0);
input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
input_ptr += input_ptr_increment;
const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
vst1_s32(acc_buffer_ptr, acc);
acc_buffer_ptr += 2;
}
}
};
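// input_depth 4 with strides allowed: every output pixel except the last
// uses a full 8-byte vld1_s8 (reading 4 bytes beyond the 4 channels
// actually used); the last pixel is loaded lane by lane so the kernel never
// reads past the end of the input buffer. Hence the early return and the
// num_output_pixels - 1 loop bound.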
template <>
struct QuantizedDepthwiseConvKernel<true, 4, 1> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const int8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const int8_t* filter_ptr,
int32_t* acc_buffer_ptr) {
if (num_output_pixels <= 0) {
return;
}
int8x8_t filter_s8 = vdup_n_s8(0);
filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2);
filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3);
const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));
int outp = 0;
for (; outp < num_output_pixels - 1; outp++) {
int32x4_t acc;
acc = vld1q_s32(acc_buffer_ptr);
int8x8_t input_s8 = vld1_s8(input_ptr);
input_ptr += input_ptr_increment;
const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
acc = vmlal_s16(acc, filter, input);
vst1q_s32(acc_buffer_ptr, acc);
acc_buffer_ptr += 4;
}
int32x4_t acc;
acc = vld1q_s32(acc_buffer_ptr);
int8x8_t input_s8 = vdup_n_s8(0);
input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
acc = vmlal_s16(acc, filter, input);
vst1q_s32(acc_buffer_ptr, acc);
}
};
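// input_depth 12: the loads at filter_ptr and filter_ptr + 4 overlap so
// that the low half of the first widened vector covers channels 0..3, its
// high half 4..7, and the high half of the second vector 8..11.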
template <>
struct QuantizedDepthwiseConvKernel<false, 12, 1> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const int8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const int8_t* filter_ptr,
int32_t* acc_buffer_ptr) {
int8x8_t filter_s8_0 = vld1_s8(filter_ptr);
int8x8_t filter_s8_1 = vld1_s8(filter_ptr + 4);
int16x8_t filter_s16_0 = vmovl_s8(filter_s8_0);
int16x8_t filter_s16_1 = vmovl_s8(filter_s8_1);
int16x4_t filter_0 = vget_low_s16(filter_s16_0);
int16x4_t filter_1 = vget_high_s16(filter_s16_0);
int16x4_t filter_2 = vget_high_s16(filter_s16_1);
for (int outp = 0; outp < num_output_pixels; outp++) {
int8x8_t input_s8_0 = vld1_s8(input_ptr);
int8x8_t input_s8_1 = vld1_s8(input_ptr + 4);
input_ptr += input_ptr_increment;
int16x8_t input_0 = vmovl_s8(input_s8_0);
int16x8_t input_1 = vmovl_s8(input_s8_1);
input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), filter_0);
acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), filter_1);
acc_2 = vmlal_s16(acc_2, vget_high_s16(input_1), filter_2);
vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
acc_buffer_ptr += 12;
}
}
};
#endif
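// Accumulates the effect of one row of the filter, over a contiguous run of
// output pixels, into acc_buffer, using the kernel specialization named by
// the template parameters.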
template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
void QuantizedDepthwiseConvAccumRow(
int stride, int dilation_factor, int input_depth, int input_width,
const int8_t* input_data, int16_t input_offset, int pad_width,
int depth_multiplier, int filter_width, const int8_t* filter_data,
int out_x_buffer_start, int out_x_buffer_end, int output_depth,
int32_t* acc_buffer) { … }
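// Generic, portable fallback used when no specialized kernel matches the
// (input_depth, depth_multiplier, stride) combination.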
inline void QuantizedDepthwiseConvAccumRowGeneric(
int stride, int dilation_factor, int input_depth, int input_width,
const int8_t* input_data, int16_t input_offset, int pad_width,
int depth_multiplier, int filter_width, const int8_t* filter_data,
int out_x_buffer_start, int out_x_buffer_end, int output_depth,
int32_t* acc_buffer) { … }
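// Initializes the accumulator buffer by replicating bias_data once per
// output pixel.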
inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
const int32_t* bias_data,
int32_t* acc_buffer) { … }
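// The general path: for each batch and output row, accumulates filter
// windows into a stack-allocated int32 buffer via the row functions above,
// then requantizes with the per-channel output_multiplier and output_shift,
// adds the output offset, clamps to the activation range, and stores int8.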
inline void DepthwiseConvGeneral(
const DepthwiseParams& params, const int32_t* output_multiplier,
const int32_t* output_shift, const RuntimeShape& input_shape,
const int8_t* input_data, const RuntimeShape& filter_shape,
const int8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
int8_t* output_data, int thread_start, int thread_end, int thread_dim) { … }
}  // namespace depthwise_conv
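// Chooses, based on the filter shape and available CPU features, between
// the specialized 3x3 filter path and DepthwiseConvGeneral; the output
// rounding mode is fixed at compile time.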
template <DepthwiseConvOutputRounding kOutputRounding>
inline void DepthwiseConvWithRounding(
const DepthwiseParams& params, const int32_t* output_multiplier,
const int32_t* output_shift, const RuntimeShape& input_shape,
const int8_t* input_data, const RuntimeShape& filter_shape,
const int8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
int8_t* output_data, int thread_start, int thread_end, int thread_dim,
const CpuBackendContext& cpu_backend_context) { … }
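// Forwards to DepthwiseConvWithRounding; in the TFLite sources the int8
// path uses DepthwiseConvOutputRounding::kUpward.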
inline void DepthwiseConvImpl(
const DepthwiseParams& params, const int32_t* output_multiplier,
const int32_t* output_shift, const RuntimeShape& input_shape,
const int8_t* input_data, const RuntimeShape& filter_shape,
const int8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
int8_t* output_data, int thread_start, int thread_end, int thread_dim,
const CpuBackendContext& cpu_backend_context) { … }
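// cpu_backend_threadpool task: each instance runs the convolution over its
// [thread_start, thread_end) slice along thread_dim.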
template <typename T, typename TS>
struct DepthwiseConvWorkerTask : cpu_backend_threadpool::Task { … };
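// Heuristic thread count for splitting the work along thread_dim, based on
// the output and filter sizes.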
inline int HowManyConvThreads(const RuntimeShape& output_shape,
const RuntimeShape& filter_shape,
int thread_dim) { … }
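// Public entry point: picks a thread count, then either runs the
// convolution directly on the calling thread or fans the work out across
// the cpu_backend_threadpool via DepthwiseConvWorkerTask.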
inline void DepthwiseConvPerChannel(
const DepthwiseParams& params, const int32_t* output_multiplier,
const int32_t* output_shift, const RuntimeShape& input_shape,
const int8_t* input_data, const RuntimeShape& filter_shape,
const int8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
int8_t* output_data, CpuBackendContext* cpu_backend_context) { … }
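// Usage sketch (caller-side names are hypothetical; output_multiplier and
// output_shift carry one entry per output channel):
//
//   optimized_integer_ops::DepthwiseConvPerChannel(
//       params, output_multiplier.data(), output_shift.data(), input_shape,
//       input_data, filter_shape, filter_data, bias_shape, bias_data,
//       output_shape, output_data, &cpu_backend_context);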
}  // namespace optimized_integer_ops
}  // namespace tflite
#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_H_