#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
#include <algorithm>
#include <type_traits>
#include "ruy/profiler/instrumentation.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
#include "tensorflow/lite/kernels/internal/types.h"
#ifdef __AVX2__
#include <immintrin.h>
#endif  // __AVX2__
namespace tflite {
namespace optimized_ops {
namespace depthwise_conv {
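// Implementation of quantized DepthwiseConv.
//
// A kernel's Run() accumulates, for a run of output pixels along one row, the
// products (input[ic] + input_offset) * (filter[oc] + filter_offset) into a
// 32-bit accumulator buffer laid out as [num_output_pixels][output_depth],
// where oc = ic * depth_multiplier + m. Fixing the input depth and/or depth
// multiplier at compile time lets each specialization fully unroll its loads,
// lane shuffles and multiply-accumulates. kAllowStrided selects whether
// consecutive input pixels may be non-contiguous (input_ptr_increment is then
// honored); the non-strided kernels assume contiguous input and ignore that
// argument. kFixedInputDepth == 0 means the input depth is a runtime value.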
template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
struct QuantizedDepthwiseConvKernel { … };
#ifdef USE_NEON
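// Input depth 8, depth multiplier 2 (16 outputs per pixel), strided access:
// one output pixel per iteration, duplicating each input value across
// adjacent lanes with vzip so it meets both of its filter values.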
template <>
struct QuantizedDepthwiseConvKernel<true, 8, 2> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const uint8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const uint8_t* filter_ptr,
int16_t filter_offset, int32_t* acc_buffer_ptr) {
uint8x8x2_t filter_u8;
filter_u8.val[0] = vld1_u8(filter_ptr);
filter_u8.val[1] = vld1_u8(filter_ptr + 8);
int16x8_t filter[2];
for (int i = 0; i < 2; i++) {
filter[i] = vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])),
vdupq_n_s16(filter_offset));
}
for (int outp = 0; outp < num_output_pixels; outp++) {
int32x4x2_t acc[2];
for (int i = 0; i < 2; i++) {
acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
}
const uint8x8_t input_u8 = vld1_u8(input_ptr);
input_ptr += input_ptr_increment;
const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
const int16x8x2_t input_dup2 = vzipq_s16(input, input);
for (int i = 0; i < 2; i++) {
acc[0].val[i] = vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]),
vget_low_s16(input_dup2.val[i]));
acc[1].val[i] = vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]),
vget_high_s16(input_dup2.val[i]));
}
for (int i = 0; i < 2; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
}
acc_buffer_ptr += 16;
}
}
};
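// Input depth 8, depth multiplier 1 (8 outputs per pixel), contiguous input:
// two output pixels per iteration, with a one-pixel tail loop.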
template <>
struct QuantizedDepthwiseConvKernel<false, 8, 1> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const uint8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const uint8_t* filter_ptr,
int16_t filter_offset, int32_t* acc_buffer_ptr) {
const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
int outp = 0;
for (; outp <= num_output_pixels - 2; outp += 2) {
int32x4_t acc[4];
for (int i = 0; i < 4; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
uint8x8_t input_u8[2];
for (int i = 0; i < 2; i++) {
input_u8[i] = vld1_u8(input_ptr + 8 * i);
}
input_ptr += 16;
int16x8_t input[2];
for (int i = 0; i < 2; i++) {
input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
}
for (int i = 0; i < 2; i++) {
input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
}
acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0]));
acc[1] =
vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0]));
acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1]));
acc[3] =
vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1]));
for (int i = 0; i < 4; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 16;
}
for (; outp < num_output_pixels; outp++) {
int32x4_t acc[2];
acc[0] = vld1q_s32(acc_buffer_ptr);
acc[1] = vld1q_s32(acc_buffer_ptr + 4);
const uint8x8_t input_u8 = vld1_u8(input_ptr);
input_ptr += 8;
const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input));
acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input));
vst1q_s32(acc_buffer_ptr, acc[0]);
vst1q_s32(acc_buffer_ptr + 4, acc[1]);
acc_buffer_ptr += 8;
}
}
};
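// Input depth 4, depth multiplier 2 (8 outputs per pixel), contiguous input:
// two pixels at a time, duplicating each input value across adjacent lanes.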
template <>
struct QuantizedDepthwiseConvKernel<false, 4, 2> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const uint8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const uint8_t* filter_ptr,
int16_t filter_offset, int32_t* acc_buffer_ptr) {
const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
int outp = 0;
for (; outp <= num_output_pixels - 2; outp += 2) {
int32x4_t acc[4];
for (int i = 0; i < 4; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
const uint8x8_t input_u8 = vld1_u8(input_ptr);
input_ptr += 8;
const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
const int16x8x2_t input_dup2 = vzipq_s16(input, input);
for (int i = 0; i < 2; i++) {
acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(filter),
vget_low_s16(input_dup2.val[i]));
acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(filter),
vget_high_s16(input_dup2.val[i]));
}
for (int i = 0; i < 4; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 16;
}
for (; outp < num_output_pixels; outp++) {
int32x4_t acc[2];
for (int i = 0; i < 2; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
uint8x8_t input_u8 = vdup_n_u8(0);
input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
input_ptr += 4;
const int16x4_t input_s16 =
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
const int16x4x2_t input_dup2 = vzip_s16(input, input);
acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]);
acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]);
for (int i = 0; i < 2; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 8;
}
}
};
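// Input depth 2, depth multiplier 8 (16 outputs per pixel), contiguous input:
// each input lane is broadcast against its channel's 8 filter values.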
template <>
struct QuantizedDepthwiseConvKernel<false, 2, 8> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const uint8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const uint8_t* filter_ptr,
int16_t filter_offset, int32_t* acc_buffer_ptr) {
int16x8_t filter[2];
for (int i = 0; i < 2; i++) {
const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
}
int outp = 0;
for (; outp <= num_output_pixels - 2; outp += 2) {
int32x4_t acc[8];
for (int i = 0; i < 8; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
uint8x8_t input_u8 = vdup_n_u8(0);
input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
input_ptr += 4;
const int16x4_t input_s16 =
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2);
acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2);
acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3);
acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3);
for (int i = 0; i < 8; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 32;
}
for (; outp < num_output_pixels; outp++) {
int32x4_t acc[4];
for (int i = 0; i < 4; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
uint8x8_t input_u8 = vdup_n_u8(0);
input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
input_ptr += 2;
const int16x4_t input_s16 =
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
for (int i = 0; i < 4; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 16;
}
}
};
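// Input depth 2, depth multiplier 2 (4 outputs per pixel), contiguous input:
// four pixels at a time, with a one-pixel tail loop.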
template <>
struct QuantizedDepthwiseConvKernel<false, 2, 2> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const uint8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const uint8_t* filter_ptr,
int16_t filter_offset, int32_t* acc_buffer_ptr) {
uint8x8_t filter_u8 = vdup_n_u8(0);
filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
const int16x4_t filter_s16 =
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
int outp = 0;
for (; outp <= num_output_pixels - 4; outp += 4) {
int32x4_t acc[4];
for (int i = 0; i < 4; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
const uint8x8_t input_u8 = vld1_u8(input_ptr);
input_ptr += 8;
const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
const int16x8x2_t input_dup2 = vzipq_s16(input, input);
acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
for (int i = 0; i < 4; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 16;
}
for (; outp < num_output_pixels; outp++) {
int32x4_t acc = vld1q_s32(acc_buffer_ptr);
uint8x8_t input_u8 = vdup_n_u8(0);
input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
input_ptr += 2;
const int16x4_t input_s16 =
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
const int16x4_t input_dup2 = vzip_s16(input, input).val[0];
acc = vmlal_s16(acc, filter, input_dup2);
vst1q_s32(acc_buffer_ptr, acc);
acc_buffer_ptr += 4;
}
}
};
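// Input depth 2, depth multiplier 1 (2 outputs per pixel), contiguous input:
// the 2-channel filter is duplicated across lanes so chunks of 8, 4 and 2
// pixels can be handled before the final one-pixel loop.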
template <>
struct QuantizedDepthwiseConvKernel<false, 2, 1> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const uint8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const uint8_t* filter_ptr,
int16_t filter_offset, int32_t* acc_buffer_ptr) {
uint8x8_t filter_u8 = vdup_n_u8(0);
filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
const int16x4_t filter_s16 =
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
int outp = 0;
for (; outp <= num_output_pixels - 8; outp += 8) {
int32x4_t acc[4];
for (int i = 0; i < 4; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
uint8x8_t input_u8[2];
for (int i = 0; i < 2; i++) {
input_u8[i] = vld1_u8(input_ptr + 8 * i);
}
input_ptr += 16;
int16x8_t input[2];
for (int i = 0; i < 2; i++) {
input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
}
for (int i = 0; i < 2; i++) {
input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
}
acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0]));
acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0]));
acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1]));
acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1]));
for (int i = 0; i < 4; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 16;
}
for (; outp <= num_output_pixels - 4; outp += 4) {
int32x4_t acc[2];
for (int i = 0; i < 2; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
const uint8x8_t input_u8 = vld1_u8(input_ptr);
input_ptr += 8;
const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input));
acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input));
for (int i = 0; i < 2; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 8;
}
for (; outp <= num_output_pixels - 2; outp += 2) {
int32x4_t acc = vld1q_s32(acc_buffer_ptr);
uint8x8_t input_u8 = vdup_n_u8(0);
input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
input_ptr += 4;
const int16x4_t input_s16 =
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
acc = vmlal_s16(acc, filter, input);
vst1q_s32(acc_buffer_ptr, acc);
acc_buffer_ptr += 4;
}
for (; outp < num_output_pixels; outp++) {
int32x2_t acc = vld1_s32(acc_buffer_ptr);
uint8x8_t input_u8 = vdup_n_u8(0);
input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
input_ptr += 2;
const int16x4_t input_s16 =
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
vst1_s32(acc_buffer_ptr, acc);
acc_buffer_ptr += 2;
}
}
};
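// Input depth 1, depth multiplier 2 (2 outputs per pixel), contiguous input:
// the filter pair is duplicated across lanes; eight pixels per iteration.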
template <>
struct QuantizedDepthwiseConvKernel<false, 1, 2> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const uint8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const uint8_t* filter_ptr,
int16_t filter_offset, int32_t* acc_buffer_ptr) {
uint8x8_t filter_u8 = vdup_n_u8(0);
filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
const int16x4_t filter_s16 =
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
int outp = 0;
for (; outp <= num_output_pixels - 8; outp += 8) {
int32x4_t acc[4];
for (int i = 0; i < 4; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
const uint8x8_t input_u8 = vld1_u8(input_ptr);
input_ptr += 8;
const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
const int16x8x2_t input_dup2 = vzipq_s16(input, input);
acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
for (int i = 0; i < 4; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 16;
}
for (; outp < num_output_pixels; outp++) {
int32x2_t acc = vld1_s32(acc_buffer_ptr);
const uint32_t input = *input_ptr++ + input_offset;
acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input));
vst1_s32(acc_buffer_ptr, acc);
acc_buffer_ptr += 2;
}
}
};
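// Input depth 1, depth multiplier 4 (4 outputs per pixel), contiguous input:
// each input value is broadcast against the 4-wide filter; chunks of 8 and 4
// pixels, then a scalar-broadcast tail.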
template <>
struct QuantizedDepthwiseConvKernel<false, 1, 4> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const uint8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const uint8_t* filter_ptr,
int16_t filter_offset, int32_t* acc_buffer_ptr) {
uint8x8_t filter_u8 = vdup_n_u8(0);
filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
const int16x4_t filter_s16 =
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
int outp = 0;
for (; outp <= num_output_pixels - 8; outp += 8) {
int32x4_t acc[8];
for (int i = 0; i < 8; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
uint8x8_t input_u8 = vld1_u8(input_ptr);
input_ptr += 8;
const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0);
acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1);
acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2);
acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3);
acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0);
acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1);
acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2);
acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3);
for (int i = 0; i < 8; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 32;
}
for (; outp <= num_output_pixels - 4; outp += 4) {
int32x4_t acc[4];
for (int i = 0; i < 4; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
uint8x8_t input_u8 = vdup_n_u8(0);
input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
input_ptr += 4;
const int16x4_t input_s16 =
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
acc[0] = vmlal_lane_s16(acc[0], filter, input, 0);
acc[1] = vmlal_lane_s16(acc[1], filter, input, 1);
acc[2] = vmlal_lane_s16(acc[2], filter, input, 2);
acc[3] = vmlal_lane_s16(acc[3], filter, input, 3);
for (int i = 0; i < 4; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 16;
}
for (; outp < num_output_pixels; outp++) {
int32x4_t acc = vld1q_s32(acc_buffer_ptr);
const uint32_t input = *input_ptr++ + input_offset;
acc = vmlal_n_s16(acc, filter, input);
vst1q_s32(acc_buffer_ptr, acc);
acc_buffer_ptr += 4;
}
}
};
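// Input depth 4, depth multiplier 1 (4 outputs per pixel), contiguous input:
// four pixels (16 input bytes) per iteration.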
template <>
struct QuantizedDepthwiseConvKernel<false, 4, 1> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const uint8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const uint8_t* filter_ptr,
int16_t filter_offset, int32_t* acc_buffer_ptr) {
uint8x8_t filter_u8 = vdup_n_u8(0);
filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
const int16x4_t filter_s16 =
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
int outp = 0;
for (; outp <= num_output_pixels - 4; outp += 4) {
int32x4_t acc[4];
for (int i = 0; i < 4; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
int16x8_t input[2];
for (int i = 0; i < 2; i++) {
const uint8x8_t input_u8 = vld1_u8(input_ptr + 8 * i);
const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
}
input_ptr += 16;
for (int i = 0; i < 2; i++) {
acc[2 * i + 0] =
vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i]));
acc[2 * i + 1] =
vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i]));
}
for (int i = 0; i < 4; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 16;
}
for (; outp < num_output_pixels; outp++) {
int32x4_t acc;
acc = vld1q_s32(acc_buffer_ptr);
uint8x8_t input_u8 = vdup_n_u8(0);
input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
input_ptr += 4;
const int16x4_t input_s16 =
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
acc = vmlal_s16(acc, filter, input);
vst1q_s32(acc_buffer_ptr, acc);
acc_buffer_ptr += 4;
}
}
};
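// Input depth 4, depth multiplier 4 (16 outputs per pixel), contiguous input:
// two pixels per iteration via lane-broadcast multiply-accumulates.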
template <>
struct QuantizedDepthwiseConvKernel<false, 4, 4> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const uint8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const uint8_t* filter_ptr,
int16_t filter_offset, int32_t* acc_buffer_ptr) {
int16x8_t filter[2];
for (int i = 0; i < 2; i++) {
const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
}
int outp = 0;
for (; outp <= num_output_pixels - 2; outp += 2) {
int32x4_t acc[8];
for (int i = 0; i < 8; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
uint8x8_t input_u8 = vld1_u8(input_ptr);
input_ptr += 8;
const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]),
vget_low_s16(input), 0);
acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]),
vget_low_s16(input), 1);
acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]),
vget_low_s16(input), 2);
acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]),
vget_low_s16(input), 3);
acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]),
vget_high_s16(input), 0);
acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]),
vget_high_s16(input), 1);
acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]),
vget_high_s16(input), 2);
acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]),
vget_high_s16(input), 3);
for (int i = 0; i < 8; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 32;
}
for (; outp < num_output_pixels; outp++) {
int32x4_t acc[4];
for (int i = 0; i < 4; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
uint8x8_t input_u8 = vdup_n_u8(0);
input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
input_ptr += 4;
const int16x4_t input_s16 =
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1);
acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2);
acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3);
for (int i = 0; i < 4; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 16;
}
}
};
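// Variable input depth, depth multiplier 3, strided access: a vtbl lookup
// triplicates 8 input values into 3 vectors so each input channel meets its
// 3 consecutive output channels; scalar tail for leftover channels.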
template <>
struct QuantizedDepthwiseConvKernel<true, 0, 3> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const uint8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const uint8_t* filter_ptr,
int16_t filter_offset, int32_t* acc_buffer_ptr) {
static const uint8_t dup3_indices_array[3][8] = {{0, 0, 0, 1, 1, 1, 2, 2},
{2, 3, 3, 3, 4, 4, 4, 5},
{5, 5, 6, 6, 6, 7, 7, 7}};
uint8x8_t dup3_indices[3];
for (int i = 0; i < 3; i++) {
dup3_indices[i] = vld1_u8(dup3_indices_array[i]);
}
for (int outp = 0; outp < num_output_pixels; outp++) {
const uint8_t* local_filter_ptr = filter_ptr;
const uint8_t* local_input_ptr = input_ptr;
int ic = 0;
for (; ic <= input_depth - 8; ic += 8) {
int16x8_t filter[3];
uint8x8x3_t filter_u8;
filter_u8.val[0] = vld1_u8(local_filter_ptr);
filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
filter_u8.val[2] = vld1_u8(local_filter_ptr + 16);
local_filter_ptr += 24;
for (int i = 0; i < 3; i++) {
const int16x8_t filter_s16 =
vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
}
const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
local_input_ptr += 8;
uint8x8_t input_u8_dup3[3];
for (int i = 0; i < 3; i++) {
input_u8_dup3[i] = vtbl1_u8(input_u8, dup3_indices[i]);
}
int16x8_t input_dup3[3];
for (int i = 0; i < 3; i++) {
const int16x8_t input_s16_dup3 =
vreinterpretq_s16_u16(vmovl_u8(input_u8_dup3[i]));
input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset));
}
int32x4x3_t acc[2];
for (int i = 0; i < 2; i++) {
acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16);
}
for (int j = 0; j < 3; j++) {
acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]),
vget_low_s16(filter[j]));
acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]),
vget_high_s16(filter[j]));
}
for (int i = 0; i < 2; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]);
}
acc_buffer_ptr += 24;
}
for (; ic < input_depth; ic++) {
const int16_t input_val = *local_input_ptr++ + input_offset;
for (int i = 0; i < 3; i++) {
const int16_t filter_val = local_filter_ptr[i] + filter_offset;
*acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
}
local_filter_ptr += 3;
}
input_ptr += input_ptr_increment;
}
}
};
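// Variable input depth, depth multiplier 2, strided access: vzip duplicates
// each input value across adjacent output lanes; scalar tail.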
template <>
struct QuantizedDepthwiseConvKernel<true, 0, 2> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const uint8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const uint8_t* filter_ptr,
int16_t filter_offset, int32_t* acc_buffer_ptr) {
for (int outp = 0; outp < num_output_pixels; outp++) {
const uint8_t* local_filter_ptr = filter_ptr;
const uint8_t* local_input_ptr = input_ptr;
int ic = 0;
for (; ic <= input_depth - 8; ic += 8) {
int16x8_t filter[2];
uint8x8x2_t filter_u8;
filter_u8.val[0] = vld1_u8(local_filter_ptr);
filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
local_filter_ptr += 16;
for (int i = 0; i < 2; i++) {
const int16x8_t filter_s16 =
vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
}
const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
local_input_ptr += 8;
const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
const int16x8x2_t input_dup2 = vzipq_s16(input, input);
int32x4x2_t acc[2];
for (int i = 0; i < 2; i++) {
acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
}
for (int j = 0; j < 2; j++) {
acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]),
vget_low_s16(input_dup2.val[j]));
acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]),
vget_high_s16(input_dup2.val[j]));
}
for (int i = 0; i < 2; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
}
acc_buffer_ptr += 16;
}
for (; ic < input_depth; ic++) {
const int16_t input_val = *local_input_ptr++ + input_offset;
for (int i = 0; i < 2; i++) {
const int16_t filter_val = local_filter_ptr[i] + filter_offset;
*acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
}
local_filter_ptr += 2;
}
input_ptr += input_ptr_increment;
}
}
};
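// Variable input depth, depth multiplier 1, strided access: plain elementwise
// multiply-accumulate, 16 channels at a time (AVX2 where available, otherwise
// NEON), then 8 channels, then a scalar tail.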
template <>
struct QuantizedDepthwiseConvKernel<true, 0, 1> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const uint8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const uint8_t* filter_ptr,
int16_t filter_offset, int32_t* acc_buffer_ptr) {
for (int outp = 0; outp < num_output_pixels; outp++) {
const uint8_t* local_filter_ptr = filter_ptr;
const uint8_t* local_input_ptr = input_ptr;
int ic = 0;
for (; ic <= input_depth - 16; ic += 16) {
#ifdef __AVX2__
__m128i filter_u8_0 = _mm_loadl_epi64(
reinterpret_cast<const __m128i*>(local_filter_ptr + 8 * 0));
__m128i filter_u8_1 = _mm_loadl_epi64(
reinterpret_cast<const __m128i*>(local_filter_ptr + 8 * 1));
local_filter_ptr += 16;
__m256i filter_0 = _mm256_cvtepu8_epi32(filter_u8_0);
__m256i filter_1 = _mm256_cvtepu8_epi32(filter_u8_1);
__m256i filter_offset_vec = _mm256_set1_epi32(filter_offset);
filter_0 = _mm256_add_epi32(filter_0, filter_offset_vec);
filter_1 = _mm256_add_epi32(filter_1, filter_offset_vec);
__m128i input_u8_0 = _mm_loadl_epi64(
reinterpret_cast<const __m128i*>(local_input_ptr + 8 * 0));
__m128i input_u8_1 = _mm_loadl_epi64(
reinterpret_cast<const __m128i*>(local_input_ptr + 8 * 1));
local_input_ptr += 16;
__m256i input_0 = _mm256_cvtepu8_epi32(input_u8_0);
__m256i input_1 = _mm256_cvtepu8_epi32(input_u8_1);
__m256i input_offset_vec = _mm256_set1_epi32(input_offset);
input_0 = _mm256_add_epi32(input_0, input_offset_vec);
input_1 = _mm256_add_epi32(input_1, input_offset_vec);
__m256i acc_0 = _mm256_loadu_si256(
reinterpret_cast<const __m256i*>(acc_buffer_ptr + 8 * 0));
__m256i acc_1 = _mm256_loadu_si256(
reinterpret_cast<const __m256i*>(acc_buffer_ptr + 8 * 1));
acc_0 = _mm256_add_epi32(acc_0, _mm256_mullo_epi32(input_0, filter_0));
acc_1 = _mm256_add_epi32(acc_1, _mm256_mullo_epi32(input_1, filter_1));
_mm256_storeu_si256(reinterpret_cast<__m256i*>(acc_buffer_ptr + 8 * 0),
acc_0);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(acc_buffer_ptr + 8 * 1),
acc_1);
acc_buffer_ptr += 16;
#else
uint8x8_t filter_u8_0 = vld1_u8(local_filter_ptr + 8 * 0);
uint8x8_t filter_u8_1 = vld1_u8(local_filter_ptr + 8 * 1);
local_filter_ptr += 16;
int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
uint8x8_t input_u8_0 = vld1_u8(local_input_ptr + 8 * 0);
uint8x8_t input_u8_1 = vld1_u8(local_input_ptr + 8 * 1);
local_input_ptr += 16;
int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), vget_low_s16(filter_0));
acc_1 =
vmlal_s16(acc_1, vget_high_s16(input_0), vget_high_s16(filter_0));
acc_2 = vmlal_s16(acc_2, vget_low_s16(input_1), vget_low_s16(filter_1));
acc_3 =
vmlal_s16(acc_3, vget_high_s16(input_1), vget_high_s16(filter_1));
vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
acc_buffer_ptr += 16;
#endif  // __AVX2__
}
for (; ic <= input_depth - 8; ic += 8) {
const uint8x8_t filter_u8 = vld1_u8(local_filter_ptr);
local_filter_ptr += 8;
const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
const int16x8_t filter =
vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
local_input_ptr += 8;
const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
int32x4_t acc[2];
for (int i = 0; i < 2; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
for (int i = 0; i < 2; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 8;
}
for (; ic < input_depth; ic++) {
const int16_t input_val = *local_input_ptr++ + input_offset;
const int16_t filter_val = *local_filter_ptr++ + filter_offset;
*acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
}
input_ptr += input_ptr_increment;
}
}
};
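// Input depth 16, depth multiplier 1 (16 outputs per pixel), strided access.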
template <>
struct QuantizedDepthwiseConvKernel<true, 16, 1> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const uint8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const uint8_t* filter_ptr,
int16_t filter_offset, int32_t* acc_buffer_ptr) {
uint8x8_t filter_u8[2];
for (int i = 0; i < 2; i++) {
filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
}
int16x8_t filter[2];
for (int i = 0; i < 2; i++) {
filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
}
for (int i = 0; i < 2; i++) {
filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
}
for (int outp = 0; outp < num_output_pixels; outp++) {
uint8x8_t input_u8[2];
for (int i = 0; i < 2; i++) {
input_u8[i] = vld1_u8(input_ptr + 8 * i);
}
input_ptr += input_ptr_increment;
int16x8_t input[2];
for (int i = 0; i < 2; i++) {
input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
}
for (int i = 0; i < 2; i++) {
input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
}
int32x4_t acc[4];
for (int i = 0; i < 4; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
for (int i = 0; i < 2; i++) {
acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]),
vget_low_s16(filter[i]));
acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]),
vget_high_s16(filter[i]));
}
for (int i = 0; i < 4; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 16;
}
}
};
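// Input depth 8, depth multiplier 1 (8 outputs per pixel), strided access.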
template <>
struct QuantizedDepthwiseConvKernel<true, 8, 1> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const uint8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const uint8_t* filter_ptr,
int16_t filter_offset, int32_t* acc_buffer_ptr) {
const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
for (int outp = 0; outp < num_output_pixels; outp++) {
const uint8x8_t input_u8 = vld1_u8(input_ptr);
const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
int32x4_t acc[2];
for (int i = 0; i < 2; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
for (int i = 0; i < 2; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 8;
input_ptr += input_ptr_increment;
}
}
};
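// Input depth 1, depth multiplier 16: one input value per pixel, broadcast
// against 16 filter values.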
template <>
struct QuantizedDepthwiseConvKernel<true, 1, 16> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const uint8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const uint8_t* filter_ptr,
int16_t filter_offset, int32_t* acc_buffer_ptr) {
uint8x8_t filter_u8[2];
for (int i = 0; i < 2; i++) {
filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
}
int16x8_t filter[2];
for (int i = 0; i < 2; i++) {
filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
}
for (int i = 0; i < 2; i++) {
filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
}
for (int outp = 0; outp < num_output_pixels; outp++) {
uint8_t input_u8 = *input_ptr;
input_ptr += input_ptr_increment;
int16_t input = static_cast<int16_t>(input_u8 + input_offset);
int32x4_t acc[4];
for (int i = 0; i < 4; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
for (int i = 0; i < 2; i++) {
acc[2 * i + 0] =
vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input);
acc[2 * i + 1] =
vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input);
}
for (int i = 0; i < 4; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 16;
}
}
};
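// Input depth 1, depth multiplier 32: one input value per pixel, broadcast
// against 32 filter values.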
template <>
struct QuantizedDepthwiseConvKernel<true, 1, 32> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const uint8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const uint8_t* filter_ptr,
int16_t filter_offset, int32_t* acc_buffer_ptr) {
uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
uint8x8_t filter_u8_2 = vld1_u8(filter_ptr + 8 * 2);
uint8x8_t filter_u8_3 = vld1_u8(filter_ptr + 8 * 3);
int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
int16x8_t filter_2 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_2));
int16x8_t filter_3 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_3));
filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
filter_2 = vaddq_s16(filter_2, vdupq_n_s16(filter_offset));
filter_3 = vaddq_s16(filter_3, vdupq_n_s16(filter_offset));
for (int outp = 0; outp < num_output_pixels; outp++) {
uint8_t input_u8 = *input_ptr;
input_ptr += input_ptr_increment;
int16_t input = static_cast<int16_t>(input_u8 + input_offset);
int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
int32x4_t acc_5 = vld1q_s32(acc_buffer_ptr + 4 * 5);
int32x4_t acc_6 = vld1q_s32(acc_buffer_ptr + 4 * 6);
int32x4_t acc_7 = vld1q_s32(acc_buffer_ptr + 4 * 7);
acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
acc_4 = vmlal_n_s16(acc_4, vget_low_s16(filter_2), input);
acc_5 = vmlal_n_s16(acc_5, vget_high_s16(filter_2), input);
acc_6 = vmlal_n_s16(acc_6, vget_low_s16(filter_3), input);
acc_7 = vmlal_n_s16(acc_7, vget_high_s16(filter_3), input);
vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
vst1q_s32(acc_buffer_ptr + 4 * 5, acc_5);
vst1q_s32(acc_buffer_ptr + 4 * 6, acc_6);
vst1q_s32(acc_buffer_ptr + 4 * 7, acc_7);
acc_buffer_ptr += 32;
}
}
};
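// Input depth 1, depth multiplier 20: since 20 is not a multiple of 8, the
// last filter load overlaps (filter_u8_x holds values 12..19 and only its
// high half, values 16..19, is used).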
template <>
struct QuantizedDepthwiseConvKernel<true, 1, 20> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const uint8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const uint8_t* filter_ptr,
int16_t filter_offset, int32_t* acc_buffer_ptr) {
uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
uint8x8_t filter_u8_x = vld1_u8(filter_ptr + 8 * 1 + 4);
int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
int16x8_t filter_x = vreinterpretq_s16_u16(vmovl_u8(filter_u8_x));
filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
filter_x = vaddq_s16(filter_x, vdupq_n_s16(filter_offset));
for (int outp = 0; outp < num_output_pixels; outp++) {
uint8_t input_u8 = *input_ptr;
input_ptr += input_ptr_increment;
int16_t input = static_cast<int16_t>(input_u8 + input_offset);
int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
acc_4 = vmlal_n_s16(acc_4, vget_high_s16(filter_x), input);
vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
acc_buffer_ptr += 20;
}
}
};
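// Input depth 1, depth multiplier 8: one input value per pixel, broadcast
// against 8 filter values.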
template <>
struct QuantizedDepthwiseConvKernel<true, 1, 8> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const uint8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const uint8_t* filter_ptr,
int16_t filter_offset, int32_t* acc_buffer_ptr) {
const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
const int16x8_t filter = vaddq_s16(
vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset));
for (int outp = 0; outp < num_output_pixels; outp++) {
uint8_t input_u8 = *input_ptr;
input_ptr += input_ptr_increment;
int16_t input = static_cast<int16_t>(input_u8 + input_offset);
int32x4_t acc[2];
for (int i = 0; i < 2; i++) {
acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
}
acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input);
acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input);
for (int i = 0; i < 2; i++) {
vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
}
acc_buffer_ptr += 8;
}
}
};
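// Input depth 2, depth multiplier 1 (2 outputs per pixel), strided access:
// two pixels at a time, gathering each pixel's 2 input bytes with a 16-bit
// load.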
template <>
struct QuantizedDepthwiseConvKernel<true, 2, 1> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const uint8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const uint8_t* filter_ptr,
int16_t filter_offset, int32_t* acc_buffer_ptr) {
uint8x8_t filter_u8 = vdup_n_u8(0);
filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
const int16x4_t filter_s16 =
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
int outp = 0;
for (; outp <= num_output_pixels - 2; outp += 2) {
int32x4_t acc = vld1q_s32(acc_buffer_ptr);
uint16x4_t input_u16 = vdup_n_u16(0);
input_u16 = vset_lane_u16(
(reinterpret_cast<const uint16_t*>(input_ptr))[0], input_u16, 0);
input_ptr += input_ptr_increment;
input_u16 = vset_lane_u16(
(reinterpret_cast<const uint16_t*>(input_ptr))[0], input_u16, 1);
input_ptr += input_ptr_increment;
const int16x4_t input_s16 = vreinterpret_s16_u16(
vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16))));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
acc = vmlal_s16(acc, filter, input);
vst1q_s32(acc_buffer_ptr, acc);
acc_buffer_ptr += 4;
}
for (; outp < num_output_pixels; outp++) {
int32x2_t acc = vld1_s32(acc_buffer_ptr);
uint8x8_t input_u8 = vdup_n_u8(0);
input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
input_ptr += input_ptr_increment;
const int16x4_t input_s16 =
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
vst1_s32(acc_buffer_ptr, acc);
acc_buffer_ptr += 2;
}
}
};
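// Input depth 4, depth multiplier 1 (4 outputs per pixel), strided access:
// uses full 8-byte input loads (reading 4 bytes past the pixel) for all but
// the last pixel, which is loaded lane by lane to avoid out-of-bounds reads.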
template <>
struct QuantizedDepthwiseConvKernel<true, 4, 1> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const uint8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const uint8_t* filter_ptr,
int16_t filter_offset, int32_t* acc_buffer_ptr) {
if (num_output_pixels <= 0) {
return;
}
uint8x8_t filter_u8 = vdup_n_u8(0);
filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
const int16x4_t filter_s16 =
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
int outp = 0;
for (; outp < num_output_pixels - 1; outp++) {
int32x4_t acc;
acc = vld1q_s32(acc_buffer_ptr);
uint8x8_t input_u8 = vld1_u8(input_ptr);
input_ptr += input_ptr_increment;
const int16x4_t input_s16 =
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
acc = vmlal_s16(acc, filter, input);
vst1q_s32(acc_buffer_ptr, acc);
acc_buffer_ptr += 4;
}
int32x4_t acc;
acc = vld1q_s32(acc_buffer_ptr);
uint8x8_t input_u8 = vdup_n_u8(0);
input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
const int16x4_t input_s16 =
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
acc = vmlal_s16(acc, filter, input);
vst1q_s32(acc_buffer_ptr, acc);
}
};
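// Input depth 12, depth multiplier 1 (12 outputs per pixel), contiguous
// input: overlapping 8-byte loads at offsets 0 and 4 cover the 12 channels
// without reading past them.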
template <>
struct QuantizedDepthwiseConvKernel<false, 12, 1> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
const uint8_t* input_ptr, int16_t input_offset,
int input_ptr_increment, const uint8_t* filter_ptr,
int16_t filter_offset, int32_t* acc_buffer_ptr) {
uint8x8_t filter_u8_0 = vld1_u8(filter_ptr);
uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 4);
int16x8_t filter_s16_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
int16x8_t filter_s16_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
filter_s16_0 = vaddq_s16(filter_s16_0, vdupq_n_s16(filter_offset));
filter_s16_1 = vaddq_s16(filter_s16_1, vdupq_n_s16(filter_offset));
int16x4_t filter_0 = vget_low_s16(filter_s16_0);
int16x4_t filter_1 = vget_high_s16(filter_s16_0);
int16x4_t filter_2 = vget_high_s16(filter_s16_1);
for (int outp = 0; outp < num_output_pixels; outp++) {
uint8x8_t input_u8_0 = vld1_u8(input_ptr);
uint8x8_t input_u8_1 = vld1_u8(input_ptr + 4);
input_ptr += input_ptr_increment;
int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), filter_0);
acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), filter_1);
acc_2 = vmlal_s16(acc_2, vget_high_s16(input_1), filter_2);
vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
acc_buffer_ptr += 12;
}
}
};
#endif  // USE_NEON
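// Accumulates the effect of one row of the filter, on a segment of one row of
// the output, accessing the corresponding row of the input; dispatches to the
// QuantizedDepthwiseConvKernel specialization selected by the template
// arguments.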
template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
void QuantizedDepthwiseConvAccumRow(
int stride, int dilation_factor, int input_depth, int input_width,
const uint8_t* input_data, int16_t input_offset, int pad_width,
int depth_multiplier, int filter_width, const uint8_t* filter_data,
int16_t filter_offset, int out_x_buffer_start, int out_x_buffer_end,
int output_depth, int32_t* acc_buffer) { … }
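// Generic, portable fallback of QuantizedDepthwiseConvAccumRow, used when no
// specialized kernel matches the (stride, input depth, depth multiplier)
// combination.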
inline void QuantizedDepthwiseConvAccumRowGeneric(
int stride, int dilation_factor, int input_depth, int input_width,
const uint8_t* input_data, int16_t input_offset, int pad_width,
int depth_multiplier, int filter_width, const uint8_t* filter_data,
int16_t filter_offset, int out_x_buffer_start, int out_x_buffer_end,
int output_depth, int32_t* acc_buffer) { … }
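// Initializes the accumulator buffer with bias values.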
inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
const int32_t* bias_data,
int32_t* acc_buffer) { … }
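// The core accumulate-then-requantize loop: fills acc_buffer row by row, then
// scales by the output multiplier, rounds, adds output_offset, clamps and
// stores uint8 results. thread_start/thread_end/thread_dim bound the slice of
// the output this invocation computes, for multithreading.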
inline void DepthwiseConvGeneral(
const DepthwiseParams& params, const RuntimeShape& input_shape,
const uint8_t* input_data, const RuntimeShape& filter_shape,
const uint8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
uint8_t* output_data, int thread_start, int thread_end, int thread_dim) { … }
}  // namespace depthwise_conv
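// Dispatches between the specialized 3x3-filter fast path (when shapes and
// params qualify) and DepthwiseConvGeneral, with the output rounding mode
// fixed at compile time by kOutputRounding.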
template <DepthwiseConvOutputRounding kOutputRounding>
inline void DepthwiseConvWithRounding(
const DepthwiseParams& params, const RuntimeShape& input_shape,
const uint8_t* input_data, const RuntimeShape& filter_shape,
const uint8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
uint8_t* output_data, const CpuFlags& cpu_flags, int thread_start,
int thread_end, int thread_dim) { … }
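// Public entry point: picks the output rounding mode for the target and
// forwards to DepthwiseConvWithRounding.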
inline void DepthwiseConvImpl(
const DepthwiseParams& params, const RuntimeShape& input_shape,
const uint8_t* input_data, const RuntimeShape& filter_shape,
const uint8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
uint8_t* output_data, const CpuFlags& cpu_flags, int thread_start,
int thread_end, int thread_dim) { … }
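// Illustrative call sequence (a sketch, not part of this header's contract;
// the TFLite kernel wrapper normally prepares `params`, the shapes and the
// buffers, and may split work across threads):
//
//   CpuFlags cpu_flags;
//   GetCpuFlags(&cpu_flags);
//   // Single-threaded: cover all output rows (thread_dim == 1 splits rows).
//   DepthwiseConvImpl(params, input_shape, input_data, filter_shape,
//                     filter_data, bias_shape, bias_data, output_shape,
//                     output_data, cpu_flags, /*thread_start=*/0,
//                     /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);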
}  // namespace optimized_ops
}  // namespace tflite
#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_