#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_3X3_FILTER_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_3X3_FILTER_H_
#include <stddef.h>
#include <algorithm>
#include <memory>
#include "ruy/profiler/instrumentation.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace optimized_ops {
namespace depthwise_conv {
#ifdef USE_NEON
// Loads a 128-bit vector of unsigned bytes from |data_addr| and hands it back
// reinterpreted (bit pattern unchanged) as sixteen signed 8-bit lanes.
inline int8x16_t util_vld1q_x8(const uint8_t* data_addr) {
  const uint8x16_t unsigned_lanes = vld1q_u8(data_addr);
  return vreinterpretq_s8_u8(unsigned_lanes);
}
// Signed overload: loads a 128-bit vector of signed bytes from |data_addr|.
// No reinterpretation is needed because the lanes are already int8.
inline int8x16_t util_vld1q_x8(const int8_t* data_addr) {
  const int8x16_t signed_lanes = vld1q_s8(data_addr);
  return signed_lanes;
}
// Loads a 64-bit vector of unsigned bytes from |data_addr| and hands it back
// reinterpreted (bit pattern unchanged) as eight signed 8-bit lanes.
inline int8x8_t util_vld1_x8(const uint8_t* data_addr) {
  const uint8x8_t unsigned_lanes = vld1_u8(data_addr);
  return vreinterpret_s8_u8(unsigned_lanes);
}
// Signed overload: loads a 64-bit vector of signed bytes from |data_addr|.
// No reinterpretation is needed because the lanes are already int8.
inline int8x8_t util_vld1_x8(const int8_t* data_addr) {
  const int8x8_t signed_lanes = vld1_s8(data_addr);
  return signed_lanes;
}
#endif
#define STR …
#define STR_UNEXPANDED …
#if defined(__aarch64__) && !defined(GOOGLE_L4T)
#define vst1_lane_8x4 …
#define vst1q_lane_8x4 …
#define vld1q_lane_s8x8 …
#define vld1_lane_8x4 …
#define vld1q_lane_8x4 …
#define vld1q_dup_s8x4 …
#define OFFSET_INPUT_DEPTH …
#define OFFSET_INPUT_ROW_SIZE …
#define OFFSET_OUTPUT_DEPTH …
#define OFFSET_OUTPUT_ROW_SIZE …
#define OFFSET_FILTER_ROW_SIZE …
#define OFFSET_INPUT_OFFSET …
#define OFFSET_OUTPUT_OFFSET …
#define OFFSET_FILTER_OFFSET …
#define OFFSET_OUTPUT_MULTIPLIER …
#define OFFSET_OUTPUT_ACTIVATION_MIN …
#define OFFSET_OUTPUT_ACTIVATION_MAX …
#define OFFSET_OUTPUT_RIGHT_SHIFT …
#define OFFSET_INPUT_WIDTH …
#define OFFSET_INPUT_HEIGHT …
#define OFFSET_STRIDE_WIDTH …
#define OFFSET_STRIDE_HEIGHT …
#define OFFSET_OUTPUT_WIDTH …
#define OFFSET_OUTPUT_HEIGHT …
// Compile-time layout checks for DepthwiseConvParams.
//
// Each assert verifies that an OFFSET_* macro equals the byte offset of the
// correspondingly named field. The inline assembly in this file addresses
// fields as `[params_ptr, #OFFSET_*]`, so the macros and the struct layout
// must stay in lock-step; a mismatch fails the build here instead of making
// the assembly silently read the wrong field.
static_assert(offsetof(DepthwiseConvParams, input_depth) == OFFSET_INPUT_DEPTH,
"");
static_assert(offsetof(DepthwiseConvParams, input_row_size) ==
OFFSET_INPUT_ROW_SIZE,
"");
static_assert(offsetof(DepthwiseConvParams, output_depth) ==
OFFSET_OUTPUT_DEPTH,
"");
static_assert(offsetof(DepthwiseConvParams, output_row_size) ==
OFFSET_OUTPUT_ROW_SIZE,
"");
static_assert(offsetof(DepthwiseConvParams, filter_row_size) ==
OFFSET_FILTER_ROW_SIZE,
"");
static_assert(offsetof(DepthwiseConvParams, input_offset) ==
OFFSET_INPUT_OFFSET,
"");
static_assert(offsetof(DepthwiseConvParams, output_offset) ==
OFFSET_OUTPUT_OFFSET,
"");
static_assert(offsetof(DepthwiseConvParams, filter_offset) ==
OFFSET_FILTER_OFFSET,
"");
static_assert(offsetof(DepthwiseConvParams, output_multiplier) ==
OFFSET_OUTPUT_MULTIPLIER,
"");
static_assert(offsetof(DepthwiseConvParams, output_activation_min) ==
OFFSET_OUTPUT_ACTIVATION_MIN,
"");
static_assert(offsetof(DepthwiseConvParams, output_activation_max) ==
OFFSET_OUTPUT_ACTIVATION_MAX,
"");
static_assert(offsetof(DepthwiseConvParams, output_right_shift) ==
OFFSET_OUTPUT_RIGHT_SHIFT,
"");
static_assert(offsetof(DepthwiseConvParams, input_width) == OFFSET_INPUT_WIDTH,
"");
static_assert(offsetof(DepthwiseConvParams, input_height) ==
OFFSET_INPUT_HEIGHT,
"");
static_assert(offsetof(DepthwiseConvParams, stride_width) ==
OFFSET_STRIDE_WIDTH,
"");
static_assert(offsetof(DepthwiseConvParams, stride_height) ==
OFFSET_STRIDE_HEIGHT,
"");
static_assert(offsetof(DepthwiseConvParams, output_width) ==
OFFSET_OUTPUT_WIDTH,
"");
static_assert(offsetof(DepthwiseConvParams, output_height) ==
OFFSET_OUTPUT_HEIGHT,
"");
#define DP_OFFSET_INPUT_DEPTH …
#define DP_OFFSET_OUTPUT_DEPTH …
#define DP_OFFSET_STRIDE …
#define DP_OFFSET_BIAS_INCREMENT …
#define DP_OFFSET_INPUT_OFFSET …
#define DP_OFFSET_OUTPUT_OFFSET …
#define DP_OFFSET_OUTPUT_MULTIPLIER …
#define DP_OFFSET_OUTPUT_SHIFT …
#define DP_OFFSET_QUANTIZED_ACTIVATION_MIN …
#define DP_OFFSET_QUANTIZED_ACTIVATION_MAX …
#define DP_OFFSET_PADDING_LEFT …
#define DP_OFFSET_PADDING_RIGHT …
#define DP_OFFSET_PADDING_TOP …
#define DP_OFFSET_PADDING_BOTTOM …
#define DP_OFFSET_DEPTH_MICRO_REPEATS …
#define DP_OFFSET_WIDTH_MACRO_COUNT …
#define DP_OFFSET_INPUT_WIDTH_OVERALL_MICRO_REPEATS …
#define DP_OFFSET_INPUT_WIDTH_MICRO_REPEATS …
#define DP_OFFSET_RESIDUAL_WIDTH …
#define DP_OFFSET_OUTPUT_WIDTH_OVERALL_MICRO_REPEATS …
#define DP_OFFSET_OUTPUT_WIDTH_MICRO_REPEATS …
#define DP_OFFSET_OUTPUT_RESIDUAL_WIDTH …
#define DP_OFFSET_WORKSPACE_WIDTH_MICRO_REPEATS …
#define DP_OFFSET_HEIGHT_MACRO_COUNT …
#define DP_OFFSET_INBOUND_BLOCK_HEIGHT …
#define DP_OFFSET_OUTBOUND_BLOCK_HEIGHT …
#define DP_OFFSET_INPUT_HEIGHT_STRIDE …
#define DP_OFFSET_OUTPUT_HEIGHT_STRIDE …
#define DP_OFFSET_WORKSPACE_HEIGHT_STRIDE …
#define DP_OFFSET_FOUR_OVER_STRIDE …
#define DP_OFFSET_OUTPUT_MULTPLIPLIER_PER_CHANNEL …
#define DP_OFFSET_OUTPUT_SHIFT_PER_CHANNEL …
// Compile-time layout checks for DepthwiseConvDotProdParams.
//
// Each assert verifies that a DP_OFFSET_* macro equals the byte offset of the
// correspondingly named field, so that a change to the struct layout fails
// the build rather than going unnoticed.
// NOTE(review): presumably these macros are consumed by dot-product assembly
// kernels elsewhere in the file, in the same way the OFFSET_* macros are used
// for DepthwiseConvParams -- confirm against the code that uses them.
static_assert(offsetof(DepthwiseConvDotProdParams, input_depth) ==
DP_OFFSET_INPUT_DEPTH,
"");
static_assert(offsetof(DepthwiseConvDotProdParams, output_depth) ==
DP_OFFSET_OUTPUT_DEPTH,
"");
static_assert(offsetof(DepthwiseConvDotProdParams, stride) == DP_OFFSET_STRIDE,
"");
static_assert(offsetof(DepthwiseConvDotProdParams, bias_increment) ==
DP_OFFSET_BIAS_INCREMENT,
"");
static_assert(offsetof(DepthwiseConvDotProdParams, input_offset) ==
DP_OFFSET_INPUT_OFFSET,
"");
static_assert(offsetof(DepthwiseConvDotProdParams, output_offset) ==
DP_OFFSET_OUTPUT_OFFSET,
"");
static_assert(offsetof(DepthwiseConvDotProdParams, output_multiplier) ==
DP_OFFSET_OUTPUT_MULTIPLIER,
"");
static_assert(offsetof(DepthwiseConvDotProdParams, output_shift) ==
DP_OFFSET_OUTPUT_SHIFT,
"");
static_assert(offsetof(DepthwiseConvDotProdParams, quantized_activation_min) ==
DP_OFFSET_QUANTIZED_ACTIVATION_MIN,
"");
static_assert(offsetof(DepthwiseConvDotProdParams, quantized_activation_max) ==
DP_OFFSET_QUANTIZED_ACTIVATION_MAX,
"");
static_assert(offsetof(DepthwiseConvDotProdParams, padding_left) ==
DP_OFFSET_PADDING_LEFT,
"");
static_assert(offsetof(DepthwiseConvDotProdParams, padding_right) ==
DP_OFFSET_PADDING_RIGHT,
"");
static_assert(offsetof(DepthwiseConvDotProdParams, padding_top) ==
DP_OFFSET_PADDING_TOP,
"");
static_assert(offsetof(DepthwiseConvDotProdParams, padding_bottom) ==
DP_OFFSET_PADDING_BOTTOM,
"");
static_assert(offsetof(DepthwiseConvDotProdParams, depth_micro_repeats) ==
DP_OFFSET_DEPTH_MICRO_REPEATS,
"");
static_assert(offsetof(DepthwiseConvDotProdParams, width_macro_count) ==
DP_OFFSET_WIDTH_MACRO_COUNT,
"");
static_assert(offsetof(DepthwiseConvDotProdParams,
input_width_overall_micro_repeats) ==
DP_OFFSET_INPUT_WIDTH_OVERALL_MICRO_REPEATS,
"");
static_assert(offsetof(DepthwiseConvDotProdParams, input_width_micro_repeats) ==
DP_OFFSET_INPUT_WIDTH_MICRO_REPEATS,
"");
static_assert(offsetof(DepthwiseConvDotProdParams, residual_width) ==
DP_OFFSET_RESIDUAL_WIDTH,
"");
static_assert(offsetof(DepthwiseConvDotProdParams,
output_width_overall_micro_repeats) ==
DP_OFFSET_OUTPUT_WIDTH_OVERALL_MICRO_REPEATS,
"");
static_assert(offsetof(DepthwiseConvDotProdParams,
output_width_micro_repeats) ==
DP_OFFSET_OUTPUT_WIDTH_MICRO_REPEATS,
"");
static_assert(offsetof(DepthwiseConvDotProdParams, output_residual_width) ==
DP_OFFSET_OUTPUT_RESIDUAL_WIDTH,
"");
static_assert(offsetof(DepthwiseConvDotProdParams,
workspace_width_micro_repeats) ==
DP_OFFSET_WORKSPACE_WIDTH_MICRO_REPEATS,
"");
static_assert(offsetof(DepthwiseConvDotProdParams, height_macro_count) ==
DP_OFFSET_HEIGHT_MACRO_COUNT,
"");
static_assert(offsetof(DepthwiseConvDotProdParams, inbound_block_height) ==
DP_OFFSET_INBOUND_BLOCK_HEIGHT,
"");
static_assert(offsetof(DepthwiseConvDotProdParams, outbound_block_height) ==
DP_OFFSET_OUTBOUND_BLOCK_HEIGHT,
"");
static_assert(offsetof(DepthwiseConvDotProdParams, input_height_stride) ==
DP_OFFSET_INPUT_HEIGHT_STRIDE,
"");
static_assert(offsetof(DepthwiseConvDotProdParams, output_height_stride) ==
DP_OFFSET_OUTPUT_HEIGHT_STRIDE,
"");
static_assert(offsetof(DepthwiseConvDotProdParams, workspace_height_stride) ==
DP_OFFSET_WORKSPACE_HEIGHT_STRIDE,
"");
static_assert(offsetof(DepthwiseConvDotProdParams, four_over_stride) ==
DP_OFFSET_FOUR_OVER_STRIDE,
"");
static_assert(offsetof(DepthwiseConvDotProdParams,
output_multiplier_per_channel) ==
DP_OFFSET_OUTPUT_MULTPLIPLIER_PER_CHANNEL,
"");
static_assert(offsetof(DepthwiseConvDotProdParams, output_shift_per_channel) ==
DP_OFFSET_OUTPUT_SHIFT_PER_CHANNEL,
"");
#endif
#if defined(__aarch64__) && !defined(GOOGLE_L4T)
// Specialization of DepthwiseConvWindow for kAwayFromZero output rounding with
// trailing template arguments <8, 1, 1>.
// NOTE(review): the primary template is not visible here; the arguments
// presumably mean depth 8 and stride 1x1, which is consistent with Run():
// it handles 8 channels and advances the input by one pixel per output.
//
// Run() computes a 3x3 depthwise convolution for a block of
// |output_window_height| x |output_window_width| outputs, 8 uint8 channels
// each, using AArch64 inline assembly.
//
// Parameters (as used by the assembly below):
//   input_ptr       First input pixel of the window; 8 bytes are read per
//                   pixel. Each output reads a 3x3 neighbourhood starting at
//                   its own position.
//   filter_ptr      Nine 8-byte filter taps, consecutive taps separated by
//                   params_ptr->output_depth bytes.
//   bias_ptr        8 int32 biases (loaded as two groups of 4).
//   output_ptr      First output; 8 bytes are written per output. Adjacent
//                   outputs are params_ptr->output_depth bytes apart, rows
//                   are params_ptr->output_row_size bytes apart.
//   input_depth     Byte stride between horizontally adjacent input pixels.
//   input_row_size  Byte stride between vertically adjacent input pixels.
//   output_window_height / output_window_width
//                   Number of output rows / columns to produce.
//   params_ptr      Quantization and layout parameters, read through the
//                   OFFSET_* macros.
template <>
struct DepthwiseConvWindow<DepthwiseConvOutputRounding::kAwayFromZero, 8, 1,
1> {
public:
static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
const int32* bias_ptr, uint8* output_ptr,
int64_t input_depth, int64_t input_row_size,
int32 output_window_height, int32 output_window_width,
const DepthwiseConvParams* params_ptr) {
// Byte steps used when advancing by two input columns, two input rows and
// two output rows respectively.
const int64_t input_width_increment = 2 * input_depth;
const int64_t input_height_increment = 2 * input_row_size;
const int64_t output_height_increment = 2 * params_ptr->output_row_size;
// Assembler labels used for the branches below; all are #undef'd after the
// asm statement.
#define DEPTHWISECONV_LABEL_HEIGHT_2_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_1 …
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_1_END …
// Overall structure of the assembly:
//   1. Load constants and the nine filter taps.
//   2. While at least two output rows remain, process the row pair:
//        - main loop produces a 2x2 tile of outputs per iteration (left
//          column first, then right column);
//        - leftover code handles a final width of 2 or 1.
//   3. If one output row remains, process it two outputs at a time, again
//      with leftover code for a final width of 2 or 1.
//
// Register roles:
//   v0-v8    filter taps, widened to 16 bit with the filter offset added.
//   v9-v20   input pixels, widened to 16 bit with the input offset added.
//   v21-v24  32-bit accumulators: v21/v22 hold channels 0-3/4-7 of one
//            output, v23/v24 those of the other output.
//   v26      input offset, v27 output multiplier, v28 output shift.
//   v29      output offset, v30 activation min, v31 activation max. These
//            three double as scratch during rounding and are re-broadcast
//            from w2/w4/w0 afterwards.
//   x1       output_row_size, x3 output_depth, x10 bias_ptr + 16 bytes.
//   w5       remaining output width in the current row (pair).
asm volatile(
// Broadcast the quantization constants. The cmp result is consumed by the
// blt after the filter loads; nothing in between modifies the flags.
"ldr w9, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
"ldr x3, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
"cmp %w[output_window_height], #2\n"
"dup v26.8h, w9\n"
"ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
"ldr w2, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
"dup v27.4s, w9\n"
"ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
"dup v29.8h, w2\n"
"ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
"dup v30.16b, w4\n"
"ldr w0, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v31.16b, w0\n"
"dup v28.4s, w9\n"
"ldr w9, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
"add x10, %[bias_ptr], #16\n"
"ldr x1, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n"
// v9 temporarily holds the filter offset; it is overwritten with input data
// once the filters have been widened.
"dup v9.8h, w9\n"
// Load the nine filter taps (8 channels each) and add the filter offset
// while widening to 16 bit.
"ld1 {v0.8b}, [%[filter_ptr]], x3\n"
"ld1 {v1.8b}, [%[filter_ptr]], x3\n"
"uaddw v0.8h, v9.8h, v0.8b\n"
"ld1 {v2.8b}, [%[filter_ptr]], x3\n"
"uaddw v1.8h, v9.8h, v1.8b\n"
"ld1 {v3.8b}, [%[filter_ptr]], x3\n"
"uaddw v2.8h, v9.8h, v2.8b\n"
"ld1 {v4.8b}, [%[filter_ptr]], x3\n"
"uaddw v3.8h, v9.8h, v3.8b\n"
"ld1 {v5.8b}, [%[filter_ptr]], x3\n"
"uaddw v4.8h, v9.8h, v4.8b\n"
"ld1 {v6.8b}, [%[filter_ptr]], x3\n"
"uaddw v5.8h, v9.8h, v5.8b\n"
"ld1 {v7.8b}, [%[filter_ptr]], x3\n"
"uaddw v6.8h, v9.8h, v6.8b\n"
"ld1 {v8.8b}, [%[filter_ptr]], x3\n"
"uaddw v7.8h, v9.8h, v7.8b\n"
"uaddw v8.8h, v9.8h, v8.8b\n"
// Fewer than two output rows: skip the row-pair loop.
"blt " DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "f\n"
// ---- Two output rows at a time --------------------------------------------
DEPTHWISECONV_LABEL_HEIGHT_2_LOOP ":\n"
// x11 is the column base; x12-x15 walk the four input rows needed for two
// output rows. x6/x7 are the store pointers for the upper/lower output row.
// Load the first three columns of all four rows (v9-v20) and both bias
// halves into the accumulators.
"mov x11, %[input_ptr]\n"
"mov x12, x11\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"add x13, x11, %[input_row_size]\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"add x14, x13, %[input_row_size]\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"add x15, x14, %[input_row_size]\n"
"ld1 {v12.8b}, [x13], %[input_depth]\n"
"mov w5, %w[output_window_width]\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"mov x6, %[output_ptr]\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"add x7, %[output_ptr], x1\n"
"ld1 {v15.8b}, [x14], %[input_depth]\n"
// The main loop below pre-loads inputs for the following tile, so widths of
// exactly 2 or 1 are routed straight to the leftover code instead.
"cmp w5, #2\n"
"uaddw v9.8h, v26.8h, v9.8b\n"
"ld1 {v16.8b}, [x14], %[input_depth]\n"
"uaddw v10.8h, v26.8h, v10.8b\n"
"ld1 {v17.8b}, [x14], %[input_depth]\n"
"uaddw v11.8h, v26.8h, v11.8b\n"
"ld1 {v18.8b}, [x15], %[input_depth]\n"
"uaddw v12.8h, v26.8h, v12.8b\n"
"ld1 {v19.8b}, [x15], %[input_depth]\n"
"uaddw v13.8h, v26.8h, v13.8b\n"
"ld1 {v20.8b}, [x15], %[input_depth]\n"
"uaddw v14.8h, v26.8h, v14.8b\n"
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"uaddw v15.8h, v26.8h, v15.8b\n"
"ld1 {v22.4s}, [x10]\n"
"uaddw v16.8h, v26.8h, v16.8b\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
"uaddw v17.8h, v26.8h, v17.8b\n"
"ld1 {v24.4s}, [x10]\n"
"uaddw v18.8h, v26.8h, v18.8b\n"
"uaddw v19.8h, v26.8h, v19.8b\n"
"uaddw v20.8h, v26.8h, v20.8b\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "f\n"
"cmp w5, #1\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n"
// Main 2x2 loop. Entered only with w5 >= 3; repeats while w5 >= 3 after the
// decrement, so it exits with 1 or 2 columns left.
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP ":\n"
// Left column of the tile: v21/v22 accumulate the upper output (input rows
// 0-2), v23/v24 the lower output (input rows 1-3). The fourth input column
// of each row is loaded into v9/v12/v15/v18 as soon as the old value has
// been consumed. The cmp result is used by the bge at the end of the loop.
"smlal v21.4s, v0.4h, v9.4h\n"
"subs w5, w5, #2\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"cmp w5, #3\n"
"smlal v23.4s, v0.4h, v12.4h\n"
"ld1 {v9.8b}, [x12]\n"
"smlal2 v24.4s, v0.8h, v12.8h\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"smlal v23.4s, v1.4h, v13.4h\n"
"smlal2 v24.4s, v1.8h, v13.8h\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"smlal v23.4s, v2.4h, v14.4h\n"
"smlal2 v24.4s, v2.8h, v14.8h\n"
"smlal v21.4s, v3.4h, v12.4h\n"
"smlal2 v22.4s, v3.8h, v12.8h\n"
"ld1 {v12.8b}, [x13]\n"
"smlal v23.4s, v3.4h, v15.4h\n"
"smlal2 v24.4s, v3.8h, v15.8h\n"
"smlal v21.4s, v4.4h, v13.4h\n"
"smlal2 v22.4s, v4.8h, v13.8h\n"
"smlal v23.4s, v4.4h, v16.4h\n"
"smlal2 v24.4s, v4.8h, v16.8h\n"
"smlal v21.4s, v5.4h, v14.4h\n"
"smlal2 v22.4s, v5.8h, v14.8h\n"
"smlal v23.4s, v5.4h, v17.4h\n"
"smlal2 v24.4s, v5.8h, v17.8h\n"
"smlal v21.4s, v6.4h, v15.4h\n"
"smlal2 v22.4s, v6.8h, v15.8h\n"
"ld1 {v15.8b}, [x14]\n"
"smlal v23.4s, v6.4h, v18.4h\n"
"smlal2 v24.4s, v6.8h, v18.8h\n"
"ld1 {v18.8b}, [x15]\n"
"smlal v21.4s, v7.4h, v16.4h\n"
"smlal2 v22.4s, v7.8h, v16.8h\n"
"smlal v23.4s, v7.4h, v19.4h\n"
"smlal2 v24.4s, v7.8h, v19.8h\n"
"smlal v21.4s, v8.4h, v17.4h\n"
"smlal2 v22.4s, v8.8h, v17.8h\n"
"smlal v23.4s, v8.4h, v20.4h\n"
"smlal2 v24.4s, v8.8h, v20.8h\n"
// Requantize. sqrdmulh applies the output multiplier. The and/sshr #31 pair
// yields -1 in lanes where both the accumulator and the shift in v28 are
// negative (0 otherwise); adding that before the rounding shift (srshl)
// is what gives this specialization its away-from-zero rounding.
// v29-v31 serve as scratch here and are re-broadcast right after use.
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
"and v25.16b, v21.16b, v28.16b\n"
"and v29.16b, v22.16b, v28.16b\n"
"and v30.16b, v23.16b, v28.16b\n"
"and v31.16b, v24.16b, v28.16b\n"
"sshr v25.4s, v25.4s, #31\n"
"sshr v29.4s, v29.4s, #31\n"
"sshr v30.4s, v30.4s, #31\n"
"sshr v31.4s, v31.4s, #31\n"
"sqadd v21.4s, v21.4s, v25.4s\n"
"sqadd v22.4s, v22.4s, v29.4s\n"
"dup v29.8h, w2\n"
"sqadd v23.4s, v23.4s, v30.4s\n"
"dup v30.16b, w4\n"
"sqadd v24.4s, v24.4s, v31.4s\n"
"dup v31.16b, w0\n"
"srshl v21.4s, v21.4s, v28.4s\n"
"srshl v22.4s, v22.4s, v28.4s\n"
"srshl v23.4s, v23.4s, v28.4s\n"
"srshl v24.4s, v24.4s, v28.4s\n"
// Narrow to 16 bit, add the output offset, narrow to unsigned 8 bit, clamp
// to [activation min, activation max] and store: low half of v21 to the
// upper output row (x6), high half to the lower output row (x7).
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtun v21.8b, v21.8h\n"
"sqxtun2 v21.16b, v23.8h\n"
"ld1 {v22.4s}, [x10]\n"
"umax v21.16b, v21.16b, v30.16b\n"
"umin v21.16b, v21.16b, v31.16b\n"
"ld1 {v24.4s}, [x10]\n"
"uaddw v9.8h, v26.8h, v9.8b\n"
"st1 {v21.8b}, [x6], x3\n"
"uaddw v12.8h, v26.8h, v12.8b\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [x7], x3\n"
"uaddw v15.8h, v26.8h, v15.8b\n"
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"uaddw v18.8h, v26.8h, v18.8b\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
// Right column of the tile: same taps applied to input columns 1-3, where
// column 3 is the freshly loaded v9/v12/v15/v18. Meanwhile the column base
// advances by two pixels and the next tile's inputs are loaded into each
// register as soon as its current value has been consumed.
"smlal v21.4s, v0.4h, v10.4h\n"
"add x11, x11, %[input_width_increment]\n"
"smlal2 v22.4s, v0.8h, v10.8h\n"
"mov x12, x11\n"
"smlal v23.4s, v0.4h, v13.4h\n"
"add x13, x11, %[input_row_size]\n"
"smlal2 v24.4s, v0.8h, v13.8h\n"
"add x14, x13, %[input_row_size]\n"
"smlal v21.4s, v1.4h, v11.4h\n"
"add x15, x14, %[input_row_size]\n"
"smlal2 v22.4s, v1.8h, v11.8h\n"
"smlal v23.4s, v1.4h, v14.4h\n"
"smlal2 v24.4s, v1.8h, v14.8h\n"
"smlal v21.4s, v2.4h, v9.4h\n"
"smlal2 v22.4s, v2.8h, v9.8h\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v2.4h, v12.4h\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"smlal2 v24.4s, v2.8h, v12.8h\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"smlal v21.4s, v3.4h, v13.4h\n"
"smlal2 v22.4s, v3.8h, v13.8h\n"
"smlal v23.4s, v3.4h, v16.4h\n"
"smlal2 v24.4s, v3.8h, v16.8h\n"
"smlal v21.4s, v4.4h, v14.4h\n"
"smlal2 v22.4s, v4.8h, v14.8h\n"
"smlal v23.4s, v4.4h, v17.4h\n"
"smlal2 v24.4s, v4.8h, v17.8h\n"
"smlal v21.4s, v5.4h, v12.4h\n"
"smlal2 v22.4s, v5.8h, v12.8h\n"
"ld1 {v12.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v5.4h, v15.4h\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"smlal2 v24.4s, v5.8h, v15.8h\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"smlal v21.4s, v6.4h, v16.4h\n"
"smlal2 v22.4s, v6.8h, v16.8h\n"
"smlal v23.4s, v6.4h, v19.4h\n"
"smlal2 v24.4s, v6.8h, v19.8h\n"
"smlal v21.4s, v7.4h, v17.4h\n"
"smlal2 v22.4s, v7.8h, v17.8h\n"
"smlal v23.4s, v7.4h, v20.4h\n"
"smlal2 v24.4s, v7.8h, v20.8h\n"
"smlal v21.4s, v8.4h, v15.4h\n"
"smlal2 v22.4s, v8.8h, v15.8h\n"
"ld1 {v15.8b}, [x14], %[input_depth]\n"
"smlal v23.4s, v8.4h, v18.4h\n"
"ld1 {v16.8b}, [x14], %[input_depth]\n"
"smlal2 v24.4s, v8.8h, v18.8h\n"
"ld1 {v17.8b}, [x14], %[input_depth]\n"
// Requantize, clamp and store the right column (same sequence as above),
// interleaved with the remaining loads for the next tile.
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"ld1 {v18.8b}, [x15], %[input_depth]\n"
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
"ld1 {v19.8b}, [x15], %[input_depth]\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"ld1 {v20.8b}, [x15], %[input_depth]\n"
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
"and v25.16b, v21.16b, v28.16b\n"
"and v29.16b, v22.16b, v28.16b\n"
"and v30.16b, v23.16b, v28.16b\n"
"and v31.16b, v24.16b, v28.16b\n"
"sshr v25.4s, v25.4s, #31\n"
"sshr v29.4s, v29.4s, #31\n"
"sshr v30.4s, v30.4s, #31\n"
"sshr v31.4s, v31.4s, #31\n"
"sqadd v21.4s, v21.4s, v25.4s\n"
"sqadd v22.4s, v22.4s, v29.4s\n"
"dup v29.8h, w2\n"
"sqadd v23.4s, v23.4s, v30.4s\n"
"dup v30.16b, w4\n"
"sqadd v24.4s, v24.4s, v31.4s\n"
"dup v31.16b, w0\n"
"srshl v21.4s, v21.4s, v28.4s\n"
"srshl v22.4s, v22.4s, v28.4s\n"
"srshl v23.4s, v23.4s, v28.4s\n"
"srshl v24.4s, v24.4s, v28.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtun v21.8b, v21.8h\n"
"sqxtun2 v21.16b, v23.8h\n"
"ld1 {v22.4s}, [x10]\n"
"umax v21.16b, v21.16b, v30.16b\n"
"umin v21.16b, v21.16b, v31.16b\n"
"ld1 {v24.4s}, [x10]\n"
"uaddw v9.8h, v26.8h, v9.8b\n"
"st1 {v21.8b}, [x6], x3\n"
"uaddw v10.8h, v26.8h, v10.8b\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [x7], x3\n"
"uaddw v11.8h, v26.8h, v11.8b\n"
"uaddw v12.8h, v26.8h, v12.8b\n"
"uaddw v13.8h, v26.8h, v13.8b\n"
"uaddw v14.8h, v26.8h, v14.8b\n"
"uaddw v15.8h, v26.8h, v15.8b\n"
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"uaddw v16.8h, v26.8h, v16.8b\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
"uaddw v17.8h, v26.8h, v17.8b\n"
"uaddw v18.8h, v26.8h, v18.8b\n"
"uaddw v19.8h, v26.8h, v19.8b\n"
"uaddw v20.8h, v26.8h, v20.8b\n"
// Flags still come from "cmp w5, #3" at the top of the loop.
"bge " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "b\n"
// One or two columns remain for this row pair.
"cmp w5, #2\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n"
// Final 2x2 tile of the row pair: identical arithmetic to the main loop but
// without pre-loading inputs for a following tile.
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER ":\n"
// Left column.
"smlal v21.4s, v0.4h, v9.4h\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"smlal v23.4s, v0.4h, v12.4h\n"
"ld1 {v9.8b}, [x12]\n"
"smlal2 v24.4s, v0.8h, v12.8h\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"smlal v23.4s, v1.4h, v13.4h\n"
"smlal2 v24.4s, v1.8h, v13.8h\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"smlal v23.4s, v2.4h, v14.4h\n"
"smlal2 v24.4s, v2.8h, v14.8h\n"
"smlal v21.4s, v3.4h, v12.4h\n"
"smlal2 v22.4s, v3.8h, v12.8h\n"
"ld1 {v12.8b}, [x13]\n"
"smlal v23.4s, v3.4h, v15.4h\n"
"smlal2 v24.4s, v3.8h, v15.8h\n"
"smlal v21.4s, v4.4h, v13.4h\n"
"smlal2 v22.4s, v4.8h, v13.8h\n"
"smlal v23.4s, v4.4h, v16.4h\n"
"smlal2 v24.4s, v4.8h, v16.8h\n"
"smlal v21.4s, v5.4h, v14.4h\n"
"smlal2 v22.4s, v5.8h, v14.8h\n"
"smlal v23.4s, v5.4h, v17.4h\n"
"smlal2 v24.4s, v5.8h, v17.8h\n"
"smlal v21.4s, v6.4h, v15.4h\n"
"smlal2 v22.4s, v6.8h, v15.8h\n"
"ld1 {v15.8b}, [x14]\n"
"smlal v23.4s, v6.4h, v18.4h\n"
"smlal2 v24.4s, v6.8h, v18.8h\n"
"ld1 {v18.8b}, [x15]\n"
"smlal v21.4s, v7.4h, v16.4h\n"
"smlal2 v22.4s, v7.8h, v16.8h\n"
"smlal v23.4s, v7.4h, v19.4h\n"
"smlal2 v24.4s, v7.8h, v19.8h\n"
"smlal v21.4s, v8.4h, v17.4h\n"
"smlal2 v22.4s, v8.8h, v17.8h\n"
"smlal v23.4s, v8.4h, v20.4h\n"
"smlal2 v24.4s, v8.8h, v20.8h\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
"and v25.16b, v21.16b, v28.16b\n"
"and v29.16b, v22.16b, v28.16b\n"
"and v30.16b, v23.16b, v28.16b\n"
"and v31.16b, v24.16b, v28.16b\n"
"sshr v25.4s, v25.4s, #31\n"
"sshr v29.4s, v29.4s, #31\n"
"sshr v30.4s, v30.4s, #31\n"
"sshr v31.4s, v31.4s, #31\n"
"sqadd v21.4s, v21.4s, v25.4s\n"
"sqadd v22.4s, v22.4s, v29.4s\n"
"dup v29.8h, w2\n"
"sqadd v23.4s, v23.4s, v30.4s\n"
"dup v30.16b, w4\n"
"sqadd v24.4s, v24.4s, v31.4s\n"
"dup v31.16b, w0\n"
"srshl v21.4s, v21.4s, v28.4s\n"
"srshl v22.4s, v22.4s, v28.4s\n"
"srshl v23.4s, v23.4s, v28.4s\n"
"srshl v24.4s, v24.4s, v28.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtun v21.8b, v21.8h\n"
"sqxtun2 v21.16b, v23.8h\n"
"ld1 {v22.4s}, [x10]\n"
"umax v21.16b, v21.16b, v30.16b\n"
"umin v21.16b, v21.16b, v31.16b\n"
"ld1 {v24.4s}, [x10]\n"
"uaddw v9.8h, v26.8h, v9.8b\n"
"st1 {v21.8b}, [x6], x3\n"
"mov v23.d[0], v21.d[1]\n"
"uaddw v12.8h, v26.8h, v12.8b\n"
"st1 {v23.8b}, [x7], x3\n"
"uaddw v15.8h, v26.8h, v15.8b\n"
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"uaddw v18.8h, v26.8h, v18.8b\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
// Right column.
"smlal v21.4s, v0.4h, v10.4h\n"
"smlal2 v22.4s, v0.8h, v10.8h\n"
"smlal v23.4s, v0.4h, v13.4h\n"
"smlal2 v24.4s, v0.8h, v13.8h\n"
"smlal v21.4s, v1.4h, v11.4h\n"
"smlal2 v22.4s, v1.8h, v11.8h\n"
"smlal v23.4s, v1.4h, v14.4h\n"
"smlal2 v24.4s, v1.8h, v14.8h\n"
"smlal v21.4s, v2.4h, v9.4h\n"
"smlal2 v22.4s, v2.8h, v9.8h\n"
"smlal v23.4s, v2.4h, v12.4h\n"
"smlal2 v24.4s, v2.8h, v12.8h\n"
"smlal v21.4s, v3.4h, v13.4h\n"
"smlal2 v22.4s, v3.8h, v13.8h\n"
"smlal v23.4s, v3.4h, v16.4h\n"
"smlal2 v24.4s, v3.8h, v16.8h\n"
"smlal v21.4s, v4.4h, v14.4h\n"
"smlal2 v22.4s, v4.8h, v14.8h\n"
"smlal v23.4s, v4.4h, v17.4h\n"
"smlal2 v24.4s, v4.8h, v17.8h\n"
"smlal v21.4s, v5.4h, v12.4h\n"
"smlal2 v22.4s, v5.8h, v12.8h\n"
"smlal v23.4s, v5.4h, v15.4h\n"
"smlal2 v24.4s, v5.8h, v15.8h\n"
"smlal v21.4s, v6.4h, v16.4h\n"
"smlal2 v22.4s, v6.8h, v16.8h\n"
"smlal v23.4s, v6.4h, v19.4h\n"
"smlal2 v24.4s, v6.8h, v19.8h\n"
"smlal v21.4s, v7.4h, v17.4h\n"
"smlal2 v22.4s, v7.8h, v17.8h\n"
"smlal v23.4s, v7.4h, v20.4h\n"
"smlal2 v24.4s, v7.8h, v20.8h\n"
"smlal v21.4s, v8.4h, v15.4h\n"
"smlal2 v22.4s, v8.8h, v15.8h\n"
"smlal v23.4s, v8.4h, v18.4h\n"
"smlal2 v24.4s, v8.8h, v18.8h\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
"and v25.16b, v21.16b, v28.16b\n"
"and v29.16b, v22.16b, v28.16b\n"
"and v30.16b, v23.16b, v28.16b\n"
"and v31.16b, v24.16b, v28.16b\n"
"sshr v25.4s, v25.4s, #31\n"
"sshr v29.4s, v29.4s, #31\n"
"sshr v30.4s, v30.4s, #31\n"
"sshr v31.4s, v31.4s, #31\n"
"sqadd v21.4s, v21.4s, v25.4s\n"
"sqadd v22.4s, v22.4s, v29.4s\n"
"dup v29.8h, w2\n"
"sqadd v23.4s, v23.4s, v30.4s\n"
"dup v30.16b, w4\n"
"sqadd v24.4s, v24.4s, v31.4s\n"
"dup v31.16b, w0\n"
"srshl v21.4s, v21.4s, v28.4s\n"
"srshl v22.4s, v22.4s, v28.4s\n"
"srshl v23.4s, v23.4s, v28.4s\n"
"srshl v24.4s, v24.4s, v28.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtun v21.8b, v21.8h\n"
"sqxtun2 v21.16b, v23.8h\n"
"umax v21.16b, v21.16b, v30.16b\n"
"umin v21.16b, v21.16b, v31.16b\n"
"st1 {v21.8b}, [x6], x3\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [x7], x3\n"
"b " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "f\n"
// Final single column of the row pair (one upper and one lower output).
// No further inputs are needed, so v9/v12/v15/v18 are used as rounding
// scratch and v29-v31 keep their constant values.
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"smlal v23.4s, v0.4h, v12.4h\n"
"smlal2 v24.4s, v0.8h, v12.8h\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"smlal v23.4s, v1.4h, v13.4h\n"
"smlal2 v24.4s, v1.8h, v13.8h\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"smlal v23.4s, v2.4h, v14.4h\n"
"smlal2 v24.4s, v2.8h, v14.8h\n"
"smlal v21.4s, v3.4h, v12.4h\n"
"smlal2 v22.4s, v3.8h, v12.8h\n"
"smlal v23.4s, v3.4h, v15.4h\n"
"smlal2 v24.4s, v3.8h, v15.8h\n"
"smlal v21.4s, v4.4h, v13.4h\n"
"smlal2 v22.4s, v4.8h, v13.8h\n"
"smlal v23.4s, v4.4h, v16.4h\n"
"smlal2 v24.4s, v4.8h, v16.8h\n"
"smlal v21.4s, v5.4h, v14.4h\n"
"smlal2 v22.4s, v5.8h, v14.8h\n"
"smlal v23.4s, v5.4h, v17.4h\n"
"smlal2 v24.4s, v5.8h, v17.8h\n"
"smlal v21.4s, v6.4h, v15.4h\n"
"smlal2 v22.4s, v6.8h, v15.8h\n"
"smlal v23.4s, v6.4h, v18.4h\n"
"smlal2 v24.4s, v6.8h, v18.8h\n"
"smlal v21.4s, v7.4h, v16.4h\n"
"smlal2 v22.4s, v7.8h, v16.8h\n"
"smlal v23.4s, v7.4h, v19.4h\n"
"smlal2 v24.4s, v7.8h, v19.8h\n"
"smlal v21.4s, v8.4h, v17.4h\n"
"smlal2 v22.4s, v8.8h, v17.8h\n"
"smlal v23.4s, v8.4h, v20.4h\n"
"smlal2 v24.4s, v8.8h, v20.8h\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
"and v9.16b, v21.16b, v28.16b\n"
"and v12.16b, v22.16b, v28.16b\n"
"and v15.16b, v23.16b, v28.16b\n"
"and v18.16b, v24.16b, v28.16b\n"
"sshr v9.4s, v9.4s, #31\n"
"sshr v12.4s, v12.4s, #31\n"
"sshr v15.4s, v15.4s, #31\n"
"sshr v18.4s, v18.4s, #31\n"
"sqadd v21.4s, v21.4s, v9.4s\n"
"sqadd v22.4s, v22.4s, v12.4s\n"
"sqadd v23.4s, v23.4s, v15.4s\n"
"sqadd v24.4s, v24.4s, v18.4s\n"
"srshl v21.4s, v21.4s, v28.4s\n"
"srshl v22.4s, v22.4s, v28.4s\n"
"srshl v23.4s, v23.4s, v28.4s\n"
"srshl v24.4s, v24.4s, v28.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtun v21.8b, v21.8h\n"
"sqxtun2 v21.16b, v23.8h\n"
"umax v21.16b, v21.16b, v30.16b\n"
"umin v21.16b, v21.16b, v31.16b\n"
"st1 {v21.8b}, [x6], x3\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [x7], x3\n"
// Row pair finished: step the input and output base pointers down two rows
// and repeat while at least two output rows remain.
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP ":\n"
"subs %w[output_window_height], %w[output_window_height], #2\n"
"add %[input_ptr], %[input_ptr], %[input_height_increment]\n"
"cmp %w[output_window_height], #2\n"
"add %[output_ptr], %[output_ptr], %[output_height_increment]\n"
"bge " DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "b\n"
DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP ":\n"
// Done unless exactly one output row is left.
"cmp %w[output_window_height], #1\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
// ---- Single remaining output row ------------------------------------------
// Here the two outputs computed together are horizontally adjacent:
// v21/v22 accumulate the left one (input columns 0-2), v23/v24 the right one
// (input columns 1-3). Rows live in v9-v12, v13-v16 and v17-v20; the fourth
// column (v12/v16/v20) is loaded inside the loop. Stores go through
// %[output_ptr] directly.
DEPTHWISECONV_LABEL_HEIGHT_1 ":\n"
"mov x12, %[input_ptr]\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"add x13, %[input_ptr], %[input_row_size]\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"add x14, x13, %[input_row_size]\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"add x15, x14, %[input_row_size]\n"
"mov w5, %w[output_window_width]\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"mov x6, %[output_ptr]\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"add x7, %[output_ptr], x1\n"
"ld1 {v15.8b}, [x13], %[input_depth]\n"
// As above, widths of exactly 2 or 1 bypass the pre-loading main loop.
"cmp w5, #2\n"
"ld1 {v17.8b}, [x14], %[input_depth]\n"
"ld1 {v18.8b}, [x14], %[input_depth]\n"
"ld1 {v19.8b}, [x14], %[input_depth]\n"
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"ld1 {v22.4s}, [x10]\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
"ld1 {v24.4s}, [x10]\n"
"uaddw v9.8h, v26.8h, v9.8b\n"
"uaddw v10.8h, v26.8h, v10.8b\n"
"uaddw v11.8h, v26.8h, v11.8b\n"
"uaddw v13.8h, v26.8h, v13.8b\n"
"uaddw v14.8h, v26.8h, v14.8b\n"
"uaddw v15.8h, v26.8h, v15.8b\n"
"uaddw v17.8h, v26.8h, v17.8b\n"
"uaddw v18.8h, v26.8h, v18.8b\n"
"uaddw v19.8h, v26.8h, v19.8b\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER "f\n"
"cmp w5, #1\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n"
// Main loop for the single row: two outputs per iteration. %[input_ptr]
// itself is advanced by two pixels and the next iteration's inputs are
// loaded as registers free up.
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"ld1 {v12.8b}, [x12]\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"ld1 {v16.8b}, [x13]\n"
"smlal v23.4s, v0.4h, v10.4h\n"
"ld1 {v20.8b}, [x14]\n"
"smlal2 v24.4s, v0.8h, v10.8h\n"
"subs w5, w5, #2\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"cmp w5, #3\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"add %[input_ptr], %[input_ptr], %[input_width_increment]\n"
"smlal v23.4s, v1.4h, v11.4h\n"
"mov x12, %[input_ptr]\n"
"smlal2 v24.4s, v1.8h, v11.8h\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"uaddw v12.8h, v26.8h, v12.8b\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"add x13, %[input_ptr], %[input_row_size]\n"
"smlal v23.4s, v2.4h, v12.4h\n"
"add x14, x13, %[input_row_size]\n"
"smlal2 v24.4s, v2.8h, v12.8h\n"
"smlal v21.4s, v3.4h, v13.4h\n"
"add x15, x14, %[input_row_size]\n"
"smlal2 v22.4s, v3.8h, v13.8h\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v3.4h, v14.4h\n"
"smlal2 v24.4s, v3.8h, v14.8h\n"
"smlal v21.4s, v4.4h, v14.4h\n"
"smlal2 v22.4s, v4.8h, v14.8h\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v4.4h, v15.4h\n"
"smlal2 v24.4s, v4.8h, v15.8h\n"
"smlal v21.4s, v5.4h, v15.4h\n"
"uaddw v16.8h, v26.8h, v16.8b\n"
"smlal2 v22.4s, v5.8h, v15.8h\n"
"ld1 {v15.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v5.4h, v16.4h\n"
"smlal2 v24.4s, v5.8h, v16.8h\n"
"smlal v21.4s, v6.4h, v17.4h\n"
"smlal2 v22.4s, v6.8h, v17.8h\n"
"ld1 {v17.8b}, [x14], %[input_depth]\n"
"smlal v23.4s, v6.4h, v18.4h\n"
"smlal2 v24.4s, v6.8h, v18.8h\n"
"smlal v21.4s, v7.4h, v18.4h\n"
"smlal2 v22.4s, v7.8h, v18.8h\n"
"ld1 {v18.8b}, [x14], %[input_depth]\n"
"smlal v23.4s, v7.4h, v19.4h\n"
"smlal2 v24.4s, v7.8h, v19.8h\n"
"smlal v21.4s, v8.4h, v19.4h\n"
"uaddw v20.8h, v26.8h, v20.8b\n"
"smlal2 v22.4s, v8.8h, v19.8h\n"
"ld1 {v19.8b}, [x14], %[input_depth]\n"
"smlal v23.4s, v8.4h, v20.4h\n"
"smlal2 v24.4s, v8.8h, v20.8h\n"
// Requantize, clamp and store both outputs (same sequence as in the
// two-row code; v29-v31 are scratch and then re-broadcast).
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
"and v25.16b, v21.16b, v28.16b\n"
"and v29.16b, v22.16b, v28.16b\n"
"and v30.16b, v23.16b, v28.16b\n"
"and v31.16b, v24.16b, v28.16b\n"
"sshr v25.4s, v25.4s, #31\n"
"sshr v29.4s, v29.4s, #31\n"
"sshr v30.4s, v30.4s, #31\n"
"sshr v31.4s, v31.4s, #31\n"
"sqadd v21.4s, v21.4s, v25.4s\n"
"sqadd v22.4s, v22.4s, v29.4s\n"
"dup v29.8h, w2\n"
"sqadd v23.4s, v23.4s, v30.4s\n"
"dup v30.16b, w4\n"
"sqadd v24.4s, v24.4s, v31.4s\n"
"dup v31.16b, w0\n"
"srshl v21.4s, v21.4s, v28.4s\n"
"srshl v22.4s, v22.4s, v28.4s\n"
"srshl v23.4s, v23.4s, v28.4s\n"
"srshl v24.4s, v24.4s, v28.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtun v21.8b, v21.8h\n"
"sqxtun2 v21.16b, v23.8h\n"
"ld1 {v22.4s}, [x10]\n"
"umax v21.16b, v21.16b, v30.16b\n"
"umin v21.16b, v21.16b, v31.16b\n"
"ld1 {v24.4s}, [x10]\n"
"uaddw v9.8h, v26.8h, v9.8b\n"
"st1 {v21.8b}, [%[output_ptr]], x3\n"
"uaddw v10.8h, v26.8h, v10.8b\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [%[output_ptr]], x3\n"
"uaddw v11.8h, v26.8h, v11.8b\n"
"uaddw v12.8h, v26.8h, v12.8b\n"
"uaddw v13.8h, v26.8h, v13.8b\n"
"uaddw v14.8h, v26.8h, v14.8b\n"
"uaddw v15.8h, v26.8h, v15.8b\n"
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"uaddw v16.8h, v26.8h, v16.8b\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
"uaddw v17.8h, v26.8h, v17.8b\n"
"uaddw v18.8h, v26.8h, v18.8b\n"
"uaddw v19.8h, v26.8h, v19.8b\n"
"uaddw v20.8h, v26.8h, v20.8b\n"
// Flags still come from "cmp w5, #3" near the top of the loop.
"bge " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "b\n"
"cmp w5, #2\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n"
// Final two outputs of the single row; loads only the fourth input column.
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"ld1 {v12.8b}, [x12], %[input_depth]\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"ld1 {v16.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v0.4h, v10.4h\n"
"ld1 {v20.8b}, [x14], %[input_depth]\n"
"smlal2 v24.4s, v0.8h, v10.8h\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"smlal v23.4s, v1.4h, v11.4h\n"
"smlal2 v24.4s, v1.8h, v11.8h\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"uaddw v12.8h, v26.8h, v12.8b\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"smlal v23.4s, v2.4h, v12.4h\n"
"smlal2 v24.4s, v2.8h, v12.8h\n"
"smlal v21.4s, v3.4h, v13.4h\n"
"smlal2 v22.4s, v3.8h, v13.8h\n"
"smlal v23.4s, v3.4h, v14.4h\n"
"smlal2 v24.4s, v3.8h, v14.8h\n"
"smlal v21.4s, v4.4h, v14.4h\n"
"smlal2 v22.4s, v4.8h, v14.8h\n"
"smlal v23.4s, v4.4h, v15.4h\n"
"smlal2 v24.4s, v4.8h, v15.8h\n"
"smlal v21.4s, v5.4h, v15.4h\n"
"uaddw v16.8h, v26.8h, v16.8b\n"
"smlal2 v22.4s, v5.8h, v15.8h\n"
"smlal v23.4s, v5.4h, v16.4h\n"
"smlal2 v24.4s, v5.8h, v16.8h\n"
"smlal v21.4s, v6.4h, v17.4h\n"
"smlal2 v22.4s, v6.8h, v17.8h\n"
"smlal v23.4s, v6.4h, v18.4h\n"
"smlal2 v24.4s, v6.8h, v18.8h\n"
"smlal v21.4s, v7.4h, v18.4h\n"
"smlal2 v22.4s, v7.8h, v18.8h\n"
"smlal v23.4s, v7.4h, v19.4h\n"
"smlal2 v24.4s, v7.8h, v19.8h\n"
"smlal v21.4s, v8.4h, v19.4h\n"
"uaddw v20.8h, v26.8h, v20.8b\n"
"smlal2 v22.4s, v8.8h, v19.8h\n"
"smlal v23.4s, v8.4h, v20.4h\n"
"smlal2 v24.4s, v8.8h, v20.8h\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
"and v25.16b, v21.16b, v28.16b\n"
"and v29.16b, v22.16b, v28.16b\n"
"and v30.16b, v23.16b, v28.16b\n"
"and v31.16b, v24.16b, v28.16b\n"
"sshr v25.4s, v25.4s, #31\n"
"sshr v29.4s, v29.4s, #31\n"
"sshr v30.4s, v30.4s, #31\n"
"sshr v31.4s, v31.4s, #31\n"
"sqadd v21.4s, v21.4s, v25.4s\n"
"sqadd v22.4s, v22.4s, v29.4s\n"
"dup v29.8h, w2\n"
"sqadd v23.4s, v23.4s, v30.4s\n"
"dup v30.16b, w4\n"
"sqadd v24.4s, v24.4s, v31.4s\n"
"dup v31.16b, w0\n"
"srshl v21.4s, v21.4s, v28.4s\n"
"srshl v22.4s, v22.4s, v28.4s\n"
"srshl v23.4s, v23.4s, v28.4s\n"
"srshl v24.4s, v24.4s, v28.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtun v21.8b, v21.8h\n"
"sqxtun2 v21.16b, v23.8h\n"
"umax v21.16b, v21.16b, v30.16b\n"
"umin v21.16b, v21.16b, v31.16b\n"
"st1 {v21.8b}, [%[output_ptr]], x3\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [%[output_ptr]], x3\n"
"b " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
// Final single output of the single row: only v21/v22 are accumulated, and
// v9/v12 serve as rounding scratch.
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"smlal v21.4s, v3.4h, v13.4h\n"
"smlal2 v22.4s, v3.8h, v13.8h\n"
"smlal v21.4s, v4.4h, v14.4h\n"
"smlal2 v22.4s, v4.8h, v14.8h\n"
"smlal v21.4s, v5.4h, v15.4h\n"
"smlal2 v22.4s, v5.8h, v15.8h\n"
"smlal v21.4s, v6.4h, v17.4h\n"
"smlal2 v22.4s, v6.8h, v17.8h\n"
"smlal v21.4s, v7.4h, v18.4h\n"
"smlal2 v22.4s, v7.8h, v18.8h\n"
"smlal v21.4s, v8.4h, v19.4h\n"
"smlal2 v22.4s, v8.8h, v19.8h\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
"and v9.16b, v21.16b, v28.16b\n"
"and v12.16b, v22.16b, v28.16b\n"
"sshr v9.4s, v9.4s, #31\n"
"sshr v12.4s, v12.4s, #31\n"
"sqadd v21.4s, v21.4s, v9.4s\n"
"sqadd v22.4s, v22.4s, v12.4s\n"
"srshl v21.4s, v21.4s, v28.4s\n"
"srshl v22.4s, v22.4s, v28.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqxtun v21.8b, v21.8h\n"
"umax v21.8b, v21.8b, v30.8b\n"
"umin v21.8b, v21.8b, v31.8b\n"
"st1 {v21.8b}, [%[output_ptr]]\n"
DEPTHWISECONV_LABEL_HEIGHT_1_END ":\n"
:
// Read-write operands: these pointers and the row counter are modified by
// the assembly.
[filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
[output_ptr] "+r"(output_ptr),
[output_window_height] "+r"(output_window_height)
:
// Read-only operands.
[bias_ptr] "r"(bias_ptr), [input_row_size] "r"(input_row_size),
[input_depth] "r"(input_depth),
[output_window_width] "r"(output_window_width),
[input_width_increment] "r"(input_width_increment),
[input_height_increment] "r"(input_height_increment),
[output_height_increment] "r"(output_height_increment),
[params_ptr] "r"(params_ptr)
:
// Clobbers: condition flags, memory, and every vector and general-purpose
// register listed below.
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
"v30", "v31",
"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
"x9", "x10", "x11", "x12", "x13", "x14", "x15");
#undef DEPTHWISECONV_LABEL_HEIGHT_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_1
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_1_END
}
};
// Specialization of DepthwiseConvWindow for
// DepthwiseConvOutputRounding::kUpward with template arguments 8, 1, 1
// (presumably depth 8 and stride 1x1 -- confirm against the primary
// template). The whole computation is a single AArch64 NEON inline-assembly
// block that applies nine filter taps (a 3x3 window) to uint8 input and writes
// requantized uint8 output, 8 bytes per output position.
template <>
struct DepthwiseConvWindow<DepthwiseConvOutputRounding::kUpward, 8, 1, 1> {
public:
// Computes an output window of output_window_height x output_window_width
// positions.
//
//   input_ptr:   first input byte of the window. Horizontally adjacent
//                positions are input_depth bytes apart, rows are
//                input_row_size bytes apart.
//   filter_ptr:  nine 8-byte filter taps, read at a stride of
//                params_ptr->output_depth bytes.
//   bias_ptr:    eight int32 biases, read as two 4 x 32-bit vectors at
//                bias_ptr and bias_ptr + 16 bytes.
//   output_ptr:  first output byte. Adjacent positions are output_depth bytes
//                apart, rows are params_ptr->output_row_size bytes apart.
//   params_ptr:  source of the offsets, multiplier, shift and activation
//                clamp, read through the OFFSET_* byte offsets.
//
// Output rows are produced two at a time while at least two remain, then a
// single trailing row if needed. Within a row (pair), columns are produced two
// at a time with dedicated tails for the last two / last one column.
// NOTE(review): a width of 0 has no dedicated branch and falls into the
// two-column loop; callers presumably guarantee width >= 1 -- confirm.
static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
const int32* bias_ptr, uint8* output_ptr,
int64_t input_depth, int64_t input_row_size,
int32 output_window_height, int32 output_window_width,
const DepthwiseConvParams* params_ptr) {
// Each two-column iteration advances the input by two positions.
const int64_t input_width_increment = 2 * input_depth;
// Each two-row iteration advances the input and the output by two rows.
const int64_t input_height_increment = 2 * input_row_size;
const int64_t output_height_increment = 2 * params_ptr->output_row_size;
// Local assembler labels; all of them are #undef'd again after the asm block.
#define DEPTHWISECONV_LABEL_HEIGHT_2_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_1 …
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_1_END …
// Register usage inside the asm block:
//   v0-v8   : the nine filter taps, widened to 16 bit with filter_offset
//             added. v0-v2 multiply the first input row of a window, v3-v5
//             the second, v6-v8 the third.
//   v9-v20  : input values, widened to 16 bit with input_offset added.
//   v21-v24 : 32-bit accumulators, (re)initialized from the bias.
//   v26     : input_offset        v27 : output_multiplier
//   v28     : output_right_shift  v29 : output_offset
//   v30/v31 : output activation min / max
//   x1  : output_row_size   x3 : output_depth   x10 : bias_ptr + 16
//   w5  : remaining output columns in the current row (pair)
//   x6/x7   : output pointers for the upper / lower row of a row pair
//   x11     : base of the current input block (two-row path)
//   x12-x15 : per-row input pointers
asm volatile(
// ---- Setup: broadcast the quantization parameters into vector registers.
"ldr w9, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
"ldr x3, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
// The flags of this compare are consumed by the "blt" after the filter
// loads; none of the instructions in between modify the condition flags.
"cmp %w[output_window_height], #2\n"
"dup v26.8h, w9\n"
"ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
"ldr w2, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
"dup v27.4s, w9\n"
"ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
"dup v29.8h, w2\n"
"ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
"dup v30.16b, w4\n"
"ldr w0, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v31.16b, w0\n"
// v28 is later used as the per-lane shift operand of sqrshl.
// NOTE(review): sqrshl shifts right only for negative counts, so the stored
// value is presumably already negated -- confirm where the params are filled.
"dup v28.4s, w9\n"
"ldr w9, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
// x10 addresses the second group of four int32 biases.
"add x10, %[bias_ptr], #16\n"
"ldr x1, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n"
"dup v9.8h, w9\n"
// Load the nine filter taps (8 bytes each, output_depth bytes apart) and
// widen each to 16 bit while adding the filter offset. v9 only holds the
// filter offset temporarily; it is reused for input data below.
"ld1 {v0.8b}, [%[filter_ptr]], x3\n"
"ld1 {v1.8b}, [%[filter_ptr]], x3\n"
"uaddw v0.8h, v9.8h, v0.8b\n"
"ld1 {v2.8b}, [%[filter_ptr]], x3\n"
"uaddw v1.8h, v9.8h, v1.8b\n"
"ld1 {v3.8b}, [%[filter_ptr]], x3\n"
"uaddw v2.8h, v9.8h, v2.8b\n"
"ld1 {v4.8b}, [%[filter_ptr]], x3\n"
"uaddw v3.8h, v9.8h, v3.8b\n"
"ld1 {v5.8b}, [%[filter_ptr]], x3\n"
"uaddw v4.8h, v9.8h, v4.8b\n"
"ld1 {v6.8b}, [%[filter_ptr]], x3\n"
"uaddw v5.8h, v9.8h, v5.8b\n"
"ld1 {v7.8b}, [%[filter_ptr]], x3\n"
"uaddw v6.8h, v9.8h, v6.8b\n"
"ld1 {v8.8b}, [%[filter_ptr]], x3\n"
"uaddw v7.8h, v9.8h, v7.8b\n"
"uaddw v8.8h, v9.8h, v8.8b\n"
// Fewer than two output rows requested: skip the two-row path.
"blt " DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "f\n"
// ---- Two output rows per iteration. Needs four input rows:
// x12 -> row 0, x13 -> row 1, x14 -> row 2, x15 -> row 3.
DEPTHWISECONV_LABEL_HEIGHT_2_LOOP ":\n"
"mov x11, %[input_ptr]\n"
"mov x12, x11\n"
// Preload three columns of each of the four rows:
// row 0 -> v9,v10,v11   row 1 -> v12,v13,v14
// row 2 -> v15,v16,v17  row 3 -> v18,v19,v20
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"add x13, x11, %[input_row_size]\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"add x14, x13, %[input_row_size]\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"add x15, x14, %[input_row_size]\n"
"ld1 {v12.8b}, [x13], %[input_depth]\n"
"mov w5, %w[output_window_width]\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
// x6 writes the upper output row, x7 the lower one.
"mov x6, %[output_ptr]\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"add x7, %[output_ptr], x1\n"
"ld1 {v15.8b}, [x14], %[input_depth]\n"
// Flags of this compare are consumed by the "beq" after the widening below.
"cmp w5, #2\n"
"uaddw v9.8h, v26.8h, v9.8b\n"
"ld1 {v16.8b}, [x14], %[input_depth]\n"
"uaddw v10.8h, v26.8h, v10.8b\n"
"ld1 {v17.8b}, [x14], %[input_depth]\n"
"uaddw v11.8h, v26.8h, v11.8b\n"
"ld1 {v18.8b}, [x15], %[input_depth]\n"
"uaddw v12.8h, v26.8h, v12.8b\n"
"ld1 {v19.8b}, [x15], %[input_depth]\n"
"uaddw v13.8h, v26.8h, v13.8b\n"
"ld1 {v20.8b}, [x15], %[input_depth]\n"
"uaddw v14.8h, v26.8h, v14.8b\n"
// Accumulators start from the bias: v21/v22 (lanes 0-3 / 4-7) for the upper
// output row, v23/v24 for the lower output row.
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"uaddw v15.8h, v26.8h, v15.8b\n"
"ld1 {v22.4s}, [x10]\n"
"uaddw v16.8h, v26.8h, v16.8b\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
"uaddw v17.8h, v26.8h, v17.8b\n"
"ld1 {v24.4s}, [x10]\n"
"uaddw v18.8h, v26.8h, v18.8b\n"
"uaddw v19.8h, v26.8h, v19.8b\n"
"uaddw v20.8h, v26.8h, v20.8b\n"
// Exactly two columns -> two-column tail; exactly one -> one-column tail.
"beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "f\n"
"cmp w5, #1\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n"
// ---- Two rows x two columns per iteration, with the input block for the
// next iteration loaded while the second column is being computed.
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP ":\n"
// First output column: upper row uses input rows 0-2, lower row uses input
// rows 1-3, both with columns 0-2.
"smlal v21.4s, v0.4h, v9.4h\n"
"subs w5, w5, #2\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
// Loop-continuation test (at least three columns left). The flags survive
// until the "bge" at the bottom of the loop because nothing in between
// writes the condition flags.
"cmp w5, #3\n"
"smlal v23.4s, v0.4h, v12.4h\n"
// Column 0 of each row is reloaded with column 3 as soon as its last use
// for the first output column has been issued (v9, v12, v15, v18).
"ld1 {v9.8b}, [x12]\n"
"smlal2 v24.4s, v0.8h, v12.8h\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"smlal v23.4s, v1.4h, v13.4h\n"
"smlal2 v24.4s, v1.8h, v13.8h\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"smlal v23.4s, v2.4h, v14.4h\n"
"smlal2 v24.4s, v2.8h, v14.8h\n"
"smlal v21.4s, v3.4h, v12.4h\n"
"smlal2 v22.4s, v3.8h, v12.8h\n"
"ld1 {v12.8b}, [x13]\n"
"smlal v23.4s, v3.4h, v15.4h\n"
"smlal2 v24.4s, v3.8h, v15.8h\n"
"smlal v21.4s, v4.4h, v13.4h\n"
"smlal2 v22.4s, v4.8h, v13.8h\n"
"smlal v23.4s, v4.4h, v16.4h\n"
"smlal2 v24.4s, v4.8h, v16.8h\n"
"smlal v21.4s, v5.4h, v14.4h\n"
"smlal2 v22.4s, v5.8h, v14.8h\n"
"smlal v23.4s, v5.4h, v17.4h\n"
"smlal2 v24.4s, v5.8h, v17.8h\n"
"smlal v21.4s, v6.4h, v15.4h\n"
"smlal2 v22.4s, v6.8h, v15.8h\n"
"ld1 {v15.8b}, [x14]\n"
"smlal v23.4s, v6.4h, v18.4h\n"
"smlal2 v24.4s, v6.8h, v18.8h\n"
"ld1 {v18.8b}, [x15]\n"
"smlal v21.4s, v7.4h, v16.4h\n"
"smlal2 v22.4s, v7.8h, v16.8h\n"
"smlal v23.4s, v7.4h, v19.4h\n"
"smlal2 v24.4s, v7.8h, v19.8h\n"
"smlal v21.4s, v8.4h, v17.4h\n"
"smlal2 v22.4s, v8.8h, v17.8h\n"
"smlal v23.4s, v8.4h, v20.4h\n"
"smlal2 v24.4s, v8.8h, v20.8h\n"
// Requantize: saturating rounding doubling multiply-high by the output
// multiplier, then a single saturating rounding shift by v28.
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
"sqrshl v21.4s, v21.4s, v28.4s\n"
"sqrshl v22.4s, v22.4s, v28.4s\n"
"sqrshl v23.4s, v23.4s, v28.4s\n"
"sqrshl v24.4s, v24.4s, v28.4s\n"
// Narrow to 16 bit, add the output offset, narrow to unsigned 8 bit (all
// saturating), then clamp to the activation range. The upper row ends up
// in the low half of v21, the lower row in the high half.
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtun v21.8b, v21.8h\n"
"sqxtun2 v21.16b, v23.8h\n"
"ld1 {v22.4s}, [x10]\n"
"umax v21.16b, v21.16b, v30.16b\n"
"umin v21.16b, v21.16b, v31.16b\n"
"ld1 {v24.4s}, [x10]\n"
"uaddw v9.8h, v26.8h, v9.8b\n"
// Store the upper-row result, then move the high half down and store the
// lower-row result; both pointers advance by output_depth.
"st1 {v21.8b}, [x6], x3\n"
"uaddw v12.8h, v26.8h, v12.8b\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [x7], x3\n"
"uaddw v15.8h, v26.8h, v15.8b\n"
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"uaddw v18.8h, v26.8h, v18.8b\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
// Second output column: uses columns 1, 2 and 3, which now live in
// (v10,v11,v9), (v13,v14,v12), (v16,v17,v15) and (v19,v20,v18).
// Interleaved with it: advance x11 by two input positions, rebuild the row
// pointers and load the 4x3 input block for the next iteration.
"smlal v21.4s, v0.4h, v10.4h\n"
"add x11, x11, %[input_width_increment]\n"
"smlal2 v22.4s, v0.8h, v10.8h\n"
"mov x12, x11\n"
"smlal v23.4s, v0.4h, v13.4h\n"
"add x13, x11, %[input_row_size]\n"
"smlal2 v24.4s, v0.8h, v13.8h\n"
"add x14, x13, %[input_row_size]\n"
"smlal v21.4s, v1.4h, v11.4h\n"
"add x15, x14, %[input_row_size]\n"
"smlal2 v22.4s, v1.8h, v11.8h\n"
"smlal v23.4s, v1.4h, v14.4h\n"
"smlal2 v24.4s, v1.8h, v14.8h\n"
"smlal v21.4s, v2.4h, v9.4h\n"
"smlal2 v22.4s, v2.8h, v9.8h\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v2.4h, v12.4h\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"smlal2 v24.4s, v2.8h, v12.8h\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"smlal v21.4s, v3.4h, v13.4h\n"
"smlal2 v22.4s, v3.8h, v13.8h\n"
"smlal v23.4s, v3.4h, v16.4h\n"
"smlal2 v24.4s, v3.8h, v16.8h\n"
"smlal v21.4s, v4.4h, v14.4h\n"
"smlal2 v22.4s, v4.8h, v14.8h\n"
"smlal v23.4s, v4.4h, v17.4h\n"
"smlal2 v24.4s, v4.8h, v17.8h\n"
"smlal v21.4s, v5.4h, v12.4h\n"
"smlal2 v22.4s, v5.8h, v12.8h\n"
"ld1 {v12.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v5.4h, v15.4h\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"smlal2 v24.4s, v5.8h, v15.8h\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"smlal v21.4s, v6.4h, v16.4h\n"
"smlal2 v22.4s, v6.8h, v16.8h\n"
"smlal v23.4s, v6.4h, v19.4h\n"
"smlal2 v24.4s, v6.8h, v19.8h\n"
"smlal v21.4s, v7.4h, v17.4h\n"
"smlal2 v22.4s, v7.8h, v17.8h\n"
"smlal v23.4s, v7.4h, v20.4h\n"
"smlal2 v24.4s, v7.8h, v20.8h\n"
"smlal v21.4s, v8.4h, v15.4h\n"
"smlal2 v22.4s, v8.8h, v15.8h\n"
"ld1 {v15.8b}, [x14], %[input_depth]\n"
"smlal v23.4s, v8.4h, v18.4h\n"
"ld1 {v16.8b}, [x14], %[input_depth]\n"
"smlal2 v24.4s, v8.8h, v18.8h\n"
"ld1 {v17.8b}, [x14], %[input_depth]\n"
// Requantize and store the second column (same sequence as above), while
// the remaining loads for the next iteration complete.
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"ld1 {v18.8b}, [x15], %[input_depth]\n"
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
"ld1 {v19.8b}, [x15], %[input_depth]\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"ld1 {v20.8b}, [x15], %[input_depth]\n"
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
"sqrshl v21.4s, v21.4s, v28.4s\n"
"sqrshl v22.4s, v22.4s, v28.4s\n"
"sqrshl v23.4s, v23.4s, v28.4s\n"
"sqrshl v24.4s, v24.4s, v28.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtun v21.8b, v21.8h\n"
"sqxtun2 v21.16b, v23.8h\n"
"ld1 {v22.4s}, [x10]\n"
"umax v21.16b, v21.16b, v30.16b\n"
"umin v21.16b, v21.16b, v31.16b\n"
"ld1 {v24.4s}, [x10]\n"
"uaddw v9.8h, v26.8h, v9.8b\n"
"st1 {v21.8b}, [x6], x3\n"
"uaddw v10.8h, v26.8h, v10.8b\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [x7], x3\n"
// Widen the freshly loaded block and reset the accumulators to the bias.
"uaddw v11.8h, v26.8h, v11.8b\n"
"uaddw v12.8h, v26.8h, v12.8b\n"
"uaddw v13.8h, v26.8h, v13.8b\n"
"uaddw v14.8h, v26.8h, v14.8b\n"
"uaddw v15.8h, v26.8h, v15.8b\n"
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"uaddw v16.8h, v26.8h, v16.8b\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
"uaddw v17.8h, v26.8h, v17.8b\n"
"uaddw v18.8h, v26.8h, v18.8b\n"
"uaddw v19.8h, v26.8h, v19.8b\n"
"uaddw v20.8h, v26.8h, v20.8b\n"
// Continue while at least three columns remain (flags from "cmp w5, #3").
"bge " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "b\n"
// One or two columns remain; fall through for two.
"cmp w5, #2\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n"
// ---- Two-row path, last two columns: same arithmetic as the loop body, but
// no input block is loaded for a following iteration.
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"smlal v23.4s, v0.4h, v12.4h\n"
"ld1 {v9.8b}, [x12]\n"
"smlal2 v24.4s, v0.8h, v12.8h\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"smlal v23.4s, v1.4h, v13.4h\n"
"smlal2 v24.4s, v1.8h, v13.8h\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"smlal v23.4s, v2.4h, v14.4h\n"
"smlal2 v24.4s, v2.8h, v14.8h\n"
"smlal v21.4s, v3.4h, v12.4h\n"
"smlal2 v22.4s, v3.8h, v12.8h\n"
"ld1 {v12.8b}, [x13]\n"
"smlal v23.4s, v3.4h, v15.4h\n"
"smlal2 v24.4s, v3.8h, v15.8h\n"
"smlal v21.4s, v4.4h, v13.4h\n"
"smlal2 v22.4s, v4.8h, v13.8h\n"
"smlal v23.4s, v4.4h, v16.4h\n"
"smlal2 v24.4s, v4.8h, v16.8h\n"
"smlal v21.4s, v5.4h, v14.4h\n"
"smlal2 v22.4s, v5.8h, v14.8h\n"
"smlal v23.4s, v5.4h, v17.4h\n"
"smlal2 v24.4s, v5.8h, v17.8h\n"
"smlal v21.4s, v6.4h, v15.4h\n"
"smlal2 v22.4s, v6.8h, v15.8h\n"
"ld1 {v15.8b}, [x14]\n"
"smlal v23.4s, v6.4h, v18.4h\n"
"smlal2 v24.4s, v6.8h, v18.8h\n"
"ld1 {v18.8b}, [x15]\n"
"smlal v21.4s, v7.4h, v16.4h\n"
"smlal2 v22.4s, v7.8h, v16.8h\n"
"smlal v23.4s, v7.4h, v19.4h\n"
"smlal2 v24.4s, v7.8h, v19.8h\n"
"smlal v21.4s, v8.4h, v17.4h\n"
"smlal2 v22.4s, v8.8h, v17.8h\n"
"smlal v23.4s, v8.4h, v20.4h\n"
"smlal2 v24.4s, v8.8h, v20.8h\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
"sqrshl v21.4s, v21.4s, v28.4s\n"
"sqrshl v22.4s, v22.4s, v28.4s\n"
"sqrshl v23.4s, v23.4s, v28.4s\n"
"sqrshl v24.4s, v24.4s, v28.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtun v21.8b, v21.8h\n"
"sqxtun2 v21.16b, v23.8h\n"
"ld1 {v22.4s}, [x10]\n"
"umax v21.16b, v21.16b, v30.16b\n"
"umin v21.16b, v21.16b, v31.16b\n"
"ld1 {v24.4s}, [x10]\n"
"uaddw v9.8h, v26.8h, v9.8b\n"
"st1 {v21.8b}, [x6], x3\n"
"mov v23.d[0], v21.d[1]\n"
"uaddw v12.8h, v26.8h, v12.8b\n"
"st1 {v23.8b}, [x7], x3\n"
"uaddw v15.8h, v26.8h, v15.8b\n"
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"uaddw v18.8h, v26.8h, v18.8b\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
// Final column of this row pair (columns 1, 2 and 3 of the current block).
"smlal v21.4s, v0.4h, v10.4h\n"
"smlal2 v22.4s, v0.8h, v10.8h\n"
"smlal v23.4s, v0.4h, v13.4h\n"
"smlal2 v24.4s, v0.8h, v13.8h\n"
"smlal v21.4s, v1.4h, v11.4h\n"
"smlal2 v22.4s, v1.8h, v11.8h\n"
"smlal v23.4s, v1.4h, v14.4h\n"
"smlal2 v24.4s, v1.8h, v14.8h\n"
"smlal v21.4s, v2.4h, v9.4h\n"
"smlal2 v22.4s, v2.8h, v9.8h\n"
"smlal v23.4s, v2.4h, v12.4h\n"
"smlal2 v24.4s, v2.8h, v12.8h\n"
"smlal v21.4s, v3.4h, v13.4h\n"
"smlal2 v22.4s, v3.8h, v13.8h\n"
"smlal v23.4s, v3.4h, v16.4h\n"
"smlal2 v24.4s, v3.8h, v16.8h\n"
"smlal v21.4s, v4.4h, v14.4h\n"
"smlal2 v22.4s, v4.8h, v14.8h\n"
"smlal v23.4s, v4.4h, v17.4h\n"
"smlal2 v24.4s, v4.8h, v17.8h\n"
"smlal v21.4s, v5.4h, v12.4h\n"
"smlal2 v22.4s, v5.8h, v12.8h\n"
"smlal v23.4s, v5.4h, v15.4h\n"
"smlal2 v24.4s, v5.8h, v15.8h\n"
"smlal v21.4s, v6.4h, v16.4h\n"
"smlal2 v22.4s, v6.8h, v16.8h\n"
"smlal v23.4s, v6.4h, v19.4h\n"
"smlal2 v24.4s, v6.8h, v19.8h\n"
"smlal v21.4s, v7.4h, v17.4h\n"
"smlal2 v22.4s, v7.8h, v17.8h\n"
"smlal v23.4s, v7.4h, v20.4h\n"
"smlal2 v24.4s, v7.8h, v20.8h\n"
"smlal v21.4s, v8.4h, v15.4h\n"
"smlal2 v22.4s, v8.8h, v15.8h\n"
"smlal v23.4s, v8.4h, v18.4h\n"
"smlal2 v24.4s, v8.8h, v18.8h\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
"sqrshl v21.4s, v21.4s, v28.4s\n"
"sqrshl v22.4s, v22.4s, v28.4s\n"
"sqrshl v23.4s, v23.4s, v28.4s\n"
"sqrshl v24.4s, v24.4s, v28.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtun v21.8b, v21.8h\n"
"sqxtun2 v21.16b, v23.8h\n"
"umax v21.16b, v21.16b, v30.16b\n"
"umin v21.16b, v21.16b, v31.16b\n"
"st1 {v21.8b}, [x6], x3\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [x7], x3\n"
"b " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "f\n"
// ---- Two-row path, last single column: one output for each of the two
// rows from the 4x3 block already held in v9-v20.
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"smlal v23.4s, v0.4h, v12.4h\n"
"smlal2 v24.4s, v0.8h, v12.8h\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"smlal v23.4s, v1.4h, v13.4h\n"
"smlal2 v24.4s, v1.8h, v13.8h\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"smlal v23.4s, v2.4h, v14.4h\n"
"smlal2 v24.4s, v2.8h, v14.8h\n"
"smlal v21.4s, v3.4h, v12.4h\n"
"smlal2 v22.4s, v3.8h, v12.8h\n"
"smlal v23.4s, v3.4h, v15.4h\n"
"smlal2 v24.4s, v3.8h, v15.8h\n"
"smlal v21.4s, v4.4h, v13.4h\n"
"smlal2 v22.4s, v4.8h, v13.8h\n"
"smlal v23.4s, v4.4h, v16.4h\n"
"smlal2 v24.4s, v4.8h, v16.8h\n"
"smlal v21.4s, v5.4h, v14.4h\n"
"smlal2 v22.4s, v5.8h, v14.8h\n"
"smlal v23.4s, v5.4h, v17.4h\n"
"smlal2 v24.4s, v5.8h, v17.8h\n"
"smlal v21.4s, v6.4h, v15.4h\n"
"smlal2 v22.4s, v6.8h, v15.8h\n"
"smlal v23.4s, v6.4h, v18.4h\n"
"smlal2 v24.4s, v6.8h, v18.8h\n"
"smlal v21.4s, v7.4h, v16.4h\n"
"smlal2 v22.4s, v7.8h, v16.8h\n"
"smlal v23.4s, v7.4h, v19.4h\n"
"smlal2 v24.4s, v7.8h, v19.8h\n"
"smlal v21.4s, v8.4h, v17.4h\n"
"smlal2 v22.4s, v8.8h, v17.8h\n"
"smlal v23.4s, v8.4h, v20.4h\n"
"smlal2 v24.4s, v8.8h, v20.8h\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
"sqrshl v21.4s, v21.4s, v28.4s\n"
"sqrshl v22.4s, v22.4s, v28.4s\n"
"sqrshl v23.4s, v23.4s, v28.4s\n"
"sqrshl v24.4s, v24.4s, v28.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtun v21.8b, v21.8h\n"
"sqxtun2 v21.16b, v23.8h\n"
"umax v21.16b, v21.16b, v30.16b\n"
"umin v21.16b, v21.16b, v31.16b\n"
"st1 {v21.8b}, [x6], x3\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [x7], x3\n"
// ---- End of a row pair: step input and output down by two rows and repeat
// while at least two output rows remain.
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP ":\n"
"subs %w[output_window_height], %w[output_window_height], #2\n"
"add %[input_ptr], %[input_ptr], %[input_height_increment]\n"
"cmp %w[output_window_height], #2\n"
"add %[output_ptr], %[output_ptr], %[output_height_increment]\n"
"bge " DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "b\n"
DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP ":\n"
// No output row left: done.
"cmp %w[output_window_height], #1\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
// ---- Single remaining output row. Needs three input rows:
// row 0 -> v9,v10,v11   row 1 -> v13,v14,v15   row 2 -> v17,v18,v19.
// v12, v16 and v20 later receive the fourth column of each row.
// NOTE(review): x15 and x7 are computed in this path but never dereferenced
// between here and the end label.
DEPTHWISECONV_LABEL_HEIGHT_1 ":\n"
"mov x12, %[input_ptr]\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"add x13, %[input_ptr], %[input_row_size]\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"add x14, x13, %[input_row_size]\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"add x15, x14, %[input_row_size]\n"
"mov w5, %w[output_window_width]\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"mov x6, %[output_ptr]\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"add x7, %[output_ptr], x1\n"
"ld1 {v15.8b}, [x13], %[input_depth]\n"
// Flags of this compare are consumed by the "beq" after the widening below.
"cmp w5, #2\n"
"ld1 {v17.8b}, [x14], %[input_depth]\n"
"ld1 {v18.8b}, [x14], %[input_depth]\n"
"ld1 {v19.8b}, [x14], %[input_depth]\n"
// In this path v21/v22 accumulate the first output column and v23/v24 the
// horizontally adjacent second output column.
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"ld1 {v22.4s}, [x10]\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
"ld1 {v24.4s}, [x10]\n"
"uaddw v9.8h, v26.8h, v9.8b\n"
"uaddw v10.8h, v26.8h, v10.8b\n"
"uaddw v11.8h, v26.8h, v11.8b\n"
"uaddw v13.8h, v26.8h, v13.8b\n"
"uaddw v14.8h, v26.8h, v14.8b\n"
"uaddw v15.8h, v26.8h, v15.8b\n"
"uaddw v17.8h, v26.8h, v17.8b\n"
"uaddw v18.8h, v26.8h, v18.8b\n"
"uaddw v19.8h, v26.8h, v19.8b\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER "f\n"
"cmp w5, #1\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n"
// ---- One row x two columns per iteration. The fourth column is loaded into
// v12/v16/v20 first; the first three columns of the next block are loaded
// as soon as their registers are free.
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"ld1 {v12.8b}, [x12]\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"ld1 {v16.8b}, [x13]\n"
"smlal v23.4s, v0.4h, v10.4h\n"
"ld1 {v20.8b}, [x14]\n"
"smlal2 v24.4s, v0.8h, v10.8h\n"
"subs w5, w5, #2\n"
"smlal v21.4s, v1.4h, v10.4h\n"
// Loop-continuation test; flags are consumed by the "bge" at the bottom.
"cmp w5, #3\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
// This path advances %[input_ptr] itself (it is a read-write operand).
"add %[input_ptr], %[input_ptr], %[input_width_increment]\n"
"smlal v23.4s, v1.4h, v11.4h\n"
"mov x12, %[input_ptr]\n"
"smlal2 v24.4s, v1.8h, v11.8h\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"uaddw v12.8h, v26.8h, v12.8b\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"add x13, %[input_ptr], %[input_row_size]\n"
"smlal v23.4s, v2.4h, v12.4h\n"
"add x14, x13, %[input_row_size]\n"
"smlal2 v24.4s, v2.8h, v12.8h\n"
"smlal v21.4s, v3.4h, v13.4h\n"
"add x15, x14, %[input_row_size]\n"
"smlal2 v22.4s, v3.8h, v13.8h\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v3.4h, v14.4h\n"
"smlal2 v24.4s, v3.8h, v14.8h\n"
"smlal v21.4s, v4.4h, v14.4h\n"
"smlal2 v22.4s, v4.8h, v14.8h\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v4.4h, v15.4h\n"
"smlal2 v24.4s, v4.8h, v15.8h\n"
"smlal v21.4s, v5.4h, v15.4h\n"
"uaddw v16.8h, v26.8h, v16.8b\n"
"smlal2 v22.4s, v5.8h, v15.8h\n"
"ld1 {v15.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v5.4h, v16.4h\n"
"smlal2 v24.4s, v5.8h, v16.8h\n"
"smlal v21.4s, v6.4h, v17.4h\n"
"smlal2 v22.4s, v6.8h, v17.8h\n"
"ld1 {v17.8b}, [x14], %[input_depth]\n"
"smlal v23.4s, v6.4h, v18.4h\n"
"smlal2 v24.4s, v6.8h, v18.8h\n"
"smlal v21.4s, v7.4h, v18.4h\n"
"smlal2 v22.4s, v7.8h, v18.8h\n"
"ld1 {v18.8b}, [x14], %[input_depth]\n"
"smlal v23.4s, v7.4h, v19.4h\n"
"smlal2 v24.4s, v7.8h, v19.8h\n"
"smlal v21.4s, v8.4h, v19.4h\n"
"uaddw v20.8h, v26.8h, v20.8b\n"
"smlal2 v22.4s, v8.8h, v19.8h\n"
"ld1 {v19.8b}, [x14], %[input_depth]\n"
"smlal v23.4s, v8.4h, v20.4h\n"
"smlal2 v24.4s, v8.8h, v20.8h\n"
// Requantize both columns; the first column ends up in the low half of
// v21, the second in the high half.
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
"sqrshl v21.4s, v21.4s, v28.4s\n"
"sqrshl v22.4s, v22.4s, v28.4s\n"
"sqrshl v23.4s, v23.4s, v28.4s\n"
"sqrshl v24.4s, v24.4s, v28.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtun v21.8b, v21.8h\n"
"sqxtun2 v21.16b, v23.8h\n"
"ld1 {v22.4s}, [x10]\n"
"umax v21.16b, v21.16b, v30.16b\n"
"umin v21.16b, v21.16b, v31.16b\n"
"ld1 {v24.4s}, [x10]\n"
"uaddw v9.8h, v26.8h, v9.8b\n"
// Both results go to the same row: two consecutive stores through
// %[output_ptr], each advancing it by output_depth.
"st1 {v21.8b}, [%[output_ptr]], x3\n"
"uaddw v10.8h, v26.8h, v10.8b\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [%[output_ptr]], x3\n"
// NOTE(review): v12, v16 and v20 are widened again below although they were
// already widened above; every later reader reloads them first, so these
// three results appear to be unused.
"uaddw v11.8h, v26.8h, v11.8b\n"
"uaddw v12.8h, v26.8h, v12.8b\n"
"uaddw v13.8h, v26.8h, v13.8b\n"
"uaddw v14.8h, v26.8h, v14.8b\n"
"uaddw v15.8h, v26.8h, v15.8b\n"
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"uaddw v16.8h, v26.8h, v16.8b\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
"uaddw v17.8h, v26.8h, v17.8b\n"
"uaddw v18.8h, v26.8h, v18.8b\n"
"uaddw v19.8h, v26.8h, v19.8b\n"
"uaddw v20.8h, v26.8h, v20.8b\n"
// Continue while at least three columns remain (flags from "cmp w5, #3").
"bge " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "b\n"
"cmp w5, #2\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n"
// ---- Single-row path, last two columns: load the fourth column and produce
// two outputs without fetching a following block.
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"ld1 {v12.8b}, [x12], %[input_depth]\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"ld1 {v16.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v0.4h, v10.4h\n"
"ld1 {v20.8b}, [x14], %[input_depth]\n"
"smlal2 v24.4s, v0.8h, v10.8h\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"smlal v23.4s, v1.4h, v11.4h\n"
"smlal2 v24.4s, v1.8h, v11.8h\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"uaddw v12.8h, v26.8h, v12.8b\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"smlal v23.4s, v2.4h, v12.4h\n"
"smlal2 v24.4s, v2.8h, v12.8h\n"
"smlal v21.4s, v3.4h, v13.4h\n"
"smlal2 v22.4s, v3.8h, v13.8h\n"
"smlal v23.4s, v3.4h, v14.4h\n"
"smlal2 v24.4s, v3.8h, v14.8h\n"
"smlal v21.4s, v4.4h, v14.4h\n"
"smlal2 v22.4s, v4.8h, v14.8h\n"
"smlal v23.4s, v4.4h, v15.4h\n"
"smlal2 v24.4s, v4.8h, v15.8h\n"
"smlal v21.4s, v5.4h, v15.4h\n"
"uaddw v16.8h, v26.8h, v16.8b\n"
"smlal2 v22.4s, v5.8h, v15.8h\n"
"smlal v23.4s, v5.4h, v16.4h\n"
"smlal2 v24.4s, v5.8h, v16.8h\n"
"smlal v21.4s, v6.4h, v17.4h\n"
"smlal2 v22.4s, v6.8h, v17.8h\n"
"smlal v23.4s, v6.4h, v18.4h\n"
"smlal2 v24.4s, v6.8h, v18.8h\n"
"smlal v21.4s, v7.4h, v18.4h\n"
"smlal2 v22.4s, v7.8h, v18.8h\n"
"smlal v23.4s, v7.4h, v19.4h\n"
"smlal2 v24.4s, v7.8h, v19.8h\n"
"smlal v21.4s, v8.4h, v19.4h\n"
"uaddw v20.8h, v26.8h, v20.8b\n"
"smlal2 v22.4s, v8.8h, v19.8h\n"
"smlal v23.4s, v8.4h, v20.4h\n"
"smlal2 v24.4s, v8.8h, v20.8h\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
"sqrshl v21.4s, v21.4s, v28.4s\n"
"sqrshl v22.4s, v22.4s, v28.4s\n"
"sqrshl v23.4s, v23.4s, v28.4s\n"
"sqrshl v24.4s, v24.4s, v28.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtun v21.8b, v21.8h\n"
"sqxtun2 v21.16b, v23.8h\n"
"umax v21.16b, v21.16b, v30.16b\n"
"umin v21.16b, v21.16b, v31.16b\n"
"st1 {v21.8b}, [%[output_ptr]], x3\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [%[output_ptr]], x3\n"
"b " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
// ---- Single-row path, last single column: only v21/v22 are accumulated and
// the narrowing / clamping works on 8 lanes.
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"smlal v21.4s, v3.4h, v13.4h\n"
"smlal2 v22.4s, v3.8h, v13.8h\n"
"smlal v21.4s, v4.4h, v14.4h\n"
"smlal2 v22.4s, v4.8h, v14.8h\n"
"smlal v21.4s, v5.4h, v15.4h\n"
"smlal2 v22.4s, v5.8h, v15.8h\n"
"smlal v21.4s, v6.4h, v17.4h\n"
"smlal2 v22.4s, v6.8h, v17.8h\n"
"smlal v21.4s, v7.4h, v18.4h\n"
"smlal2 v22.4s, v7.8h, v18.8h\n"
"smlal v21.4s, v8.4h, v19.4h\n"
"smlal2 v22.4s, v8.8h, v19.8h\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
"sqrshl v21.4s, v21.4s, v28.4s\n"
"sqrshl v22.4s, v22.4s, v28.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqxtun v21.8b, v21.8h\n"
"umax v21.8b, v21.8b, v30.8b\n"
"umin v21.8b, v21.8b, v31.8b\n"
"st1 {v21.8b}, [%[output_ptr]]\n"
DEPTHWISECONV_LABEL_HEIGHT_1_END ":\n"
:
// Read-write operands: all four are modified by the assembly above.
[filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
[output_ptr] "+r"(output_ptr),
[output_window_height] "+r"(output_window_height)
:
// Read-only operands.
[bias_ptr] "r"(bias_ptr), [input_row_size] "r"(input_row_size),
[input_depth] "r"(input_depth),
[output_window_width] "r"(output_window_width),
[input_width_increment] "r"(input_width_increment),
[input_height_increment] "r"(input_height_increment),
[output_height_increment] "r"(output_height_increment),
[params_ptr] "r"(params_ptr)
:
// Clobbers: condition flags, memory, all NEON registers and the scratch
// general-purpose registers named explicitly in the assembly.
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
"v30", "v31",
"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
"x9", "x10", "x11", "x12", "x13", "x14", "x15");
#undef DEPTHWISECONV_LABEL_HEIGHT_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_1
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_1_END
}
};
// Specialization of DepthwiseConvWindow for kAwayFromZero output rounding
// (remaining template arguments: 8, 2, 2). Run() is one AArch64 NEON
// inline-assembly kernel that applies nine filter taps (3 rows x 3 columns) to
// 8 uint8 channels at a time and writes output_window_height x
// output_window_width requantized uint8 outputs. Every pair of outputs
// advances the input by four pixels, horizontally and vertically.
template <>
struct DepthwiseConvWindow<DepthwiseConvOutputRounding::kAwayFromZero, 8, 2,
2> {
// Parameters, as they are used by the assembly below:
//   input_ptr      - top-left pixel of the input window.
//   filter_ptr     - nine 8-byte filter taps; consecutive taps are read
//                    output_depth bytes apart (stride loaded from params_ptr).
//   bias_ptr       - 8 int32 biases, read as two 4-lane vectors.
//   output_ptr     - first output pixel; pixels are written output_depth bytes
//                    apart and rows output_row_size bytes apart.
//   input_depth    - byte step between horizontally adjacent input pixels.
//   input_row_size - byte step between vertically adjacent input pixels.
//   output_window_height / output_window_width - output rows / columns.
//   params_ptr     - offsets, multiplier, shift, activation bounds and output
//                    strides, read by fixed byte offset.
// NOTE(review): a height of 0 exits cleanly, but a width of 0 is not
// special-cased (it would fall into the 2-wide loop); callers presumably
// guarantee output_window_width >= 1 - confirm.
static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
const int32* bias_ptr, uint8* output_ptr,
int64_t input_depth, int64_t input_row_size,
int32 output_window_height, int32 output_window_width,
const DepthwiseConvParams* params_ptr) {
// Byte steps applied after each pair of outputs: four input pixels
// horizontally, four input rows vertically, two output rows.
const int64_t input_width_increment = 4 * input_depth;
const int64_t input_height_increment = 4 * input_row_size;
const int64_t output_height_increment = 2 * params_ptr->output_row_size;
// Local assembler labels; referenced below with "f"/"b" (forward/backward)
// suffixes.
#define DEPTHWISECONV_LABEL_HEIGHT_2_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_1 …
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_1_END …
asm volatile(
// Overview of the kernel:
//   1. Load the nine filter taps into v0-v8, widened to 16 bits with the
//      filter offset added.
//   2. While at least two output rows remain, produce a 2x2 block of
//      outputs per inner iteration from a 5x5 input window. Accumulators:
//      v21/v22 top-left, v23/v24 top-right, v19/v20 bottom-left,
//      v25/v26 bottom-right (low / high four channels). Input rows are
//      streamed through v9-v18, which are reloaded as soon as they are free.
//   3. If one output row remains, produce 1x2 outputs per inner iteration
//      from a 3x5 input window held in v9-v23 (accumulators v24/v25 and
//      v26/v27).
//   Each width loop has a 2-wide and a 1-wide leftover path.
//
// "Constant" registers. Several are temporarily overwritten during
// requantization and rebuilt from the scalar copies kept in w0-w4 and w9:
//   w9 / v26 = output right shift      w0 / v28 = input offset
//   w1 / v27 = output multiplier       w2 / v29 = output offset
//   w3 / v30 = activation min          w4 / v31 = activation max
//   x5 = output depth, x19 = output row size, w20 = filter offset
"ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
"ldr w0, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
// The flags from this compare are consumed by the blt that follows the
// filter loads; nothing in between writes NZCV.
"cmp %w[output_window_height], #2\n"
"dup v28.8h, w0\n"
"ldr w1, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
"dup v26.4s, w9\n"
"ldr w2, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
"dup v27.4s, w1\n"
"ldr w3, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
"dup v29.8h, w2\n"
"ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v30.16b, w3\n"
"ldr x5, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
"dup v31.16b, w4\n"
"ldr x19, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n"
"ldr w20, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
// x10 addresses the second group of four int32 biases.
"add x10, %[bias_ptr], #16\n"
// Load the nine filter taps (8 bytes each, x5 bytes apart) and widen each
// to 16 bits while adding the filter offset held in v9.
"ld1 {v0.8b}, [%[filter_ptr]], x5\n"
"dup v9.8h, w20\n"
"ld1 {v1.8b}, [%[filter_ptr]], x5\n"
"uaddw v0.8h, v9.8h, v0.8b\n"
"ld1 {v2.8b}, [%[filter_ptr]], x5\n"
"uaddw v1.8h, v9.8h, v1.8b\n"
"ld1 {v3.8b}, [%[filter_ptr]], x5\n"
"uaddw v2.8h, v9.8h, v2.8b\n"
"ld1 {v4.8b}, [%[filter_ptr]], x5\n"
"uaddw v3.8h, v9.8h, v3.8b\n"
"ld1 {v5.8b}, [%[filter_ptr]], x5\n"
"uaddw v4.8h, v9.8h, v4.8b\n"
"ld1 {v6.8b}, [%[filter_ptr]], x5\n"
"uaddw v5.8h, v9.8h, v5.8b\n"
"ld1 {v7.8b}, [%[filter_ptr]], x5\n"
"uaddw v6.8h, v9.8h, v6.8b\n"
"ld1 {v8.8b}, [%[filter_ptr]]\n"
"uaddw v7.8h, v9.8h, v7.8b\n"
"uaddw v8.8h, v9.8h, v8.8b\n"
// Fewer than two output rows: skip the two-row loop.
"blt " DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "f\n"
// ---- Two output rows per iteration ----
DEPTHWISECONV_LABEL_HEIGHT_2_LOOP ":\n"
// x11 = base of the current input window; x12 / x13 / x15 = input rows
// 0 / 1 / 2; w14 = output columns left; x6 / x7 = top / bottom output
// row pointers.
"mov x11, %[input_ptr]\n"
"mov x12, x11\n"
"add x13, x12, %[input_row_size]\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"mov w14, %w[output_window_width]\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
// Flags from this compare feed the beq after the prologue.
"cmp w14, #2\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"add x15, x13, %[input_row_size]\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"mov x6, %[output_ptr]\n"
"ld1 {v15.8b}, [x13], %[input_depth]\n"
"add x7, %[output_ptr], x19\n"
"ld1 {v16.8b}, [x13], %[input_depth]\n"
// Seed all four accumulator pairs with the bias, and widen the first
// three columns of rows 0 and 1 while adding the input offset (v28).
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"ld1 {v22.4s}, [x10]\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
"uaddw v9.8h, v28.8h, v9.8b\n"
"ld1 {v24.4s}, [x10]\n"
"uaddw v10.8h, v28.8h, v10.8b\n"
"ld1 {v19.4s}, [%[bias_ptr]]\n"
"uaddw v11.8h, v28.8h, v11.8b\n"
"ld1 {v20.4s}, [x10]\n"
"uaddw v14.8h, v28.8h, v14.8b\n"
"ld1 {v25.4s}, [%[bias_ptr]]\n"
"uaddw v15.8h, v28.8h, v15.8b\n"
"ld1 {v26.4s}, [x10]\n"
"uaddw v16.8h, v28.8h, v16.8b\n"
// Width exactly 2 or exactly 1 goes straight to the matching leftover
// path; otherwise (3 or more) fall into the loop.
"beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "f\n"
"cmp w14, #1\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n"
// ---- 2x2 outputs per iteration, entered while w14 >= 3 ----
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP ":\n"
// Accumulate rows 0 and 1 into the top outputs while loading columns 3-4
// of those rows, then rows 2, 3 and 4 (x15, x12, x13 are re-pointed as
// each row pointer is used up).
"smlal v21.4s, v0.4h, v9.4h\n"
"ld1 {v12.8b}, [x12], %[input_depth]\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"ld1 {v13.8b}, [x12]\n"
"add x12, x15, %[input_row_size]\n"
"smlal v23.4s, v0.4h, v11.4h\n"
"ld1 {v17.8b}, [x13], %[input_depth]\n"
"smlal2 v24.4s, v0.8h, v11.8h\n"
"ld1 {v18.8b}, [x13]\n"
"add x13, x12, %[input_row_size]\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"ld1 {v9.8b}, [x15], %[input_depth]\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"ld1 {v10.8b}, [x15], %[input_depth]\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"ld1 {v11.8b}, [x15], %[input_depth]\n"
"smlal v21.4s, v3.4h, v14.4h\n"
"smlal2 v22.4s, v3.8h, v14.8h\n"
"ld1 {v14.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v3.4h, v16.4h\n"
// Two columns consumed. The compare against 3 sets the flags used by the
// bge at the bottom of this loop; no instruction in between writes NZCV.
"subs w14, w14, #2\n"
"smlal2 v24.4s, v3.8h, v16.8h\n"
"cmp w14, #3\n"
"smlal v21.4s, v4.4h, v15.4h\n"
"uaddw v12.8h, v28.8h, v12.8b\n"
"smlal2 v22.4s, v4.8h, v15.8h\n"
"ld1 {v15.8b}, [x12], %[input_depth]\n"
"smlal v21.4s, v5.4h, v16.4h\n"
"uaddw v13.8h, v28.8h, v13.8b\n"
"smlal2 v22.4s, v5.8h, v16.8h\n"
"ld1 {v16.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v1.4h, v12.4h\n"
"uaddw v17.8h, v28.8h, v17.8b\n"
"smlal2 v24.4s, v1.8h, v12.8h\n"
"ld1 {v12.8b}, [x15], %[input_depth]\n"
"smlal v23.4s, v2.4h, v13.4h\n"
"uaddw v18.8h, v28.8h, v18.8b\n"
"smlal2 v24.4s, v2.8h, v13.8h\n"
"ld1 {v13.8b}, [x15]\n"
"smlal v23.4s, v4.4h, v17.4h\n"
"uaddw v9.8h, v28.8h, v9.8b\n"
"smlal2 v24.4s, v4.8h, v17.8h\n"
"ld1 {v17.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v5.4h, v18.4h\n"
"uaddw v10.8h, v28.8h, v10.8b\n"
"smlal2 v24.4s, v5.8h, v18.8h\n"
"ld1 {v18.8b}, [x12]\n"
// Row 2 (now in v9-v13) is the last filter row of the top outputs and the
// first filter row of the bottom outputs.
"smlal v21.4s, v6.4h, v9.4h\n"
"smlal2 v22.4s, v6.8h, v9.8h\n"
"smlal v19.4s, v0.4h, v9.4h\n"
"uaddw v11.8h, v28.8h, v11.8b\n"
"smlal2 v20.4s, v0.8h, v9.8h\n"
"ld1 {v9.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v6.4h, v11.4h\n"
"smlal2 v24.4s, v6.8h, v11.8h\n"
"smlal v21.4s, v7.4h, v10.4h\n"
"smlal2 v22.4s, v7.8h, v10.8h\n"
"uaddw v12.8h, v28.8h, v12.8b\n"
"smlal v19.4s, v1.4h, v10.4h\n"
"smlal2 v20.4s, v1.8h, v10.8h\n"
"ld1 {v10.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v7.4h, v12.4h\n"
"smlal2 v24.4s, v7.8h, v12.8h\n"
"smlal v25.4s, v1.4h, v12.4h\n"
"smlal2 v26.4s, v1.8h, v12.8h\n"
"smlal v21.4s, v8.4h, v11.4h\n"
"smlal2 v22.4s, v8.8h, v11.8h\n"
// Advance the window base by four input pixels for the next iteration.
"add x11, x11, %[input_width_increment]\n"
"smlal v19.4s, v2.4h, v11.4h\n"
"mov x12, x11\n"
"smlal2 v20.4s, v2.8h, v11.8h\n"
"uaddw v13.8h, v28.8h, v13.8b\n"
"smlal v25.4s, v0.4h, v11.4h\n"
"smlal2 v26.4s, v0.8h, v11.8h\n"
"ld1 {v11.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v8.4h, v13.4h\n"
"ld1 {v12.8b}, [x13], %[input_depth]\n"
"smlal2 v24.4s, v8.8h, v13.8h\n"
"smlal v25.4s, v2.4h, v13.4h\n"
"smlal2 v26.4s, v2.8h, v13.8h\n"
"ld1 {v13.8b}, [x13]\n"
"add x13, x12, %[input_row_size]\n"
"add x15, x13, %[input_row_size]\n"
// Requantize the two top outputs (v21-v24). v28 temporarily holds the
// shift; v27, v29, v30 and v31 are used as scratch and rebuilt from
// w1-w4 as soon as each is free.
//   sqrdmulh        : saturating rounding doubling high multiply by the
//                     output multiplier.
//   and/sshr/sqadd  : adds -1 to lanes where both the value and the shift
//                     have their sign bit set.
//   srshl           : rounding shift by v28 (a right shift when the shift
//                     lane is negative - presumably output_right_shift is
//                     stored negated; confirm against the caller).
"dup v28.4s, w9\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
"and v27.16b, v21.16b, v28.16b\n"
"and v29.16b, v22.16b, v28.16b\n"
"and v30.16b, v23.16b, v28.16b\n"
"and v31.16b, v24.16b, v28.16b\n"
"sshr v27.4s, v27.4s, #31\n"
"sshr v29.4s, v29.4s, #31\n"
"sshr v30.4s, v30.4s, #31\n"
"sshr v31.4s, v31.4s, #31\n"
"sqadd v21.4s, v21.4s, v27.4s\n"
"dup v27.4s, w1\n"
"sqadd v22.4s, v22.4s, v29.4s\n"
"dup v29.8h, w2\n"
"sqadd v23.4s, v23.4s, v30.4s\n"
"dup v30.16b, w3\n"
"sqadd v24.4s, v24.4s, v31.4s\n"
"dup v31.16b, w4\n"
"srshl v21.4s, v21.4s, v28.4s\n"
"srshl v22.4s, v22.4s, v28.4s\n"
"srshl v23.4s, v23.4s, v28.4s\n"
"srshl v24.4s, v24.4s, v28.4s\n"
// Restore v28 to the input offset.
"dup v28.8h, w0\n"
// Narrow to 16 bits, add the output offset, narrow to unsigned 8 bits,
// clamp to [activation min, activation max] and store both top pixels.
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtun v21.8b, v21.8h\n"
"sqxtun2 v21.16b, v23.8h\n"
"ld1 {v22.4s}, [x10]\n"
"umax v21.16b, v21.16b, v30.16b\n"
"umin v21.16b, v21.16b, v31.16b\n"
"ld1 {v24.4s}, [x10]\n"
"uaddw v9.8h, v28.8h, v9.8b\n"
"st1 {v21.8b}, [x6], x5\n"
"uaddw v10.8h, v28.8h, v10.8b\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [x6], x5\n"
// Finish the bottom outputs with rows 4 (v9-v13) and 3 (v14-v18) while
// loading rows 0 and 1 of the next window and re-seeding v21/v23 with
// the bias.
"uaddw v11.8h, v28.8h, v11.8b\n"
"smlal v19.4s, v6.4h, v9.4h\n"
"smlal2 v20.4s, v6.8h, v9.8h\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"smlal v25.4s, v6.4h, v11.4h\n"
"smlal2 v26.4s, v6.8h, v11.8h\n"
"smlal v19.4s, v7.4h, v10.4h\n"
"uaddw v12.8h, v28.8h, v12.8b\n"
"smlal2 v20.4s, v7.8h, v10.8h\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"smlal v25.4s, v7.4h, v12.4h\n"
"smlal2 v26.4s, v7.8h, v12.8h\n"
"smlal v19.4s, v8.4h, v11.4h\n"
"uaddw v13.8h, v28.8h, v13.8b\n"
"smlal2 v20.4s, v8.8h, v11.8h\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"smlal v25.4s, v8.4h, v13.4h\n"
"uaddw v14.8h, v28.8h, v14.8b\n"
"smlal2 v26.4s, v8.8h, v13.8h\n"
"uaddw v16.8h, v28.8h, v16.8b\n"
"smlal v19.4s, v3.4h, v14.4h\n"
"uaddw v15.8h, v28.8h, v15.8b\n"
"smlal2 v20.4s, v3.8h, v14.8h\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"smlal v25.4s, v3.4h, v16.4h\n"
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"smlal2 v26.4s, v3.8h, v16.8h\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
"smlal v19.4s, v4.4h, v15.4h\n"
"uaddw v17.8h, v28.8h, v17.8b\n"
"smlal2 v20.4s, v4.8h, v15.8h\n"
"ld1 {v15.8b}, [x13], %[input_depth]\n"
"smlal v25.4s, v4.4h, v17.4h\n"
"smlal2 v26.4s, v4.8h, v17.8h\n"
"smlal v19.4s, v5.4h, v16.4h\n"
"uaddw v18.8h, v28.8h, v18.8b\n"
"smlal2 v20.4s, v5.8h, v16.8h\n"
"ld1 {v16.8b}, [x13], %[input_depth]\n"
"smlal v25.4s, v5.4h, v18.4h\n"
"smlal2 v26.4s, v5.8h, v18.8h\n"
// Requantize and store the two bottom outputs (v19, v20, v25, v26) with
// the same sequence as the top outputs.
"dup v28.4s, w9\n"
"sqrdmulh v19.4s, v19.4s, v27.4s\n"
"sqrdmulh v20.4s, v20.4s, v27.4s\n"
"sqrdmulh v25.4s, v25.4s, v27.4s\n"
"sqrdmulh v26.4s, v26.4s, v27.4s\n"
"and v27.16b, v19.16b, v28.16b\n"
"and v29.16b, v20.16b, v28.16b\n"
"and v30.16b, v25.16b, v28.16b\n"
"and v31.16b, v26.16b, v28.16b\n"
"sshr v27.4s, v27.4s, #31\n"
"sshr v29.4s, v29.4s, #31\n"
"sshr v30.4s, v30.4s, #31\n"
"sshr v31.4s, v31.4s, #31\n"
"sqadd v19.4s, v19.4s, v27.4s\n"
"dup v27.4s, w1\n"
"sqadd v20.4s, v20.4s, v29.4s\n"
"dup v29.8h, w2\n"
"sqadd v25.4s, v25.4s, v30.4s\n"
"dup v30.16b, w3\n"
"sqadd v26.4s, v26.4s, v31.4s\n"
"dup v31.16b, w4\n"
"srshl v19.4s, v19.4s, v28.4s\n"
"srshl v20.4s, v20.4s, v28.4s\n"
"srshl v25.4s, v25.4s, v28.4s\n"
"srshl v26.4s, v26.4s, v28.4s\n"
"dup v28.8h, w0\n"
"sqxtn v19.4h, v19.4s\n"
"sqxtn2 v19.8h, v20.4s\n"
"sqxtn v25.4h, v25.4s\n"
"sqxtn2 v25.8h, v26.4s\n"
"sqadd v19.8h, v19.8h, v29.8h\n"
"sqadd v25.8h, v25.8h, v29.8h\n"
"sqxtun v19.8b, v19.8h\n"
"sqxtun2 v19.16b, v25.8h\n"
"ld1 {v20.4s}, [x10]\n"
"umax v19.16b, v19.16b, v30.16b\n"
"umin v19.16b, v19.16b, v31.16b\n"
"ld1 {v26.4s}, [x10]\n"
"uaddw v9.8h, v28.8h, v9.8b\n"
"st1 {v19.8b}, [x7], x5\n"
"uaddw v10.8h, v28.8h, v10.8b\n"
"mov v25.d[0], v19.d[1]\n"
"st1 {v25.8b}, [x7], x5\n"
"uaddw v11.8h, v28.8h, v11.8b\n"
"ld1 {v19.4s}, [%[bias_ptr]]\n"
"uaddw v14.8h, v28.8h, v14.8b\n"
"ld1 {v25.4s}, [%[bias_ptr]]\n"
"uaddw v15.8h, v28.8h, v15.8b\n"
"uaddw v16.8h, v28.8h, v16.8b\n"
// Loop while at least three columns remain (flags from "cmp w14, #3"
// above); on exit w14 is 1 or 2.
"bge " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "b\n"
"cmp w14, #2\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n"
// ---- Final 2x2 block of a row pair: same arithmetic as the loop body,
// without loading the next window or advancing past the last store ----
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"ld1 {v12.8b}, [x12], %[input_depth]\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"ld1 {v13.8b}, [x12]\n"
"add x12, x15, %[input_row_size]\n"
"smlal v23.4s, v0.4h, v11.4h\n"
"ld1 {v17.8b}, [x13], %[input_depth]\n"
"smlal2 v24.4s, v0.8h, v11.8h\n"
"ld1 {v18.8b}, [x13]\n"
"add x13, x12, %[input_row_size]\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"ld1 {v9.8b}, [x15], %[input_depth]\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"ld1 {v10.8b}, [x15], %[input_depth]\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"ld1 {v11.8b}, [x15], %[input_depth]\n"
"smlal v21.4s, v3.4h, v14.4h\n"
"smlal2 v22.4s, v3.8h, v14.8h\n"
"ld1 {v14.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v3.4h, v16.4h\n"
"smlal2 v24.4s, v3.8h, v16.8h\n"
"smlal v21.4s, v4.4h, v15.4h\n"
"uaddw v12.8h, v28.8h, v12.8b\n"
"smlal2 v22.4s, v4.8h, v15.8h\n"
"ld1 {v15.8b}, [x12], %[input_depth]\n"
"smlal v21.4s, v5.4h, v16.4h\n"
"uaddw v13.8h, v28.8h, v13.8b\n"
"smlal2 v22.4s, v5.8h, v16.8h\n"
"ld1 {v16.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v1.4h, v12.4h\n"
"uaddw v17.8h, v28.8h, v17.8b\n"
"smlal2 v24.4s, v1.8h, v12.8h\n"
"ld1 {v12.8b}, [x15], %[input_depth]\n"
"smlal v23.4s, v2.4h, v13.4h\n"
"uaddw v18.8h, v28.8h, v18.8b\n"
"smlal2 v24.4s, v2.8h, v13.8h\n"
"ld1 {v13.8b}, [x15]\n"
"smlal v23.4s, v4.4h, v17.4h\n"
"uaddw v9.8h, v28.8h, v9.8b\n"
"smlal2 v24.4s, v4.8h, v17.8h\n"
"ld1 {v17.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v5.4h, v18.4h\n"
"uaddw v10.8h, v28.8h, v10.8b\n"
"smlal2 v24.4s, v5.8h, v18.8h\n"
"ld1 {v18.8b}, [x12]\n"
"smlal v21.4s, v6.4h, v9.4h\n"
"smlal2 v22.4s, v6.8h, v9.8h\n"
"smlal v19.4s, v0.4h, v9.4h\n"
"uaddw v11.8h, v28.8h, v11.8b\n"
"smlal2 v20.4s, v0.8h, v9.8h\n"
"ld1 {v9.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v6.4h, v11.4h\n"
"smlal2 v24.4s, v6.8h, v11.8h\n"
"smlal v21.4s, v7.4h, v10.4h\n"
"smlal2 v22.4s, v7.8h, v10.8h\n"
"uaddw v12.8h, v28.8h, v12.8b\n"
"smlal v19.4s, v1.4h, v10.4h\n"
"smlal2 v20.4s, v1.8h, v10.8h\n"
"ld1 {v10.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v7.4h, v12.4h\n"
"smlal2 v24.4s, v7.8h, v12.8h\n"
"smlal v25.4s, v1.4h, v12.4h\n"
"smlal2 v26.4s, v1.8h, v12.8h\n"
"smlal v21.4s, v8.4h, v11.4h\n"
"smlal2 v22.4s, v8.8h, v11.8h\n"
"smlal v19.4s, v2.4h, v11.4h\n"
"smlal2 v20.4s, v2.8h, v11.8h\n"
"uaddw v13.8h, v28.8h, v13.8b\n"
"smlal v25.4s, v0.4h, v11.4h\n"
"smlal2 v26.4s, v0.8h, v11.8h\n"
"ld1 {v11.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v8.4h, v13.4h\n"
"ld1 {v12.8b}, [x13], %[input_depth]\n"
"smlal2 v24.4s, v8.8h, v13.8h\n"
"smlal v25.4s, v2.4h, v13.4h\n"
"smlal2 v26.4s, v2.8h, v13.8h\n"
"ld1 {v13.8b}, [x13]\n"
// Requantize and store the top two outputs.
"dup v28.4s, w9\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
"and v27.16b, v21.16b, v28.16b\n"
"and v29.16b, v22.16b, v28.16b\n"
"and v30.16b, v23.16b, v28.16b\n"
"and v31.16b, v24.16b, v28.16b\n"
"sshr v27.4s, v27.4s, #31\n"
"sshr v29.4s, v29.4s, #31\n"
"sshr v30.4s, v30.4s, #31\n"
"sshr v31.4s, v31.4s, #31\n"
"sqadd v21.4s, v21.4s, v27.4s\n"
"dup v27.4s, w1\n"
"sqadd v22.4s, v22.4s, v29.4s\n"
"dup v29.8h, w2\n"
"sqadd v23.4s, v23.4s, v30.4s\n"
"dup v30.16b, w3\n"
"sqadd v24.4s, v24.4s, v31.4s\n"
"dup v31.16b, w4\n"
"srshl v21.4s, v21.4s, v28.4s\n"
"srshl v22.4s, v22.4s, v28.4s\n"
"srshl v23.4s, v23.4s, v28.4s\n"
"srshl v24.4s, v24.4s, v28.4s\n"
"dup v28.8h, w0\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtun v21.8b, v21.8h\n"
"sqxtun2 v21.16b, v23.8h\n"
"ld1 {v22.4s}, [x10]\n"
"umax v21.16b, v21.16b, v30.16b\n"
"umin v21.16b, v21.16b, v31.16b\n"
"ld1 {v24.4s}, [x10]\n"
"uaddw v9.8h, v28.8h, v9.8b\n"
"st1 {v21.8b}, [x6], x5\n"
"uaddw v10.8h, v28.8h, v10.8b\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [x6]\n"
// Finish, requantize and store the bottom two outputs.
"uaddw v11.8h, v28.8h, v11.8b\n"
"smlal v19.4s, v6.4h, v9.4h\n"
"smlal2 v20.4s, v6.8h, v9.8h\n"
"smlal v25.4s, v6.4h, v11.4h\n"
"smlal2 v26.4s, v6.8h, v11.8h\n"
"smlal v19.4s, v7.4h, v10.4h\n"
"uaddw v12.8h, v28.8h, v12.8b\n"
"smlal2 v20.4s, v7.8h, v10.8h\n"
"smlal v25.4s, v7.4h, v12.4h\n"
"smlal2 v26.4s, v7.8h, v12.8h\n"
"smlal v19.4s, v8.4h, v11.4h\n"
"uaddw v13.8h, v28.8h, v13.8b\n"
"smlal2 v20.4s, v8.8h, v11.8h\n"
"smlal v25.4s, v8.4h, v13.4h\n"
"uaddw v14.8h, v28.8h, v14.8b\n"
"smlal2 v26.4s, v8.8h, v13.8h\n"
"uaddw v16.8h, v28.8h, v16.8b\n"
"smlal v19.4s, v3.4h, v14.4h\n"
"uaddw v15.8h, v28.8h, v15.8b\n"
"smlal2 v20.4s, v3.8h, v14.8h\n"
"smlal v25.4s, v3.4h, v16.4h\n"
"smlal2 v26.4s, v3.8h, v16.8h\n"
"smlal v19.4s, v4.4h, v15.4h\n"
"uaddw v17.8h, v28.8h, v17.8b\n"
"smlal2 v20.4s, v4.8h, v15.8h\n"
"smlal v25.4s, v4.4h, v17.4h\n"
"smlal2 v26.4s, v4.8h, v17.8h\n"
"smlal v19.4s, v5.4h, v16.4h\n"
"uaddw v18.8h, v28.8h, v18.8b\n"
"smlal2 v20.4s, v5.8h, v16.8h\n"
"smlal v25.4s, v5.4h, v18.4h\n"
"smlal2 v26.4s, v5.8h, v18.8h\n"
"dup v28.4s, w9\n"
"sqrdmulh v19.4s, v19.4s, v27.4s\n"
"sqrdmulh v20.4s, v20.4s, v27.4s\n"
"sqrdmulh v25.4s, v25.4s, v27.4s\n"
"sqrdmulh v26.4s, v26.4s, v27.4s\n"
"and v27.16b, v19.16b, v28.16b\n"
"and v29.16b, v20.16b, v28.16b\n"
"and v30.16b, v25.16b, v28.16b\n"
"and v31.16b, v26.16b, v28.16b\n"
"sshr v27.4s, v27.4s, #31\n"
"sshr v29.4s, v29.4s, #31\n"
"sshr v30.4s, v30.4s, #31\n"
"sshr v31.4s, v31.4s, #31\n"
"sqadd v19.4s, v19.4s, v27.4s\n"
"dup v27.4s, w1\n"
"sqadd v20.4s, v20.4s, v29.4s\n"
"dup v29.8h, w2\n"
"sqadd v25.4s, v25.4s, v30.4s\n"
"dup v30.16b, w3\n"
"sqadd v26.4s, v26.4s, v31.4s\n"
"dup v31.16b, w4\n"
"srshl v19.4s, v19.4s, v28.4s\n"
"srshl v20.4s, v20.4s, v28.4s\n"
"srshl v25.4s, v25.4s, v28.4s\n"
"srshl v26.4s, v26.4s, v28.4s\n"
"dup v28.8h, w0\n"
"sqxtn v19.4h, v19.4s\n"
"sqxtn2 v19.8h, v20.4s\n"
"sqxtn v25.4h, v25.4s\n"
"sqxtn2 v25.8h, v26.4s\n"
"sqadd v19.8h, v19.8h, v29.8h\n"
"sqadd v25.8h, v25.8h, v29.8h\n"
"sqxtun v19.8b, v19.8h\n"
"sqxtun2 v19.16b, v25.8h\n"
"umax v19.16b, v19.16b, v30.16b\n"
"umin v19.16b, v19.16b, v31.16b\n"
"st1 {v19.8b}, [x7], x5\n"
"mov v25.d[0], v19.d[1]\n"
"st1 {v25.8b}, [x7]\n"
"b " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "f\n"
// ---- Single leftover column of a row pair: 2x1 outputs from a 5x3 input
// window. Only v21/v22 (top) and v23/v24 (bottom) are used as
// accumulators; v18/v19 serve as scratch and v26 is rebuilt as the shift
// because it held a bias accumulator ----
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER ":\n"
// x15 / x12 / x13 = input rows 2 / 3 / 4.
"add x12, x15, %[input_row_size]\n"
"add x13, x12, %[input_row_size]\n"
"ld1 {v12.8b}, [x15], %[input_depth]\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"ld1 {v13.8b}, [x15], %[input_depth]\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"ld1 {v17.8b}, [x15]\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"ld1 {v11.8b}, [x12]\n"
"smlal v21.4s, v3.4h, v14.4h\n"
"smlal2 v22.4s, v3.8h, v14.8h\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"smlal v21.4s, v4.4h, v15.4h\n"
"smlal2 v22.4s, v4.8h, v15.8h\n"
"ld1 {v15.8b}, [x13], %[input_depth]\n"
"smlal v21.4s, v5.4h, v16.4h\n"
"uaddw v12.8h, v28.8h, v12.8b\n"
"smlal2 v22.4s, v5.8h, v16.8h\n"
"uaddw v13.8h, v28.8h, v13.8b\n"
"ld1 {v16.8b}, [x13]\n"
"smlal v21.4s, v6.4h, v12.4h\n"
"smlal2 v22.4s, v6.8h, v12.8h\n"
"smlal v23.4s, v0.4h, v12.4h\n"
"uaddw v17.8h, v28.8h, v17.8b\n"
"smlal2 v24.4s, v0.8h, v12.8h\n"
"smlal v21.4s, v7.4h, v13.4h\n"
"smlal2 v22.4s, v7.8h, v13.8h\n"
"smlal v23.4s, v1.4h, v13.4h\n"
"smlal2 v24.4s, v1.8h, v13.8h\n"
"smlal v21.4s, v8.4h, v17.4h\n"
"smlal2 v22.4s, v8.8h, v17.8h\n"
"smlal v23.4s, v2.4h, v17.4h\n"
"smlal2 v24.4s, v2.8h, v17.8h\n"
// Requantize and store the top output.
"dup v26.4s, w9\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
"and v18.16b, v21.16b, v26.16b\n"
"and v19.16b, v22.16b, v26.16b\n"
"sshr v18.4s, v18.4s, #31\n"
"sshr v19.4s, v19.4s, #31\n"
"sqadd v21.4s, v21.4s, v18.4s\n"
"sqadd v22.4s, v22.4s, v19.4s\n"
"srshl v21.4s, v21.4s, v26.4s\n"
"srshl v22.4s, v22.4s, v26.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqxtun v21.8b, v21.8h\n"
"umax v21.8b, v21.8b, v30.8b\n"
"umin v21.8b, v21.8b, v31.8b\n"
"uaddw v9.8h, v28.8h, v9.8b\n"
"st1 {v21.8b}, [x6]\n"
// Finish, requantize and store the bottom output.
"uaddw v10.8h, v28.8h, v10.8b\n"
"smlal v23.4s, v3.4h, v9.4h\n"
"uaddw v11.8h, v28.8h, v11.8b\n"
"smlal2 v24.4s, v3.8h, v9.8h\n"
"uaddw v14.8h, v28.8h, v14.8b\n"
"smlal v23.4s, v4.4h, v10.4h\n"
"uaddw v15.8h, v28.8h, v15.8b\n"
"smlal2 v24.4s, v4.8h, v10.8h\n"
"uaddw v16.8h, v28.8h, v16.8b\n"
"smlal v23.4s, v5.4h, v11.4h\n"
"smlal2 v24.4s, v5.8h, v11.8h\n"
"smlal v23.4s, v6.4h, v14.4h\n"
"smlal2 v24.4s, v6.8h, v14.8h\n"
"smlal v23.4s, v7.4h, v15.4h\n"
"smlal2 v24.4s, v7.8h, v15.8h\n"
"smlal v23.4s, v8.4h, v16.4h\n"
"smlal2 v24.4s, v8.8h, v16.8h\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
"and v18.16b, v23.16b, v26.16b\n"
"and v19.16b, v24.16b, v26.16b\n"
"sshr v18.4s, v18.4s, #31\n"
"sshr v19.4s, v19.4s, #31\n"
"sqadd v23.4s, v23.4s, v18.4s\n"
"sqadd v24.4s, v24.4s, v19.4s\n"
"srshl v23.4s, v23.4s, v26.4s\n"
"srshl v24.4s, v24.4s, v26.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtun v23.8b, v23.8h\n"
"umax v23.8b, v23.8b, v30.8b\n"
"umin v23.8b, v23.8b, v31.8b\n"
"st1 {v23.8b}, [x7]\n"
// End of a row pair: two output rows done; step the input down four rows
// and the output down two rows, and repeat while two or more rows remain.
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP ":\n"
"subs %w[output_window_height], %w[output_window_height], #2\n"
"add %[input_ptr], %[input_ptr], %[input_height_increment]\n"
"cmp %w[output_window_height], #2\n"
"add %[output_ptr], %[output_ptr], %[output_height_increment]\n"
"bge " DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "b\n"
DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP ":\n"
// Nothing left unless exactly one output row remains.
"cmp %w[output_window_height], #1\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
// ---- Single leftover output row ----
DEPTHWISECONV_LABEL_HEIGHT_1 ":\n"
// x12 / x13 / x15 = input rows 0 / 1 / 2; the first three columns of each
// go to v9-v11, v12-v14 and v15-v17. Accumulators: v24/v25 (left output)
// and v26/v27 (right output), seeded with the bias.
"mov x11, %[input_ptr]\n"
"mov x12, x11\n"
"add x13, x12, %[input_row_size]\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"add x15, x13, %[input_row_size]\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"mov x6, %[output_ptr]\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"mov w14, %w[output_window_width]\n"
// Flags from this compare feed the beq after the prologue.
"cmp w14, #2\n"
"ld1 {v12.8b}, [x13], %[input_depth]\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"ld1 {v15.8b}, [x15], %[input_depth]\n"
"ld1 {v16.8b}, [x15], %[input_depth]\n"
"ld1 {v17.8b}, [x15], %[input_depth]\n"
"uaddw v9.8h, v28.8h, v9.8b\n"
"ld1 {v24.4s}, [%[bias_ptr]]\n"
"uaddw v10.8h, v28.8h, v10.8b\n"
"ld1 {v25.4s}, [x10]\n"
"uaddw v11.8h, v28.8h, v11.8b\n"
"ld1 {v26.4s}, [%[bias_ptr]]\n"
"ld1 {v27.4s}, [x10]\n"
"uaddw v12.8h, v28.8h, v12.8b\n"
"uaddw v13.8h, v28.8h, v13.8b\n"
"uaddw v14.8h, v28.8h, v14.8b\n"
"uaddw v15.8h, v28.8h, v15.8b\n"
"uaddw v16.8h, v28.8h, v16.8b\n"
"uaddw v17.8h, v28.8h, v17.8b\n"
// Same width dispatch as the two-row path.
"beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER "f\n"
"cmp w14, #1\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n"
// ---- 1x2 outputs per iteration, entered while w14 >= 3 ----
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP ":\n"
// Columns 3-4 of each row are loaded into v18-v23; the first three
// columns of the next window are loaded into v9-v17 as each register's
// last use passes.
"smlal v24.4s, v0.4h, v9.4h\n"
"ld1 {v18.8b}, [x12], %[input_depth]\n"
"smlal2 v25.4s, v0.8h, v9.8h\n"
"ld1 {v19.8b}, [x12]\n"
"smlal v26.4s, v0.4h, v11.4h\n"
"ld1 {v20.8b}, [x13], %[input_depth]\n"
"smlal2 v27.4s, v0.8h, v11.8h\n"
"ld1 {v21.8b}, [x13]\n"
"smlal v24.4s, v1.4h, v10.4h\n"
"ld1 {v22.8b}, [x15], %[input_depth]\n"
"smlal2 v25.4s, v1.8h, v10.8h\n"
"ld1 {v23.8b}, [x15]\n"
"smlal v24.4s, v2.4h, v11.4h\n"
// The compare against 3 sets the flags used by the bge at the bottom of
// this loop.
"subs w14, w14, #2\n"
"smlal2 v25.4s, v2.8h, v11.8h\n"
"cmp w14, #3\n"
"smlal v24.4s, v3.4h, v12.4h\n"
"add x11, x11, %[input_width_increment]\n"
"smlal2 v25.4s, v3.8h, v12.8h\n"
"mov x12, x11\n"
"smlal v26.4s, v3.4h, v14.4h\n"
"add x13, x12, %[input_row_size]\n"
"smlal2 v27.4s, v3.8h, v14.8h\n"
"add x15, x13, %[input_row_size]\n"
"smlal v24.4s, v4.4h, v13.4h\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"smlal2 v25.4s, v4.8h, v13.8h\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"smlal v24.4s, v5.4h, v14.4h\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"smlal2 v25.4s, v5.8h, v14.8h\n"
"ld1 {v12.8b}, [x13], %[input_depth]\n"
"smlal v24.4s, v6.4h, v15.4h\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"smlal2 v25.4s, v6.8h, v15.8h\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"smlal v26.4s, v6.4h, v17.4h\n"
"ld1 {v15.8b}, [x15], %[input_depth]\n"
"smlal2 v27.4s, v6.8h, v17.8h\n"
"smlal v24.4s, v7.4h, v16.4h\n"
"smlal2 v25.4s, v7.8h, v16.8h\n"
"ld1 {v16.8b}, [x15], %[input_depth]\n"
"smlal v24.4s, v8.4h, v17.4h\n"
"uaddw v18.8h, v28.8h, v18.8b\n"
"smlal2 v25.4s, v8.8h, v17.8h\n"
"ld1 {v17.8b}, [x15], %[input_depth]\n"
"uaddw v19.8h, v28.8h, v19.8b\n"
"smlal v26.4s, v1.4h, v18.4h\n"
"uaddw v20.8h, v28.8h, v20.8b\n"
"smlal2 v27.4s, v1.8h, v18.8h\n"
"smlal v26.4s, v2.4h, v19.4h\n"
"uaddw v21.8h, v28.8h, v21.8b\n"
"smlal2 v27.4s, v2.8h, v19.8h\n"
"smlal v26.4s, v4.4h, v20.4h\n"
"smlal v26.4s, v5.4h, v21.4h\n"
"smlal2 v27.4s, v4.8h, v20.8h\n"
"uaddw v22.8h, v28.8h, v22.8b\n"
"smlal2 v27.4s, v5.8h, v21.8h\n"
"uaddw v23.8h, v28.8h, v23.8b\n"
"smlal v26.4s, v7.4h, v22.4h\n"
"smlal2 v27.4s, v7.8h, v22.8h\n"
"smlal v26.4s, v8.4h, v23.4h\n"
"smlal2 v27.4s, v8.8h, v23.8h\n"
// Requantize both outputs. Here v28 is reused first as the multiplier and
// then as the output offset, v29 as the shift, v30/v31 as scratch; v28,
// v30 and v31 are rebuilt before reuse, while v29 is left holding the
// shift.
"dup v28.4s, w1\n"
"dup v29.4s, w9\n"
"sqrdmulh v24.4s, v24.4s, v28.4s\n"
"sqrdmulh v25.4s, v25.4s, v28.4s\n"
"sqrdmulh v26.4s, v26.4s, v28.4s\n"
"sqrdmulh v27.4s, v27.4s, v28.4s\n"
"dup v28.8h, w2\n"
"and v30.16b, v24.16b, v29.16b\n"
"and v31.16b, v25.16b, v29.16b\n"
"sshr v30.4s, v30.4s, #31\n"
"sshr v31.4s, v31.4s, #31\n"
"sqadd v24.4s, v24.4s, v30.4s\n"
"sqadd v25.4s, v25.4s, v31.4s\n"
"and v30.16b, v26.16b, v29.16b\n"
"and v31.16b, v27.16b, v29.16b\n"
"sshr v30.4s, v30.4s, #31\n"
"sshr v31.4s, v31.4s, #31\n"
"sqadd v26.4s, v26.4s, v30.4s\n"
"dup v30.16b, w3\n"
"sqadd v27.4s, v27.4s, v31.4s\n"
"dup v31.16b, w4\n"
"srshl v24.4s, v24.4s, v29.4s\n"
"srshl v25.4s, v25.4s, v29.4s\n"
"srshl v26.4s, v26.4s, v29.4s\n"
"srshl v27.4s, v27.4s, v29.4s\n"
"sqxtn v24.4h, v24.4s\n"
"sqxtn2 v24.8h, v25.4s\n"
"sqxtn v26.4h, v26.4s\n"
"sqxtn2 v26.8h, v27.4s\n"
"sqadd v24.8h, v24.8h, v28.8h\n"
"sqadd v26.8h, v26.8h, v28.8h\n"
"sqxtun v24.8b, v24.8h\n"
"sqxtun2 v24.16b, v26.8h\n"
// Restore v28 to the input offset, then clamp, store, and prepare the
// next iteration's inputs and bias-seeded accumulators.
"dup v28.8h, w0\n"
"ld1 {v25.4s}, [x10]\n"
"umax v24.16b, v24.16b, v30.16b\n"
"umin v24.16b, v24.16b, v31.16b\n"
"ld1 {v27.4s}, [x10]\n"
"uaddw v9.8h, v28.8h, v9.8b\n"
"st1 {v24.8b}, [x6], x5\n"
"uaddw v10.8h, v28.8h, v10.8b\n"
"mov v26.d[0], v24.d[1]\n"
"st1 {v26.8b}, [x6], x5\n"
"uaddw v11.8h, v28.8h, v11.8b\n"
"uaddw v12.8h, v28.8h, v12.8b\n"
"uaddw v13.8h, v28.8h, v13.8b\n"
"uaddw v14.8h, v28.8h, v14.8b\n"
"ld1 {v24.4s}, [%[bias_ptr]]\n"
"uaddw v15.8h, v28.8h, v15.8b\n"
"ld1 {v26.4s}, [%[bias_ptr]]\n"
"uaddw v16.8h, v28.8h, v16.8b\n"
"uaddw v17.8h, v28.8h, v17.8b\n"
// Loop while at least three columns remain; on exit w14 is 1 or 2.
"bge " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "b\n"
"cmp w14, #2\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n"
// ---- Final 1x2 block of the leftover row: same arithmetic as the loop
// body without loading a following window ----
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER ":\n"
"smlal v24.4s, v0.4h, v9.4h\n"
"ld1 {v18.8b}, [x12], %[input_depth]\n"
"smlal2 v25.4s, v0.8h, v9.8h\n"
"ld1 {v19.8b}, [x12]\n"
"smlal v26.4s, v0.4h, v11.4h\n"
"ld1 {v20.8b}, [x13], %[input_depth]\n"
"smlal2 v27.4s, v0.8h, v11.8h\n"
"ld1 {v21.8b}, [x13]\n"
"smlal v24.4s, v1.4h, v10.4h\n"
"ld1 {v22.8b}, [x15], %[input_depth]\n"
"smlal2 v25.4s, v1.8h, v10.8h\n"
"ld1 {v23.8b}, [x15]\n"
"smlal v24.4s, v2.4h, v11.4h\n"
"smlal2 v25.4s, v2.8h, v11.8h\n"
"smlal v24.4s, v3.4h, v12.4h\n"
"smlal2 v25.4s, v3.8h, v12.8h\n"
"smlal v26.4s, v3.4h, v14.4h\n"
"smlal2 v27.4s, v3.8h, v14.8h\n"
"smlal v24.4s, v4.4h, v13.4h\n"
"smlal2 v25.4s, v4.8h, v13.8h\n"
"smlal v24.4s, v5.4h, v14.4h\n"
"smlal2 v25.4s, v5.8h, v14.8h\n"
"smlal v24.4s, v6.4h, v15.4h\n"
"smlal2 v25.4s, v6.8h, v15.8h\n"
"smlal v26.4s, v6.4h, v17.4h\n"
"smlal2 v27.4s, v6.8h, v17.8h\n"
"smlal v24.4s, v7.4h, v16.4h\n"
"smlal2 v25.4s, v7.8h, v16.8h\n"
"smlal v24.4s, v8.4h, v17.4h\n"
"uaddw v18.8h, v28.8h, v18.8b\n"
"smlal2 v25.4s, v8.8h, v17.8h\n"
"uaddw v19.8h, v28.8h, v19.8b\n"
"smlal v26.4s, v1.4h, v18.4h\n"
"uaddw v20.8h, v28.8h, v20.8b\n"
"smlal2 v27.4s, v1.8h, v18.8h\n"
"smlal v26.4s, v2.4h, v19.4h\n"
"uaddw v21.8h, v28.8h, v21.8b\n"
"smlal2 v27.4s, v2.8h, v19.8h\n"
"smlal v26.4s, v4.4h, v20.4h\n"
"smlal v26.4s, v5.4h, v21.4h\n"
"smlal2 v27.4s, v4.8h, v20.8h\n"
"uaddw v22.8h, v28.8h, v22.8b\n"
"smlal2 v27.4s, v5.8h, v21.8h\n"
"uaddw v23.8h, v28.8h, v23.8b\n"
"smlal v26.4s, v7.4h, v22.4h\n"
"smlal2 v27.4s, v7.8h, v22.8h\n"
"smlal v26.4s, v8.4h, v23.4h\n"
"smlal2 v27.4s, v8.8h, v23.8h\n"
"dup v28.4s, w1\n"
"dup v29.4s, w9\n"
"sqrdmulh v24.4s, v24.4s, v28.4s\n"
"sqrdmulh v25.4s, v25.4s, v28.4s\n"
"sqrdmulh v26.4s, v26.4s, v28.4s\n"
"sqrdmulh v27.4s, v27.4s, v28.4s\n"
"dup v28.8h, w2\n"
"and v30.16b, v24.16b, v29.16b\n"
"and v31.16b, v25.16b, v29.16b\n"
"sshr v30.4s, v30.4s, #31\n"
"sshr v31.4s, v31.4s, #31\n"
"sqadd v24.4s, v24.4s, v30.4s\n"
"sqadd v25.4s, v25.4s, v31.4s\n"
"and v30.16b, v26.16b, v29.16b\n"
"and v31.16b, v27.16b, v29.16b\n"
"sshr v30.4s, v30.4s, #31\n"
"sshr v31.4s, v31.4s, #31\n"
"sqadd v26.4s, v26.4s, v30.4s\n"
"dup v30.16b, w3\n"
"sqadd v27.4s, v27.4s, v31.4s\n"
"dup v31.16b, w4\n"
"srshl v24.4s, v24.4s, v29.4s\n"
"srshl v25.4s, v25.4s, v29.4s\n"
"srshl v26.4s, v26.4s, v29.4s\n"
"srshl v27.4s, v27.4s, v29.4s\n"
"sqxtn v24.4h, v24.4s\n"
"sqxtn2 v24.8h, v25.4s\n"
"sqxtn v26.4h, v26.4s\n"
"sqxtn2 v26.8h, v27.4s\n"
"sqadd v24.8h, v24.8h, v28.8h\n"
"sqadd v26.8h, v26.8h, v28.8h\n"
"sqxtun v24.8b, v24.8h\n"
"sqxtun2 v24.16b, v26.8h\n"
"dup v28.8h, w0\n"
"umax v24.16b, v24.16b, v30.16b\n"
"umin v24.16b, v24.16b, v31.16b\n"
"st1 {v24.8b}, [x6], x5\n"
"mov v26.d[0], v24.d[1]\n"
"st1 {v26.8b}, [x6]\n"
"b " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
// ---- Single leftover column of the leftover row: one output from the
// 3x3 window already held in v9-v17 ----
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER ":\n"
// v26, v27 and v29 may hold accumulator or scratch values at this point,
// so rebuild them as shift, multiplier and output offset.
"dup v26.4s, w9\n"
"dup v27.4s, w1\n"
"dup v29.8h, w2\n"
"smlal v24.4s, v0.4h, v9.4h\n"
"smlal2 v25.4s, v0.8h, v9.8h\n"
"smlal v24.4s, v1.4h, v10.4h\n"
"smlal2 v25.4s, v1.8h, v10.8h\n"
"smlal v24.4s, v2.4h, v11.4h\n"
"smlal2 v25.4s, v2.8h, v11.8h\n"
"smlal v24.4s, v3.4h, v12.4h\n"
"smlal2 v25.4s, v3.8h, v12.8h\n"
"smlal v24.4s, v4.4h, v13.4h\n"
"smlal2 v25.4s, v4.8h, v13.8h\n"
"smlal v24.4s, v5.4h, v14.4h\n"
"smlal2 v25.4s, v5.8h, v14.8h\n"
"smlal v24.4s, v6.4h, v15.4h\n"
"smlal2 v25.4s, v6.8h, v15.8h\n"
"smlal v24.4s, v7.4h, v16.4h\n"
"smlal2 v25.4s, v7.8h, v16.8h\n"
"smlal v24.4s, v8.4h, v17.4h\n"
"smlal2 v25.4s, v8.8h, v17.8h\n"
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
"sqrdmulh v25.4s, v25.4s, v27.4s\n"
"and v18.16b, v24.16b, v26.16b\n"
"and v19.16b, v25.16b, v26.16b\n"
"sshr v18.4s, v18.4s, #31\n"
"sshr v19.4s, v19.4s, #31\n"
"sqadd v24.4s, v24.4s, v18.4s\n"
"sqadd v25.4s, v25.4s, v19.4s\n"
"srshl v24.4s, v24.4s, v26.4s\n"
"srshl v25.4s, v25.4s, v26.4s\n"
"sqxtn v24.4h, v24.4s\n"
"sqxtn2 v24.8h, v25.4s\n"
"sqadd v24.8h, v24.8h, v29.8h\n"
"sqxtun v24.8b, v24.8h\n"
"umax v24.8b, v24.8b, v30.8b\n"
"umin v24.8b, v24.8b, v31.8b\n"
"st1 {v24.8b}, [x6]\n"
DEPTHWISECONV_LABEL_HEIGHT_1_END ":\n"
:
// Outputs: local copies that the assembly advances or decrements.
[filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
[output_ptr] "+r"(output_ptr),
[output_window_height] "+r"(output_window_height)
:
// Inputs: read-only operands.
[bias_ptr] "r"(bias_ptr), [input_row_size] "r"(input_row_size),
[input_depth] "r"(input_depth),
[output_window_width] "r"(output_window_width),
[input_width_increment] "r"(input_width_increment),
[input_height_increment] "r"(input_height_increment),
[output_height_increment] "r"(output_height_increment),
[params_ptr] "r"(params_ptr)
:
// Clobbers: condition flags, memory, every NEON register, and all
// general-purpose registers written above (w-register writes are listed
// under their x names, including x19 and x20).
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
"v30", "v31",
"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
"x9", "x10", "x11", "x12", "x13", "x14", "x15",
"x19", "x20");
// Release the label macros so other specializations can redefine them.
#undef DEPTHWISECONV_LABEL_HEIGHT_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_1
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_1_END
}
};
// Depthwise-convolution window kernel for the kUpward rounding mode, written
// as AArch64 inline assembly. Every vector load of input or filter data is 8
// bytes wide (one uint8 per channel), nine filter vectors are used per output,
// and each pair of adjacent outputs advances the input by four pixels/rows,
// i.e. two input positions per output in both directions.
// NOTE(review): the template arguments <8, 2, 2> presumably mean depth 8 and
// stride 2x2 -- the primary template is outside this view; confirm there.
//
// Run() parameters:
//   input_ptr, filter_ptr, bias_ptr, output_ptr: start of the data for the 8
//     channels being processed. bias_ptr is read as 8 consecutive int32.
//   input_depth, input_row_size: byte distance between horizontally and
//     vertically adjacent input pixels.
//   output_window_height, output_window_width: number of output rows and
//     columns to produce.
//   params_ptr: quantization parameters and output strides; the assembly reads
//     them through the OFFSET_* byte offsets.
// NOTE(review): the width dispatch only special-cases widths 1 and 2 before
// entering the main loop, so output_window_width is presumably >= 1 whenever
// output_window_height is >= 1 -- confirm against the caller.
template <>
struct DepthwiseConvWindow<DepthwiseConvOutputRounding::kUpward, 8, 2, 2> {
static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
const int32* bias_ptr, uint8* output_ptr,
int64_t input_depth, int64_t input_row_size,
int32 output_window_height, int32 output_window_width,
const DepthwiseConvParams* params_ptr) {
// Two outputs side by side consume four input pixels of horizontal advance.
const int64_t input_width_increment = 4 * input_depth;
// Two output rows consume four input rows of vertical advance.
const int64_t input_height_increment = 4 * input_row_size;
// The two-row loop writes two output rows per iteration.
const int64_t output_height_increment = 2 * params_ptr->output_row_size;
// Local assembler labels used by the branches below; they are #undef'd at the
// end of this function.
#define DEPTHWISECONV_LABEL_HEIGHT_2_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_1 …
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_1_END …
asm volatile(
// Load the parameters into "constant" registers:
//   w9  = value at OFFSET_OUTPUT_RIGHT_SHIFT (shift operand of sqrshl)
//   w0  = input offset        -> v28.8h
//   w1  = output multiplier   -> v27.4s
//   w2  = output offset       -> v29.8h
//   w3/w4 = activation min/max -> v30.16b / v31.16b
//   x5  = output depth (byte step between filter taps and between outputs)
//   x19 = output row size, w20 = filter offset
// Several of these vector registers are reused as temporaries later and are
// rebuilt from the scalar copies with dup when needed.
// The cmp below sets the flags consumed by the blt that follows the filter
// loads; none of the instructions in between modify the condition flags.
"ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
"ldr w0, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
"cmp %w[output_window_height], #2\n"
"dup v28.8h, w0\n"
"ldr w1, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
"dup v26.4s, w9\n"
"ldr w2, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
"dup v27.4s, w1\n"
"ldr w3, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
"dup v29.8h, w2\n"
"ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v30.16b, w3\n"
"ldr x5, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
"dup v31.16b, w4\n"
"ldr x19, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n"
"ldr w20, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
// x10 points at the second group of four int32 biases (bias_ptr + 16 bytes).
// Load the nine filter taps into v0..v8 (8 channels each, x5 bytes apart) and
// widen them to 16 bit while adding the filter offset held in v9.
"add x10, %[bias_ptr], #16\n"
"ld1 {v0.8b}, [%[filter_ptr]], x5\n"
"dup v9.8h, w20\n"
"ld1 {v1.8b}, [%[filter_ptr]], x5\n"
"uaddw v0.8h, v9.8h, v0.8b\n"
"ld1 {v2.8b}, [%[filter_ptr]], x5\n"
"uaddw v1.8h, v9.8h, v1.8b\n"
"ld1 {v3.8b}, [%[filter_ptr]], x5\n"
"uaddw v2.8h, v9.8h, v2.8b\n"
"ld1 {v4.8b}, [%[filter_ptr]], x5\n"
"uaddw v3.8h, v9.8h, v3.8b\n"
"ld1 {v5.8b}, [%[filter_ptr]], x5\n"
"uaddw v4.8h, v9.8h, v4.8b\n"
"ld1 {v6.8b}, [%[filter_ptr]], x5\n"
"uaddw v5.8h, v9.8h, v5.8b\n"
"ld1 {v7.8b}, [%[filter_ptr]], x5\n"
"uaddw v6.8h, v9.8h, v6.8b\n"
"ld1 {v8.8b}, [%[filter_ptr]]\n"
"uaddw v7.8h, v9.8h, v7.8b\n"
"uaddw v8.8h, v9.8h, v8.8b\n"
// Fewer than two output rows requested: skip the two-row loop.
"blt " DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "f\n"
// ---- Two output rows per iteration -------------------------------------
// x11 = start of the current input window; x12/x13/x15 walk input rows 0/1/2.
// w14 = output columns still to produce in this row pair.
// x6/x7 = write pointers for the upper and lower output row.
// The first three columns of input rows 0 and 1 go to v9..v11 and v14..v16.
// Accumulators are seeded with the bias: v21/v22 (upper-left output),
// v23/v24 (upper-right), v19/v20 (lower-left), v25/v26 (lower-right).
// Seeding v26 overwrites the shift vector set up above, which is why the
// requantization code below rebuilds the shift in v28 with dup.
DEPTHWISECONV_LABEL_HEIGHT_2_LOOP ":\n"
"mov x11, %[input_ptr]\n"
"mov x12, x11\n"
"add x13, x12, %[input_row_size]\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"mov w14, %w[output_window_width]\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
// Flags from this compare are consumed by the beq after the setup loads.
"cmp w14, #2\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"add x15, x13, %[input_row_size]\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"mov x6, %[output_ptr]\n"
"ld1 {v15.8b}, [x13], %[input_depth]\n"
"add x7, %[output_ptr], x19\n"
"ld1 {v16.8b}, [x13], %[input_depth]\n"
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"ld1 {v22.4s}, [x10]\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
"uaddw v9.8h, v28.8h, v9.8b\n"
"ld1 {v24.4s}, [x10]\n"
"uaddw v10.8h, v28.8h, v10.8b\n"
"ld1 {v19.4s}, [%[bias_ptr]]\n"
"uaddw v11.8h, v28.8h, v11.8b\n"
"ld1 {v20.4s}, [x10]\n"
"uaddw v14.8h, v28.8h, v14.8b\n"
"ld1 {v25.4s}, [%[bias_ptr]]\n"
"uaddw v15.8h, v28.8h, v15.8b\n"
"ld1 {v26.4s}, [x10]\n"
"uaddw v16.8h, v28.8h, v16.8b\n"
// Widths of exactly 2 or 1 go straight to the leftover paths; the main loop
// preloads data for a following iteration and is only used for wider windows.
"beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "f\n"
"cmp w14, #1\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n"
// Main 2x2 loop. Rows 0 and 1 are consumed first (columns 3 and 4 arrive in
// v12/v13 and v17/v18); the same registers are then reloaded with rows 2 and
// 3 (x12 is repointed to row 3, x13 to row 4), and finally v9..v13 receive
// row 4. Loads are issued as soon as the previous contents of a register
// have been consumed.
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"ld1 {v12.8b}, [x12], %[input_depth]\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"ld1 {v13.8b}, [x12]\n"
"add x12, x15, %[input_row_size]\n"
"smlal v23.4s, v0.4h, v11.4h\n"
"ld1 {v17.8b}, [x13], %[input_depth]\n"
"smlal2 v24.4s, v0.8h, v11.8h\n"
"ld1 {v18.8b}, [x13]\n"
"add x13, x12, %[input_row_size]\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"ld1 {v9.8b}, [x15], %[input_depth]\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"ld1 {v10.8b}, [x15], %[input_depth]\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"ld1 {v11.8b}, [x15], %[input_depth]\n"
"smlal v21.4s, v3.4h, v14.4h\n"
"smlal2 v22.4s, v3.8h, v14.8h\n"
"ld1 {v14.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v3.4h, v16.4h\n"
// Two outputs are accounted for; the compare against 3 sets the flags for
// the bge at the bottom of the loop (no instruction in between touches them).
"subs w14, w14, #2\n"
"smlal2 v24.4s, v3.8h, v16.8h\n"
"cmp w14, #3\n"
"smlal v21.4s, v4.4h, v15.4h\n"
"uaddw v12.8h, v28.8h, v12.8b\n"
"smlal2 v22.4s, v4.8h, v15.8h\n"
"ld1 {v15.8b}, [x12], %[input_depth]\n"
"smlal v21.4s, v5.4h, v16.4h\n"
"uaddw v13.8h, v28.8h, v13.8b\n"
"smlal2 v22.4s, v5.8h, v16.8h\n"
"ld1 {v16.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v1.4h, v12.4h\n"
"uaddw v17.8h, v28.8h, v17.8b\n"
"smlal2 v24.4s, v1.8h, v12.8h\n"
"ld1 {v12.8b}, [x15], %[input_depth]\n"
"smlal v23.4s, v2.4h, v13.4h\n"
"uaddw v18.8h, v28.8h, v18.8b\n"
"smlal2 v24.4s, v2.8h, v13.8h\n"
"ld1 {v13.8b}, [x15]\n"
"smlal v23.4s, v4.4h, v17.4h\n"
"uaddw v9.8h, v28.8h, v9.8b\n"
"smlal2 v24.4s, v4.8h, v17.8h\n"
"ld1 {v17.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v5.4h, v18.4h\n"
"uaddw v10.8h, v28.8h, v10.8b\n"
"smlal2 v24.4s, v5.8h, v18.8h\n"
"ld1 {v18.8b}, [x12]\n"
// Input row 2 (now in v9..v13) is the last filter row for the upper outputs
// and the first filter row for the lower outputs.
"smlal v21.4s, v6.4h, v9.4h\n"
"smlal2 v22.4s, v6.8h, v9.8h\n"
"smlal v19.4s, v0.4h, v9.4h\n"
"uaddw v11.8h, v28.8h, v11.8b\n"
"smlal2 v20.4s, v0.8h, v9.8h\n"
"ld1 {v9.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v6.4h, v11.4h\n"
"smlal2 v24.4s, v6.8h, v11.8h\n"
"smlal v21.4s, v7.4h, v10.4h\n"
"smlal2 v22.4s, v7.8h, v10.8h\n"
"uaddw v12.8h, v28.8h, v12.8b\n"
"smlal v19.4s, v1.4h, v10.4h\n"
"smlal2 v20.4s, v1.8h, v10.8h\n"
"ld1 {v10.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v7.4h, v12.4h\n"
"smlal2 v24.4s, v7.8h, v12.8h\n"
"smlal v25.4s, v1.4h, v12.4h\n"
"smlal2 v26.4s, v1.8h, v12.8h\n"
"smlal v21.4s, v8.4h, v11.4h\n"
"smlal2 v22.4s, v8.8h, v11.8h\n"
// Advance the window base to the next pair of outputs.
"add x11, x11, %[input_width_increment]\n"
"smlal v19.4s, v2.4h, v11.4h\n"
"mov x12, x11\n"
"smlal2 v20.4s, v2.8h, v11.8h\n"
"uaddw v13.8h, v28.8h, v13.8b\n"
"smlal v25.4s, v0.4h, v11.4h\n"
"smlal2 v26.4s, v0.8h, v11.8h\n"
"ld1 {v11.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v8.4h, v13.4h\n"
"ld1 {v12.8b}, [x13], %[input_depth]\n"
"smlal2 v24.4s, v8.8h, v13.8h\n"
"smlal v25.4s, v2.4h, v13.4h\n"
"smlal2 v26.4s, v2.8h, v13.8h\n"
"ld1 {v13.8b}, [x13]\n"
// Row pointers for the next window: x13 = row 1, x15 = row 2.
"add x13, x12, %[input_row_size]\n"
"add x15, x13, %[input_row_size]\n"
// Requantize the two upper outputs: multiply by the output multiplier
// (sqrdmulh), apply the rounding shift (sqrshl, shift temporarily held in
// v28), narrow to 16 bit, add the output offset, narrow to uint8 and clamp to
// the activation range. v28 is restored to the input offset right after the
// shifts.
"dup v28.4s, w9\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
"sqrshl v21.4s, v21.4s, v28.4s\n"
"sqrshl v22.4s, v22.4s, v28.4s\n"
"sqrshl v23.4s, v23.4s, v28.4s\n"
"sqrshl v24.4s, v24.4s, v28.4s\n"
"dup v28.8h, w0\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtun v21.8b, v21.8h\n"
"sqxtun2 v21.16b, v23.8h\n"
"ld1 {v22.4s}, [x10]\n"
"umax v21.16b, v21.16b, v30.16b\n"
"umin v21.16b, v21.16b, v31.16b\n"
"ld1 {v24.4s}, [x10]\n"
"uaddw v9.8h, v28.8h, v9.8b\n"
// Store the upper-left output, then the upper-right one (upper half of v21),
// each followed by a step of x5 bytes.
"st1 {v21.8b}, [x6], x5\n"
"uaddw v10.8h, v28.8h, v10.8b\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [x6], x5\n"
"uaddw v11.8h, v28.8h, v11.8b\n"
// Finish the lower outputs with input rows 4 (v9..v13) and 3 (v14..v18),
// while preloading rows 0 and 1 of the next window into v9..v11 / v14..v16
// and reseeding the upper accumulators with the bias.
"smlal v19.4s, v6.4h, v9.4h\n"
"smlal2 v20.4s, v6.8h, v9.8h\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"smlal v25.4s, v6.4h, v11.4h\n"
"smlal2 v26.4s, v6.8h, v11.8h\n"
"smlal v19.4s, v7.4h, v10.4h\n"
"uaddw v12.8h, v28.8h, v12.8b\n"
"smlal2 v20.4s, v7.8h, v10.8h\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"smlal v25.4s, v7.4h, v12.4h\n"
"smlal2 v26.4s, v7.8h, v12.8h\n"
"smlal v19.4s, v8.4h, v11.4h\n"
"uaddw v13.8h, v28.8h, v13.8b\n"
"smlal2 v20.4s, v8.8h, v11.8h\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"smlal v25.4s, v8.4h, v13.4h\n"
"uaddw v14.8h, v28.8h, v14.8b\n"
"smlal2 v26.4s, v8.8h, v13.8h\n"
"uaddw v16.8h, v28.8h, v16.8b\n"
"smlal v19.4s, v3.4h, v14.4h\n"
"uaddw v15.8h, v28.8h, v15.8b\n"
"smlal2 v20.4s, v3.8h, v14.8h\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"smlal v25.4s, v3.4h, v16.4h\n"
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"smlal2 v26.4s, v3.8h, v16.8h\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
"smlal v19.4s, v4.4h, v15.4h\n"
"uaddw v17.8h, v28.8h, v17.8b\n"
"smlal2 v20.4s, v4.8h, v15.8h\n"
"ld1 {v15.8b}, [x13], %[input_depth]\n"
"smlal v25.4s, v4.4h, v17.4h\n"
"smlal2 v26.4s, v4.8h, v17.8h\n"
"smlal v19.4s, v5.4h, v16.4h\n"
"uaddw v18.8h, v28.8h, v18.8b\n"
"smlal2 v20.4s, v5.8h, v16.8h\n"
"ld1 {v16.8b}, [x13], %[input_depth]\n"
"smlal v25.4s, v5.4h, v18.4h\n"
"smlal2 v26.4s, v5.8h, v18.8h\n"
// Requantize and store the two lower outputs (same sequence as above), then
// reseed the lower accumulators and add the input offset to the preloaded
// inputs.
"dup v28.4s, w9\n"
"sqrdmulh v19.4s, v19.4s, v27.4s\n"
"sqrdmulh v20.4s, v20.4s, v27.4s\n"
"sqrdmulh v25.4s, v25.4s, v27.4s\n"
"sqrdmulh v26.4s, v26.4s, v27.4s\n"
"sqrshl v19.4s, v19.4s, v28.4s\n"
"sqrshl v20.4s, v20.4s, v28.4s\n"
"sqrshl v25.4s, v25.4s, v28.4s\n"
"sqrshl v26.4s, v26.4s, v28.4s\n"
"dup v28.8h, w0\n"
"sqxtn v19.4h, v19.4s\n"
"sqxtn2 v19.8h, v20.4s\n"
"sqxtn v25.4h, v25.4s\n"
"sqxtn2 v25.8h, v26.4s\n"
"sqadd v19.8h, v19.8h, v29.8h\n"
"sqadd v25.8h, v25.8h, v29.8h\n"
"sqxtun v19.8b, v19.8h\n"
"sqxtun2 v19.16b, v25.8h\n"
"ld1 {v20.4s}, [x10]\n"
"umax v19.16b, v19.16b, v30.16b\n"
"umin v19.16b, v19.16b, v31.16b\n"
"ld1 {v26.4s}, [x10]\n"
"uaddw v9.8h, v28.8h, v9.8b\n"
"st1 {v19.8b}, [x7], x5\n"
"uaddw v10.8h, v28.8h, v10.8b\n"
"mov v25.d[0], v19.d[1]\n"
"st1 {v25.8b}, [x7], x5\n"
"uaddw v11.8h, v28.8h, v11.8b\n"
"ld1 {v19.4s}, [%[bias_ptr]]\n"
"uaddw v14.8h, v28.8h, v14.8b\n"
"ld1 {v25.4s}, [%[bias_ptr]]\n"
"uaddw v15.8h, v28.8h, v15.8b\n"
"uaddw v16.8h, v28.8h, v16.8b\n"
// Loop again while at least three output columns remain (flags from the
// "cmp w14, #3" above); otherwise finish with the 2-wide or 1-wide path.
"bge " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "b\n"
"cmp w14, #2\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n"
// Last 2x2 block of the row pair: same arithmetic as the main loop but
// without advancing x11 or preloading a following window.
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"ld1 {v12.8b}, [x12], %[input_depth]\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"ld1 {v13.8b}, [x12]\n"
"add x12, x15, %[input_row_size]\n"
"smlal v23.4s, v0.4h, v11.4h\n"
"ld1 {v17.8b}, [x13], %[input_depth]\n"
"smlal2 v24.4s, v0.8h, v11.8h\n"
"ld1 {v18.8b}, [x13]\n"
"add x13, x12, %[input_row_size]\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"ld1 {v9.8b}, [x15], %[input_depth]\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"ld1 {v10.8b}, [x15], %[input_depth]\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"ld1 {v11.8b}, [x15], %[input_depth]\n"
"smlal v21.4s, v3.4h, v14.4h\n"
"smlal2 v22.4s, v3.8h, v14.8h\n"
"ld1 {v14.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v3.4h, v16.4h\n"
"smlal2 v24.4s, v3.8h, v16.8h\n"
"smlal v21.4s, v4.4h, v15.4h\n"
"uaddw v12.8h, v28.8h, v12.8b\n"
"smlal2 v22.4s, v4.8h, v15.8h\n"
"ld1 {v15.8b}, [x12], %[input_depth]\n"
"smlal v21.4s, v5.4h, v16.4h\n"
"uaddw v13.8h, v28.8h, v13.8b\n"
"smlal2 v22.4s, v5.8h, v16.8h\n"
"ld1 {v16.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v1.4h, v12.4h\n"
"uaddw v17.8h, v28.8h, v17.8b\n"
"smlal2 v24.4s, v1.8h, v12.8h\n"
"ld1 {v12.8b}, [x15], %[input_depth]\n"
"smlal v23.4s, v2.4h, v13.4h\n"
"uaddw v18.8h, v28.8h, v18.8b\n"
"smlal2 v24.4s, v2.8h, v13.8h\n"
"ld1 {v13.8b}, [x15]\n"
"smlal v23.4s, v4.4h, v17.4h\n"
"uaddw v9.8h, v28.8h, v9.8b\n"
"smlal2 v24.4s, v4.8h, v17.8h\n"
"ld1 {v17.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v5.4h, v18.4h\n"
"uaddw v10.8h, v28.8h, v10.8b\n"
"smlal2 v24.4s, v5.8h, v18.8h\n"
"ld1 {v18.8b}, [x12]\n"
"smlal v21.4s, v6.4h, v9.4h\n"
"smlal2 v22.4s, v6.8h, v9.8h\n"
"smlal v19.4s, v0.4h, v9.4h\n"
"uaddw v11.8h, v28.8h, v11.8b\n"
"smlal2 v20.4s, v0.8h, v9.8h\n"
"ld1 {v9.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v6.4h, v11.4h\n"
"smlal2 v24.4s, v6.8h, v11.8h\n"
"smlal v21.4s, v7.4h, v10.4h\n"
"smlal2 v22.4s, v7.8h, v10.8h\n"
"uaddw v12.8h, v28.8h, v12.8b\n"
"smlal v19.4s, v1.4h, v10.4h\n"
"smlal2 v20.4s, v1.8h, v10.8h\n"
"ld1 {v10.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v7.4h, v12.4h\n"
"smlal2 v24.4s, v7.8h, v12.8h\n"
"smlal v25.4s, v1.4h, v12.4h\n"
"smlal2 v26.4s, v1.8h, v12.8h\n"
"smlal v21.4s, v8.4h, v11.4h\n"
"smlal2 v22.4s, v8.8h, v11.8h\n"
"smlal v19.4s, v2.4h, v11.4h\n"
"smlal2 v20.4s, v2.8h, v11.8h\n"
"uaddw v13.8h, v28.8h, v13.8b\n"
"smlal v25.4s, v0.4h, v11.4h\n"
"smlal2 v26.4s, v0.8h, v11.8h\n"
"ld1 {v11.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v8.4h, v13.4h\n"
"ld1 {v12.8b}, [x13], %[input_depth]\n"
"smlal2 v24.4s, v8.8h, v13.8h\n"
"smlal v25.4s, v2.4h, v13.4h\n"
"smlal2 v26.4s, v2.8h, v13.8h\n"
"ld1 {v13.8b}, [x13]\n"
// Requantize and store the upper pair.
"dup v28.4s, w9\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
"sqrshl v21.4s, v21.4s, v28.4s\n"
"sqrshl v22.4s, v22.4s, v28.4s\n"
"sqrshl v23.4s, v23.4s, v28.4s\n"
"sqrshl v24.4s, v24.4s, v28.4s\n"
"dup v28.8h, w0\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtun v21.8b, v21.8h\n"
"sqxtun2 v21.16b, v23.8h\n"
"ld1 {v22.4s}, [x10]\n"
"umax v21.16b, v21.16b, v30.16b\n"
"umin v21.16b, v21.16b, v31.16b\n"
"ld1 {v24.4s}, [x10]\n"
"uaddw v9.8h, v28.8h, v9.8b\n"
"st1 {v21.8b}, [x6], x5\n"
"uaddw v10.8h, v28.8h, v10.8b\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [x6]\n"
"uaddw v11.8h, v28.8h, v11.8b\n"
// Finish, requantize and store the lower pair.
"smlal v19.4s, v6.4h, v9.4h\n"
"smlal2 v20.4s, v6.8h, v9.8h\n"
"smlal v25.4s, v6.4h, v11.4h\n"
"smlal2 v26.4s, v6.8h, v11.8h\n"
"smlal v19.4s, v7.4h, v10.4h\n"
"uaddw v12.8h, v28.8h, v12.8b\n"
"smlal2 v20.4s, v7.8h, v10.8h\n"
"smlal v25.4s, v7.4h, v12.4h\n"
"smlal2 v26.4s, v7.8h, v12.8h\n"
"smlal v19.4s, v8.4h, v11.4h\n"
"uaddw v13.8h, v28.8h, v13.8b\n"
"smlal2 v20.4s, v8.8h, v11.8h\n"
"smlal v25.4s, v8.4h, v13.4h\n"
"uaddw v14.8h, v28.8h, v14.8b\n"
"smlal2 v26.4s, v8.8h, v13.8h\n"
"uaddw v16.8h, v28.8h, v16.8b\n"
"smlal v19.4s, v3.4h, v14.4h\n"
"uaddw v15.8h, v28.8h, v15.8b\n"
"smlal2 v20.4s, v3.8h, v14.8h\n"
"smlal v25.4s, v3.4h, v16.4h\n"
"smlal2 v26.4s, v3.8h, v16.8h\n"
"smlal v19.4s, v4.4h, v15.4h\n"
"uaddw v17.8h, v28.8h, v17.8b\n"
"smlal2 v20.4s, v4.8h, v15.8h\n"
"smlal v25.4s, v4.4h, v17.4h\n"
"smlal2 v26.4s, v4.8h, v17.8h\n"
"smlal v19.4s, v5.4h, v16.4h\n"
"uaddw v18.8h, v28.8h, v18.8b\n"
"smlal2 v20.4s, v5.8h, v16.8h\n"
"smlal v25.4s, v5.4h, v18.4h\n"
"smlal2 v26.4s, v5.8h, v18.8h\n"
"dup v28.4s, w9\n"
"sqrdmulh v19.4s, v19.4s, v27.4s\n"
"sqrdmulh v20.4s, v20.4s, v27.4s\n"
"sqrdmulh v25.4s, v25.4s, v27.4s\n"
"sqrdmulh v26.4s, v26.4s, v27.4s\n"
"sqrshl v19.4s, v19.4s, v28.4s\n"
"sqrshl v20.4s, v20.4s, v28.4s\n"
"sqrshl v25.4s, v25.4s, v28.4s\n"
"sqrshl v26.4s, v26.4s, v28.4s\n"
"dup v28.8h, w0\n"
"sqxtn v19.4h, v19.4s\n"
"sqxtn2 v19.8h, v20.4s\n"
"sqxtn v25.4h, v25.4s\n"
"sqxtn2 v25.8h, v26.4s\n"
"sqadd v19.8h, v19.8h, v29.8h\n"
"sqadd v25.8h, v25.8h, v29.8h\n"
"sqxtun v19.8b, v19.8h\n"
"sqxtun2 v19.16b, v25.8h\n"
"umax v19.16b, v19.16b, v30.16b\n"
"umin v19.16b, v19.16b, v31.16b\n"
"st1 {v19.8b}, [x7], x5\n"
"mov v25.d[0], v19.d[1]\n"
"st1 {v25.8b}, [x7]\n"
"b " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "f\n"
// Single remaining output column for the row pair. Rows 0 and 1 are already
// in v9..v11 / v14..v16; row 2 is loaded into v12, v13, v17, row 3 into
// v9..v11 and row 4 into v14..v16. v21/v22 accumulate the upper output and
// v23/v24 the lower one. The shift vector is rebuilt in v26, which held a
// bias seed that this path does not need.
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER ":\n"
"add x12, x15, %[input_row_size]\n"
"add x13, x12, %[input_row_size]\n"
"ld1 {v12.8b}, [x15], %[input_depth]\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"ld1 {v13.8b}, [x15], %[input_depth]\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"ld1 {v17.8b}, [x15]\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"ld1 {v11.8b}, [x12]\n"
"smlal v21.4s, v3.4h, v14.4h\n"
"smlal2 v22.4s, v3.8h, v14.8h\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"smlal v21.4s, v4.4h, v15.4h\n"
"smlal2 v22.4s, v4.8h, v15.8h\n"
"ld1 {v15.8b}, [x13], %[input_depth]\n"
"smlal v21.4s, v5.4h, v16.4h\n"
"uaddw v12.8h, v28.8h, v12.8b\n"
"smlal2 v22.4s, v5.8h, v16.8h\n"
"uaddw v13.8h, v28.8h, v13.8b\n"
"ld1 {v16.8b}, [x13]\n"
"smlal v21.4s, v6.4h, v12.4h\n"
"smlal2 v22.4s, v6.8h, v12.8h\n"
"smlal v23.4s, v0.4h, v12.4h\n"
"uaddw v17.8h, v28.8h, v17.8b\n"
"smlal2 v24.4s, v0.8h, v12.8h\n"
"smlal v21.4s, v7.4h, v13.4h\n"
"smlal2 v22.4s, v7.8h, v13.8h\n"
"smlal v23.4s, v1.4h, v13.4h\n"
"smlal2 v24.4s, v1.8h, v13.8h\n"
"smlal v21.4s, v8.4h, v17.4h\n"
"smlal2 v22.4s, v8.8h, v17.8h\n"
"smlal v23.4s, v2.4h, v17.4h\n"
"smlal2 v24.4s, v2.8h, v17.8h\n"
"dup v26.4s, w9\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
"sqrshl v21.4s, v21.4s, v26.4s\n"
"sqrshl v22.4s, v22.4s, v26.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqxtun v21.8b, v21.8h\n"
"umax v21.8b, v21.8b, v30.8b\n"
"umin v21.8b, v21.8b, v31.8b\n"
"uaddw v9.8h, v28.8h, v9.8b\n"
"st1 {v21.8b}, [x6]\n"
"uaddw v10.8h, v28.8h, v10.8b\n"
"smlal v23.4s, v3.4h, v9.4h\n"
"uaddw v11.8h, v28.8h, v11.8b\n"
"smlal2 v24.4s, v3.8h, v9.8h\n"
"uaddw v14.8h, v28.8h, v14.8b\n"
"smlal v23.4s, v4.4h, v10.4h\n"
"uaddw v15.8h, v28.8h, v15.8b\n"
"smlal2 v24.4s, v4.8h, v10.8h\n"
"uaddw v16.8h, v28.8h, v16.8b\n"
"smlal v23.4s, v5.4h, v11.4h\n"
"smlal2 v24.4s, v5.8h, v11.8h\n"
"smlal v23.4s, v6.4h, v14.4h\n"
"smlal2 v24.4s, v6.8h, v14.8h\n"
"smlal v23.4s, v7.4h, v15.4h\n"
"smlal2 v24.4s, v7.8h, v15.8h\n"
"smlal v23.4s, v8.4h, v16.4h\n"
"smlal2 v24.4s, v8.8h, v16.8h\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
"sqrshl v23.4s, v23.4s, v26.4s\n"
"sqrshl v24.4s, v24.4s, v26.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtun v23.8b, v23.8h\n"
"umax v23.8b, v23.8b, v30.8b\n"
"umin v23.8b, v23.8b, v31.8b\n"
"st1 {v23.8b}, [x7]\n"
// Row pair finished: step input_ptr down four input rows and output_ptr down
// two output rows, and repeat while at least two output rows remain.
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP ":\n"
"subs %w[output_window_height], %w[output_window_height], #2\n"
"add %[input_ptr], %[input_ptr], %[input_height_increment]\n"
"cmp %w[output_window_height], #2\n"
"add %[output_ptr], %[output_ptr], %[output_height_increment]\n"
"bge " DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "b\n"
// At most one output row is left; exit if there is none.
DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP ":\n"
"cmp %w[output_window_height], #1\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
// ---- Single remaining output row ---------------------------------------
// Input rows 0/1/2 are read through x12/x13/x15 into v9..v11, v12..v14 and
// v15..v17. Accumulators: v24/v25 (left output) and v26/v27 (right output).
// Seeding v27 overwrites the multiplier vector, so the multiplier, shift and
// output offset are rebuilt from w1/w9/w2 in v28/v29 before requantizing,
// and v28 is restored to the input offset afterwards.
DEPTHWISECONV_LABEL_HEIGHT_1 ":\n"
"mov x11, %[input_ptr]\n"
"mov x12, x11\n"
"add x13, x12, %[input_row_size]\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"add x15, x13, %[input_row_size]\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"mov x6, %[output_ptr]\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"mov w14, %w[output_window_width]\n"
// Flags from this compare are consumed by the beq after the setup below.
"cmp w14, #2\n"
"ld1 {v12.8b}, [x13], %[input_depth]\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"ld1 {v15.8b}, [x15], %[input_depth]\n"
"ld1 {v16.8b}, [x15], %[input_depth]\n"
"ld1 {v17.8b}, [x15], %[input_depth]\n"
"uaddw v9.8h, v28.8h, v9.8b\n"
"ld1 {v24.4s}, [%[bias_ptr]]\n"
"uaddw v10.8h, v28.8h, v10.8b\n"
"ld1 {v25.4s}, [x10]\n"
"uaddw v11.8h, v28.8h, v11.8b\n"
"ld1 {v26.4s}, [%[bias_ptr]]\n"
"ld1 {v27.4s}, [x10]\n"
"uaddw v12.8h, v28.8h, v12.8b\n"
"uaddw v13.8h, v28.8h, v13.8b\n"
"uaddw v14.8h, v28.8h, v14.8b\n"
"uaddw v15.8h, v28.8h, v15.8b\n"
"uaddw v16.8h, v28.8h, v16.8b\n"
"uaddw v17.8h, v28.8h, v17.8b\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER "f\n"
"cmp w14, #1\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n"
// Main 1x2 loop: columns 3 and 4 of the three rows arrive in v18..v23 while
// columns 0..2 are consumed, and the first three columns of the next window
// are preloaded into v9..v17 once their old contents are no longer needed.
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP ":\n"
"smlal v24.4s, v0.4h, v9.4h\n"
"ld1 {v18.8b}, [x12], %[input_depth]\n"
"smlal2 v25.4s, v0.8h, v9.8h\n"
"ld1 {v19.8b}, [x12]\n"
"smlal v26.4s, v0.4h, v11.4h\n"
"ld1 {v20.8b}, [x13], %[input_depth]\n"
"smlal2 v27.4s, v0.8h, v11.8h\n"
"ld1 {v21.8b}, [x13]\n"
"smlal v24.4s, v1.4h, v10.4h\n"
"ld1 {v22.8b}, [x15], %[input_depth]\n"
"smlal2 v25.4s, v1.8h, v10.8h\n"
"ld1 {v23.8b}, [x15]\n"
"smlal v24.4s, v2.4h, v11.4h\n"
// Flags from the compare against 3 are consumed by the bge at the loop end.
"subs w14, w14, #2\n"
"smlal2 v25.4s, v2.8h, v11.8h\n"
"cmp w14, #3\n"
"smlal v24.4s, v3.4h, v12.4h\n"
"add x11, x11, %[input_width_increment]\n"
"smlal2 v25.4s, v3.8h, v12.8h\n"
"mov x12, x11\n"
"smlal v26.4s, v3.4h, v14.4h\n"
"add x13, x12, %[input_row_size]\n"
"smlal2 v27.4s, v3.8h, v14.8h\n"
"add x15, x13, %[input_row_size]\n"
"smlal v24.4s, v4.4h, v13.4h\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"smlal2 v25.4s, v4.8h, v13.8h\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"smlal v24.4s, v5.4h, v14.4h\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"smlal2 v25.4s, v5.8h, v14.8h\n"
"ld1 {v12.8b}, [x13], %[input_depth]\n"
"smlal v24.4s, v6.4h, v15.4h\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"smlal2 v25.4s, v6.8h, v15.8h\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"smlal v26.4s, v6.4h, v17.4h\n"
"ld1 {v15.8b}, [x15], %[input_depth]\n"
"smlal2 v27.4s, v6.8h, v17.8h\n"
"smlal v24.4s, v7.4h, v16.4h\n"
"smlal2 v25.4s, v7.8h, v16.8h\n"
"ld1 {v16.8b}, [x15], %[input_depth]\n"
"smlal v24.4s, v8.4h, v17.4h\n"
"uaddw v18.8h, v28.8h, v18.8b\n"
"smlal2 v25.4s, v8.8h, v17.8h\n"
"ld1 {v17.8b}, [x15], %[input_depth]\n"
"uaddw v19.8h, v28.8h, v19.8b\n"
"smlal v26.4s, v1.4h, v18.4h\n"
"uaddw v20.8h, v28.8h, v20.8b\n"
"smlal2 v27.4s, v1.8h, v18.8h\n"
"smlal v26.4s, v2.4h, v19.4h\n"
"uaddw v21.8h, v28.8h, v21.8b\n"
"smlal2 v27.4s, v2.8h, v19.8h\n"
"smlal v26.4s, v4.4h, v20.4h\n"
"smlal v26.4s, v5.4h, v21.4h\n"
"smlal2 v27.4s, v4.8h, v20.8h\n"
"uaddw v22.8h, v28.8h, v22.8b\n"
"smlal2 v27.4s, v5.8h, v21.8h\n"
"uaddw v23.8h, v28.8h, v23.8b\n"
"smlal v26.4s, v7.4h, v22.4h\n"
"smlal2 v27.4s, v7.8h, v22.8h\n"
"smlal v26.4s, v8.4h, v23.4h\n"
"smlal2 v27.4s, v8.8h, v23.8h\n"
// Requantize: v28 = multiplier, v29 = shift, then v28 = output offset, and
// finally v28 = input offset again for the next iteration.
"dup v28.4s, w1\n"
"dup v29.4s, w9\n"
"sqrdmulh v24.4s, v24.4s, v28.4s\n"
"sqrdmulh v25.4s, v25.4s, v28.4s\n"
"sqrdmulh v26.4s, v26.4s, v28.4s\n"
"sqrdmulh v27.4s, v27.4s, v28.4s\n"
"dup v28.8h, w2\n"
"sqrshl v24.4s, v24.4s, v29.4s\n"
"sqrshl v25.4s, v25.4s, v29.4s\n"
"sqrshl v26.4s, v26.4s, v29.4s\n"
"sqrshl v27.4s, v27.4s, v29.4s\n"
"sqxtn v24.4h, v24.4s\n"
"sqxtn2 v24.8h, v25.4s\n"
"sqxtn v26.4h, v26.4s\n"
"sqxtn2 v26.8h, v27.4s\n"
"sqadd v24.8h, v24.8h, v28.8h\n"
"sqadd v26.8h, v26.8h, v28.8h\n"
"sqxtun v24.8b, v24.8h\n"
"sqxtun2 v24.16b, v26.8h\n"
"dup v28.8h, w0\n"
"ld1 {v25.4s}, [x10]\n"
"umax v24.16b, v24.16b, v30.16b\n"
"umin v24.16b, v24.16b, v31.16b\n"
"ld1 {v27.4s}, [x10]\n"
"uaddw v9.8h, v28.8h, v9.8b\n"
"st1 {v24.8b}, [x6], x5\n"
"uaddw v10.8h, v28.8h, v10.8b\n"
"mov v26.d[0], v24.d[1]\n"
"st1 {v26.8b}, [x6], x5\n"
"uaddw v11.8h, v28.8h, v11.8b\n"
"uaddw v12.8h, v28.8h, v12.8b\n"
"uaddw v13.8h, v28.8h, v13.8b\n"
"uaddw v14.8h, v28.8h, v14.8b\n"
"ld1 {v24.4s}, [%[bias_ptr]]\n"
"uaddw v15.8h, v28.8h, v15.8b\n"
"ld1 {v26.4s}, [%[bias_ptr]]\n"
"uaddw v16.8h, v28.8h, v16.8b\n"
"uaddw v17.8h, v28.8h, v17.8b\n"
// Loop again while at least three output columns remain; otherwise finish
// with the 2-wide or 1-wide path.
"bge " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "b\n"
"cmp w14, #2\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n"
// Last 1x2 block of the row: same arithmetic without preloading.
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER ":\n"
"smlal v24.4s, v0.4h, v9.4h\n"
"ld1 {v18.8b}, [x12], %[input_depth]\n"
"smlal2 v25.4s, v0.8h, v9.8h\n"
"ld1 {v19.8b}, [x12]\n"
"smlal v26.4s, v0.4h, v11.4h\n"
"ld1 {v20.8b}, [x13], %[input_depth]\n"
"smlal2 v27.4s, v0.8h, v11.8h\n"
"ld1 {v21.8b}, [x13]\n"
"smlal v24.4s, v1.4h, v10.4h\n"
"ld1 {v22.8b}, [x15], %[input_depth]\n"
"smlal2 v25.4s, v1.8h, v10.8h\n"
"ld1 {v23.8b}, [x15]\n"
"smlal v24.4s, v2.4h, v11.4h\n"
"smlal2 v25.4s, v2.8h, v11.8h\n"
"smlal v24.4s, v3.4h, v12.4h\n"
"smlal2 v25.4s, v3.8h, v12.8h\n"
"smlal v26.4s, v3.4h, v14.4h\n"
"smlal2 v27.4s, v3.8h, v14.8h\n"
"smlal v24.4s, v4.4h, v13.4h\n"
"smlal2 v25.4s, v4.8h, v13.8h\n"
"smlal v24.4s, v5.4h, v14.4h\n"
"smlal2 v25.4s, v5.8h, v14.8h\n"
"smlal v24.4s, v6.4h, v15.4h\n"
"smlal2 v25.4s, v6.8h, v15.8h\n"
"smlal v26.4s, v6.4h, v17.4h\n"
"smlal2 v27.4s, v6.8h, v17.8h\n"
"smlal v24.4s, v7.4h, v16.4h\n"
"smlal2 v25.4s, v7.8h, v16.8h\n"
"smlal v24.4s, v8.4h, v17.4h\n"
"uaddw v18.8h, v28.8h, v18.8b\n"
"smlal2 v25.4s, v8.8h, v17.8h\n"
"uaddw v19.8h, v28.8h, v19.8b\n"
"smlal v26.4s, v1.4h, v18.4h\n"
"uaddw v20.8h, v28.8h, v20.8b\n"
"smlal2 v27.4s, v1.8h, v18.8h\n"
"smlal v26.4s, v2.4h, v19.4h\n"
"uaddw v21.8h, v28.8h, v21.8b\n"
"smlal2 v27.4s, v2.8h, v19.8h\n"
"smlal v26.4s, v4.4h, v20.4h\n"
"smlal v26.4s, v5.4h, v21.4h\n"
"smlal2 v27.4s, v4.8h, v20.8h\n"
"uaddw v22.8h, v28.8h, v22.8b\n"
"smlal2 v27.4s, v5.8h, v21.8h\n"
"uaddw v23.8h, v28.8h, v23.8b\n"
"smlal v26.4s, v7.4h, v22.4h\n"
"smlal2 v27.4s, v7.8h, v22.8h\n"
"smlal v26.4s, v8.4h, v23.4h\n"
"smlal2 v27.4s, v8.8h, v23.8h\n"
"dup v28.4s, w1\n"
"dup v29.4s, w9\n"
"sqrdmulh v24.4s, v24.4s, v28.4s\n"
"sqrdmulh v25.4s, v25.4s, v28.4s\n"
"sqrdmulh v26.4s, v26.4s, v28.4s\n"
"sqrdmulh v27.4s, v27.4s, v28.4s\n"
"dup v28.8h, w2\n"
"sqrshl v24.4s, v24.4s, v29.4s\n"
"sqrshl v25.4s, v25.4s, v29.4s\n"
"sqrshl v26.4s, v26.4s, v29.4s\n"
"sqrshl v27.4s, v27.4s, v29.4s\n"
"sqxtn v24.4h, v24.4s\n"
"sqxtn2 v24.8h, v25.4s\n"
"sqxtn v26.4h, v26.4s\n"
"sqxtn2 v26.8h, v27.4s\n"
"sqadd v24.8h, v24.8h, v28.8h\n"
"sqadd v26.8h, v26.8h, v28.8h\n"
"sqxtun v24.8b, v24.8h\n"
"sqxtun2 v24.16b, v26.8h\n"
"dup v28.8h, w0\n"
"umax v24.16b, v24.16b, v30.16b\n"
"umin v24.16b, v24.16b, v31.16b\n"
"st1 {v24.8b}, [x6], x5\n"
"mov v26.d[0], v24.d[1]\n"
"st1 {v26.8b}, [x6]\n"
"b " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
// Single remaining output: the nine inputs are already in v9..v17. v26, v27
// and v29 are rebuilt as shift, multiplier and output offset because they
// may have been used as accumulators or temporaries above.
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER ":\n"
"dup v26.4s, w9\n"
"dup v27.4s, w1\n"
"dup v29.8h, w2\n"
"smlal v24.4s, v0.4h, v9.4h\n"
"smlal2 v25.4s, v0.8h, v9.8h\n"
"smlal v24.4s, v1.4h, v10.4h\n"
"smlal2 v25.4s, v1.8h, v10.8h\n"
"smlal v24.4s, v2.4h, v11.4h\n"
"smlal2 v25.4s, v2.8h, v11.8h\n"
"smlal v24.4s, v3.4h, v12.4h\n"
"smlal2 v25.4s, v3.8h, v12.8h\n"
"smlal v24.4s, v4.4h, v13.4h\n"
"smlal2 v25.4s, v4.8h, v13.8h\n"
"smlal v24.4s, v5.4h, v14.4h\n"
"smlal2 v25.4s, v5.8h, v14.8h\n"
"smlal v24.4s, v6.4h, v15.4h\n"
"smlal2 v25.4s, v6.8h, v15.8h\n"
"smlal v24.4s, v7.4h, v16.4h\n"
"smlal2 v25.4s, v7.8h, v16.8h\n"
"smlal v24.4s, v8.4h, v17.4h\n"
"smlal2 v25.4s, v8.8h, v17.8h\n"
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
"sqrdmulh v25.4s, v25.4s, v27.4s\n"
"sqrshl v24.4s, v24.4s, v26.4s\n"
"sqrshl v25.4s, v25.4s, v26.4s\n"
"sqxtn v24.4h, v24.4s\n"
"sqxtn2 v24.8h, v25.4s\n"
"sqadd v24.8h, v24.8h, v29.8h\n"
"sqxtun v24.8b, v24.8h\n"
"umax v24.8b, v24.8b, v30.8b\n"
"umin v24.8b, v24.8b, v31.8b\n"
"st1 {v24.8b}, [x6]\n"
DEPTHWISECONV_LABEL_HEIGHT_1_END ":\n"
:
// Read-write operands: these are advanced or decremented by the assembly.
[filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
[output_ptr] "+r"(output_ptr),
[output_window_height] "+r"(output_window_height)
:
// Read-only operands.
[bias_ptr] "r"(bias_ptr), [input_row_size] "r"(input_row_size),
[input_depth] "r"(input_depth),
[output_window_width] "r"(output_window_width),
[input_width_increment] "r"(input_width_increment),
[input_height_increment] "r"(input_height_increment),
[output_height_increment] "r"(output_height_increment),
[params_ptr] "r"(params_ptr)
:
// Clobbers: condition flags, memory, all 32 vector registers and every
// general-purpose register named explicitly in the assembly.
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
"v30", "v31",
"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
"x9", "x10", "x11", "x12", "x13", "x14", "x15",
"x19", "x20");
#undef DEPTHWISECONV_LABEL_HEIGHT_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_1
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_1_END
}
};
// Partial-window kernel (AArch64 inline assembly) for the kAwayFromZero
// rounding mode in which each output channel is bias + input * filter for a
// single input value and a single filter value. Each step loads 8 input bytes
// and 8 filter bytes, adds the input/filter offsets, accumulates onto the
// bias, requantizes and stores 8 output bytes. The channel count is read from
// params_ptr (OFFSET_OUTPUT_DEPTH) and consumed 8 channels at a time.
// NOTE(review): the tail always processes one final group of 8 channels, so
// the depth is presumably a non-zero multiple of 8 -- confirm with the caller.
//
// Run() parameters:
//   input_ptr, filter_ptr, output_ptr: read/written 8 bytes at a time and
//     advanced by 8 after each group.
//   bias_ptr: read as two groups of four int32 per 8 channels.
//   params_ptr: quantization parameters, read through the OFFSET_* offsets.
template <>
struct DepthwiseConvPartial<DepthwiseConvOutputRounding::kAwayFromZero,
EdgeType::kCenter, 1, 1> {
static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
const int32* bias_ptr, uint8* output_ptr,
const DepthwiseConvParams* params_ptr) {
// Local assembler labels; #undef'd at the end of this function.
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP …
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP …
asm volatile(
// Load the first 8 inputs and filters and set up the constant vectors:
//   v26 = input offset, v25 = filter offset, v27 = output multiplier,
//   v28 = output offset, v29 = value at OFFSET_OUTPUT_RIGHT_SHIFT,
//   v30/v31 = activation min/max. x11 = output depth (channels remaining).
// w9 and w10 are reused as scratch for each parameter in turn.
"ld1 {v8.8b}, [%[input_ptr]], #8\n"
"ldr w9, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
"ldr x11, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
"ldr w10, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
"dup v26.8h, w9\n"
"ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
"dup v27.4s, w10\n"
"ld1 {v0.8b}, [%[filter_ptr]], #8\n"
// Flags from this compare are consumed by the blt before the loop label.
"cmp x11, #16\n"
"ldr w10, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
"dup v28.8h, w9\n"
"ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
"dup v29.4s, w10\n"
"ldr w10, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v30.16b, w9\n"
"ldr w9, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
"dup v31.16b, w10\n"
"dup v25.8h, w9\n"
// Seed the accumulators v16/v17 with the bias and widen input/filter to
// 16 bit while adding their offsets.
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"uaddw v8.8h, v26.8h, v8.8b\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"uaddw v0.8h, v25.8h, v0.8b\n"
// Fewer than 16 channels: only the tail below runs.
"blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
// Loop: process 8 channels and preload the next 8, while at least 16
// channels remain after the decrement.
DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"subs x11, x11, #8\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"ld1 {v8.8b}, [%[input_ptr]], #8\n"
"cmp x11, #16\n"
"ld1 {v0.8b}, [%[filter_ptr]], #8\n"
"sqrdmulh v16.4s, v16.4s, v27.4s\n"
"sqrdmulh v17.4s, v17.4s, v27.4s\n"
// Rounding fix-up for this rounding mode: (acc & shift) >> 31 is -1 only in
// lanes where both the accumulator and the shift operand are negative; it is
// added to the accumulator before the rounding shift (srshl).
"and v18.16b, v16.16b, v29.16b\n"
"and v19.16b, v17.16b, v29.16b\n"
"sshr v18.4s, v18.4s, #31\n"
"sshr v19.4s, v19.4s, #31\n"
"sqadd v16.4s, v16.4s, v18.4s\n"
"sqadd v17.4s, v17.4s, v19.4s\n"
"srshl v16.4s, v16.4s, v29.4s\n"
"srshl v17.4s, v17.4s, v29.4s\n"
// Narrow to 16 bit, add the output offset, narrow to uint8, clamp and store.
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqadd v16.8h, v16.8h, v28.8h\n"
"sqxtun v16.8b, v16.8h\n"
"umax v16.8b, v16.8b, v30.8b\n"
"umin v16.8b, v16.8b, v31.8b\n"
"st1 {v16.8b}, [%[output_ptr]], #8\n"
"uaddw v8.8h, v26.8h, v8.8b\n"
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"uaddw v0.8h, v25.8h, v0.8b\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"
// Tail: the final 8 channels, already loaded; nothing further is preloaded.
DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"sqrdmulh v16.4s, v16.4s, v27.4s\n"
"sqrdmulh v17.4s, v17.4s, v27.4s\n"
"and v18.16b, v16.16b, v29.16b\n"
"and v19.16b, v17.16b, v29.16b\n"
"sshr v18.4s, v18.4s, #31\n"
"sshr v19.4s, v19.4s, #31\n"
"sqadd v16.4s, v16.4s, v18.4s\n"
"sqadd v17.4s, v17.4s, v19.4s\n"
"srshl v16.4s, v16.4s, v29.4s\n"
"srshl v17.4s, v17.4s, v29.4s\n"
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqadd v16.8h, v16.8h, v28.8h\n"
"sqxtun v16.8b, v16.8h\n"
"umax v16.8b, v16.8b, v30.8b\n"
"umin v16.8b, v16.8b, v31.8b\n"
"st1 {v16.8b}, [%[output_ptr]]\n"
:
// Read-write operands: all four pointers are post-incremented by the assembly.
[filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
[output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr)
:
// Read-only operand.
[params_ptr] "r"(params_ptr)
:
// Clobbers: condition flags, memory and the registers named in the assembly.
"cc", "memory",
"v0", "v8", "v16", "v17", "v18", "v19", "v25", "v26", "v27", "v28",
"v29", "v30", "v31",
"x9", "x10", "x11");
#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP
}
};
// Partial-window kernel (AArch64 inline assembly) for the kUpward rounding
// mode in which each output channel is bias + input * filter for a single
// input value and a single filter value. Each step loads 8 input bytes and 8
// filter bytes, adds the input/filter offsets, accumulates onto the bias,
// requantizes with sqrdmulh + sqrshl and stores 8 output bytes. The channel
// count is read from params_ptr (OFFSET_OUTPUT_DEPTH) and consumed 8 channels
// at a time.
// NOTE(review): the tail always processes one final group of 8 channels, so
// the depth is presumably a non-zero multiple of 8 -- confirm with the caller.
//
// Run() parameters:
//   input_ptr, filter_ptr, output_ptr: read/written 8 bytes at a time and
//     advanced by 8 after each group.
//   bias_ptr: read as two groups of four int32 per 8 channels.
//   params_ptr: quantization parameters, read through the OFFSET_* offsets.
template <>
struct DepthwiseConvPartial<DepthwiseConvOutputRounding::kUpward,
EdgeType::kCenter, 1, 1> {
static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
const int32* bias_ptr, uint8* output_ptr,
const DepthwiseConvParams* params_ptr) {
// Local assembler labels; #undef'd at the end of this function.
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP …
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP …
asm volatile(
// Load the first 8 inputs and filters and set up the constant vectors:
//   v26 = input offset, v25 = filter offset, v27 = output multiplier,
//   v28 = output offset, v29 = value at OFFSET_OUTPUT_RIGHT_SHIFT,
//   v30/v31 = activation min/max. x11 = output depth (channels remaining).
// w9 and w10 are reused as scratch for each parameter in turn.
"ld1 {v8.8b}, [%[input_ptr]], #8\n"
"ldr w9, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
"ldr x11, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
"ldr w10, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
"dup v26.8h, w9\n"
"ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
"dup v27.4s, w10\n"
"ld1 {v0.8b}, [%[filter_ptr]], #8\n"
// Flags from this compare are consumed by the blt before the loop label.
"cmp x11, #16\n"
"ldr w10, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
"dup v28.8h, w9\n"
"ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
"dup v29.4s, w10\n"
"ldr w10, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v30.16b, w9\n"
"ldr w9, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
"dup v31.16b, w10\n"
"dup v25.8h, w9\n"
// Seed the accumulators v16/v17 with the bias and widen input/filter to
// 16 bit while adding their offsets.
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"uaddw v8.8h, v26.8h, v8.8b\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"uaddw v0.8h, v25.8h, v0.8b\n"
// Fewer than 16 channels: only the tail below runs.
"blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
// Loop: process 8 channels and preload the next 8, while at least 16
// channels remain after the decrement.
DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"subs x11, x11, #8\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"ld1 {v8.8b}, [%[input_ptr]], #8\n"
"cmp x11, #16\n"
"ld1 {v0.8b}, [%[filter_ptr]], #8\n"
// Requantize: multiply by the output multiplier, apply the rounding shift,
// narrow to 16 bit, add the output offset, narrow to uint8, clamp and store.
"sqrdmulh v16.4s, v16.4s, v27.4s\n"
"sqrdmulh v17.4s, v17.4s, v27.4s\n"
"sqrshl v16.4s, v16.4s, v29.4s\n"
"sqrshl v17.4s, v17.4s, v29.4s\n"
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqadd v16.8h, v16.8h, v28.8h\n"
"sqxtun v16.8b, v16.8h\n"
"umax v16.8b, v16.8b, v30.8b\n"
"umin v16.8b, v16.8b, v31.8b\n"
"st1 {v16.8b}, [%[output_ptr]], #8\n"
"uaddw v8.8h, v26.8h, v8.8b\n"
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"uaddw v0.8h, v25.8h, v0.8b\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"
// Tail: the final 8 channels, already loaded; nothing further is preloaded.
DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"sqrdmulh v16.4s, v16.4s, v27.4s\n"
"sqrdmulh v17.4s, v17.4s, v27.4s\n"
"sqrshl v16.4s, v16.4s, v29.4s\n"
"sqrshl v17.4s, v17.4s, v29.4s\n"
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqadd v16.8h, v16.8h, v28.8h\n"
"sqxtun v16.8b, v16.8h\n"
"umax v16.8b, v16.8b, v30.8b\n"
"umin v16.8b, v16.8b, v31.8b\n"
"st1 {v16.8b}, [%[output_ptr]]\n"
:
// Read-write operands: all four pointers are post-incremented by the assembly.
[filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
[output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr)
:
// Read-only operand.
[params_ptr] "r"(params_ptr)
:
// Clobbers: condition flags, memory and scratch registers. v18 and v19 are
// listed although this variant's assembly never writes them.
"cc", "memory",
"v0", "v8", "v16", "v17", "v18", "v19", "v25", "v26", "v27", "v28",
"v29", "v30", "v31",
"x9", "x10", "x11");
#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP
}
};
// DepthwiseConvPartial specialization: kAwayFromZero rounding,
// EdgeType::kCorner, stride 1x1.
//
// Computes one output position, 8 channels per step, from four taps: the
// input bytes at input_ptr, at input_ptr + depth, and at the same two
// positions one input row further on, multiplied with the filter bytes at
// filter_ptr, filter_ptr + depth, and the same two positions one filter row
// further on. "depth" is the value read from OFFSET_OUTPUT_DEPTH.
//
// input_ptr, filter_ptr, output_ptr and bias_ptr are read-write asm operands
// and are advanced inside the assembly. At least one group of 8 channels is
// always processed.
// NOTE(review): presumably callers guarantee the depth is a positive multiple
// of 8 -- confirm against the call sites.
template <>
struct DepthwiseConvPartial<DepthwiseConvOutputRounding::kAwayFromZero,
EdgeType::kCorner, 1, 1> {
static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
const int32* bias_ptr, uint8* output_ptr,
const DepthwiseConvParams* params_ptr) {
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP …
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP …
asm volatile(
// x15 = remaining channel count, x9 = input row stride (x9 is reused as a
// filter pointer a few lines below).
"ldr x15, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
"ldr x9, [%[params_ptr], #" STR(OFFSET_INPUT_ROW_SIZE) "]\n"
// The flags set here are consumed by the "blt" right before the loop; no
// instruction in between modifies them.
"cmp x15, #16\n"
// Input pointers: x12 = same row, next position; x13 = next row;
// x14 = next row, next position. Each load fetches 8 channels.
"add x12, %[input_ptr], x15\n"
"add x13, %[input_ptr], x9\n"
"ld1 {v8.8b}, [%[input_ptr]], #8\n"
"add x14, x13, x15\n"
"ld1 {v9.8b}, [x12], #8\n"
// Filter pointers, laid out the same way: x9, x10, x11.
"ldr x6, [%[params_ptr], #" STR(OFFSET_FILTER_ROW_SIZE) "]\n"
"add x9, %[filter_ptr], x15\n"
"ld1 {v10.8b}, [x13], #8\n"
"add x10, %[filter_ptr], x6\n"
"ld1 {v11.8b}, [x14], #8\n"
"ld1 {v0.8b}, [%[filter_ptr]], #8\n"
"add x11, x10, x15\n"
"ld1 {v1.8b}, [x9], #8\n"
"ld1 {v2.8b}, [x10], #8\n"
"ld1 {v3.8b}, [x11], #8\n"
// Broadcast the quantization parameters:
//   v26 = input offset (16-bit lanes), v27 = output multiplier,
//   v28 = output offset (16-bit lanes), v29 = output shift,
//   v30 / v31 = activation min / max (byte lanes),
//   v25 = filter offset (16-bit lanes).
"ldr w6, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
"ldr w7, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
"dup v26.8h, w6\n"
"ldr w6, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
"dup v27.4s, w7\n"
"ldr w7, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
"dup v28.8h, w6\n"
"ldr w6, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
"dup v29.4s, w7\n"
"ldr w7, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v30.16b, w6\n"
"ldr w6, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
"dup v31.16b, w7\n"
"dup v25.8h, w6\n"
// Widen the 8-bit inputs/filters to 16 bits while adding their offsets,
// and load the 8 bias values that seed the accumulators v16/v17.
"uaddw v8.8h, v26.8h, v8.8b\n"
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"uaddw v9.8h, v26.8h, v9.8b\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"uaddw v10.8h, v26.8h, v10.8b\n"
"uaddw v11.8h, v26.8h, v11.8b\n"
"uaddw v0.8h, v25.8h, v0.8b\n"
"uaddw v1.8h, v25.8h, v1.8b\n"
"uaddw v2.8h, v25.8h, v2.8b\n"
"uaddw v3.8h, v25.8h, v3.8b\n"
// Fewer than 16 channels: skip the loop and finish the single group below.
"blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
// Main loop. Each iteration accumulates the group loaded previously and,
// interleaved with that, loads the next group of 8 channels.
DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"subs x15, x15, #8\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"ld1 {v8.8b}, [%[input_ptr]], #8\n"
// This compare (not the "subs" above) feeds the "bge" at the loop end.
"cmp x15, #16\n"
"ld1 {v0.8b}, [%[filter_ptr]], #8\n"
"smlal v16.4s, v1.4h, v9.4h\n"
"smlal2 v17.4s, v1.8h, v9.8h\n"
"ld1 {v9.8b}, [x12], #8\n"
"smlal v16.4s, v2.4h, v10.4h\n"
"ld1 {v1.8b}, [x9], #8\n"
"smlal2 v17.4s, v2.8h, v10.8h\n"
"ld1 {v10.8b}, [x13], #8\n"
"smlal v16.4s, v3.4h, v11.4h\n"
"ld1 {v2.8b}, [x10], #8\n"
"smlal2 v17.4s, v3.8h, v11.8h\n"
"ld1 {v11.8b}, [x14], #8\n"
"ld1 {v3.8b}, [x11], #8\n"
// Requantize: saturating rounding doubling multiply-high by the multiplier.
"sqrdmulh v16.4s, v16.4s, v27.4s\n"
"sqrdmulh v17.4s, v17.4s, v27.4s\n"
// Rounding fix-up: AND with the shift vector followed by an arithmetic
// shift right by 31 yields -1 in lanes where both the product and the shift
// amount are negative (0 elsewhere). Adding it before the rounding shift
// makes ties on negative values round away from zero.
"and v18.16b, v16.16b, v29.16b\n"
"and v19.16b, v17.16b, v29.16b\n"
"sshr v18.4s, v18.4s, #31\n"
"sshr v19.4s, v19.4s, #31\n"
"sqadd v16.4s, v16.4s, v18.4s\n"
"sqadd v17.4s, v17.4s, v19.4s\n"
// Rounding shift by the signed per-lane amount in v29.
// NOTE(review): presumably v29 is negative (i.e. a right shift) -- confirm
// how the caller fills the field at OFFSET_OUTPUT_RIGHT_SHIFT.
"srshl v16.4s, v16.4s, v29.4s\n"
"srshl v17.4s, v17.4s, v29.4s\n"
// Narrow to 16 bits with saturation, add the output offset, narrow to
// unsigned 8 bits with saturation, clamp to [min, max], store 8 channels.
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqadd v16.8h, v16.8h, v28.8h\n"
"sqxtun v16.8b, v16.8h\n"
"umax v16.8b, v16.8b, v30.8b\n"
"umin v16.8b, v16.8b, v31.8b\n"
"st1 {v16.8b}, [%[output_ptr]], #8\n"
// Prepare the freshly loaded group: add offsets and reload the bias.
"uaddw v8.8h, v26.8h, v8.8b\n"
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"uaddw v9.8h, v26.8h, v9.8b\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"uaddw v10.8h, v26.8h, v10.8b\n"
"uaddw v11.8h, v26.8h, v11.8b\n"
"uaddw v0.8h, v25.8h, v0.8b\n"
"uaddw v1.8h, v25.8h, v1.8b\n"
"uaddw v2.8h, v25.8h, v2.8b\n"
"uaddw v3.8h, v25.8h, v3.8b\n"
// Keep looping while at least 16 channels remain.
"bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"
// Final group: same arithmetic as the loop body, but nothing further is
// loaded and the output pointer is not advanced by the store.
DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"smlal v16.4s, v1.4h, v9.4h\n"
"smlal2 v17.4s, v1.8h, v9.8h\n"
"smlal v16.4s, v2.4h, v10.4h\n"
"smlal2 v17.4s, v2.8h, v10.8h\n"
"smlal v16.4s, v3.4h, v11.4h\n"
"smlal2 v17.4s, v3.8h, v11.8h\n"
"sqrdmulh v16.4s, v16.4s, v27.4s\n"
"sqrdmulh v17.4s, v17.4s, v27.4s\n"
"and v18.16b, v16.16b, v29.16b\n"
"and v19.16b, v17.16b, v29.16b\n"
"sshr v18.4s, v18.4s, #31\n"
"sshr v19.4s, v19.4s, #31\n"
"sqadd v16.4s, v16.4s, v18.4s\n"
"sqadd v17.4s, v17.4s, v19.4s\n"
"srshl v16.4s, v16.4s, v29.4s\n"
"srshl v17.4s, v17.4s, v29.4s\n"
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqadd v16.8h, v16.8h, v28.8h\n"
"sqxtun v16.8b, v16.8h\n"
"umax v16.8b, v16.8b, v30.8b\n"
"umin v16.8b, v16.8b, v31.8b\n"
"st1 {v16.8b}, [%[output_ptr]]\n"
:
// Outputs: all four pointers are modified by the assembly.
[filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
[output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr)
:
// Inputs.
[params_ptr] "r"(params_ptr)
:
// Clobbers: flags, memory, and every vector/general register named above.
"cc", "memory",
"v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v16", "v17", "v18",
"v19", "v25", "v26", "v27", "v28", "v29", "v30", "v31",
"x6", "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP
}
};
// DepthwiseConvPartial specialization: kUpward rounding, EdgeType::kCorner,
// stride 1x1.
//
// Computes one output position, 8 channels per step, from four taps: the
// input bytes at input_ptr, at input_ptr + depth, and at the same two
// positions one input row further on, multiplied with the filter bytes at
// filter_ptr, filter_ptr + depth, and the same two positions one filter row
// further on. "depth" is the value read from OFFSET_OUTPUT_DEPTH.
//
// The instruction stream matches the kAwayFromZero corner kernel except for
// the requantization shift, which here is a single sqrshl with no sign
// fix-up beforehand.
//
// input_ptr, filter_ptr, output_ptr and bias_ptr are read-write asm operands
// and are advanced inside the assembly. At least one group of 8 channels is
// always processed.
// NOTE(review): presumably callers guarantee the depth is a positive multiple
// of 8 -- confirm against the call sites.
template <>
struct DepthwiseConvPartial<DepthwiseConvOutputRounding::kUpward,
EdgeType::kCorner, 1, 1> {
static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
const int32* bias_ptr, uint8* output_ptr,
const DepthwiseConvParams* params_ptr) {
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP …
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP …
asm volatile(
// x15 = remaining channel count, x9 = input row stride (x9 is reused as a
// filter pointer a few lines below).
"ldr x15, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
"ldr x9, [%[params_ptr], #" STR(OFFSET_INPUT_ROW_SIZE) "]\n"
// The flags set here are consumed by the "blt" right before the loop; no
// instruction in between modifies them.
"cmp x15, #16\n"
// Input pointers: x12 = same row, next position; x13 = next row;
// x14 = next row, next position. Each load fetches 8 channels.
"add x12, %[input_ptr], x15\n"
"add x13, %[input_ptr], x9\n"
"ld1 {v8.8b}, [%[input_ptr]], #8\n"
"add x14, x13, x15\n"
"ld1 {v9.8b}, [x12], #8\n"
// Filter pointers, laid out the same way: x9, x10, x11.
"ldr x6, [%[params_ptr], #" STR(OFFSET_FILTER_ROW_SIZE) "]\n"
"add x9, %[filter_ptr], x15\n"
"ld1 {v10.8b}, [x13], #8\n"
"add x10, %[filter_ptr], x6\n"
"ld1 {v11.8b}, [x14], #8\n"
"ld1 {v0.8b}, [%[filter_ptr]], #8\n"
"add x11, x10, x15\n"
"ld1 {v1.8b}, [x9], #8\n"
"ld1 {v2.8b}, [x10], #8\n"
"ld1 {v3.8b}, [x11], #8\n"
// Broadcast the quantization parameters:
//   v26 = input offset (16-bit lanes), v27 = output multiplier,
//   v28 = output offset (16-bit lanes), v29 = output shift,
//   v30 / v31 = activation min / max (byte lanes),
//   v25 = filter offset (16-bit lanes).
"ldr w6, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
"ldr w7, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
"dup v26.8h, w6\n"
"ldr w6, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
"dup v27.4s, w7\n"
"ldr w7, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
"dup v28.8h, w6\n"
"ldr w6, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
"dup v29.4s, w7\n"
"ldr w7, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v30.16b, w6\n"
"ldr w6, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
"dup v31.16b, w7\n"
"dup v25.8h, w6\n"
// Widen the 8-bit inputs/filters to 16 bits while adding their offsets,
// and load the 8 bias values that seed the accumulators v16/v17.
"uaddw v8.8h, v26.8h, v8.8b\n"
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"uaddw v9.8h, v26.8h, v9.8b\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"uaddw v10.8h, v26.8h, v10.8b\n"
"uaddw v11.8h, v26.8h, v11.8b\n"
"uaddw v0.8h, v25.8h, v0.8b\n"
"uaddw v1.8h, v25.8h, v1.8b\n"
"uaddw v2.8h, v25.8h, v2.8b\n"
"uaddw v3.8h, v25.8h, v3.8b\n"
// Fewer than 16 channels: skip the loop and finish the single group below.
"blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
// Main loop. Each iteration accumulates the group loaded previously and,
// interleaved with that, loads the next group of 8 channels.
DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"subs x15, x15, #8\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"ld1 {v8.8b}, [%[input_ptr]], #8\n"
// This compare (not the "subs" above) feeds the "bge" at the loop end.
"cmp x15, #16\n"
"ld1 {v0.8b}, [%[filter_ptr]], #8\n"
"smlal v16.4s, v1.4h, v9.4h\n"
"smlal2 v17.4s, v1.8h, v9.8h\n"
"ld1 {v9.8b}, [x12], #8\n"
"smlal v16.4s, v2.4h, v10.4h\n"
"ld1 {v1.8b}, [x9], #8\n"
"smlal2 v17.4s, v2.8h, v10.8h\n"
"ld1 {v10.8b}, [x13], #8\n"
"smlal v16.4s, v3.4h, v11.4h\n"
"ld1 {v2.8b}, [x10], #8\n"
"smlal2 v17.4s, v3.8h, v11.8h\n"
"ld1 {v11.8b}, [x14], #8\n"
"ld1 {v3.8b}, [x11], #8\n"
// Requantize: saturating rounding doubling multiply-high by the multiplier,
// then a saturating rounding shift by the signed per-lane amount in v29.
// NOTE(review): presumably v29 is negative (i.e. a right shift) -- confirm
// how the caller fills the field at OFFSET_OUTPUT_RIGHT_SHIFT.
"sqrdmulh v16.4s, v16.4s, v27.4s\n"
"sqrdmulh v17.4s, v17.4s, v27.4s\n"
"sqrshl v16.4s, v16.4s, v29.4s\n"
"sqrshl v17.4s, v17.4s, v29.4s\n"
// Narrow to 16 bits with saturation, add the output offset, narrow to
// unsigned 8 bits with saturation, clamp to [min, max], store 8 channels.
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqadd v16.8h, v16.8h, v28.8h\n"
"sqxtun v16.8b, v16.8h\n"
"umax v16.8b, v16.8b, v30.8b\n"
"umin v16.8b, v16.8b, v31.8b\n"
"st1 {v16.8b}, [%[output_ptr]], #8\n"
// Prepare the freshly loaded group: add offsets and reload the bias.
"uaddw v8.8h, v26.8h, v8.8b\n"
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"uaddw v9.8h, v26.8h, v9.8b\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"uaddw v10.8h, v26.8h, v10.8b\n"
"uaddw v11.8h, v26.8h, v11.8b\n"
"uaddw v0.8h, v25.8h, v0.8b\n"
"uaddw v1.8h, v25.8h, v1.8b\n"
"uaddw v2.8h, v25.8h, v2.8b\n"
"uaddw v3.8h, v25.8h, v3.8b\n"
// Keep looping while at least 16 channels remain.
"bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"
// Final group: same arithmetic as the loop body, but nothing further is
// loaded and the output pointer is not advanced by the store.
DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"smlal v16.4s, v1.4h, v9.4h\n"
"smlal2 v17.4s, v1.8h, v9.8h\n"
"smlal v16.4s, v2.4h, v10.4h\n"
"smlal2 v17.4s, v2.8h, v10.8h\n"
"smlal v16.4s, v3.4h, v11.4h\n"
"smlal2 v17.4s, v3.8h, v11.8h\n"
"sqrdmulh v16.4s, v16.4s, v27.4s\n"
"sqrdmulh v17.4s, v17.4s, v27.4s\n"
"sqrshl v16.4s, v16.4s, v29.4s\n"
"sqrshl v17.4s, v17.4s, v29.4s\n"
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqadd v16.8h, v16.8h, v28.8h\n"
"sqxtun v16.8b, v16.8h\n"
"umax v16.8b, v16.8b, v30.8b\n"
"umin v16.8b, v16.8b, v31.8b\n"
"st1 {v16.8b}, [%[output_ptr]]\n"
:
// Outputs: all four pointers are modified by the assembly.
[filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
[output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr)
:
// Inputs.
[params_ptr] "r"(params_ptr)
:
// Clobbers. v18/v19 are listed although this variant never writes them.
"cc", "memory",
"v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v16", "v17", "v18",
"v19", "v25", "v26", "v27", "v28", "v29", "v30", "v31",
"x6", "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP
}
};
// DepthwiseConvPartial specialization: kAwayFromZero rounding,
// EdgeType::kHorizontal, stride 1x1.
//
// Computes one output position, 8 channels per step, from six taps arranged
// as two rows of three positions: positions within a row are input_depth
// bytes apart, and the second row starts one input_row_size after the first.
// The filter is addressed the same way, using input_depth as the step within
// a row and filter_row_size between rows.
//
// input_ptr, filter_ptr, output_ptr and bias_ptr are read-write asm operands
// and are advanced inside the assembly. The channel count is read from
// OFFSET_OUTPUT_DEPTH; at least one group of 8 channels is always processed.
// NOTE(review): presumably callers guarantee the depth is a positive multiple
// of 8 -- confirm against the call sites.
template <>
struct DepthwiseConvPartial<DepthwiseConvOutputRounding::kAwayFromZero,
EdgeType::kHorizontal, 1, 1> {
static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
const int32* bias_ptr, uint8* output_ptr,
const DepthwiseConvParams* params_ptr) {
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP …
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP …
asm volatile(
// x7 = step between adjacent positions, x11 = input row stride,
// x14 = filter row stride, x15 = remaining channel count.
// x12/x13 walk input rows 0/1; x9/x10 walk filter rows 0/1.
"ldr x7, [%[params_ptr], #" STR(OFFSET_INPUT_DEPTH) "]\n"
"mov x12, %[input_ptr]\n"
"ldr x11, [%[params_ptr], #" STR(OFFSET_INPUT_ROW_SIZE) "]\n"
"mov x9, %[filter_ptr]\n"
"ldr x14, [%[params_ptr], #" STR(OFFSET_FILTER_ROW_SIZE) "]\n"
"add x13, x12, x11\n"
"ldr x15, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
"ld1 {v8.8b}, [x12], x7\n"
"add x10, x9, x14\n"
"ld1 {v9.8b}, [x12], x7\n"
// The flags set here are consumed by the "blt" right before the loop; no
// instruction in between modifies them.
"cmp x15, #16\n"
"ld1 {v10.8b}, [x12]\n"
// Advance the base pointers to the next group of 8 channels.
"add %[input_ptr], %[input_ptr], #8\n"
"ld1 {v11.8b}, [x13], x7\n"
"add %[filter_ptr], %[filter_ptr], #8\n"
"ld1 {v12.8b}, [x13], x7\n"
"ld1 {v13.8b}, [x13]\n"
// Filter taps: v0-v2 from row 0, v3-v5 from row 1.
"ld1 {v0.8b}, [x9], x7\n"
"ld1 {v1.8b}, [x9], x7\n"
"ld1 {v2.8b}, [x9]\n"
"ld1 {v3.8b}, [x10], x7\n"
"ld1 {v4.8b}, [x10], x7\n"
"ld1 {v5.8b}, [x10]\n"
// x12/x13 are now free and are reused (as w12/w13) to broadcast the
// quantization parameters:
//   v26 = input offset (16-bit lanes), v27 = output multiplier,
//   v28 = output offset (16-bit lanes), v29 = output shift,
//   v30 / v31 = activation min / max (low 8 byte lanes only, which is all
//   the 8-lane umax/umin below read),
//   v25 = filter offset (16-bit lanes).
"ldr w12, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
"ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
"dup v26.8h, w12\n"
"ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
"dup v27.4s, w13\n"
"ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
"dup v28.8h, w12\n"
"ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
"dup v29.4s, w13\n"
"ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v30.8b, w12\n"
"ldr w12, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
"dup v31.8b, w13\n"
"dup v25.8h, w12\n"
// Widen the 8-bit inputs/filters to 16 bits while adding their offsets,
// and load the 8 bias values that seed the accumulators v16/v17.
"uaddw v8.8h, v26.8h, v8.8b\n"
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"uaddw v9.8h, v26.8h, v9.8b\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"uaddw v10.8h, v26.8h, v10.8b\n"
"uaddw v11.8h, v26.8h, v11.8b\n"
"uaddw v12.8h, v26.8h, v12.8b\n"
"uaddw v13.8h, v26.8h, v13.8b\n"
"uaddw v0.8h, v25.8h, v0.8b\n"
"uaddw v1.8h, v25.8h, v1.8b\n"
"uaddw v2.8h, v25.8h, v2.8b\n"
"uaddw v3.8h, v25.8h, v3.8b\n"
"uaddw v4.8h, v25.8h, v4.8b\n"
"uaddw v5.8h, v25.8h, v5.8b\n"
// Fewer than 16 channels: skip the loop and finish the single group below.
"blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
// Main loop. The row pointers are rebuilt from the already advanced
// input_ptr/filter_ptr, then the previous group is accumulated while the
// next group of 8 channels is loaded.
DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n"
"mov x12, %[input_ptr]\n"
"subs x15, x15, #8\n"
"add x13, x12, x11\n"
// This compare (not the "subs" above) feeds the "bge" at the loop end.
"cmp x15, #16\n"
"add %[input_ptr], %[input_ptr], #8\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"mov x9, %[filter_ptr]\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"ld1 {v8.8b}, [x12], x7\n"
"smlal v16.4s, v1.4h, v9.4h\n"
"add x10, x9, x14\n"
"smlal2 v17.4s, v1.8h, v9.8h\n"
"ld1 {v9.8b}, [x12], x7\n"
"smlal v16.4s, v2.4h, v10.4h\n"
"add %[filter_ptr], %[filter_ptr], #8\n"
"smlal2 v17.4s, v2.8h, v10.8h\n"
"ld1 {v10.8b}, [x12]\n"
"smlal v16.4s, v3.4h, v11.4h\n"
"ld1 {v0.8b}, [x9], x7\n"
"smlal2 v17.4s, v3.8h, v11.8h\n"
"ld1 {v11.8b}, [x13], x7\n"
"smlal v16.4s, v4.4h, v12.4h\n"
"ld1 {v1.8b}, [x9], x7\n"
"smlal2 v17.4s, v4.8h, v12.8h\n"
"ld1 {v12.8b}, [x13], x7\n"
"smlal v16.4s, v5.4h, v13.4h\n"
"ld1 {v2.8b}, [x9]\n"
"smlal2 v17.4s, v5.8h, v13.8h\n"
"ld1 {v13.8b}, [x13]\n"
// Requantize (interleaved with the remaining filter loads): saturating
// rounding doubling multiply-high by the multiplier.
"sqrdmulh v16.4s, v16.4s, v27.4s\n"
"ld1 {v3.8b}, [x10], x7\n"
"sqrdmulh v17.4s, v17.4s, v27.4s\n"
"ld1 {v4.8b}, [x10], x7\n"
// Rounding fix-up: AND with the shift vector followed by an arithmetic
// shift right by 31 yields -1 in lanes where both the product and the shift
// amount are negative (0 elsewhere). Adding it before the rounding shift
// makes ties on negative values round away from zero.
"and v18.16b, v16.16b, v29.16b\n"
"ld1 {v5.8b}, [x10]\n"
"and v19.16b, v17.16b, v29.16b\n"
"sshr v18.4s, v18.4s, #31\n"
"sshr v19.4s, v19.4s, #31\n"
"sqadd v16.4s, v16.4s, v18.4s\n"
"sqadd v17.4s, v17.4s, v19.4s\n"
// Rounding shift by the signed per-lane amount in v29.
// NOTE(review): presumably v29 is negative (i.e. a right shift) -- confirm
// how the caller fills the field at OFFSET_OUTPUT_RIGHT_SHIFT.
"srshl v16.4s, v16.4s, v29.4s\n"
"srshl v17.4s, v17.4s, v29.4s\n"
// Narrow to 16 bits with saturation, add the output offset, narrow to
// unsigned 8 bits with saturation, clamp to [min, max], store 8 channels.
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqadd v16.8h, v16.8h, v28.8h\n"
"sqxtun v16.8b, v16.8h\n"
"umax v16.8b, v16.8b, v30.8b\n"
"umin v16.8b, v16.8b, v31.8b\n"
"uaddw v8.8h, v26.8h, v8.8b\n"
"st1 {v16.8b}, [%[output_ptr]], #8\n"
// Prepare the freshly loaded group: add offsets and reload the bias.
"uaddw v9.8h, v26.8h, v9.8b\n"
"uaddw v10.8h, v26.8h, v10.8b\n"
"uaddw v11.8h, v26.8h, v11.8b\n"
"uaddw v12.8h, v26.8h, v12.8b\n"
"uaddw v13.8h, v26.8h, v13.8b\n"
"uaddw v0.8h, v25.8h, v0.8b\n"
"uaddw v1.8h, v25.8h, v1.8b\n"
"uaddw v2.8h, v25.8h, v2.8b\n"
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"uaddw v3.8h, v25.8h, v3.8b\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"uaddw v4.8h, v25.8h, v4.8b\n"
"uaddw v5.8h, v25.8h, v5.8b\n"
// Keep looping while at least 16 channels remain.
"bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"
// Final group: same arithmetic as the loop body, but nothing further is
// loaded and the output pointer is not advanced by the store.
DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"smlal v16.4s, v1.4h, v9.4h\n"
"smlal2 v17.4s, v1.8h, v9.8h\n"
"smlal v16.4s, v2.4h, v10.4h\n"
"smlal2 v17.4s, v2.8h, v10.8h\n"
"smlal v16.4s, v3.4h, v11.4h\n"
"smlal2 v17.4s, v3.8h, v11.8h\n"
"smlal v16.4s, v4.4h, v12.4h\n"
"smlal2 v17.4s, v4.8h, v12.8h\n"
"smlal v16.4s, v5.4h, v13.4h\n"
"smlal2 v17.4s, v5.8h, v13.8h\n"
"sqrdmulh v16.4s, v16.4s, v27.4s\n"
"sqrdmulh v17.4s, v17.4s, v27.4s\n"
"and v18.16b, v16.16b, v29.16b\n"
"and v19.16b, v17.16b, v29.16b\n"
"sshr v18.4s, v18.4s, #31\n"
"sshr v19.4s, v19.4s, #31\n"
"sqadd v16.4s, v16.4s, v18.4s\n"
"sqadd v17.4s, v17.4s, v19.4s\n"
"srshl v16.4s, v16.4s, v29.4s\n"
"srshl v17.4s, v17.4s, v29.4s\n"
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqadd v16.8h, v16.8h, v28.8h\n"
"sqxtun v16.8b, v16.8h\n"
"umax v16.8b, v16.8b, v30.8b\n"
"umin v16.8b, v16.8b, v31.8b\n"
"st1 {v16.8b}, [%[output_ptr]]\n"
:
// Outputs: all four pointers are modified by the assembly.
[filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
[output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr)
:
// Inputs.
[params_ptr] "r"(params_ptr)
:
// Clobbers: flags, memory, and every vector/general register named above.
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v8", "v9", "v10", "v11", "v12",
"v13", "v16", "v17", "v18", "v19", "v25", "v26", "v27", "v28", "v29",
"v30", "v31",
"x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP
}
};
// DepthwiseConvPartial specialization: kUpward rounding,
// EdgeType::kHorizontal, stride 1x1.
//
// Computes one output position, 8 channels per step, from six taps arranged
// as two rows of three positions: positions within a row are input_depth
// bytes apart, and the second row starts one input_row_size after the first.
// The filter is addressed the same way, using input_depth as the step within
// a row and filter_row_size between rows.
//
// The instruction stream matches the kAwayFromZero horizontal kernel except
// for the requantization shift, which here is a single sqrshl with no sign
// fix-up beforehand.
//
// input_ptr, filter_ptr, output_ptr and bias_ptr are read-write asm operands
// and are advanced inside the assembly. The channel count is read from
// OFFSET_OUTPUT_DEPTH; at least one group of 8 channels is always processed.
// NOTE(review): presumably callers guarantee the depth is a positive multiple
// of 8 -- confirm against the call sites.
template <>
struct DepthwiseConvPartial<DepthwiseConvOutputRounding::kUpward,
EdgeType::kHorizontal, 1, 1> {
static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
const int32* bias_ptr, uint8* output_ptr,
const DepthwiseConvParams* params_ptr) {
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP …
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP …
asm volatile(
// x7 = step between adjacent positions, x11 = input row stride,
// x14 = filter row stride, x15 = remaining channel count.
// x12/x13 walk input rows 0/1; x9/x10 walk filter rows 0/1.
"ldr x7, [%[params_ptr], #" STR(OFFSET_INPUT_DEPTH) "]\n"
"mov x12, %[input_ptr]\n"
"ldr x11, [%[params_ptr], #" STR(OFFSET_INPUT_ROW_SIZE) "]\n"
"mov x9, %[filter_ptr]\n"
"ldr x14, [%[params_ptr], #" STR(OFFSET_FILTER_ROW_SIZE) "]\n"
"add x13, x12, x11\n"
"ldr x15, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
"ld1 {v8.8b}, [x12], x7\n"
"add x10, x9, x14\n"
"ld1 {v9.8b}, [x12], x7\n"
// The flags set here are consumed by the "blt" right before the loop; no
// instruction in between modifies them.
"cmp x15, #16\n"
"ld1 {v10.8b}, [x12]\n"
// Advance the base pointers to the next group of 8 channels.
"add %[input_ptr], %[input_ptr], #8\n"
"ld1 {v11.8b}, [x13], x7\n"
"add %[filter_ptr], %[filter_ptr], #8\n"
"ld1 {v12.8b}, [x13], x7\n"
"ld1 {v13.8b}, [x13]\n"
// Filter taps: v0-v2 from row 0, v3-v5 from row 1.
"ld1 {v0.8b}, [x9], x7\n"
"ld1 {v1.8b}, [x9], x7\n"
"ld1 {v2.8b}, [x9]\n"
"ld1 {v3.8b}, [x10], x7\n"
"ld1 {v4.8b}, [x10], x7\n"
"ld1 {v5.8b}, [x10]\n"
// x12/x13 are now free and are reused (as w12/w13) to broadcast the
// quantization parameters:
//   v26 = input offset (16-bit lanes), v27 = output multiplier,
//   v28 = output offset (16-bit lanes), v29 = output shift,
//   v30 / v31 = activation min / max (low 8 byte lanes only, which is all
//   the 8-lane umax/umin below read),
//   v25 = filter offset (16-bit lanes).
"ldr w12, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
"ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
"dup v26.8h, w12\n"
"ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
"dup v27.4s, w13\n"
"ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
"dup v28.8h, w12\n"
"ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
"dup v29.4s, w13\n"
"ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v30.8b, w12\n"
"ldr w12, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
"dup v31.8b, w13\n"
"dup v25.8h, w12\n"
// Widen the 8-bit inputs/filters to 16 bits while adding their offsets,
// and load the 8 bias values that seed the accumulators v16/v17.
"uaddw v8.8h, v26.8h, v8.8b\n"
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"uaddw v9.8h, v26.8h, v9.8b\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"uaddw v10.8h, v26.8h, v10.8b\n"
"uaddw v11.8h, v26.8h, v11.8b\n"
"uaddw v12.8h, v26.8h, v12.8b\n"
"uaddw v13.8h, v26.8h, v13.8b\n"
"uaddw v0.8h, v25.8h, v0.8b\n"
"uaddw v1.8h, v25.8h, v1.8b\n"
"uaddw v2.8h, v25.8h, v2.8b\n"
"uaddw v3.8h, v25.8h, v3.8b\n"
"uaddw v4.8h, v25.8h, v4.8b\n"
"uaddw v5.8h, v25.8h, v5.8b\n"
// Fewer than 16 channels: skip the loop and finish the single group below.
"blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
// Main loop. The row pointers are rebuilt from the already advanced
// input_ptr/filter_ptr, then the previous group is accumulated while the
// next group of 8 channels is loaded.
DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n"
"mov x12, %[input_ptr]\n"
"subs x15, x15, #8\n"
"add x13, x12, x11\n"
// This compare (not the "subs" above) feeds the "bge" at the loop end.
"cmp x15, #16\n"
"add %[input_ptr], %[input_ptr], #8\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"mov x9, %[filter_ptr]\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"ld1 {v8.8b}, [x12], x7\n"
"smlal v16.4s, v1.4h, v9.4h\n"
"add x10, x9, x14\n"
"smlal2 v17.4s, v1.8h, v9.8h\n"
"ld1 {v9.8b}, [x12], x7\n"
"smlal v16.4s, v2.4h, v10.4h\n"
"add %[filter_ptr], %[filter_ptr], #8\n"
"smlal2 v17.4s, v2.8h, v10.8h\n"
"ld1 {v10.8b}, [x12]\n"
"smlal v16.4s, v3.4h, v11.4h\n"
"ld1 {v0.8b}, [x9], x7\n"
"smlal2 v17.4s, v3.8h, v11.8h\n"
"ld1 {v11.8b}, [x13], x7\n"
"smlal v16.4s, v4.4h, v12.4h\n"
"ld1 {v1.8b}, [x9], x7\n"
"smlal2 v17.4s, v4.8h, v12.8h\n"
"ld1 {v12.8b}, [x13], x7\n"
"smlal v16.4s, v5.4h, v13.4h\n"
"ld1 {v2.8b}, [x9]\n"
"smlal2 v17.4s, v5.8h, v13.8h\n"
"ld1 {v13.8b}, [x13]\n"
// Requantize (interleaved with the remaining filter loads): saturating
// rounding doubling multiply-high by the multiplier, then a saturating
// rounding shift by the signed per-lane amount in v29.
// NOTE(review): presumably v29 is negative (i.e. a right shift) -- confirm
// how the caller fills the field at OFFSET_OUTPUT_RIGHT_SHIFT.
"sqrdmulh v16.4s, v16.4s, v27.4s\n"
"ld1 {v3.8b}, [x10], x7\n"
"sqrdmulh v17.4s, v17.4s, v27.4s\n"
"ld1 {v4.8b}, [x10], x7\n"
"sqrshl v16.4s, v16.4s, v29.4s\n"
"ld1 {v5.8b}, [x10]\n"
"sqrshl v17.4s, v17.4s, v29.4s\n"
// Narrow to 16 bits with saturation, add the output offset, narrow to
// unsigned 8 bits with saturation, clamp to [min, max], store 8 channels.
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqadd v16.8h, v16.8h, v28.8h\n"
"sqxtun v16.8b, v16.8h\n"
"umax v16.8b, v16.8b, v30.8b\n"
"umin v16.8b, v16.8b, v31.8b\n"
"uaddw v8.8h, v26.8h, v8.8b\n"
"st1 {v16.8b}, [%[output_ptr]], #8\n"
// Prepare the freshly loaded group: add offsets and reload the bias.
"uaddw v9.8h, v26.8h, v9.8b\n"
"uaddw v10.8h, v26.8h, v10.8b\n"
"uaddw v11.8h, v26.8h, v11.8b\n"
"uaddw v12.8h, v26.8h, v12.8b\n"
"uaddw v13.8h, v26.8h, v13.8b\n"
"uaddw v0.8h, v25.8h, v0.8b\n"
"uaddw v1.8h, v25.8h, v1.8b\n"
"uaddw v2.8h, v25.8h, v2.8b\n"
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"uaddw v3.8h, v25.8h, v3.8b\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"uaddw v4.8h, v25.8h, v4.8b\n"
"uaddw v5.8h, v25.8h, v5.8b\n"
// Keep looping while at least 16 channels remain.
"bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"
// Final group: same arithmetic as the loop body, but nothing further is
// loaded and the output pointer is not advanced by the store.
DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"smlal v16.4s, v1.4h, v9.4h\n"
"smlal2 v17.4s, v1.8h, v9.8h\n"
"smlal v16.4s, v2.4h, v10.4h\n"
"smlal2 v17.4s, v2.8h, v10.8h\n"
"smlal v16.4s, v3.4h, v11.4h\n"
"smlal2 v17.4s, v3.8h, v11.8h\n"
"smlal v16.4s, v4.4h, v12.4h\n"
"smlal2 v17.4s, v4.8h, v12.8h\n"
"smlal v16.4s, v5.4h, v13.4h\n"
"smlal2 v17.4s, v5.8h, v13.8h\n"
"sqrdmulh v16.4s, v16.4s, v27.4s\n"
"sqrdmulh v17.4s, v17.4s, v27.4s\n"
"sqrshl v16.4s, v16.4s, v29.4s\n"
"sqrshl v17.4s, v17.4s, v29.4s\n"
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqadd v16.8h, v16.8h, v28.8h\n"
"sqxtun v16.8b, v16.8h\n"
"umax v16.8b, v16.8b, v30.8b\n"
"umin v16.8b, v16.8b, v31.8b\n"
"st1 {v16.8b}, [%[output_ptr]]\n"
:
// Outputs: all four pointers are modified by the assembly.
[filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
[output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr)
:
// Inputs.
[params_ptr] "r"(params_ptr)
:
// Clobbers. v18/v19 are listed although this variant never writes them.
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v8", "v9", "v10", "v11", "v12",
"v13", "v16", "v17", "v18", "v19", "v25", "v26", "v27", "v28", "v29",
"v30", "v31",
"x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP
}
};
// DepthwiseConvPartial specialization: kAwayFromZero rounding,
// EdgeType::kVertical, stride 1x1.
//
// Computes one output position, 8 channels per step, from six taps arranged
// as three rows of two positions: the two positions in a row are input_depth
// bytes apart, and consecutive rows are input_row_size apart. The filter is
// addressed the same way, using input_depth as the step within a row and
// filter_row_size between rows.
//
// input_ptr, filter_ptr, output_ptr and bias_ptr are read-write asm operands
// and are advanced inside the assembly. The channel count is read from
// OFFSET_OUTPUT_DEPTH; at least one group of 8 channels is always processed.
// NOTE(review): presumably callers guarantee the depth is a positive multiple
// of 8 -- confirm against the call sites.
template <>
struct DepthwiseConvPartial<DepthwiseConvOutputRounding::kAwayFromZero,
EdgeType::kVertical, 1, 1> {
static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
const int32* bias_ptr, uint8* output_ptr,
const DepthwiseConvParams* params_ptr) {
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP …
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP …
asm volatile(
// x6 = step between adjacent positions, x11 = input row stride,
// x5 = filter row stride, x15 = remaining channel count.
// x12/x13/x14 walk input rows 0/1/2; x7/x9/x10 walk filter rows 0/1/2.
"ldr x6, [%[params_ptr], #" STR(OFFSET_INPUT_DEPTH) "]\n"
"mov x12, %[input_ptr]\n"
"ldr x11, [%[params_ptr], #" STR(OFFSET_INPUT_ROW_SIZE) "]\n"
"mov x7, %[filter_ptr]\n"
"ldr x5, [%[params_ptr], #" STR(OFFSET_FILTER_ROW_SIZE) "]\n"
"add x13, x12, x11\n"
"ldr x15, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
"add x14, x13, x11\n"
"ld1 {v8.8b}, [x12], x6\n"
"add x9, x7, x5\n"
"ld1 {v9.8b}, [x12]\n"
// The flags set here are consumed by the "blt" right before the loop; no
// instruction in between modifies them.
"cmp x15, #16\n"
"add x10, x9, x5\n"
"ld1 {v10.8b}, [x13], x6\n"
// Advance the base pointers to the next group of 8 channels.
"add %[input_ptr], %[input_ptr], #8\n"
"ld1 {v11.8b}, [x13]\n"
"add %[filter_ptr], %[filter_ptr], #8\n"
"ld1 {v12.8b}, [x14], x6\n"
"ld1 {v13.8b}, [x14]\n"
// Filter taps: v0/v1 from row 0, v2/v3 from row 1, v4/v5 from row 2.
"ld1 {v0.8b}, [x7], x6\n"
"ld1 {v1.8b}, [x7]\n"
"ld1 {v2.8b}, [x9], x6\n"
"ld1 {v3.8b}, [x9]\n"
"ld1 {v4.8b}, [x10], x6\n"
"ld1 {v5.8b}, [x10]\n"
// x12/x13 are now free and are reused (as w12/w13) to broadcast the
// quantization parameters:
//   v26 = input offset (16-bit lanes), v27 = output multiplier,
//   v28 = output offset (16-bit lanes), v29 = output shift,
//   v30 / v31 = activation min / max (low 8 byte lanes only, which is all
//   the 8-lane umax/umin below read),
//   v25 = filter offset (16-bit lanes).
"ldr w12, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
"ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
"dup v26.8h, w12\n"
"ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
"dup v27.4s, w13\n"
"ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
"dup v28.8h, w12\n"
"ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
"dup v29.4s, w13\n"
"ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v30.8b, w12\n"
"ldr w12, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
"dup v31.8b, w13\n"
"dup v25.8h, w12\n"
// Widen the 8-bit inputs/filters to 16 bits while adding their offsets,
// and load the 8 bias values that seed the accumulators v16/v17.
"uaddw v8.8h, v26.8h, v8.8b\n"
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"uaddw v9.8h, v26.8h, v9.8b\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"uaddw v10.8h, v26.8h, v10.8b\n"
"uaddw v11.8h, v26.8h, v11.8b\n"
"uaddw v12.8h, v26.8h, v12.8b\n"
"uaddw v13.8h, v26.8h, v13.8b\n"
"uaddw v0.8h, v25.8h, v0.8b\n"
"uaddw v1.8h, v25.8h, v1.8b\n"
"uaddw v2.8h, v25.8h, v2.8b\n"
"uaddw v3.8h, v25.8h, v3.8b\n"
"uaddw v4.8h, v25.8h, v4.8b\n"
"uaddw v5.8h, v25.8h, v5.8b\n"
// Fewer than 16 channels: skip the loop and finish the single group below.
"blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
// Main loop. The row pointers are rebuilt from the already advanced
// input_ptr/filter_ptr, then the previous group is accumulated while the
// next group of 8 channels is loaded.
DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n"
"mov x12, %[input_ptr]\n"
"subs x15, x15, #8\n"
"add x13, x12, x11\n"
// This compare (not the "subs" above) feeds the "bge" at the loop end.
"cmp x15, #16\n"
"add x14, x13, x11\n"
"add %[input_ptr], %[input_ptr], #8\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"mov x7, %[filter_ptr]\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"ld1 {v8.8b}, [x12], x6\n"
"smlal v16.4s, v1.4h, v9.4h\n"
"add x9, x7, x5\n"
"smlal2 v17.4s, v1.8h, v9.8h\n"
"add x10, x9, x5\n"
"ld1 {v9.8b}, [x12]\n"
"smlal v16.4s, v2.4h, v10.4h\n"
"add %[filter_ptr], %[filter_ptr], #8\n"
"smlal2 v17.4s, v2.8h, v10.8h\n"
"ld1 {v10.8b}, [x13], x6\n"
"smlal v16.4s, v3.4h, v11.4h\n"
"ld1 {v0.8b}, [x7], x6\n"
"smlal2 v17.4s, v3.8h, v11.8h\n"
"ld1 {v11.8b}, [x13]\n"
"smlal v16.4s, v4.4h, v12.4h\n"
"ld1 {v1.8b}, [x7]\n"
"smlal2 v17.4s, v4.8h, v12.8h\n"
"ld1 {v12.8b}, [x14], x6\n"
"smlal v16.4s, v5.4h, v13.4h\n"
"ld1 {v2.8b}, [x9], x6\n"
"smlal2 v17.4s, v5.8h, v13.8h\n"
"ld1 {v13.8b}, [x14]\n"
// Requantize (interleaved with the remaining filter loads): saturating
// rounding doubling multiply-high by the multiplier.
"sqrdmulh v16.4s, v16.4s, v27.4s\n"
"ld1 {v3.8b}, [x9]\n"
"sqrdmulh v17.4s, v17.4s, v27.4s\n"
"ld1 {v4.8b}, [x10], x6\n"
// Rounding fix-up: AND with the shift vector followed by an arithmetic
// shift right by 31 yields -1 in lanes where both the product and the shift
// amount are negative (0 elsewhere). Adding it before the rounding shift
// makes ties on negative values round away from zero.
"and v18.16b, v16.16b, v29.16b\n"
"ld1 {v5.8b}, [x10]\n"
"and v19.16b, v17.16b, v29.16b\n"
"sshr v18.4s, v18.4s, #31\n"
"sshr v19.4s, v19.4s, #31\n"
"sqadd v16.4s, v16.4s, v18.4s\n"
"sqadd v17.4s, v17.4s, v19.4s\n"
// Rounding shift by the signed per-lane amount in v29.
// NOTE(review): presumably v29 is negative (i.e. a right shift) -- confirm
// how the caller fills the field at OFFSET_OUTPUT_RIGHT_SHIFT.
"srshl v16.4s, v16.4s, v29.4s\n"
"srshl v17.4s, v17.4s, v29.4s\n"
// Narrow to 16 bits with saturation, add the output offset, narrow to
// unsigned 8 bits with saturation, clamp to [min, max], store 8 channels.
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqadd v16.8h, v16.8h, v28.8h\n"
"sqxtun v16.8b, v16.8h\n"
"umax v16.8b, v16.8b, v30.8b\n"
"umin v16.8b, v16.8b, v31.8b\n"
"uaddw v8.8h, v26.8h, v8.8b\n"
"st1 {v16.8b}, [%[output_ptr]], #8\n"
// Prepare the freshly loaded group: add offsets and reload the bias.
"uaddw v9.8h, v26.8h, v9.8b\n"
"uaddw v10.8h, v26.8h, v10.8b\n"
"uaddw v11.8h, v26.8h, v11.8b\n"
"uaddw v12.8h, v26.8h, v12.8b\n"
"uaddw v13.8h, v26.8h, v13.8b\n"
"uaddw v0.8h, v25.8h, v0.8b\n"
"uaddw v1.8h, v25.8h, v1.8b\n"
"uaddw v2.8h, v25.8h, v2.8b\n"
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"uaddw v3.8h, v25.8h, v3.8b\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"uaddw v4.8h, v25.8h, v4.8b\n"
"uaddw v5.8h, v25.8h, v5.8b\n"
// Keep looping while at least 16 channels remain.
"bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"
// Final group: same arithmetic as the loop body, but nothing further is
// loaded and the output pointer is not advanced by the store.
DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"smlal v16.4s, v1.4h, v9.4h\n"
"smlal2 v17.4s, v1.8h, v9.8h\n"
"smlal v16.4s, v2.4h, v10.4h\n"
"smlal2 v17.4s, v2.8h, v10.8h\n"
"smlal v16.4s, v3.4h, v11.4h\n"
"smlal2 v17.4s, v3.8h, v11.8h\n"
"smlal v16.4s, v4.4h, v12.4h\n"
"smlal2 v17.4s, v4.8h, v12.8h\n"
"smlal v16.4s, v5.4h, v13.4h\n"
"smlal2 v17.4s, v5.8h, v13.8h\n"
"sqrdmulh v16.4s, v16.4s, v27.4s\n"
"sqrdmulh v17.4s, v17.4s, v27.4s\n"
"and v18.16b, v16.16b, v29.16b\n"
"and v19.16b, v17.16b, v29.16b\n"
"sshr v18.4s, v18.4s, #31\n"
"sshr v19.4s, v19.4s, #31\n"
"sqadd v16.4s, v16.4s, v18.4s\n"
"sqadd v17.4s, v17.4s, v19.4s\n"
"srshl v16.4s, v16.4s, v29.4s\n"
"srshl v17.4s, v17.4s, v29.4s\n"
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqadd v16.8h, v16.8h, v28.8h\n"
"sqxtun v16.8b, v16.8h\n"
"umax v16.8b, v16.8b, v30.8b\n"
"umin v16.8b, v16.8b, v31.8b\n"
"st1 {v16.8b}, [%[output_ptr]]\n"
:
// Outputs: all four pointers are modified by the assembly.
[filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
[output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr)
:
// Inputs.
[params_ptr] "r"(params_ptr)
:
// Clobbers: flags, memory, and every vector/general register named above.
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v8", "v9", "v10", "v11", "v12",
"v13", "v16", "v17", "v18", "v19", "v25", "v26", "v27", "v28", "v29",
"v30", "v31",
"x5", "x6", "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP
}
};
// DepthwiseConvPartial specialization: kUpward rounding, EdgeType::kVertical,
// stride 1x1.
//
// Computes one output position, 8 channels per step, from six taps arranged
// as three rows of two positions: the two positions in a row are input_depth
// bytes apart, and consecutive rows are input_row_size apart. The filter is
// addressed the same way, using input_depth as the step within a row and
// filter_row_size between rows.
//
// The instruction stream matches the kAwayFromZero vertical kernel except
// for the requantization shift, which here is a single sqrshl with no sign
// fix-up beforehand.
//
// input_ptr, filter_ptr, output_ptr and bias_ptr are read-write asm operands
// and are advanced inside the assembly. The channel count is read from
// OFFSET_OUTPUT_DEPTH; at least one group of 8 channels is always processed.
// NOTE(review): presumably callers guarantee the depth is a positive multiple
// of 8 -- confirm against the call sites.
template <>
struct DepthwiseConvPartial<DepthwiseConvOutputRounding::kUpward,
EdgeType::kVertical, 1, 1> {
static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
const int32* bias_ptr, uint8* output_ptr,
const DepthwiseConvParams* params_ptr) {
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP …
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP …
asm volatile(
// x6 = step between adjacent positions, x11 = input row stride,
// x5 = filter row stride, x15 = remaining channel count.
// x12/x13/x14 walk input rows 0/1/2; x7/x9/x10 walk filter rows 0/1/2.
"ldr x6, [%[params_ptr], #" STR(OFFSET_INPUT_DEPTH) "]\n"
"mov x12, %[input_ptr]\n"
"ldr x11, [%[params_ptr], #" STR(OFFSET_INPUT_ROW_SIZE) "]\n"
"mov x7, %[filter_ptr]\n"
"ldr x5, [%[params_ptr], #" STR(OFFSET_FILTER_ROW_SIZE) "]\n"
"add x13, x12, x11\n"
"ldr x15, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
"add x14, x13, x11\n"
"ld1 {v8.8b}, [x12], x6\n"
"add x9, x7, x5\n"
"ld1 {v9.8b}, [x12]\n"
// The flags set here are consumed by the "blt" right before the loop; no
// instruction in between modifies them.
"cmp x15, #16\n"
"add x10, x9, x5\n"
"ld1 {v10.8b}, [x13], x6\n"
// Advance the base pointers to the next group of 8 channels.
"add %[input_ptr], %[input_ptr], #8\n"
"ld1 {v11.8b}, [x13]\n"
"add %[filter_ptr], %[filter_ptr], #8\n"
"ld1 {v12.8b}, [x14], x6\n"
"ld1 {v13.8b}, [x14]\n"
// Filter taps: v0/v1 from row 0, v2/v3 from row 1, v4/v5 from row 2.
"ld1 {v0.8b}, [x7], x6\n"
"ld1 {v1.8b}, [x7]\n"
"ld1 {v2.8b}, [x9], x6\n"
"ld1 {v3.8b}, [x9]\n"
"ld1 {v4.8b}, [x10], x6\n"
"ld1 {v5.8b}, [x10]\n"
// x12/x13 are now free and are reused (as w12/w13) to broadcast the
// quantization parameters:
//   v26 = input offset (16-bit lanes), v27 = output multiplier,
//   v28 = output offset (16-bit lanes), v29 = output shift,
//   v30 / v31 = activation min / max (low 8 byte lanes only, which is all
//   the 8-lane umax/umin below read),
//   v25 = filter offset (16-bit lanes).
"ldr w12, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
"ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
"dup v26.8h, w12\n"
"ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
"dup v27.4s, w13\n"
"ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
"dup v28.8h, w12\n"
"ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
"dup v29.4s, w13\n"
"ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v30.8b, w12\n"
"ldr w12, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
"dup v31.8b, w13\n"
"dup v25.8h, w12\n"
// Widen the 8-bit inputs/filters to 16 bits while adding their offsets,
// and load the 8 bias values that seed the accumulators v16/v17.
"uaddw v8.8h, v26.8h, v8.8b\n"
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"uaddw v9.8h, v26.8h, v9.8b\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"uaddw v10.8h, v26.8h, v10.8b\n"
"uaddw v11.8h, v26.8h, v11.8b\n"
"uaddw v12.8h, v26.8h, v12.8b\n"
"uaddw v13.8h, v26.8h, v13.8b\n"
"uaddw v0.8h, v25.8h, v0.8b\n"
"uaddw v1.8h, v25.8h, v1.8b\n"
"uaddw v2.8h, v25.8h, v2.8b\n"
"uaddw v3.8h, v25.8h, v3.8b\n"
"uaddw v4.8h, v25.8h, v4.8b\n"
"uaddw v5.8h, v25.8h, v5.8b\n"
// Fewer than 16 channels: skip the loop and finish the single group below.
"blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
// Main loop. The row pointers are rebuilt from the already advanced
// input_ptr/filter_ptr, then the previous group is accumulated while the
// next group of 8 channels is loaded.
DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n"
"mov x12, %[input_ptr]\n"
"subs x15, x15, #8\n"
"add x13, x12, x11\n"
// This compare (not the "subs" above) feeds the "bge" at the loop end.
"cmp x15, #16\n"
"add x14, x13, x11\n"
"add %[input_ptr], %[input_ptr], #8\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"mov x7, %[filter_ptr]\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"ld1 {v8.8b}, [x12], x6\n"
"smlal v16.4s, v1.4h, v9.4h\n"
"add x9, x7, x5\n"
"smlal2 v17.4s, v1.8h, v9.8h\n"
"add x10, x9, x5\n"
"ld1 {v9.8b}, [x12]\n"
"smlal v16.4s, v2.4h, v10.4h\n"
"add %[filter_ptr], %[filter_ptr], #8\n"
"smlal2 v17.4s, v2.8h, v10.8h\n"
"ld1 {v10.8b}, [x13], x6\n"
"smlal v16.4s, v3.4h, v11.4h\n"
"ld1 {v0.8b}, [x7], x6\n"
"smlal2 v17.4s, v3.8h, v11.8h\n"
"ld1 {v11.8b}, [x13]\n"
"smlal v16.4s, v4.4h, v12.4h\n"
"ld1 {v1.8b}, [x7]\n"
"smlal2 v17.4s, v4.8h, v12.8h\n"
"ld1 {v12.8b}, [x14], x6\n"
"smlal v16.4s, v5.4h, v13.4h\n"
"ld1 {v2.8b}, [x9], x6\n"
"smlal2 v17.4s, v5.8h, v13.8h\n"
"ld1 {v13.8b}, [x14]\n"
// Requantize (interleaved with the remaining filter loads): saturating
// rounding doubling multiply-high by the multiplier, then a saturating
// rounding shift by the signed per-lane amount in v29.
// NOTE(review): presumably v29 is negative (i.e. a right shift) -- confirm
// how the caller fills the field at OFFSET_OUTPUT_RIGHT_SHIFT.
"sqrdmulh v16.4s, v16.4s, v27.4s\n"
"ld1 {v3.8b}, [x9]\n"
"sqrdmulh v17.4s, v17.4s, v27.4s\n"
"ld1 {v4.8b}, [x10], x6\n"
"sqrshl v16.4s, v16.4s, v29.4s\n"
"ld1 {v5.8b}, [x10]\n"
"sqrshl v17.4s, v17.4s, v29.4s\n"
// Narrow to 16 bits with saturation, add the output offset, narrow to
// unsigned 8 bits with saturation, clamp to [min, max], store 8 channels.
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqadd v16.8h, v16.8h, v28.8h\n"
"sqxtun v16.8b, v16.8h\n"
"umax v16.8b, v16.8b, v30.8b\n"
"umin v16.8b, v16.8b, v31.8b\n"
"uaddw v8.8h, v26.8h, v8.8b\n"
"st1 {v16.8b}, [%[output_ptr]], #8\n"
// Prepare the freshly loaded group: add offsets and reload the bias.
"uaddw v9.8h, v26.8h, v9.8b\n"
"uaddw v10.8h, v26.8h, v10.8b\n"
"uaddw v11.8h, v26.8h, v11.8b\n"
"uaddw v12.8h, v26.8h, v12.8b\n"
"uaddw v13.8h, v26.8h, v13.8b\n"
"uaddw v0.8h, v25.8h, v0.8b\n"
"uaddw v1.8h, v25.8h, v1.8b\n"
"uaddw v2.8h, v25.8h, v2.8b\n"
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"uaddw v3.8h, v25.8h, v3.8b\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"uaddw v4.8h, v25.8h, v4.8b\n"
"uaddw v5.8h, v25.8h, v5.8b\n"
// Keep looping while at least 16 channels remain.
"bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"
// Final group: same arithmetic as the loop body, but nothing further is
// loaded and the output pointer is not advanced by the store.
DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"smlal v16.4s, v1.4h, v9.4h\n"
"smlal2 v17.4s, v1.8h, v9.8h\n"
"smlal v16.4s, v2.4h, v10.4h\n"
"smlal2 v17.4s, v2.8h, v10.8h\n"
"smlal v16.4s, v3.4h, v11.4h\n"
"smlal2 v17.4s, v3.8h, v11.8h\n"
"smlal v16.4s, v4.4h, v12.4h\n"
"smlal2 v17.4s, v4.8h, v12.8h\n"
"smlal v16.4s, v5.4h, v13.4h\n"
"smlal2 v17.4s, v5.8h, v13.8h\n"
"sqrdmulh v16.4s, v16.4s, v27.4s\n"
"sqrdmulh v17.4s, v17.4s, v27.4s\n"
"sqrshl v16.4s, v16.4s, v29.4s\n"
"sqrshl v17.4s, v17.4s, v29.4s\n"
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqadd v16.8h, v16.8h, v28.8h\n"
"sqxtun v16.8b, v16.8h\n"
"umax v16.8b, v16.8b, v30.8b\n"
"umin v16.8b, v16.8b, v31.8b\n"
"st1 {v16.8b}, [%[output_ptr]]\n"
:
// Outputs: all four pointers are modified by the assembly.
[filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
[output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr)
:
// Inputs.
[params_ptr] "r"(params_ptr)
:
// Clobbers. v18/v19 are listed although this variant never writes them.
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v8", "v9", "v10", "v11", "v12",
"v13", "v16", "v17", "v18", "v19", "v25", "v26", "v27", "v28", "v29",
"v30", "v31",
"x5", "x6", "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP
}
};
// Retire the DepthwiseConvParams field-offset macros used by the inline
// assembly above.
// NOTE(review): OFFSET_FILTER_ROW_SIZE, OFFSET_STRIDE_WIDTH and
// OFFSET_STRIDE_HEIGHT are defined alongside these at the top of the file but
// are not undefined here -- confirm whether that is intentional.
#undef OFFSET_INPUT_DEPTH
#undef OFFSET_INPUT_ROW_SIZE
#undef OFFSET_OUTPUT_DEPTH
#undef OFFSET_OUTPUT_ROW_SIZE
#undef OFFSET_INPUT_OFFSET
#undef OFFSET_OUTPUT_OFFSET
#undef OFFSET_FILTER_OFFSET
#undef OFFSET_OUTPUT_MULTIPLIER
#undef OFFSET_OUTPUT_ACTIVATION_MIN
#undef OFFSET_OUTPUT_ACTIVATION_MAX
#undef OFFSET_OUTPUT_RIGHT_SHIFT
#undef OFFSET_INPUT_WIDTH
#undef OFFSET_INPUT_HEIGHT
#undef OFFSET_OUTPUT_WIDTH
#undef OFFSET_OUTPUT_HEIGHT
// Runs the 8-channel DepthwiseConvWindow kernel over a depth range.
//
// Starting at start_depth, calls DepthwiseConvWindow<..., 8, ...>::Run once
// per complete group of 8 channels that fits below end_depth, advancing the
// input, filter, output and bias pointers by 8 elements after each call. Any
// trailing channels that do not fill a group of 8 are not processed here.
//
// input_depth, input_row_size, output_window_height and output_window_width
// are forwarded unchanged; params is forwarded by address because the window
// kernel takes a pointer.
//
// The function is marked noinline.
template <DepthwiseConvOutputRounding output_rounding, int32 kStrideWidth,
int32 kStrideHeight>
struct DepthwiseConvThroughDepth {
static void __attribute__((noinline))
Run(const uint8* input_ptr, const uint8* filter_ptr, const int32* bias_ptr,
uint8* output_ptr, int64_t start_depth, int64_t end_depth,
int64_t input_depth, int64_t input_row_size, int32 output_window_height,
int32 output_window_width, const DepthwiseConvParams& params) {
for (; start_depth <= end_depth - 8; start_depth += 8) {
DepthwiseConvWindow<output_rounding, 8, kStrideWidth, kStrideHeight>::Run(
input_ptr, filter_ptr, bias_ptr, output_ptr, input_depth,
input_row_size, output_window_height, output_window_width, &params);
// Step every stream to the next group of 8 channels.
input_ptr += 8;
output_ptr += 8;
filter_ptr += 8;
bias_ptr += 8;
}
}
};
// Produces shuffle_params.output_height output rows for the output columns
// [start_x, end_x).
//
// When the depth exceeds 64 or the input is wider than 150, the columns are
// processed in tiles of shuffle_params.output_width. For each tile, every full
// group of 64 channels is prefetched, copied into `shuffle_workspace` by
// ShuffleInput and convolved from there; the channels left after the last full
// group are convolved straight from the input. Columns not covered by a full
// tile (or all columns, when the tiled path is not taken) are convolved
// straight from the input in one call.
template <DepthwiseConvOutputRounding output_rounding, int32 kStrideWidth,
          int32 kStrideHeight>
struct DepthwiseConvMultiRow {
  using ConvKernel =
      DepthwiseConvThroughDepth<output_rounding, kStrideWidth, kStrideHeight>;

  static inline void Run(const uint8* input_data, int32 start_x, int32 end_x,
                         const uint8* filter_data, const int32* bias_data,
                         uint8* output_data, const DepthwiseConvParams& params,
                         const ShuffleParams& shuffle_params,
                         uint8* shuffle_workspace) {
    TFLITE_DCHECK(
        shuffle_params.input_height ==
        get_shuffle_input_size(kStrideHeight, shuffle_params.output_height));
    TFLITE_DCHECK(
        shuffle_params.input_width ==
        get_shuffle_input_size(kStrideWidth, shuffle_params.output_width));
    TFLITE_DCHECK_LE(
        64 * shuffle_params.input_width * shuffle_params.input_height,
        kDepthwiseConvScratchWorkspaceSize);

    // Issues one prefetch per pixel of the shuffle-sized input tile whose
    // first pixel is at `tile_origin`.
    const auto prefetch_tile = [&params,
                                &shuffle_params](const uint8* tile_origin) {
      const uint8* row_start = tile_origin;
      for (int32 row = 0; row < shuffle_params.input_height; ++row) {
        const uint8* pixel = row_start;
        for (int32 col = 0; col < shuffle_params.input_width; ++col) {
          asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(pixel) :);
          pixel += params.input_depth;
        }
        row_start += params.input_row_size;
      }
    };

    int32 out_x = start_x;

    // Same decision as "depth > 64, or depth <= 64 with width > 150".
    const bool use_shuffle =
        params.output_depth > 64 || params.input_width > 150;
    if (use_shuffle) {
      // Row stride inside the workspace: 64 channels per pixel.
      const int64_t shuffle_row_size = 64 * shuffle_params.input_width;

      while (out_x <= (end_x - shuffle_params.output_width)) {
        const uint8* tile_input = input_data;
        const uint8* tile_filter = filter_data;
        const int32* tile_bias = bias_data;
        uint8* tile_output = output_data;

        // Full groups of 64 channels go through the shuffle workspace.
        int64_t depth = 0;
        while (depth <= params.output_depth - 64) {
          prefetch_tile(tile_input);
          ShuffleInput(tile_input, params.input_depth, params.input_width,
                       params.input_height, 64, shuffle_params.input_width,
                       shuffle_params.input_height, shuffle_workspace);
          ConvKernel::Run(shuffle_workspace, tile_filter, tile_bias,
                          tile_output, 0, 64, 64, shuffle_row_size,
                          shuffle_params.output_height,
                          shuffle_params.output_width, params);
          tile_input += 64;
          tile_output += 64;
          tile_filter += 64;
          tile_bias += 64;
          depth += 64;
        }

        // Remaining channels are read directly from the input tensor.
        prefetch_tile(tile_input);
        ConvKernel::Run(tile_input, tile_filter, tile_bias, tile_output, depth,
                        params.output_depth, params.input_depth,
                        params.input_row_size, shuffle_params.output_height,
                        shuffle_params.output_width, params);

        // Step to the next tile of output columns.
        input_data +=
            shuffle_params.output_width * kStrideWidth * params.input_depth;
        output_data += shuffle_params.output_width * params.output_depth;
        out_x += shuffle_params.output_width;
      }
    }

    // Whatever columns remain are handled without shuffling.
    const int32 output_leftover_width = end_x - out_x;
    if (output_leftover_width > 0) {
      ConvKernel::Run(input_data, filter_data, bias_data, output_data, 0,
                      params.output_depth, params.input_depth,
                      params.input_row_size, shuffle_params.output_height,
                      output_leftover_width, params);
    }
  }
};
template <DepthwiseConvOutputRounding output_rounding>
inline void DepthwiseConvHandlePadding(const uint8* input_data,
const uint8* filter_data,
const int32* bias_data,
uint8* output_data,
const DepthwiseConvParams& params) {
if (params.input_width == 1 && params.input_height == 1) {
const uint8* filter_ptr =
filter_data + params.filter_row_size + params.output_depth;
DepthwiseConvPartial<output_rounding, EdgeType::kCenter, 1, 1>::Run(
input_data, filter_ptr, bias_data, output_data, ¶ms);
return;
}
const int32 out_x_start_corner = 0;
const int32 out_x_end_corner = params.output_width - 1;
const int32 out_y_start_corner = 0;
const int32 out_y_end_corner = params.output_height - 1;
const uint8* input_ptr = input_data;
const uint8* filter_ptr =
filter_data + params.filter_row_size + params.output_depth;
uint8* output_ptr = output_data;
DepthwiseConvPartial<output_rounding, EdgeType::kCorner, 1, 1>::Run(
input_ptr, filter_ptr, bias_data, output_ptr, ¶ms);
input_ptr += (params.stride_width - 1) * params.input_depth;
filter_ptr = filter_data + params.filter_row_size;
output_ptr += params.output_depth;
for (int32 out_x = out_x_start_corner + 1; out_x < out_x_end_corner;
out_x++) {
DepthwiseConvPartial<output_rounding, EdgeType::kHorizontal, 1, 1>::Run(
input_ptr, filter_ptr, bias_data, output_ptr, ¶ms);
input_ptr += params.stride_width * params.input_depth;
output_ptr += params.output_depth;
}
DepthwiseConvPartial<output_rounding, EdgeType::kCorner, 1, 1>::Run(
input_ptr, filter_ptr, bias_data, output_ptr, ¶ms);
input_ptr = input_data + (params.stride_width - 1) * params.input_row_size;
filter_ptr = filter_data + params.input_depth;
output_ptr = output_data + params.output_row_size;
for (int32 out_y = out_y_start_corner + 1; out_y < out_y_end_corner;
out_y++) {
DepthwiseConvPartial<output_rounding, EdgeType::kVertical, 1, 1>::Run(
input_ptr, filter_ptr, bias_data, output_ptr, ¶ms);
input_ptr += params.stride_width * params.input_row_size;
output_ptr += params.output_row_size;
}
input_ptr = input_data + (params.input_width - 2) * params.input_depth +
(params.stride_width - 1) * params.input_row_size;
filter_ptr = filter_data;
output_ptr = output_data + params.output_row_size +
(params.output_width - 1) * params.output_depth;
for (int32 out_y = out_y_start_corner + 1; out_y < out_y_end_corner;
out_y++) {
DepthwiseConvPartial<output_rounding, EdgeType::kVertical, 1, 1>::Run(
input_ptr, filter_ptr, bias_data, output_ptr, ¶ms);
input_ptr += params.stride_width * params.input_row_size;
output_ptr += params.output_row_size;
}
input_ptr = input_data + (params.input_height - 2) * params.input_row_size;
filter_ptr = filter_data + params.output_depth;
output_ptr =
output_data + (params.output_height - 1) * params.output_row_size;
DepthwiseConvPartial<output_rounding, EdgeType::kCorner, 1, 1>::Run(
input_ptr, filter_ptr, bias_data, output_ptr, ¶ms);
input_ptr += (params.stride_width == 1) ? 0 : params.input_depth;
filter_ptr = filter_data;
output_ptr += params.output_depth;
for (int32 out_x = out_x_start_corner + 1; out_x < out_x_end_corner;
out_x++) {
DepthwiseConvPartial<output_rounding, EdgeType::kHorizontal, 1, 1>::Run(
input_ptr, filter_ptr, bias_data, output_ptr, ¶ms);
input_ptr += params.stride_width * params.input_depth;
output_ptr += params.output_depth;
}
DepthwiseConvPartial<output_rounding, EdgeType::kCorner, 1, 1>::Run(
input_ptr, filter_ptr, bias_data, output_ptr, ¶ms);
}
// Entry point of the uint8 3x3 depthwise convolution.
//
// Copies the runtime parameters and tensor shapes into a DepthwiseConvParams
// block, picks the stride-1 or stride-2 multi-row kernel, and then, for every
// batch in range, convolves the output rows in groups of 8, 4, 2 and finally
// 1 row. With a padding of 1 the border outputs are produced first by
// DepthwiseConvHandlePadding and the remaining extents shrink by one element
// on every side.
//
//   thread_dim == 0: [thread_start, thread_end) selects batches.
//   thread_dim == 1: [thread_start, thread_end) selects output rows.
//   bias_shape is not read by this function.
template <DepthwiseConvOutputRounding output_rounding>
inline void DepthwiseConv3x3Filter(
    const DepthwiseParams& rt_params, const RuntimeShape& input_shape,
    const uint8* input_data, const RuntimeShape& filter_shape,
    const uint8* filter_data, const RuntimeShape& bias_shape,
    const int32* bias_data, const RuntimeShape& output_shape,
    uint8* output_data, int thread_start, int thread_end, int thread_dim) {
  // Snapshot of the runtime parameters.
  const int32 stride_width = rt_params.stride_width;
  const int32 stride_height = rt_params.stride_height;
  const int32 pad_width = rt_params.padding_values.width;
  const int32 pad_height = rt_params.padding_values.height;
  const int32 depth_multiplier = rt_params.depth_multiplier;
  const int32 output_activation_min = rt_params.quantized_activation_min;
  const int32 output_activation_max = rt_params.quantized_activation_max;
  const int32 input_offset = rt_params.input_offset;
  const int32 filter_offset = rt_params.weights_offset;
  const int32 output_offset = rt_params.output_offset;
  const int32 output_multiplier = rt_params.output_multiplier;
  const int32 output_shift = rt_params.output_shift;

  // Parameter block handed to the kernels.
  DepthwiseConvParams params;
  params.input_depth = input_shape.Dims(3);
  params.input_width = input_shape.Dims(2);
  params.input_height = input_shape.Dims(1);
  params.input_row_size = params.input_depth * params.input_width;
  params.input_offset = input_offset;
  params.stride_width = stride_width;
  params.stride_height = stride_height;
  params.output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
  params.output_width = output_shape.Dims(2);
  params.output_height = output_shape.Dims(1);
  params.output_row_size = params.output_depth * params.output_width;
  params.output_offset = output_offset;
  params.filter_offset = filter_offset;
  params.output_multiplier = output_multiplier;
  params.output_right_shift = output_shift;
  params.output_activation_min = output_activation_min;
  params.output_activation_max = output_activation_max;

  const int32 filter_height = filter_shape.Dims(1);
  const int32 filter_width = filter_shape.Dims(2);
  params.filter_row_size = params.output_depth * filter_width;

  // Preconditions of this specialised implementation.
  TFLITE_DCHECK(params.output_depth == params.input_depth * depth_multiplier);
  TFLITE_DCHECK(depth_multiplier == 1);
  TFLITE_DCHECK(filter_height == 3);
  TFLITE_DCHECK(filter_width == 3);
  TFLITE_DCHECK(stride_height == 1 || stride_height == 2);
  TFLITE_DCHECK(stride_width == 1 || stride_width == 2);
  TFLITE_DCHECK(stride_width == stride_height);
  TFLITE_DCHECK(pad_height == 0 || pad_height == 1);
  TFLITE_DCHECK(pad_width == 0 || pad_width == 1);
  TFLITE_DCHECK(pad_width == pad_height);
  TFLITE_DCHECK(thread_dim == 0 || thread_dim == 1);

  const int32 batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int64_t input_batch_size = params.input_row_size * params.input_height;
  const int64_t output_batch_size =
      params.output_row_size * params.output_height;

  // Shuffle tile shapes for 1, 2, 4 and 8 output rows; one set per stride.
  const bool unit_stride = (stride_width == 1);
  const ShuffleParams one_row_shuffle_params =
      unit_stride ? ShuffleParams(30, 1, 1, 1) : ShuffleParams(14, 1, 2, 2);
  const ShuffleParams two_row_shuffle_params =
      unit_stride ? ShuffleParams(22, 2, 1, 1) : ShuffleParams(8, 2, 2, 2);
  const ShuffleParams four_row_shuffle_params =
      unit_stride ? ShuffleParams(14, 4, 1, 1) : ShuffleParams(4, 4, 2, 2);
  const ShuffleParams eight_row_shuffle_params =
      unit_stride ? ShuffleParams(8, 8, 1, 1) : ShuffleParams(2, 8, 2, 2);

  // Stride-2 kernel only when requested; stride-1 kernel in every other case.
  using conv_multirow_func_t =
      decltype(&DepthwiseConvMultiRow<output_rounding, 1, 1>::Run);
  const conv_multirow_func_t conv_multirow_func =
      (stride_width == 2) ? &DepthwiseConvMultiRow<output_rounding, 2, 2>::Run
                          : &DepthwiseConvMultiRow<output_rounding, 1, 1>::Run;

  // Scratch space for shuffled input, kept on the stack.
  uint8 shuffle_workspace[kDepthwiseConvScratchWorkspaceSize];

  // Work range of this call: all batches and rows unless the thread
  // parameters narrow one of the two dimensions.
  int batch_start = 0;
  int batch_end = batches;
  int row_start = 0;
  int row_end = params.output_height;
  if (thread_dim == 0) {
    TFLITE_DCHECK_GE(thread_start, 0);
    TFLITE_DCHECK_LE(thread_end, batches);
    batch_start = thread_start;
    batch_end = thread_end;
  } else if (thread_dim == 1) {
    TFLITE_DCHECK_GE(thread_start, 0);
    TFLITE_DCHECK_LE(thread_end, params.output_height);
    row_start = thread_start;
    row_end = thread_end;
  }

  const bool padded = (pad_width == 1 && pad_height == 1);

  for (int32 b = batch_start; b < batch_end; ++b) {
    // Both pointers start at the beginning of batch `b`.
    const uint8* input_ptr = input_data + b * input_batch_size;
    uint8* output_ptr = output_data + b * output_batch_size;

    int32 out_x = 0;
    int32 out_y = row_start;
    int32 end_x = params.output_width;
    int32 end_y = row_end;

    if (padded) {
      DepthwiseConvHandlePadding<output_rounding>(
          input_ptr, filter_data, bias_data, output_ptr, params);
      // The border is done; restrict the remaining work to the interior.
      out_x = 1;
      end_x = params.output_width - 1;
      out_y = std::max(1, out_y);
      end_y = std::min(params.output_height - 1, end_y);
    }

    // Move both pointers to the first element that is still to be computed.
    const int in_x = (out_x * stride_width) - pad_width;
    const int in_y = (out_y * stride_height) - pad_height;
    input_ptr += in_y * params.input_row_size + in_x * params.input_depth;
    output_ptr += out_y * params.output_row_size + out_x * params.output_depth;

    // Convolves as many complete groups of `rows` output rows as still fit
    // before end_y, advancing out_y and both pointers after each group.
    const auto run_row_groups = [&](int32 rows,
                                    const ShuffleParams& group_shuffle) {
      for (; out_y <= end_y - rows; out_y += rows) {
        conv_multirow_func(input_ptr, out_x, end_x, filter_data, bias_data,
                           output_ptr, params, group_shuffle,
                           shuffle_workspace);
        input_ptr += rows * stride_height * params.input_row_size;
        output_ptr += rows * params.output_row_size;
      }
    };

    // The 8- and 4-row kernels are used only for sufficiently narrow inputs.
    if (params.input_width < four_row_shuffle_params.input_width) {
      run_row_groups(8, eight_row_shuffle_params);
    }
    if (params.input_width < two_row_shuffle_params.input_width) {
      run_row_groups(4, four_row_shuffle_params);
    }
    run_row_groups(2, two_row_shuffle_params);
    run_row_groups(1, one_row_shuffle_params);
  }
}
#endif
template <DepthwiseConvImplementation implementation>
struct WorkspacePrefetchWrite { … };
#if defined(__aarch64__)
template <>
struct WorkspacePrefetchWrite<
DepthwiseConvImplementation::kUseNeon3x3DotProduct> {
static void __attribute__((noinline))
Run(int8 fill_data, int size, int8* workspace) {
const int8x8_t fill_data_vec_int8 = vdup_n_s8(fill_data);
const uint32x2_t fill_data_vec = vreinterpret_u32_s8(fill_data_vec_int8);
for (int i = 0; i < (size - 15); i += 64) {
int8* ptr = workspace + i;
asm volatile("prfm pstl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
vst1_lane_u32(reinterpret_cast<uint32_t*>(ptr), fill_data_vec, 0);
}
vst1_lane_u32(reinterpret_cast<uint32_t*>(workspace + size - 4),
fill_data_vec, 0);
}
};
#endif
#if defined(__aarch64__) && !defined(GOOGLE_L4T) && defined(__ANDROID__) && \
defined(__clang__)
// Filter/bias preprocessing for the dot-product kernels, uint8 variant.
//
// For each of `depth_micro_repeats` groups of 8 channels the asm loop:
//   * loads nine 8-byte filter rows, `output_depth` bytes apart, starting at
//     filter_data + 8 * group;
//   * interleaves them in groups of three (the fourth slot is padding) and
//     XORs every byte with 0x80, then writes the resulting 96 bytes to
//     shuffled_filter_data;
//   * sums the re-biased filter bytes per channel (SDOT against all-ones) and
//     writes 8 adjusted biases: bias + sum * v1.
// v1 holds (second word loaded by the initial LDP) + 128 — presumably the
// input offset stored right after bias_increment; confirm against the
// DepthwiseConvDotProdParams layout.
template <>
struct ProcessPerDepth<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
                       QuantizationType::kNonPerChannelUint8> {
  static inline void ProcessPerDepthNeon(
      const uint8* filter_data, const int32* bias_data,
      int8* shuffled_filter_data, int32* adjusted_bias_data,
      const DepthwiseConvDotProdParams* function_params) {
    // Numeric local labels; the asm refers to them with "f"/"b" suffixes.
#define DC_PER_DEPTH_1 "1"
#define DC_PER_DEPTH_2 "2"
    asm volatile(
        // w12 = bias_increment, w11 = the word that follows it.
        "ldp w12, w11, [%[function_params], #" STR(DP_OFFSET_BIAS_INCREMENT) "]\n"
        // x9 = stride between filter rows, w10 = loop bound, x8 = counter.
        "ldrsw x9, [%[function_params], #" STR(DP_OFFSET_OUTPUT_DEPTH) "]\n"
        "ldr w10, [%[function_params], #" STR(DP_OFFSET_DEPTH_MICRO_REPEATS) "]\n"
        "mov x8, xzr\n"
        "add w11, w11, #128\n"
        "sxtw x12, w12\n"
        // v0 = 0x80 in every byte: XOR mask and padding value.
        "movi v0.16b, #128\n"
        "dup v1.4s, w11\n"
        // x11 = byte advance of bias_data per iteration (bias_increment * 8),
        // x12 = byte offset of the second bias quad (bias_increment * 4).
        "lsl x11, x12, #3\n"
        "lsl x12, x12, #2\n"
        // v2 = all ones, so SDOT against it adds up groups of four bytes.
        "movi v2.16b, #1\n"
        "b " DC_PER_DEPTH_2 "f\n"
      DC_PER_DEPTH_1 ":\n"
        "add x13, %[filter_data], x8, lsl #3\n"
        "ld1 { v19.d }[0], [x13], x9\n"
        // v20/v21 accumulate the per-channel filter sums.
        "movi v21.16b, #0\n"
        "movi v20.16b, #0\n"
        "add x8, x8, #1\n"
        "ld1 { v18.d }[0], [x13], x9\n"
        "ld1 { v17.d }[0], [x13], x9\n"
        "zip1 v22.16b, v19.16b, v18.16b\n"
        "eor v22.16b, v22.16b, v0.16b\n"
        "ld1 { v16.d }[0], [x13], x9\n"
        "zip1 v23.16b, v17.16b, v0.16b\n"
        "eor v23.16b, v23.16b, v0.16b\n"
        "zip1 v24.8h, v22.8h, v23.8h\n"
        "ld1 { v7.d }[0], [x13], x9\n"
        "zip2 v22.8h, v22.8h, v23.8h\n"
        ".word 0x4e8296d5 // sdot v21.4s, v22.16b, v2.16b\n"
        ".word 0x4e829714 // sdot v20.4s, v24.16b, v2.16b\n"
        "ld1 { v6.d }[0], [x13], x9\n"
        "zip1 v23.16b, v16.16b, v7.16b\n"
        "eor v23.16b, v23.16b, v0.16b\n"
        "ld1 { v5.d }[0], [x13], x9\n"
        "zip1 v25.16b, v6.16b, v0.16b\n"
        "eor v25.16b, v25.16b, v0.16b\n"
        "zip1 v26.8h, v23.8h, v25.8h\n"
        "ld1 { v4.d }[0], [x13], x9\n"
        "zip2 v23.8h, v23.8h, v25.8h\n"
        ".word 0x4e8296f5 // sdot v21.4s, v23.16b, v2.16b\n"
        ".word 0x4e829754 // sdot v20.4s, v26.16b, v2.16b\n"
        "ld1 { v3.d }[0], [x13]\n"
        "zip1 v25.16b, v5.16b, v4.16b\n"
        "stp q26, q23, [%[shuffled_filter_data], #32]\n"
        "stp q24, q22, [%[shuffled_filter_data]]\n"
        "zip1 v23.16b, v3.16b, v0.16b\n"
        "eor v22.16b, v25.16b, v0.16b\n"
        "eor v23.16b, v23.16b, v0.16b\n"
        "zip1 v24.8h, v22.8h, v23.8h\n"
        "zip2 v22.8h, v22.8h, v23.8h\n"
        "stp q24, q22, [%[shuffled_filter_data], #64]\n"
        ".word 0x4e8296d5 // sdot v21.4s, v22.16b, v2.16b\n"
        "ldr q22, [%[bias_data]]\n"
        "ldr q23, [%[bias_data], x12]\n"
        ".word 0x4e829714 // sdot v20.4s, v24.16b, v2.16b\n"
        // Advance through the operand itself: the pointer is not guaranteed
        // to live in any particular physical register.
        "add %[shuffled_filter_data], %[shuffled_filter_data], #96\n"
        "mla v22.4s, v20.4s, v1.4s\n"
        "mla v23.4s, v21.4s, v1.4s\n"
        "add %[bias_data], %[bias_data], x11\n"
        "stp q22, q23, [%[adjusted_bias_data]], #32\n"
      DC_PER_DEPTH_2 ":\n"
        "cmp w8, w10\n"
        "b.lt " DC_PER_DEPTH_1 "b\n"
        :
        // Outputs.
        [ filter_data ] "+r"(filter_data),
        [ bias_data ] "+r"(bias_data),
        [ shuffled_filter_data ] "+r"(shuffled_filter_data),
        [ adjusted_bias_data ] "+r"(adjusted_bias_data)
        :
        // Inputs.
        [ function_params ] "r"(function_params)
        :
        // Clobbers.
        "cc", "memory",
        "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
        "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
        "x8", "x9", "x10", "x11", "x12", "x13");
#undef DC_PER_DEPTH_1
#undef DC_PER_DEPTH_2
  }

  // Out-of-line entry point; forwards all arguments to ProcessPerDepthNeon.
  static void __attribute__((noinline))
  Run(const uint8* filter_data, const int32* bias_data,
      int8* shuffled_filter_data, int32* adjusted_bias_data,
      const DepthwiseConvDotProdParams* function_params) {
    ProcessPerDepthNeon(filter_data, bias_data, shuffled_filter_data,
                        adjusted_bias_data, function_params);
  }
};
// Filter/bias preprocessing for the dot-product kernels, per-channel int8
// variant.
//
// Does nothing when depth_micro_repeats < 1. Otherwise, for each group of 8
// channels the asm loop:
//   * loads nine 8-byte filter rows, `output_depth` bytes apart;
//   * interleaves them in groups of three with a zero byte in the fourth
//     slot and writes the resulting 96 bytes to shuffled_filter_data;
//   * sums the filter bytes per channel (SDOT against all-ones) and writes
//     8 adjusted biases: bias + sum * v1, where v1 is the 32-bit word at
//     DP_OFFSET_INPUT_OFFSET broadcast to all lanes.
template <>
struct ProcessPerDepth<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
                       QuantizationType::kPerChannelInt8> {
  static inline void ProcessPerDepthNeon(
      const int8* filter_data, const int32* bias_data,
      int8* shuffled_filter_data, int32* adjusted_bias_data,
      const DepthwiseConvDotProdParams* function_params) {
    // Numeric local labels; the asm refers to them with "f"/"b" suffixes.
#define DC_PER_DEPTH_1 "1"
#define DC_PER_DEPTH_2 "2"
#define DC_PER_DEPTH_3 "3"
    asm volatile(
        // w8 = remaining iterations; skip everything when it is < 1.
        "ldr w8, [%[function_params], #" STR(DP_OFFSET_DEPTH_MICRO_REPEATS) "]\n"
        "cmp w8, #1\n"
        "b.lt " DC_PER_DEPTH_3 "f\n"
        "add x10, %[function_params], #" STR(DP_OFFSET_INPUT_OFFSET) "\n"
        "ldrsw x11, [%[function_params], #" STR(DP_OFFSET_BIAS_INCREMENT) "]\n"
        // x9 = stride between filter rows.
        "ldrsw x9, [%[function_params], #" STR(DP_OFFSET_OUTPUT_DEPTH) "]\n"
        "ld1r { v1.4s }, [x10]\n"
        // v0 = zero padding for the fourth slot of each group.
        "movi v0.16b, #0\n"
        // x10 = byte offset of the second bias quad (bias_increment * 4),
        // x11 = byte advance of bias_data per iteration (bias_increment * 8).
        "lsl x10, x11, #2\n"
        "lsl x11, x11, #3\n"
        // v2 = all ones, so SDOT against it adds up groups of four bytes.
        "movi v2.16b, #1\n"
        "mov x12, %[filter_data]\n"
      DC_PER_DEPTH_2 ":\n"
        "add x13, %[filter_data], x9\n"
        "ld1 { v3.d }[0], [x12], #8\n"
        "ld1 { v4.d }[0], [x13], x9\n"
        // v20/v21 accumulate the per-channel filter sums.
        "movi v21.16b, #0\n"
        "movi v20.16b, #0\n"
        // Sets the flags consumed by the b.ne at the bottom of the loop.
        "subs w8, w8, #1\n"
        "ld1 { v5.d }[0], [x13], x9\n"
        "zip1 v22.16b, v3.16b, v4.16b\n"
        "mov %[filter_data], x12\n"
        "ld1 { v6.d }[0], [x13], x9\n"
        "zip1 v23.16b, v5.16b, v0.16b\n"
        "zip1 v24.8h, v22.8h, v23.8h\n"
        "zip2 v22.8h, v22.8h, v23.8h\n"
        "ld1 { v7.d }[0], [x13], x9\n"
        ".word 0x4e8296d5 // sdot v21.4s, v22.16b, v2.16b\n"
        ".word 0x4e829714 // sdot v20.4s, v24.16b, v2.16b\n"
        "ld1 { v16.d }[0], [x13], x9\n"
        "zip1 v23.16b, v6.16b, v7.16b\n"
        "ld1 { v17.d }[0], [x13], x9\n"
        "zip1 v25.16b, v16.16b, v0.16b\n"
        "zip1 v26.8h, v23.8h, v25.8h\n"
        "zip2 v23.8h, v23.8h, v25.8h\n"
        "ld1 { v18.d }[0], [x13], x9\n"
        ".word 0x4e8296f5 // sdot v21.4s, v23.16b, v2.16b\n"
        ".word 0x4e829754 // sdot v20.4s, v26.16b, v2.16b\n"
        "ld1 { v19.d }[0], [x13]\n"
        "zip1 v25.16b, v17.16b, v18.16b\n"
        "stp q24, q22, [%[shuffled_filter_data]]\n"
        "stp q26, q23, [%[shuffled_filter_data], #32]\n"
        "zip1 v22.16b, v19.16b, v0.16b\n"
        "zip1 v23.8h, v25.8h, v22.8h\n"
        "zip2 v22.8h, v25.8h, v22.8h\n"
        "stp q23, q22, [%[shuffled_filter_data], #64]\n"
        ".word 0x4e8296f4 // sdot v20.4s, v23.16b, v2.16b\n"
        ".word 0x4e8296d5 // sdot v21.4s, v22.16b, v2.16b\n"
        "ldr q22, [%[bias_data]]\n"
        "ldr q23, [%[bias_data], x10]\n"
        // Advance through the operands themselves: the pointers are not
        // guaranteed to live in any particular physical register.
        "add %[shuffled_filter_data], %[shuffled_filter_data], #96\n"
        "add %[bias_data], %[bias_data], x11\n"
        "mla v22.4s, v20.4s, v1.4s\n"
        "mla v23.4s, v21.4s, v1.4s\n"
        "stp q22, q23, [%[adjusted_bias_data]], #32\n"
        "b.ne " DC_PER_DEPTH_2 "b\n"
      DC_PER_DEPTH_3 ":\n"
        :
        // Outputs.
        [ filter_data ] "+r"(filter_data),
        [ bias_data ] "+r"(bias_data),
        [ shuffled_filter_data ] "+r"(shuffled_filter_data),
        [ adjusted_bias_data ] "+r"(adjusted_bias_data)
        :
        // Inputs.
        [ function_params ] "r"(function_params)
        :
        // Clobbers.
        "cc", "memory",
        "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
        "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
        "x8", "x9", "x10", "x11", "x12", "x13");
#undef DC_PER_DEPTH_1
#undef DC_PER_DEPTH_2
#undef DC_PER_DEPTH_3
  }

  // Out-of-line entry point; forwards all arguments to ProcessPerDepthNeon.
  static void __attribute__((noinline))
  Run(const int8* filter_data, const int32* bias_data,
      int8* shuffled_filter_data, int32* adjusted_bias_data,
      const DepthwiseConvDotProdParams* function_params) {
    ProcessPerDepthNeon(filter_data, bias_data, shuffled_filter_data,
                        adjusted_bias_data, function_params);
  }
};
// Packs one macro block of input into the scratch workspace for the
// dot-product kernels; specialisation for no depth multiplication and a
// maximum padding of 0 (all four padding fields are DCHECKed to be zero).
//
// The block is walked row by row. Within a row, the width is consumed four
// pixels at a time and the depth eight channels at a time; every such
// 4-pixel x 8-channel micro block is written as 32 bytes of scratch.
template <QuantizationType quantization_type>
struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
quantization_type,
DepthwiseConvDepthMultiplication::kNoMultiplication,
0> {
// Performs the packing described above.
//   input_block_data: first input element of the block.
//   scratch_block_data: destination; block_height * workspace_height_stride
//     bytes are traversed (checked by the final DCHECK).
//   function_params: strides, repeat counts and residual width.
static inline void PackMacroBlockNeon(
const typename QuantizationTypeImpl<quantization_type>::ExternalType*
input_block_data,
int8* scratch_block_data,
const DepthwiseConvDotProdParams* function_params) {
TFLITE_DCHECK_EQ(function_params->padding_bottom, 0);
TFLITE_DCHECK_EQ(function_params->padding_top, 0);
TFLITE_DCHECK_EQ(function_params->padding_left, 0);
TFLITE_DCHECK_EQ(function_params->padding_right, 0);
const int workspace_height_stride =
function_params->workspace_height_stride;
const int width_overall_micro_repeats =
function_params->input_width_overall_micro_repeats;
const int input_width_micro_repeats =
function_params->input_width_micro_repeats;
const int depth_micro_repeats = function_params->depth_micro_repeats;
const int block_height = function_params->inbound_block_height;
const int residual_width = function_params->residual_width;
const int input_height_stride = function_params->input_height_stride;
const int input_depth = function_params->input_depth;
TFLITE_DCHECK_GE(depth_micro_repeats, 0);
constexpr uint8 kSignBit =
QuantizationTypeImpl<quantization_type>::kUint8SignBit;
// One micro block is written as two 16-byte stores.
const int micro_block_size = 4 * 8;
// Scratch distance between micro blocks that are adjacent in depth.
const int depth_advance = width_overall_micro_repeats * micro_block_size;
// Applied after depth_micro_repeats depth steps: undoes them and moves one
// micro block forward, i.e. to the next width position.
const int width_advance =
micro_block_size *
(1 - depth_micro_repeats * width_overall_micro_repeats);
// Applied after all width positions of a row, so that each row advances the
// scratch pointer by workspace_height_stride in total.
const int height_advance = workspace_height_stride -
width_overall_micro_repeats * micro_block_size;
// Applied after the depth pass (8 * depth_micro_repeats elements consumed):
// the net input movement is 4 * input_depth, i.e. four pixels.
const int input_depth_skip = 4 * input_depth - 8 * depth_micro_repeats;
int8x16_t work_reg_a;
int8x16_t work_reg_b;
// XOR mask with kSignBit in every byte; applied only for the uint8
// quantization type.
const int8x16_t sign_bit = vreinterpretq_s8_u8(vdupq_n_u8(kSignBit));
int8* scratch_data_0 = scratch_block_data;
for (int k_height = 0; k_height < block_height; ++k_height) {
const typename QuantizationTypeImpl<quantization_type>::ExternalType*
input_data_0 = input_block_data;
// One register per pixel of the current group of four pixels.
int8x16_t input_data_a;
int8x16_t input_data_b;
int8x16_t input_data_c;
int8x16_t input_data_d;
// Full groups of four pixels.
for (int j_width = 0; j_width < input_width_micro_repeats; ++j_width) {
int8x16_t work_reg_a_sp;
int8x16_t work_reg_b_sp;
int i_depth = 0;
// Fast path: 16 channels (two micro blocks) per 128-bit load. The loop
// below loads the data for the next pair while storing the current one;
// the last pair is finished after the loop.
if (depth_micro_repeats >= 2) {
i_depth += 2;
input_data_a = util_vld1q_x8(input_data_0);
input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
input_data_0 += 16;
for (; i_depth < depth_micro_repeats - 1; i_depth += 2) {
// Low halves -> first micro block of the pair.
// NOTE(review): vzipq_s8x2_in_place is defined outside this chunk;
// presumably it completes the 4-way interleave — confirm.
work_reg_a = vzip1q_s8(input_data_a, input_data_b);
work_reg_b = vzip1q_s8(input_data_c, input_data_d);
vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
work_reg_a = veorq_s8(work_reg_a, sign_bit);
work_reg_b = veorq_s8(work_reg_b, sign_bit);
}
// High halves -> second micro block of the pair.
work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
// a/b are no longer needed, so the next pair's a/b can be loaded now.
input_data_a = util_vld1q_x8(input_data_0);
input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
optimized_ops_prefetch_write_l1_keep(scratch_data_0);
optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
vst1q_s8(scratch_data_0, work_reg_a);
vst1q_s8(scratch_data_0 + 16, work_reg_b);
scratch_data_0 += depth_advance;
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
}
input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
optimized_ops_prefetch_write_l1_keep(scratch_data_0);
optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
vst1q_s8(scratch_data_0, work_reg_a_sp);
vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
scratch_data_0 += depth_advance;
input_data_0 += 16;
}
// Finish the pair whose data is already loaded.
work_reg_a = vzip1q_s8(input_data_a, input_data_b);
work_reg_b = vzip1q_s8(input_data_c, input_data_d);
vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
work_reg_a = veorq_s8(work_reg_a, sign_bit);
work_reg_b = veorq_s8(work_reg_b, sign_bit);
}
optimized_ops_prefetch_write_l1_keep(scratch_data_0);
optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
vst1q_s8(scratch_data_0, work_reg_a);
vst1q_s8(scratch_data_0 + 16, work_reg_b);
scratch_data_0 += depth_advance;
work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
}
optimized_ops_prefetch_write_l1_keep(scratch_data_0);
optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
vst1q_s8(scratch_data_0, work_reg_a_sp);
vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
scratch_data_0 += depth_advance;
}
// Remaining micro blocks (at most one after the fast path), eight
// channels at a time.
// NOTE(review): vld1q_lane_s8x8 is a macro defined outside this chunk;
// presumably it loads 8 bytes into 64-bit lane 0 — confirm.
for (; i_depth < depth_micro_repeats; ++i_depth) {
input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0);
input_data_b =
vld1q_lane_s8x8(input_data_0 + 1 * input_depth, input_data_b, 0);
input_data_c =
vld1q_lane_s8x8(input_data_0 + 2 * input_depth, input_data_c, 0);
input_data_d =
vld1q_lane_s8x8(input_data_0 + 3 * input_depth, input_data_d, 0);
work_reg_a = vzip1q_s8(input_data_a, input_data_b);
work_reg_b = vzip1q_s8(input_data_c, input_data_d);
input_data_0 += 8;
vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
work_reg_a = veorq_s8(work_reg_a, sign_bit);
work_reg_b = veorq_s8(work_reg_b, sign_bit);
}
optimized_ops_prefetch_write_l1_keep(scratch_data_0);
optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
vst1q_s8(scratch_data_0, work_reg_a);
vst1q_s8(scratch_data_0 + 16, work_reg_b);
scratch_data_0 += depth_advance;
}
scratch_data_0 += width_advance;
input_data_0 += input_depth_skip;
}
// Trailing partial group of 1 to 3 pixels, if any.
if (width_overall_micro_repeats > input_width_micro_repeats) {
TFLITE_DCHECK_EQ(width_overall_micro_repeats,
input_width_micro_repeats + 1);
TFLITE_DCHECK_GT(residual_width, 0);
TFLITE_DCHECK_LT(residual_width, 4);
for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
// Pixels c and d start as kSignBit fill; for the uint8 type the XOR
// below turns that fill into zero bytes.
input_data_c = vreinterpretq_s8_u8(vdupq_n_u8(kSignBit));
input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0);
input_data_d = vreinterpretq_s8_u8(vdupq_n_u8(kSignBit));
// NOTE(review): when residual_width == 1, input_data_b is not reloaded
// here, so the lanes derived from it carry whatever it held before —
// presumably ignored downstream; confirm.
if (residual_width > 1) {
input_data_b =
vld1q_lane_s8x8(input_data_0 + input_depth, input_data_b, 0);
if (residual_width == 3) {
input_data_c = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
input_data_c, 0);
}
}
work_reg_a = vzip1q_s8(input_data_a, input_data_b);
work_reg_b = vzip1q_s8(input_data_c, input_data_d);
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
work_reg_a = veorq_s8(work_reg_a, sign_bit);
work_reg_b = veorq_s8(work_reg_b, sign_bit);
}
vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
optimized_ops_prefetch_write_l1_keep(scratch_data_0);
optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
vst1q_s8(scratch_data_0, work_reg_a);
vst1q_s8(scratch_data_0 + 16, work_reg_b);
scratch_data_0 += depth_advance;
input_data_0 += 8;
}
scratch_data_0 += width_advance;
input_data_0 += input_depth_skip;
}
scratch_data_0 += height_advance;
input_block_data += input_height_stride;
}
// Every row must have advanced the scratch pointer by exactly one
// workspace_height_stride.
TFLITE_DCHECK_EQ(
scratch_data_0,
scratch_block_data + block_height * workspace_height_stride);
}
// Out-of-line entry point: preloads the input block, then packs it.
// height_block_number and width_block_number are not used by this
// specialisation.
static void __attribute__((noinline))
Run(int32 height_block_number, int32 width_block_number,
const typename QuantizationTypeImpl<quantization_type>::ExternalType*
input_block_data,
int8* scratch_block_data,
const DepthwiseConvDotProdParams* function_params) {
PreloadInputBlock(input_block_data, function_params);
PackMacroBlockNeon(input_block_data, scratch_block_data, function_params);
}
};
template <QuantizationType quantization_type>
struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
quantization_type,
DepthwiseConvDepthMultiplication::kNoMultiplication,
1> {
static inline void PackMacroBlockNeon(
int32 height_block_number, int32 width_block_number,
const typename QuantizationTypeImpl<quantization_type>::ExternalType*
input_block_data,
int8* scratch_block_data,
const DepthwiseConvDotProdParams* function_params) {
constexpr uint8 kSignBit =
QuantizationTypeImpl<quantization_type>::kUint8SignBit;
const int workspace_height_stride =
function_params->workspace_height_stride;
const int width_overall_micro_repeats =
function_params->input_width_overall_micro_repeats;
const int input_width_micro_repeats =
function_params->input_width_micro_repeats;
const int depth_micro_repeats = function_params->depth_micro_repeats;
const int block_height = function_params->inbound_block_height;
const int residual_width = function_params->residual_width;
const int input_height_stride = function_params->input_height_stride;
const int input_depth = function_params->input_depth;
const int padding_left = function_params->padding_left;
const int padding_right = function_params->padding_right;
const int padding_top = function_params->padding_top;
const int padding_bottom = function_params->padding_bottom;
TFLITE_DCHECK_GT(depth_micro_repeats, 0);
constexpr int kSymmetricZeroPoint =
QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
const int micro_block_size = 4 * 8;
const int depth_advance = width_overall_micro_repeats * micro_block_size;
const int width_advance =
micro_block_size *
(1 - depth_micro_repeats * width_overall_micro_repeats);
const int height_advance = workspace_height_stride -
width_overall_micro_repeats * micro_block_size;
const int input_depth_skip = 4 * input_depth - 8 * depth_micro_repeats;
const bool leading_width_padding =
padding_left > 0 && width_block_number == 0;
const bool trailing_width_padding =
padding_right > 0 &&
width_block_number == (function_params->width_macro_count - 1);
const bool leading_height_padding =
padding_top > 0 && height_block_number < 0;
const bool trailing_height_padding =
padding_bottom > 0 &&
height_block_number == (function_params->height_macro_count - 1);
const int32 input_offset = function_params->input_offset;
const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
int8x16_t work_reg_a;
int8x16_t work_reg_b;
const int8x16_t sign_bit = vreinterpretq_s8_u8(vdupq_n_u8(kSignBit));
int8* scratch_data_0 = scratch_block_data;
int copy_block_height = block_height;
if (leading_height_padding) {
copy_block_height -= 1;
memset(scratch_data_0, -input_offset_difference, workspace_height_stride);
scratch_data_0 += workspace_height_stride;
input_block_data += input_height_stride;
}
if (trailing_height_padding) {
copy_block_height -= 1;
}
for (int k_height = 0; k_height < copy_block_height; ++k_height) {
const typename QuantizationTypeImpl<quantization_type>::ExternalType*
input_data_0 = input_block_data;
int8x16_t input_data_a;
int8x16_t input_data_b;
int8x16_t input_data_c;
int8x16_t input_data_d;
for (int j_width = 0; j_width < width_overall_micro_repeats; ++j_width) {
int adjusted_residual_width =
j_width == (input_width_micro_repeats) ? residual_width : 4;
if (trailing_width_padding &&
j_width == (width_overall_micro_repeats - 1)) {
adjusted_residual_width -= 1;
}
int start_width = 0;
if (leading_width_padding && j_width == 0) {
start_width = 1;
}
if (start_width == 0) {
if (adjusted_residual_width == 4) {
int8x16_t work_reg_a_sp;
int8x16_t work_reg_b_sp;
int i_depth = 0;
if (depth_micro_repeats >= 2) {
i_depth += 2;
input_data_a = util_vld1q_x8(input_data_0);
input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
input_data_0 += 16;
for (; i_depth < depth_micro_repeats - 1; i_depth += 2) {
work_reg_a = vzip1q_s8(input_data_a, input_data_b);
work_reg_b = vzip1q_s8(input_data_c, input_data_d);
vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
if (quantization_type ==
QuantizationType::kNonPerChannelUint8) {
work_reg_a = veorq_s8(work_reg_a, sign_bit);
work_reg_b = veorq_s8(work_reg_b, sign_bit);
}
work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
input_data_a = util_vld1q_x8(input_data_0);
input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
optimized_ops_prefetch_write_l1_keep(scratch_data_0);
optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
vst1q_s8(scratch_data_0, work_reg_a);
vst1q_s8(scratch_data_0 + 16, work_reg_b);
scratch_data_0 += depth_advance;
if (quantization_type ==
QuantizationType::kNonPerChannelUint8) {
work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
}
input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
optimized_ops_prefetch_write_l1_keep(scratch_data_0);
optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
vst1q_s8(scratch_data_0, work_reg_a_sp);
vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
scratch_data_0 += depth_advance;
input_data_0 += 16;
}
work_reg_a = vzip1q_s8(input_data_a, input_data_b);
work_reg_b = vzip1q_s8(input_data_c, input_data_d);
vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
work_reg_a = veorq_s8(work_reg_a, sign_bit);
work_reg_b = veorq_s8(work_reg_b, sign_bit);
}
optimized_ops_prefetch_write_l1_keep(scratch_data_0);
optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
vst1q_s8(scratch_data_0, work_reg_a);
vst1q_s8(scratch_data_0 + 16, work_reg_b);
scratch_data_0 += depth_advance;
work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
}
optimized_ops_prefetch_write_l1_keep(scratch_data_0);
optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
vst1q_s8(scratch_data_0, work_reg_a_sp);
vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
scratch_data_0 += depth_advance;
}
for (; i_depth < depth_micro_repeats; ++i_depth) {
input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0);
input_data_b = vld1q_lane_s8x8(input_data_0 + 1 * input_depth,
input_data_b, 0);
input_data_c = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
input_data_c, 0);
input_data_d = vld1q_lane_s8x8(input_data_0 + 3 * input_depth,
input_data_d, 0);
work_reg_a = vzip1q_s8(input_data_a, input_data_b);
work_reg_b = vzip1q_s8(input_data_c, input_data_d);
input_data_0 += 8;
vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
work_reg_a = veorq_s8(work_reg_a, sign_bit);
work_reg_b = veorq_s8(work_reg_b, sign_bit);
}
optimized_ops_prefetch_write_l1_keep(scratch_data_0);
optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
vst1q_s8(scratch_data_0, work_reg_a);
vst1q_s8(scratch_data_0 + 16, work_reg_b);
scratch_data_0 += depth_advance;
}
scratch_data_0 += width_advance;
input_data_0 += input_depth_skip;
} else {
TFLITE_DCHECK_LT(adjusted_residual_width, 4);
for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
input_data_a = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
input_data_b = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
input_data_c = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
input_data_d = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
if (adjusted_residual_width > 0) {
input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0);
if (adjusted_residual_width > 1) {
input_data_b = vld1q_lane_s8x8(input_data_0 + input_depth,
input_data_b, 0);
if (adjusted_residual_width == 3) {
input_data_c = vld1q_lane_s8x8(
input_data_0 + 2 * input_depth, input_data_c, 0);
}
}
}
work_reg_a = vzip1q_s8(input_data_a, input_data_b);
work_reg_b = vzip1q_s8(input_data_c, input_data_d);
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
work_reg_a = veorq_s8(work_reg_a, sign_bit);
work_reg_b = veorq_s8(work_reg_b, sign_bit);
}
vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
optimized_ops_prefetch_write_l1_keep(scratch_data_0);
optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
vst1q_s8(scratch_data_0, work_reg_a);
vst1q_s8(scratch_data_0 + 16, work_reg_b);
scratch_data_0 += depth_advance;
input_data_0 += 8;
}
scratch_data_0 += width_advance;
input_data_0 += input_depth_skip;
}
} else {
if (adjusted_residual_width == 4) {
int8x16_t work_reg_a_sp;
int8x16_t work_reg_b_sp;
int i_depth = 0;
if (depth_micro_repeats >= 2) {
i_depth += 2;
input_data_a = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
input_data_0 += 16;
for (; i_depth < depth_micro_repeats - 1; i_depth += 2) {
work_reg_a = vzip1q_s8(input_data_a, input_data_b);
work_reg_b = vzip1q_s8(input_data_c, input_data_d);
vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
if (quantization_type ==
QuantizationType::kNonPerChannelUint8) {
work_reg_a = veorq_s8(work_reg_a, sign_bit);
work_reg_b = veorq_s8(work_reg_b, sign_bit);
}
work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
input_data_a = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
optimized_ops_prefetch_write_l1_keep(scratch_data_0);
optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
vst1q_s8(scratch_data_0, work_reg_a);
vst1q_s8(scratch_data_0 + 16, work_reg_b);
scratch_data_0 += depth_advance;
if (quantization_type ==
QuantizationType::kNonPerChannelUint8) {
work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
}
input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
optimized_ops_prefetch_write_l1_keep(scratch_data_0);
optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
vst1q_s8(scratch_data_0, work_reg_a_sp);
vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
scratch_data_0 += depth_advance;
input_data_0 += 16;
}
work_reg_a = vzip1q_s8(input_data_a, input_data_b);
work_reg_b = vzip1q_s8(input_data_c, input_data_d);
vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
work_reg_a = veorq_s8(work_reg_a, sign_bit);
work_reg_b = veorq_s8(work_reg_b, sign_bit);
}
optimized_ops_prefetch_write_l1_keep(scratch_data_0);
optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
vst1q_s8(scratch_data_0, work_reg_a);
vst1q_s8(scratch_data_0 + 16, work_reg_b);
scratch_data_0 += depth_advance;
work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
}
optimized_ops_prefetch_write_l1_keep(scratch_data_0);
optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
vst1q_s8(scratch_data_0, work_reg_a_sp);
vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
scratch_data_0 += depth_advance;
}
for (; i_depth < depth_micro_repeats; ++i_depth) {
input_data_a = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
input_data_b = vld1q_lane_s8x8(input_data_0 + 1 * input_depth,
input_data_b, 0);
input_data_c = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
input_data_c, 0);
input_data_d = vld1q_lane_s8x8(input_data_0 + 3 * input_depth,
input_data_d, 0);
work_reg_a = vzip1q_s8(input_data_a, input_data_b);
work_reg_b = vzip1q_s8(input_data_c, input_data_d);
input_data_0 += 8;
vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
work_reg_a = veorq_s8(work_reg_a, sign_bit);
work_reg_b = veorq_s8(work_reg_b, sign_bit);
}
optimized_ops_prefetch_write_l1_keep(scratch_data_0);
optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
vst1q_s8(scratch_data_0, work_reg_a);
vst1q_s8(scratch_data_0 + 16, work_reg_b);
scratch_data_0 += depth_advance;
}
scratch_data_0 += width_advance;
input_data_0 += input_depth_skip;
} else {
TFLITE_DCHECK_LT(adjusted_residual_width, 4);
for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
input_data_a = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
input_data_b = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
input_data_c = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
input_data_d = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
if (adjusted_residual_width > 1) {
input_data_b = vld1q_lane_s8x8(input_data_0 + input_depth,
input_data_b, 0);
if (adjusted_residual_width == 3) {
input_data_c = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
input_data_c, 0);
}
}
work_reg_a = vzip1q_s8(input_data_a, input_data_b);
work_reg_b = vzip1q_s8(input_data_c, input_data_d);
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
work_reg_a = veorq_s8(work_reg_a, sign_bit);
work_reg_b = veorq_s8(work_reg_b, sign_bit);
}
vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
optimized_ops_prefetch_write_l1_keep(scratch_data_0);
optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
vst1q_s8(scratch_data_0, work_reg_a);
vst1q_s8(scratch_data_0 + 16, work_reg_b);
scratch_data_0 += depth_advance;
input_data_0 += 8;
}
scratch_data_0 += width_advance;
input_data_0 += input_depth_skip;
}
}
}
scratch_data_0 += height_advance;
input_block_data += input_height_stride;
}
if (trailing_height_padding) {
memset(scratch_data_0, -input_offset_difference, workspace_height_stride);
scratch_data_0 += workspace_height_stride;
}
TFLITE_DCHECK_EQ(
scratch_data_0,
scratch_block_data + block_height * workspace_height_stride);
}
// Non-inlined entry point for packing one macro block.
// Calls PreloadInputBlock on the input block and then forwards all arguments
// unchanged to PackMacroBlockNeon.
// NOTE(review): PreloadInputBlock is defined elsewhere; presumably it
// prefetches the input rows before they are read - confirm.
static void __attribute__((noinline))
Run(int32 height_block_number, int32 width_block_number,
const typename QuantizationTypeImpl<quantization_type>::ExternalType*
input_block_data,
int8* scratch_block_data,
const DepthwiseConvDotProdParams* function_params) {
PreloadInputBlock(input_block_data, function_params);
PackMacroBlockNeon(height_block_number, width_block_number,
input_block_data, scratch_block_data, function_params);
}
};
// Specialization of PackMacroBlock for the NEON 3x3 dot-product implementation
// with DepthwiseConvDepthMultiplication::kUnitInputDepth and a trailing
// template argument of 1. This variant reads the padding_* fields of
// function_params and writes padding bytes into the scratch workspace where
// the macro block touches an edge.
// NOTE(review): the trailing argument presumably denotes the maximum padding
// supported by the variant - confirm against the primary template.
template <QuantizationType quantization_type>
struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
quantization_type,
DepthwiseConvDepthMultiplication::kUnitInputDepth,
1> {
// Fills block_height rows of scratch_block_data (one row every
// workspace_height_stride bytes) from input_block_data (one row every
// input_height_stride elements).
//   height_block_number, width_block_number: position of this macro block;
//     used here only to decide whether leading/trailing padding applies.
//   input_block_data: first input row of the block.
//   scratch_block_data: destination workspace.
//   function_params: strides, repeat counts, padding and input_offset.
// For QuantizationType::kNonPerChannelUint8 every byte written by the copy
// paths is XORed with kUint8SignBit before being stored.
static inline void PackMacroBlockNeon(
int32 height_block_number, int32 width_block_number,
const typename QuantizationTypeImpl<quantization_type>::ExternalType*
input_block_data,
int8* scratch_block_data,
const DepthwiseConvDotProdParams* function_params) {
const int workspace_height_stride =
function_params->workspace_height_stride;
const int width_overall_micro_repeats =
function_params->input_width_overall_micro_repeats;
const int input_width_micro_repeats =
function_params->input_width_micro_repeats;
const int block_height = function_params->inbound_block_height;
const int residual_width = function_params->residual_width;
const int input_height_stride = function_params->input_height_stride;
const int padding_left = function_params->padding_left;
const int padding_right = function_params->padding_right;
const int padding_top = function_params->padding_top;
const int padding_bottom = function_params->padding_bottom;
constexpr int kSymmetricZeroPoint =
QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
// Padding is only materialized for macro blocks that sit on the matching
// edge. Note that the leading-height test is `height_block_number < 0`,
// whereas the leading-width test is `width_block_number == 0`.
const bool leading_width_padding =
padding_left > 0 && width_block_number == 0;
const bool trailing_width_padding =
padding_right > 0 &&
width_block_number == (function_params->width_macro_count - 1);
const bool leading_height_padding =
padding_top > 0 && height_block_number < 0;
const bool trailing_height_padding =
padding_bottom > 0 &&
height_block_number == (function_params->height_macro_count - 1);
const int32 input_offset = function_params->input_offset;
// Whole padding rows are filled (via memset) with -input_offset_difference.
const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
int8* scratch_data_base = scratch_block_data;
// copy_block_height is the number of rows actually read from the input; it
// is block_height minus one for each padded height edge.
int copy_block_height = block_height;
if (leading_height_padding) {
copy_block_height -= 1;
// Fill the first scratch row (plus kWorkspaceExtension bytes) with the
// padding value, then step both the scratch and the input pointer one row.
memset(scratch_data_base, -input_offset_difference,
workspace_height_stride + kWorkspaceExtension);
scratch_data_base += workspace_height_stride;
input_block_data += input_height_stride;
}
if (trailing_height_padding) {
copy_block_height -= 1;
}
// Width of the last micro block: residual_width when the overall repeat
// count exceeds input_width_micro_repeats, otherwise a full 4; one less when
// the trailing column is padding.
int adjusted_residual_width =
input_width_micro_repeats < width_overall_micro_repeats ? residual_width
: 4;
if (trailing_width_padding) {
adjusted_residual_width -= 1;
}
// With leading width padding, scratch column 0 holds padding, so copied data
// starts at scratch column 1 and the input pointer is advanced by one.
int start_width = 0;
if (leading_width_padding) {
start_width = 1;
input_block_data += 1;
}
// Number of input bytes copied per row.
const int copy_size = (width_overall_micro_repeats - 1) * 4 +
adjusted_residual_width - start_width;
// One larger than copy_size when there is trailing width padding, so that
// `copy_done < copy_size_adjusted` holds if there is either a partial
// remainder to copy or a trailing padding column to write.
const int copy_size_adjusted =
trailing_width_padding ? copy_size + 1 : copy_size;
TFLITE_DCHECK_LE(
copy_size,
input_height_stride - width_block_number * input_width_micro_repeats);
TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
// Running per-row offsets into the scratch and input buffers.
int scratch_data_offset = 0;
int input_block_offset = 0;
constexpr uint8 kSignBit =
QuantizationTypeImpl<quantization_type>::kUint8SignBit;
int8x16_t work_reg;
int8x8_t half_work_reg;
int8x8_t padding_mask;
const int8x16_t sign_bit = vreinterpretq_s8_u8(vdupq_n_u8(kSignBit));
// Every lane holds the low 8 bits of -input_offset (the width-padding byte
// before the optional sign-bit XOR).
const uint8x16_t padding_reg = vdupq_n_u8(-input_offset);
// padding_mask starts as all ones and is shifted per branch below so that
// only lanes at or above copy_remaining stay set.
padding_mask = vdup_n_s8(-1);
half_work_reg = vdup_n_s8(0);
if (copy_size >= 16) {
// Rows of at least 16 bytes: 16-byte main loop, optional 8-byte step, then
// a masked partial tail.
const int copy_remaining = (copy_size + start_width) & 0x7;
// Left shift by whole bytes clears the low copy_remaining lanes of the
// mask; the remaining (set) lanes select padding in the vbsl_s8 below.
padding_mask = vreinterpret_s8_s64(vshl_s64(
vreinterpret_s64_s8(padding_mask), vdup_n_s64(8 * copy_remaining)));
for (int k_height = 0; k_height < copy_block_height; ++k_height) {
int8* scratch_data = scratch_data_base + scratch_data_offset;
int copy_done = 0;
if (leading_width_padding) {
// First 16-byte store is one padding byte followed by the first 15 input
// bytes (vextq_s8 takes the last lane of padding_reg, then 15 lanes of
// work_reg).
work_reg = util_vld1q_x8(input_block_data + input_block_offset);
work_reg = vextq_s8(vreinterpretq_s8_u8(padding_reg), work_reg, 15);
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
work_reg = veorq_s8(work_reg, sign_bit);
}
optimized_ops_prefetch_write_l1_keep(scratch_data);
vst1q_s8(scratch_data, work_reg);
copy_done += 15;
}
// Main loop: 16 bytes per iteration.
for (; (copy_done + 16) <= copy_size; copy_done += 16) {
work_reg =
util_vld1q_x8(input_block_data + input_block_offset + copy_done);
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
work_reg = veorq_s8(work_reg, sign_bit);
}
TFLITE_DCHECK_EQ((start_width + copy_done) % 16, 0);
optimized_ops_prefetch_write_l1_keep(scratch_data + start_width +
copy_done);
vst1q_s8(scratch_data + start_width + copy_done, work_reg);
}
// At most one further full 8-byte step.
if (copy_done + 8 <= copy_size) {
half_work_reg =
util_vld1_x8(input_block_data + input_block_offset + copy_done);
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
}
TFLITE_DCHECK_EQ((start_width + copy_done) % 8, 0);
optimized_ops_prefetch_write_l1_keep(scratch_data + start_width +
copy_done);
vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
copy_done += 8;
}
TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
if (copy_done < copy_size_adjusted) {
// Partial tail and/or trailing padding. Load the 8 bytes that end at
// copy_size (overlapping bytes already copied), shift right so the last
// copy_remaining bytes land in the low lanes, then replace the other
// lanes with the padding byte.
half_work_reg = util_vld1_x8(input_block_data + input_block_offset +
copy_size - 8);
half_work_reg = vreinterpret_s8_s64(
vshl_s64(vreinterpret_s64_s8(half_work_reg),
vdup_n_s64(-8 * (8 - copy_remaining))));
half_work_reg = vbsl_s8(vreinterpret_u8_s8(padding_mask),
vget_low_s8(vreinterpretq_s8_u8(padding_reg)),
half_work_reg);
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
}
TFLITE_DCHECK_EQ((start_width + copy_done) % 8, 0);
optimized_ops_prefetch_write_l1_keep(scratch_data + start_width +
copy_done);
vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
// NOTE(review): copy_done is not advanced here, unlike the tail of the
// `copy_size >= 4` branch below, so the first store after this block
// rewrites the same 8 bytes. Confirm this is intended.
}
// Two further 8-byte stores of whatever half_work_reg currently holds,
// at and just past copy_done.
// NOTE(review): presumably a guard region so that reads past the copied
// data see defined bytes - confirm.
optimized_ops_prefetch_write_l1_keep(scratch_data + start_width +
copy_done);
optimized_ops_prefetch_write_l1_keep(scratch_data + start_width +
copy_done + 8);
vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
vst1_s8(scratch_data + start_width + copy_done + 8, half_work_reg);
scratch_data_offset += workspace_height_stride;
input_block_offset += input_height_stride;
}
} else if (copy_size >= 4) {
// Rows of 4..15 bytes: same scheme as above using 4-byte lane loads/stores.
const int copy_remaining = (copy_size + start_width) & 0x3;
padding_mask = vreinterpret_s8_s64(vshl_s64(
vreinterpret_s64_s8(padding_mask), vdup_n_s64(8 * copy_remaining)));
for (int k_height = 0; k_height < copy_block_height; ++k_height) {
int8* scratch_data = scratch_data_base + scratch_data_offset;
int copy_done = 0;
if (leading_width_padding) {
// First 4-byte store is one padding byte followed by the first 3 input
// bytes.
half_work_reg = vld1_lane_8x4(input_block_data + input_block_offset,
half_work_reg, 0);
half_work_reg = vext_s8(vget_low_s8(vreinterpretq_s8_u8(padding_reg)),
half_work_reg, 7);
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
}
optimized_ops_prefetch_write_l1_keep(scratch_data);
vst1_lane_8x4(scratch_data, half_work_reg, 0);
copy_done += 3;
}
// Main loop: 4 bytes per iteration.
for (; (copy_done + 4) <= copy_size; copy_done += 4) {
half_work_reg =
vld1_lane_8x4(input_block_data + input_block_offset + copy_done,
half_work_reg, 0);
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
}
TFLITE_DCHECK_EQ((start_width + copy_done) % 4, 0);
optimized_ops_prefetch_write_l1_keep(scratch_data + start_width +
copy_done);
vst1_lane_8x4(scratch_data + start_width + copy_done, half_work_reg,
0);
}
TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
if (copy_done < copy_size_adjusted) {
TFLITE_DCHECK_LT(copy_remaining, 4);
// Overlapping load of the 4 bytes ending at copy_size, shifted right so
// the last copy_remaining bytes sit in the low lanes; lanes selected by
// padding_mask are then overwritten with the padding byte.
half_work_reg = vld1_lane_8x4(
input_block_data + input_block_offset + copy_size - 4,
half_work_reg, 0);
half_work_reg = vreinterpret_s8_s64(
vshl_s64(vreinterpret_s64_s8(half_work_reg),
vdup_n_s64(-8 * (4 - copy_remaining))));
half_work_reg = vbsl_s8(vreinterpret_u8_s8(padding_mask),
vget_low_s8(vreinterpretq_s8_u8(padding_reg)),
half_work_reg);
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
}
TFLITE_DCHECK_EQ((start_width + copy_done) % 4, 0);
optimized_ops_prefetch_write_l1_keep(scratch_data + start_width +
copy_done);
vst1_lane_8x4(scratch_data + start_width + copy_done, half_work_reg,
0);
copy_done += 4;
}
// Four further 4-byte stores of the current half_work_reg after the copied
// data.
// NOTE(review): presumably a guard region - confirm.
optimized_ops_prefetch_write_l1_keep(scratch_data + start_width +
copy_done);
optimized_ops_prefetch_write_l1_keep(scratch_data + start_width +
copy_done + 12);
vst1_lane_8x4(scratch_data + start_width + copy_done, half_work_reg, 0);
vst1_lane_8x4(scratch_data + start_width + copy_done + 4, half_work_reg,
0);
vst1_lane_8x4(scratch_data + start_width + copy_done + 8, half_work_reg,
0);
vst1_lane_8x4(scratch_data + start_width + copy_done + 12,
half_work_reg, 0);
scratch_data_offset += workspace_height_stride;
input_block_offset += input_height_stride;
}
} else if (width_overall_micro_repeats == 2) {
// Fewer than 4 bytes to copy but two micro blocks: the checks below pin
// this to exactly 3 copied bytes with padding on both sides.
TFLITE_DCHECK_EQ(copy_size, 3);
TFLITE_DCHECK_EQ(start_width, 1);
TFLITE_DCHECK(leading_width_padding);
TFLITE_DCHECK(trailing_width_padding);
for (int k_height = 0; k_height < copy_block_height; ++k_height) {
// Start from a register full of padding bytes and drop the 3 input bytes
// into lanes 1..3; lane 0 and lanes 4..7 keep the padding value.
half_work_reg = vreinterpret_s8_u8(vdup_n_u8(-input_offset));
half_work_reg = vld1_lane_s8(reinterpret_cast<const int8*>(
input_block_data + input_block_offset),
half_work_reg, 1);
half_work_reg =
vld1_lane_s8(reinterpret_cast<const int8*>(input_block_data +
input_block_offset + 1),
half_work_reg, 2);
half_work_reg =
vld1_lane_s8(reinterpret_cast<const int8*>(input_block_data +
input_block_offset + 2),
half_work_reg, 3);
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
}
TFLITE_DCHECK_EQ(scratch_data_offset % 8, 0);
optimized_ops_prefetch_write_l1_keep(scratch_data_base +
scratch_data_offset);
vst1_s8(scratch_data_base + scratch_data_offset, half_work_reg);
// Additional 4-byte stores of lane group 0 at offsets 4..16 of the row.
// NOTE(review): presumably a guard region - confirm.
optimized_ops_prefetch_write_l1_keep(scratch_data_base +
scratch_data_offset + 4);
optimized_ops_prefetch_write_l1_keep(scratch_data_base +
scratch_data_offset + 16);
vst1_lane_8x4(scratch_data_base + scratch_data_offset + 4,
half_work_reg, 0);
vst1_lane_8x4(scratch_data_base + scratch_data_offset + 8,
half_work_reg, 0);
vst1_lane_8x4(scratch_data_base + scratch_data_offset + 12,
half_work_reg, 0);
vst1_lane_8x4(scratch_data_base + scratch_data_offset + 16,
half_work_reg, 0);
scratch_data_offset += workspace_height_stride;
input_block_offset += input_height_stride;
}
} else {
// Single micro block with fewer than 4 bytes to copy.
TFLITE_DCHECK_EQ(width_overall_micro_repeats, 1);
const int copy_remaining = (copy_size + start_width) & 0x3;
padding_mask = vreinterpret_s8_s64(vshl_s64(
vreinterpret_s64_s8(padding_mask), vdup_n_s64(8 * copy_remaining)));
if (leading_width_padding) {
// Lane 0 is the leading padding column, so force it to be selected from
// padding_reg as well.
padding_mask = vset_lane_s8(-1, padding_mask, 0);
}
for (int k_height = 0; k_height < copy_block_height; ++k_height) {
// Shift bytes in one at a time, last input byte first, so that the first
// input byte ends up in lane 0.
for (int i = 0; i < copy_size; ++i) {
half_work_reg = vreinterpret_s8_s64(
vshl_n_s64(vreinterpret_s64_s8(half_work_reg), 8));
half_work_reg = vld1_lane_s8(
reinterpret_cast<const int8*>(
input_block_data + input_block_offset + copy_size - 1 - i),
half_work_reg, 0);
}
if (leading_width_padding) {
// One more byte shift frees lane 0 for the leading padding byte.
half_work_reg = vreinterpret_s8_s64(
vshl_n_s64(vreinterpret_s64_s8(half_work_reg), 8));
}
half_work_reg = vbsl_s8(vreinterpret_u8_s8(padding_mask),
vget_low_s8(vreinterpretq_s8_u8(padding_reg)),
half_work_reg);
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
}
TFLITE_DCHECK_EQ(scratch_data_offset % 4, 0);
optimized_ops_prefetch_write_l1_keep(scratch_data_base +
scratch_data_offset);
vst1_lane_8x4(scratch_data_base + scratch_data_offset, half_work_reg,
0);
// Additional 4-byte stores of lane group 0 at offsets 4..16 of the row.
// NOTE(review): presumably a guard region - confirm.
optimized_ops_prefetch_write_l1_keep(scratch_data_base +
scratch_data_offset + 4);
optimized_ops_prefetch_write_l1_keep(scratch_data_base +
scratch_data_offset + 16);
vst1_lane_8x4(scratch_data_base + scratch_data_offset + 4,
half_work_reg, 0);
vst1_lane_8x4(scratch_data_base + scratch_data_offset + 8,
half_work_reg, 0);
vst1_lane_8x4(scratch_data_base + scratch_data_offset + 12,
half_work_reg, 0);
vst1_lane_8x4(scratch_data_base + scratch_data_offset + 16,
half_work_reg, 0);
scratch_data_offset += workspace_height_stride;
input_block_offset += input_height_stride;
}
}
// Step past the rows copied above.
scratch_data_base += copy_block_height * workspace_height_stride;
if (trailing_height_padding) {
// Fill the final scratch row (plus kWorkspaceExtension bytes) with the
// padding value.
memset(scratch_data_base, -input_offset_difference,
workspace_height_stride + kWorkspaceExtension);
scratch_data_base += workspace_height_stride;
}
// Exactly block_height rows of scratch must have been accounted for.
TFLITE_DCHECK_EQ(
scratch_data_base,
scratch_block_data + block_height * workspace_height_stride);
}
// Non-inlined entry point: calls PreloadInputBlock on the input block and
// then forwards all arguments unchanged to PackMacroBlockNeon.
static void __attribute__((noinline))
Run(int32 height_block_number, int32 width_block_number,
const typename QuantizationTypeImpl<quantization_type>::ExternalType*
input_block_data,
int8* scratch_block_data,
const DepthwiseConvDotProdParams* function_params) {
PreloadInputBlock(input_block_data, function_params);
PackMacroBlockNeon(height_block_number, width_block_number,
input_block_data, scratch_block_data, function_params);
}
};
// Specialization of PackMacroBlock for the NEON 3x3 dot-product implementation
// with DepthwiseConvDepthMultiplication::kUnitInputDepth and a trailing
// template argument of 0. This variant asserts (debug checks) that all four
// padding values in function_params are zero and performs a plain row copy
// into the scratch workspace.
template <QuantizationType quantization_type>
struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
quantization_type,
DepthwiseConvDepthMultiplication::kUnitInputDepth,
0> {
// Copies block_height rows from input_block_data (one row every
// input_height_stride elements) into scratch_block_data (one row every
// workspace_height_stride bytes).
//   height_block_number: not read in this function body.
//   width_block_number: used only in a debug check on copy_size.
//   input_block_data: first input row of the block.
//   scratch_block_data: destination workspace.
//   function_params: strides and repeat counts.
static inline void PackMacroBlockNeon(
int32 height_block_number, int32 width_block_number,
const typename QuantizationTypeImpl<quantization_type>::ExternalType*
input_block_data,
int8* scratch_block_data,
const DepthwiseConvDotProdParams* function_params) {
const int workspace_height_stride =
function_params->workspace_height_stride;
const int width_overall_micro_repeats =
function_params->input_width_overall_micro_repeats;
const int input_width_micro_repeats =
function_params->input_width_micro_repeats;
const int block_height = function_params->inbound_block_height;
const int residual_width = function_params->residual_width;
const int input_height_stride = function_params->input_height_stride;
// This variant does not handle padding.
TFLITE_DCHECK_EQ(function_params->padding_left, 0);
TFLITE_DCHECK_EQ(function_params->padding_right, 0);
TFLITE_DCHECK_EQ(function_params->padding_top, 0);
TFLITE_DCHECK_EQ(function_params->padding_bottom, 0);
TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
int8* scratch_data_base = scratch_block_data;
const int copy_block_height = block_height;
// Width of the last micro block: residual_width when the overall repeat
// count exceeds input_width_micro_repeats, otherwise a full 4.
int adjusted_residual_width =
input_width_micro_repeats < width_overall_micro_repeats ? residual_width
: 4;
// Number of input bytes copied per row.
const int copy_size =
(width_overall_micro_repeats - 1) * 4 + adjusted_residual_width;
TFLITE_DCHECK_LE(
copy_size,
input_height_stride - width_block_number * input_width_micro_repeats);
TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
// Running per-row offsets into the scratch and input buffers.
int scratch_data_offset = 0;
int input_block_offset = 0;
constexpr uint8 kSignBit =
QuantizationTypeImpl<quantization_type>::kUint8SignBit;
int8x16_t work_reg;
int8x8_t half_work_reg;
const int8x16_t sign_bit = vreinterpretq_s8_u8(vdupq_n_u8(kSignBit));
half_work_reg = vdup_n_s8(0);
if (copy_size >= 16) {
// Rows of at least 16 bytes: 16-byte main loop, optional 8-byte step, then
// a partial tail.
const int copy_remaining = copy_size & 0x7;
for (int k_height = 0; k_height < copy_block_height; ++k_height) {
int8* scratch_data = scratch_data_base + scratch_data_offset;
int copy_done = 0;
// Main loop: 16 bytes per iteration.
for (; (copy_done + 16) <= copy_size; copy_done += 16) {
work_reg =
util_vld1q_x8(input_block_data + input_block_offset + copy_done);
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
work_reg = veorq_s8(work_reg, sign_bit);
}
TFLITE_DCHECK_EQ(copy_done % 16, 0);
optimized_ops_prefetch_write_l1_keep(scratch_data + copy_done);
vst1q_s8(scratch_data + copy_done, work_reg);
}
// At most one further full 8-byte step.
if (copy_done + 8 <= copy_size) {
half_work_reg =
util_vld1_x8(input_block_data + input_block_offset + copy_done);
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
}
TFLITE_DCHECK_EQ(copy_done % 8, 0);
optimized_ops_prefetch_write_l1_keep(scratch_data + copy_done);
vst1_s8(scratch_data + copy_done, half_work_reg);
copy_done += 8;
}
TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
if (copy_done < copy_size) {
// Partial tail: load the 8 bytes that end at copy_size (overlapping
// bytes already copied) and shift right so the last copy_remaining
// bytes land in the low lanes.
half_work_reg = util_vld1_x8(input_block_data + input_block_offset +
copy_size - 8);
half_work_reg = vreinterpret_s8_s64(
vshl_s64(vreinterpret_s64_s8(half_work_reg),
vdup_n_s64(-8 * (8 - copy_remaining))));
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
}
TFLITE_DCHECK_EQ(copy_done % 8, 0);
optimized_ops_prefetch_write_l1_keep(scratch_data + copy_done);
vst1_s8(scratch_data + copy_done, half_work_reg);
copy_done += 8;
}
// Two further 8-byte stores of the current half_work_reg after the copied
// data.
// NOTE(review): presumably a guard region so that reads past the copied
// data see defined bytes - confirm.
optimized_ops_prefetch_write_l1_keep(scratch_data + copy_done);
optimized_ops_prefetch_write_l1_keep(scratch_data + copy_done + 8);
vst1_s8(scratch_data + copy_done, half_work_reg);
vst1_s8(scratch_data + copy_done + 8, half_work_reg);
scratch_data_offset += workspace_height_stride;
input_block_offset += input_height_stride;
}
} else if (copy_size >= 4) {
// Rows of 4..15 bytes: same scheme using 4-byte lane loads/stores.
const int copy_remaining = copy_size & 0x3;
for (int k_height = 0; k_height < copy_block_height; ++k_height) {
int8* scratch_data = scratch_data_base + scratch_data_offset;
int copy_done = 0;
// Main loop: 4 bytes per iteration.
for (; (copy_done + 4) <= copy_size; copy_done += 4) {
half_work_reg =
vld1_lane_8x4(input_block_data + input_block_offset + copy_done,
half_work_reg, 0);
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
}
TFLITE_DCHECK_EQ(copy_done % 4, 0);
optimized_ops_prefetch_write_l1_keep(scratch_data + copy_done);
vst1_lane_8x4(scratch_data + copy_done, half_work_reg, 0);
}
TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
if (copy_done < copy_size) {
TFLITE_DCHECK_LT(copy_remaining, 4);
// Partial tail: overlapping load of the 4 bytes ending at copy_size,
// shifted right so the last copy_remaining bytes sit in the low lanes.
half_work_reg = vld1_lane_8x4(
input_block_data + input_block_offset + copy_size - 4,
half_work_reg, 0);
half_work_reg = vreinterpret_s8_s64(
vshl_s64(vreinterpret_s64_s8(half_work_reg),
vdup_n_s64(-8 * (4 - copy_remaining))));
if (quantization_type == QuantizationType::kNonPerChannelUint8) {
half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
}
TFLITE_DCHECK_EQ(copy_done % 4, 0);
optimized_ops_prefetch_write_l1_keep(scratch_data + copy_done);
vst1_lane_8x4(scratch_data + copy_done, half_work_reg, 0);
copy_done += 4;
}
// Four further 4-byte stores of the current half_work_reg after the copied
// data.
// NOTE(review): presumably a guard region - confirm.
optimized_ops_prefetch_write_l1_keep(scratch_data + copy_done);
optimized_ops_prefetch_write_l1_keep(scratch_data + copy_done + 12);
vst1_lane_8x4(scratch_data + copy_done, half_work_reg, 0);
vst1_lane_8x4(scratch_data + copy_done + 4, half_work_reg, 0);
vst1_lane_8x4(scratch_data + copy_done + 8, half_work_reg, 0);
vst1_lane_8x4(scratch_data + copy_done + 12, half_work_reg, 0);
scratch_data_offset += workspace_height_stride;
input_block_offset += input_height_stride;
}
} else {
// Single micro block with fewer than 4 bytes to copy.
TFLITE_DCHECK_EQ(width_overall_micro_repeats, 1);
for (int k_height = 0; k_height < copy_block_height; ++k_height) {
// Shift bytes in one at a time, last input byte first, so that the first
// input byte ends up in lane 0.
for (int i = 0; i < copy_size; ++i) {
half_work_reg = vreinterpret_s8_s64(
vshl_n_s64(vreinterpret_s64_s8(half_work_reg), 8));
half_work_reg = vld1_lane_s8(
reinterpret_cast<const int8*>(
input_block_data + input_block_offset + copy_size - 1 - i),
half_work_reg, 0);
}
// The sign-bit XOR is unconditional here, whereas the other branches guard
// it on kNonPerChannelUint8.
// NOTE(review): equivalent only if kUint8SignBit is zero for the other
// quantization types - confirm.
half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
TFLITE_DCHECK_EQ(scratch_data_offset % 4, 0);
optimized_ops_prefetch_write_l1_keep(scratch_data_base +
scratch_data_offset);
vst1_lane_8x4(scratch_data_base + scratch_data_offset, half_work_reg,
0);
// Additional 4-byte stores of lane group 0 at offsets 4..16 of the row.
// NOTE(review): presumably a guard region - confirm.
optimized_ops_prefetch_write_l1_keep(scratch_data_base +
scratch_data_offset + 8);
vst1_lane_8x4(scratch_data_base + scratch_data_offset + 4,
half_work_reg, 0);
vst1_lane_8x4(scratch_data_base + scratch_data_offset + 8,
half_work_reg, 0);
vst1_lane_8x4(scratch_data_base + scratch_data_offset + 12,
half_work_reg, 0);
vst1_lane_8x4(scratch_data_base + scratch_data_offset + 16,
half_work_reg, 0);
scratch_data_offset += workspace_height_stride;
input_block_offset += input_height_stride;
}
}
// Exactly block_height rows of scratch must have been accounted for.
scratch_data_base += copy_block_height * workspace_height_stride;
TFLITE_DCHECK_EQ(
scratch_data_base,
scratch_block_data + block_height * workspace_height_stride);
}
// Non-inlined entry point: calls PreloadInputBlock on the input block and
// then forwards all arguments unchanged to PackMacroBlockNeon.
static void __attribute__((noinline))
Run(int32 height_block_number, int32 width_block_number,
const typename QuantizationTypeImpl<quantization_type>::ExternalType*
input_block_data,
int8* scratch_block_data,
const DepthwiseConvDotProdParams* function_params) {
PreloadInputBlock(input_block_data, function_params);
PackMacroBlockNeon(height_block_number, width_block_number,
input_block_data, scratch_block_data, function_params);
}
};
// Macro-block kernel specialisation for the NEON 3x3 dot-product
// implementation, non-per-channel uint8 quantization and no depth
// multiplication, with the final template argument equal to 1.
// NOTE(review): the final template argument is presumably the stride --
// confirm against the primary template declaration.
template <>
struct KernelMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
QuantizationType::kNonPerChannelUint8,
DepthwiseConvDepthMultiplication::kNoMultiplication,
1> {
// Hand-scheduled AArch64 assembly for one macro block.
//
// Inputs:
//   scratch_block_data  packed input data (read with 16-byte vector loads).
//   filter_workspace    filter data; 96 bytes (six q registers) are read per
//                       outer-loop iteration.
//   bias_data           int32 bias values, loaded as q registers.
//   output_block_data   destination for the uint8 results.
//   function_params     parameter block read through the DP_OFFSET_* offsets.
//
// What the instructions below establish:
//  * 320 bytes of stack are reserved on entry and released at label 35; the
//    area holds loop counters, strides and spilled registers.
//  * The outer loop runs DP_OFFSET_DEPTH_MICRO_REPEATS times and the kernel
//    returns immediately when that count is < 1.
//  * When the outbound block height equals 4, labels 6..15 are used; each
//    store group writes 4-byte lanes to four addresses that are one output
//    height stride apart.  Otherwise labels 16..28 handle the block one row
//    at a time, storing 8 bytes per step.
//  * Accumulators are requantized with sqrdmulh (output multiplier in v1),
//    sqrshl (output shift in v2), saturating narrowing, a saturating add of
//    the output offset (v0), and a umax/umin clamp against the quantized
//    activation min/max.
//  * SDOT instructions are emitted as raw .word encodings; the trailing
//    assembler comment on each gives the mnemonic.
//
// Inline-asm contract (changed from the previous revision of this function):
//  * Every register that belongs to a C++ operand is referenced through its
//    %[name] (or %w[name] for the 32-bit view).  The earlier text named
//    x1/x3/w1/w3 directly in several instructions, which is only correct if
//    the compiler happens to allocate filter_workspace to x1 and
//    output_block_data to x3.  With those allocations the emitted machine
//    code is unchanged.
//  * function_params is overwritten inside the asm (it is reused as an output
//    row pointer), so it is declared as a read-write operand rather than an
//    input-only one.
//  * The asm moves sp; all operands use "r" constraints, so none of them is
//    addressed relative to sp.
static inline void KernelMacroBlockNeon(
const int8* scratch_block_data, const int8* filter_workspace,
const int32* bias_data, uint8* output_block_data,
const DepthwiseConvDotProdParams* function_params) {
#define DC_KERNEL_NO_MULT_1 …
#define DC_KERNEL_NO_MULT_2 …
#define DC_KERNEL_NO_MULT_3 …
#define DC_KERNEL_NO_MULT_4 …
#define DC_KERNEL_NO_MULT_5 …
#define DC_KERNEL_NO_MULT_6 …
#define DC_KERNEL_NO_MULT_7 …
#define DC_KERNEL_NO_MULT_8 …
#define DC_KERNEL_NO_MULT_9 …
#define DC_KERNEL_NO_MULT_10 …
#define DC_KERNEL_NO_MULT_11 …
#define DC_KERNEL_NO_MULT_12 …
#define DC_KERNEL_NO_MULT_13 …
#define DC_KERNEL_NO_MULT_14 …
#define DC_KERNEL_NO_MULT_15 …
#define DC_KERNEL_NO_MULT_16 …
#define DC_KERNEL_NO_MULT_17 …
#define DC_KERNEL_NO_MULT_18 …
#define DC_KERNEL_NO_MULT_19 …
#define DC_KERNEL_NO_MULT_20 …
#define DC_KERNEL_NO_MULT_21 …
#define DC_KERNEL_NO_MULT_22 …
#define DC_KERNEL_NO_MULT_23 …
#define DC_KERNEL_NO_MULT_24 …
#define DC_KERNEL_NO_MULT_25 …
#define DC_KERNEL_NO_MULT_26 …
#define DC_KERNEL_NO_MULT_27 …
#define DC_KERNEL_NO_MULT_28 …
#define DC_KERNEL_NO_MULT_29 …
#define DC_KERNEL_NO_MULT_30 …
#define DC_KERNEL_NO_MULT_31 …
#define DC_KERNEL_NO_MULT_32 …
#define DC_KERNEL_NO_MULT_33 …
#define DC_KERNEL_NO_MULT_34 …
#define DC_KERNEL_NO_MULT_35 …
asm volatile(
// Prologue: reserve stack scratch, bail out when there is no depth work,
// then load parameters and precompute row/column pointers.
"sub sp, sp, #320\n"
"ldr w8, [%[function_params], #" STR(DP_OFFSET_DEPTH_MICRO_REPEATS) "]\n"
"cmp w8, #1\n"
"str w8, [sp, #36]\n"
"b.lt " DC_KERNEL_NO_MULT_35 "f\n"
"ldr w8, [%[function_params], #" STR(DP_OFFSET_OUTPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n"
"str xzr, [sp, #64]\n"
"str wzr, [sp, #60]\n"
"ldpsw x21, x14, [%[function_params], #" STR(DP_OFFSET_OUTPUT_HEIGHT_STRIDE) "]\n"
"str w8, [sp, #276]\n"
"ldr w8, [%[function_params], #" STR(DP_OFFSET_OUTPUT_WIDTH_MICRO_REPEATS) "]\n"
"ldrsw x13, [%[function_params], #" STR(DP_OFFSET_INPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n"
"ldrb w9, [%[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MAX) "]\n"
// x5 <- first int32 field of the parameter block; it is later used as the
// per-step advance of the output pointer.
"ldrsw x5, [%[function_params]]\n"
"str w8, [sp, #280]\n"
"ldr w8, [%[function_params], #" STR(DP_OFFSET_OUTPUT_RESIDUAL_WIDTH) "]\n"
"add x11, %[function_params], #" STR(DP_OFFSET_OUTPUT_SHIFT) "\n"
"add x12, %[function_params], #" STR(DP_OFFSET_OUTPUT_MULTIPLIER) "\n"
"add x10, %[function_params], #" STR(DP_OFFSET_OUTPUT_OFFSET) "\n"
"str w8, [sp, #284]\n"
"ldrb w8, [%[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MIN) "]\n"
"ld1r { v1.4s }, [x12]\n"
"ld1r { v2.4s }, [x11]\n"
"lsl x12, x14, #2\n"
"dup v7.16b, w8\n"
"fmov s5, w8\n"
"lsl x8, x13, #5\n"
"add x13, x14, x14, lsl #1\n"
"add x11, x14, x14, lsl #2\n"
// From here on x26 holds the output pointer and the output_block_data
// register holds the filter pointer.
"mov x26, %[output_block_data]\n"
"mov %[output_block_data], %[filter_workspace]\n"
"ldr w7, [%[function_params], #" STR(DP_OFFSET_OUTBOUND_BLOCK_HEIGHT) "]\n"
"ld1r { v0.8h }, [x10]\n"
"dup v16.16b, w9\n"
"fmov s6, w9\n"
"lsl x15, x14, #1\n"
"lsl %[filter_workspace], x21, #1\n"
"add x27, x21, x21, lsl #1\n"
"lsl x9, x5, #1\n"
"add x10, x21, x5\n"
"stp x11, x12, [sp, #208]\n"
"add x11, x11, %[scratch_block_data]\n"
"add x12, x12, %[scratch_block_data]\n"
"str x13, [sp, #224]\n"
"add x13, x13, %[scratch_block_data]\n"
"str x8, [sp, #24]\n"
"stp x15, x14, [sp, #256]\n"
"add x8, x14, %[scratch_block_data]\n"
"add x14, x15, %[scratch_block_data]\n"
"add x15, x9, x5\n"
"add x16, x9, x27\n"
"add x17, x9, %[filter_workspace]\n"
"add x6, x9, x21\n"
// function_params is no longer needed as a parameter pointer; its register
// is reused as an output pointer (hence the "+r" constraint below).
"add %[function_params], x26, x9\n"
"add x9, x26, x10\n"
"add x10, x11, #32\n"
"add x11, x12, #32\n"
"add x12, x13, #32\n"
"str x12, [sp, #312]\n"
"add x12, x14, #32\n"
"str x12, [sp, #304]\n"
"add x12, x15, x27\n"
"add x13, x15, %[filter_workspace]\n"
"add x23, x15, x21\n"
"add x14, x26, x15\n"
"add x15, x27, x5\n"
"add x20, x26, x17\n"
"mov w17, w7\n"
"add x19, x26, x15\n"
"add x15, %[filter_workspace], x5\n"
"mov x22, xzr\n"
"str x14, [sp, #296]\n"
"add x14, x26, x16\n"
"add x7, x26, x6\n"
"add x16, x26, x15\n"
"add x15, x26, x13\n"
"add x6, x26, x23\n"
"and w13, w17, #0xfffffffe\n"
"lsl x23, x5, #2\n"
"dup v17.8b, v5.b[0]\n"
"dup v14.8b, v6.b[0]\n"
"add x8, x8, #32\n"
"str x14, [sp, #288]\n"
"add x14, x26, x12\n"
"mov x12, xzr\n"
"str w13, [sp, #12]\n"
"mov x13, x16\n"
"stp x26, x23, [sp, #80]\n"
"add x23, x26, x21\n"
"add x22, x26, x5\n"
"mov x28, %[filter_workspace]\n"
// Was "add %[filter_workspace], x26, x1": the second source is the
// filter_workspace register itself, now referenced by name.
"add %[filter_workspace], x26, %[filter_workspace]\n"
"add x25, x26, x27\n"
"str %[scratch_block_data], [sp, #184]\n"
"str x21, [sp, #136]\n"
"str w17, [sp, #76]\n"
"str x26, [sp, #16]\n"
"stp d14, d17, [sp, #96]\n"
"stp x6, x23, [sp, #240]\n"
"b " DC_KERNEL_NO_MULT_4 "f\n"
DC_KERNEL_NO_MULT_2 ":\n"
"mov %[bias_data], x16\n"
// Label 3: end of one depth micro-repeat; advance the saved pointers and
// the counter at [sp, #60], leaving when it reaches the count at [sp, #36].
DC_KERNEL_NO_MULT_3 ":\n"
"ldr %[output_block_data], [sp, #24]\n"
"ldr x12, [sp, #184]\n"
"ldr w17, [sp, #60]\n"
"add x12, x12, %[output_block_data]\n"
"str x12, [sp, #184]\n"
"ldr x12, [sp, #80]\n"
"add w17, w17, #1\n"
"str w17, [sp, #60]\n"
"add x12, x12, #8\n"
"str x12, [sp, #80]\n"
"ldr x12, [sp, #64]\n"
"add x12, x12, %[output_block_data]\n"
"str x12, [sp, #64]\n"
"ldr w12, [sp, #36]\n"
"cmp w17, w12\n"
"ldp x12, %[output_block_data], [sp, #40]\n"
"ldr w17, [sp, #76]\n"
"add x12, x12, #8\n"
"b.eq " DC_KERNEL_NO_MULT_35 "f\n"
// Label 4: start of one depth micro-repeat; load 96 bytes of filter data
// and select the code path from the outbound block height in w17.
DC_KERNEL_NO_MULT_4 ":\n"
"ldp q18, q15, [%[output_block_data]]\n"
"ldp q19, q5, [%[output_block_data], #32]\n"
"ldp q20, q6, [%[output_block_data], #64]\n"
"cmp w17, #4\n"
// Was "add %[output_block_data], x3, #96".
"add %[output_block_data], %[output_block_data], #96\n"
"stp x12, %[output_block_data], [sp, #40]\n"
"b.ne " DC_KERNEL_NO_MULT_16 "f\n"
"mov x24, x12\n"
"ldr x12, [sp, #64]\n"
"mov x16, xzr\n"
"stp q6, q5, [sp, #144]\n"
"str q15, [sp, #112]\n"
"str x12, [sp, #232]\n"
"b " DC_KERNEL_NO_MULT_7 "f\n"
DC_KERNEL_NO_MULT_6 ":\n"
"ldr x12, [sp, #232]\n"
"ldp q20, q19, [sp, #144]\n"
"add x16, x16, #1\n"
"cmp x16, #2\n"
"add x12, x12, #16\n"
"add x24, x24, #4\n"
"mov v18.16b, v15.16b\n"
"str x12, [sp, #232]\n"
"b.eq " DC_KERNEL_NO_MULT_3 "b\n"
// Label 7: block-height-4 path; load the bias vector and six input rows,
// and start the first accumulations.
DC_KERNEL_NO_MULT_7 ":\n"
"ldr x12, [sp, #184]\n"
"ldr q21, [%[bias_data]], #16\n"
"add %[output_block_data], x12, x16, lsl #4\n"
"ldr w12, [sp, #280]\n"
"ldr q22, [%[output_block_data]]\n"
"mov v31.16b, v21.16b\n"
"mov v8.16b, v21.16b\n"
"cmp w12, #1\n"
"ldr x12, [sp, #264]\n"
"mov v9.16b, v21.16b\n"
"mov v10.16b, v21.16b\n"
"ldr q27, [%[output_block_data], x12]\n"
"ldr x12, [sp, #256]\n"
"ldr q26, [%[output_block_data], x12]\n"
"ldr x12, [sp, #224]\n"
".word 0x4e9a969f // sdot v31.4s, v20.16b, v26.16b\n"
"ldr q25, [%[output_block_data], x12]\n"
"ldr x12, [sp, #216]\n"
".word 0x4e9a9668 // sdot v8.4s, v19.16b, v26.16b\n"
".word 0x4e9a9649 // sdot v9.4s, v18.16b, v26.16b\n"
".word 0x4e99964a // sdot v10.4s, v18.16b, v25.16b\n"
"ldr q24, [%[output_block_data], x12]\n"
"ldr x12, [sp, #208]\n"
"ldr q23, [%[output_block_data], x12]\n"
"b.lt " DC_KERNEL_NO_MULT_11 "f\n"
"stp x24, x16, [sp, #192]\n"
"ldr w12, [sp, #280]\n"
"mov x17, x24\n"
"ldr x21, [sp, #232]\n"
"mov x24, x25\n"
"mov x25, %[filter_workspace]\n"
"mov %[filter_workspace], x22\n"
"mov x22, x23\n"
"ldr x23, [sp, #88]\n"
"shl v28.4s, v18.4s, #8\n"
"shl v29.4s, v19.4s, #8\n"
"shl v30.4s, v20.4s, #8\n"
"mov v11.16b, v23.16b\n"
"mov v12.16b, v24.16b\n"
"mov v13.16b, v27.16b\n"
"mov v14.16b, v22.16b\n"
// Label 9: main width loop of the block-height-4 path; w12 counts down the
// output width micro-repeats loaded from [sp, #280].
DC_KERNEL_NO_MULT_9 ":\n"
".word 0x4e8e965f // sdot v31.4s, v18.16b, v14.16b\n"
".word 0x4e8d9648 // sdot v8.4s, v18.16b, v13.16b\n"
".word 0x4e999669 // sdot v9.4s, v19.16b, v25.16b\n"
".word 0x4e8d967f // sdot v31.4s, v19.16b, v13.16b\n"
".word 0x4e8c966a // sdot v10.4s, v19.16b, v12.16b\n"
".word 0x4e999688 // sdot v8.4s, v20.16b, v25.16b\n"
".word 0x4e8c9689 // sdot v9.4s, v20.16b, v12.16b\n"
"sqrdmulh v31.4s, v31.4s, v1.4s\n"
".word 0x4e8b968a // sdot v10.4s, v20.16b, v11.16b\n"
"sqrdmulh v8.4s, v8.4s, v1.4s\n"
"sqrdmulh v9.4s, v9.4s, v1.4s\n"
"sqrshl v31.4s, v31.4s, v2.4s\n"
"sqrdmulh v10.4s, v10.4s, v1.4s\n"
"sqrshl v8.4s, v8.4s, v2.4s\n"
"sqrshl v9.4s, v9.4s, v2.4s\n"
"sqxtn v31.4h, v31.4s\n"
"sqrshl v10.4s, v10.4s, v2.4s\n"
"sqxtn v9.4h, v9.4s\n"
"sqxtn2 v31.8h, v8.4s\n"
"sqxtn2 v9.8h, v10.4s\n"
"sqadd v31.8h, v31.8h, v0.8h\n"
"sqadd v8.8h, v9.8h, v0.8h\n"
"sqxtun v31.8b, v31.8h\n"
"sqxtun2 v31.16b, v8.8h\n"
"umax v31.16b, v31.16b, v7.16b\n"
"add %[output_block_data], x22, x17\n"
"umin v31.16b, v31.16b, v16.16b\n"
"str s31, [x26, x17]\n"
"st1 { v31.s }[1], [%[output_block_data]]\n"
"add %[output_block_data], x25, x17\n"
"st1 { v31.s }[2], [%[output_block_data]]\n"
"add %[output_block_data], x24, x17\n"
"mov v10.16b, v21.16b\n"
"st1 { v31.s }[3], [%[output_block_data]]\n"
"mov v31.16b, v21.16b\n"
"mov v8.16b, v21.16b\n"
".word 0x4e99978a // sdot v10.4s, v28.16b, v25.16b\n"
"mov x16, x26\n"
"ldr x26, [sp, #304]\n"
".word 0x4e8e979f // sdot v31.4s, v28.16b, v14.16b\n"
".word 0x4e8d9788 // sdot v8.4s, v28.16b, v13.16b\n"
".word 0x4e8c97aa // sdot v10.4s, v29.16b, v12.16b\n"
"mov v9.16b, v21.16b\n"
".word 0x4e8d97bf // sdot v31.4s, v29.16b, v13.16b\n"
".word 0x4e9a97a8 // sdot v8.4s, v29.16b, v26.16b\n"
".word 0x4e8b97ca // sdot v10.4s, v30.16b, v11.16b\n"
"rev32 v4.8h, v26.8h\n"
".word 0x4e9a9789 // sdot v9.4s, v28.16b, v26.16b\n"
".word 0x4e9a97df // sdot v31.4s, v30.16b, v26.16b\n"
".word 0x4e9997c8 // sdot v8.4s, v30.16b, v25.16b\n"
"sqrdmulh v26.4s, v10.4s, v1.4s\n"
"rev32 v6.8h, v24.8h\n"
".word 0x4e9997a9 // sdot v9.4s, v29.16b, v25.16b\n"
"sqrdmulh v24.4s, v8.4s, v1.4s\n"
"sqrshl v8.4s, v26.4s, v2.4s\n"
"ldr q26, [x26, x21]\n"
"ldr x26, [sp, #312]\n"
"mov v17.16b, v16.16b\n"
"mov v16.16b, v7.16b\n"
"rev32 v7.8h, v23.8h\n"
".word 0x4e8c97c9 // sdot v9.4s, v30.16b, v12.16b\n"
"sqrdmulh v23.4s, v31.4s, v1.4s\n"
"rev32 v5.8h, v25.8h\n"
"sqrdmulh v25.4s, v9.4s, v1.4s\n"
"sqrshl v23.4s, v23.4s, v2.4s\n"
"add %[output_block_data], %[scratch_block_data], x21\n"
"sqrshl v31.4s, v24.4s, v2.4s\n"
"sqrshl v24.4s, v25.4s, v2.4s\n"
"sqxtn v9.4h, v23.4s\n"
"rev32 v15.8h, v22.8h\n"
"ldr q22, [%[output_block_data], #32]\n"
"rev32 v3.8h, v27.8h\n"
"sqxtn v10.4h, v24.4s\n"
"ldr q27, [x8, x21]\n"
"ldr q25, [x26, x21]\n"
"ldr q24, [x11, x21]\n"
"ldr q23, [x10, x21]\n"
"sqxtn2 v9.8h, v31.4s\n"
"sqxtn2 v10.8h, v8.4s\n"
"sqadd v31.8h, v9.8h, v0.8h\n"
"sqadd v8.8h, v10.8h, v0.8h\n"
"sqxtun v31.8b, v31.8h\n"
"sqxtun2 v31.16b, v8.8h\n"
"umax v31.16b, v31.16b, v16.16b\n"
"add %[output_block_data], x9, x17\n"
"umin v31.16b, v31.16b, v17.16b\n"
"str s31, [%[filter_workspace], x17]\n"
"st1 { v31.s }[1], [%[output_block_data]]\n"
"add %[output_block_data], x13, x17\n"
"st1 { v31.s }[2], [%[output_block_data]]\n"
"add %[output_block_data], x19, x17\n"
"mov v8.16b, v21.16b\n"
"st1 { v31.s }[3], [%[output_block_data]]\n"
"trn1 v31.8h, v15.8h, v22.8h\n"
"mov v9.16b, v21.16b\n"
"mov v10.16b, v21.16b\n"
"trn1 v3.8h, v3.8h, v27.8h\n"
"trn1 v4.8h, v4.8h, v26.8h\n"
".word 0x4e9f9648 // sdot v8.4s, v18.16b, v31.16b\n"
"mov v11.16b, v21.16b\n"
"trn1 v5.8h, v5.8h, v25.8h\n"
".word 0x4e839649 // sdot v9.4s, v18.16b, v3.16b\n"
".word 0x4e84964a // sdot v10.4s, v18.16b, v4.16b\n"
".word 0x4e839668 // sdot v8.4s, v19.16b, v3.16b\n"
"trn1 v6.8h, v6.8h, v24.8h\n"
".word 0x4e85964b // sdot v11.4s, v18.16b, v5.16b\n"
".word 0x4e849669 // sdot v9.4s, v19.16b, v4.16b\n"
".word 0x4e85966a // sdot v10.4s, v19.16b, v5.16b\n"
".word 0x4e849688 // sdot v8.4s, v20.16b, v4.16b\n"
"trn1 v7.8h, v7.8h, v23.8h\n"
".word 0x4e86966b // sdot v11.4s, v19.16b, v6.16b\n"
".word 0x4e859689 // sdot v9.4s, v20.16b, v5.16b\n"
".word 0x4e86968a // sdot v10.4s, v20.16b, v6.16b\n"
"sqrdmulh v8.4s, v8.4s, v1.4s\n"
".word 0x4e87968b // sdot v11.4s, v20.16b, v7.16b\n"
"sqrdmulh v9.4s, v9.4s, v1.4s\n"
"sqrdmulh v10.4s, v10.4s, v1.4s\n"
"sqrshl v8.4s, v8.4s, v2.4s\n"
"sqrdmulh v11.4s, v11.4s, v1.4s\n"
"sqrshl v9.4s, v9.4s, v2.4s\n"
"sqrshl v10.4s, v10.4s, v2.4s\n"
"sqxtn v8.4h, v8.4s\n"
"sqrshl v11.4s, v11.4s, v2.4s\n"
"sqxtn v10.4h, v10.4s\n"
"sqxtn2 v8.8h, v9.4s\n"
"sqxtn2 v10.8h, v11.4s\n"
"sqadd v8.8h, v8.8h, v0.8h\n"
"sqadd v9.8h, v10.8h, v0.8h\n"
"sqxtun v8.8b, v8.8h\n"
"sqxtun2 v8.16b, v9.8h\n"
"mov v9.16b, v21.16b\n"
"mov v10.16b, v21.16b\n"
"mov v11.16b, v21.16b\n"
".word 0x4e9f9789 // sdot v9.4s, v28.16b, v31.16b\n"
"mov x26, x16\n"
"ldr x16, [sp, #288]\n"
"mov v12.16b, v21.16b\n"
".word 0x4e83978a // sdot v10.4s, v28.16b, v3.16b\n"
".word 0x4e84978b // sdot v11.4s, v28.16b, v4.16b\n"
".word 0x4e8397a9 // sdot v9.4s, v29.16b, v3.16b\n"
"umax v8.16b, v8.16b, v16.16b\n"
".word 0x4e85978c // sdot v12.4s, v28.16b, v5.16b\n"
".word 0x4e8497aa // sdot v10.4s, v29.16b, v4.16b\n"
".word 0x4e8597ab // sdot v11.4s, v29.16b, v5.16b\n"
".word 0x4e8497c9 // sdot v9.4s, v30.16b, v4.16b\n"
"add %[output_block_data], x7, x17\n"
"umin v8.16b, v8.16b, v17.16b\n"
".word 0x4e8697ac // sdot v12.4s, v29.16b, v6.16b\n"
".word 0x4e8597ca // sdot v10.4s, v30.16b, v5.16b\n"
".word 0x4e8697cb // sdot v11.4s, v30.16b, v6.16b\n"
"sqrdmulh v3.4s, v9.4s, v1.4s\n"
"str s8, [%[function_params], x17]\n"
"st1 { v8.s }[1], [%[output_block_data]]\n"
"add %[output_block_data], x20, x17\n"
".word 0x4e8797cc // sdot v12.4s, v30.16b, v7.16b\n"
"sqrdmulh v4.4s, v10.4s, v1.4s\n"
"sqrdmulh v5.4s, v11.4s, v1.4s\n"
"sqrshl v3.4s, v3.4s, v2.4s\n"
"st1 { v8.s }[2], [%[output_block_data]]\n"
"add %[output_block_data], x16, x17\n"
"sqrdmulh v6.4s, v12.4s, v1.4s\n"
"sqrshl v4.4s, v4.4s, v2.4s\n"
"sqrshl v5.4s, v5.4s, v2.4s\n"
"sqxtn v3.4h, v3.4s\n"
"st1 { v8.s }[3], [%[output_block_data]]\n"
"sqrshl v6.4s, v6.4s, v2.4s\n"
"sqxtn v5.4h, v5.4s\n"
"sqxtn2 v3.8h, v4.4s\n"
"sqxtn2 v5.8h, v6.4s\n"
"sqadd v3.8h, v3.8h, v0.8h\n"
"sqadd v4.8h, v5.8h, v0.8h\n"
"sqxtun v3.8b, v3.8h\n"
"sqxtun2 v3.16b, v4.8h\n"
"ldr x16, [sp, #296]\n"
"mov v7.16b, v16.16b\n"
"umax v3.16b, v3.16b, v7.16b\n"
"add %[output_block_data], x6, x17\n"
"umin v3.16b, v3.16b, v17.16b\n"
"str s3, [x16, x17]\n"
"st1 { v3.s }[1], [%[output_block_data]]\n"
"add %[output_block_data], x15, x17\n"
"mov v31.16b, v21.16b\n"
"mov v8.16b, v21.16b\n"
"mov v9.16b, v21.16b\n"
"mov v10.16b, v21.16b\n"
"mov v16.16b, v17.16b\n"
"st1 { v3.s }[2], [%[output_block_data]]\n"
"add %[output_block_data], x14, x17\n"
"subs w12, w12, #1\n"
"add x21, x21, #32\n"
".word 0x4e9a969f // sdot v31.4s, v20.16b, v26.16b\n"
".word 0x4e9a9668 // sdot v8.4s, v19.16b, v26.16b\n"
".word 0x4e9a9649 // sdot v9.4s, v18.16b, v26.16b\n"
".word 0x4e99964a // sdot v10.4s, v18.16b, v25.16b\n"
"add x17, x17, x23\n"
"mov v11.16b, v23.16b\n"
"mov v12.16b, v24.16b\n"
"mov v13.16b, v27.16b\n"
"mov v14.16b, v22.16b\n"
"st1 { v3.s }[3], [%[output_block_data]]\n"
"b.ne " DC_KERNEL_NO_MULT_9 "b\n"
// Main loop done: restore the registers that were shuffled before label 9.
"add %[output_block_data], %[scratch_block_data], x21\n"
"ldr x21, [sp, #136]\n"
"ldp d14, d17, [sp, #96]\n"
"mov x23, x22\n"
"mov x22, %[filter_workspace]\n"
"mov %[filter_workspace], x25\n"
"mov x25, x24\n"
"ldr q15, [sp, #112]\n"
"ldp x24, x16, [sp, #192]\n"
"add x12, x26, x17\n"
"ldr w17, [sp, #284]\n"
"cmp w17, #0\n"
"b.gt " DC_KERNEL_NO_MULT_12 "f\n"
"b " DC_KERNEL_NO_MULT_6 "b\n"
DC_KERNEL_NO_MULT_11 ":\n"
"ldr x12, [sp, #80]\n"
"add x12, x12, x16, lsl #2\n"
"ldr w17, [sp, #284]\n"
"cmp w17, #0\n"
"b.le " DC_KERNEL_NO_MULT_6 "b\n"
// Label 12: residual width handling for the block-height-4 path; the count
// comes from [sp, #284] (the output residual width stored in the prologue).
DC_KERNEL_NO_MULT_12 ":\n"
"ldr w17, [sp, #284]\n"
"movi v28.16b, #0\n"
"movi v29.16b, #0\n"
"movi v30.16b, #0\n"
"cmp w17, #3\n"
"movi v11.16b, #0\n"
"movi v12.16b, #0\n"
"movi v13.16b, #0\n"
"b.lt " DC_KERNEL_NO_MULT_14 "f\n"
"add x17, %[output_block_data], #32\n"
"ldr %[output_block_data], [sp, #264]\n"
"ldr q13, [x17]\n"
"ldr q12, [x17, %[output_block_data]]\n"
"ldr %[output_block_data], [sp, #256]\n"
"ldr q11, [x17, %[output_block_data]]\n"
"ldr %[output_block_data], [sp, #224]\n"
"ldr q30, [x17, %[output_block_data]]\n"
"ldr %[output_block_data], [sp, #216]\n"
"ldr q29, [x17, %[output_block_data]]\n"
"ldr %[output_block_data], [sp, #208]\n"
"ldr q28, [x17, %[output_block_data]]\n"
DC_KERNEL_NO_MULT_14 ":\n"
"ldr w17, [sp, #284]\n"
DC_KERNEL_NO_MULT_15 ":\n"
".word 0x4e96965f // sdot v31.4s, v18.16b, v22.16b\n"
".word 0x4e9b9648 // sdot v8.4s, v18.16b, v27.16b\n"
".word 0x4e999669 // sdot v9.4s, v19.16b, v25.16b\n"
".word 0x4e9b967f // sdot v31.4s, v19.16b, v27.16b\n"
".word 0x4e98966a // sdot v10.4s, v19.16b, v24.16b\n"
".word 0x4e999688 // sdot v8.4s, v20.16b, v25.16b\n"
".word 0x4e989689 // sdot v9.4s, v20.16b, v24.16b\n"
"sqrdmulh v3.4s, v31.4s, v1.4s\n"
".word 0x4e97968a // sdot v10.4s, v20.16b, v23.16b\n"
"sqrdmulh v4.4s, v8.4s, v1.4s\n"
"sqrdmulh v5.4s, v9.4s, v1.4s\n"
"sqrshl v3.4s, v3.4s, v2.4s\n"
"sqrdmulh v6.4s, v10.4s, v1.4s\n"
"sqrshl v4.4s, v4.4s, v2.4s\n"
"sqrshl v5.4s, v5.4s, v2.4s\n"
"sqxtn v3.4h, v3.4s\n"
"sqrshl v6.4s, v6.4s, v2.4s\n"
"sqxtn v5.4h, v5.4s\n"
"sqxtn2 v3.8h, v4.4s\n"
"sqxtn2 v5.8h, v6.4s\n"
"sqadd v3.8h, v3.8h, v0.8h\n"
"sqadd v4.8h, v5.8h, v0.8h\n"
"sqxtun v3.8b, v3.8h\n"
"sqxtun2 v3.16b, v4.8h\n"
"umax v3.16b, v3.16b, v7.16b\n"
"add %[output_block_data], x12, x21\n"
"umin v3.16b, v3.16b, v16.16b\n"
"ushr v26.4s, v26.4s, #8\n"
"ushr v25.4s, v25.4s, #8\n"
"str s3, [x12]\n"
"st1 { v3.s }[1], [%[output_block_data]]\n"
"add %[output_block_data], x12, x28\n"
"ushr v22.4s, v22.4s, #8\n"
"ushr v27.4s, v27.4s, #8\n"
"sli v26.4s, v11.4s, #24\n"
"ushr v24.4s, v24.4s, #8\n"
"ushr v23.4s, v23.4s, #8\n"
"sli v25.4s, v30.4s, #24\n"
"mov v31.16b, v21.16b\n"
"mov v8.16b, v21.16b\n"
"mov v9.16b, v21.16b\n"
"mov v10.16b, v21.16b\n"
"st1 { v3.s }[2], [%[output_block_data]]\n"
"add %[output_block_data], x12, x27\n"
"subs w17, w17, #1\n"
"sli v22.4s, v13.4s, #24\n"
"ushr v13.4s, v13.4s, #8\n"
"ushr v11.4s, v11.4s, #8\n"
"sli v27.4s, v12.4s, #24\n"
"ushr v12.4s, v12.4s, #8\n"
"ushr v30.4s, v30.4s, #8\n"
"sli v24.4s, v29.4s, #24\n"
"ushr v29.4s, v29.4s, #8\n"
"sli v23.4s, v28.4s, #24\n"
"ushr v28.4s, v28.4s, #8\n"
".word 0x4e9a969f // sdot v31.4s, v20.16b, v26.16b\n"
".word 0x4e9a9668 // sdot v8.4s, v19.16b, v26.16b\n"
".word 0x4e9a9649 // sdot v9.4s, v18.16b, v26.16b\n"
"add x12, x12, x5\n"
".word 0x4e99964a // sdot v10.4s, v18.16b, v25.16b\n"
"st1 { v3.s }[3], [%[output_block_data]]\n"
"b.ne " DC_KERNEL_NO_MULT_15 "b\n"
"b " DC_KERNEL_NO_MULT_6 "b\n"
// Label 16: path taken when the outbound block height is not 4; rows are
// processed one at a time (row counter w24, limit at [sp, #76]).
DC_KERNEL_NO_MULT_16 ":\n"
"cmp w17, #1\n"
"add x16, %[bias_data], #32\n"
"b.lt " DC_KERNEL_NO_MULT_2 "b\n"
"ldr w23, [sp, #276]\n"
"cmp w23, #1\n"
"b.lt " DC_KERNEL_NO_MULT_29 "f\n"
"str x16, [sp, #192]\n"
"ldp q21, q22, [%[bias_data]]\n"
"ldr x17, [sp, #184]\n"
"ldr x12, [sp, #80]\n"
"ldr x23, [sp, #248]\n"
"mov w24, wzr\n"
"b " DC_KERNEL_NO_MULT_20 "f\n"
DC_KERNEL_NO_MULT_19 ":\n"
"ldr w12, [sp, #76]\n"
"add w24, w24, #1\n"
"ldr x21, [sp, #136]\n"
"ldr x17, [sp, #200]\n"
"cmp w24, w12\n"
"ldr x12, [sp, #232]\n"
"add x12, x12, x21\n"
"b.eq " DC_KERNEL_NO_MULT_28 "f\n"
DC_KERNEL_NO_MULT_20 ":\n"
"ldr %[output_block_data], [sp, #264]\n"
"ldp q23, q24, [x17]\n"
"mov x21, x12\n"
"mov w12, wzr\n"
"add x16, x17, %[output_block_data]\n"
"ldr %[output_block_data], [sp, #256]\n"
"ldp q25, q26, [x16]\n"
"str x16, [sp, #200]\n"
// Was "add %[output_block_data], x17, x3".
"add %[output_block_data], x17, %[output_block_data]\n"
"ldp q27, q28, [%[output_block_data]]\n"
"str x21, [sp, #232]\n"
"b " DC_KERNEL_NO_MULT_23 "f\n"
DC_KERNEL_NO_MULT_21 ":\n"
"mov %[filter_workspace], x26\n"
DC_KERNEL_NO_MULT_22 ":\n"
"ldr w17, [sp, #276]\n"
"add w12, w12, #1\n"
"cmp w12, w17\n"
"mov x17, x16\n"
"b.eq " DC_KERNEL_NO_MULT_19 "b\n"
// Label 23: choose the number of output steps for this micro-block: the
// residual width when w12 equals the value at [sp, #280], otherwise 4.
// The filter_workspace register is parked in x26 while its 32-bit view is
// used as a temporary; the 32-bit view of the output_block_data register
// is used as the step counter.  These were written as w1/w3 before.
DC_KERNEL_NO_MULT_23 ":\n"
"mov x26, %[filter_workspace]\n"
"ldr %w[filter_workspace], [sp, #280]\n"
"ldr %w[output_block_data], [sp, #284]\n"
"add x16, x17, #32\n"
"cmp w12, %w[filter_workspace]\n"
"mov %w[filter_workspace], #4\n"
"csel %w[output_block_data], %w[output_block_data], %w[filter_workspace], eq\n"
"cmp %w[output_block_data], #3\n"
"b.ge " DC_KERNEL_NO_MULT_25 "f\n"
"movi v29.16b, #0\n"
"cmp %w[output_block_data], #1\n"
"movi v30.16b, #0\n"
"movi v31.16b, #0\n"
"movi v9.16b, #0\n"
"movi v10.16b, #0\n"
"movi v8.16b, #0\n"
"b.ge " DC_KERNEL_NO_MULT_26 "f\n"
"b " DC_KERNEL_NO_MULT_21 "b\n"
DC_KERNEL_NO_MULT_25 ":\n"
"ldr x23, [sp, #264]\n"
"mov %[filter_workspace], x22\n"
"mov x22, x15\n"
"mov x15, x14\n"
"add x23, x16, x23\n"
"mov x14, x13\n"
"mov x13, x20\n"
"mov x20, x16\n"
"mov x16, x25\n"
"ldr x25, [sp, #256]\n"
"ldp q8, q31, [x17, #32]\n"
"ldp q10, q30, [x23]\n"
"ldp x6, x23, [sp, #240]\n"
"add x25, x20, x25\n"
"ldp q9, q29, [x25]\n"
"mov x25, x16\n"
"mov x16, x20\n"
"mov x20, x13\n"
"mov x13, x14\n"
"mov x14, x15\n"
"mov x15, x22\n"
"mov x22, %[filter_workspace]\n"
"mov %[bias_data], x7\n"
DC_KERNEL_NO_MULT_26 ":\n"
"mov %[filter_workspace], x26\n"
// Label 27: one output step of the row-at-a-time path; stores 8 bytes and
// advances the output pointer x21 by x5.
DC_KERNEL_NO_MULT_27 ":\n"
"mov v3.16b, v21.16b\n"
"mov v4.16b, v22.16b\n"
".word 0x4e979643 // sdot v3.4s, v18.16b, v23.16b\n"
".word 0x4e9895e4 // sdot v4.4s, v15.16b, v24.16b\n"
".word 0x4e999663 // sdot v3.4s, v19.16b, v25.16b\n"
".word 0x4e9a94a4 // sdot v4.4s, v5.16b, v26.16b\n"
".word 0x4e9b9683 // sdot v3.4s, v20.16b, v27.16b\n"
".word 0x4e9c94c4 // sdot v4.4s, v6.16b, v28.16b\n"
"sqrdmulh v3.4s, v3.4s, v1.4s\n"
"sqrdmulh v4.4s, v4.4s, v1.4s\n"
"sqrshl v3.4s, v3.4s, v2.4s\n"
"sqrshl v4.4s, v4.4s, v2.4s\n"
"sqxtn v3.4h, v3.4s\n"
"sqxtn2 v3.8h, v4.4s\n"
"sqadd v3.8h, v3.8h, v0.8h\n"
"sqxtun v3.8b, v3.8h\n"
"umax v3.8b, v3.8b, v17.8b\n"
"ushr v23.4s, v23.4s, #8\n"
"ushr v24.4s, v24.4s, #8\n"
"ushr v25.4s, v25.4s, #8\n"
"ushr v26.4s, v26.4s, #8\n"
"ushr v27.4s, v27.4s, #8\n"
"ushr v28.4s, v28.4s, #8\n"
"umin v3.8b, v3.8b, v14.8b\n"
// Was "subs w3, w3, #1".
"subs %w[output_block_data], %w[output_block_data], #1\n"
"sli v23.4s, v8.4s, #24\n"
"ushr v8.4s, v8.4s, #8\n"
"sli v24.4s, v31.4s, #24\n"
"ushr v31.4s, v31.4s, #8\n"
"sli v25.4s, v10.4s, #24\n"
"ushr v10.4s, v10.4s, #8\n"
"sli v26.4s, v30.4s, #24\n"
"ushr v30.4s, v30.4s, #8\n"
"sli v27.4s, v9.4s, #24\n"
"ushr v9.4s, v9.4s, #8\n"
"sli v28.4s, v29.4s, #24\n"
"ushr v29.4s, v29.4s, #8\n"
"str d3, [x21]\n"
"add x21, x21, x5\n"
"b.ne " DC_KERNEL_NO_MULT_27 "b\n"
"b " DC_KERNEL_NO_MULT_22 "b\n"
DC_KERNEL_NO_MULT_28 ":\n"
"ldr %[bias_data], [sp, #192]\n"
"ldr x26, [sp, #16]\n"
"b " DC_KERNEL_NO_MULT_3 "b\n"
// Labels 29..34: reached when the overall width repeat count is < 1.  The
// loops below only count down and touch no memory.
DC_KERNEL_NO_MULT_29 ":\n"
"ldr w12, [sp, #12]\n"
"cmp w17, #2\n"
"b.hs " DC_KERNEL_NO_MULT_31 "f\n"
"ldr x23, [sp, #248]\n"
"mov w12, wzr\n"
"b " DC_KERNEL_NO_MULT_33 "f\n"
DC_KERNEL_NO_MULT_31 ":\n"
"subs w12, w12, #2\n"
"b.ne " DC_KERNEL_NO_MULT_31 "b\n"
"ldr w12, [sp, #12]\n"
"ldr x23, [sp, #248]\n"
"cmp w17, w12\n"
"b.eq " DC_KERNEL_NO_MULT_2 "b\n"
DC_KERNEL_NO_MULT_33 ":\n"
"sub w12, w17, w12\n"
DC_KERNEL_NO_MULT_34 ":\n"
"subs w12, w12, #1\n"
"b.ne " DC_KERNEL_NO_MULT_34 "b\n"
"b " DC_KERNEL_NO_MULT_2 "b\n"
// Label 35: common exit; release the stack scratch area.
DC_KERNEL_NO_MULT_35 ":\n"
"add sp, sp, #320\n"
:
// All five operands are read and then overwritten by the code above.
[ scratch_block_data ] "+r"(scratch_block_data),
[ filter_workspace ] "+r"(filter_workspace),
[ bias_data ] "+r"(bias_data),
[ output_block_data ] "+r"(output_block_data),
[ function_params ] "+r"(function_params)
:
:
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31",
"x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
"x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26",
"x27", "x28");
#undef DC_KERNEL_NO_MULT_1
#undef DC_KERNEL_NO_MULT_2
#undef DC_KERNEL_NO_MULT_3
#undef DC_KERNEL_NO_MULT_4
#undef DC_KERNEL_NO_MULT_5
#undef DC_KERNEL_NO_MULT_6
#undef DC_KERNEL_NO_MULT_7
#undef DC_KERNEL_NO_MULT_8
#undef DC_KERNEL_NO_MULT_9
#undef DC_KERNEL_NO_MULT_10
#undef DC_KERNEL_NO_MULT_11
#undef DC_KERNEL_NO_MULT_12
#undef DC_KERNEL_NO_MULT_13
#undef DC_KERNEL_NO_MULT_14
#undef DC_KERNEL_NO_MULT_15
#undef DC_KERNEL_NO_MULT_16
#undef DC_KERNEL_NO_MULT_17
#undef DC_KERNEL_NO_MULT_18
#undef DC_KERNEL_NO_MULT_19
#undef DC_KERNEL_NO_MULT_20
#undef DC_KERNEL_NO_MULT_21
#undef DC_KERNEL_NO_MULT_22
#undef DC_KERNEL_NO_MULT_23
#undef DC_KERNEL_NO_MULT_24
#undef DC_KERNEL_NO_MULT_25
#undef DC_KERNEL_NO_MULT_26
#undef DC_KERNEL_NO_MULT_27
#undef DC_KERNEL_NO_MULT_28
#undef DC_KERNEL_NO_MULT_29
#undef DC_KERNEL_NO_MULT_30
#undef DC_KERNEL_NO_MULT_31
#undef DC_KERNEL_NO_MULT_32
#undef DC_KERNEL_NO_MULT_33
#undef DC_KERNEL_NO_MULT_34
#undef DC_KERNEL_NO_MULT_35
}
// Out-of-line entry point: forwards every argument unchanged to
// KernelMacroBlockNeon.  Marked noinline, so the assembly body is emitted
// once per specialisation rather than at each call site.
static void __attribute__((noinline))
Run(const int8* scratch_block_data, const int8* filter_workspace,
const int32* bias_data, uint8* output_block_data,
const DepthwiseConvDotProdParams* function_params) {
KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
output_block_data, function_params);
}
};
template <>
struct KernelMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
QuantizationType::kNonPerChannelUint8,
DepthwiseConvDepthMultiplication::kNoMultiplication,
2> {
static inline void KernelMacroBlockNeon(
const int8* scratch_block_data, const int8* filter_workspace,
const int32* bias_data, uint8* output_block_data,
const DepthwiseConvDotProdParams* function_params) {
#define DC_KERNEL_NO_MULT_STRIDE_1 …
#define DC_KERNEL_NO_MULT_STRIDE_2 …
#define DC_KERNEL_NO_MULT_STRIDE_3 …
#define DC_KERNEL_NO_MULT_STRIDE_4 …
#define DC_KERNEL_NO_MULT_STRIDE_5 …
#define DC_KERNEL_NO_MULT_STRIDE_6 …
#define DC_KERNEL_NO_MULT_STRIDE_7 …
#define DC_KERNEL_NO_MULT_STRIDE_8 …
#define DC_KERNEL_NO_MULT_STRIDE_9 …
#define DC_KERNEL_NO_MULT_STRIDE_10 …
#define DC_KERNEL_NO_MULT_STRIDE_11 …
#define DC_KERNEL_NO_MULT_STRIDE_12 …
#define DC_KERNEL_NO_MULT_STRIDE_13 …
#define DC_KERNEL_NO_MULT_STRIDE_14 …
#define DC_KERNEL_NO_MULT_STRIDE_15 …
#define DC_KERNEL_NO_MULT_STRIDE_16 …
#define DC_KERNEL_NO_MULT_STRIDE_17 …
#define DC_KERNEL_NO_MULT_STRIDE_18 …
#define DC_KERNEL_NO_MULT_STRIDE_19 …
#define DC_KERNEL_NO_MULT_STRIDE_20 …
#define DC_KERNEL_NO_MULT_STRIDE_21 …
#define DC_KERNEL_NO_MULT_STRIDE_22 …
#define DC_KERNEL_NO_MULT_STRIDE_23 …
#define DC_KERNEL_NO_MULT_STRIDE_24 …
#define DC_KERNEL_NO_MULT_STRIDE_25 …
#define DC_KERNEL_NO_MULT_STRIDE_26 …
#define DC_KERNEL_NO_MULT_STRIDE_27 …
#define DC_KERNEL_NO_MULT_STRIDE_28 …
#define DC_KERNEL_NO_MULT_STRIDE_29 …
#define DC_KERNEL_NO_MULT_STRIDE_30 …
#define DC_KERNEL_NO_MULT_STRIDE_31 …
#define DC_KERNEL_NO_MULT_STRIDE_32 …
#define DC_KERNEL_NO_MULT_STRIDE_33 …
#define DC_KERNEL_NO_MULT_STRIDE_34 …
#define DC_KERNEL_NO_MULT_STRIDE_35 …
asm volatile(
"sub sp, sp, #160\n"
"stp %[output_block_data], %[filter_workspace], [sp, #144]\n"
"ldr w8, [%[function_params], #" STR(DP_OFFSET_DEPTH_MICRO_REPEATS) "]\n"
"cmp w8, #1\n"
"b.lt " DC_KERNEL_NO_MULT_STRIDE_35 "f\n"
"ldr x14, [%[function_params]]\n"
"ldpsw x11, x12, [%[function_params], #" STR(DP_OFFSET_OUTPUT_HEIGHT_STRIDE) "]\n"
"ldp w13, w3, [%[function_params], #" STR(DP_OFFSET_OUTPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n"
"add x15, %[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MIN) "\n"
"add x17, %[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MAX) "\n"
"add x5, %[function_params], #" STR(DP_OFFSET_OUTPUT_OFFSET) "\n"
"add x6, %[function_params], #" STR(DP_OFFSET_OUTPUT_SHIFT) "\n"
"add x7, %[function_params], #" STR(DP_OFFSET_OUTPUT_MULTIPLIER) "\n"
"ldrsw x19, [%[function_params], #" STR(DP_OFFSET_INPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n"
"ldr w1, [%[function_params], #" STR(DP_OFFSET_OUTBOUND_BLOCK_HEIGHT) "]\n"
"ldp w16, w4, [%[function_params], #" STR(DP_OFFSET_OUTPUT_RESIDUAL_WIDTH) "]\n"
"ld1r { v1.8b }, [x15]\n"
"lsl w15, w14, #1\n"
"sxtw x20, w15\n"
"cmp w16, #1\n"
"ldr x15, [sp, #144]\n"
"ccmp w3, w13, #0, eq\n"
"ld1r { v0.8h }, [x5]\n"
"ld1r { v2.8b }, [x17]\n"
"ld1r { v3.4s }, [x7]\n"
"ld1r { v4.4s }, [x6]\n"
"csel w23, w3, w13, lt\n"
"sxtw x6, w14\n"
"bic w14, w23, w23, asr #31\n"
"lsl x5, x12, #1\n"
"madd x15, x20, x14, x15\n"
"sub x14, x13, x14\n"
"mov x9, xzr\n"
"mov x10, xzr\n"
"str w4, [sp, #84]\n"
"lsl %[function_params], x19, #5\n"
"lsl x7, x12, #2\n"
"add x19, x5, x12\n"
"str x14, [sp, #136]\n"
"add x14, x15, #4\n"
"str %[output_block_data], [sp, #72]\n"
"str x15, [sp, #88]\n"
"str x14, [sp, #8]\n"
"b " DC_KERNEL_NO_MULT_STRIDE_4 "f\n"
DC_KERNEL_NO_MULT_STRIDE_2 ":\n"
"add x25, %[bias_data], #32\n"
"mov v22.16b, v12.16b\n"
DC_KERNEL_NO_MULT_STRIDE_3 ":\n"
"add x10, x10, #1\n"
"cmp x10, x8\n"
"add x9, x9, #8\n"
"mov %[bias_data], x25\n"
"b.eq " DC_KERNEL_NO_MULT_STRIDE_35 "f\n"
DC_KERNEL_NO_MULT_STRIDE_4 ":\n"
"ldr x15, [sp, #152]\n"
"add w14, w10, w10, lsl #1\n"
"lsl w14, w14, #5\n"
"cmp w1, #2\n"
"add x27, x15, x14\n"
"madd x26, x10, %[function_params], %[scratch_block_data]\n"
"b.ne " DC_KERNEL_NO_MULT_STRIDE_15 "f\n"
"ubfx x14, x9, #3, #29\n"
"lsl x25, x14, #3\n"
"ldr x14, [sp, #88]\n"
"ldr q24, [x27]\n"
"ldr q25, [x27, #32]\n"
"ldr q26, [x27, #64]\n"
"add x24, x14, x25\n"
"ldr x14, [sp, #144]\n"
"ldr q27, [%[bias_data]]\n"
"ldr q31, [x26]\n"
"ldr q8, [x26, x12]\n"
"ldr q30, [x26, x5]\n"
"ldr q29, [x26, x19]\n"
"ldr q28, [x26, x7]\n"
"lsl w15, w10, #3\n"
"cmp w23, #1\n"
"add x28, x14, x15\n"
"mov v12.16b, v22.16b\n"
"mov w14, wzr\n"
"b.lt " DC_KERNEL_NO_MULT_STRIDE_9 "f\n"
"mov x17, xzr\n"
"add x22, x26, #32\n"
"mov x21, x23\n"
"mov v19.16b, v30.16b\n"
DC_KERNEL_NO_MULT_STRIDE_7 ":\n"
"mov v20.16b, v27.16b\n"
"mov v21.16b, v27.16b\n"
".word 0x4e9f9714 // sdot v20.4s, v24.16b, v31.16b\n"
".word 0x4e939715 // sdot v21.4s, v24.16b, v19.16b\n"
".word 0x4e889734 // sdot v20.4s, v25.16b, v8.16b\n"
".word 0x4e9d9735 // sdot v21.4s, v25.16b, v29.16b\n"
".word 0x4e939754 // sdot v20.4s, v26.16b, v19.16b\n"
".word 0x4e9c9755 // sdot v21.4s, v26.16b, v28.16b\n"
"sqrdmulh v20.4s, v20.4s, v3.4s\n"
"and %[output_block_data], x17, #0xffffffe0\n"
"sqrdmulh v21.4s, v21.4s, v3.4s\n"
"sqrshl v20.4s, v20.4s, v4.4s\n"
"add %[output_block_data], x22, x3\n"
"sqrshl v21.4s, v21.4s, v4.4s\n"
"sqxtn v20.4h, v20.4s\n"
"rev32 v22.8h, v31.8h\n"
"rev32 v23.8h, v8.8h\n"
"rev32 v9.8h, v30.8h\n"
"rev32 v10.8h, v29.8h\n"
"ldr q31, [%[output_block_data]]\n"
"ldr q8, [%[output_block_data], x12]\n"
"ldr q30, [%[output_block_data], x5]\n"
"ldr q29, [%[output_block_data], x19]\n"
"rev32 v19.8h, v28.8h\n"
"ldr q28, [%[output_block_data], x7]\n"
"sqxtn2 v20.8h, v21.4s\n"
"sqadd v20.8h, v20.8h, v0.8h\n"
"sqxtun v20.8b, v20.8h\n"
"add x15, x28, w14, sxtw\n"
"umax v20.8b, v20.8b, v1.8b\n"
"add %[output_block_data], x15, x11\n"
"umin v20.8b, v20.8b, v2.8b\n"
"mov v11.16b, v27.16b\n"
"str s20, [x15]\n"
"st1 { v20.s }[1], [%[output_block_data]]\n"
"trn1 v20.8h, v22.8h, v31.8h\n"
"mov v21.16b, v27.16b\n"
"trn1 v22.8h, v23.8h, v8.8h\n"
"trn1 v23.8h, v9.8h, v30.8h\n"
".word 0x4e94970b // sdot v11.4s, v24.16b, v20.16b\n"
"trn1 v9.8h, v10.8h, v29.8h\n"
".word 0x4e979715 // sdot v21.4s, v24.16b, v23.16b\n"
".word 0x4e96972b // sdot v11.4s, v25.16b, v22.16b\n"
"trn1 v19.8h, v19.8h, v28.8h\n"
".word 0x4e899735 // sdot v21.4s, v25.16b, v9.16b\n"
".word 0x4e97974b // sdot v11.4s, v26.16b, v23.16b\n"
".word 0x4e939755 // sdot v21.4s, v26.16b, v19.16b\n"
"sqrdmulh v19.4s, v11.4s, v3.4s\n"
"sqrdmulh v20.4s, v21.4s, v3.4s\n"
"sqrshl v19.4s, v19.4s, v4.4s\n"
"sqrshl v20.4s, v20.4s, v4.4s\n"
"sqxtn v19.4h, v19.4s\n"
"sqxtn2 v19.8h, v20.4s\n"
"sqadd v19.8h, v19.8h, v0.8h\n"
"sqxtun v19.8b, v19.8h\n"
"add x15, x15, x6\n"
"umax v19.8b, v19.8b, v1.8b\n"
"add %[output_block_data], x15, x11\n"
"umin v19.8b, v19.8b, v2.8b\n"
"add x17, x17, #32\n"
"subs x21, x21, #1\n"
"str s19, [x15]\n"
"st1 { v19.s }[1], [%[output_block_data]]\n"
"add w14, w14, w20\n"
"mov v19.16b, v30.16b\n"
"b.ne " DC_KERNEL_NO_MULT_STRIDE_7 "b\n"
"mov v20.16b, v31.16b\n"
"mov v15.16b, v8.16b\n"
"mov v14.16b, v30.16b\n"
"mov v13.16b, v29.16b\n"
"mov v11.16b, v28.16b\n"
"mov w14, w23\n"
DC_KERNEL_NO_MULT_STRIDE_9 ":\n"
"cmp w14, w13\n"
"ldr x14, [sp, #136]\n"
"b.ge " DC_KERNEL_NO_MULT_STRIDE_11 "f\n"
DC_KERNEL_NO_MULT_STRIDE_10 ":\n"
"mov v9.16b, v27.16b\n"
"mov v10.16b, v27.16b\n"
".word 0x4e9f9709 // sdot v9.4s, v24.16b, v31.16b\n"
".word 0x4e889729 // sdot v9.4s, v25.16b, v8.16b\n"
".word 0x4e9e970a // sdot v10.4s, v24.16b, v30.16b\n"
".word 0x4e9e9749 // sdot v9.4s, v26.16b, v30.16b\n"
".word 0x4e9d972a // sdot v10.4s, v25.16b, v29.16b\n"
".word 0x4e9c974a // sdot v10.4s, v26.16b, v28.16b\n"
"sqrdmulh v9.4s, v9.4s, v3.4s\n"
"sqrdmulh v10.4s, v10.4s, v3.4s\n"
"sqrshl v9.4s, v9.4s, v4.4s\n"
"sqrshl v10.4s, v10.4s, v4.4s\n"
"sqxtn v9.4h, v9.4s\n"
"sqxtn2 v9.8h, v10.4s\n"
"sqadd v9.8h, v9.8h, v0.8h\n"
"sqxtun v9.8b, v9.8h\n"
"umax v9.8b, v9.8b, v1.8b\n"
"rev32 v31.8h, v31.8h\n"
"rev32 v8.8h, v8.8h\n"
"rev32 v30.8h, v30.8h\n"
"rev32 v29.8h, v29.8h\n"
"rev32 v28.8h, v28.8h\n"
"umin v9.8b, v9.8b, v2.8b\n"
"add x15, x24, x11\n"
"subs x14, x14, #1\n"
"trn1 v31.8h, v31.8h, v20.8h\n"
"trn1 v8.8h, v8.8h, v15.8h\n"
"trn1 v29.8h, v29.8h, v13.8h\n"
"trn1 v30.8h, v30.8h, v14.8h\n"
"trn1 v28.8h, v28.8h, v11.8h\n"
"str s9, [x24]\n"
"add x24, x24, x20\n"
"st1 { v9.s }[1], [x15]\n"
"b.ne " DC_KERNEL_NO_MULT_STRIDE_10 "b\n"
DC_KERNEL_NO_MULT_STRIDE_11 ":\n"
"ldr q24, [x27, #16]\n"
"ldr q25, [x27, #48]\n"
"ldr q26, [x27, #80]\n"
"ldr q30, [x26, #16]!\n"
"ldr q27, [%[bias_data], #16]\n"
"cmp w23, #0\n"
"ldr q8, [x26, x12]\n"
"ldr q31, [x26, x5]\n"
"ldr q29, [x26, x19]\n"
"ldr q28, [x26, x7]\n"
"b.le " DC_KERNEL_NO_MULT_STRIDE_24 "f\n"
"mov w14, wzr\n"
"mov x17, xzr\n"
"add x22, x26, #32\n"
"add x24, x28, #4\n"
"mov x21, x23\n"
"mov v19.16b, v31.16b\n"
DC_KERNEL_NO_MULT_STRIDE_13 ":\n"
"mov v5.16b, v27.16b\n"
"mov v20.16b, v27.16b\n"
".word 0x4e9e9705 // sdot v5.4s, v24.16b, v30.16b\n"
".word 0x4e939714 // sdot v20.4s, v24.16b, v19.16b\n"
".word 0x4e889725 // sdot v5.4s, v25.16b, v8.16b\n"
".word 0x4e9d9734 // sdot v20.4s, v25.16b, v29.16b\n"
".word 0x4e939745 // sdot v5.4s, v26.16b, v19.16b\n"
".word 0x4e9c9754 // sdot v20.4s, v26.16b, v28.16b\n"
"sqrdmulh v5.4s, v5.4s, v3.4s\n"
"and %[output_block_data], x17, #0xffffffe0\n"
"sqrdmulh v20.4s, v20.4s, v3.4s\n"
"sqrshl v5.4s, v5.4s, v4.4s\n"
"add %[output_block_data], x22, x3\n"
"sqrshl v20.4s, v20.4s, v4.4s\n"
"sqxtn v5.4h, v5.4s\n"
"rev32 v21.8h, v30.8h\n"
"rev32 v22.8h, v8.8h\n"
"rev32 v23.8h, v31.8h\n"
"rev32 v9.8h, v29.8h\n"
"ldr q30, [%[output_block_data]]\n"
"ldr q8, [%[output_block_data], x12]\n"
"ldr q31, [%[output_block_data], x5]\n"
"ldr q29, [%[output_block_data], x19]\n"
"rev32 v19.8h, v28.8h\n"
"ldr q28, [%[output_block_data], x7]\n"
"sqxtn2 v5.8h, v20.4s\n"
"sqadd v5.8h, v5.8h, v0.8h\n"
"sqxtun v5.8b, v5.8h\n"
"add x15, x24, w14, sxtw\n"
"umax v5.8b, v5.8b, v1.8b\n"
"add %[output_block_data], x15, x11\n"
"umin v5.8b, v5.8b, v2.8b\n"
"mov v10.16b, v27.16b\n"
"str s5, [x15]\n"
"st1 { v5.s }[1], [%[output_block_data]]\n"
"trn1 v5.8h, v21.8h, v30.8h\n"
"mov v20.16b, v27.16b\n"
"trn1 v21.8h, v22.8h, v8.8h\n"
"trn1 v22.8h, v23.8h, v31.8h\n"
".word 0x4e85970a // sdot v10.4s, v24.16b, v5.16b\n"
"trn1 v23.8h, v9.8h, v29.8h\n"
".word 0x4e969714 // sdot v20.4s, v24.16b, v22.16b\n"
".word 0x4e95972a // sdot v10.4s, v25.16b, v21.16b\n"
"trn1 v19.8h, v19.8h, v28.8h\n"
".word 0x4e979734 // sdot v20.4s, v25.16b, v23.16b\n"
".word 0x4e96974a // sdot v10.4s, v26.16b, v22.16b\n"
".word 0x4e939754 // sdot v20.4s, v26.16b, v19.16b\n"
"sqrdmulh v5.4s, v10.4s, v3.4s\n"
"sqrdmulh v19.4s, v20.4s, v3.4s\n"
"sqrshl v5.4s, v5.4s, v4.4s\n"
"sqrshl v19.4s, v19.4s, v4.4s\n"
"sqxtn v5.4h, v5.4s\n"
"sqxtn2 v5.8h, v19.4s\n"
"sqadd v5.8h, v5.8h, v0.8h\n"
"sqxtun v5.8b, v5.8h\n"
"add x15, x15, x6\n"
"umax v5.8b, v5.8b, v1.8b\n"
"add x17, x17, #32\n"
"subs x21, x21, #1\n"
"add %[output_block_data], x15, x11\n"
"umin v5.8b, v5.8b, v2.8b\n"
"add w14, w14, w20\n"
"mov v19.16b, v31.16b\n"
"str s5, [x15]\n"
"st1 { v5.s }[1], [%[output_block_data]]\n"
"b.ne " DC_KERNEL_NO_MULT_STRIDE_13 "b\n"
"mov v20.16b, v30.16b\n"
"mov v15.16b, v8.16b\n"
"mov v14.16b, v31.16b\n"
"mov v13.16b, v29.16b\n"
"mov v11.16b, v28.16b\n"
"mov w14, w23\n"
"cmp w14, w13\n"
"b.ge " DC_KERNEL_NO_MULT_STRIDE_2 "b\n"
"b " DC_KERNEL_NO_MULT_STRIDE_25 "f\n"
DC_KERNEL_NO_MULT_STRIDE_15 ":\n"
"cmp w13, #1\n"
"add x25, %[bias_data], #32\n"
"b.lt " DC_KERNEL_NO_MULT_STRIDE_3 "b\n"
"stp q13, q11, [sp, #96]\n"
"add x15, x26, x12\n"
"ldp q9, q10, [x15]\n"
"ldr x15, [sp, #144]\n"
"lsl w14, w10, #3\n"
"ldp q30, q31, [%[bias_data]]\n"
"add x17, x26, x5\n"
"add %[bias_data], x15, x14\n"
"ldr w14, [sp, #84]\n"
"ldp q24, q25, [x27]\n"
"ldp q26, q27, [x27, #32]\n"
"ldp q28, q29, [x27, #64]\n"
"ldp q12, q11, [x26], #32\n"
"ldp q8, q13, [x17]\n"
"cmp w13, w14\n"
"b.ne " DC_KERNEL_NO_MULT_STRIDE_27 "f\n"
"ldr x14, [sp, #72]\n"
"mov x24, xzr\n"
"mov w27, wzr\n"
"mov x28, x13\n"
"mov v19.16b, v15.16b\n"
"mov v5.16b, v14.16b\n"
"cbnz x14, " DC_KERNEL_NO_MULT_STRIDE_21 "f\n"
"b " DC_KERNEL_NO_MULT_STRIDE_22 "f\n"
DC_KERNEL_NO_MULT_STRIDE_18 ":\n"
"mov v14.16b, v30.16b\n"
".word 0x4e8c970e // sdot v14.4s, v24.16b, v12.16b\n"
"mov v12.16b, v31.16b\n"
".word 0x4e8b972c // sdot v12.4s, v25.16b, v11.16b\n"
".word 0x4e89974e // sdot v14.4s, v26.16b, v9.16b\n"
".word 0x4e8a976c // sdot v12.4s, v27.16b, v10.16b\n"
".word 0x4e88978e // sdot v14.4s, v28.16b, v8.16b\n"
".word 0x4e8d97ac // sdot v12.4s, v29.16b, v13.16b\n"
"sqrdmulh v8.4s, v14.4s, v3.4s\n"
"sqrdmulh v9.4s, v12.4s, v3.4s\n"
"sqrshl v8.4s, v8.4s, v4.4s\n"
"sqrshl v9.4s, v9.4s, v4.4s\n"
"sqxtn v8.4h, v8.4s\n"
"sqxtn2 v8.8h, v9.4s\n"
"sqadd v8.8h, v8.8h, v0.8h\n"
"sqxtun v8.8b, v8.8h\n"
"umax v8.8b, v8.8b, v1.8b\n"
"umin v8.8b, v8.8b, v2.8b\n"
"str d8, [x15, x6]\n"
"mov v12.16b, v6.16b\n"
"mov v9.16b, v17.16b\n"
"mov v8.16b, v18.16b\n"
"mov v11.16b, v22.16b\n"
"mov v10.16b, v7.16b\n"
"mov v13.16b, v16.16b\n"
DC_KERNEL_NO_MULT_STRIDE_19 ":\n"
"mov v14.16b, v5.16b\n"
"mov v15.16b, v19.16b\n"
"add w27, w27, w20\n"
"add x24, x24, #32\n"
"subs x28, x28, #1\n"
"sub x14, x14, #1\n"
"b.eq " DC_KERNEL_NO_MULT_STRIDE_33 "f\n"
"mov v19.16b, v15.16b\n"
"mov v5.16b, v14.16b\n"
"cbz x14, " DC_KERNEL_NO_MULT_STRIDE_22 "f\n"
DC_KERNEL_NO_MULT_STRIDE_21 ":\n"
"and x15, x24, #0xffffffe0\n"
"add x15, x26, x15\n"
"add x17, x15, x12\n"
"add %[output_block_data], x15, x5\n"
"ldp q6, q22, [x15]\n"
"ldp q17, q7, [x17]\n"
"ldp q18, q16, [%[output_block_data]]\n"
DC_KERNEL_NO_MULT_STRIDE_22 ":\n"
"mov v14.16b, v30.16b\n"
"mov v15.16b, v31.16b\n"
".word 0x4e8c970e // sdot v14.4s, v24.16b, v12.16b\n"
".word 0x4e89974e // sdot v14.4s, v26.16b, v9.16b\n"
".word 0x4e8b972f // sdot v15.4s, v25.16b, v11.16b\n"
".word 0x4e88978e // sdot v14.4s, v28.16b, v8.16b\n"
".word 0x4e8a976f // sdot v15.4s, v27.16b, v10.16b\n"
".word 0x4e8d97af // sdot v15.4s, v29.16b, v13.16b\n"
"sqrdmulh v14.4s, v14.4s, v3.4s\n"
"sqrdmulh v15.4s, v15.4s, v3.4s\n"
"sqrshl v14.4s, v14.4s, v4.4s\n"
"sqrshl v15.4s, v15.4s, v4.4s\n"
"sqxtn v14.4h, v14.4s\n"
"sqxtn2 v14.8h, v15.4s\n"
"sqadd v14.8h, v14.8h, v0.8h\n"
"sqxtun v14.8b, v14.8h\n"
"rev32 v12.8h, v12.8h\n"
"rev32 v9.8h, v9.8h\n"
"rev32 v8.8h, v8.8h\n"
"rev32 v11.8h, v11.8h\n"
"rev32 v10.8h, v10.8h\n"
"rev32 v13.8h, v13.8h\n"
"umax v14.8b, v14.8b, v1.8b\n"
"add x15, %[bias_data], w27, sxtw\n"
"cmp w16, #1\n"
"trn1 v12.8h, v12.8h, v6.8h\n"
"trn1 v11.8h, v11.8h, v22.8h\n"
"trn1 v9.8h, v9.8h, v17.8h\n"
"trn1 v10.8h, v10.8h, v7.8h\n"
"trn1 v8.8h, v8.8h, v18.8h\n"
"umin v14.8b, v14.8b, v2.8b\n"
"trn1 v13.8h, v13.8h, v16.8h\n"
"str d14, [x15]\n"
"b.gt " DC_KERNEL_NO_MULT_STRIDE_18 "b\n"
"cbz x14, " DC_KERNEL_NO_MULT_STRIDE_19 "b\n"
"b " DC_KERNEL_NO_MULT_STRIDE_18 "b\n"
DC_KERNEL_NO_MULT_STRIDE_24 ":\n"
"mov w14, wzr\n"
"cmp w14, w13\n"
"b.ge " DC_KERNEL_NO_MULT_STRIDE_2 "b\n"
DC_KERNEL_NO_MULT_STRIDE_25 ":\n"
"ldr x14, [sp, #8]\n"
"ldr x15, [sp, #136]\n"
"add x14, x14, x25\n"
DC_KERNEL_NO_MULT_STRIDE_26 ":\n"
"mov v5.16b, v27.16b\n"
"mov v19.16b, v27.16b\n"
".word 0x4e9e9705 // sdot v5.4s, v24.16b, v30.16b\n"
".word 0x4e889725 // sdot v5.4s, v25.16b, v8.16b\n"
".word 0x4e9f9713 // sdot v19.4s, v24.16b, v31.16b\n"
".word 0x4e9f9745 // sdot v5.4s, v26.16b, v31.16b\n"
".word 0x4e9d9733 // sdot v19.4s, v25.16b, v29.16b\n"
".word 0x4e9c9753 // sdot v19.4s, v26.16b, v28.16b\n"
"sqrdmulh v5.4s, v5.4s, v3.4s\n"
"sqrdmulh v19.4s, v19.4s, v3.4s\n"
"sqrshl v5.4s, v5.4s, v4.4s\n"
"sqrshl v19.4s, v19.4s, v4.4s\n"
"sqxtn v5.4h, v5.4s\n"
"sqxtn2 v5.8h, v19.4s\n"
"sqadd v5.8h, v5.8h, v0.8h\n"
"sqxtun v5.8b, v5.8h\n"
"umax v5.8b, v5.8b, v1.8b\n"
"mov v9.16b, v20.16b\n"
"rev32 v20.8h, v30.8h\n"
"rev32 v21.8h, v8.8h\n"
"rev32 v22.8h, v31.8h\n"
"rev32 v23.8h, v29.8h\n"
"rev32 v28.8h, v28.8h\n"
"umin v5.8b, v5.8b, v2.8b\n"
"add x17, x14, x11\n"
"subs x15, x15, #1\n"
"trn1 v30.8h, v20.8h, v9.8h\n"
"mov v20.16b, v9.16b\n"
"trn1 v8.8h, v21.8h, v15.8h\n"
"trn1 v29.8h, v23.8h, v13.8h\n"
"trn1 v31.8h, v22.8h, v14.8h\n"
"trn1 v28.8h, v28.8h, v11.8h\n"
"str s5, [x14]\n"
"add x14, x14, x20\n"
"st1 { v5.s }[1], [x17]\n"
"b.ne " DC_KERNEL_NO_MULT_STRIDE_26 "b\n"
"b " DC_KERNEL_NO_MULT_STRIDE_2 "b\n"
DC_KERNEL_NO_MULT_STRIDE_27 ":\n"
"ldr x28, [sp, #72]\n"
"mov w14, wzr\n"
"mov x24, xzr\n"
"mov x27, x13\n"
"stp q20, q15, [sp, #16]\n"
"str q14, [sp, #48]\n"
"b " DC_KERNEL_NO_MULT_STRIDE_30 "f\n"
DC_KERNEL_NO_MULT_STRIDE_28 ":\n"
"mov v5.16b, v30.16b\n"
".word 0x4e8c9705 // sdot v5.4s, v24.16b, v12.16b\n"
"mov v19.16b, v31.16b\n"
".word 0x4e8b9733 // sdot v19.4s, v25.16b, v11.16b\n"
".word 0x4e899745 // sdot v5.4s, v26.16b, v9.16b\n"
".word 0x4e8a9773 // sdot v19.4s, v27.16b, v10.16b\n"
".word 0x4e889785 // sdot v5.4s, v28.16b, v8.16b\n"
".word 0x4e8d97b3 // sdot v19.4s, v29.16b, v13.16b\n"
"sqrdmulh v5.4s, v5.4s, v3.4s\n"
"sqrdmulh v19.4s, v19.4s, v3.4s\n"
"sqrshl v5.4s, v5.4s, v4.4s\n"
"sqrshl v19.4s, v19.4s, v4.4s\n"
"sqxtn v5.4h, v5.4s\n"
"sqxtn2 v5.8h, v19.4s\n"
"sqadd v5.8h, v5.8h, v0.8h\n"
"sqxtun v5.8b, v5.8h\n"
"umax v5.8b, v5.8b, v1.8b\n"
"umin v5.8b, v5.8b, v2.8b\n"
"mov v6.16b, v14.16b\n"
"mov v12.16b, v14.16b\n"
"mov v9.16b, v17.16b\n"
"mov v8.16b, v18.16b\n"
"mov v11.16b, v22.16b\n"
"mov v10.16b, v7.16b\n"
"mov v13.16b, v16.16b\n"
"str d5, [x15, x6]\n"
DC_KERNEL_NO_MULT_STRIDE_29 ":\n"
"add x24, x24, #32\n"
"sub x28, x28, #1\n"
"subs x27, x27, #1\n"
"add w14, w14, w20\n"
"b.eq " DC_KERNEL_NO_MULT_STRIDE_34 "f\n"
DC_KERNEL_NO_MULT_STRIDE_30 ":\n"
"mov v14.16b, v30.16b\n"
"mov v15.16b, v31.16b\n"
".word 0x4e8c970e // sdot v14.4s, v24.16b, v12.16b\n"
"and x17, x24, #0xffffffe0\n"
".word 0x4e8b972f // sdot v15.4s, v25.16b, v11.16b\n"
".word 0x4e89974e // sdot v14.4s, v26.16b, v9.16b\n"
"add x17, x26, x17\n"
".word 0x4e8a976f // sdot v15.4s, v27.16b, v10.16b\n"
".word 0x4e88978e // sdot v14.4s, v28.16b, v8.16b\n"
"rev32 v21.8h, v8.8h\n"
"rev32 v6.8h, v11.8h\n"
"ldp q11, q22, [x17]\n"
".word 0x4e8d97af // sdot v15.4s, v29.16b, v13.16b\n"
"sqrdmulh v8.4s, v14.4s, v3.4s\n"
"rev32 v20.8h, v9.8h\n"
"sqrdmulh v9.4s, v15.4s, v3.4s\n"
"sqrshl v8.4s, v8.4s, v4.4s\n"
"rev32 v5.8h, v13.8h\n"
"add %[output_block_data], x17, x12\n"
"add x17, x17, x5\n"
"sqrshl v9.4s, v9.4s, v4.4s\n"
"sqxtn v13.4h, v8.4s\n"
"rev32 v19.8h, v12.8h\n"
"ldp q17, q7, [%[output_block_data]]\n"
"ldp q18, q16, [x17]\n"
"sqxtn2 v13.8h, v9.4s\n"
"trn1 v12.8h, v19.8h, v11.8h\n"
"sqadd v19.8h, v13.8h, v0.8h\n"
"sqxtun v19.8b, v19.8h\n"
"rev32 v23.8h, v10.8h\n"
"umax v19.8b, v19.8b, v1.8b\n"
"add x15, %[bias_data], w14, sxtw\n"
"cmp w16, #1\n"
"mov v14.16b, v11.16b\n"
"trn1 v11.8h, v6.8h, v22.8h\n"
"trn1 v9.8h, v20.8h, v17.8h\n"
"trn1 v8.8h, v21.8h, v18.8h\n"
"trn1 v10.8h, v23.8h, v7.8h\n"
"umin v19.8b, v19.8b, v2.8b\n"
"trn1 v13.8h, v5.8h, v16.8h\n"
"str d19, [x15]\n"
"b.gt " DC_KERNEL_NO_MULT_STRIDE_28 "b\n"
"cbnz x28, " DC_KERNEL_NO_MULT_STRIDE_28 "b\n"
"mov v6.16b, v14.16b\n"
"b " DC_KERNEL_NO_MULT_STRIDE_29 "b\n"
DC_KERNEL_NO_MULT_STRIDE_33 ":\n"
"ldp q13, q11, [sp, #96]\n"
"b " DC_KERNEL_NO_MULT_STRIDE_3 "b\n"
DC_KERNEL_NO_MULT_STRIDE_34 ":\n"
"ldp q13, q11, [sp, #96]\n"
"ldp q15, q14, [sp, #32]\n"
"ldr q20, [sp, #16]\n"
"b " DC_KERNEL_NO_MULT_STRIDE_3 "b\n"
DC_KERNEL_NO_MULT_STRIDE_35 ":\n"
"add sp, sp, #160\n"
:
[ scratch_block_data ] "+r"(scratch_block_data),
[ filter_workspace ] "+r"(filter_workspace),
[ bias_data ] "+r"(bias_data),
[ output_block_data ] "+r"(output_block_data)
:
[ function_params ] "r"(function_params)
:
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31",
"x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
"x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26",
"x27", "x28");
#undef DC_KERNEL_NO_MULT_STRIDE_1
#undef DC_KERNEL_NO_MULT_STRIDE_2
#undef DC_KERNEL_NO_MULT_STRIDE_3
#undef DC_KERNEL_NO_MULT_STRIDE_4
#undef DC_KERNEL_NO_MULT_STRIDE_5
#undef DC_KERNEL_NO_MULT_STRIDE_6
#undef DC_KERNEL_NO_MULT_STRIDE_7
#undef DC_KERNEL_NO_MULT_STRIDE_8
#undef DC_KERNEL_NO_MULT_STRIDE_9
#undef DC_KERNEL_NO_MULT_STRIDE_10
#undef DC_KERNEL_NO_MULT_STRIDE_11
#undef DC_KERNEL_NO_MULT_STRIDE_12
#undef DC_KERNEL_NO_MULT_STRIDE_13
#undef DC_KERNEL_NO_MULT_STRIDE_14
#undef DC_KERNEL_NO_MULT_STRIDE_15
#undef DC_KERNEL_NO_MULT_STRIDE_16
#undef DC_KERNEL_NO_MULT_STRIDE_17
#undef DC_KERNEL_NO_MULT_STRIDE_18
#undef DC_KERNEL_NO_MULT_STRIDE_19
#undef DC_KERNEL_NO_MULT_STRIDE_20
#undef DC_KERNEL_NO_MULT_STRIDE_21
#undef DC_KERNEL_NO_MULT_STRIDE_22
#undef DC_KERNEL_NO_MULT_STRIDE_23
#undef DC_KERNEL_NO_MULT_STRIDE_24
#undef DC_KERNEL_NO_MULT_STRIDE_25
#undef DC_KERNEL_NO_MULT_STRIDE_26
#undef DC_KERNEL_NO_MULT_STRIDE_27
#undef DC_KERNEL_NO_MULT_STRIDE_28
#undef DC_KERNEL_NO_MULT_STRIDE_29
#undef DC_KERNEL_NO_MULT_STRIDE_30
#undef DC_KERNEL_NO_MULT_STRIDE_31
#undef DC_KERNEL_NO_MULT_STRIDE_32
#undef DC_KERNEL_NO_MULT_STRIDE_33
#undef DC_KERNEL_NO_MULT_STRIDE_34
#undef DC_KERNEL_NO_MULT_STRIDE_35
}
// Out-of-line entry point for this kernel specialization: forwards all five
// arguments, unchanged and in the same order, to KernelMacroBlockNeon().
//
// NOTE(review): the noinline attribute looks load-bearing. The assembly in
// KernelMacroBlockNeon() addresses at least one general-purpose register
// (x3) directly that is neither a named operand nor in its clobber list, so
// it presumably depends on the arguments arriving in the AAPCS64 argument
// registers. Confirm before inlining or reshaping this wrapper.
static void __attribute__((noinline))
Run(const int8* scratch_block_data, const int8* filter_workspace,
const int32* bias_data, uint8* output_block_data,
const DepthwiseConvDotProdParams* function_params) {
KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
output_block_data, function_params);
}
};
template <>
struct KernelMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
QuantizationType::kNonPerChannelUint8,
DepthwiseConvDepthMultiplication::kUnitInputDepth,
1> {
static inline void KernelMacroBlockNeon(
const int8* scratch_block_data, const int8* filter_workspace,
const int32* bias_data, uint8* output_block_data,
const DepthwiseConvDotProdParams* function_params) {
#define DC_KERNEL_MULT_1 …
#define DC_KERNEL_MULT_2 …
#define DC_KERNEL_MULT_3 …
#define DC_KERNEL_MULT_4 …
#define DC_KERNEL_MULT_5 …
#define DC_KERNEL_MULT_6 …
#define DC_KERNEL_MULT_7 …
#define DC_KERNEL_MULT_8 …
#define DC_KERNEL_MULT_9 …
#define DC_KERNEL_MULT_10 …
#define DC_KERNEL_MULT_11 …
#define DC_KERNEL_MULT_12 …
#define DC_KERNEL_MULT_13 …
#define DC_KERNEL_MULT_14 …
#define DC_KERNEL_MULT_15 …
#define DC_KERNEL_MULT_16 …
#define DC_KERNEL_MULT_17 …
#define DC_KERNEL_MULT_18 …
#define DC_KERNEL_MULT_19 …
#define DC_KERNEL_MULT_20 …
#define DC_KERNEL_MULT_21 …
#define DC_KERNEL_MULT_22 …
asm volatile(
"sub sp, sp, #304\n"
"ldr w8, [%[function_params], #" STR(DP_OFFSET_DEPTH_MICRO_REPEATS) "]\n"
"str %[filter_workspace], [sp, #32]\n"
"cmp w8, #1\n"
"str w8, [sp, #12]\n"
"b.lt " DC_KERNEL_MULT_22 "f\n"
"str wzr, [sp, #28]\n"
"ldpsw x21, x5, [%[function_params], #" STR(DP_OFFSET_OUTPUT_HEIGHT_STRIDE) "]\n"
"ldrb w8, [%[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MIN) "]\n"
"ldrsw x17, [%[function_params], #" STR(DP_OFFSET_OUTPUT_DEPTH) "]\n"
"ldr w13, [%[function_params], #" STR(DP_OFFSET_OUTPUT_RESIDUAL_WIDTH) "]\n"
"add x11, %[function_params], #" STR(DP_OFFSET_OUTPUT_SHIFT) "\n"
"ldp w1, w15, [%[function_params], #" STR(DP_OFFSET_OUTPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n"
"add x10, %[function_params], #" STR(DP_OFFSET_OUTPUT_OFFSET) "\n"
"add x12, %[function_params], #" STR(DP_OFFSET_OUTPUT_MULTIPLIER) "\n"
"ld1r { v2.4s }, [x11]\n"
"dup v3.16b, w8\n"
"fmov s5, w8\n"
"lsl x11, x21, #1\n"
"add x7, x21, x21, lsl #1\n"
"lsl x8, x17, #1\n"
"ldr w16, [%[function_params], #" STR(DP_OFFSET_OUTBOUND_BLOCK_HEIGHT) "]\n"
"ld1r { v0.8h }, [x10]\n"
"ld1r { v1.4s }, [x12]\n"
"str w13, [sp, #272]\n"
"cmp w13, #4\n"
"add x10, x8, x17\n"
"add x6, x8, x7\n"
"add x12, x8, x11\n"
"add x13, x8, x21\n"
"add x8, %[output_block_data], x8\n"
"str x8, [sp, #176]\n"
"add x8, x7, x17\n"
"add x14, x11, x17\n"
"add x24, %[output_block_data], x8\n"
"add x8, %[output_block_data], x14\n"
"add x14, x5, #4\n"
"ccmp w15, w1, #0, lt\n"
"str x14, [sp, #136]\n"
"lsl x14, x17, #2\n"
"ldrb w9, [%[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MAX) "]\n"
"csel w25, w15, w1, lt\n"
"cmp w16, #1\n"
"str x14, [sp, #128]\n"
"add x14, %[output_block_data], x21\n"
"add x22, x5, x5, lsl #2\n"
"str x16, [sp, #56]\n"
"cset w16, lt\n"
"cmp w1, #1\n"
"str x14, [sp, #120]\n"
"add x14, %[output_block_data], x17\n"
"lsl x20, x5, #2\n"
"str w1, [sp, #276]\n"
"cset w1, lt\n"
"str x14, [sp, #112]\n"
"add x14, x22, #4\n"
"add x19, x5, x5, lsl #1\n"
"orr w16, w16, w1\n"
"str x14, [sp, #104]\n"
"add x14, x20, #4\n"
"dup v4.16b, w9\n"
"fmov s6, w9\n"
"lsl %[function_params], x5, #1\n"
"add x9, x21, x17\n"
"str w16, [sp, #8]\n"
"add x16, x10, x21\n"
"str x14, [sp, #96]\n"
"add x14, x19, #4\n"
"mov x23, xzr\n"
"add x9, %[output_block_data], x9\n"
"str w15, [sp, #268]\n"
"add x15, x10, x11\n"
"add x27, %[output_block_data], x12\n"
"add x12, %[output_block_data], x16\n"
"str x14, [sp, #88]\n"
"add x14, %[function_params], #4\n"
"stp x11, x21, [sp, #184]\n"
"add x11, %[output_block_data], x11\n"
"str x9, [sp, #168]\n"
"add x9, x10, x7\n"
"add x26, %[output_block_data], x6\n"
"add x28, %[output_block_data], x13\n"
"mov x13, x23\n"
"str x12, [sp, #144]\n"
"mov x12, x7\n"
"stp x7, %[output_block_data], [sp, #40]\n"
"stp x19, x5, [sp, #248]\n"
"stp x22, x20, [sp, #232]\n"
"stp x11, x14, [sp, #72]\n"
"add x11, %[output_block_data], x7\n"
"ldp x7, x6, [sp, #120]\n"
"ldr x23, [sp, #112]\n"
"ldp x22, x19, [sp, #88]\n"
"add x10, %[output_block_data], x10\n"
"dup v5.8b, v5.b[0]\n"
"dup v6.8b, v6.b[0]\n"
"str x10, [sp, #152]\n"
"add x9, %[output_block_data], x9\n"
"add x10, %[output_block_data], x15\n"
"mov w15, #4\n"
"mov x20, x14\n"
"str %[function_params], [sp, #280]\n"
"str x11, [sp, #64]\n"
"str %[scratch_block_data], [sp, #200]\n"
"str w25, [sp, #164]\n"
"str x9, [sp, #288]\n"
"b " DC_KERNEL_MULT_4 "f\n"
DC_KERNEL_MULT_2 ":\n"
"mov %[bias_data], x11\n"
DC_KERNEL_MULT_3 ":\n"
"ldr w13, [sp, #28]\n"
"ldr w12, [sp, #12]\n"
"ldr x11, [sp, #48]\n"
"add w13, w13, #1\n"
"str w13, [sp, #28]\n"
"cmp w13, w12\n"
"ldr x13, [sp, #16]\n"
"add x11, x11, #8\n"
"str x11, [sp, #48]\n"
"add x13, x13, #8\n"
"b.eq " DC_KERNEL_MULT_22 "f\n"
DC_KERNEL_MULT_4 ":\n"
"ldr x12, [sp, #32]\n"
"ldr x14, [sp, #56]\n"
"ldp q20, q7, [x12]\n"
"ldp q19, q16, [x12, #32]\n"
"ldp q18, q17, [x12, #64]\n"
"cmp w14, #4\n"
"add x12, x12, #96\n"
"str x12, [sp, #32]\n"
"str x13, [sp, #16]\n"
"b.ne " DC_KERNEL_MULT_15 "f\n"
"mov %[filter_workspace], xzr\n"
"mov x5, x13\n"
"b " DC_KERNEL_MULT_7 "f\n"
DC_KERNEL_MULT_6 ":\n"
"add %[filter_workspace], x1, #1\n"
"cmp %[filter_workspace], #2\n"
"add x5, x5, #4\n"
"mov v18.16b, v17.16b\n"
"mov v19.16b, v16.16b\n"
"mov v20.16b, v7.16b\n"
"b.eq " DC_KERNEL_MULT_3 "b\n"
DC_KERNEL_MULT_7 ":\n"
"ldr q21, [%[bias_data]], #16\n"
"ldr w12, [%[scratch_block_data]]\n"
"ldp %[function_params], x13, [sp, #248]\n"
"ldr x16, [sp, #240]\n"
"ldr x14, [sp, #280]\n"
"fmov s22, w12\n"
"add x13, %[scratch_block_data], x13\n"
"ldr w16, [%[scratch_block_data], x16]\n"
"mov v22.s[1], w12\n"
"ld1 { v22.s }[2], [x13]\n"
"ldr x13, [sp, #232]\n"
"ldr w14, [%[scratch_block_data], x14]\n"
"fmov s23, w16\n"
"ldr w4, [%[scratch_block_data], %[function_params]]\n"
"add x13, %[scratch_block_data], x13\n"
"mov v23.s[1], w16\n"
"ld1 { v23.s }[2], [x13]\n"
"fmov s24, w14\n"
"mov v24.s[1], w14\n"
"dup v25.4s, w14\n"
"mov v28.16b, v21.16b\n"
"mov v29.16b, v21.16b\n"
"mov v30.16b, v21.16b\n"
"dup v26.4s, w4\n"
"mov v31.16b, v21.16b\n"
"mov v24.s[2], w4\n"
"cmp w25, #1\n"
".word 0x4e99965c // sdot v28.4s, v18.16b, v25.16b\n"
".word 0x4e99967d // sdot v29.4s, v19.16b, v25.16b\n"
".word 0x4e99969e // sdot v30.4s, v20.16b, v25.16b\n"
"mov v24.s[3], w14\n"
"mov v22.s[3], w12\n"
"mov v23.s[3], w16\n"
".word 0x4e9a969f // sdot v31.4s, v20.16b, v26.16b\n"
"b.lt " DC_KERNEL_MULT_14 "f\n"
"stp %[filter_workspace], %[bias_data], [sp, #216]\n"
"mov w13, w25\n"
"str x5, [sp, #208]\n"
"mov x16, x5\n"
"mov x14, %[scratch_block_data]\n"
"ldp x25, %[scratch_block_data], [sp, #168]\n"
"mov x15, x10\n"
"mov x9, x8\n"
"mov x8, x24\n"
"mov x24, x28\n"
"mov x28, x27\n"
"ldp %[filter_workspace], x27, [sp, #144]\n"
"ldr x5, [sp, #136]\n"
"ldr %[bias_data], [sp, #104]\n"
"ldp x10, x11, [sp, #64]\n"
"shl v25.4s, v20.4s, #8\n"
"shl v26.4s, v19.4s, #8\n"
"shl v27.4s, v18.4s, #8\n"
DC_KERNEL_MULT_9 ":\n"
".word 0x4f96e29c // sdot v28.4s, v20.16b, v22.4b[0]\n"
".word 0x4f96ea9d // sdot v29.4s, v20.16b, v22.4b[2]\n"
".word 0x4f98ea7e // sdot v30.4s, v19.16b, v24.4b[2]\n"
".word 0x4f96ea7c // sdot v28.4s, v19.16b, v22.4b[2]\n"
".word 0x4f97e27f // sdot v31.4s, v19.16b, v23.4b[0]\n"
".word 0x4f98ea5d // sdot v29.4s, v18.16b, v24.4b[2]\n"
".word 0x4f97e25e // sdot v30.4s, v18.16b, v23.4b[0]\n"
"sqrdmulh v28.4s, v28.4s, v1.4s\n"
".word 0x4f97ea5f // sdot v31.4s, v18.16b, v23.4b[2]\n"
"sqrdmulh v29.4s, v29.4s, v1.4s\n"
"sqrdmulh v30.4s, v30.4s, v1.4s\n"
"sqrshl v28.4s, v28.4s, v2.4s\n"
"sqrdmulh v31.4s, v31.4s, v1.4s\n"
"sqrshl v29.4s, v29.4s, v2.4s\n"
"sqrshl v30.4s, v30.4s, v2.4s\n"
"sqxtn v28.4h, v28.4s\n"
"sqrshl v31.4s, v31.4s, v2.4s\n"
"sqxtn v30.4h, v30.4s\n"
"sqxtn2 v28.8h, v29.4s\n"
"sqxtn2 v30.8h, v31.4s\n"
"sqadd v28.8h, v28.8h, v0.8h\n"
"sqadd v29.8h, v30.8h, v0.8h\n"
"sqxtun v28.8b, v28.8h\n"
"sqxtun2 v28.16b, v29.8h\n"
"umax v28.16b, v28.16b, v3.16b\n"
"add %[function_params], x7, x16\n"
"umin v28.16b, v28.16b, v4.16b\n"
"add x21, x11, x16\n"
"str s28, [%[output_block_data], x16]\n"
"st1 { v28.s }[1], [%[function_params]]\n"
"add %[function_params], x10, x16\n"
"st1 { v28.s }[2], [x21]\n"
"st1 { v28.s }[3], [%[function_params]]\n"
"mov x12, x14\n"
"add x21, x14, x20\n"
"ldr w4, [x14, #4]!\n"
"ld1 { v24.s }[1], [x21]\n"
"add x21, x12, x19\n"
"ld1 { v23.s }[1], [x21]\n"
"mov v22.s[1], w4\n"
"add %[function_params], x12, x22\n"
"ld1 { v24.s }[3], [%[function_params]]\n"
"add %[function_params], x12, x5\n"
"ld1 { v22.s }[3], [%[function_params]]\n"
"add x12, x12, %[bias_data]\n"
"mov v28.16b, v21.16b\n"
"ld1 { v23.s }[3], [x12]\n"
"mov v29.16b, v21.16b\n"
"mov v30.16b, v21.16b\n"
".word 0x4f96e33c // sdot v28.4s, v25.16b, v22.4b[0]\n"
"mov v31.16b, v21.16b\n"
".word 0x4f98e33e // sdot v30.4s, v25.16b, v24.4b[0]\n"
".word 0x4f96eb3d // sdot v29.4s, v25.16b, v22.4b[2]\n"
".word 0x4f96eb5c // sdot v28.4s, v26.16b, v22.4b[2]\n"
".word 0x4f98eb3f // sdot v31.4s, v25.16b, v24.4b[2]\n"
".word 0x4f98eb5e // sdot v30.4s, v26.16b, v24.4b[2]\n"
".word 0x4f98e35d // sdot v29.4s, v26.16b, v24.4b[0]\n"
".word 0x4f98e37c // sdot v28.4s, v27.16b, v24.4b[0]\n"
".word 0x4f97e35f // sdot v31.4s, v26.16b, v23.4b[0]\n"
".word 0x4f97e37e // sdot v30.4s, v27.16b, v23.4b[0]\n"
".word 0x4f98eb7d // sdot v29.4s, v27.16b, v24.4b[2]\n"
"sqrdmulh v28.4s, v28.4s, v1.4s\n"
".word 0x4f97eb7f // sdot v31.4s, v27.16b, v23.4b[2]\n"
"sqrdmulh v30.4s, v30.4s, v1.4s\n"
"sqrdmulh v29.4s, v29.4s, v1.4s\n"
"sqrshl v28.4s, v28.4s, v2.4s\n"
"sqrdmulh v31.4s, v31.4s, v1.4s\n"
"sqrshl v30.4s, v30.4s, v2.4s\n"
"sqrshl v29.4s, v29.4s, v2.4s\n"
"sqxtn v28.4h, v28.4s\n"
"sqrshl v31.4s, v31.4s, v2.4s\n"
"sqxtn v30.4h, v30.4s\n"
"sqxtn2 v28.8h, v29.4s\n"
"sqxtn2 v30.8h, v31.4s\n"
"sqadd v28.8h, v28.8h, v0.8h\n"
"sqadd v29.8h, v30.8h, v0.8h\n"
"sqxtun v28.8b, v28.8h\n"
"sqxtun2 v28.16b, v29.8h\n"
"umax v28.16b, v28.16b, v3.16b\n"
"add x12, x25, x16\n"
"umin v28.16b, v28.16b, v4.16b\n"
"add %[function_params], x9, x16\n"
"str s28, [x23, x16]\n"
"st1 { v28.s }[1], [x12]\n"
"add x12, x8, x16\n"
"mov v29.16b, v21.16b\n"
"ushr v10.2d, v22.2d, #16\n"
"mov v30.16b, v21.16b\n"
"mov v31.16b, v21.16b\n"
"st1 { v28.s }[2], [%[function_params]]\n"
"st1 { v28.s }[3], [x12]\n"
"ushr v28.2d, v24.2d, #16\n"
".word 0x4f8ae29d // sdot v29.4s, v20.16b, v10.4b[0]\n"
"mov v8.16b, v21.16b\n"
".word 0x4f9ce29f // sdot v31.4s, v20.16b, v28.4b[0]\n"
".word 0x4f8aea9e // sdot v30.4s, v20.16b, v10.4b[2]\n"
".word 0x4f8aea7d // sdot v29.4s, v19.16b, v10.4b[2]\n"
"ushr v9.2d, v23.2d, #16\n"
".word 0x4f9cea88 // sdot v8.4s, v20.16b, v28.4b[2]\n"
".word 0x4f9cea7f // sdot v31.4s, v19.16b, v28.4b[2]\n"
".word 0x4f9ce27e // sdot v30.4s, v19.16b, v28.4b[0]\n"
".word 0x4f9ce25d // sdot v29.4s, v18.16b, v28.4b[0]\n"
".word 0x4f89e268 // sdot v8.4s, v19.16b, v9.4b[0]\n"
".word 0x4f89e25f // sdot v31.4s, v18.16b, v9.4b[0]\n"
".word 0x4f9cea5e // sdot v30.4s, v18.16b, v28.4b[2]\n"
"sqrdmulh v29.4s, v29.4s, v1.4s\n"
".word 0x4f89ea48 // sdot v8.4s, v18.16b, v9.4b[2]\n"
"sqrdmulh v31.4s, v31.4s, v1.4s\n"
"sqrdmulh v30.4s, v30.4s, v1.4s\n"
"sqrshl v29.4s, v29.4s, v2.4s\n"
"sqrdmulh v8.4s, v8.4s, v1.4s\n"
"sqrshl v31.4s, v31.4s, v2.4s\n"
"sqrshl v30.4s, v30.4s, v2.4s\n"
"sqxtn v29.4h, v29.4s\n"
"sqrshl v8.4s, v8.4s, v2.4s\n"
"sqxtn v31.4h, v31.4s\n"
"sqxtn2 v29.8h, v30.4s\n"
"sqxtn2 v31.8h, v8.4s\n"
"sqadd v29.8h, v29.8h, v0.8h\n"
"sqadd v30.8h, v31.8h, v0.8h\n"
"sqxtun v29.8b, v29.8h\n"
"sqxtun2 v29.16b, v30.8h\n"
"umax v29.16b, v29.16b, v3.16b\n"
"add %[function_params], x24, x16\n"
"umin v29.16b, v29.16b, v4.16b\n"
"mov v30.16b, v21.16b\n"
"add x12, x28, x16\n"
"str s29, [%[scratch_block_data], x16]\n"
"st1 { v29.s }[1], [%[function_params]]\n"
"add %[function_params], x26, x16\n"
"mov v31.16b, v21.16b\n"
"mov v8.16b, v21.16b\n"
".word 0x4f8ae33e // sdot v30.4s, v25.16b, v10.4b[0]\n"
"st1 { v29.s }[2], [x12]\n"
"st1 { v29.s }[3], [%[function_params]]\n"
"mov v29.16b, v21.16b\n"
".word 0x4f9ce328 // sdot v8.4s, v25.16b, v28.4b[0]\n"
".word 0x4f8aeb3f // sdot v31.4s, v25.16b, v10.4b[2]\n"
".word 0x4f8aeb5e // sdot v30.4s, v26.16b, v10.4b[2]\n"
".word 0x4f9ceb3d // sdot v29.4s, v25.16b, v28.4b[2]\n"
".word 0x4f9ceb48 // sdot v8.4s, v26.16b, v28.4b[2]\n"
".word 0x4f9ce35f // sdot v31.4s, v26.16b, v28.4b[0]\n"
".word 0x4f9ce37e // sdot v30.4s, v27.16b, v28.4b[0]\n"
".word 0x4f89e35d // sdot v29.4s, v26.16b, v9.4b[0]\n"
".word 0x4f89e368 // sdot v8.4s, v27.16b, v9.4b[0]\n"
".word 0x4f9ceb7f // sdot v31.4s, v27.16b, v28.4b[2]\n"
"sqrdmulh v30.4s, v30.4s, v1.4s\n"
".word 0x4f89eb7d // sdot v29.4s, v27.16b, v9.4b[2]\n"
"sqrdmulh v28.4s, v8.4s, v1.4s\n"
"sqrdmulh v31.4s, v31.4s, v1.4s\n"
"sqrshl v30.4s, v30.4s, v2.4s\n"
"sqrdmulh v29.4s, v29.4s, v1.4s\n"
"sqrshl v28.4s, v28.4s, v2.4s\n"
"sqrshl v31.4s, v31.4s, v2.4s\n"
"sqxtn v30.4h, v30.4s\n"
"sqrshl v29.4s, v29.4s, v2.4s\n"
"sqxtn v28.4h, v28.4s\n"
"sqxtn2 v30.8h, v31.4s\n"
"sqxtn2 v28.8h, v29.4s\n"
"sqadd v29.8h, v30.8h, v0.8h\n"
"sqadd v28.8h, v28.8h, v0.8h\n"
"sqxtun v29.8b, v29.8h\n"
"sqxtun2 v29.16b, v28.8h\n"
"umax v28.16b, v29.16b, v3.16b\n"
"add x12, %[filter_workspace], x16\n"
"umin v8.16b, v28.16b, v4.16b\n"
"str s8, [x27, x16]\n"
"st1 { v8.s }[1], [x12]\n"
"ldr x12, [sp, #288]\n"
"mov v28.16b, v21.16b\n"
"mov v29.16b, v21.16b\n"
"mov v30.16b, v21.16b\n"
"mov v31.16b, v21.16b\n"
"ushr v24.2d, v24.2d, #32\n"
"add %[function_params], x15, x16\n"
"add x12, x12, x16\n"
"subs w13, w13, #1\n"
"ushr v22.2d, v22.2d, #32\n"
"ushr v23.2d, v23.2d, #32\n"
".word 0x4f98e25c // sdot v28.4s, v18.16b, v24.4b[0]\n"
".word 0x4f98e27d // sdot v29.4s, v19.16b, v24.4b[0]\n"
".word 0x4f98e29e // sdot v30.4s, v20.16b, v24.4b[0]\n"
".word 0x4f98ea9f // sdot v31.4s, v20.16b, v24.4b[2]\n"
"add x16, x16, x6\n"
"st1 { v8.s }[2], [%[function_params]]\n"
"st1 { v8.s }[3], [x12]\n"
"b.ne " DC_KERNEL_MULT_9 "b\n"
"ldr w25, [sp, #164]\n"
"ldp x21, %[scratch_block_data], [sp, #192]\n"
"ldr %[function_params], [sp, #184]\n"
"ldp %[filter_workspace], %[bias_data], [sp, #216]\n"
"ldr x5, [sp, #208]\n"
"add x13, %[output_block_data], x16\n"
"mov w12, w25\n"
"mov x27, x28\n"
"mov x28, x24\n"
"mov x24, x8\n"
"mov x8, x9\n"
"mov x10, x15\n"
"mov w15, #4\n"
"ldr w16, [sp, #276]\n"
"cmp w12, w16\n"
"b.ge " DC_KERNEL_MULT_6 "b\n"
DC_KERNEL_MULT_11 ":\n"
"ldr w12, [sp, #272]\n"
"cmp w12, #1\n"
"b.lt " DC_KERNEL_MULT_6 "b\n"
"add x12, x14, #4\n"
"ldr x14, [sp, #240]\n"
"ldr x16, [sp, #280]\n"
"add x14, x12, x14\n"
"ld1 { v23.s }[1], [x14]\n"
"ldr x14, [sp, #232]\n"
"add x16, x12, x16\n"
"ld1 { v24.s }[1], [x16]\n"
"add x14, x12, x14\n"
"ld1 { v23.s }[3], [x14]\n"
"ldp x16, x14, [sp, #248]\n"
"add x16, x12, x16\n"
"ld1 { v24.s }[3], [x16]\n"
"ldr x16, [sp, #40]\n"
"ld1 { v22.s }[1], [x12], x14\n"
"ld1 { v22.s }[3], [x12]\n"
"ldr w12, [sp, #272]\n"
DC_KERNEL_MULT_13 ":\n"
".word 0x4f96e29c // sdot v28.4s, v20.16b, v22.4b[0]\n"
".word 0x4f96ea9d // sdot v29.4s, v20.16b, v22.4b[2]\n"
".word 0x4f98ea7e // sdot v30.4s, v19.16b, v24.4b[2]\n"
".word 0x4f96ea7c // sdot v28.4s, v19.16b, v22.4b[2]\n"
".word 0x4f97e27f // sdot v31.4s, v19.16b, v23.4b[0]\n"
".word 0x4f98ea5d // sdot v29.4s, v18.16b, v24.4b[2]\n"
".word 0x4f97e25e // sdot v30.4s, v18.16b, v23.4b[0]\n"
"sqrdmulh v25.4s, v28.4s, v1.4s\n"
".word 0x4f97ea5f // sdot v31.4s, v18.16b, v23.4b[2]\n"
"sqrdmulh v26.4s, v29.4s, v1.4s\n"
"sqrdmulh v27.4s, v30.4s, v1.4s\n"
"sqrshl v25.4s, v25.4s, v2.4s\n"
"sqrdmulh v28.4s, v31.4s, v1.4s\n"
"sqrshl v26.4s, v26.4s, v2.4s\n"
"sqrshl v27.4s, v27.4s, v2.4s\n"
"sqxtn v25.4h, v25.4s\n"
"sqrshl v28.4s, v28.4s, v2.4s\n"
"sqxtn v27.4h, v27.4s\n"
"sqxtn2 v25.8h, v26.4s\n"
"sqxtn2 v27.8h, v28.4s\n"
"sqadd v25.8h, v25.8h, v0.8h\n"
"sqadd v26.8h, v27.8h, v0.8h\n"
"sqxtun v25.8b, v25.8h\n"
"sqxtun2 v25.16b, v26.8h\n"
"umax v25.16b, v25.16b, v3.16b\n"
"add x14, x13, x21\n"
"umin v25.16b, v25.16b, v4.16b\n"
"str s25, [x13]\n"
"st1 { v25.s }[1], [x14]\n"
"add x14, x13, %[function_params]\n"
"ushr v24.2d, v24.2d, #8\n"
"mov v28.16b, v21.16b\n"
"mov v29.16b, v21.16b\n"
"mov v30.16b, v21.16b\n"
"mov v31.16b, v21.16b\n"
"st1 { v25.s }[2], [x14]\n"
"add x14, x13, x16\n"
"subs w12, w12, #1\n"
"ushr v22.2d, v22.2d, #8\n"
"ushr v23.2d, v23.2d, #8\n"
".word 0x4f98e25c // sdot v28.4s, v18.16b, v24.4b[0]\n"
".word 0x4f98e27d // sdot v29.4s, v19.16b, v24.4b[0]\n"
".word 0x4f98e29e // sdot v30.4s, v20.16b, v24.4b[0]\n"
"add x13, x13, x17\n"
".word 0x4f98ea9f // sdot v31.4s, v20.16b, v24.4b[2]\n"
"st1 { v25.s }[3], [x14]\n"
"b.ne " DC_KERNEL_MULT_13 "b\n"
"b " DC_KERNEL_MULT_6 "b\n"
DC_KERNEL_MULT_14 ":\n"
"ldr x11, [sp, #48]\n"
"ldr %[function_params], [sp, #184]\n"
"mov w12, wzr\n"
"mov x14, %[scratch_block_data]\n"
"add x13, x11, %[filter_workspace], lsl #2\n"
"ldr w16, [sp, #276]\n"
"cmp w12, w16\n"
"b.ge " DC_KERNEL_MULT_6 "b\n"
"b " DC_KERNEL_MULT_11 "b\n"
DC_KERNEL_MULT_15 ":\n"
"ldr w14, [sp, #8]\n"
"add x11, %[bias_data], #32\n"
"tbnz w14, #0, " DC_KERNEL_MULT_2 "b\n"
"ldp q21, q22, [%[bias_data]]\n"
"ldr %[filter_workspace], [sp, #48]\n"
"mov x14, xzr\n"
"b " DC_KERNEL_MULT_18 "f\n"
DC_KERNEL_MULT_17 ":\n"
"ldr x12, [sp, #56]\n"
"ldp x21, %[scratch_block_data], [sp, #192]\n"
"add x14, x14, #1\n"
"cmp x14, x12\n"
"add %[filter_workspace], x1, x21\n"
"b.eq " DC_KERNEL_MULT_2 "b\n"
DC_KERNEL_MULT_18 ":\n"
"ldr x16, [sp, #256]\n"
"mov w13, wzr\n"
"madd x12, x14, x16, %[scratch_block_data]\n"
"mov %[scratch_block_data], x16\n"
"ldr w16, [x12]\n"
"add %[function_params], x12, %[scratch_block_data]\n"
"fmov s23, w16\n"
"mov v23.s[1], w16\n"
"ld1 { v23.s }[2], [%[function_params]]\n"
"ldr %[function_params], [sp, #280]\n"
"mov v23.s[3], w16\n"
"add %[function_params], x12, %[function_params]\n"
"ld1r { v24.4s }, [%[function_params]]\n"
"mov x16, %[filter_workspace]\n"
"b " DC_KERNEL_MULT_20 "f\n"
DC_KERNEL_MULT_19 ":\n"
"ldr w4, [sp, #276]\n"
"add w13, w13, #1\n"
"cmp w13, w4\n"
"b.eq " DC_KERNEL_MULT_17 "b\n"
DC_KERNEL_MULT_20 ":\n"
"ldr x21, [sp, #280]\n"
"add x12, x12, #4\n"
"mov %[function_params], x12\n"
"ld1 { v23.s }[1], [%[function_params]], x21\n"
"ldr w21, [sp, #268]\n"
"ld1 { v24.s }[1], [%[function_params]]\n"
"ldr w4, [sp, #272]\n"
"cmp w13, w21\n"
"add x21, x12, %[scratch_block_data]\n"
"ld1 { v23.s }[3], [x21]\n"
"csel w4, w4, w15, eq\n"
"cmp w4, #1\n"
"b.lt " DC_KERNEL_MULT_19 "b\n"
DC_KERNEL_MULT_21 ":\n"
"mov v25.16b, v21.16b\n"
"mov v26.16b, v22.16b\n"
".word 0x4f97e299 // sdot v25.4s, v20.16b, v23.4b[0]\n"
".word 0x4f97e0fa // sdot v26.4s, v7.16b, v23.4b[0]\n"
".word 0x4f97ea79 // sdot v25.4s, v19.16b, v23.4b[2]\n"
".word 0x4f97ea1a // sdot v26.4s, v16.16b, v23.4b[2]\n"
".word 0x4f98e259 // sdot v25.4s, v18.16b, v24.4b[0]\n"
".word 0x4f98e23a // sdot v26.4s, v17.16b, v24.4b[0]\n"
"sqrdmulh v25.4s, v25.4s, v1.4s\n"
"sqrdmulh v26.4s, v26.4s, v1.4s\n"
"sqrshl v25.4s, v25.4s, v2.4s\n"
"sqrshl v26.4s, v26.4s, v2.4s\n"
"sqxtn v25.4h, v25.4s\n"
"sqxtn2 v25.8h, v26.4s\n"
"sqadd v25.8h, v25.8h, v0.8h\n"
"sqxtun v25.8b, v25.8h\n"
"umax v25.8b, v25.8b, v5.8b\n"
"umin v25.8b, v25.8b, v6.8b\n"
"subs w4, w4, #1\n"
"ushr v23.2d, v23.2d, #8\n"
"ushr v24.2d, v24.2d, #8\n"
"str d25, [x16]\n"
"add x16, x16, x17\n"
"b.ne " DC_KERNEL_MULT_21 "b\n"
"b " DC_KERNEL_MULT_19 "b\n"
DC_KERNEL_MULT_22 ":\n"
"add sp, sp, #304\n"
:
[ scratch_block_data ] "+r"(scratch_block_data),
[ filter_workspace ] "+r"(filter_workspace),
[ bias_data ] "+r"(bias_data),
[ output_block_data ] "+r"(output_block_data)
:
[ function_params ] "r"(function_params)
:
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31",
"x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
"x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26",
"x27", "x28");
#undef DC_KERNEL_MULT_1
#undef DC_KERNEL_MULT_2
#undef DC_KERNEL_MULT_3
#undef DC_KERNEL_MULT_4
#undef DC_KERNEL_MULT_5
#undef DC_KERNEL_MULT_6
#undef DC_KERNEL_MULT_7
#undef DC_KERNEL_MULT_8
#undef DC_KERNEL_MULT_9
#undef DC_KERNEL_MULT_10
#undef DC_KERNEL_MULT_11
#undef DC_KERNEL_MULT_12
#undef DC_KERNEL_MULT_13
#undef DC_KERNEL_MULT_14
#undef DC_KERNEL_MULT_15
#undef DC_KERNEL_MULT_16
#undef DC_KERNEL_MULT_17
#undef DC_KERNEL_MULT_18
#undef DC_KERNEL_MULT_19
#undef DC_KERNEL_MULT_20
#undef DC_KERNEL_MULT_21
#undef DC_KERNEL_MULT_22
}
// Out-of-line entry point for this specialization: passes its five arguments,
// unchanged and in order, to KernelMacroBlockNeon.
// NOTE(review): noinline presumably keeps the large asm body from being
// replicated at every call site -- confirm.
static void __attribute__((noinline))
Run(const int8* scratch_block_data, const int8* filter_workspace,
    const int32* bias_data, uint8* output_block_data,
    const DepthwiseConvDotProdParams* function_params) {
  KernelMacroBlockNeon(
      scratch_block_data,
      filter_workspace,
      bias_data,
      output_block_data,
      function_params);
}
};
// Dot-product (sdot) 3x3 depthwise macro-block kernel for non-per-channel
// uint8 quantization with unit input depth. The trailing template argument is
// 2; NOTE(review): judging by the DC_KERNEL_MULT_STRIDE_* label names this is
// presumably the stride -- confirm against the primary template.
//
// Run() is the out-of-line entry point; KernelMacroBlockNeon() holds the
// hand-scheduled AArch64 assembly.
template <>
struct KernelMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
                        QuantizationType::kNonPerChannelUint8,
                        DepthwiseConvDepthMultiplication::kUnitInputDepth,
                        2> {
  // Computes one macro block.
  //
  //   scratch_block_data  input words; read only, never advanced by the asm.
  //   filter_workspace    96 bytes of filter data are consumed per outer
  //                       (depth) iteration.
  //   bias_data           32 bytes of bias are consumed per outer iteration.
  //   output_block_data   base of the output; outer iteration i writes
  //                       starting at output_block_data + 8 * i.
  //   function_params     loop bounds, strides and quantization parameters,
  //                       read through the DP_OFFSET_* byte offsets.
  //
  // Register roles established by the prologue (field meanings are taken from
  // the DP_OFFSET_* names only):
  //   w15  field at DP_OFFSET_OUTPUT_RESIDUAL_WIDTH
  //   x11  field at DP_OFFSET_OUTPUT_WIDTH_OVERALL_MICRO_REPEATS
  //   w6   the 32-bit field that immediately follows it
  //   x9   field at DP_OFFSET_OUTPUT_HEIGHT_STRIDE (byte step to the second
  //        output row)
  //   x10  the 32-bit field that follows it; used as the byte pitch between
  //        rows of scratch_block_data
  //   x12  field at DP_OFFSET_DEPTH_MICRO_REPEATS (outer loop bound)
  //   x13  field at DP_OFFSET_OUTPUT_DEPTH (byte step between output columns)
  //   w14  field at DP_OFFSET_OUTBOUND_BLOCK_HEIGHT (== 2 selects the two-row
  //        path)
  //   v0   output offset (8 x i16), v1 output multiplier (4 x i32),
  //   v2   output shift (4 x i32), v3 / v4 activation min / max (8 x u8)
  static inline void KernelMacroBlockNeon(
      const int8* scratch_block_data, const int8* filter_workspace,
      const int32* bias_data, uint8* output_block_data,
      const DepthwiseConvDotProdParams* function_params) {
    // Label macros for the local branch targets used inside the asm.
#define DC_KERNEL_MULT_STRIDE_1 …
#define DC_KERNEL_MULT_STRIDE_2 …
#define DC_KERNEL_MULT_STRIDE_3 …
#define DC_KERNEL_MULT_STRIDE_4 …
#define DC_KERNEL_MULT_STRIDE_5 …
#define DC_KERNEL_MULT_STRIDE_6 …
#define DC_KERNEL_MULT_STRIDE_7 …
#define DC_KERNEL_MULT_STRIDE_8 …
#define DC_KERNEL_MULT_STRIDE_9 …
#define DC_KERNEL_MULT_STRIDE_10 …
#define DC_KERNEL_MULT_STRIDE_11 …
#define DC_KERNEL_MULT_STRIDE_12 …
#define DC_KERNEL_MULT_STRIDE_13 …
    asm volatile(
        // ---- Prologue: load loop bounds and strides from *function_params.
        "ldr w15, [%[function_params], #" STR(DP_OFFSET_OUTPUT_RESIDUAL_WIDTH) "]\n"
        "ldp w11, w6, [%[function_params], #" STR(DP_OFFSET_OUTPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n"
        "ldpsw x9, x10, [%[function_params], #" STR(DP_OFFSET_OUTPUT_HEIGHT_STRIDE) "]\n"
        "ldrsw x12, [%[function_params], #" STR(DP_OFFSET_DEPTH_MICRO_REPEATS) "]\n"
        "ldrsw x13, [%[function_params], #" STR(DP_OFFSET_OUTPUT_DEPTH) "]\n"
        "ldr w14, [%[function_params], #" STR(DP_OFFSET_OUTBOUND_BLOCK_HEIGHT) "]\n"
        // Addresses of the scalar quantization parameters (broadcast below).
        "add x17, %[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MIN) "\n"
        "add x5, %[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MAX) "\n"
        "add x7, %[function_params], #" STR(DP_OFFSET_OUTPUT_MULTIPLIER) "\n"
        "add x19, %[function_params], #" STR(DP_OFFSET_OUTPUT_SHIFT) "\n"
        // From here on %[function_params] is reused as a scratch register,
        // which is why it is declared as a read-write operand below.
        "add %[function_params], %[function_params], #" STR(DP_OFFSET_OUTPUT_OFFSET) "\n"
        "sxtw x11, w11\n"
        "ld1r { v0.8h }, [%[function_params]]\n"
        "ld1r { v1.4s }, [x7]\n"
        "ld1r { v2.4s }, [x19]\n"
        "ld1r { v3.8b }, [x17]\n"
        "ld1r { v4.8b }, [x5]\n"
        // x6 = (w15 < 2 && w6 < w11) ? w6 : w11. This is the trip count of the
        // two-columns-per-iteration loop (STRIDE_3) in the two-row path.
        "cmp w15, #2\n"
        "ccmp w6, w11, #0, lt\n"
        // x5 = 4 * (original w6), taken before csel overwrites w6; compared
        // against the byte counter x20 in the one-row path.
        "lsl x5, x6, #2\n"
        "csel w6, w6, w11, lt\n"
        // x8 = outer (depth) loop counter.
        "mov x8, xzr\n"
        // x16 = scratch + 4: where the "next" 32-bit input words are fetched.
        "add x16, %[scratch_block_data], #4\n"
        // x17 = 2 * x10, %[function_params] = 3 * x10 (row offsets).
        "lsl x17, x10, #1\n"
        "add %[function_params], x10, x10, lsl #1\n"
        "sxtw x6, w6\n"
        // x7 = x9 + x13: offset of (next output row, next output column).
        "add x7, x9, x13\n"
        "b " DC_KERNEL_MULT_STRIDE_13 "f\n"

        // ---- Outer loop body: one depth micro-repeat.
        DC_KERNEL_MULT_STRIDE_1 ":\n"
        "ldr w20, [%[scratch_block_data]]\n"
        "add x21, %[scratch_block_data], x10\n"
        // Six filter vectors (96 bytes): v5, v6, v7, v16, v17, v18.
        "ldp q5, q6, [%[filter_workspace]]\n"
        "ldp q7, q16, [%[filter_workspace], #32]\n"
        // v21 lanes 0/1/3 = first word of the row at offset 0,
        // lane 2 = first word of the row at offset x10.
        "fmov s21, w20\n"
        "mov v21.s[1], w20\n"
        "ld1 { v21.s }[2], [x21]\n"
        "ldp q17, q18, [%[filter_workspace], #64]\n"
        // Bias vectors v19 / v20; bias_data advances by 32 bytes.
        "ldp q19, q20, [%[bias_data]], #32\n"
        "ldr s22, [%[scratch_block_data], x17]\n"
        // x19 = output_block_data + 8 * x8 (output cursor for this slice).
        "ubfiz x19, x8, #3, #29\n"
        "add %[filter_workspace], %[filter_workspace], #96\n"
        "add x19, %[output_block_data], x19\n"
        "cmp w14, #2\n"
        "mov v21.s[3], w20\n"
        // x20 = byte offset into the input row, advanced by 4 per iteration.
        "mov x20, xzr\n"
        // Anything other than two output rows takes the one-row path.
        "b.ne " DC_KERNEL_MULT_STRIDE_7 "f\n"
        // Two-row path: v22 holds the rows at offsets 2*x10 and 3*x10,
        // v23 the row at offset 4*x10.
        "dup v22.4s, v22.s[0]\n"
        "add x21, %[scratch_block_data], %[function_params]\n"
        "add x22, %[scratch_block_data], x10, lsl #2\n"
        "ld1 { v22.s }[2], [x21]\n"
        "ld1r { v23.4s }, [x22]\n"
        // x21 = inner (width) loop counter.
        "mov x21, xzr\n"
        "b " DC_KERNEL_MULT_STRIDE_4 "f\n"

        // ---- Two rows, two output columns per iteration (x21 < x6).
        DC_KERNEL_MULT_STRIDE_3 ":\n"
        // Fetch the next input word of each of the five rows into the odd
        // lanes of v21 / v22 / v23.
        "and x22, x20, #0xfffffffc\n"
        "add x23, x16, x22\n"
        "lsl x24, x10, #2\n"
        "mov x22, x23\n"
        "ld1 { v21.s }[1], [x22], x24\n"
        "add x24, x23, x17\n"
        "ld1 { v22.s }[1], [x24]\n"
        "add x24, x23, x10\n"
        "ld1 { v21.s }[3], [x24]\n"
        "add x23, x23, %[function_params]\n"
        "ld1 { v22.s }[3], [x23]\n"
        // v25..v28 accumulate the second column, fed by the inputs shifted
        // right by 16 bits (v29); every accumulator starts from the bias.
        "mov v25.16b, v19.16b\n"
        "mov v27.16b, v20.16b\n"
        "ld1 { v23.s }[1], [x22]\n"
        "ushr v29.2d, v21.2d, #16\n"
        ".word 0x4f9de0b9  // sdot v25.4s, v5.16b, v29.4b[0]\n"
        ".word 0x4f9de0db  // sdot v27.4s, v6.16b, v29.4b[0]\n"
        "mov v26.16b, v19.16b\n"
        "mov v28.16b, v20.16b\n"
        ".word 0x4f9de8f9  // sdot v25.4s, v7.16b, v29.4b[2]\n"
        ".word 0x4f9dea1b  // sdot v27.4s, v16.16b, v29.4b[2]\n"
        "ushr v29.2d, v22.2d, #16\n"
        ".word 0x4f9de0ba  // sdot v26.4s, v5.16b, v29.4b[0]\n"
        ".word 0x4f9de0dc  // sdot v28.4s, v6.16b, v29.4b[0]\n"
        "mov v24.16b, v19.16b\n"
        ".word 0x4f9de8fa  // sdot v26.4s, v7.16b, v29.4b[2]\n"
        ".word 0x4f9dea1c  // sdot v28.4s, v16.16b, v29.4b[2]\n"
        ".word 0x4f9de239  // sdot v25.4s, v17.16b, v29.4b[0]\n"
        ".word 0x4f9de25b  // sdot v27.4s, v18.16b, v29.4b[0]\n"
        "ushr v29.2d, v23.2d, #16\n"
        ".word 0x4f9de23a  // sdot v26.4s, v17.16b, v29.4b[0]\n"
        ".word 0x4f9de25c  // sdot v28.4s, v18.16b, v29.4b[0]\n"
        // First column, first four channels: v24 (row 0) and v29 (row 1).
        "mov v29.16b, v19.16b\n"
        ".word 0x4f95e0b8  // sdot v24.4s, v5.16b, v21.4b[0]\n"
        ".word 0x4f96e0bd  // sdot v29.4s, v5.16b, v22.4b[0]\n"
        ".word 0x4f95e8f8  // sdot v24.4s, v7.16b, v21.4b[2]\n"
        ".word 0x4f96e8fd  // sdot v29.4s, v7.16b, v22.4b[2]\n"
        ".word 0x4f96e238  // sdot v24.4s, v17.16b, v22.4b[0]\n"
        ".word 0x4f97e23d  // sdot v29.4s, v17.16b, v23.4b[0]\n"
        // Requantize: multiply (v1), shift (v2), narrow to i16, add the output
        // offset (v0), narrow to u8, clamp to [v3, v4].
        "sqrdmulh v24.4s, v24.4s, v1.4s\n"
        "sqrdmulh v29.4s, v29.4s, v1.4s\n"
        "sqrshl v24.4s, v24.4s, v2.4s\n"
        "sqrshl v29.4s, v29.4s, v2.4s\n"
        "sqxtn v24.4h, v24.4s\n"
        "sqxtn2 v24.8h, v29.4s\n"
        "sqadd v24.8h, v24.8h, v0.8h\n"
        "sqxtun v24.8b, v24.8h\n"
        "umax v24.8b, v24.8b, v3.8b\n"
        // x22 = same column, second output row.
        "add x22, x19, x9\n"
        "mov v29.16b, v20.16b\n"
        "umin v24.8b, v24.8b, v4.8b\n"
        "str s24, [x19]\n"
        "st1 { v24.s }[1], [x22]\n"
        // First column, second four channels.
        "mov v24.16b, v20.16b\n"
        ".word 0x4f95e0dd  // sdot v29.4s, v6.16b, v21.4b[0]\n"
        ".word 0x4f96e0d8  // sdot v24.4s, v6.16b, v22.4b[0]\n"
        ".word 0x4f95ea1d  // sdot v29.4s, v16.16b, v21.4b[2]\n"
        ".word 0x4f96ea18  // sdot v24.4s, v16.16b, v22.4b[2]\n"
        ".word 0x4f96e25d  // sdot v29.4s, v18.16b, v22.4b[0]\n"
        ".word 0x4f97e258  // sdot v24.4s, v18.16b, v23.4b[0]\n"
        "sqrdmulh v29.4s, v29.4s, v1.4s\n"
        "sqrdmulh v24.4s, v24.4s, v1.4s\n"
        "sqrshl v29.4s, v29.4s, v2.4s\n"
        "sqrshl v24.4s, v24.4s, v2.4s\n"
        "sqxtn v29.4h, v29.4s\n"
        "sqxtn2 v29.8h, v24.4s\n"
        "sqadd v24.8h, v29.8h, v0.8h\n"
        "sqxtun v24.8b, v24.8h\n"
        // Requantization of the second column is interleaved from here on.
        "sqrdmulh v25.4s, v25.4s, v1.4s\n"
        "umax v24.8b, v24.8b, v3.8b\n"
        "sqrdmulh v26.4s, v26.4s, v1.4s\n"
        "sqrshl v25.4s, v25.4s, v2.4s\n"
        "add x22, x22, #4\n"
        "umin v24.8b, v24.8b, v4.8b\n"
        "sqrshl v26.4s, v26.4s, v2.4s\n"
        "sqxtn v25.4h, v25.4s\n"
        "str s24, [x19, #4]\n"
        "st1 { v24.s }[1], [x22]\n"
        "sqxtn2 v25.8h, v26.4s\n"
        "sqadd v24.8h, v25.8h, v0.8h\n"
        "sqrdmulh v27.4s, v27.4s, v1.4s\n"
        "sqxtun v24.8b, v24.8h\n"
        "sqrdmulh v28.4s, v28.4s, v1.4s\n"
        "sqrshl v27.4s, v27.4s, v2.4s\n"
        "umax v24.8b, v24.8b, v3.8b\n"
        // x23 = next output column, x24 = next column in the second row.
        "add x23, x19, x13\n"
        "add x24, x19, x7\n"
        "sqrshl v28.4s, v28.4s, v2.4s\n"
        "sqxtn v27.4h, v27.4s\n"
        "umin v24.8b, v24.8b, v4.8b\n"
        "str s24, [x23]\n"
        "st1 { v24.s }[1], [x24]\n"
        "sqxtn2 v27.8h, v28.4s\n"
        "sqadd v24.8h, v27.8h, v0.8h\n"
        "sqxtun v24.8b, v24.8h\n"
        "umax v24.8b, v24.8b, v3.8b\n"
        "add x25, x24, #4\n"
        "umin v24.8b, v24.8b, v4.8b\n"
        "add x21, x21, #1\n"
        // Slide the input windows by one 32-bit word for the next iteration.
        "ushr v21.2d, v21.2d, #32\n"
        "ushr v22.2d, v22.2d, #32\n"
        "ushr v23.2d, v23.2d, #32\n"
        // Output cursor moves on by two columns.
        "add x19, x23, x13\n"
        "str s24, [x23, #4]\n"
        "st1 { v24.s }[1], [x25]\n"
        "add x20, x20, #4\n"
        DC_KERNEL_MULT_STRIDE_4 ":\n"
        "cmp x21, x6\n"
        "b.lt " DC_KERNEL_MULT_STRIDE_3 "b\n"
        "b " DC_KERNEL_MULT_STRIDE_6 "f\n"

        // ---- Two rows, one output column per iteration (x21 < x11).
        DC_KERNEL_MULT_STRIDE_5 ":\n"
        "and x22, x20, #0xfffffffc\n"
        "add x22, x16, x22\n"
        "lsl x23, x10, #2\n"
        "mov x25, x22\n"
        "add x24, x22, x17\n"
        "ld1 { v21.s }[1], [x25], x23\n"
        "ld1 { v22.s }[1], [x24]\n"
        "add x23, x22, x10\n"
        "add x22, x22, %[function_params]\n"
        "ld1 { v21.s }[3], [x23]\n"
        "ld1 { v22.s }[3], [x22]\n"
        "mov v24.16b, v19.16b\n"
        "ld1 { v23.s }[1], [x25]\n"
        "mov v25.16b, v19.16b\n"
        // First four channels: v24 (row 0) and v25 (row 1).
        ".word 0x4f95e0b8  // sdot v24.4s, v5.16b, v21.4b[0]\n"
        ".word 0x4f96e0b9  // sdot v25.4s, v5.16b, v22.4b[0]\n"
        ".word 0x4f95e8f8  // sdot v24.4s, v7.16b, v21.4b[2]\n"
        ".word 0x4f96e8f9  // sdot v25.4s, v7.16b, v22.4b[2]\n"
        ".word 0x4f96e238  // sdot v24.4s, v17.16b, v22.4b[0]\n"
        ".word 0x4f97e239  // sdot v25.4s, v17.16b, v23.4b[0]\n"
        "sqrdmulh v24.4s, v24.4s, v1.4s\n"
        "sqrdmulh v25.4s, v25.4s, v1.4s\n"
        "sqrshl v24.4s, v24.4s, v2.4s\n"
        "sqrshl v25.4s, v25.4s, v2.4s\n"
        "sqxtn v24.4h, v24.4s\n"
        "sqxtn2 v24.8h, v25.4s\n"
        "sqadd v24.8h, v24.8h, v0.8h\n"
        "sqxtun v24.8b, v24.8h\n"
        "umax v24.8b, v24.8b, v3.8b\n"
        "add x22, x19, x9\n"
        "mov v25.16b, v20.16b\n"
        "umin v24.8b, v24.8b, v4.8b\n"
        "str s24, [x19]\n"
        "st1 { v24.s }[1], [x22]\n"
        // Second four channels.
        "mov v24.16b, v20.16b\n"
        ".word 0x4f95e0d9  // sdot v25.4s, v6.16b, v21.4b[0]\n"
        ".word 0x4f96e0d8  // sdot v24.4s, v6.16b, v22.4b[0]\n"
        ".word 0x4f95ea19  // sdot v25.4s, v16.16b, v21.4b[2]\n"
        ".word 0x4f96ea18  // sdot v24.4s, v16.16b, v22.4b[2]\n"
        ".word 0x4f96e259  // sdot v25.4s, v18.16b, v22.4b[0]\n"
        ".word 0x4f97e258  // sdot v24.4s, v18.16b, v23.4b[0]\n"
        "sqrdmulh v25.4s, v25.4s, v1.4s\n"
        "sqrdmulh v24.4s, v24.4s, v1.4s\n"
        "sqrshl v25.4s, v25.4s, v2.4s\n"
        "sqrshl v24.4s, v24.4s, v2.4s\n"
        "sqxtn v25.4h, v25.4s\n"
        "sqxtn2 v25.8h, v24.4s\n"
        "sqadd v24.8h, v25.8h, v0.8h\n"
        "sqxtun v24.8b, v24.8h\n"
        "umax v24.8b, v24.8b, v3.8b\n"
        "add x22, x22, #4\n"
        "umin v24.8b, v24.8b, v4.8b\n"
        "add x21, x21, #1\n"
        // Only one column was produced, so the windows slide by 16 bits.
        "ushr v21.2d, v21.2d, #16\n"
        "ushr v22.2d, v22.2d, #16\n"
        "ushr v23.2d, v23.2d, #16\n"
        "str s24, [x19, #4]\n"
        "st1 { v24.s }[1], [x22]\n"
        "add x19, x19, x13\n"
        "add x20, x20, #4\n"
        DC_KERNEL_MULT_STRIDE_6 ":\n"
        "cmp x21, x11\n"
        "b.lt " DC_KERNEL_MULT_STRIDE_5 "b\n"
        "b " DC_KERNEL_MULT_STRIDE_12 "f\n"

        // ---- One-row path (w14 != 2).
        DC_KERNEL_MULT_STRIDE_7 ":\n"
        "mov x21, xzr\n"
        "dup v22.4s, v22.s[0]\n"
        "b " DC_KERNEL_MULT_STRIDE_11 "f\n"
        DC_KERNEL_MULT_STRIDE_8 ":\n"
        "and x22, x20, #0xfffffffc\n"
        "add x22, x16, x22\n"
        "mov x23, x22\n"
        "ld1 { v21.s }[1], [x23], x17\n"
        "add x22, x22, x10\n"
        "mov v23.16b, v19.16b\n"
        "mov v24.16b, v20.16b\n"
        "ld1 { v22.s }[1], [x23]\n"
        "ld1 { v21.s }[3], [x22]\n"
        // Flags for the b.eq further down: "eq" holds only when w15 != 2 and
        // x20 == x5. No instruction in between writes NZCV.
        "cmp w15, #2\n"
        "ccmp x5, x20, #0, ne\n"
        // First column: v23 = first four channels, v24 = second four.
        ".word 0x4f96e237  // sdot v23.4s, v17.16b, v22.4b[0]\n"
        ".word 0x4f96e258  // sdot v24.4s, v18.16b, v22.4b[0]\n"
        ".word 0x4f95e0b7  // sdot v23.4s, v5.16b, v21.4b[0]\n"
        ".word 0x4f95e0d8  // sdot v24.4s, v6.16b, v21.4b[0]\n"
        ".word 0x4f95e8f7  // sdot v23.4s, v7.16b, v21.4b[2]\n"
        ".word 0x4f95ea18  // sdot v24.4s, v16.16b, v21.4b[2]\n"
        "sqrdmulh v23.4s, v23.4s, v1.4s\n"
        "sqrdmulh v24.4s, v24.4s, v1.4s\n"
        "sqrshl v23.4s, v23.4s, v2.4s\n"
        "sqrshl v24.4s, v24.4s, v2.4s\n"
        "sqxtn v25.4h, v23.4s\n"
        "sqxtn2 v25.8h, v24.4s\n"
        "sqadd v24.8h, v25.8h, v0.8h\n"
        "sqxtun v24.8b, v24.8h\n"
        "umax v24.8b, v24.8b, v3.8b\n"
        "umin v24.8b, v24.8b, v4.8b\n"
        // v23 / v24 now hold the inputs shifted by 16 bits (second column).
        "ushr v23.2d, v21.2d, #16\n"
        "str d24, [x19]\n"
        "ushr v24.2d, v22.2d, #16\n"
        "add x19, x19, x13\n"
        // Skip the second column when the condition computed above holds.
        "b.eq " DC_KERNEL_MULT_STRIDE_10 "f\n"
        "mov v25.16b, v19.16b\n"
        "mov v26.16b, v20.16b\n"
        ".word 0x4f98e239  // sdot v25.4s, v17.16b, v24.4b[0]\n"
        ".word 0x4f98e25a  // sdot v26.4s, v18.16b, v24.4b[0]\n"
        ".word 0x4f97e0b9  // sdot v25.4s, v5.16b, v23.4b[0]\n"
        ".word 0x4f97e0da  // sdot v26.4s, v6.16b, v23.4b[0]\n"
        ".word 0x4f97e8f9  // sdot v25.4s, v7.16b, v23.4b[2]\n"
        ".word 0x4f97ea1a  // sdot v26.4s, v16.16b, v23.4b[2]\n"
        // v23 / v24 = inputs shifted by a full word for the next iteration;
        // v21 / v22 are reused as requantization temporaries.
        "ushr v23.2d, v21.2d, #32\n"
        "sqrdmulh v21.4s, v25.4s, v1.4s\n"
        "ushr v24.2d, v22.2d, #32\n"
        "sqrdmulh v22.4s, v26.4s, v1.4s\n"
        "sqrshl v21.4s, v21.4s, v2.4s\n"
        "sqrshl v22.4s, v22.4s, v2.4s\n"
        "sqxtn v21.4h, v21.4s\n"
        "sqxtn2 v21.8h, v22.4s\n"
        "sqadd v21.8h, v21.8h, v0.8h\n"
        "sqxtun v21.8b, v21.8h\n"
        "umax v21.8b, v21.8b, v3.8b\n"
        "umin v21.8b, v21.8b, v4.8b\n"
        "str d21, [x19]\n"
        "add x19, x19, x13\n"
        DC_KERNEL_MULT_STRIDE_10 ":\n"
        "add x21, x21, #1\n"
        "add x20, x20, #4\n"
        // Restore the (shifted) input windows into v21 / v22.
        "mov v22.16b, v24.16b\n"
        "mov v21.16b, v23.16b\n"
        DC_KERNEL_MULT_STRIDE_11 ":\n"
        "cmp x21, x11\n"
        "b.lt " DC_KERNEL_MULT_STRIDE_8 "b\n"

        // ---- Outer loop latch.
        DC_KERNEL_MULT_STRIDE_12 ":\n"
        "add x8, x8, #1\n"
        DC_KERNEL_MULT_STRIDE_13 ":\n"
        "cmp x8, x12\n"
        "b.lt " DC_KERNEL_MULT_STRIDE_1 "b\n"
        :
        // Read-write operands. function_params is included because the asm
        // overwrites its register; an input-only "r" operand must not be
        // modified by the asm body.
        [ scratch_block_data ] "+r"(scratch_block_data),
        [ filter_workspace ] "+r"(filter_workspace),
        [ bias_data ] "+r"(bias_data),
        [ output_block_data ] "+r"(output_block_data),
        [ function_params ] "+r"(function_params)
        :
        // No input-only operands.
        :
        // Clobbers.
        "cc", "memory",
        "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
        "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
        "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
        "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25");
#undef DC_KERNEL_MULT_STRIDE_1
#undef DC_KERNEL_MULT_STRIDE_2
#undef DC_KERNEL_MULT_STRIDE_3
#undef DC_KERNEL_MULT_STRIDE_4
#undef DC_KERNEL_MULT_STRIDE_5
#undef DC_KERNEL_MULT_STRIDE_6
#undef DC_KERNEL_MULT_STRIDE_7
#undef DC_KERNEL_MULT_STRIDE_8
#undef DC_KERNEL_MULT_STRIDE_9
#undef DC_KERNEL_MULT_STRIDE_10
#undef DC_KERNEL_MULT_STRIDE_11
#undef DC_KERNEL_MULT_STRIDE_12
#undef DC_KERNEL_MULT_STRIDE_13
  }

  // Out-of-line entry point: forwards all arguments to KernelMacroBlockNeon.
  static void __attribute__((noinline))
  Run(const int8* scratch_block_data, const int8* filter_workspace,
      const int32* bias_data, uint8* output_block_data,
      const DepthwiseConvDotProdParams* function_params) {
    KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
                         output_block_data, function_params);
  }
};
template <>
struct KernelMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
QuantizationType::kPerChannelInt8,
DepthwiseConvDepthMultiplication::kNoMultiplication,
1> {
static inline void KernelMacroBlockNeon(
const int8* scratch_block_data, const int8* filter_workspace,
const int32* bias_data, int8* output_block_data,
const DepthwiseConvDotProdParams* function_params) {
#define DC_KERNEL_NO_MULT_1 …
#define DC_KERNEL_NO_MULT_2 …
#define DC_KERNEL_NO_MULT_3 …
#define DC_KERNEL_NO_MULT_4 …
#define DC_KERNEL_NO_MULT_5 …
#define DC_KERNEL_NO_MULT_6 …
#define DC_KERNEL_NO_MULT_7 …
#define DC_KERNEL_NO_MULT_8 …
#define DC_KERNEL_NO_MULT_9 …
#define DC_KERNEL_NO_MULT_10 …
#define DC_KERNEL_NO_MULT_11 …
#define DC_KERNEL_NO_MULT_12 …
#define DC_KERNEL_NO_MULT_13 …
#define DC_KERNEL_NO_MULT_14 …
#define DC_KERNEL_NO_MULT_15 …
#define DC_KERNEL_NO_MULT_16 …
#define DC_KERNEL_NO_MULT_17 …
#define DC_KERNEL_NO_MULT_18 …
#define DC_KERNEL_NO_MULT_19 …
#define DC_KERNEL_NO_MULT_20 …
#define DC_KERNEL_NO_MULT_21 …
#define DC_KERNEL_NO_MULT_22 …
#define DC_KERNEL_NO_MULT_23 …
#define DC_KERNEL_NO_MULT_24 …
#define DC_KERNEL_NO_MULT_25 …
#define DC_KERNEL_NO_MULT_26 …
#define DC_KERNEL_NO_MULT_27 …
#define DC_KERNEL_NO_MULT_28 …
#define DC_KERNEL_NO_MULT_29 …
#define DC_KERNEL_NO_MULT_30 …
#define DC_KERNEL_NO_MULT_31 …
#define DC_KERNEL_NO_MULT_32 …
#define DC_KERNEL_NO_MULT_33 …
asm volatile(
"sub sp, sp, #384\n"
"ldr w8, [%[function_params], #" STR(DP_OFFSET_DEPTH_MICRO_REPEATS) "]\n"
"str %[scratch_block_data], [sp, #376]\n"
"cmp w8, #1\n"
"str x8, [sp, #56]\n"
"b.lt " DC_KERNEL_NO_MULT_33 "f\n"
"stp xzr, xzr, [sp, #72]\n"
"ldr w8, [%[function_params], #" STR(DP_OFFSET_OUTPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n"
"str xzr, [sp, #88]\n"
"ldpsw x22, x5, [%[function_params], #" STR(DP_OFFSET_OUTPUT_HEIGHT_STRIDE) "]\n"
"ldr x11, [%[function_params], #" STR(DP_OFFSET_OUTPUT_MULTPLIPLIER_PER_CHANNEL) "]\n"
"str w8, [sp, #340]\n"
"ldr w8, [%[function_params], #" STR(DP_OFFSET_OUTPUT_WIDTH_MICRO_REPEATS) "]\n"
"ldrb w9, [%[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MAX) "]\n"
"str x11, [sp, #40]\n"
"ldr x11, [%[function_params], #" STR(DP_OFFSET_OUTPUT_SHIFT_PER_CHANNEL) "]\n"
"str w8, [sp, #344]\n"
"ldr w8, [%[function_params], #" STR(DP_OFFSET_OUTPUT_RESIDUAL_WIDTH) "]\n"
"ldrsw x7, [%[function_params]]\n"
"str x11, [sp, #32]\n"
"ldrsw x11, [%[function_params], #" STR(DP_OFFSET_INPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n"
"str w8, [sp, #348]\n"
"ldrb w8, [%[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MIN) "]\n"
"ldr x26, [sp, #376]\n"
"mov x23, %[output_block_data]\n"
"add x10, %[function_params], #" STR(DP_OFFSET_OUTPUT_OFFSET) "\n"
"dup v5.16b, w8\n"
"fmov s3, w8\n"
"lsl x8, x11, #5\n"
"dup v6.16b, w9\n"
"fmov s4, w9\n"
"str x8, [sp, #48]\n"
"add x8, x5, x26\n"
"lsl x9, x7, #1\n"
"ld1r { v0.8h }, [x10]\n"
"add x13, x5, x5, lsl #1\n"
"add x10, x22, x7\n"
"add x28, x8, #32\n"
"add x8, x23, x9\n"
"str x13, [sp, #312]\n"
"add x13, x13, x26\n"
"str x8, [sp, #360]\n"
"add x8, x23, x10\n"
"str x8, [sp, #352]\n"
"add x8, x13, #32\n"
"ldr w6, [%[function_params], #" STR(DP_OFFSET_OUTBOUND_BLOCK_HEIGHT) "]\n"
"lsl x12, x5, #2\n"
"add x11, x5, x5, lsl #2\n"
"add x24, x22, x22, lsl #1\n"
"str x8, [sp, #368]\n"
"lsl x8, x5, #1\n"
"mov %[output_block_data], %[filter_workspace]\n"
"lsl %[filter_workspace], x22, #1\n"
"stp x11, x12, [sp, #296]\n"
"add x11, x11, x26\n"
"add x12, x12, x26\n"
"add x14, x9, x7\n"
"add x15, x9, x24\n"
"stp x8, x5, [sp, #320]\n"
"add x8, x8, x26\n"
"add x10, x11, #32\n"
"add x11, x12, #32\n"
"add x19, x8, #32\n"
"add x12, x14, x24\n"
"add x13, x14, %[filter_workspace]\n"
"add x8, x14, x22\n"
"add x25, x23, x14\n"
"add x14, x23, x15\n"
"add x17, x9, x22\n"
"mov %[scratch_block_data], x19\n"
"mov x19, x14\n"
"add x14, x24, x7\n"
"add x21, x23, x17\n"
"mov w17, w6\n"
"add x15, x23, x14\n"
"add x14, %[filter_workspace], x7\n"
"add x6, x23, x12\n"
"add x12, x23, x13\n"
"add %[function_params], x23, x14\n"
"mov x14, x12\n"
"and w12, w17, #0xfffffffe\n"
"str w12, [sp, #20]\n"
"lsl x12, x7, #2\n"
"str x12, [sp, #152]\n"
"add x12, x23, x22\n"
"str x12, [sp, #144]\n"
"add x12, x23, x7\n"
"add x16, x9, %[filter_workspace]\n"
"str x12, [sp, #136]\n"
"add x12, x23, %[filter_workspace]\n"
"dup v7.8b, v3.b[0]\n"
"dup v14.8b, v4.b[0]\n"
"add x20, x23, x16\n"
"mov x13, x15\n"
"add x15, x23, x8\n"
"mov x5, %[filter_workspace]\n"
"str x12, [sp, #128]\n"
"mov x8, x24\n"
"add x12, x23, x24\n"
"mov w1, #4\n"
"stp x23, x12, [sp, #112]\n"
"str x26, [sp, #264]\n"
"str x22, [sp, #200]\n"
"str w17, [sp, #108]\n"
"str %[scratch_block_data], [sp, #96]\n"
"str x23, [sp, #24]\n"
"stp d14, d7, [sp, #160]\n"
"b " DC_KERNEL_NO_MULT_4 "f\n"
DC_KERNEL_NO_MULT_2 ":\n"
"mov %[bias_data], x9\n"
DC_KERNEL_NO_MULT_3 ":\n"
"ldr %[output_block_data], [sp, #48]\n"
"ldr x12, [sp, #264]\n"
"ldr x17, [sp, #88]\n"
"add x12, x12, %[output_block_data]\n"
"str x12, [sp, #264]\n"
"ldr x12, [sp, #112]\n"
"add x17, x17, #1\n"
"add x12, x12, #8\n"
"str x12, [sp, #112]\n"
"ldr x12, [sp, #72]\n"
"add x12, x12, %[output_block_data]\n"
"str x12, [sp, #72]\n"
"ldp x12, %[output_block_data], [sp, #56]\n"
"cmp x17, x12\n"
"ldr x12, [sp, #80]\n"
"add x12, x12, #8\n"
"stp x12, x17, [sp, #80]\n"
"ldr w17, [sp, #108]\n"
"b.eq " DC_KERNEL_NO_MULT_33 "f\n"
DC_KERNEL_NO_MULT_4 ":\n"
"ldp q16, q15, [%[output_block_data]]\n"
"ldp q17, q3, [%[output_block_data], #32]\n"
"ldp q18, q4, [%[output_block_data], #64]\n"
"cmp w17, #4\n"
"add %[output_block_data], x3, #96\n"
"str %[output_block_data], [sp, #64]\n"
"b.ne " DC_KERNEL_NO_MULT_16 "f\n"
"ldp x24, x12, [sp, #80]\n"
"ldr x17, [sp, #32]\n"
"ldr x26, [sp, #72]\n"
"mov x9, xzr\n"
"lsl w12, w12, #3\n"
"lsl x12, x12, #2\n"
"add x16, x17, x12\n"
"ldr x17, [sp, #40]\n"
"stp q4, q3, [sp, #224]\n"
"str q15, [sp, #176]\n"
"add x12, x17, x12\n"
"stp x12, x16, [sp, #208]\n"
"b " DC_KERNEL_NO_MULT_7 "f\n"
DC_KERNEL_NO_MULT_6 ":\n"
"ldp q18, q17, [sp, #224]\n"
"add x9, x9, #1\n"
"add x26, x26, #16\n"
"cmp x9, #2\n"
"add x24, x24, #4\n"
"mov v16.16b, v15.16b\n"
"b.eq " DC_KERNEL_NO_MULT_3 "b\n"
DC_KERNEL_NO_MULT_7 ":\n"
"ldr q19, [%[bias_data]], #16\n"
"ldr x16, [sp, #264]\n"
"lsl x12, x9, #4\n"
"ldr w17, [sp, #344]\n"
"mov v31.16b, v19.16b\n"
"add %[output_block_data], x16, x12\n"
"ldr x16, [sp, #216]\n"
"ldr q22, [%[output_block_data]]\n"
"mov v8.16b, v19.16b\n"
"mov v9.16b, v19.16b\n"
"ldr q20, [x16, x12]\n"
"ldr x16, [sp, #208]\n"
"mov v10.16b, v19.16b\n"
"cmp w17, #1\n"
"ldr q21, [x16, x12]\n"
"ldr x12, [sp, #328]\n"
"ldr q27, [%[output_block_data], x12]\n"
"ldr x12, [sp, #320]\n"
"ldr q26, [%[output_block_data], x12]\n"
"ldr x12, [sp, #312]\n"
".word 0x4e9a965f // sdot v31.4s, v18.16b, v26.16b\n"
"ldr q25, [%[output_block_data], x12]\n"
"ldr x12, [sp, #304]\n"
".word 0x4e9a9628 // sdot v8.4s, v17.16b, v26.16b\n"
".word 0x4e9a9609 // sdot v9.4s, v16.16b, v26.16b\n"
".word 0x4e99960a // sdot v10.4s, v16.16b, v25.16b\n"
"ldr q24, [%[output_block_data], x12]\n"
"ldr x12, [sp, #296]\n"
"ldr q23, [%[output_block_data], x12]\n"
"b.lt " DC_KERNEL_NO_MULT_11 "f\n"
"stp x24, x9, [sp, #280]\n"
"ldr w12, [sp, #344]\n"
"mov x17, x24\n"
"str x26, [sp, #272]\n"
"mov x22, x26\n"
"ldp x27, x24, [sp, #144]\n"
"ldp x26, %[filter_workspace], [sp, #128]\n"
"ldr x16, [sp, #120]\n"
"shl v28.4s, v16.4s, #8\n"
"shl v29.4s, v17.4s, #8\n"
"shl v30.4s, v18.4s, #8\n"
"mov v11.16b, v23.16b\n"
"mov v12.16b, v24.16b\n"
"mov v13.16b, v27.16b\n"
"mov v14.16b, v22.16b\n"
DC_KERNEL_NO_MULT_9 ":\n"
".word 0x4e8e961f // sdot v31.4s, v16.16b, v14.16b\n"
".word 0x4e8d9608 // sdot v8.4s, v16.16b, v13.16b\n"
".word 0x4e999629 // sdot v9.4s, v17.16b, v25.16b\n"
".word 0x4e8d963f // sdot v31.4s, v17.16b, v13.16b\n"
".word 0x4e8c962a // sdot v10.4s, v17.16b, v12.16b\n"
".word 0x4e999648 // sdot v8.4s, v18.16b, v25.16b\n"
".word 0x4e8c9649 // sdot v9.4s, v18.16b, v12.16b\n"
"sqrdmulh v31.4s, v31.4s, v21.4s\n"
".word 0x4e8b964a // sdot v10.4s, v18.16b, v11.16b\n"
"sqrdmulh v8.4s, v8.4s, v21.4s\n"
"sqrdmulh v9.4s, v9.4s, v21.4s\n"
"sqrshl v31.4s, v31.4s, v20.4s\n"
"sqrdmulh v10.4s, v10.4s, v21.4s\n"
"sqrshl v8.4s, v8.4s, v20.4s\n"
"sqrshl v9.4s, v9.4s, v20.4s\n"
"sqxtn v31.4h, v31.4s\n"
"sqrshl v10.4s, v10.4s, v20.4s\n"
"sqxtn v9.4h, v9.4s\n"
"sqxtn2 v31.8h, v8.4s\n"
"sqxtn2 v9.8h, v10.4s\n"
"sqadd v31.8h, v31.8h, v0.8h\n"
"sqadd v8.8h, v9.8h, v0.8h\n"
"sqxtn v31.8b, v31.8h\n"
"sqxtn2 v31.16b, v8.8h\n"
"smax v31.16b, v31.16b, v5.16b\n"
"add %[output_block_data], x27, x17\n"
"smin v31.16b, v31.16b, v6.16b\n"
"str s31, [x23, x17]\n"
"st1 { v31.s }[1], [%[output_block_data]]\n"
"add %[output_block_data], x26, x17\n"
"st1 { v31.s }[2], [%[output_block_data]]\n"
"add %[output_block_data], x16, x17\n"
"st1 { v31.s }[3], [%[output_block_data]]\n"
"ldr %[output_block_data], [sp, #376]\n"
"mov v10.16b, v19.16b\n"
"mov v31.16b, v19.16b\n"
"mov v8.16b, v19.16b\n"
"ldr x9, [sp, #352]\n"
".word 0x4e99978a // sdot v10.4s, v28.16b, v25.16b\n"
".word 0x4e8e979f // sdot v31.4s, v28.16b, v14.16b\n"
".word 0x4e8d9788 // sdot v8.4s, v28.16b, v13.16b\n"
".word 0x4e8c97aa // sdot v10.4s, v29.16b, v12.16b\n"
"mov v9.16b, v19.16b\n"
".word 0x4e8d97bf // sdot v31.4s, v29.16b, v13.16b\n"
".word 0x4e9a97a8 // sdot v8.4s, v29.16b, v26.16b\n"
".word 0x4e8b97ca // sdot v10.4s, v30.16b, v11.16b\n"
"add %[output_block_data], x3, x22\n"
"rev32 v2.8h, v26.8h\n"
".word 0x4e9a9789 // sdot v9.4s, v28.16b, v26.16b\n"
".word 0x4e9a97df // sdot v31.4s, v30.16b, v26.16b\n"
".word 0x4e9997c8 // sdot v8.4s, v30.16b, v25.16b\n"
"sqrdmulh v26.4s, v10.4s, v21.4s\n"
"rev32 v15.8h, v22.8h\n"
"ldr q22, [%[output_block_data], #32]\n"
"add %[output_block_data], x9, x17\n"
"rev32 v4.8h, v24.8h\n"
".word 0x4e9997a9 // sdot v9.4s, v29.16b, v25.16b\n"
"sqrdmulh v24.4s, v8.4s, v21.4s\n"
"sqrshl v8.4s, v26.4s, v20.4s\n"
"ldr q26, [%[scratch_block_data], x22]\n"
"mov x9, %[scratch_block_data]\n"
"ldr %[scratch_block_data], [sp, #368]\n"
"mov v7.16b, v6.16b\n"
"mov v6.16b, v5.16b\n"
"rev32 v5.8h, v23.8h\n"
".word 0x4e8c97c9 // sdot v9.4s, v30.16b, v12.16b\n"
"sqrdmulh v23.4s, v31.4s, v21.4s\n"
"rev32 v3.8h, v25.8h\n"
"sqrdmulh v25.4s, v9.4s, v21.4s\n"
"sqrshl v23.4s, v23.4s, v20.4s\n"
"sqrshl v31.4s, v24.4s, v20.4s\n"
"sqrshl v24.4s, v25.4s, v20.4s\n"
"sqxtn v9.4h, v23.4s\n"
"rev32 v1.8h, v27.8h\n"
"sqxtn v10.4h, v24.4s\n"
"ldr q27, [x28, x22]\n"
"ldr q25, [%[scratch_block_data], x22]\n"
"ldr q24, [x11, x22]\n"
"ldr q23, [x10, x22]\n"
"sqxtn2 v9.8h, v31.4s\n"
"sqxtn2 v10.8h, v8.4s\n"
"sqadd v31.8h, v9.8h, v0.8h\n"
"sqadd v8.8h, v10.8h, v0.8h\n"
"sqxtn v31.8b, v31.8h\n"
"sqxtn2 v31.16b, v8.8h\n"
"smax v31.16b, v31.16b, v6.16b\n"
"smin v31.16b, v31.16b, v7.16b\n"
"str s31, [%[filter_workspace], x17]\n"
"st1 { v31.s }[1], [%[output_block_data]]\n"
"add %[output_block_data], %[function_params], x17\n"
"st1 { v31.s }[2], [%[output_block_data]]\n"
"add %[output_block_data], x13, x17\n"
"mov v8.16b, v19.16b\n"
"st1 { v31.s }[3], [%[output_block_data]]\n"
"trn1 v31.8h, v15.8h, v22.8h\n"
"mov v9.16b, v19.16b\n"
"mov v10.16b, v19.16b\n"
"trn1 v1.8h, v1.8h, v27.8h\n"
"trn1 v2.8h, v2.8h, v26.8h\n"
".word 0x4e9f9608 // sdot v8.4s, v16.16b, v31.16b\n"
"mov v11.16b, v19.16b\n"
"trn1 v3.8h, v3.8h, v25.8h\n"
".word 0x4e819609 // sdot v9.4s, v16.16b, v1.16b\n"
".word 0x4e82960a // sdot v10.4s, v16.16b, v2.16b\n"
".word 0x4e819628 // sdot v8.4s, v17.16b, v1.16b\n"
"trn1 v4.8h, v4.8h, v24.8h\n"
".word 0x4e83960b // sdot v11.4s, v16.16b, v3.16b\n"
".word 0x4e829629 // sdot v9.4s, v17.16b, v2.16b\n"
".word 0x4e83962a // sdot v10.4s, v17.16b, v3.16b\n"
".word 0x4e829648 // sdot v8.4s, v18.16b, v2.16b\n"
"trn1 v5.8h, v5.8h, v23.8h\n"
".word 0x4e84962b // sdot v11.4s, v17.16b, v4.16b\n"
".word 0x4e839649 // sdot v9.4s, v18.16b, v3.16b\n"
".word 0x4e84964a // sdot v10.4s, v18.16b, v4.16b\n"
"sqrdmulh v8.4s, v8.4s, v21.4s\n"
".word 0x4e85964b // sdot v11.4s, v18.16b, v5.16b\n"
"sqrdmulh v9.4s, v9.4s, v21.4s\n"
"sqrdmulh v10.4s, v10.4s, v21.4s\n"
"sqrshl v8.4s, v8.4s, v20.4s\n"
"sqrdmulh v11.4s, v11.4s, v21.4s\n"
"sqrshl v9.4s, v9.4s, v20.4s\n"
"sqrshl v10.4s, v10.4s, v20.4s\n"
"sqxtn v8.4h, v8.4s\n"
"sqrshl v11.4s, v11.4s, v20.4s\n"
"sqxtn v10.4h, v10.4s\n"
"sqxtn2 v8.8h, v9.4s\n"
"sqxtn2 v10.8h, v11.4s\n"
"sqadd v8.8h, v8.8h, v0.8h\n"
"sqadd v9.8h, v10.8h, v0.8h\n"
"sqxtn v8.8b, v8.8h\n"
"sqxtn2 v8.16b, v9.8h\n"
"mov v9.16b, v19.16b\n"
"ldr %[scratch_block_data], [sp, #360]\n"
"mov v10.16b, v19.16b\n"
"mov v11.16b, v19.16b\n"
".word 0x4e9f9789 // sdot v9.4s, v28.16b, v31.16b\n"
"mov v12.16b, v19.16b\n"
".word 0x4e81978a // sdot v10.4s, v28.16b, v1.16b\n"
".word 0x4e82978b // sdot v11.4s, v28.16b, v2.16b\n"
".word 0x4e8197a9 // sdot v9.4s, v29.16b, v1.16b\n"
"smax v8.16b, v8.16b, v6.16b\n"
".word 0x4e83978c // sdot v12.4s, v28.16b, v3.16b\n"
".word 0x4e8297aa // sdot v10.4s, v29.16b, v2.16b\n"
".word 0x4e8397ab // sdot v11.4s, v29.16b, v3.16b\n"
".word 0x4e8297c9 // sdot v9.4s, v30.16b, v2.16b\n"
"add %[output_block_data], x21, x17\n"
"smin v8.16b, v8.16b, v7.16b\n"
".word 0x4e8497ac // sdot v12.4s, v29.16b, v4.16b\n"
".word 0x4e8397ca // sdot v10.4s, v30.16b, v3.16b\n"
".word 0x4e8497cb // sdot v11.4s, v30.16b, v4.16b\n"
"sqrdmulh v1.4s, v9.4s, v21.4s\n"
"str s8, [%[scratch_block_data], x17]\n"
"st1 { v8.s }[1], [%[output_block_data]]\n"
"add %[output_block_data], x20, x17\n"
".word 0x4e8597cc // sdot v12.4s, v30.16b, v5.16b\n"
"sqrdmulh v2.4s, v10.4s, v21.4s\n"
"sqrdmulh v3.4s, v11.4s, v21.4s\n"
"sqrshl v1.4s, v1.4s, v20.4s\n"
"st1 { v8.s }[2], [%[output_block_data]]\n"
"add %[output_block_data], x19, x17\n"
"sqrdmulh v4.4s, v12.4s, v21.4s\n"
"sqrshl v2.4s, v2.4s, v20.4s\n"
"sqrshl v3.4s, v3.4s, v20.4s\n"
"sqxtn v1.4h, v1.4s\n"
"st1 { v8.s }[3], [%[output_block_data]]\n"
"sqrshl v4.4s, v4.4s, v20.4s\n"
"sqxtn v3.4h, v3.4s\n"
"sqxtn2 v1.8h, v2.4s\n"
"sqxtn2 v3.8h, v4.4s\n"
"sqadd v1.8h, v1.8h, v0.8h\n"
"sqadd v2.8h, v3.8h, v0.8h\n"
"sqxtn v1.8b, v1.8h\n"
"mov v5.16b, v6.16b\n"
"sqxtn2 v1.16b, v2.8h\n"
"smax v1.16b, v1.16b, v5.16b\n"
"add %[output_block_data], x15, x17\n"
"smin v1.16b, v1.16b, v7.16b\n"
"str s1, [x25, x17]\n"
"st1 { v1.s }[1], [%[output_block_data]]\n"
"add %[output_block_data], x14, x17\n"
"mov v31.16b, v19.16b\n"
"mov v8.16b, v19.16b\n"
"mov v9.16b, v19.16b\n"
"mov v10.16b, v19.16b\n"
"mov %[scratch_block_data], x9\n"
"mov v6.16b, v7.16b\n"
"st1 { v1.s }[2], [%[output_block_data]]\n"
"add %[output_block_data], x6, x17\n"
"subs w12, w12, #1\n"
"add x22, x22, #32\n"
".word 0x4e9a965f // sdot v31.4s, v18.16b, v26.16b\n"
".word 0x4e9a9628 // sdot v8.4s, v17.16b, v26.16b\n"
".word 0x4e9a9609 // sdot v9.4s, v16.16b, v26.16b\n"
".word 0x4e99960a // sdot v10.4s, v16.16b, v25.16b\n"
"add x17, x17, x24\n"
"mov v11.16b, v23.16b\n"
"mov v12.16b, v24.16b\n"
"mov v13.16b, v27.16b\n"
"mov v14.16b, v22.16b\n"
"st1 { v1.s }[3], [%[output_block_data]]\n"
"b.ne " DC_KERNEL_NO_MULT_9 "b\n"
"ldr x12, [sp, #376]\n"
"ldp d14, d7, [sp, #160]\n"
"ldr q15, [sp, #176]\n"
"ldp x24, x9, [sp, #280]\n"
"add %[output_block_data], x12, x22\n"
"ldr x22, [sp, #200]\n"
"ldr x26, [sp, #272]\n"
"add x12, x23, x17\n"
"mov w1, #4\n"
"ldr w17, [sp, #348]\n"
"cmp w17, #0\n"
"b.gt " DC_KERNEL_NO_MULT_12 "f\n"
"b " DC_KERNEL_NO_MULT_6 "b\n"
DC_KERNEL_NO_MULT_11 ":\n"
"ldr x12, [sp, #112]\n"
"add x12, x12, x9, lsl #2\n"
"ldr w17, [sp, #348]\n"
"cmp w17, #0\n"
"b.le " DC_KERNEL_NO_MULT_6 "b\n"
DC_KERNEL_NO_MULT_12 ":\n"
"ldr w17, [sp, #348]\n"
"movi v28.16b, #0\n"
"movi v29.16b, #0\n"
"movi v30.16b, #0\n"
"cmp w17, #3\n"
"movi v11.16b, #0\n"
"movi v12.16b, #0\n"
"movi v13.16b, #0\n"
"b.lt " DC_KERNEL_NO_MULT_14 "f\n"
"add x17, %[output_block_data], #32\n"
"ldp x16, %[output_block_data], [sp, #320]\n"
"ldr q13, [x17]\n"
"ldr %[scratch_block_data], [sp, #96]\n"
"ldr q12, [x17, %[output_block_data]]\n"
"ldr %[output_block_data], [sp, #312]\n"
"ldr q11, [x17, x16]\n"
"ldr q30, [x17, %[output_block_data]]\n"
"ldr %[output_block_data], [sp, #304]\n"
"ldr q29, [x17, %[output_block_data]]\n"
"ldr %[output_block_data], [sp, #296]\n"
"ldr q28, [x17, %[output_block_data]]\n"
DC_KERNEL_NO_MULT_14 ":\n"
"ldr w17, [sp, #348]\n"
DC_KERNEL_NO_MULT_15 ":\n"
".word 0x4e96961f // sdot v31.4s, v16.16b, v22.16b\n"
".word 0x4e9b9608 // sdot v8.4s, v16.16b, v27.16b\n"
".word 0x4e999629 // sdot v9.4s, v17.16b, v25.16b\n"
".word 0x4e9b963f // sdot v31.4s, v17.16b, v27.16b\n"
".word 0x4e98962a // sdot v10.4s, v17.16b, v24.16b\n"
".word 0x4e999648 // sdot v8.4s, v18.16b, v25.16b\n"
".word 0x4e989649 // sdot v9.4s, v18.16b, v24.16b\n"
"sqrdmulh v1.4s, v31.4s, v21.4s\n"
".word 0x4e97964a // sdot v10.4s, v18.16b, v23.16b\n"
"sqrdmulh v2.4s, v8.4s, v21.4s\n"
"sqrdmulh v3.4s, v9.4s, v21.4s\n"
"sqrshl v1.4s, v1.4s, v20.4s\n"
"sqrdmulh v4.4s, v10.4s, v21.4s\n"
"sqrshl v2.4s, v2.4s, v20.4s\n"
"sqrshl v3.4s, v3.4s, v20.4s\n"
"sqxtn v1.4h, v1.4s\n"
"sqrshl v4.4s, v4.4s, v20.4s\n"
"sqxtn v3.4h, v3.4s\n"
"sqxtn2 v1.8h, v2.4s\n"
"sqxtn2 v3.8h, v4.4s\n"
"sqadd v1.8h, v1.8h, v0.8h\n"
"sqadd v2.8h, v3.8h, v0.8h\n"
"sqxtn v1.8b, v1.8h\n"
"sqxtn2 v1.16b, v2.8h\n"
"smax v1.16b, v1.16b, v5.16b\n"
"add %[output_block_data], x12, x22\n"
"smin v1.16b, v1.16b, v6.16b\n"
"ushr v26.4s, v26.4s, #8\n"
"ushr v25.4s, v25.4s, #8\n"
"str s1, [x12]\n"
"st1 { v1.s }[1], [%[output_block_data]]\n"
"add %[output_block_data], x12, x5\n"
"ushr v22.4s, v22.4s, #8\n"
"ushr v27.4s, v27.4s, #8\n"
"sli v26.4s, v11.4s, #24\n"
"ushr v24.4s, v24.4s, #8\n"
"ushr v23.4s, v23.4s, #8\n"
"sli v25.4s, v30.4s, #24\n"
"mov v31.16b, v19.16b\n"
"mov v8.16b, v19.16b\n"
"mov v9.16b, v19.16b\n"
"mov v10.16b, v19.16b\n"
"st1 { v1.s }[2], [%[output_block_data]]\n"
"add %[output_block_data], x12, x8\n"
"subs w17, w17, #1\n"
"sli v22.4s, v13.4s, #24\n"
"ushr v13.4s, v13.4s, #8\n"
"ushr v11.4s, v11.4s, #8\n"
"sli v27.4s, v12.4s, #24\n"
"ushr v12.4s, v12.4s, #8\n"
"ushr v30.4s, v30.4s, #8\n"
"sli v24.4s, v29.4s, #24\n"
"ushr v29.4s, v29.4s, #8\n"
"sli v23.4s, v28.4s, #24\n"
"ushr v28.4s, v28.4s, #8\n"
".word 0x4e9a965f // sdot v31.4s, v18.16b, v26.16b\n"
".word 0x4e9a9628 // sdot v8.4s, v17.16b, v26.16b\n"
".word 0x4e9a9609 // sdot v9.4s, v16.16b, v26.16b\n"
"add x12, x12, x7\n"
".word 0x4e99960a // sdot v10.4s, v16.16b, v25.16b\n"
"st1 { v1.s }[3], [%[output_block_data]]\n"
"b.ne " DC_KERNEL_NO_MULT_15 "b\n"
"b " DC_KERNEL_NO_MULT_6 "b\n"
DC_KERNEL_NO_MULT_16 ":\n"
"cmp w17, #1\n"
"add x9, %[bias_data], #32\n"
"b.lt " DC_KERNEL_NO_MULT_2 "b\n"
"ldr w12, [sp, #340]\n"
"cmp w12, #1\n"
"b.lt " DC_KERNEL_NO_MULT_27 "f\n"
"ldr x12, [sp, #88]\n"
"ldp x17, %[output_block_data], [sp, #32]\n"
"str x9, [sp, #288]\n"
"ldp q19, q20, [%[bias_data]]\n"
"lsl w12, w12, #3\n"
"lsl x12, x12, #2\n"
"add x17, x17, x12\n"
"add x12, %[output_block_data], x12\n"
"ldp q21, q22, [x17]\n"
"ldp q23, q24, [x12]\n"
"ldr x9, [sp, #264]\n"
"ldr x27, [sp, #112]\n"
"mov w26, wzr\n"
"b " DC_KERNEL_NO_MULT_20 "f\n"
DC_KERNEL_NO_MULT_19 ":\n"
"ldr w12, [sp, #108]\n"
"ldr x22, [sp, #200]\n"
"add w26, w26, #1\n"
"cmp w26, w12\n"
"add x27, x27, x22\n"
"b.eq " DC_KERNEL_NO_MULT_26 "f\n"
DC_KERNEL_NO_MULT_20 ":\n"
"ldp x16, %[output_block_data], [sp, #320]\n"
"ldp q25, q26, [x9]\n"
"mov w12, wzr\n"
"mov x17, x9\n"
"add %[scratch_block_data], x9, %[output_block_data]\n"
"add %[output_block_data], x9, x16\n"
"ldp q27, q28, [%[scratch_block_data]]\n"
"ldp q29, q30, [%[output_block_data]]\n"
"mov x9, %[scratch_block_data]\n"
"mov x22, x27\n"
"b " DC_KERNEL_NO_MULT_22 "f\n"
DC_KERNEL_NO_MULT_21 ":\n"
"ldr w16, [sp, #340]\n"
"add w12, w12, #1\n"
"mov x17, %[scratch_block_data]\n"
"cmp w12, w16\n"
"b.eq " DC_KERNEL_NO_MULT_19 "b\n"
DC_KERNEL_NO_MULT_22 ":\n"
"ldr w16, [sp, #344]\n"
"add %[scratch_block_data], x17, #32\n"
"cmp w12, w16\n"
"ldr w16, [sp, #348]\n"
"csel w3, w16, w1, eq\n"
"cmp w3, #3\n"
"b.ge " DC_KERNEL_NO_MULT_24 "f\n"
"movi v31.16b, #0\n"
"cmp w3, #1\n"
"movi v8.16b, #0\n"
"movi v9.16b, #0\n"
"movi v11.16b, #0\n"
"movi v12.16b, #0\n"
"movi v10.16b, #0\n"
"b.ge " DC_KERNEL_NO_MULT_25 "f\n"
"b " DC_KERNEL_NO_MULT_21 "b\n"
DC_KERNEL_NO_MULT_24 ":\n"
"ldr x24, [sp, #328]\n"
"mov x16, x11\n"
"mov x11, x10\n"
"mov x10, %[scratch_block_data]\n"
"add x24, %[scratch_block_data], x24\n"
"ldr %[scratch_block_data], [sp, #320]\n"
"ldp q10, q9, [x17, #32]\n"
"ldp q12, q8, [x24]\n"
"mov x23, x15\n"
"add %[scratch_block_data], x10, x0\n"
"ldp q11, q31, [%[scratch_block_data]]\n"
"mov x15, x14\n"
"mov x14, x6\n"
"mov %[bias_data], x13\n"
"mov x13, x21\n"
"mov x21, x20\n"
"mov x20, x19\n"
"mov x19, x25\n"
"mov x19, x20\n"
"mov x20, x21\n"
"mov x21, x13\n"
"mov x13, %[bias_data]\n"
"mov x14, x15\n"
"mov x15, x23\n"
"mov %[scratch_block_data], x10\n"
"mov x10, x11\n"
"mov x11, x16\n"
DC_KERNEL_NO_MULT_25 ":\n"
"mov v1.16b, v19.16b\n"
"mov v2.16b, v20.16b\n"
".word 0x4e999601 // sdot v1.4s, v16.16b, v25.16b\n"
".word 0x4e9a95e2 // sdot v2.4s, v15.16b, v26.16b\n"
".word 0x4e9b9621 // sdot v1.4s, v17.16b, v27.16b\n"
".word 0x4e9c9462 // sdot v2.4s, v3.16b, v28.16b\n"
".word 0x4e9d9641 // sdot v1.4s, v18.16b, v29.16b\n"
".word 0x4e9e9482 // sdot v2.4s, v4.16b, v30.16b\n"
"sqrdmulh v1.4s, v1.4s, v23.4s\n"
"sqrdmulh v2.4s, v2.4s, v24.4s\n"
"sqrshl v1.4s, v1.4s, v21.4s\n"
"sqrshl v2.4s, v2.4s, v22.4s\n"
"sqxtn v1.4h, v1.4s\n"
"sqxtn2 v1.8h, v2.4s\n"
"sqadd v1.8h, v1.8h, v0.8h\n"
"sqxtn v1.8b, v1.8h\n"
"smax v1.8b, v1.8b, v7.8b\n"
"ushr v25.4s, v25.4s, #8\n"
"ushr v26.4s, v26.4s, #8\n"
"ushr v27.4s, v27.4s, #8\n"
"ushr v28.4s, v28.4s, #8\n"
"ushr v29.4s, v29.4s, #8\n"
"ushr v30.4s, v30.4s, #8\n"
"smin v1.8b, v1.8b, v14.8b\n"
"subs w3, w3, #1\n"
"sli v25.4s, v10.4s, #24\n"
"ushr v10.4s, v10.4s, #8\n"
"sli v26.4s, v9.4s, #24\n"
"ushr v9.4s, v9.4s, #8\n"
"sli v27.4s, v12.4s, #24\n"
"ushr v12.4s, v12.4s, #8\n"
"sli v28.4s, v8.4s, #24\n"
"ushr v8.4s, v8.4s, #8\n"
"sli v29.4s, v11.4s, #24\n"
"ushr v11.4s, v11.4s, #8\n"
"sli v30.4s, v31.4s, #24\n"
"ushr v31.4s, v31.4s, #8\n"
"str d1, [x22]\n"
"add x22, x22, x7\n"
"b.ne " DC_KERNEL_NO_MULT_25 "b\n"
"b " DC_KERNEL_NO_MULT_21 "b\n"
DC_KERNEL_NO_MULT_26 ":\n"
"ldr %[bias_data], [sp, #288]\n"
"ldr x23, [sp, #24]\n"
"ldr %[scratch_block_data], [sp, #96]\n"
"b " DC_KERNEL_NO_MULT_3 "b\n"
DC_KERNEL_NO_MULT_27 ":\n"
"ldr w12, [sp, #20]\n"
"cmp w17, #2\n"
"b.hs " DC_KERNEL_NO_MULT_29 "f\n"
"mov w12, wzr\n"
"b " DC_KERNEL_NO_MULT_31 "f\n"
DC_KERNEL_NO_MULT_29 ":\n"
"subs w12, w12, #2\n"
"b.ne " DC_KERNEL_NO_MULT_29 "b\n"
"ldr w12, [sp, #20]\n"
"cmp w17, w12\n"
"b.eq " DC_KERNEL_NO_MULT_2 "b\n"
DC_KERNEL_NO_MULT_31 ":\n"
"sub w12, w17, w12\n"
DC_KERNEL_NO_MULT_32 ":\n"
"subs w12, w12, #1\n"
"b.ne " DC_KERNEL_NO_MULT_32 "b\n"
"b " DC_KERNEL_NO_MULT_2 "b\n"
DC_KERNEL_NO_MULT_33 ":\n"
"add sp, sp, #384\n"
:
[ scratch_block_data ] "+r"(scratch_block_data),
[ filter_workspace ] "+r"(filter_workspace),
[ bias_data ] "+r"(bias_data),
[ output_block_data ] "+r"(output_block_data)
:
[ function_params ] "r"(function_params)
:
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31",
"x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
"x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26",
"x27", "x28");
#undef DC_KERNEL_NO_MULT_1
#undef DC_KERNEL_NO_MULT_2
#undef DC_KERNEL_NO_MULT_3
#undef DC_KERNEL_NO_MULT_4
#undef DC_KERNEL_NO_MULT_5
#undef DC_KERNEL_NO_MULT_6
#undef DC_KERNEL_NO_MULT_7
#undef DC_KERNEL_NO_MULT_8
#undef DC_KERNEL_NO_MULT_9
#undef DC_KERNEL_NO_MULT_10
#undef DC_KERNEL_NO_MULT_11
#undef DC_KERNEL_NO_MULT_12
#undef DC_KERNEL_NO_MULT_13
#undef DC_KERNEL_NO_MULT_14
#undef DC_KERNEL_NO_MULT_15
#undef DC_KERNEL_NO_MULT_16
#undef DC_KERNEL_NO_MULT_17
#undef DC_KERNEL_NO_MULT_18
#undef DC_KERNEL_NO_MULT_19
#undef DC_KERNEL_NO_MULT_20
#undef DC_KERNEL_NO_MULT_21
#undef DC_KERNEL_NO_MULT_22
#undef DC_KERNEL_NO_MULT_23
#undef DC_KERNEL_NO_MULT_24
#undef DC_KERNEL_NO_MULT_25
#undef DC_KERNEL_NO_MULT_26
#undef DC_KERNEL_NO_MULT_27
#undef DC_KERNEL_NO_MULT_28
#undef DC_KERNEL_NO_MULT_29
#undef DC_KERNEL_NO_MULT_30
#undef DC_KERNEL_NO_MULT_31
#undef DC_KERNEL_NO_MULT_32
#undef DC_KERNEL_NO_MULT_33
}
// Forwards all five arguments, unchanged and in the same order, to
// KernelMacroBlockNeon; it adds no logic of its own.
static inline void Run(const int8* scratch_block_data,
const int8* filter_workspace, const int32* bias_data,
int8* output_block_data,
const DepthwiseConvDotProdParams* function_params) {
KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
output_block_data, function_params);
}
};
// Specialization of KernelMacroBlock for the NEON 3x3 dot-product
// implementation with per-channel int8 quantization and no depth
// multiplication.
// NOTE(review): the last template argument (2) is presumably the stride,
// judging by the DC_KERNEL_NO_MULT_STRIDE_* label names - confirm against the
// primary template.
template <>
struct KernelMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
QuantizationType::kPerChannelInt8,
DepthwiseConvDepthMultiplication::kNoMultiplication,
2> {
// Runs the whole macro block as a single hand-scheduled AArch64 inline-assembly
// routine.
//
// Operands:
//   scratch_block_data: base of the input rows; spilled to [sp, #168] and then
//                       reused as a scratch register.
//   filter_workspace:   filter data, read at 96 bytes per depth iteration.
//   bias_data:          int32 bias values, advanced 32 bytes per depth
//                       iteration.
//   output_block_data:  base of the int8 output that the kernel stores to.
//   function_params:    parameter block, read through the DP_OFFSET_* offsets.
//
// SDOT is emitted as a raw `.word` encoding with the intended mnemonic kept in
// a trailing assembler comment.
//
// NOTE(review): the assembly names x0, x1 and w4 directly although they are
// not in the clobber list, and it writes %[function_params] although that
// operand is declared input-only ("r"). This looks like it relies on the five
// operands being allocated to x0..x4 in declaration order - confirm before
// changing the operand list or how this function is inlined.
static inline void KernelMacroBlockNeon(
const int8* scratch_block_data, const int8* filter_workspace,
const int32* bias_data, int8* output_block_data,
const DepthwiseConvDotProdParams* function_params) {
// Local assembler labels used by the asm block below; all are #undef'd after
// it.
#define DC_KERNEL_NO_MULT_STRIDE_1 …
#define DC_KERNEL_NO_MULT_STRIDE_2 …
#define DC_KERNEL_NO_MULT_STRIDE_3 …
#define DC_KERNEL_NO_MULT_STRIDE_4 …
#define DC_KERNEL_NO_MULT_STRIDE_5 …
#define DC_KERNEL_NO_MULT_STRIDE_6 …
#define DC_KERNEL_NO_MULT_STRIDE_7 …
#define DC_KERNEL_NO_MULT_STRIDE_8 …
#define DC_KERNEL_NO_MULT_STRIDE_9 …
#define DC_KERNEL_NO_MULT_STRIDE_10 …
#define DC_KERNEL_NO_MULT_STRIDE_11 …
#define DC_KERNEL_NO_MULT_STRIDE_12 …
#define DC_KERNEL_NO_MULT_STRIDE_13 …
#define DC_KERNEL_NO_MULT_STRIDE_14 …
#define DC_KERNEL_NO_MULT_STRIDE_15 …
#define DC_KERNEL_NO_MULT_STRIDE_16 …
#define DC_KERNEL_NO_MULT_STRIDE_17 …
#define DC_KERNEL_NO_MULT_STRIDE_18 …
#define DC_KERNEL_NO_MULT_STRIDE_19 …
#define DC_KERNEL_NO_MULT_STRIDE_20 …
#define DC_KERNEL_NO_MULT_STRIDE_21 …
#define DC_KERNEL_NO_MULT_STRIDE_22 …
#define DC_KERNEL_NO_MULT_STRIDE_23 …
#define DC_KERNEL_NO_MULT_STRIDE_24 …
#define DC_KERNEL_NO_MULT_STRIDE_25 …
#define DC_KERNEL_NO_MULT_STRIDE_26 …
#define DC_KERNEL_NO_MULT_STRIDE_27 …
#define DC_KERNEL_NO_MULT_STRIDE_28 …
#define DC_KERNEL_NO_MULT_STRIDE_29 …
#define DC_KERNEL_NO_MULT_STRIDE_30 …
#define DC_KERNEL_NO_MULT_STRIDE_31 …
#define DC_KERNEL_NO_MULT_STRIDE_32 …
#define DC_KERNEL_NO_MULT_STRIDE_33 …
#define DC_KERNEL_NO_MULT_STRIDE_34 …
#define DC_KERNEL_NO_MULT_STRIDE_35 …
asm volatile(
// Prologue: reserve 176 bytes of stack for spills, keep the scratch base at
// [sp, #168], and leave immediately when depth_micro_repeats (w23) < 1.
"sub sp, sp, #176\n"
"ldr w23, [%[function_params], #" STR(DP_OFFSET_DEPTH_MICRO_REPEATS) "]\n"
"str %[scratch_block_data], [sp, #168]\n"
"cmp w23, #1\n"
"b.lt " DC_KERNEL_NO_MULT_STRIDE_35 "f\n"
// Parameter loads. Spill slots written here:
//   [sp, #144] per-channel output multiplier pointer
//   [sp, #136] per-channel output shift pointer
//   [sp, #164] outbound block height
// x11/x12 are the two sign-extended 32-bit fields starting at
// DP_OFFSET_OUTPUT_HEIGHT_STRIDE; below, x11 separates the two output rows
// and x12 separates consecutive input rows.
"ldr x8, [%[function_params], #" STR(DP_OFFSET_OUTPUT_MULTPLIPLIER_PER_CHANNEL) "]\n"
"ldpsw x11, x12, [%[function_params], #" STR(DP_OFFSET_OUTPUT_HEIGHT_STRIDE) "]\n"
"ldp w13, w0, [%[function_params], #" STR(DP_OFFSET_OUTPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n"
"ldr w5, [%[function_params], #" STR(DP_OFFSET_OUTBOUND_BLOCK_HEIGHT) "]\n"
"str x8, [sp, #144]\n"
"ldr x8, [%[function_params], #" STR(DP_OFFSET_OUTPUT_SHIFT_PER_CHANNEL) "]\n"
// x14 = 64-bit field at offset 0 of the parameter block; its low 32 bits
// become the output column step x19 and, doubled, x22.
"ldr x14, [%[function_params]]\n"
"str w5, [sp, #164]\n"
"add x15, %[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MIN) "\n"
"str x8, [sp, #136]\n"
"add x16, %[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MAX) "\n"
"add x17, %[function_params], #" STR(DP_OFFSET_OUTPUT_OFFSET) "\n"
"ldrsw x8, [%[function_params], #" STR(DP_OFFSET_INPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n"
// w5 = output residual width, w4 = the 32-bit field that follows it.
"ldp w5, w4, [%[function_params], #" STR(DP_OFFSET_OUTPUT_RESIDUAL_WIDTH) "]\n"
// Broadcast the output offset into v0 (16-bit lanes) and the activation
// min/max into v1/v2 (8-bit lanes).
"ld1r { v0.8h }, [x17]\n"
"ld1r { v1.8b }, [x15]\n"
"ld1r { v2.8b }, [x16]\n"
// w6 = (w5 == 1 && w0 < w13) ? w0 : w13: the trip count of the main
// two-outputs-per-iteration loops of the two-row path.
"cmp w5, #1\n"
"ccmp w0, w13, #0, eq\n"
"lsl w15, w14, #1\n"
"csel w6, w0, w13, lt\n"
// [sp, #152] = input_width_overall_micro_repeats * 32: scratch byte step
// between depth iterations.
"lsl x8, x8, #5\n"
"sxtw x19, w14\n"
"sxtw x22, w15\n"
// w14 = max(w6, 0).
"bic w14, w6, w6, asr #31\n"
"str x8, [sp, #152]\n"
// x7, x21, x20 = 2x, 3x, 4x the input row offset x12.
"lsl x7, x12, #1\n"
"madd x8, x22, x14, %[output_block_data]\n"
"mov x9, xzr\n"
"mov x10, xzr\n"
"lsl x20, x12, #2\n"
"add x21, x7, x12\n"
// [sp, #128] = w13 - max(w6, 0): trip count of the remainder loops.
"sub x14, x13, x14\n"
// [sp, #48] = output pointer just past the main-loop columns,
// [sp, #56] = depth_micro_repeats, [sp, #8] = [sp, #48] + 4.
"stp x8, x23, [sp, #48]\n"
"add x8, x8, #4\n"
"str w4, [sp, #44]\n"
// The register stored here was overwritten by the `ldp w13, w0` above if the
// operand lives in x0, so [sp, #32] would then hold that second loaded field
// rather than the scratch pointer; it is reloaded into x25 as a countdown.
// NOTE(review): depends on the x0 allocation assumption described above.
"str %[scratch_block_data], [sp, #32]\n"
"str x14, [sp, #128]\n"
"str x8, [sp, #8]\n"
"b " DC_KERNEL_NO_MULT_STRIDE_4 "f\n"
// Shared tail of the two-row path: x27 = next bias pointer, and v12/v9/v10/v7
// are copied back into v19/v3/v5/v20.
DC_KERNEL_NO_MULT_STRIDE_2 ":\n"
"add x27, %[bias_data], #32\n"
"mov v19.16b, v12.16b\n"
"mov v3.16b, v9.16b\n"
"mov v5.16b, v10.16b\n"
"mov v20.16b, v7.16b\n"
// Advance to the next depth iteration: x10 is the iteration index and
// x9 = 8 * x10. Leave once x10 reaches depth_micro_repeats (x23).
DC_KERNEL_NO_MULT_STRIDE_3 ":\n"
"add x10, x10, #1\n"
"cmp x10, x23\n"
"add x9, x9, #8\n"
"mov %[bias_data], x27\n"
"b.eq " DC_KERNEL_NO_MULT_STRIDE_35 "f\n"
// Top of the depth loop. x26 = filter_workspace + 96 * x10 and
// x28 = scratch base + x10 * [sp, #152]. An outbound block height of 2 takes
// the two-output-row path that follows; any other value takes the
// one-output-row path at DC_KERNEL_NO_MULT_STRIDE_14.
DC_KERNEL_NO_MULT_STRIDE_4 ":\n"
"ldr w8, [sp, #164]\n"
"add w14, w10, w10, lsl #1\n"
"lsl w14, w14, #5\n"
"add x26, %[filter_workspace], x14\n"
"cmp w8, #2\n"
"ldr x8, [sp, #168]\n"
"ldr x14, [sp, #152]\n"
"nop\n"
"madd x28, x10, x14, x8\n"
"b.ne " DC_KERNEL_NO_MULT_STRIDE_14 "f\n"
// Two-row path, first 16-byte half: filter rows q22-q24 (x26 + 0/32/64),
// bias q25, shift q26, multiplier q27 and five input rows q31, q8, q30, q29,
// q28 at x28 + {0, 1, 2, 3, 4} * x12.
"ldr x8, [sp, #136]\n"
"ubfx x14, x9, #3, #29\n"
"lsl w15, w10, #3\n"
"lsl x27, x14, #3\n"
"lsl x14, x15, #2\n"
"add x24, x8, x14\n"
"ldr x8, [sp, #144]\n"
"ldr q22, [x26]\n"
"ldr q23, [x26, #32]\n"
"ldr q24, [x26, #64]\n"
"add x14, x8, x14\n"
"ldr x8, [sp, #48]\n"
"ldr q25, [%[bias_data]]\n"
"ldr q31, [x28]\n"
"ldr q8, [x28, x12]\n"
"ldr q30, [x28, x7]\n"
"ldr q29, [x28, x21]\n"
"ldr q26, [x24]\n"
"ldr q27, [x14]\n"
"ldr q28, [x28, x20]\n"
"add x25, x8, x27\n"
"cmp w6, #1\n"
// %[function_params] is reused from here on as the output pointer for this
// depth slice (output base + 8 * x10).
"add %[function_params], %[output_block_data], x15\n"
"mov v12.16b, v19.16b\n"
"mov v7.16b, v20.16b\n"
"b.lt " DC_KERNEL_NO_MULT_STRIDE_23 "f\n"
// filter_workspace is parked in x8 while its register serves as a store
// pointer inside the loop; x23 counts w6 iterations.
"mov v4.16b, v21.16b\n"
"mov x8, %[filter_workspace]\n"
"mov w15, wzr\n"
"mov x16, xzr\n"
"add x17, x28, #32\n"
"mov x23, x6\n"
"mov v17.16b, v30.16b\n"
// Main loop, first half. Each iteration writes two 4-byte outputs (x19 bytes
// apart) to each of two output rows (x11 bytes apart) and loads 16 bytes of
// every input row from the next 32-byte group.
DC_KERNEL_NO_MULT_STRIDE_7 ":\n"
// Accumulators start from the bias; v18 uses input rows 0-2, v19 rows 2-4.
"mov v18.16b, v25.16b\n"
"mov v19.16b, v25.16b\n"
".word 0x4e9f96d2 // sdot v18.4s, v22.16b, v31.16b\n"
".word 0x4e9196d3 // sdot v19.4s, v22.16b, v17.16b\n"
".word 0x4e8896f2 // sdot v18.4s, v23.16b, v8.16b\n"
".word 0x4e9d96f3 // sdot v19.4s, v23.16b, v29.16b\n"
".word 0x4e919712 // sdot v18.4s, v24.16b, v17.16b\n"
".word 0x4e9c9713 // sdot v19.4s, v24.16b, v28.16b\n"
// Requantize: saturating rounding doubling multiply-high by the multiplier
// (v27), rounding shift by v26, narrow to 16 bits, add the output offset,
// narrow to 8 bits, clamp to [v1, v2].
"sqrdmulh v18.4s, v18.4s, v27.4s\n"
"and %[scratch_block_data], x16, #0xffffffe0\n"
"sqrdmulh v19.4s, v19.4s, v27.4s\n"
"sqrshl v18.4s, v18.4s, v26.4s\n"
"add %[scratch_block_data], x17, x0\n"
"sqrshl v19.4s, v19.4s, v26.4s\n"
"sqxtn v18.4h, v18.4s\n"
// rev32 swaps the two halfwords of every 32-bit lane; the later trn1 pairs
// the old upper halfword with the lower halfword of the freshly loaded data,
// i.e. the window moves on by two bytes per lane.
"rev32 v20.8h, v31.8h\n"
"rev32 v21.8h, v8.8h\n"
"rev32 v9.8h, v30.8h\n"
"rev32 v10.8h, v29.8h\n"
"ldr q31, [%[scratch_block_data]]\n"
"ldr q8, [%[scratch_block_data], x12]\n"
"ldr q30, [%[scratch_block_data], x7]\n"
"ldr q29, [%[scratch_block_data], x21]\n"
"rev32 v17.8h, v28.8h\n"
"ldr q28, [%[scratch_block_data], x20]\n"
"sqxtn2 v18.8h, v19.4s\n"
"sqadd v18.8h, v18.8h, v0.8h\n"
"sqxtn v18.8b, v18.8h\n"
"add %[filter_workspace], %[function_params], w15, sxtw\n"
"smax v18.8b, v18.8b, v1.8b\n"
"add %[scratch_block_data], %[filter_workspace], x11\n"
"smin v18.8b, v18.8b, v2.8b\n"
"mov v11.16b, v25.16b\n"
// Lane 0 goes to the first output row, lane 1 to the row x11 bytes further.
"str s18, [%[filter_workspace]]\n"
"st1 { v18.s }[1], [%[scratch_block_data]]\n"
// Second output of the iteration, computed from the shifted windows.
"trn1 v18.8h, v20.8h, v31.8h\n"
"mov v19.16b, v25.16b\n"
"trn1 v20.8h, v21.8h, v8.8h\n"
"trn1 v21.8h, v9.8h, v30.8h\n"
".word 0x4e9296cb // sdot v11.4s, v22.16b, v18.16b\n"
"trn1 v9.8h, v10.8h, v29.8h\n"
".word 0x4e9596d3 // sdot v19.4s, v22.16b, v21.16b\n"
".word 0x4e9496eb // sdot v11.4s, v23.16b, v20.16b\n"
"trn1 v17.8h, v17.8h, v28.8h\n"
".word 0x4e8996f3 // sdot v19.4s, v23.16b, v9.16b\n"
".word 0x4e95970b // sdot v11.4s, v24.16b, v21.16b\n"
".word 0x4e919713 // sdot v19.4s, v24.16b, v17.16b\n"
"sqrdmulh v17.4s, v11.4s, v27.4s\n"
"sqrdmulh v18.4s, v19.4s, v27.4s\n"
"sqrshl v17.4s, v17.4s, v26.4s\n"
"sqrshl v18.4s, v18.4s, v26.4s\n"
"sqxtn v17.4h, v17.4s\n"
"sqxtn2 v17.8h, v18.4s\n"
"sqadd v17.8h, v17.8h, v0.8h\n"
"sqxtn v17.8b, v17.8h\n"
"add %[filter_workspace], x1, x19\n"
"smax v17.8b, v17.8b, v1.8b\n"
"add %[scratch_block_data], %[filter_workspace], x11\n"
"smin v17.8b, v17.8b, v2.8b\n"
"add x16, x16, #32\n"
"subs x23, x23, #1\n"
"str s17, [%[filter_workspace]]\n"
"st1 { v17.s }[1], [%[scratch_block_data]]\n"
"add w15, w15, w22\n"
"mov v17.16b, v30.16b\n"
"b.ne " DC_KERNEL_NO_MULT_STRIDE_7 "b\n"
// Keep copies of the last loaded rows for the remainder loop, restore
// filter_workspace from x8, and skip the remainder loop when w6 >= w13.
"mov v6.16b, v31.16b\n"
"mov v15.16b, v8.16b\n"
"mov v14.16b, v30.16b\n"
"mov v13.16b, v29.16b\n"
"mov v11.16b, v28.16b\n"
"mov w15, w6\n"
"mov %[filter_workspace], x8\n"
"mov v21.16b, v4.16b\n"
"cmp w15, w13\n"
"ldr x15, [sp, #128]\n"
"b.ge " DC_KERNEL_NO_MULT_STRIDE_10 "f\n"
// Remainder loop, first half: [sp, #128] iterations, one 4-byte output per
// row per iteration, stored through x25 (advanced by x22). No new input is
// loaded; the window is refilled from the saved copies in v6, v15, v14, v13
// and v11.
DC_KERNEL_NO_MULT_STRIDE_9 ":\n"
"mov v9.16b, v25.16b\n"
"mov v10.16b, v25.16b\n"
".word 0x4e9f96c9 // sdot v9.4s, v22.16b, v31.16b\n"
".word 0x4e8896e9 // sdot v9.4s, v23.16b, v8.16b\n"
".word 0x4e9e96ca // sdot v10.4s, v22.16b, v30.16b\n"
".word 0x4e9e9709 // sdot v9.4s, v24.16b, v30.16b\n"
".word 0x4e9d96ea // sdot v10.4s, v23.16b, v29.16b\n"
".word 0x4e9c970a // sdot v10.4s, v24.16b, v28.16b\n"
"sqrdmulh v9.4s, v9.4s, v27.4s\n"
"sqrdmulh v10.4s, v10.4s, v27.4s\n"
"sqrshl v9.4s, v9.4s, v26.4s\n"
"sqrshl v10.4s, v10.4s, v26.4s\n"
"sqxtn v9.4h, v9.4s\n"
"sqxtn2 v9.8h, v10.4s\n"
"sqadd v9.8h, v9.8h, v0.8h\n"
"sqxtn v9.8b, v9.8h\n"
"smax v9.8b, v9.8b, v1.8b\n"
"rev32 v31.8h, v31.8h\n"
"rev32 v8.8h, v8.8h\n"
"rev32 v30.8h, v30.8h\n"
"rev32 v29.8h, v29.8h\n"
"rev32 v28.8h, v28.8h\n"
"smin v9.8b, v9.8b, v2.8b\n"
"add x16, x25, x11\n"
"subs x15, x15, #1\n"
"trn1 v31.8h, v31.8h, v6.8h\n"
"trn1 v8.8h, v8.8h, v15.8h\n"
"trn1 v29.8h, v29.8h, v13.8h\n"
"trn1 v30.8h, v30.8h, v14.8h\n"
"trn1 v28.8h, v28.8h, v11.8h\n"
"str s9, [x25]\n"
"add x25, x25, x22\n"
"st1 { v9.s }[1], [x16]\n"
"b.ne " DC_KERNEL_NO_MULT_STRIDE_9 "b\n"
// Two-row path, second 16-byte half: everything is reloaded 16 bytes further
// on (x28 is pre-incremented by 16). Input rows 0-4 now live in v29, v8, v31,
// v30 and v28. x23 gets depth_micro_repeats back from [sp, #56].
DC_KERNEL_NO_MULT_STRIDE_10 ":\n"
"ldr q22, [x26, #16]\n"
"ldr q23, [x26, #48]\n"
"ldr q24, [x26, #80]\n"
"ldr q29, [x28, #16]!\n"
"ldr q25, [%[bias_data], #16]\n"
"ldr q26, [x24, #16]\n"
"ldr q27, [x14, #16]\n"
"ldr q8, [x28, x12]\n"
"ldr q31, [x28, x7]\n"
"ldr q30, [x28, x21]\n"
"ldr q28, [x28, x20]\n"
"ldr x23, [sp, #56]\n"
"cmp w6, #0\n"
"mov v10.16b, v5.16b\n"
"b.le " DC_KERNEL_NO_MULT_STRIDE_24 "f\n"
// x17 = output pointer of this depth slice + 4 (second group of 4 channels);
// %[function_params] becomes the loop counter (w6 iterations).
"mov v6.16b, v21.16b\n"
"mov v9.16b, v3.16b\n"
"mov w14, wzr\n"
"mov x15, xzr\n"
"add x16, x28, #32\n"
"add x17, %[function_params], #4\n"
"mov %[function_params], x6\n"
"mov v17.16b, v31.16b\n"
// Main loop, second half: same structure as DC_KERNEL_NO_MULT_STRIDE_7.
DC_KERNEL_NO_MULT_STRIDE_12 ":\n"
"mov v3.16b, v25.16b\n"
"mov v4.16b, v25.16b\n"
".word 0x4e9d96c3 // sdot v3.4s, v22.16b, v29.16b\n"
".word 0x4e9196c4 // sdot v4.4s, v22.16b, v17.16b\n"
".word 0x4e8896e3 // sdot v3.4s, v23.16b, v8.16b\n"
".word 0x4e9e96e4 // sdot v4.4s, v23.16b, v30.16b\n"
".word 0x4e919703 // sdot v3.4s, v24.16b, v17.16b\n"
".word 0x4e9c9704 // sdot v4.4s, v24.16b, v28.16b\n"
"sqrdmulh v3.4s, v3.4s, v27.4s\n"
"and %[scratch_block_data], x15, #0xffffffe0\n"
"sqrdmulh v4.4s, v4.4s, v27.4s\n"
"sqrshl v3.4s, v3.4s, v26.4s\n"
"add %[scratch_block_data], x16, x0\n"
"sqrshl v4.4s, v4.4s, v26.4s\n"
"sqxtn v3.4h, v3.4s\n"
"rev32 v5.8h, v29.8h\n"
"rev32 v18.8h, v8.8h\n"
"rev32 v19.8h, v31.8h\n"
"rev32 v20.8h, v30.8h\n"
"ldr q29, [%[scratch_block_data]]\n"
"ldr q8, [%[scratch_block_data], x12]\n"
"ldr q31, [%[scratch_block_data], x7]\n"
"ldr q30, [%[scratch_block_data], x21]\n"
"rev32 v17.8h, v28.8h\n"
"ldr q28, [%[scratch_block_data], x20]\n"
"sqxtn2 v3.8h, v4.4s\n"
"sqadd v3.8h, v3.8h, v0.8h\n"
"sqxtn v3.8b, v3.8h\n"
"add x8, x17, w14, sxtw\n"
"smax v3.8b, v3.8b, v1.8b\n"
"add %[scratch_block_data], x8, x11\n"
"smin v3.8b, v3.8b, v2.8b\n"
"mov v21.16b, v25.16b\n"
"str s3, [x8]\n"
"st1 { v3.s }[1], [%[scratch_block_data]]\n"
"trn1 v3.8h, v5.8h, v29.8h\n"
"mov v4.16b, v25.16b\n"
"trn1 v5.8h, v18.8h, v8.8h\n"
"trn1 v18.8h, v19.8h, v31.8h\n"
".word 0x4e8396d5 // sdot v21.4s, v22.16b, v3.16b\n"
"trn1 v19.8h, v20.8h, v30.8h\n"
".word 0x4e9296c4 // sdot v4.4s, v22.16b, v18.16b\n"
".word 0x4e8596f5 // sdot v21.4s, v23.16b, v5.16b\n"
"trn1 v17.8h, v17.8h, v28.8h\n"
".word 0x4e9396e4 // sdot v4.4s, v23.16b, v19.16b\n"
".word 0x4e929715 // sdot v21.4s, v24.16b, v18.16b\n"
".word 0x4e919704 // sdot v4.4s, v24.16b, v17.16b\n"
"sqrdmulh v3.4s, v21.4s, v27.4s\n"
"sqrdmulh v4.4s, v4.4s, v27.4s\n"
"sqrshl v3.4s, v3.4s, v26.4s\n"
"sqrshl v4.4s, v4.4s, v26.4s\n"
"sqxtn v3.4h, v3.4s\n"
"sqxtn2 v3.8h, v4.4s\n"
"sqadd v3.8h, v3.8h, v0.8h\n"
"sqxtn v3.8b, v3.8h\n"
"add x8, x8, x19\n"
"smax v3.8b, v3.8b, v1.8b\n"
"add x15, x15, #32\n"
"subs %[function_params], %[function_params], #1\n"
"add %[scratch_block_data], x8, x11\n"
"smin v3.8b, v3.8b, v2.8b\n"
"add w14, w14, w22\n"
"mov v17.16b, v31.16b\n"
"str s3, [x8]\n"
"st1 { v3.s }[1], [%[scratch_block_data]]\n"
"b.ne " DC_KERNEL_NO_MULT_STRIDE_12 "b\n"
// Keep copies of the last loaded rows for the remainder loop at
// DC_KERNEL_NO_MULT_STRIDE_26; finish this depth iteration when w6 >= w13.
"mov v15.16b, v8.16b\n"
"mov v14.16b, v31.16b\n"
"mov v13.16b, v30.16b\n"
"mov v11.16b, v28.16b\n"
"mov w14, w6\n"
"mov v21.16b, v6.16b\n"
"mov v6.16b, v29.16b\n"
"mov v3.16b, v29.16b\n"
"cmp w14, w13\n"
"b.ge " DC_KERNEL_NO_MULT_STRIDE_2 "b\n"
"b " DC_KERNEL_NO_MULT_STRIDE_25 "f\n"
// One-output-row path (outbound block height != 2). Works on 8 channels at a
// time: register pairs are loaded with ldp, results are stored with `str d`.
// Nothing is done for this depth iteration when w13 < 1.
DC_KERNEL_NO_MULT_STRIDE_14 ":\n"
"cmp w13, #1\n"
"add x27, %[bias_data], #32\n"
"b.lt " DC_KERNEL_NO_MULT_STRIDE_3 "b\n"
"ldr x8, [sp, #136]\n"
"lsl w14, w10, #3\n"
// v15/v14 and v13/v11 are saved here and restored at
// DC_KERNEL_NO_MULT_STRIDE_33 / DC_KERNEL_NO_MULT_STRIDE_34.
"stp q15, q14, [sp, #64]\n"
"stp q13, q11, [sp, #96]\n"
"add x15, x28, x12\n"
"lsl x16, x14, #2\n"
"ldp q10, q11, [x15]\n"
"add x15, x8, x16\n"
"ldr x8, [sp, #144]\n"
// Shift in v30/v31, multiplier in v8/v9, filter in v22-v27, bias in v17/v18,
// three input rows in v14/v13, v10/v11 and v12/v15.
"ldp q30, q31, [x15]\n"
"add x15, x28, x7\n"
"ldp q22, q23, [x26]\n"
"add x16, x8, x16\n"
"ldr w8, [sp, #44]\n"
"ldp q24, q25, [x26, #32]\n"
"ldp q26, q27, [x26, #64]\n"
"ldp q17, q18, [%[bias_data]]\n"
"ldp q14, q13, [x28], #32\n"
"ldp q8, q9, [x16]\n"
"ldp q12, q15, [x15]\n"
// %[bias_data] is reused as the output pointer for this depth slice; the real
// bias pointer is restored from x27 at DC_KERNEL_NO_MULT_STRIDE_3.
"add %[bias_data], %[output_block_data], x14\n"
// Pick the loop variant: equal w13 and [sp, #44] runs the loop below,
// otherwise the loop at DC_KERNEL_NO_MULT_STRIDE_27.
"cmp w13, w8\n"
"b.ne " DC_KERNEL_NO_MULT_STRIDE_27 "f\n"
"ldr x25, [sp, #32]\n"
"mov x14, xzr\n"
"mov w4, wzr\n"
"mov x24, x13\n"
"cbnz x25, " DC_KERNEL_NO_MULT_STRIDE_20 "f\n"
"b " DC_KERNEL_NO_MULT_STRIDE_21 "f\n"
// Second output of an iteration: computed from the shifted windows, stored
// x19 bytes after the first one; afterwards the newly loaded rows become the
// current rows.
DC_KERNEL_NO_MULT_STRIDE_17 ":\n"
"mov v28.16b, v17.16b\n"
".word 0x4e8e96dc // sdot v28.4s, v22.16b, v14.16b\n"
"mov v29.16b, v18.16b\n"
".word 0x4e8d96fd // sdot v29.4s, v23.16b, v13.16b\n"
".word 0x4e8a971c // sdot v28.4s, v24.16b, v10.16b\n"
".word 0x4e8b973d // sdot v29.4s, v25.16b, v11.16b\n"
".word 0x4e8c975c // sdot v28.4s, v26.16b, v12.16b\n"
".word 0x4e8f977d // sdot v29.4s, v27.16b, v15.16b\n"
"sqrdmulh v28.4s, v28.4s, v8.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrshl v28.4s, v28.4s, v30.4s\n"
"sqrshl v29.4s, v29.4s, v31.4s\n"
"sqxtn v28.4h, v28.4s\n"
"sqxtn2 v28.8h, v29.4s\n"
"sqadd v28.8h, v28.8h, v0.8h\n"
"sqxtn v28.8b, v28.8h\n"
"smax v28.8b, v28.8b, v1.8b\n"
"smin v28.8b, v28.8b, v2.8b\n"
"mov v14.16b, v3.16b\n"
"mov v10.16b, v20.16b\n"
"mov v12.16b, v16.16b\n"
"mov v13.16b, v19.16b\n"
"mov v11.16b, v21.16b\n"
"mov v15.16b, v5.16b\n"
"str d28, [x15, x19]\n"
// Loop bookkeeping: w4 is the output byte offset, x14 the scratch byte
// offset, x24 the iteration countdown (starts at w13) and x25 a countdown
// that starts from [sp, #32].
DC_KERNEL_NO_MULT_STRIDE_18 ":\n"
"add w4, w4, w22\n"
"add x14, x14, #32\n"
"subs x24, x24, #1\n"
"sub x25, x25, #1\n"
"b.eq " DC_KERNEL_NO_MULT_STRIDE_33 "f\n"
// The loads below are skipped once x25 has reached zero.
"cbz x25, " DC_KERNEL_NO_MULT_STRIDE_21 "f\n"
// Load the next 32 bytes of each of the three input rows.
DC_KERNEL_NO_MULT_STRIDE_20 ":\n"
"and x15, x14, #0xffffffe0\n"
"add x15, x28, x15\n"
"add x16, x15, x12\n"
"add x17, x15, x7\n"
"ldp q3, q19, [x15]\n"
"ldp q20, q21, [x16]\n"
"ldp q16, q5, [x17]\n"
// First output of an iteration, followed by the rev32/trn1 window shift.
DC_KERNEL_NO_MULT_STRIDE_21 ":\n"
"mov v28.16b, v17.16b\n"
"mov v29.16b, v18.16b\n"
".word 0x4e8e96dc // sdot v28.4s, v22.16b, v14.16b\n"
".word 0x4e8a971c // sdot v28.4s, v24.16b, v10.16b\n"
".word 0x4e8d96fd // sdot v29.4s, v23.16b, v13.16b\n"
".word 0x4e8c975c // sdot v28.4s, v26.16b, v12.16b\n"
".word 0x4e8b973d // sdot v29.4s, v25.16b, v11.16b\n"
".word 0x4e8f977d // sdot v29.4s, v27.16b, v15.16b\n"
"sqrdmulh v28.4s, v28.4s, v8.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrshl v28.4s, v28.4s, v30.4s\n"
"sqrshl v29.4s, v29.4s, v31.4s\n"
"sqxtn v28.4h, v28.4s\n"
"sqxtn2 v28.8h, v29.4s\n"
"sqadd v28.8h, v28.8h, v0.8h\n"
"sqxtn v28.8b, v28.8h\n"
"rev32 v14.8h, v14.8h\n"
"rev32 v10.8h, v10.8h\n"
"rev32 v12.8h, v12.8h\n"
"rev32 v13.8h, v13.8h\n"
"rev32 v11.8h, v11.8h\n"
"rev32 v15.8h, v15.8h\n"
"smax v28.8b, v28.8b, v1.8b\n"
"add x15, %[bias_data], w4, sxtw\n"
"cmp w5, #1\n"
"trn1 v14.8h, v14.8h, v3.8h\n"
"trn1 v13.8h, v13.8h, v19.8h\n"
"trn1 v10.8h, v10.8h, v20.8h\n"
"trn1 v11.8h, v11.8h, v21.8h\n"
"trn1 v12.8h, v12.8h, v16.8h\n"
"smin v28.8b, v28.8b, v2.8b\n"
"trn1 v15.8h, v15.8h, v5.8h\n"
"str d28, [x15]\n"
// The second output is produced when the residual width (w5) is above 1 or
// x25 is non-zero; otherwise go straight to the bookkeeping.
"b.gt " DC_KERNEL_NO_MULT_STRIDE_17 "b\n"
"cbz x25, " DC_KERNEL_NO_MULT_STRIDE_18 "b\n"
"b " DC_KERNEL_NO_MULT_STRIDE_17 "b\n"
// Two-row path, first half, entered when w6 < 1: no main-loop iterations, so
// go directly to the remainder loop (or past it when w13 <= 0).
DC_KERNEL_NO_MULT_STRIDE_23 ":\n"
"mov w15, wzr\n"
"cmp w15, w13\n"
"ldr x15, [sp, #128]\n"
"b.lt " DC_KERNEL_NO_MULT_STRIDE_9 "b\n"
"b " DC_KERNEL_NO_MULT_STRIDE_10 "b\n"
// Two-row path, second half, entered when w6 <= 0.
DC_KERNEL_NO_MULT_STRIDE_24 ":\n"
"mov v9.16b, v3.16b\n"
"mov w14, wzr\n"
"cmp w14, w13\n"
"b.ge " DC_KERNEL_NO_MULT_STRIDE_2 "b\n"
// Remainder loop, second half: x14 = [sp, #8] + x27 is the store pointer and
// [sp, #128] the trip count.
DC_KERNEL_NO_MULT_STRIDE_25 ":\n"
"ldr x8, [sp, #8]\n"
"ldr x15, [sp, #128]\n"
"add x14, x8, x27\n"
DC_KERNEL_NO_MULT_STRIDE_26 ":\n"
"mov v3.16b, v25.16b\n"
"mov v4.16b, v25.16b\n"
".word 0x4e9d96c3 // sdot v3.4s, v22.16b, v29.16b\n"
".word 0x4e8896e3 // sdot v3.4s, v23.16b, v8.16b\n"
".word 0x4e9f96c4 // sdot v4.4s, v22.16b, v31.16b\n"
".word 0x4e9f9703 // sdot v3.4s, v24.16b, v31.16b\n"
".word 0x4e9e96e4 // sdot v4.4s, v23.16b, v30.16b\n"
".word 0x4e9c9704 // sdot v4.4s, v24.16b, v28.16b\n"
"sqrdmulh v3.4s, v3.4s, v27.4s\n"
"sqrdmulh v4.4s, v4.4s, v27.4s\n"
"sqrshl v3.4s, v3.4s, v26.4s\n"
"sqrshl v4.4s, v4.4s, v26.4s\n"
"sqxtn v3.4h, v3.4s\n"
"sqxtn2 v3.8h, v4.4s\n"
"sqadd v3.8h, v3.8h, v0.8h\n"
"sqxtn v3.8b, v3.8h\n"
"smax v3.8b, v3.8b, v1.8b\n"
"rev32 v5.8h, v29.8h\n"
"rev32 v17.8h, v8.8h\n"
"rev32 v18.8h, v31.8h\n"
"rev32 v19.8h, v30.8h\n"
"rev32 v20.8h, v28.8h\n"
"smin v3.8b, v3.8b, v2.8b\n"
"add x16, x14, x11\n"
"subs x15, x15, #1\n"
"trn1 v29.8h, v5.8h, v6.8h\n"
"trn1 v8.8h, v17.8h, v15.8h\n"
"trn1 v30.8h, v19.8h, v13.8h\n"
"trn1 v31.8h, v18.8h, v14.8h\n"
"trn1 v28.8h, v20.8h, v11.8h\n"
"str s3, [x14]\n"
"add x14, x14, x22\n"
"st1 { v3.s }[1], [x16]\n"
"b.ne " DC_KERNEL_NO_MULT_STRIDE_26 "b\n"
"b " DC_KERNEL_NO_MULT_STRIDE_2 "b\n"
// One-row path, variant taken when w13 != [sp, #44]. %[function_params] is
// reused as the scratch byte offset and w14 as the output byte offset; v6 is
// saved at [sp, #16] and restored at DC_KERNEL_NO_MULT_STRIDE_34.
DC_KERNEL_NO_MULT_STRIDE_27 ":\n"
"ldr x25, [sp, #32]\n"
"mov w14, wzr\n"
"mov %[function_params], xzr\n"
"mov x24, x13\n"
"str q6, [sp, #16]\n"
"b " DC_KERNEL_NO_MULT_STRIDE_30 "f\n"
// Second output of an iteration, stored x19 bytes after the first one; the
// copies of the newly loaded rows then become the current rows.
DC_KERNEL_NO_MULT_STRIDE_28 ":\n"
"mov v3.16b, v17.16b\n"
".word 0x4e8e96c3 // sdot v3.4s, v22.16b, v14.16b\n"
"mov v4.16b, v18.16b\n"
".word 0x4e8d96e4 // sdot v4.4s, v23.16b, v13.16b\n"
".word 0x4e8a9703 // sdot v3.4s, v24.16b, v10.16b\n"
".word 0x4e8b9724 // sdot v4.4s, v25.16b, v11.16b\n"
".word 0x4e8c9743 // sdot v3.4s, v26.16b, v12.16b\n"
".word 0x4e8f9764 // sdot v4.4s, v27.16b, v15.16b\n"
"sqrdmulh v3.4s, v3.4s, v8.4s\n"
"sqrdmulh v4.4s, v4.4s, v9.4s\n"
"sqrshl v3.4s, v3.4s, v30.4s\n"
"sqrshl v4.4s, v4.4s, v31.4s\n"
"sqxtn v3.4h, v3.4s\n"
"sqxtn2 v3.8h, v4.4s\n"
"sqadd v3.8h, v3.8h, v0.8h\n"
"sqxtn v3.8b, v3.8h\n"
"smax v3.8b, v3.8b, v1.8b\n"
"smin v3.8b, v3.8b, v2.8b\n"
"str d3, [x15, x19]\n"
"mov v3.16b, v6.16b\n"
"mov v14.16b, v6.16b\n"
"mov v10.16b, v20.16b\n"
"mov v12.16b, v16.16b\n"
"mov v13.16b, v19.16b\n"
"mov v11.16b, v21.16b\n"
"mov v15.16b, v5.16b\n"
DC_KERNEL_NO_MULT_STRIDE_29 ":\n"
// Advance the scratch byte offset by one 32-byte group. The step is tied to
// the `ldp q14, q13` loads and the #0xffffffe0 mask in the loop body (and
// matches the "#32" step of the sibling loop at DC_KERNEL_NO_MULT_STRIDE_18),
// so it is written as a literal rather than borrowed from the offset of an
// unrelated parameter-struct field.
"add %[function_params], %[function_params], #32\n"
"sub x25, x25, #1\n"
"subs x24, x24, #1\n"
"add w14, w14, w22\n"
"b.eq " DC_KERNEL_NO_MULT_STRIDE_34 "f\n"
// First output of an iteration. The loads of the next 32 bytes of each input
// row and the rev32/trn1 window shift are interleaved with the arithmetic;
// unshifted copies of the new rows are kept in v6, v19, v16 and v21 (v20 and
// v5 are loaded directly).
DC_KERNEL_NO_MULT_STRIDE_30 ":\n"
"mov v28.16b, v17.16b\n"
"mov v29.16b, v18.16b\n"
".word 0x4e8e96dc // sdot v28.4s, v22.16b, v14.16b\n"
"and x16, %[function_params], #0xffffffe0\n"
".word 0x4e8d96fd // sdot v29.4s, v23.16b, v13.16b\n"
".word 0x4e8a971c // sdot v28.4s, v24.16b, v10.16b\n"
"add x16, x28, x16\n"
".word 0x4e8b973d // sdot v29.4s, v25.16b, v11.16b\n"
".word 0x4e8c975c // sdot v28.4s, v26.16b, v12.16b\n"
"rev32 v19.8h, v14.8h\n"
"rev32 v3.8h, v13.8h\n"
"ldp q14, q13, [x16]\n"
".word 0x4e8f977d // sdot v29.4s, v27.16b, v15.16b\n"
"sqrdmulh v28.4s, v28.4s, v8.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrshl v28.4s, v28.4s, v30.4s\n"
"add x17, x16, x12\n"
"add x16, x16, x7\n"
"sqrshl v29.4s, v29.4s, v31.4s\n"
"sqxtn v28.4h, v28.4s\n"
"rev32 v21.8h, v12.8h\n"
"rev32 v4.8h, v11.8h\n"
"ldp q20, q11, [x17]\n"
"ldp q12, q5, [x16]\n"
"sqxtn2 v28.8h, v29.4s\n"
"mov v6.16b, v14.16b\n"
"trn1 v14.8h, v19.8h, v14.8h\n"
"mov v19.16b, v13.16b\n"
"trn1 v13.8h, v3.8h, v13.8h\n"
"sqadd v3.8h, v28.8h, v0.8h\n"
"sqxtn v3.8b, v3.8h\n"
"rev32 v16.8h, v10.8h\n"
"rev32 v7.8h, v15.8h\n"
"smax v3.8b, v3.8b, v1.8b\n"
"add x15, %[bias_data], w14, sxtw\n"
"cmp w5, #1\n"
"trn1 v10.8h, v16.8h, v20.8h\n"
"mov v16.16b, v12.16b\n"
"trn1 v12.8h, v21.8h, v12.8h\n"
"mov v21.16b, v11.16b\n"
"trn1 v11.8h, v4.8h, v11.8h\n"
"smin v3.8b, v3.8b, v2.8b\n"
"trn1 v15.8h, v7.8h, v5.8h\n"
"str d3, [x15]\n"
// The second output is produced when the residual width (w5) is above 1 or
// x25 is non-zero.
"b.gt " DC_KERNEL_NO_MULT_STRIDE_28 "b\n"
"cbnz x25, " DC_KERNEL_NO_MULT_STRIDE_28 "b\n"
"mov v3.16b, v6.16b\n"
"b " DC_KERNEL_NO_MULT_STRIDE_29 "b\n"
// Exits of the two one-row loops: restore the vector registers saved on
// entry, then continue with the next depth iteration.
DC_KERNEL_NO_MULT_STRIDE_33 ":\n"
"ldp q13, q11, [sp, #96]\n"
"ldp q15, q14, [sp, #64]\n"
"b " DC_KERNEL_NO_MULT_STRIDE_3 "b\n"
DC_KERNEL_NO_MULT_STRIDE_34 ":\n"
"ldp q13, q11, [sp, #96]\n"
"ldp q15, q14, [sp, #64]\n"
"ldr q6, [sp, #16]\n"
"b " DC_KERNEL_NO_MULT_STRIDE_3 "b\n"
// Epilogue: release the 176-byte stack frame.
DC_KERNEL_NO_MULT_STRIDE_35 ":\n"
"add sp, sp, #176\n"
:
// Outputs: every pointer argument is read-write because the assembly reuses
// these registers as temporaries.
[ scratch_block_data ] "+r"(scratch_block_data),
[ filter_workspace ] "+r"(filter_workspace),
[ bias_data ] "+r"(bias_data),
[ output_block_data ] "+r"(output_block_data)
:
// Inputs.
[ function_params ] "r"(function_params)
:
// Clobbers.
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31",
"x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
"x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26",
"x27", "x28");
#undef DC_KERNEL_NO_MULT_STRIDE_1
#undef DC_KERNEL_NO_MULT_STRIDE_2
#undef DC_KERNEL_NO_MULT_STRIDE_3
#undef DC_KERNEL_NO_MULT_STRIDE_4
#undef DC_KERNEL_NO_MULT_STRIDE_5
#undef DC_KERNEL_NO_MULT_STRIDE_6
#undef DC_KERNEL_NO_MULT_STRIDE_7
#undef DC_KERNEL_NO_MULT_STRIDE_8
#undef DC_KERNEL_NO_MULT_STRIDE_9
#undef DC_KERNEL_NO_MULT_STRIDE_10
#undef DC_KERNEL_NO_MULT_STRIDE_11
#undef DC_KERNEL_NO_MULT_STRIDE_12
#undef DC_KERNEL_NO_MULT_STRIDE_13
#undef DC_KERNEL_NO_MULT_STRIDE_14
#undef DC_KERNEL_NO_MULT_STRIDE_15
#undef DC_KERNEL_NO_MULT_STRIDE_16
#undef DC_KERNEL_NO_MULT_STRIDE_17
#undef DC_KERNEL_NO_MULT_STRIDE_18
#undef DC_KERNEL_NO_MULT_STRIDE_19
#undef DC_KERNEL_NO_MULT_STRIDE_20
#undef DC_KERNEL_NO_MULT_STRIDE_21
#undef DC_KERNEL_NO_MULT_STRIDE_22
#undef DC_KERNEL_NO_MULT_STRIDE_23
#undef DC_KERNEL_NO_MULT_STRIDE_24
#undef DC_KERNEL_NO_MULT_STRIDE_25
#undef DC_KERNEL_NO_MULT_STRIDE_26
#undef DC_KERNEL_NO_MULT_STRIDE_27
#undef DC_KERNEL_NO_MULT_STRIDE_28
#undef DC_KERNEL_NO_MULT_STRIDE_29
#undef DC_KERNEL_NO_MULT_STRIDE_30
#undef DC_KERNEL_NO_MULT_STRIDE_31
#undef DC_KERNEL_NO_MULT_STRIDE_32
#undef DC_KERNEL_NO_MULT_STRIDE_33
#undef DC_KERNEL_NO_MULT_STRIDE_34
#undef DC_KERNEL_NO_MULT_STRIDE_35
}
// Forwards all five arguments, unchanged and in the same order, to
// KernelMacroBlockNeon; it adds no logic of its own.
static inline void Run(const int8* scratch_block_data,
const int8* filter_workspace, const int32* bias_data,
int8* output_block_data,
const DepthwiseConvDotProdParams* function_params) {
KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
output_block_data, function_params);
}
};
// Specialization of KernelMacroBlock for the NEON 3x3 dot-product
// implementation, per-channel int8 quantization and unit input depth. The
// final template argument is 1 (presumably the stride -- confirm against the
// primary template).
//
// The kernel body is a single hand-scheduled AArch64 inline-assembly block
// that uses SDOT instructions emitted as raw `.word` encodings.
template <>
struct KernelMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
                        QuantizationType::kPerChannelInt8,
                        DepthwiseConvDepthMultiplication::kUnitInputDepth,
                        1> {
  // Runs the assembly kernel.
  //
  // scratch_block_data: input data read by the kernel.
  // filter_workspace:   filter data; 96 bytes (six q registers) are consumed
  //                     per outer-loop iteration.
  // bias_data:          int32 bias values.
  // output_block_data:  destination for the int8 results.
  // function_params:    parameter struct read through the DP_OFFSET_* byte
  //                     offsets.
  //
  // All five pointer arguments are read-write asm operands: the assembly
  // reuses every operand register as scratch, so none of the local copies is
  // meaningful after the asm statement.
  //
  // The assembly must only name operand registers through %[name] / %w[name]
  // and registers from the clobber list. Hard-coded argument registers
  // (x0..x4) are not safe because this function is `static inline` and the
  // compiler is free to place the operands in any unclobbered register.
  static inline void KernelMacroBlockNeon(
      const int8* scratch_block_data, const int8* filter_workspace,
      const int32* bias_data, int8* output_block_data,
      const DepthwiseConvDotProdParams* function_params) {
#define DC_KERNEL_MULT_1 …
#define DC_KERNEL_MULT_2 …
#define DC_KERNEL_MULT_3 …
#define DC_KERNEL_MULT_4 …
#define DC_KERNEL_MULT_5 …
#define DC_KERNEL_MULT_6 …
#define DC_KERNEL_MULT_7 …
#define DC_KERNEL_MULT_8 …
#define DC_KERNEL_MULT_9 …
#define DC_KERNEL_MULT_10 …
#define DC_KERNEL_MULT_11 …
#define DC_KERNEL_MULT_12 …
#define DC_KERNEL_MULT_13 …
#define DC_KERNEL_MULT_14 …
#define DC_KERNEL_MULT_15 …
#define DC_KERNEL_MULT_16 …
#define DC_KERNEL_MULT_17 …
#define DC_KERNEL_MULT_18 …
#define DC_KERNEL_MULT_19 …
#define DC_KERNEL_MULT_20 …
#define DC_KERNEL_MULT_21 …
#define DC_KERNEL_MULT_22 …
#define DC_KERNEL_MULT_23 …
    asm volatile(
        // Reserve 352 bytes of stack used as spill space by the code below.
        "sub sp, sp, #352\n"
        // Load depth_micro_repeats, spill it to [sp, #32] and skip the whole
        // kernel when it is < 1.
        "ldr w8, [%[function_params], #" STR(DP_OFFSET_DEPTH_MICRO_REPEATS) "]\n"
        "str %[filter_workspace], [sp, #56]\n"
        "cmp w8, #1\n"
        "str x8, [sp, #32]\n"
        "b.lt " DC_KERNEL_MULT_23 "f\n"
        "ldr w11, [%[function_params], #" STR(DP_OFFSET_OUTPUT_RESIDUAL_WIDTH) "]\n"
        "ldr x12, [%[function_params], #" STR(DP_OFFSET_OUTPUT_MULTPLIPLIER_PER_CHANNEL) "]\n"
        "ldp w17, w15, [%[function_params], #" STR(DP_OFFSET_OUTPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n"
        "ldr w16, [%[function_params], #" STR(DP_OFFSET_OUTBOUND_BLOCK_HEIGHT) "]\n"
        "ldpsw x21, x6, [%[function_params], #" STR(DP_OFFSET_OUTPUT_HEIGHT_STRIDE) "]\n"
        "ldrb w8, [%[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MIN) "]\n"
        "ldrb w9, [%[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MAX) "]\n"
        "add x10, %[function_params], #" STR(DP_OFFSET_OUTPUT_OFFSET) "\n"
        "str x12, [sp, #24]\n"
        "ldr x12, [%[function_params], #" STR(DP_OFFSET_OUTPUT_SHIFT_PER_CHANNEL) "]\n"
        // From here on the function_params register holds the sign-extended
        // output depth instead of the params pointer (hence the "+r"
        // constraint on that operand).
        "ldrsw %[function_params], [%[function_params], #" STR(DP_OFFSET_OUTPUT_DEPTH) "]\n"
        "cmp w11, #4\n"
        "ccmp w15, w17, #0, lt\n"
        "csel w25, w15, w17, lt\n"
        "cmp w16, #1\n"
        "str x16, [sp, #80]\n"
        "cset w16, lt\n"
        "cmp w17, #1\n"
        // Broadcast the activation min / max bytes into v1 / v2.
        "dup v1.16b, w8\n"
        "fmov s3, w8\n"
        "dup v2.16b, w9\n"
        "fmov s4, w9\n"
        "lsl x8, %[function_params], #1\n"
        "add x9, x21, %[function_params]\n"
        "str w17, [sp, #324]\n"
        "cset w17, lt\n"
        // Broadcast the 16-bit value at the output-offset field into v0.
        "ld1r { v0.8h }, [x10]\n"
        "lsl x7, x21, #1\n"
        "add x22, x21, x21, lsl #1\n"
        "add x10, x8, %[function_params]\n"
        "add x9, %[output_block_data], x9\n"
        "orr w16, w16, w17\n"
        "str x9, [sp, #216]\n"
        "str w15, [sp, #316]\n"
        "add x9, x10, x22\n"
        "add x15, x10, x7\n"
        "str w16, [sp, #12]\n"
        "add x16, x10, x21\n"
        "add x10, %[output_block_data], x10\n"
        "str x10, [sp, #200]\n"
        "add x10, x6, #4\n"
        "str x10, [sp, #160]\n"
        "lsl x10, %[function_params], #2\n"
        "str x10, [sp, #152]\n"
        "add x10, %[output_block_data], x21\n"
        "add x17, x6, x6, lsl #2\n"
        "str x10, [sp, #144]\n"
        "add x10, %[output_block_data], %[function_params]\n"
        "lsl x24, x6, #2\n"
        "str x10, [sp, #136]\n"
        "add x10, x17, #4\n"
        "add x19, x6, x6, lsl #1\n"
        "str x10, [sp, #128]\n"
        "add x10, x24, #4\n"
        "str x12, [sp, #16]\n"
        "str w11, [sp, #320]\n"
        "lsl x20, x6, #1\n"
        "add x11, x8, x22\n"
        "add x12, x8, x7\n"
        "add x13, x8, x21\n"
        "add x8, %[output_block_data], x8\n"
        "str x10, [sp, #120]\n"
        "add x10, x19, #4\n"
        "stp x8, x7, [sp, #224]\n"
        "add x8, x22, %[function_params]\n"
        "str x10, [sp, #112]\n"
        "add x10, x20, #4\n"
        "mov x5, xzr\n"
        "add x14, x7, %[function_params]\n"
        "add x8, %[output_block_data], x8\n"
        "str x10, [sp, #104]\n"
        "add x10, %[output_block_data], x7\n"
        "add x26, %[output_block_data], x11\n"
        "str x8, [sp, #184]\n"
        "add x8, %[output_block_data], x14\n"
        "mov x14, x5\n"
        "add x5, %[output_block_data], x9\n"
        "add x9, %[output_block_data], x16\n"
        "mov x16, x22\n"
        "stp x19, x6, [sp, #296]\n"
        "mov x11, x7\n"
        "str x20, [sp, #328]\n"
        "str x10, [sp, #96]\n"
        "add x10, %[output_block_data], x22\n"
        "stp x22, %[output_block_data], [sp, #64]\n"
        "ldr x7, [sp, #160]\n"
        "ldr x23, [sp, #136]\n"
        "ldp x22, x19, [sp, #112]\n"
        "ldr x20, [sp, #104]\n"
        "mov %[filter_workspace], xzr\n"
        "dup v3.8b, v3.b[0]\n"
        "dup v4.8b, v4.b[0]\n"
        "add x27, %[output_block_data], x12\n"
        "add x28, %[output_block_data], x13\n"
        "mov x13, %[filter_workspace]\n"
        "stp x8, x17, [sp, #168]\n"
        "add x8, %[output_block_data], x15\n"
        "str x10, [sp, #88]\n"
        "mov w10, #4\n"
        "stp x21, %[scratch_block_data], [sp, #256]\n"
        "str w25, [sp, #212]\n"
        "str x24, [sp, #192]\n"
        "str x9, [sp, #336]\n"
        "b " DC_KERNEL_MULT_5 "f\n"
        DC_KERNEL_MULT_2 ":\n"
        "mov %[output_block_data], x21\n"
        "ldp x21, %[scratch_block_data], [sp, #256]\n"
        DC_KERNEL_MULT_3 ":\n"
        "mov %[bias_data], x11\n"
        // Outer-loop latch: bump the counter kept at [sp, #40] and leave once
        // it equals the depth_micro_repeats value kept at [sp, #32].
        DC_KERNEL_MULT_4 ":\n"
        "ldp x12, x14, [sp, #32]\n"
        "ldr x11, [sp, #72]\n"
        "ldr x13, [sp, #48]\n"
        "add x14, x14, #1\n"
        "add x11, x11, #8\n"
        "cmp x14, x12\n"
        "add x13, x13, #8\n"
        "str x11, [sp, #72]\n"
        "b.eq " DC_KERNEL_MULT_23 "f\n"
        // Outer-loop body: load six q registers of filter data from the
        // pointer kept at [sp, #56] and advance that pointer by 96 bytes.
        DC_KERNEL_MULT_5 ":\n"
        "ldr x12, [sp, #56]\n"
        "ldr x16, [sp, #80]\n"
        "ldp q18, q5, [x12]\n"
        "ldp q17, q6, [x12, #32]\n"
        "ldp q16, q7, [x12, #64]\n"
        // [sp, #80] holds the outbound block height; a height other than 4
        // takes the path at DC_KERNEL_MULT_16.
        "cmp w16, #4\n"
        "add x12, x12, #96\n"
        "stp x13, x12, [sp, #48]\n"
        "str x14, [sp, #40]\n"
        "b.ne " DC_KERNEL_MULT_16 "f\n"
        "lsl w12, w14, #3\n"
        "ldr x14, [sp, #16]\n"
        "lsl x12, x12, #2\n"
        "mov x15, xzr\n"
        "mov %[filter_workspace], x13\n"
        "add x11, x14, x12\n"
        "ldr x14, [sp, #24]\n"
        "str x11, [sp, #248]\n"
        "add x11, x14, x12\n"
        "str x11, [sp, #240]\n"
        "b " DC_KERNEL_MULT_8 "f\n"
        DC_KERNEL_MULT_7 ":\n"
        "add x15, x15, #1\n"
        "cmp x15, #2\n"
        // Use the operand name rather than a hard-coded x1: the compiler is
        // not obliged to place filter_workspace in x1.
        "add %[filter_workspace], %[filter_workspace], #4\n"
        "mov v16.16b, v7.16b\n"
        "mov v17.16b, v6.16b\n"
        "mov v18.16b, v5.16b\n"
        "b.eq " DC_KERNEL_MULT_4 "b\n"
        DC_KERNEL_MULT_8 ":\n"
        "ldr q19, [%[bias_data]], #16\n"
        "ldr x11, [sp, #248]\n"
        "lsl x12, x15, #4\n"
        "ldr w13, [%[scratch_block_data]]\n"
        "ldr x16, [sp, #328]\n"
        "ldr q20, [x11, x12]\n"
        "ldr x11, [sp, #240]\n"
        "ldr w6, [%[scratch_block_data], x24]\n"
        "ldr w16, [%[scratch_block_data], x16]\n"
        "ldr q21, [x11, x12]\n"
        "ldp x12, x14, [sp, #296]\n"
        "fmov s22, w13\n"
        "add x14, %[scratch_block_data], x14\n"
        "mov v22.s[1], w13\n"
        "fmov s23, w6\n"
        "ldr w12, [%[scratch_block_data], x12]\n"
        "ld1 { v22.s }[2], [x14]\n"
        "add x14, %[scratch_block_data], x17\n"
        "mov v23.s[1], w6\n"
        "ld1 { v23.s }[2], [x14]\n"
        "fmov s24, w16\n"
        "mov v24.s[1], w16\n"
        "dup v25.4s, w16\n"
        "mov v28.16b, v19.16b\n"
        "mov v29.16b, v19.16b\n"
        "mov v30.16b, v19.16b\n"
        "dup v26.4s, w12\n"
        "mov v31.16b, v19.16b\n"
        "mov v24.s[2], w12\n"
        "cmp w25, #1\n"
        ".word 0x4e99961c // sdot v28.4s, v16.16b, v25.16b\n"
        ".word 0x4e99963d // sdot v29.4s, v17.16b, v25.16b\n"
        ".word 0x4e99965e // sdot v30.4s, v18.16b, v25.16b\n"
        "mov v24.s[3], w16\n"
        "mov v22.s[3], w13\n"
        "mov v23.s[3], w6\n"
        ".word 0x4e9a965f // sdot v31.4s, v18.16b, v26.16b\n"
        "b.lt " DC_KERNEL_MULT_15 "f\n"
        "stp x15, %[bias_data], [sp, #280]\n"
        "mov w13, w25\n"
        "str %[filter_workspace], [sp, #272]\n"
        "mov x16, %[filter_workspace]\n"
        "mov x14, %[scratch_block_data]\n"
        "ldp x25, %[scratch_block_data], [sp, #216]\n"
        "mov x24, x28\n"
        "mov x28, x27\n"
        "ldr x27, [sp, #200]\n"
        "ldr x17, [sp, #184]\n"
        "mov x9, x8\n"
        "mov x8, x5\n"
        "ldr x5, [sp, #168]\n"
        "ldp x15, x10, [sp, #144]\n"
        "ldr %[bias_data], [sp, #128]\n"
        "ldp %[filter_workspace], x11, [sp, #88]\n"
        "shl v25.4s, v18.4s, #8\n"
        "shl v26.4s, v17.4s, #8\n"
        "shl v27.4s, v16.4s, #8\n"
        // output_block_data is parked in x21 so that its operand register can
        // serve as a scratch register inside the loop below.
        "mov x21, %[output_block_data]\n"
        DC_KERNEL_MULT_10 ":\n"
        ".word 0x4f96e25c // sdot v28.4s, v18.16b, v22.4b[0]\n"
        ".word 0x4f96ea5d // sdot v29.4s, v18.16b, v22.4b[2]\n"
        ".word 0x4f98ea3e // sdot v30.4s, v17.16b, v24.4b[2]\n"
        ".word 0x4f96ea3c // sdot v28.4s, v17.16b, v22.4b[2]\n"
        ".word 0x4f97e23f // sdot v31.4s, v17.16b, v23.4b[0]\n"
        ".word 0x4f98ea1d // sdot v29.4s, v16.16b, v24.4b[2]\n"
        ".word 0x4f97e21e // sdot v30.4s, v16.16b, v23.4b[0]\n"
        "sqrdmulh v28.4s, v28.4s, v21.4s\n"
        ".word 0x4f97ea1f // sdot v31.4s, v16.16b, v23.4b[2]\n"
        "sqrdmulh v29.4s, v29.4s, v21.4s\n"
        "sqrdmulh v30.4s, v30.4s, v21.4s\n"
        "sqrshl v28.4s, v28.4s, v20.4s\n"
        "sqrdmulh v31.4s, v31.4s, v21.4s\n"
        "sqrshl v29.4s, v29.4s, v20.4s\n"
        "sqrshl v30.4s, v30.4s, v20.4s\n"
        "sqxtn v28.4h, v28.4s\n"
        "sqrshl v31.4s, v31.4s, v20.4s\n"
        "sqxtn v30.4h, v30.4s\n"
        "sqxtn2 v28.8h, v29.4s\n"
        "sqxtn2 v30.8h, v31.4s\n"
        "sqadd v28.8h, v28.8h, v0.8h\n"
        "sqadd v29.8h, v30.8h, v0.8h\n"
        "sqxtn v28.8b, v28.8h\n"
        "sqxtn2 v28.16b, v29.8h\n"
        "smax v28.16b, v28.16b, v1.16b\n"
        "add %[output_block_data], x15, x16\n"
        "smin v28.16b, v28.16b, v2.16b\n"
        "add x6, x11, x16\n"
        "str s28, [x21, x16]\n"
        "st1 { v28.s }[1], [%[output_block_data]]\n"
        "add %[output_block_data], %[filter_workspace], x16\n"
        "st1 { v28.s }[2], [x6]\n"
        "st1 { v28.s }[3], [%[output_block_data]]\n"
        "mov x12, x14\n"
        "add x6, x14, x20\n"
        // Temporary 32-bit load: use the W view of the output_block_data
        // operand register (already scratch here) instead of a hard-coded,
        // undeclared w3.
        "ldr %w[output_block_data], [x14, #4]!\n"
        "ld1 { v24.s }[1], [x6]\n"
        "add x6, x12, x19\n"
        "ld1 { v23.s }[1], [x6]\n"
        "mov v22.s[1], %w[output_block_data]\n"
        "add %[output_block_data], x12, x22\n"
        "ld1 { v24.s }[3], [%[output_block_data]]\n"
        "add %[output_block_data], x12, x7\n"
        "ld1 { v22.s }[3], [%[output_block_data]]\n"
        "add x12, x12, %[bias_data]\n"
        "mov v28.16b, v19.16b\n"
        "ld1 { v23.s }[3], [x12]\n"
        "mov v29.16b, v19.16b\n"
        "mov v30.16b, v19.16b\n"
        ".word 0x4f96e33c // sdot v28.4s, v25.16b, v22.4b[0]\n"
        "mov v31.16b, v19.16b\n"
        ".word 0x4f98e33e // sdot v30.4s, v25.16b, v24.4b[0]\n"
        ".word 0x4f96eb3d // sdot v29.4s, v25.16b, v22.4b[2]\n"
        ".word 0x4f96eb5c // sdot v28.4s, v26.16b, v22.4b[2]\n"
        ".word 0x4f98eb3f // sdot v31.4s, v25.16b, v24.4b[2]\n"
        ".word 0x4f98eb5e // sdot v30.4s, v26.16b, v24.4b[2]\n"
        ".word 0x4f98e35d // sdot v29.4s, v26.16b, v24.4b[0]\n"
        ".word 0x4f98e37c // sdot v28.4s, v27.16b, v24.4b[0]\n"
        ".word 0x4f97e35f // sdot v31.4s, v26.16b, v23.4b[0]\n"
        ".word 0x4f97e37e // sdot v30.4s, v27.16b, v23.4b[0]\n"
        ".word 0x4f98eb7d // sdot v29.4s, v27.16b, v24.4b[2]\n"
        "sqrdmulh v28.4s, v28.4s, v21.4s\n"
        ".word 0x4f97eb7f // sdot v31.4s, v27.16b, v23.4b[2]\n"
        "sqrdmulh v30.4s, v30.4s, v21.4s\n"
        "sqrdmulh v29.4s, v29.4s, v21.4s\n"
        "sqrshl v28.4s, v28.4s, v20.4s\n"
        "sqrdmulh v31.4s, v31.4s, v21.4s\n"
        "sqrshl v30.4s, v30.4s, v20.4s\n"
        "sqrshl v29.4s, v29.4s, v20.4s\n"
        "sqxtn v28.4h, v28.4s\n"
        "sqrshl v31.4s, v31.4s, v20.4s\n"
        "sqxtn v30.4h, v30.4s\n"
        "sqxtn2 v28.8h, v29.4s\n"
        "sqxtn2 v30.8h, v31.4s\n"
        "sqadd v28.8h, v28.8h, v0.8h\n"
        "sqadd v29.8h, v30.8h, v0.8h\n"
        "sqxtn v28.8b, v28.8h\n"
        "sqxtn2 v28.16b, v29.8h\n"
        "smax v28.16b, v28.16b, v1.16b\n"
        "add x12, x25, x16\n"
        "smin v28.16b, v28.16b, v2.16b\n"
        "add %[output_block_data], x5, x16\n"
        "str s28, [x23, x16]\n"
        "st1 { v28.s }[1], [x12]\n"
        "add x12, x17, x16\n"
        "mov v29.16b, v19.16b\n"
        "ushr v10.2d, v22.2d, #16\n"
        "mov v30.16b, v19.16b\n"
        "mov v31.16b, v19.16b\n"
        "st1 { v28.s }[2], [%[output_block_data]]\n"
        "st1 { v28.s }[3], [x12]\n"
        "ushr v28.2d, v24.2d, #16\n"
        ".word 0x4f8ae25d // sdot v29.4s, v18.16b, v10.4b[0]\n"
        "mov v8.16b, v19.16b\n"
        ".word 0x4f9ce25f // sdot v31.4s, v18.16b, v28.4b[0]\n"
        ".word 0x4f8aea5e // sdot v30.4s, v18.16b, v10.4b[2]\n"
        ".word 0x4f8aea3d // sdot v29.4s, v17.16b, v10.4b[2]\n"
        "ushr v9.2d, v23.2d, #16\n"
        ".word 0x4f9cea48 // sdot v8.4s, v18.16b, v28.4b[2]\n"
        ".word 0x4f9cea3f // sdot v31.4s, v17.16b, v28.4b[2]\n"
        ".word 0x4f9ce23e // sdot v30.4s, v17.16b, v28.4b[0]\n"
        ".word 0x4f9ce21d // sdot v29.4s, v16.16b, v28.4b[0]\n"
        ".word 0x4f89e228 // sdot v8.4s, v17.16b, v9.4b[0]\n"
        ".word 0x4f89e21f // sdot v31.4s, v16.16b, v9.4b[0]\n"
        ".word 0x4f9cea1e // sdot v30.4s, v16.16b, v28.4b[2]\n"
        "sqrdmulh v29.4s, v29.4s, v21.4s\n"
        ".word 0x4f89ea08 // sdot v8.4s, v16.16b, v9.4b[2]\n"
        "sqrdmulh v31.4s, v31.4s, v21.4s\n"
        "sqrdmulh v30.4s, v30.4s, v21.4s\n"
        "sqrshl v29.4s, v29.4s, v20.4s\n"
        "sqrdmulh v8.4s, v8.4s, v21.4s\n"
        "sqrshl v31.4s, v31.4s, v20.4s\n"
        "sqrshl v30.4s, v30.4s, v20.4s\n"
        "sqxtn v29.4h, v29.4s\n"
        "sqrshl v8.4s, v8.4s, v20.4s\n"
        "sqxtn v31.4h, v31.4s\n"
        "sqxtn2 v29.8h, v30.4s\n"
        "sqxtn2 v31.8h, v8.4s\n"
        "sqadd v29.8h, v29.8h, v0.8h\n"
        "sqadd v30.8h, v31.8h, v0.8h\n"
        "sqxtn v29.8b, v29.8h\n"
        "sqxtn2 v29.16b, v30.8h\n"
        "smax v29.16b, v29.16b, v1.16b\n"
        "add %[output_block_data], x24, x16\n"
        "smin v29.16b, v29.16b, v2.16b\n"
        "mov v30.16b, v19.16b\n"
        "add x12, x28, x16\n"
        "str s29, [%[scratch_block_data], x16]\n"
        "st1 { v29.s }[1], [%[output_block_data]]\n"
        "add %[output_block_data], x26, x16\n"
        "mov v31.16b, v19.16b\n"
        "mov v8.16b, v19.16b\n"
        ".word 0x4f8ae33e // sdot v30.4s, v25.16b, v10.4b[0]\n"
        "st1 { v29.s }[2], [x12]\n"
        "st1 { v29.s }[3], [%[output_block_data]]\n"
        "mov v29.16b, v19.16b\n"
        ".word 0x4f9ce328 // sdot v8.4s, v25.16b, v28.4b[0]\n"
        ".word 0x4f8aeb3f // sdot v31.4s, v25.16b, v10.4b[2]\n"
        ".word 0x4f8aeb5e // sdot v30.4s, v26.16b, v10.4b[2]\n"
        ".word 0x4f9ceb3d // sdot v29.4s, v25.16b, v28.4b[2]\n"
        ".word 0x4f9ceb48 // sdot v8.4s, v26.16b, v28.4b[2]\n"
        ".word 0x4f9ce35f // sdot v31.4s, v26.16b, v28.4b[0]\n"
        ".word 0x4f9ce37e // sdot v30.4s, v27.16b, v28.4b[0]\n"
        ".word 0x4f89e35d // sdot v29.4s, v26.16b, v9.4b[0]\n"
        ".word 0x4f89e368 // sdot v8.4s, v27.16b, v9.4b[0]\n"
        ".word 0x4f9ceb7f // sdot v31.4s, v27.16b, v28.4b[2]\n"
        "sqrdmulh v30.4s, v30.4s, v21.4s\n"
        ".word 0x4f89eb7d // sdot v29.4s, v27.16b, v9.4b[2]\n"
        "sqrdmulh v28.4s, v8.4s, v21.4s\n"
        "sqrdmulh v31.4s, v31.4s, v21.4s\n"
        "sqrshl v30.4s, v30.4s, v20.4s\n"
        "sqrdmulh v29.4s, v29.4s, v21.4s\n"
        "sqrshl v28.4s, v28.4s, v20.4s\n"
        "sqrshl v31.4s, v31.4s, v20.4s\n"
        "sqxtn v30.4h, v30.4s\n"
        "ldr x12, [sp, #336]\n"
        "sqrshl v29.4s, v29.4s, v20.4s\n"
        "sqxtn v28.4h, v28.4s\n"
        "sqxtn2 v30.8h, v31.4s\n"
        "sqxtn2 v28.8h, v29.4s\n"
        "sqadd v29.8h, v30.8h, v0.8h\n"
        "sqadd v28.8h, v28.8h, v0.8h\n"
        "sqxtn v29.8b, v29.8h\n"
        "sqxtn2 v29.16b, v28.8h\n"
        "smax v28.16b, v29.16b, v1.16b\n"
        "add x12, x12, x16\n"
        "smin v8.16b, v28.16b, v2.16b\n"
        "mov v28.16b, v19.16b\n"
        "mov v29.16b, v19.16b\n"
        "mov v30.16b, v19.16b\n"
        "mov v31.16b, v19.16b\n"
        "ushr v24.2d, v24.2d, #32\n"
        "add %[output_block_data], x9, x16\n"
        "str s8, [x27, x16]\n"
        "st1 { v8.s }[1], [x12]\n"
        "add x12, x8, x16\n"
        "subs w13, w13, #1\n"
        "ushr v22.2d, v22.2d, #32\n"
        "ushr v23.2d, v23.2d, #32\n"
        ".word 0x4f98e21c // sdot v28.4s, v16.16b, v24.4b[0]\n"
        ".word 0x4f98e23d // sdot v29.4s, v17.16b, v24.4b[0]\n"
        ".word 0x4f98e25e // sdot v30.4s, v18.16b, v24.4b[0]\n"
        ".word 0x4f98ea5f // sdot v31.4s, v18.16b, v24.4b[2]\n"
        "add x16, x16, x10\n"
        "st1 { v8.s }[2], [%[output_block_data]]\n"
        "st1 { v8.s }[3], [x12]\n"
        "b.ne " DC_KERNEL_MULT_10 "b\n"
        "ldr w25, [sp, #212]\n"
        "add x13, x21, x16\n"
        "mov %[output_block_data], x21\n"
        "ldp x21, %[scratch_block_data], [sp, #256]\n"
        "ldr x6, [sp, #232]\n"
        "mov x27, x28\n"
        "mov x28, x24\n"
        "ldr x24, [sp, #192]\n"
        "ldr x17, [sp, #176]\n"
        "ldp x15, %[bias_data], [sp, #280]\n"
        "ldr %[filter_workspace], [sp, #272]\n"
        "mov w12, w25\n"
        "mov x5, x8\n"
        "mov x8, x9\n"
        "mov w10, #4\n"
        "ldr w16, [sp, #324]\n"
        "cmp w12, w16\n"
        "b.ge " DC_KERNEL_MULT_7 "b\n"
        DC_KERNEL_MULT_12 ":\n"
        "ldr w12, [sp, #320]\n"
        "cmp w12, #1\n"
        "b.lt " DC_KERNEL_MULT_7 "b\n"
        "add x12, x14, #4\n"
        "ldr x16, [sp, #328]\n"
        "add x14, x12, x24\n"
        "ld1 { v23.s }[1], [x14]\n"
        "add x14, x12, x17\n"
        "add x16, x12, x16\n"
        "ld1 { v24.s }[1], [x16]\n"
        "ld1 { v23.s }[3], [x14]\n"
        "ldp x16, x14, [sp, #296]\n"
        "add x16, x12, x16\n"
        "ld1 { v24.s }[3], [x16]\n"
        "ldr x16, [sp, #64]\n"
        "ld1 { v22.s }[1], [x12], x14\n"
        "ldr w14, [sp, #320]\n"
        "ld1 { v22.s }[3], [x12]\n"
        DC_KERNEL_MULT_14 ":\n"
        ".word 0x4f96e25c // sdot v28.4s, v18.16b, v22.4b[0]\n"
        ".word 0x4f96ea5d // sdot v29.4s, v18.16b, v22.4b[2]\n"
        ".word 0x4f98ea3e // sdot v30.4s, v17.16b, v24.4b[2]\n"
        ".word 0x4f96ea3c // sdot v28.4s, v17.16b, v22.4b[2]\n"
        ".word 0x4f97e23f // sdot v31.4s, v17.16b, v23.4b[0]\n"
        ".word 0x4f98ea1d // sdot v29.4s, v16.16b, v24.4b[2]\n"
        ".word 0x4f97e21e // sdot v30.4s, v16.16b, v23.4b[0]\n"
        "sqrdmulh v25.4s, v28.4s, v21.4s\n"
        ".word 0x4f97ea1f // sdot v31.4s, v16.16b, v23.4b[2]\n"
        "sqrdmulh v26.4s, v29.4s, v21.4s\n"
        "sqrdmulh v27.4s, v30.4s, v21.4s\n"
        "sqrshl v25.4s, v25.4s, v20.4s\n"
        "sqrdmulh v28.4s, v31.4s, v21.4s\n"
        "sqrshl v26.4s, v26.4s, v20.4s\n"
        "sqrshl v27.4s, v27.4s, v20.4s\n"
        "sqxtn v25.4h, v25.4s\n"
        "sqrshl v28.4s, v28.4s, v20.4s\n"
        "sqxtn v27.4h, v27.4s\n"
        "sqxtn2 v25.8h, v26.4s\n"
        "sqxtn2 v27.8h, v28.4s\n"
        "sqadd v25.8h, v25.8h, v0.8h\n"
        "sqadd v26.8h, v27.8h, v0.8h\n"
        "sqxtn v25.8b, v25.8h\n"
        "sqxtn2 v25.16b, v26.8h\n"
        "smax v25.16b, v25.16b, v1.16b\n"
        "add x12, x13, x21\n"
        "smin v25.16b, v25.16b, v2.16b\n"
        "str s25, [x13]\n"
        "st1 { v25.s }[1], [x12]\n"
        "add x12, x13, x6\n"
        "ushr v24.2d, v24.2d, #8\n"
        "mov v28.16b, v19.16b\n"
        "mov v29.16b, v19.16b\n"
        "mov v30.16b, v19.16b\n"
        "mov v31.16b, v19.16b\n"
        "st1 { v25.s }[2], [x12]\n"
        "add x12, x13, x16\n"
        "subs w14, w14, #1\n"
        "ushr v22.2d, v22.2d, #8\n"
        "ushr v23.2d, v23.2d, #8\n"
        ".word 0x4f98e21c // sdot v28.4s, v16.16b, v24.4b[0]\n"
        ".word 0x4f98e23d // sdot v29.4s, v17.16b, v24.4b[0]\n"
        ".word 0x4f98e25e // sdot v30.4s, v18.16b, v24.4b[0]\n"
        "add x13, x13, %[function_params]\n"
        ".word 0x4f98ea5f // sdot v31.4s, v18.16b, v24.4b[2]\n"
        "st1 { v25.s }[3], [x12]\n"
        "b.ne " DC_KERNEL_MULT_14 "b\n"
        "b " DC_KERNEL_MULT_7 "b\n"
        DC_KERNEL_MULT_15 ":\n"
        "ldr x11, [sp, #72]\n"
        "ldr x6, [sp, #232]\n"
        "mov w12, wzr\n"
        "mov x14, %[scratch_block_data]\n"
        "add x13, x11, x15, lsl #2\n"
        "ldr w16, [sp, #324]\n"
        "cmp w12, w16\n"
        "b.lt " DC_KERNEL_MULT_12 "b\n"
        "b " DC_KERNEL_MULT_7 "b\n"
        // Path taken when the outbound block height is not 4.
        DC_KERNEL_MULT_16 ":\n"
        "ldr w16, [sp, #12]\n"
        "add x11, %[bias_data], #32\n"
        "tbnz w16, #0, " DC_KERNEL_MULT_3 "b\n"
        "ldp x13, x16, [sp, #16]\n"
        "mov x12, x14\n"
        "lsl w12, w12, #3\n"
        "lsl x12, x12, #2\n"
        "add x13, x13, x12\n"
        "add x12, x16, x12\n"
        "ldp q19, q20, [%[bias_data]]\n"
        "ldp q21, q22, [x13]\n"
        "ldp q23, q24, [x12]\n"
        "ldr x15, [sp, #72]\n"
        "ldr %[scratch_block_data], [sp, #304]\n"
        "mov x21, %[output_block_data]\n"
        "mov x14, xzr\n"
        "b " DC_KERNEL_MULT_19 "f\n"
        DC_KERNEL_MULT_18 ":\n"
        "ldr x12, [sp, #80]\n"
        "add x14, x14, #1\n"
        "cmp x14, x12\n"
        "ldr x12, [sp, #256]\n"
        "add x15, x15, x12\n"
        "b.eq " DC_KERNEL_MULT_2 "b\n"
        DC_KERNEL_MULT_19 ":\n"
        "ldr x12, [sp, #264]\n"
        "mov w13, wzr\n"
        "madd x6, x14, %[scratch_block_data], x12\n"
        "ldr w12, [x6]\n"
        "add x16, x6, %[scratch_block_data]\n"
        "fmov s25, w12\n"
        "mov v25.s[1], w12\n"
        "ld1 { v25.s }[2], [x16]\n"
        "ldr x16, [sp, #328]\n"
        "mov v25.s[3], w12\n"
        "add x16, x6, x16\n"
        "ld1r { v26.4s }, [x16]\n"
        "mov x16, x15\n"
        "b " DC_KERNEL_MULT_21 "f\n"
        DC_KERNEL_MULT_20 ":\n"
        "ldr w12, [sp, #324]\n"
        "add w13, w13, #1\n"
        "cmp w13, w12\n"
        "b.eq " DC_KERNEL_MULT_18 "b\n"
        DC_KERNEL_MULT_21 ":\n"
        "ldr %[output_block_data], [sp, #328]\n"
        "add x6, x6, #4\n"
        "mov x12, x6\n"
        "ld1 { v25.s }[1], [x12], %[output_block_data]\n"
        // Temporary 32-bit value: use the W view of the output_block_data
        // operand register (scratch here; the real pointer is parked in x21)
        // instead of a hard-coded, undeclared w3.
        "ldr %w[output_block_data], [sp, #316]\n"
        "ld1 { v26.s }[1], [x12]\n"
        "ldr w12, [sp, #320]\n"
        "cmp w13, %w[output_block_data]\n"
        "add %[output_block_data], x6, %[scratch_block_data]\n"
        "ld1 { v25.s }[3], [%[output_block_data]]\n"
        "csel w12, w12, w10, eq\n"
        "cmp w12, #1\n"
        "b.lt " DC_KERNEL_MULT_20 "b\n"
        DC_KERNEL_MULT_22 ":\n"
        "mov v27.16b, v19.16b\n"
        "mov v28.16b, v20.16b\n"
        ".word 0x4f99e25b // sdot v27.4s, v18.16b, v25.4b[0]\n"
        ".word 0x4f99e0bc // sdot v28.4s, v5.16b, v25.4b[0]\n"
        ".word 0x4f99ea3b // sdot v27.4s, v17.16b, v25.4b[2]\n"
        ".word 0x4f99e8dc // sdot v28.4s, v6.16b, v25.4b[2]\n"
        ".word 0x4f9ae21b // sdot v27.4s, v16.16b, v26.4b[0]\n"
        ".word 0x4f9ae0fc // sdot v28.4s, v7.16b, v26.4b[0]\n"
        "sqrdmulh v27.4s, v27.4s, v23.4s\n"
        "sqrdmulh v28.4s, v28.4s, v24.4s\n"
        "sqrshl v27.4s, v27.4s, v21.4s\n"
        "sqrshl v28.4s, v28.4s, v22.4s\n"
        "sqxtn v27.4h, v27.4s\n"
        "sqxtn2 v27.8h, v28.4s\n"
        "sqadd v27.8h, v27.8h, v0.8h\n"
        "sqxtn v27.8b, v27.8h\n"
        "smax v27.8b, v27.8b, v3.8b\n"
        "smin v27.8b, v27.8b, v4.8b\n"
        "subs w12, w12, #1\n"
        "ushr v25.2d, v25.2d, #8\n"
        "ushr v26.2d, v26.2d, #8\n"
        "str d27, [x16]\n"
        "add x16, x16, %[function_params]\n"
        "b.ne " DC_KERNEL_MULT_22 "b\n"
        "b " DC_KERNEL_MULT_20 "b\n"
        // Common exit: release the stack scratch area.
        DC_KERNEL_MULT_23 ":\n"
        "add sp, sp, #352\n"
        :
        // Outputs. Every operand is read-write because the assembly
        // overwrites all five operand registers (function_params included).
        [ scratch_block_data ] "+r"(scratch_block_data),
        [ filter_workspace ] "+r"(filter_workspace),
        [ bias_data ] "+r"(bias_data),
        [ output_block_data ] "+r"(output_block_data),
        [ function_params ] "+r"(function_params)
        :
        // Inputs: none (all operands are read-write).
        :
        // Clobbers.
        "cc", "memory",
        "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
        "v16", "v17", "v18", "v19", "v20",
        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
        "v31",
        "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
        "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26",
        "x27", "x28");
#undef DC_KERNEL_MULT_1
#undef DC_KERNEL_MULT_2
#undef DC_KERNEL_MULT_3
#undef DC_KERNEL_MULT_4
#undef DC_KERNEL_MULT_5
#undef DC_KERNEL_MULT_6
#undef DC_KERNEL_MULT_7
#undef DC_KERNEL_MULT_8
#undef DC_KERNEL_MULT_9
#undef DC_KERNEL_MULT_10
#undef DC_KERNEL_MULT_11
#undef DC_KERNEL_MULT_12
#undef DC_KERNEL_MULT_13
#undef DC_KERNEL_MULT_14
#undef DC_KERNEL_MULT_15
#undef DC_KERNEL_MULT_16
#undef DC_KERNEL_MULT_17
#undef DC_KERNEL_MULT_18
#undef DC_KERNEL_MULT_19
#undef DC_KERNEL_MULT_20
#undef DC_KERNEL_MULT_21
#undef DC_KERNEL_MULT_22
#undef DC_KERNEL_MULT_23
  }

  // Entry point of this specialization: forwards all arguments unchanged to
  // KernelMacroBlockNeon.
  static inline void Run(const int8* scratch_block_data,
                         const int8* filter_workspace, const int32* bias_data,
                         int8* output_block_data,
                         const DepthwiseConvDotProdParams* function_params) {
    KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
                         output_block_data, function_params);
  }
};
template <>
struct KernelMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
QuantizationType::kPerChannelInt8,
DepthwiseConvDepthMultiplication::kUnitInputDepth,
2> {
static inline void KernelMacroBlockNeon(
const int8* scratch_block_data, const int8* filter_workspace,
const int32* bias_data, int8* output_block_data,
const DepthwiseConvDotProdParams* function_params) {
#define DC_KERNEL_MULT_STRIDE_1 …
#define DC_KERNEL_MULT_STRIDE_2 …
#define DC_KERNEL_MULT_STRIDE_3 …
#define DC_KERNEL_MULT_STRIDE_4 …
#define DC_KERNEL_MULT_STRIDE_5 …
#define DC_KERNEL_MULT_STRIDE_6 …
#define DC_KERNEL_MULT_STRIDE_7 …
#define DC_KERNEL_MULT_STRIDE_8 …
#define DC_KERNEL_MULT_STRIDE_9 …
#define DC_KERNEL_MULT_STRIDE_10 …
#define DC_KERNEL_MULT_STRIDE_11 …
#define DC_KERNEL_MULT_STRIDE_12 …
#define DC_KERNEL_MULT_STRIDE_13 …
#define DC_KERNEL_MULT_STRIDE_14 …
#define DC_KERNEL_MULT_STRIDE_15 …
#define DC_KERNEL_MULT_STRIDE_16 …
#define DC_KERNEL_MULT_STRIDE_17 …
#define DC_KERNEL_MULT_STRIDE_18 …
asm volatile(
"sub sp, sp, #32\n"
"ldr w8, [%[function_params], #" STR(DP_OFFSET_DEPTH_MICRO_REPEATS) "]\n"
"cmp w8, #1\n"
"b.lt " DC_KERNEL_MULT_STRIDE_18 "f\n"
"ldr w7, [%[function_params], #" STR(DP_OFFSET_OUTPUT_RESIDUAL_WIDTH) "]\n"
"ldp w12, w22, [%[function_params], #" STR(DP_OFFSET_OUTPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n"
"ldpsw x10, x11, [%[function_params], #" STR(DP_OFFSET_OUTPUT_HEIGHT_STRIDE) "]\n"
"ldrsw x17, [%[function_params], #" STR(DP_OFFSET_OUTPUT_DEPTH) "]\n"
"add x13, %[function_params], #" STR(DP_OFFSET_OUTPUT_OFFSET) "\n"
"add x14, %[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MAX) "\n"
"add x6, %[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MIN) "\n"
"cmp w7, #2\n"
"ldp x15, x16, [%[function_params], #" STR(DP_OFFSET_OUTPUT_MULTPLIPLIER_PER_CHANNEL) "]\n"
"ldr w4, [%[function_params], #" STR(DP_OFFSET_OUTBOUND_BLOCK_HEIGHT) "]\n"
"ld1r { v0.8h }, [x13]\n"
"ld1r { v1.8b }, [x6]\n"
"ld1r { v2.8b }, [x14]\n"
"ccmp w22, w12, #0, lt\n"
"add x13, x10, x17\n"
"str x22, [sp]\n"
"csel w22, w22, w12, lt\n"
"lsl x6, x11, #1\n"
"add x21, x13, #4\n"
"bic w13, w22, w22, asr #31\n"
"mov x9, xzr\n"
"add x5, %[scratch_block_data], #4\n"
"str w7, [sp, #12]\n"
"add x7, x17, #4\n"
"add x19, x10, #4\n"
"add x20, x6, x11\n"
"lsl x14, x13, #2\n"
"sub x13, x12, x13\n"
"stp x13, x14, [sp, #16]\n"
"b " DC_KERNEL_MULT_STRIDE_3 "f\n"
DC_KERNEL_MULT_STRIDE_2 ":\n"
"add x9, x9, #1\n"
"cmp x9, x8\n"
"b.eq " DC_KERNEL_MULT_STRIDE_18 "f\n"
DC_KERNEL_MULT_STRIDE_3 ":\n"
"lsl w13, w9, #3\n"
"lsl x14, x13, #2\n"
"add x23, x16, x14\n"
"ldp q19, q20, [x23]\n"
"ldr w23, [%[scratch_block_data]]\n"
"add x14, x15, x14\n"
"ldp q21, q22, [x14]\n"
"add x14, %[scratch_block_data], x11\n"
"fmov s23, w23\n"
"mov v23.s[1], w23\n"
"ld1 { v23.s }[2], [x14]\n"
"ldp q3, q4, [%[filter_workspace]]\n"
"ldp q5, q6, [%[filter_workspace], #32]\n"
"ldp q7, q16, [%[filter_workspace], #64]\n"
"ldp q17, q18, [%[bias_data]], #32\n"
"ldr s24, [%[scratch_block_data], x6]\n"
"add %[filter_workspace], x1, #96\n"
"add x25, %[output_block_data], x13\n"
"cmp w4, #2\n"
"mov v23.s[3], w23\n"
"b.ne " DC_KERNEL_MULT_STRIDE_8 "f\n"
"dup v24.4s, v24.s[0]\n"
"add x13, %[scratch_block_data], x20\n"
"add x14, %[scratch_block_data], x11, lsl #2\n"
"ld1 { v24.s }[2], [x13]\n"
"ld1r { v25.4s }, [x14]\n"
"cmp w22, #1\n"
"lsl x26, x11, #2\n"
"b.lt " DC_KERNEL_MULT_STRIDE_12 "f\n"
"mov x27, xzr\n"
"mov x28, x22\n"
DC_KERNEL_MULT_STRIDE_6 ":\n"
"and x13, x27, #0xfffffffc\n"
"add x13, x5, x13\n"
"mov x23, x13\n"
"ld1 { v23.s }[1], [x23], x26\n"
"add x24, x13, x6\n"
"ld1 { v24.s }[1], [x24]\n"
"add x14, x13, x11\n"
"add x24, x13, x20\n"
"ld1 { v23.s }[3], [x14]\n"
"ld1 { v24.s }[3], [x24]\n"
"mov v27.16b, v17.16b\n"
"ld1 { v25.s }[1], [x23]\n"
"mov v28.16b, v17.16b\n"
".word 0x4f97e07b // sdot v27.4s, v3.16b, v23.4b[0]\n"
".word 0x4f98e07c // sdot v28.4s, v3.16b, v24.4b[0]\n"
".word 0x4f97e8bb // sdot v27.4s, v5.16b, v23.4b[2]\n"
".word 0x4f98e8bc // sdot v28.4s, v5.16b, v24.4b[2]\n"
".word 0x4f98e0fb // sdot v27.4s, v7.16b, v24.4b[0]\n"
".word 0x4f99e0fc // sdot v28.4s, v7.16b, v25.4b[0]\n"
"sqrdmulh v27.4s, v27.4s, v21.4s\n"
"sqrdmulh v28.4s, v28.4s, v21.4s\n"
"sqrshl v27.4s, v27.4s, v19.4s\n"
"sqrshl v28.4s, v28.4s, v19.4s\n"
"sqxtn v31.4h, v27.4s\n"
"sqxtn2 v31.8h, v28.4s\n"
"mov v29.16b, v18.16b\n"
"sqadd v28.8h, v31.8h, v0.8h\n"
"mov v30.16b, v18.16b\n"
"sqxtn v28.8b, v28.8h\n"
".word 0x4f97e09d // sdot v29.4s, v4.16b, v23.4b[0]\n"
"add x13, x25, x19\n"
"smax v28.8b, v28.8b, v1.8b\n"
".word 0x4f98e09e // sdot v30.4s, v4.16b, v24.4b[0]\n"
".word 0x4f97e8dd // sdot v29.4s, v6.16b, v23.4b[2]\n"
"sub x23, x13, #4\n"
"smin v28.8b, v28.8b, v2.8b\n"
".word 0x4f98e8de // sdot v30.4s, v6.16b, v24.4b[2]\n"
".word 0x4f98e21d // sdot v29.4s, v16.16b, v24.4b[0]\n"
"str s28, [x25]\n"
"st1 { v28.s }[1], [x23]\n"
".word 0x4f99e21e // sdot v30.4s, v16.16b, v25.4b[0]\n"
"sqrdmulh v28.4s, v29.4s, v22.4s\n"
"sqrdmulh v29.4s, v30.4s, v22.4s\n"
"sqrshl v28.4s, v28.4s, v20.4s\n"
"sqrshl v29.4s, v29.4s, v20.4s\n"
"sqxtn v28.4h, v28.4s\n"
"sqxtn2 v28.8h, v29.4s\n"
"sqadd v28.8h, v28.8h, v0.8h\n"
"sqxtn v28.8b, v28.8h\n"
"smax v28.8b, v28.8b, v1.8b\n"
"smin v28.8b, v28.8b, v2.8b\n"
"mov v26.16b, v17.16b\n"
"str s28, [x25, #4]\n"
"mov v29.16b, v18.16b\n"
"st1 { v28.s }[1], [x13]\n"
"ushr v28.2d, v23.2d, #16\n"
".word 0x4f9ce07a // sdot v26.4s, v3.16b, v28.4b[0]\n"
".word 0x4f9ce09d // sdot v29.4s, v4.16b, v28.4b[0]\n"
"mov v27.16b, v17.16b\n"
"mov v30.16b, v18.16b\n"
".word 0x4f9ce8ba // sdot v26.4s, v5.16b, v28.4b[2]\n"
".word 0x4f9ce8dd // sdot v29.4s, v6.16b, v28.4b[2]\n"
"ushr v28.2d, v24.2d, #16\n"
".word 0x4f9ce07b // sdot v27.4s, v3.16b, v28.4b[0]\n"
".word 0x4f9ce09e // sdot v30.4s, v4.16b, v28.4b[0]\n"
".word 0x4f9ce8bb // sdot v27.4s, v5.16b, v28.4b[2]\n"
".word 0x4f9ce8de // sdot v30.4s, v6.16b, v28.4b[2]\n"
".word 0x4f9ce0fa // sdot v26.4s, v7.16b, v28.4b[0]\n"
".word 0x4f9ce21d // sdot v29.4s, v16.16b, v28.4b[0]\n"
"ushr v28.2d, v25.2d, #16\n"
".word 0x4f9ce0fb // sdot v27.4s, v7.16b, v28.4b[0]\n"
"sqrdmulh v26.4s, v26.4s, v21.4s\n"
"sqrdmulh v27.4s, v27.4s, v21.4s\n"
"sqrshl v26.4s, v26.4s, v19.4s\n"
"sqrshl v27.4s, v27.4s, v19.4s\n"
"sqxtn v26.4h, v26.4s\n"
"sqxtn2 v26.8h, v27.4s\n"
"sqadd v26.8h, v26.8h, v0.8h\n"
".word 0x4f9ce21e // sdot v30.4s, v16.16b, v28.4b[0]\n"
"sqrdmulh v28.4s, v29.4s, v22.4s\n"
"sqxtn v26.8b, v26.8h\n"
"add x24, x25, x21\n"
"sqrdmulh v29.4s, v30.4s, v22.4s\n"
"sqrshl v28.4s, v28.4s, v20.4s\n"
"smax v26.8b, v26.8b, v1.8b\n"
"add x23, x25, x7\n"
"sub x13, x24, #4\n"
"sqrshl v29.4s, v29.4s, v20.4s\n"
"sqxtn v28.4h, v28.4s\n"
"smin v26.8b, v26.8b, v2.8b\n"
"stur s26, [x23, #-4]\n"
"st1 { v26.s }[1], [x13]\n"
"sqxtn2 v28.8h, v29.4s\n"
"sqadd v26.8h, v28.8h, v0.8h\n"
"sqxtn v26.8b, v26.8h\n"
"add x14, x25, x17\n"
"smax v26.8b, v26.8b, v1.8b\n"
"subs x28, x28, #1\n"
"ushr v23.2d, v23.2d, #32\n"
"ushr v24.2d, v24.2d, #32\n"
"ushr v25.2d, v25.2d, #32\n"
"add x25, x14, x17\n"
"smin v26.8b, v26.8b, v2.8b\n"
"add x27, x27, #4\n"
"str s26, [x23]\n"
"st1 { v26.s }[1], [x24]\n"
"b.ne " DC_KERNEL_MULT_STRIDE_6 "b\n"
"mov w13, w22\n"
"cmp w13, w12\n"
"ldp x13, x27, [sp, #16]\n"
"b.lt " DC_KERNEL_MULT_STRIDE_13 "f\n"
"b " DC_KERNEL_MULT_STRIDE_2 "b\n"
DC_KERNEL_MULT_STRIDE_8 ":\n"
"cmp w12, #1\n"
"b.lt " DC_KERNEL_MULT_STRIDE_2 "b\n"
"ldr w13, [sp, #12]\n"
"dup v24.4s, v24.s[0]\n"
"cmp w13, #2\n"
"b.ne " DC_KERNEL_MULT_STRIDE_14 "f\n"
"mov x26, xzr\n"
"mov x13, x12\n"
DC_KERNEL_MULT_STRIDE_11 ":\n"
"and x14, x26, #0xfffffffc\n"
"add x14, x5, x14\n"
"mov x23, x14\n"
"ld1 { v23.s }[1], [x23], x6\n"
"add x14, x14, x11\n"
"mov v26.16b, v17.16b\n"
"mov v27.16b, v18.16b\n"
"ld1 { v24.s }[1], [x23]\n"
"ld1 { v23.s }[3], [x14]\n"
"mov v25.16b, v17.16b\n"
"add x14, x25, x17\n"
"ushr v28.2d, v24.2d, #16\n"
".word 0x4f9ce0fa // sdot v26.4s, v7.16b, v28.4b[0]\n"
".word 0x4f9ce21b // sdot v27.4s, v16.16b, v28.4b[0]\n"
"ushr v28.2d, v23.2d, #16\n"
".word 0x4f9ce07a // sdot v26.4s, v3.16b, v28.4b[0]\n"
".word 0x4f9ce09b // sdot v27.4s, v4.16b, v28.4b[0]\n"
".word 0x4f9ce8ba // sdot v26.4s, v5.16b, v28.4b[2]\n"
".word 0x4f9ce8db // sdot v27.4s, v6.16b, v28.4b[2]\n"
"mov v28.16b, v18.16b\n"
".word 0x4f98e0f9 // sdot v25.4s, v7.16b, v24.4b[0]\n"
".word 0x4f98e21c // sdot v28.4s, v16.16b, v24.4b[0]\n"
".word 0x4f97e079 // sdot v25.4s, v3.16b, v23.4b[0]\n"
".word 0x4f97e09c // sdot v28.4s, v4.16b, v23.4b[0]\n"
".word 0x4f97e8b9 // sdot v25.4s, v5.16b, v23.4b[2]\n"
".word 0x4f97e8dc // sdot v28.4s, v6.16b, v23.4b[2]\n"
"sqrdmulh v25.4s, v25.4s, v21.4s\n"
"sqrdmulh v28.4s, v28.4s, v22.4s\n"
"sqrshl v25.4s, v25.4s, v19.4s\n"
"sqrshl v28.4s, v28.4s, v20.4s\n"
"sqxtn v25.4h, v25.4s\n"
"sqxtn2 v25.8h, v28.4s\n"
"sqadd v25.8h, v25.8h, v0.8h\n"
"sqrdmulh v26.4s, v26.4s, v21.4s\n"
"sqxtn v25.8b, v25.8h\n"
"sqrdmulh v27.4s, v27.4s, v22.4s\n"
"sqrshl v26.4s, v26.4s, v19.4s\n"
"smax v25.8b, v25.8b, v1.8b\n"
"sqrshl v27.4s, v27.4s, v20.4s\n"
"sqxtn v26.4h, v26.4s\n"
"smin v25.8b, v25.8b, v2.8b\n"
"str d25, [x25]\n"
"sqxtn2 v26.8h, v27.4s\n"
"sqadd v25.8h, v26.8h, v0.8h\n"
"sqxtn v25.8b, v25.8h\n"
"smax v25.8b, v25.8b, v1.8b\n"
"smin v25.8b, v25.8b, v2.8b\n"
"subs x13, x13, #1\n"
"ushr v24.2d, v24.2d, #32\n"
"ushr v23.2d, v23.2d, #32\n"
"str d25, [x25, x17]\n"
"add x25, x14, x17\n"
"add x26, x26, #4\n"
"b.ne " DC_KERNEL_MULT_STRIDE_11 "b\n"
"b " DC_KERNEL_MULT_STRIDE_2 "b\n"
DC_KERNEL_MULT_STRIDE_12 ":\n"
"mov w13, wzr\n"
"cmp w13, w12\n"
"ldp x13, x27, [sp, #16]\n"
"b.ge " DC_KERNEL_MULT_STRIDE_2 "b\n"
DC_KERNEL_MULT_STRIDE_13 ":\n"
"and x14, x27, #0xfffffffc\n"
"add x14, x5, x14\n"
"mov x24, x14\n"
"add x23, x14, x6\n"
"ld1 { v23.s }[1], [x24], x26\n"
"ld1 { v24.s }[1], [x23]\n"
"add x23, x14, x11\n"
"add x14, x14, x20\n"
"ld1 { v23.s }[3], [x23]\n"
"ld1 { v24.s }[3], [x14]\n"
"mov v26.16b, v17.16b\n"
"ld1 { v25.s }[1], [x24]\n"
"mov v27.16b, v17.16b\n"
".word 0x4f97e07a // sdot v26.4s, v3.16b, v23.4b[0]\n"
".word 0x4f98e07b // sdot v27.4s, v3.16b, v24.4b[0]\n"
".word 0x4f97e8ba // sdot v26.4s, v5.16b, v23.4b[2]\n"
".word 0x4f98e8bb // sdot v27.4s, v5.16b, v24.4b[2]\n"
".word 0x4f98e0fa // sdot v26.4s, v7.16b, v24.4b[0]\n"
".word 0x4f99e0fb // sdot v27.4s, v7.16b, v25.4b[0]\n"
"sqrdmulh v26.4s, v26.4s, v21.4s\n"
"sqrdmulh v27.4s, v27.4s, v21.4s\n"
"sqrshl v26.4s, v26.4s, v19.4s\n"
"sqrshl v27.4s, v27.4s, v19.4s\n"
"sqxtn v26.4h, v26.4s\n"
"sqxtn2 v26.8h, v27.4s\n"
"sqadd v26.8h, v26.8h, v0.8h\n"
"sqxtn v26.8b, v26.8h\n"
"smax v26.8b, v26.8b, v1.8b\n"
"add x14, x25, x10\n"
"mov v27.16b, v18.16b\n"
"smin v26.8b, v26.8b, v2.8b\n"
"str s26, [x25]\n"
"st1 { v26.s }[1], [x14]\n"
"mov v26.16b, v18.16b\n"
".word 0x4f97e09b // sdot v27.4s, v4.16b, v23.4b[0]\n"
".word 0x4f98e09a // sdot v26.4s, v4.16b, v24.4b[0]\n"
".word 0x4f97e8db // sdot v27.4s, v6.16b, v23.4b[2]\n"
".word 0x4f98e8da // sdot v26.4s, v6.16b, v24.4b[2]\n"
".word 0x4f98e21b // sdot v27.4s, v16.16b, v24.4b[0]\n"
".word 0x4f99e21a // sdot v26.4s, v16.16b, v25.4b[0]\n"
"sqrdmulh v27.4s, v27.4s, v22.4s\n"
"sqrdmulh v26.4s, v26.4s, v22.4s\n"
"sqrshl v27.4s, v27.4s, v20.4s\n"
"sqrshl v26.4s, v26.4s, v20.4s\n"
"sqxtn v27.4h, v27.4s\n"
"sqxtn2 v27.8h, v26.4s\n"
"sqadd v26.8h, v27.8h, v0.8h\n"
"sqxtn v26.8b, v26.8h\n"
"smax v26.8b, v26.8b, v1.8b\n"
"smin v26.8b, v26.8b, v2.8b\n"
"subs x13, x13, #1\n"
"add x14, x14, #4\n"
"ushr v23.2d, v23.2d, #16\n"
"ushr v24.2d, v24.2d, #16\n"
"ushr v25.2d, v25.2d, #16\n"
"str s26, [x25, #4]\n"
"add x25, x25, x17\n"
"add x27, x27, #4\n"
"st1 { v26.s }[1], [x14]\n"
"b.ne " DC_KERNEL_MULT_STRIDE_13 "b\n"
"b " DC_KERNEL_MULT_STRIDE_2 "b\n"
DC_KERNEL_MULT_STRIDE_14 ":\n"
"ldr x27, [sp]\n"
"mov x13, xzr\n"
"mov x26, x12\n"
"b " DC_KERNEL_MULT_STRIDE_16 "f\n"
DC_KERNEL_MULT_STRIDE_15 ":\n"
"add x13, x13, #4\n"
"subs x26, x26, #1\n"
"sub x27, x27, #1\n"
"mov v23.16b, v25.16b\n"
"mov v24.16b, v26.16b\n"
"b.eq " DC_KERNEL_MULT_STRIDE_2 "b\n"
DC_KERNEL_MULT_STRIDE_16 ":\n"
"and x14, x13, #0xfffffffc\n"
"add x14, x5, x14\n"
"mov x23, x14\n"
"ld1 { v23.s }[1], [x23], x6\n"
"add x14, x14, x11\n"
"mov v25.16b, v17.16b\n"
"mov v26.16b, v18.16b\n"
"ld1 { v24.s }[1], [x23]\n"
"ld1 { v23.s }[3], [x14]\n"
".word 0x4f98e0f9 // sdot v25.4s, v7.16b, v24.4b[0]\n"
".word 0x4f98e21a // sdot v26.4s, v16.16b, v24.4b[0]\n"
".word 0x4f97e079 // sdot v25.4s, v3.16b, v23.4b[0]\n"
".word 0x4f97e09a // sdot v26.4s, v4.16b, v23.4b[0]\n"
".word 0x4f97e8b9 // sdot v25.4s, v5.16b, v23.4b[2]\n"
".word 0x4f97e8da // sdot v26.4s, v6.16b, v23.4b[2]\n"
"sqrdmulh v25.4s, v25.4s, v21.4s\n"
"sqrdmulh v26.4s, v26.4s, v22.4s\n"
"sqrshl v25.4s, v25.4s, v19.4s\n"
"sqrshl v26.4s, v26.4s, v20.4s\n"
"sqxtn v27.4h, v25.4s\n"
"sqxtn2 v27.8h, v26.4s\n"
"sqadd v26.8h, v27.8h, v0.8h\n"
"sqxtn v26.8b, v26.8h\n"
"smax v26.8b, v26.8b, v1.8b\n"
"smin v26.8b, v26.8b, v2.8b\n"
"ushr v25.2d, v23.2d, #16\n"
"str d26, [x25]\n"
"ushr v26.2d, v24.2d, #16\n"
"add x25, x25, x17\n"
"cbz x27, " DC_KERNEL_MULT_STRIDE_15 "b\n"
"mov v27.16b, v17.16b\n"
"mov v28.16b, v18.16b\n"
".word 0x4f9ae0fb // sdot v27.4s, v7.16b, v26.4b[0]\n"
".word 0x4f9ae21c // sdot v28.4s, v16.16b, v26.4b[0]\n"
".word 0x4f99e07b // sdot v27.4s, v3.16b, v25.4b[0]\n"
".word 0x4f99e09c // sdot v28.4s, v4.16b, v25.4b[0]\n"
".word 0x4f99e8bb // sdot v27.4s, v5.16b, v25.4b[2]\n"
".word 0x4f99e8dc // sdot v28.4s, v6.16b, v25.4b[2]\n"
"ushr v25.2d, v23.2d, #32\n"
"sqrdmulh v23.4s, v27.4s, v21.4s\n"
"ushr v26.2d, v24.2d, #32\n"
"sqrdmulh v24.4s, v28.4s, v22.4s\n"
"sqrshl v23.4s, v23.4s, v19.4s\n"
"sqrshl v24.4s, v24.4s, v20.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v23.8h, v23.8h, v0.8h\n"
"sqxtn v23.8b, v23.8h\n"
"smax v23.8b, v23.8b, v1.8b\n"
"smin v23.8b, v23.8b, v2.8b\n"
"str d23, [x25]\n"
"add x25, x25, x17\n"
"b " DC_KERNEL_MULT_STRIDE_15 "b\n"
DC_KERNEL_MULT_STRIDE_18 ":\n"
"add sp, sp, #32\n"
:
[ scratch_block_data ] "+r"(scratch_block_data),
[ filter_workspace ] "+r"(filter_workspace),
[ bias_data ] "+r"(bias_data),
[ output_block_data ] "+r"(output_block_data)
:
[ function_params ] "r"(function_params)
:
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31",
"x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
"x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26",
"x27", "x28");
#undef DC_KERNEL_MULT_STRIDE_1
#undef DC_KERNEL_MULT_STRIDE_2
#undef DC_KERNEL_MULT_STRIDE_3
#undef DC_KERNEL_MULT_STRIDE_4
#undef DC_KERNEL_MULT_STRIDE_5
#undef DC_KERNEL_MULT_STRIDE_6
#undef DC_KERNEL_MULT_STRIDE_7
#undef DC_KERNEL_MULT_STRIDE_8
#undef DC_KERNEL_MULT_STRIDE_9
#undef DC_KERNEL_MULT_STRIDE_10
#undef DC_KERNEL_MULT_STRIDE_11
#undef DC_KERNEL_MULT_STRIDE_12
#undef DC_KERNEL_MULT_STRIDE_13
#undef DC_KERNEL_MULT_STRIDE_14
#undef DC_KERNEL_MULT_STRIDE_15
#undef DC_KERNEL_MULT_STRIDE_16
#undef DC_KERNEL_MULT_STRIDE_17
#undef DC_KERNEL_MULT_STRIDE_18
}
// Kernel entry point: passes all five arguments, unchanged and in the same
// order, to KernelMacroBlockNeon and returns nothing.
//
// scratch_block_data, filter_workspace and bias_data are pointers to const
// data; output_block_data is the only pointer to mutable data. function_params
// is a pointer to a const DepthwiseConvDotProdParams.
//
// NOTE(review): presumably this is the uniform `Run` hook that templated
// dispatch code invokes on each kernel specialization -- confirm against
// callers.
static inline void Run(const int8* scratch_block_data,
const int8* filter_workspace, const int32* bias_data,
int8* output_block_data,
const DepthwiseConvDotProdParams* function_params) {
KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
output_block_data, function_params);
}
};
#undef DP_OFFSET_INPUT_DEPTH
#undef DP_OFFSET_OUTPUT_DEPTH
#undef DP_OFFSET_STRIDE
#undef DP_OFFSET_BIAS_INCREMENT
#undef DP_OFFSET_INPUT_OFFSET
#undef DP_OFFSET_OUTPUT_OFFSET
#undef DP_OFFSET_OUTPUT_MULTIPLIER
#undef DP_OFFSET_OUTPUT_SHIFT
#undef DP_OFFSET_QUANTIZED_ACTIVATION_MIN
#undef DP_OFFSET_QUANTIZED_ACTIVATION_MAX
#undef DP_OFFSET_PADDING_LEFT
#undef DP_OFFSET_PADDING_RIGHT
#undef DP_OFFSET_PADDING_TOP
#undef DP_OFFSET_PADDING_BOTTOM
#undef DP_OFFSET_DEPTH_MICRO_REPEATS
#undef DP_OFFSET_WIDTH_MACRO_COUNT
#undef DP_OFFSET_INPUT_WIDTH_OVERALL_MICRO_REPEATS
#undef DP_OFFSET_INPUT_WIDTH_MICRO_REPEATS
#undef DP_OFFSET_RESIDUAL_WIDTH
#undef DP_OFFSET_OUTPUT_WIDTH_OVERALL_MICRO_REPEATS
#undef DP_OFFSET_OUTPUT_WIDTH_MICRO_REPEATS
#undef DP_OFFSET_OUTPUT_RESIDUAL_WIDTH
#undef DP_OFFSET_WORKSPACE_WIDTH_MICRO_REPEATS
#undef DP_OFFSET_HEIGHT_MACRO_COUNT
#undef DP_OFFSET_INBOUND_BLOCK_HEIGHT
#undef DP_OFFSET_OUTBOUND_BLOCK_HEIGHT
#undef DP_OFFSET_INPUT_HEIGHT_STRIDE
#undef DP_OFFSET_OUTPUT_HEIGHT_STRIDE
#undef DP_OFFSET_WORKSPACE_HEIGHT_STRIDE
#undef DP_OFFSET_FOUR_OVER_STRIDE
#endif
// Implementation function parameterized on both the kernel implementation
// and the quantization type.
//
// Template parameters:
//   implementation    - a DepthwiseConvImplementation value.
//   quantization_type - a QuantizationType value; it fixes the element type of
//                       the input, filter and output buffers to
//                       QuantizationTypeImpl<quantization_type>::ExternalType.
//
// Parameters:
//   params       - depthwise convolution parameters (DepthwiseParams).
//   input_shape  / input_data  - shape and const element buffer of the input.
//   filter_shape / filter_data - shape and const element buffer of the filter.
//   bias_shape   / bias_data   - shape and const int32_t buffer of the bias.
//   output_shape / output_data - shape and writable element buffer of the
//                                output.
//   thread_start, thread_end, thread_dim
//                - NOTE(review): presumably the [start, end) slice of work for
//                  the calling thread and the dimension it is taken along --
//                  confirm against the body and callers.
//
// NOTE(review): going by the name, this is the 3x3-filter dot-product path --
// confirm the supported shapes/strides against the body.
template <DepthwiseConvImplementation implementation,
QuantizationType quantization_type>
inline void DepthwiseConvDotProduct3x3Impl(
const DepthwiseParams& params, const RuntimeShape& input_shape,
const typename QuantizationTypeImpl<quantization_type>::ExternalType*
input_data,
const RuntimeShape& filter_shape,
const typename QuantizationTypeImpl<quantization_type>::ExternalType*
filter_data,
const RuntimeShape& bias_shape, const int32_t* bias_data,
const RuntimeShape& output_shape,
typename QuantizationTypeImpl<quantization_type>::ExternalType* output_data,
int thread_start, int thread_end, int thread_dim) { … }
// Variant taking uint8_t input, filter and output buffers, templated only on
// the DepthwiseConvImplementation. Bias values are int32_t.
//
// The parameter list mirrors DepthwiseConvDotProduct3x3Impl with the element
// type fixed to uint8_t.
// NOTE(review): presumably a thin wrapper that selects the matching
// QuantizationType and forwards to DepthwiseConvDotProduct3x3Impl -- confirm
// against the body.
// NOTE(review): thread_start / thread_end / thread_dim presumably describe
// the slice of work assigned to the calling thread -- confirm.
template <DepthwiseConvImplementation implementation>
inline void DepthwiseConvDotProduct3x3(
const DepthwiseParams& params, const RuntimeShape& input_shape,
const uint8_t* input_data, const RuntimeShape& filter_shape,
const uint8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
uint8_t* output_data, int thread_start, int thread_end, int thread_dim) { … }
// Variant taking int8_t input, filter and output buffers, templated only on
// the DepthwiseConvImplementation. Bias values are int32_t.
//
// The parameter list mirrors DepthwiseConvDotProduct3x3Impl with the element
// type fixed to int8_t.
// NOTE(review): the "PerChannel" suffix suggests per-output-channel
// quantization parameters are used here -- confirm against the body.
// NOTE(review): presumably a thin wrapper that selects the matching
// QuantizationType and forwards to DepthwiseConvDotProduct3x3Impl -- confirm.
// NOTE(review): thread_start / thread_end / thread_dim presumably describe
// the slice of work assigned to the calling thread -- confirm.
template <DepthwiseConvImplementation implementation>
inline void DepthwiseConvDotProduct3x3PerChannel(
const DepthwiseParams& params, const RuntimeShape& input_shape,
const int8_t* input_data, const RuntimeShape& filter_shape,
const int8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
int8_t* output_data, int thread_start, int thread_end, int thread_dim) { … }
#undef vst1_lane_8x4
#undef vst1q_lane_8x4
#undef vld1q_lane_s8x8
#undef vld1_lane_8x4
#undef vld1q_lane_8x4
#undef vld1q_dup_s8x4
#undef STR
#undef STR_UNEXPANDED
}
}
}
#endif