#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_3X3_FILTER_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_3X3_FILTER_H_
#include <stddef.h>
#include <memory>
#include "ruy/profiler/instrumentation.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace optimized_ops {
namespace depthwise_conv {
#define STR …
#define STR_UNEXPANDED …
#if defined(__aarch64__) && !defined(GOOGLE_L4T)
#define OFFSET_INPUT_DEPTH …
#define OFFSET_INPUT_ROW_SIZE …
#define OFFSET_OUTPUT_DEPTH …
#define OFFSET_OUTPUT_ROW_SIZE …
#define OFFSET_FILTER_ROW_SIZE …
#define OFFSET_INPUT_OFFSET …
#define OFFSET_OUTPUT_OFFSET …
#define OFFSET_OUTPUT_MULTIPLIER …
#define OFFSET_OUTPUT_ACTIVATION_MIN …
#define OFFSET_OUTPUT_ACTIVATION_MAX …
#define OFFSET_OUTPUT_RIGHT_SHIFT …
#define OFFSET_INPUT_WIDTH …
#define OFFSET_INPUT_HEIGHT …
#define OFFSET_STRIDE_WIDTH …
#define OFFSET_STRIDE_HEIGHT …
#define OFFSET_OUTPUT_WIDTH …
#define OFFSET_OUTPUT_HEIGHT …
// Compile-time layout guards for DepthwiseConvParams.
//
// The inline-assembly kernels in this file address fields of the params struct
// with literal byte offsets, e.g. [params_ptr, #OFFSET_INPUT_OFFSET]. Each
// assertion below pins one OFFSET_* constant to the real position of the
// matching field, so a change to the struct layout breaks the build instead
// of making the assembly read the wrong field.

// Depth and row-size fields.
static_assert(
    offsetof(DepthwiseConvParams, input_depth) == OFFSET_INPUT_DEPTH, "");
static_assert(
    offsetof(DepthwiseConvParams, input_row_size) == OFFSET_INPUT_ROW_SIZE, "");
static_assert(
    offsetof(DepthwiseConvParams, output_depth) == OFFSET_OUTPUT_DEPTH, "");
static_assert(
    offsetof(DepthwiseConvParams, output_row_size) == OFFSET_OUTPUT_ROW_SIZE,
    "");
static_assert(
    offsetof(DepthwiseConvParams, filter_row_size) == OFFSET_FILTER_ROW_SIZE,
    "");

// Quantization fields.
static_assert(
    offsetof(DepthwiseConvParams, input_offset) == OFFSET_INPUT_OFFSET, "");
static_assert(
    offsetof(DepthwiseConvParams, output_offset) == OFFSET_OUTPUT_OFFSET, "");
static_assert(
    offsetof(DepthwiseConvParams, output_multiplier) ==
        OFFSET_OUTPUT_MULTIPLIER,
    "");
static_assert(
    offsetof(DepthwiseConvParams, output_activation_min) ==
        OFFSET_OUTPUT_ACTIVATION_MIN,
    "");
static_assert(
    offsetof(DepthwiseConvParams, output_activation_max) ==
        OFFSET_OUTPUT_ACTIVATION_MAX,
    "");
static_assert(
    offsetof(DepthwiseConvParams, output_right_shift) ==
        OFFSET_OUTPUT_RIGHT_SHIFT,
    "");

// Input geometry and strides.
static_assert(
    offsetof(DepthwiseConvParams, input_width) == OFFSET_INPUT_WIDTH, "");
static_assert(
    offsetof(DepthwiseConvParams, input_height) == OFFSET_INPUT_HEIGHT, "");
static_assert(
    offsetof(DepthwiseConvParams, stride_width) == OFFSET_STRIDE_WIDTH, "");
static_assert(
    offsetof(DepthwiseConvParams, stride_height) == OFFSET_STRIDE_HEIGHT, "");

// Output geometry.
static_assert(
    offsetof(DepthwiseConvParams, output_width) == OFFSET_OUTPUT_WIDTH, "");
static_assert(
    offsetof(DepthwiseConvParams, output_height) == OFFSET_OUTPUT_HEIGHT, "");
// Specialization of DepthwiseConvWindowPerChannel for
// DepthwiseConvOutputRounding::kUpward and template arguments <8, 1, 1>.
// NOTE(review): the primary template is declared outside this chunk; the
// trailing arguments are presumably depth = 8, stride_width = 1 and
// stride_height = 1 (consistent with the 8-byte vector loads and the
// 2 * input_depth column step below) -- confirm against the common header.
template <>
struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 1,
1> {
public:
// Computes an output_window_height x output_window_width window of int8
// outputs for 8 channels of a 3x3 depthwise convolution, using AArch64 NEON
// inline assembly.
//
// For every output position the kernel starts the int32 accumulators from the
// bias, adds the nine products filter_tap * (input + input_offset), scales
// each lane with sqrdmulh (multiplier) and sqrshl (shift), narrows with
// saturation, adds output_offset, narrows to int8 and clamps to
// [output_activation_min, output_activation_max].
//
// Parameters:
//   output_multiplier_ptr  8 int32 per-lane multipliers (loaded once).
//   output_shift_ptr       8 int32 per-lane sqrshl shift operands (loaded
//                          once).
//   input_ptr              int8 input at the top-left of the window.
//   filter_ptr             nine 8-byte filter taps, read output_depth bytes
//                          apart.
//   bias_ptr               8 int32 biases, used as initial accumulators.
//   output_ptr             int8 output; columns are output_depth bytes apart
//                          and rows output_row_size bytes apart (both taken
//                          from params_ptr).
//   input_depth            byte step between horizontally adjacent inputs.
//   input_row_size         byte step between vertically adjacent input rows.
//   output_window_height   number of output rows to produce; values < 1 fall
//                          through both height checks and produce nothing.
//   output_window_width    number of output columns to produce.
//                          NOTE(review): only widths 1 and 2 are special-cased
//                          before the main width loop, so the code appears to
//                          assume output_window_width >= 1 -- confirm callers.
//   params_ptr             supplies input_offset, output_offset,
//                          output_activation_min/max, output_depth and
//                          output_row_size; filter_offset must be 0.
//
// input_ptr, filter_ptr, output_ptr and output_window_height are passed by
// value and are modified by the asm as read-write operands; the caller's
// variables are not affected.
static inline void Run(const int32* output_multiplier_ptr,
const int32* output_shift_ptr, const int8* input_ptr,
const int8* filter_ptr, const int32* bias_ptr,
int8* output_ptr, int64_t input_depth,
int64_t input_row_size, int32 output_window_height,
int32 output_window_width,
const DepthwiseConvParams* params_ptr) {
// The width loops emit two output columns and the two-row loop emits two
// output rows per iteration, so all three steps are doubled.
const int64_t input_width_increment = 2 * input_depth;
const int64_t input_height_increment = 2 * input_row_size;
const int64_t output_height_increment = 2 * params_ptr->output_row_size;
// The asm never applies a filter offset (filter taps are only sign-extended),
// so a non-zero filter_offset is not supported here.
TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
// Local asm labels; they are referenced below with an "f" (forward) or "b"
// (backward) suffix.
#define DEPTHWISECONV_LABEL_HEIGHT_2_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_1 …
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_1_END …
// Register usage inside the asm:
//   v0-v8    nine filter taps, sign-extended to int16.
//   v9-v20   input vectors, widened to int16 with input_offset added.
//   v21/v22  int32 accumulators (lanes 0-3 / 4-7) of the first output.
//   v23/v24  int32 accumulators (lanes 0-3 / 4-7) of the second output;
//            v24 is also reused to hold the activation max for the clamp.
//   v25      output_activation_min in every byte lane.
//   v26      input_offset in every int16 lane.
//   v27/v28  output multipliers (lanes 0-3 / 4-7).
//   v29      output_offset in every int16 lane.
//   v30/v31  output shifts (lanes 0-3 / 4-7).
//   w0       output_activation_max, w4 output_activation_min.
//   x1       output_row_size, x3 output_depth.
//   w5       remaining output columns in the current row (pair).
//   x6/x7    output pointers for the first/second output row.
//   x10      bias_ptr + 16 bytes (bias for lanes 4-7).
//   x11      input base of the current column position (two-row path).
//   x12-x15  pointers into four consecutive input rows.
asm volatile(
// Load the quantization parameters and strides from *params_ptr. The
// flags set by this cmp are consumed by the blt after the filter loads.
"ldr w9, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
"ldr x3, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
"cmp %w[output_window_height], #2\n"
"dup v26.8h, w9\n"
"ldr w2, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
"dup v29.8h, w2\n"
"ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
"ldr w0, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
"add x10, %[bias_ptr], #16\n"
"ldr x1, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n"
"dup v25.16b, w4\n"
"ld1 {v27.4s, v28.4s}, [%[output_multiplier_ptr]]\n"
"ld1 {v30.4s, v31.4s}, [%[output_shift_ptr]]\n"
// Load the nine filter taps (stepping by output_depth) and sign-extend
// each from int8 to int16. They stay resident in v0-v8 for the whole call.
"ld1 {v0.8b}, [%[filter_ptr]], x3\n"
"ld1 {v1.8b}, [%[filter_ptr]], x3\n"
"sshll v0.8h, v0.8b, #0\n"
"ld1 {v2.8b}, [%[filter_ptr]], x3\n"
"sshll v1.8h, v1.8b, #0\n"
"ld1 {v3.8b}, [%[filter_ptr]], x3\n"
"sshll v2.8h, v2.8b, #0\n"
"ld1 {v4.8b}, [%[filter_ptr]], x3\n"
"sshll v3.8h, v3.8b, #0\n"
"ld1 {v5.8b}, [%[filter_ptr]], x3\n"
"sshll v4.8h, v4.8b, #0\n"
"ld1 {v6.8b}, [%[filter_ptr]], x3\n"
"sshll v5.8h, v5.8b, #0\n"
"ld1 {v7.8b}, [%[filter_ptr]], x3\n"
"sshll v6.8h, v6.8b, #0\n"
"ld1 {v8.8b}, [%[filter_ptr]], x3\n"
"sshll v7.8h, v7.8b, #0\n"
"sshll v8.8h, v8.8b, #0\n"
// Fewer than two output rows requested: skip the two-row loop.
"blt " DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "f\n"
// ---- Two output rows per iteration --------------------------------------
// Four input rows (x12..x15) feed two output rows (x6, x7). The prologue
// loads the first three columns of each row, adds the input offset while
// widening, and preloads the bias into both accumulator pairs.
DEPTHWISECONV_LABEL_HEIGHT_2_LOOP ":\n"
"mov x11, %[input_ptr]\n"
"mov x12, x11\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"add x13, x11, %[input_row_size]\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"add x14, x13, %[input_row_size]\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"add x15, x14, %[input_row_size]\n"
"ld1 {v12.8b}, [x13], %[input_depth]\n"
"mov w5, %w[output_window_width]\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"mov x6, %[output_ptr]\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"add x7, %[output_ptr], x1\n"
"ld1 {v15.8b}, [x14], %[input_depth]\n"
// Flags from this cmp are consumed by the beq after the prologue; none of
// the instructions in between write the condition flags.
"cmp w5, #2\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"ld1 {v16.8b}, [x14], %[input_depth]\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"ld1 {v17.8b}, [x14], %[input_depth]\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"ld1 {v18.8b}, [x15], %[input_depth]\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"ld1 {v19.8b}, [x15], %[input_depth]\n"
"saddw v13.8h, v26.8h, v13.8b\n"
"ld1 {v20.8b}, [x15], %[input_depth]\n"
"saddw v14.8h, v26.8h, v14.8b\n"
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"saddw v15.8h, v26.8h, v15.8b\n"
"ld1 {v22.4s}, [x10]\n"
"saddw v16.8h, v26.8h, v16.8b\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
"saddw v17.8h, v26.8h, v17.8b\n"
"ld1 {v24.4s}, [x10]\n"
"saddw v18.8h, v26.8h, v18.8b\n"
"saddw v19.8h, v26.8h, v19.8b\n"
"saddw v20.8h, v26.8h, v20.8b\n"
// Dispatch on the row width: exactly 2 or exactly 1 columns go straight to
// the leftover blocks, everything else enters the main width loop.
"beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "f\n"
"cmp w5, #1\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n"
// Main width loop: each iteration produces a 2x2 block of outputs.
// First half: outputs for the current column in both rows
// (v21/v22 = upper row, v23/v24 = lower row), using input columns 0..2.
// The fourth input column is loaded into v9/v12/v15/v18 as soon as the old
// contents of those registers have been consumed.
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"subs w5, w5, #2\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
// Loop-continue test (w5 >= 3 after the decrement); the flags are consumed
// by the bge at the bottom of the loop.
"cmp w5, #3\n"
"smlal v23.4s, v0.4h, v12.4h\n"
"ld1 {v9.8b}, [x12]\n"
"smlal2 v24.4s, v0.8h, v12.8h\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"smlal v23.4s, v1.4h, v13.4h\n"
"smlal2 v24.4s, v1.8h, v13.8h\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"smlal v23.4s, v2.4h, v14.4h\n"
"smlal2 v24.4s, v2.8h, v14.8h\n"
"smlal v21.4s, v3.4h, v12.4h\n"
"smlal2 v22.4s, v3.8h, v12.8h\n"
"ld1 {v12.8b}, [x13]\n"
"smlal v23.4s, v3.4h, v15.4h\n"
"smlal2 v24.4s, v3.8h, v15.8h\n"
"smlal v21.4s, v4.4h, v13.4h\n"
"smlal2 v22.4s, v4.8h, v13.8h\n"
"smlal v23.4s, v4.4h, v16.4h\n"
"smlal2 v24.4s, v4.8h, v16.8h\n"
"smlal v21.4s, v5.4h, v14.4h\n"
"smlal2 v22.4s, v5.8h, v14.8h\n"
"smlal v23.4s, v5.4h, v17.4h\n"
"smlal2 v24.4s, v5.8h, v17.8h\n"
"smlal v21.4s, v6.4h, v15.4h\n"
"smlal2 v22.4s, v6.8h, v15.8h\n"
"ld1 {v15.8b}, [x14]\n"
"smlal v23.4s, v6.4h, v18.4h\n"
"smlal2 v24.4s, v6.8h, v18.8h\n"
"ld1 {v18.8b}, [x15]\n"
"smlal v21.4s, v7.4h, v16.4h\n"
"smlal2 v22.4s, v7.8h, v16.8h\n"
"smlal v23.4s, v7.4h, v19.4h\n"
"smlal2 v24.4s, v7.8h, v19.8h\n"
"smlal v21.4s, v8.4h, v17.4h\n"
"smlal2 v22.4s, v8.8h, v17.8h\n"
"smlal v23.4s, v8.4h, v20.4h\n"
"smlal2 v24.4s, v8.8h, v20.8h\n"
// Requantize: per-lane multiplier, per-lane shift, saturating narrow to
// int16, add output offset, saturating narrow to int8 (both outputs packed
// into v21), then clamp to the activation range.
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v28.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v28.4s\n"
"sqrshl v21.4s, v21.4s, v30.4s\n"
"sqrshl v22.4s, v22.4s, v31.4s\n"
"sqrshl v23.4s, v23.4s, v30.4s\n"
"sqrshl v24.4s, v24.4s, v31.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtn v21.8b, v21.8h\n"
"sqxtn2 v21.16b, v23.8h\n"
// v24 temporarily holds the activation max; it is refilled with the bias
// right after the clamp.
"dup v24.16b, w0\n"
"ld1 {v22.4s}, [x10]\n"
"smax v21.16b, v21.16b, v25.16b\n"
"smin v21.16b, v21.16b, v24.16b\n"
"ld1 {v24.4s}, [x10]\n"
"saddw v9.8h, v26.8h, v9.8b\n"
// Low 8 bytes go to the upper output row, high 8 bytes to the lower one.
"st1 {v21.8b}, [x6], x3\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [x7], x3\n"
"saddw v15.8h, v26.8h, v15.8b\n"
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"saddw v18.8h, v26.8h, v18.8b\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
// Second half: outputs for the next column in both rows, using input
// columns 1..3 (column 3 now lives in v9/v12/v15/v18). Interleaved with the
// arithmetic, the row pointers advance by two columns and the three columns
// needed by the next iteration are loaded as registers become free.
"smlal v21.4s, v0.4h, v10.4h\n"
"add x11, x11, %[input_width_increment]\n"
"smlal2 v22.4s, v0.8h, v10.8h\n"
"mov x12, x11\n"
"smlal v23.4s, v0.4h, v13.4h\n"
"add x13, x11, %[input_row_size]\n"
"smlal2 v24.4s, v0.8h, v13.8h\n"
"add x14, x13, %[input_row_size]\n"
"smlal v21.4s, v1.4h, v11.4h\n"
"add x15, x14, %[input_row_size]\n"
"smlal2 v22.4s, v1.8h, v11.8h\n"
"smlal v23.4s, v1.4h, v14.4h\n"
"smlal2 v24.4s, v1.8h, v14.8h\n"
"smlal v21.4s, v2.4h, v9.4h\n"
"smlal2 v22.4s, v2.8h, v9.8h\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v2.4h, v12.4h\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"smlal2 v24.4s, v2.8h, v12.8h\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"smlal v21.4s, v3.4h, v13.4h\n"
"smlal2 v22.4s, v3.8h, v13.8h\n"
"smlal v23.4s, v3.4h, v16.4h\n"
"smlal2 v24.4s, v3.8h, v16.8h\n"
"smlal v21.4s, v4.4h, v14.4h\n"
"smlal2 v22.4s, v4.8h, v14.8h\n"
"smlal v23.4s, v4.4h, v17.4h\n"
"smlal2 v24.4s, v4.8h, v17.8h\n"
"smlal v21.4s, v5.4h, v12.4h\n"
"smlal2 v22.4s, v5.8h, v12.8h\n"
"ld1 {v12.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v5.4h, v15.4h\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"smlal2 v24.4s, v5.8h, v15.8h\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"smlal v21.4s, v6.4h, v16.4h\n"
"smlal2 v22.4s, v6.8h, v16.8h\n"
"smlal v23.4s, v6.4h, v19.4h\n"
"smlal2 v24.4s, v6.8h, v19.8h\n"
"smlal v21.4s, v7.4h, v17.4h\n"
"smlal2 v22.4s, v7.8h, v17.8h\n"
"smlal v23.4s, v7.4h, v20.4h\n"
"smlal2 v24.4s, v7.8h, v20.8h\n"
"smlal v21.4s, v8.4h, v15.4h\n"
"smlal2 v22.4s, v8.8h, v15.8h\n"
"ld1 {v15.8b}, [x14], %[input_depth]\n"
"smlal v23.4s, v8.4h, v18.4h\n"
"ld1 {v16.8b}, [x14], %[input_depth]\n"
"smlal2 v24.4s, v8.8h, v18.8h\n"
"ld1 {v17.8b}, [x14], %[input_depth]\n"
// Requantize the second pair of outputs while the remaining loads for the
// next iteration are issued.
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"ld1 {v18.8b}, [x15], %[input_depth]\n"
"sqrdmulh v22.4s, v22.4s, v28.4s\n"
"ld1 {v19.8b}, [x15], %[input_depth]\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"ld1 {v20.8b}, [x15], %[input_depth]\n"
"sqrdmulh v24.4s, v24.4s, v28.4s\n"
"sqrshl v21.4s, v21.4s, v30.4s\n"
"sqrshl v22.4s, v22.4s, v31.4s\n"
"sqrshl v23.4s, v23.4s, v30.4s\n"
"sqrshl v24.4s, v24.4s, v31.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtn v21.8b, v21.8h\n"
"sqxtn2 v21.16b, v23.8h\n"
"dup v24.16b, w0\n"
"ld1 {v22.4s}, [x10]\n"
"smax v21.16b, v21.16b, v25.16b\n"
"smin v21.16b, v21.16b, v24.16b\n"
"ld1 {v24.4s}, [x10]\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"st1 {v21.8b}, [x6], x3\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [x7], x3\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"saddw v13.8h, v26.8h, v13.8b\n"
"saddw v14.8h, v26.8h, v14.8b\n"
"saddw v15.8h, v26.8h, v15.8b\n"
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"saddw v16.8h, v26.8h, v16.8b\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
"saddw v17.8h, v26.8h, v17.8b\n"
"saddw v18.8h, v26.8h, v18.8b\n"
"saddw v19.8h, v26.8h, v19.8b\n"
"saddw v20.8h, v26.8h, v20.8b\n"
// Continue while at least 3 columns remain (flags from "cmp w5, #3" above).
"bge " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "b\n"
// One or two columns remain at this point.
"cmp w5, #2\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n"
// Final 2x2 block of the row pair: same arithmetic as one loop iteration,
// but without advancing the pointers or loading data for a next iteration.
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"smlal v23.4s, v0.4h, v12.4h\n"
"ld1 {v9.8b}, [x12]\n"
"smlal2 v24.4s, v0.8h, v12.8h\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"smlal v23.4s, v1.4h, v13.4h\n"
"smlal2 v24.4s, v1.8h, v13.8h\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"smlal v23.4s, v2.4h, v14.4h\n"
"smlal2 v24.4s, v2.8h, v14.8h\n"
"smlal v21.4s, v3.4h, v12.4h\n"
"smlal2 v22.4s, v3.8h, v12.8h\n"
"ld1 {v12.8b}, [x13]\n"
"smlal v23.4s, v3.4h, v15.4h\n"
"smlal2 v24.4s, v3.8h, v15.8h\n"
"smlal v21.4s, v4.4h, v13.4h\n"
"smlal2 v22.4s, v4.8h, v13.8h\n"
"smlal v23.4s, v4.4h, v16.4h\n"
"smlal2 v24.4s, v4.8h, v16.8h\n"
"smlal v21.4s, v5.4h, v14.4h\n"
"smlal2 v22.4s, v5.8h, v14.8h\n"
"smlal v23.4s, v5.4h, v17.4h\n"
"smlal2 v24.4s, v5.8h, v17.8h\n"
"smlal v21.4s, v6.4h, v15.4h\n"
"smlal2 v22.4s, v6.8h, v15.8h\n"
"ld1 {v15.8b}, [x14]\n"
"smlal v23.4s, v6.4h, v18.4h\n"
"smlal2 v24.4s, v6.8h, v18.8h\n"
"ld1 {v18.8b}, [x15]\n"
"smlal v21.4s, v7.4h, v16.4h\n"
"smlal2 v22.4s, v7.8h, v16.8h\n"
"smlal v23.4s, v7.4h, v19.4h\n"
"smlal2 v24.4s, v7.8h, v19.8h\n"
"smlal v21.4s, v8.4h, v17.4h\n"
"smlal2 v22.4s, v8.8h, v17.8h\n"
"smlal v23.4s, v8.4h, v20.4h\n"
"smlal2 v24.4s, v8.8h, v20.8h\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v28.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v28.4s\n"
"sqrshl v21.4s, v21.4s, v30.4s\n"
"sqrshl v22.4s, v22.4s, v31.4s\n"
"sqrshl v23.4s, v23.4s, v30.4s\n"
"sqrshl v24.4s, v24.4s, v31.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtn v21.8b, v21.8h\n"
"sqxtn2 v21.16b, v23.8h\n"
"dup v24.16b, w0\n"
"ld1 {v22.4s}, [x10]\n"
"smax v21.16b, v21.16b, v25.16b\n"
"smin v21.16b, v21.16b, v24.16b\n"
"ld1 {v24.4s}, [x10]\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"st1 {v21.8b}, [x6], x3\n"
"mov v23.d[0], v21.d[1]\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"st1 {v23.8b}, [x7], x3\n"
"saddw v15.8h, v26.8h, v15.8b\n"
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"saddw v18.8h, v26.8h, v18.8b\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
// Second (last) column of the final 2x2 block.
"smlal v21.4s, v0.4h, v10.4h\n"
"smlal2 v22.4s, v0.8h, v10.8h\n"
"smlal v23.4s, v0.4h, v13.4h\n"
"smlal2 v24.4s, v0.8h, v13.8h\n"
"smlal v21.4s, v1.4h, v11.4h\n"
"smlal2 v22.4s, v1.8h, v11.8h\n"
"smlal v23.4s, v1.4h, v14.4h\n"
"smlal2 v24.4s, v1.8h, v14.8h\n"
"smlal v21.4s, v2.4h, v9.4h\n"
"smlal2 v22.4s, v2.8h, v9.8h\n"
"smlal v23.4s, v2.4h, v12.4h\n"
"smlal2 v24.4s, v2.8h, v12.8h\n"
"smlal v21.4s, v3.4h, v13.4h\n"
"smlal2 v22.4s, v3.8h, v13.8h\n"
"smlal v23.4s, v3.4h, v16.4h\n"
"smlal2 v24.4s, v3.8h, v16.8h\n"
"smlal v21.4s, v4.4h, v14.4h\n"
"smlal2 v22.4s, v4.8h, v14.8h\n"
"smlal v23.4s, v4.4h, v17.4h\n"
"smlal2 v24.4s, v4.8h, v17.8h\n"
"smlal v21.4s, v5.4h, v12.4h\n"
"smlal2 v22.4s, v5.8h, v12.8h\n"
"smlal v23.4s, v5.4h, v15.4h\n"
"smlal2 v24.4s, v5.8h, v15.8h\n"
"smlal v21.4s, v6.4h, v16.4h\n"
"smlal2 v22.4s, v6.8h, v16.8h\n"
"smlal v23.4s, v6.4h, v19.4h\n"
"smlal2 v24.4s, v6.8h, v19.8h\n"
"smlal v21.4s, v7.4h, v17.4h\n"
"smlal2 v22.4s, v7.8h, v17.8h\n"
"smlal v23.4s, v7.4h, v20.4h\n"
"smlal2 v24.4s, v7.8h, v20.8h\n"
"smlal v21.4s, v8.4h, v15.4h\n"
"smlal2 v22.4s, v8.8h, v15.8h\n"
"smlal v23.4s, v8.4h, v18.4h\n"
"smlal2 v24.4s, v8.8h, v18.8h\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v28.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v28.4s\n"
"sqrshl v21.4s, v21.4s, v30.4s\n"
"sqrshl v22.4s, v22.4s, v31.4s\n"
"sqrshl v23.4s, v23.4s, v30.4s\n"
"sqrshl v24.4s, v24.4s, v31.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtn v21.8b, v21.8h\n"
"sqxtn2 v21.16b, v23.8h\n"
"dup v24.16b, w0\n"
"smax v21.16b, v21.16b, v25.16b\n"
"smin v21.16b, v21.16b, v24.16b\n"
"st1 {v21.8b}, [x6], x3\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [x7], x3\n"
"b " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "f\n"
// Final single column of the row pair: one output in each of the two rows,
// computed from the three already-loaded input columns.
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"smlal v23.4s, v0.4h, v12.4h\n"
"smlal2 v24.4s, v0.8h, v12.8h\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"smlal v23.4s, v1.4h, v13.4h\n"
"smlal2 v24.4s, v1.8h, v13.8h\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"smlal v23.4s, v2.4h, v14.4h\n"
"smlal2 v24.4s, v2.8h, v14.8h\n"
"smlal v21.4s, v3.4h, v12.4h\n"
"smlal2 v22.4s, v3.8h, v12.8h\n"
"smlal v23.4s, v3.4h, v15.4h\n"
"smlal2 v24.4s, v3.8h, v15.8h\n"
"smlal v21.4s, v4.4h, v13.4h\n"
"smlal2 v22.4s, v4.8h, v13.8h\n"
"smlal v23.4s, v4.4h, v16.4h\n"
"smlal2 v24.4s, v4.8h, v16.8h\n"
"smlal v21.4s, v5.4h, v14.4h\n"
"smlal2 v22.4s, v5.8h, v14.8h\n"
"smlal v23.4s, v5.4h, v17.4h\n"
"smlal2 v24.4s, v5.8h, v17.8h\n"
"smlal v21.4s, v6.4h, v15.4h\n"
"smlal2 v22.4s, v6.8h, v15.8h\n"
"smlal v23.4s, v6.4h, v18.4h\n"
"smlal2 v24.4s, v6.8h, v18.8h\n"
"smlal v21.4s, v7.4h, v16.4h\n"
"smlal2 v22.4s, v7.8h, v16.8h\n"
"smlal v23.4s, v7.4h, v19.4h\n"
"smlal2 v24.4s, v7.8h, v19.8h\n"
"smlal v21.4s, v8.4h, v17.4h\n"
"smlal2 v22.4s, v8.8h, v17.8h\n"
"smlal v23.4s, v8.4h, v20.4h\n"
"smlal2 v24.4s, v8.8h, v20.8h\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v28.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v28.4s\n"
"sqrshl v21.4s, v21.4s, v30.4s\n"
"sqrshl v22.4s, v22.4s, v31.4s\n"
"sqrshl v23.4s, v23.4s, v30.4s\n"
"sqrshl v24.4s, v24.4s, v31.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtn v21.8b, v21.8h\n"
"sqxtn2 v21.16b, v23.8h\n"
"dup v24.16b, w0\n"
"smax v21.16b, v21.16b, v25.16b\n"
"smin v21.16b, v21.16b, v24.16b\n"
"st1 {v21.8b}, [x6], x3\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [x7], x3\n"
// End of one row pair: step input and output down by two rows and repeat
// while at least two output rows remain. The flags set by subs are
// immediately overwritten by the cmp that feeds the bge.
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP ":\n"
"subs %w[output_window_height], %w[output_window_height], #2\n"
"add %[input_ptr], %[input_ptr], %[input_height_increment]\n"
"cmp %w[output_window_height], #2\n"
"add %[output_ptr], %[output_ptr], %[output_height_increment]\n"
"bge " DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "b\n"
// ---- At most one output row left ----------------------------------------
DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP ":\n"
"cmp %w[output_window_height], #1\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
// Single-row path: three input rows (x12, x13, x14) feed one output row.
// Here v21/v22 and v23/v24 accumulate two horizontally adjacent outputs.
// Input columns 0..2 live in v9-v11 / v13-v15 / v17-v19, and the fourth
// column is loaded into v12 / v16 / v20 inside the width blocks.
// x15 and x7 are still computed but are not read by any instruction of this
// path.
DEPTHWISECONV_LABEL_HEIGHT_1 ":\n"
"mov x12, %[input_ptr]\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"add x13, %[input_ptr], %[input_row_size]\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"add x14, x13, %[input_row_size]\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"add x15, x14, %[input_row_size]\n"
"mov w5, %w[output_window_width]\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"mov x6, %[output_ptr]\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"add x7, %[output_ptr], x1\n"
"ld1 {v15.8b}, [x13], %[input_depth]\n"
// Flags are consumed by the beq after the prologue.
"cmp w5, #2\n"
"ld1 {v17.8b}, [x14], %[input_depth]\n"
"ld1 {v18.8b}, [x14], %[input_depth]\n"
"ld1 {v19.8b}, [x14], %[input_depth]\n"
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"ld1 {v22.4s}, [x10]\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
"ld1 {v24.4s}, [x10]\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"saddw v13.8h, v26.8h, v13.8b\n"
"saddw v14.8h, v26.8h, v14.8b\n"
"saddw v15.8h, v26.8h, v15.8b\n"
"saddw v17.8h, v26.8h, v17.8b\n"
"saddw v18.8h, v26.8h, v18.8b\n"
"saddw v19.8h, v26.8h, v19.8b\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER "f\n"
"cmp w5, #1\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n"
// Main width loop of the single-row path: two adjacent outputs per
// iteration. The fourth input column is loaded first; afterwards input_ptr
// advances by two columns and the next iteration's three columns are
// loaded as soon as the current contents of each register are consumed.
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"ld1 {v12.8b}, [x12]\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"ld1 {v16.8b}, [x13]\n"
"smlal v23.4s, v0.4h, v10.4h\n"
"ld1 {v20.8b}, [x14]\n"
"smlal2 v24.4s, v0.8h, v10.8h\n"
"subs w5, w5, #2\n"
"smlal v21.4s, v1.4h, v10.4h\n"
// Loop-continue test (w5 >= 3); consumed by the bge at the loop bottom.
"cmp w5, #3\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"add %[input_ptr], %[input_ptr], %[input_width_increment]\n"
"smlal v23.4s, v1.4h, v11.4h\n"
"mov x12, %[input_ptr]\n"
"smlal2 v24.4s, v1.8h, v11.8h\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"add x13, %[input_ptr], %[input_row_size]\n"
"smlal v23.4s, v2.4h, v12.4h\n"
"add x14, x13, %[input_row_size]\n"
"smlal2 v24.4s, v2.8h, v12.8h\n"
"smlal v21.4s, v3.4h, v13.4h\n"
"add x15, x14, %[input_row_size]\n"
"smlal2 v22.4s, v3.8h, v13.8h\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v3.4h, v14.4h\n"
"smlal2 v24.4s, v3.8h, v14.8h\n"
"smlal v21.4s, v4.4h, v14.4h\n"
"smlal2 v22.4s, v4.8h, v14.8h\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v4.4h, v15.4h\n"
"smlal2 v24.4s, v4.8h, v15.8h\n"
"smlal v21.4s, v5.4h, v15.4h\n"
"saddw v16.8h, v26.8h, v16.8b\n"
"smlal2 v22.4s, v5.8h, v15.8h\n"
"ld1 {v15.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v5.4h, v16.4h\n"
"smlal2 v24.4s, v5.8h, v16.8h\n"
"smlal v21.4s, v6.4h, v17.4h\n"
"smlal2 v22.4s, v6.8h, v17.8h\n"
"ld1 {v17.8b}, [x14], %[input_depth]\n"
"smlal v23.4s, v6.4h, v18.4h\n"
"smlal2 v24.4s, v6.8h, v18.8h\n"
"smlal v21.4s, v7.4h, v18.4h\n"
"smlal2 v22.4s, v7.8h, v18.8h\n"
"ld1 {v18.8b}, [x14], %[input_depth]\n"
"smlal v23.4s, v7.4h, v19.4h\n"
"smlal2 v24.4s, v7.8h, v19.8h\n"
"smlal v21.4s, v8.4h, v19.4h\n"
"saddw v20.8h, v26.8h, v20.8b\n"
"smlal2 v22.4s, v8.8h, v19.8h\n"
"ld1 {v19.8b}, [x14], %[input_depth]\n"
"smlal v23.4s, v8.4h, v20.4h\n"
"smlal2 v24.4s, v8.8h, v20.8h\n"
// Requantize, offset, narrow and clamp both outputs (same sequence as in
// the two-row path).
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v28.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v28.4s\n"
"sqrshl v21.4s, v21.4s, v30.4s\n"
"sqrshl v22.4s, v22.4s, v31.4s\n"
"sqrshl v23.4s, v23.4s, v30.4s\n"
"sqrshl v24.4s, v24.4s, v31.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtn v21.8b, v21.8h\n"
"sqxtn2 v21.16b, v23.8h\n"
"dup v24.16b, w0\n"
"ld1 {v22.4s}, [x10]\n"
"smax v21.16b, v21.16b, v25.16b\n"
"smin v21.16b, v21.16b, v24.16b\n"
"ld1 {v24.4s}, [x10]\n"
"saddw v9.8h, v26.8h, v9.8b\n"
// Both results go to the same output row, one column after the other.
"st1 {v21.8b}, [%[output_ptr]], x3\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [%[output_ptr]], x3\n"
"saddw v11.8h, v26.8h, v11.8b\n"
// NOTE(review): v12, v16 and v20 are reloaded and widened again before
// their next use (loop top or 2-column leftover), so the saddw results for
// these three registers below do not appear to be consumed.
"saddw v12.8h, v26.8h, v12.8b\n"
"saddw v13.8h, v26.8h, v13.8b\n"
"saddw v14.8h, v26.8h, v14.8b\n"
"saddw v15.8h, v26.8h, v15.8b\n"
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"saddw v16.8h, v26.8h, v16.8b\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
"saddw v17.8h, v26.8h, v17.8b\n"
"saddw v18.8h, v26.8h, v18.8b\n"
"saddw v19.8h, v26.8h, v19.8b\n"
"saddw v20.8h, v26.8h, v20.8b\n"
"bge " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "b\n"
// One or two columns remain at this point.
"cmp w5, #2\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n"
// Final two adjacent outputs of the single row.
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"ld1 {v12.8b}, [x12], %[input_depth]\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"ld1 {v16.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v0.4h, v10.4h\n"
"ld1 {v20.8b}, [x14], %[input_depth]\n"
"smlal2 v24.4s, v0.8h, v10.8h\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"smlal v23.4s, v1.4h, v11.4h\n"
"smlal2 v24.4s, v1.8h, v11.8h\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"smlal v23.4s, v2.4h, v12.4h\n"
"smlal2 v24.4s, v2.8h, v12.8h\n"
"smlal v21.4s, v3.4h, v13.4h\n"
"smlal2 v22.4s, v3.8h, v13.8h\n"
"smlal v23.4s, v3.4h, v14.4h\n"
"smlal2 v24.4s, v3.8h, v14.8h\n"
"smlal v21.4s, v4.4h, v14.4h\n"
"smlal2 v22.4s, v4.8h, v14.8h\n"
"smlal v23.4s, v4.4h, v15.4h\n"
"smlal2 v24.4s, v4.8h, v15.8h\n"
"smlal v21.4s, v5.4h, v15.4h\n"
"saddw v16.8h, v26.8h, v16.8b\n"
"smlal2 v22.4s, v5.8h, v15.8h\n"
"smlal v23.4s, v5.4h, v16.4h\n"
"smlal2 v24.4s, v5.8h, v16.8h\n"
"smlal v21.4s, v6.4h, v17.4h\n"
"smlal2 v22.4s, v6.8h, v17.8h\n"
"smlal v23.4s, v6.4h, v18.4h\n"
"smlal2 v24.4s, v6.8h, v18.8h\n"
"smlal v21.4s, v7.4h, v18.4h\n"
"smlal2 v22.4s, v7.8h, v18.8h\n"
"smlal v23.4s, v7.4h, v19.4h\n"
"smlal2 v24.4s, v7.8h, v19.8h\n"
"smlal v21.4s, v8.4h, v19.4h\n"
"saddw v20.8h, v26.8h, v20.8b\n"
"smlal2 v22.4s, v8.8h, v19.8h\n"
"smlal v23.4s, v8.4h, v20.4h\n"
"smlal2 v24.4s, v8.8h, v20.8h\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v28.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v28.4s\n"
"sqrshl v21.4s, v21.4s, v30.4s\n"
"sqrshl v22.4s, v22.4s, v31.4s\n"
"sqrshl v23.4s, v23.4s, v30.4s\n"
"sqrshl v24.4s, v24.4s, v31.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtn v21.8b, v21.8h\n"
"sqxtn2 v21.16b, v23.8h\n"
"dup v24.16b, w0\n"
"smax v21.16b, v21.16b, v25.16b\n"
"smin v21.16b, v21.16b, v24.16b\n"
"st1 {v21.8b}, [%[output_ptr]], x3\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [%[output_ptr]], x3\n"
"b " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
// Final single output of the single row: only v21/v22 are accumulated and
// only the low 8 bytes are clamped and stored.
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"smlal v21.4s, v3.4h, v13.4h\n"
"smlal2 v22.4s, v3.8h, v13.8h\n"
"smlal v21.4s, v4.4h, v14.4h\n"
"smlal2 v22.4s, v4.8h, v14.8h\n"
"smlal v21.4s, v5.4h, v15.4h\n"
"smlal2 v22.4s, v5.8h, v15.8h\n"
"smlal v21.4s, v6.4h, v17.4h\n"
"smlal2 v22.4s, v6.8h, v17.8h\n"
"smlal v21.4s, v7.4h, v18.4h\n"
"smlal2 v22.4s, v7.8h, v18.8h\n"
"smlal v21.4s, v8.4h, v19.4h\n"
"smlal2 v22.4s, v8.8h, v19.8h\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v28.4s\n"
"sqrshl v21.4s, v21.4s, v30.4s\n"
"sqrshl v22.4s, v22.4s, v31.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqxtn v21.8b, v21.8h\n"
"dup v24.16b, w0\n"
"smax v21.8b, v21.8b, v25.8b\n"
"smin v21.8b, v21.8b, v24.8b\n"
"st1 {v21.8b}, [%[output_ptr]]\n"
DEPTHWISECONV_LABEL_HEIGHT_1_END ":\n"
:
// Outputs: read-write operands that the asm advances or decrements.
[filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
[output_ptr] "+r"(output_ptr),
[output_window_height] "+r"(output_window_height)
:
// Inputs: read-only operands.
[output_multiplier_ptr] "r"(output_multiplier_ptr),
[output_shift_ptr] "r"(output_shift_ptr),
[bias_ptr] "r"(bias_ptr), [input_row_size] "r"(input_row_size),
[input_depth] "r"(input_depth),
[output_window_width] "r"(output_window_width),
[input_width_increment] "r"(input_width_increment),
[input_height_increment] "r"(input_height_increment),
[output_height_increment] "r"(output_height_increment),
[params_ptr] "r"(params_ptr)
:
// Clobbers: condition flags, memory (the output buffer is written), every
// vector register, and the scratch general-purpose registers named in the
// asm.
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
"v30", "v31",
"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
"x9", "x10", "x11", "x12", "x13", "x14", "x15");
// Release the label macros so that other kernels can redefine them.
#undef DEPTHWISECONV_LABEL_HEIGHT_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_1
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_1_END
}
};
// Specialization of DepthwiseConvWindowPerChannel for kUpward output rounding
// with template arguments 8, 2, 2.
// NOTE(review): the arguments presumably mean depth 8, stride width 2 and
// stride height 2 - confirm against the primary template, which is not
// visible in this part of the file. The data flow below is consistent with
// that reading: horizontally adjacent outputs read input columns 0-2 and 2-4,
// and vertically adjacent outputs read input rows 0-2 and 2-4.
//
// Run() is a hand-scheduled AArch64 NEON kernel that applies nine 8-channel
// filter taps (a 3x3 window) to int8 input and writes int8 output, using a
// separate multiplier and shift for each of the 8 channels.
template <>
struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
2> {
// Computes an output window of output_window_height x output_window_width
// pixels, 8 channels each.
//
// output_multiplier_ptr: 8 int32 multipliers, loaded once into v30/v31.
// output_shift_ptr:      8 int32 shifts, reloaded before every requantization.
// input_ptr:             int8 input at the top-left of the window.
// filter_ptr:            int8 filter; nine 8-byte taps are read, consecutive
//                        taps being output_depth bytes apart.
// bias_ptr:              8 int32 bias values, used to seed the accumulators.
// output_ptr:            int8 output; adjacent output pixels are output_depth
//                        bytes apart and rows are output_row_size bytes apart.
// input_depth:           byte step between horizontally adjacent input pixels.
// input_row_size:        byte step between input rows.
// output_window_height:  number of output rows to produce.
// output_window_width:   number of output columns to produce.
//                        NOTE(review): the width dispatch only special-cases
//                        1 and 2; any other value, including 0, enters the
//                        two-column loop. Callers presumably guarantee a
//                        width of at least 1 - confirm.
// params_ptr:            supplies input/output offsets, activation min/max,
//                        output depth and output row size.
static inline void Run(const int32* output_multiplier_ptr,
const int32* output_shift_ptr, const int8* input_ptr,
const int8* filter_ptr, const int32* bias_ptr,
int8* output_ptr, int64_t input_depth,
int64_t input_row_size, int32 output_window_height,
int32 output_window_width,
const DepthwiseConvParams* params_ptr) {
// Each two-column iteration advances the input by four pixels, and each
// two-row pass advances the input by four rows and the output by two rows.
const int64_t input_width_increment = 4 * input_depth;
const int64_t input_height_increment = 4 * input_row_size;
const int64_t output_height_increment = 2 * params_ptr->output_row_size;
// The asm never reads a filter offset, so it is only valid when it is zero.
TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
// Local labels used as branch targets inside the asm below.
#define DEPTHWISECONV_LABEL_HEIGHT_2_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_1 …
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_1_END …
// Register usage:
//   w0  input offset            w2  output offset
//   w3  activation min          w4  activation max
//   x5  output depth            x19 output row size
//   x6  output pointer, upper (or only) output row
//   x7  output pointer, lower output row
//   x10 bias_ptr + 16 bytes (second group of four bias values)
//   x11 input base of the current tile
//   x12, x13, x15 input row pointers
//   w14 remaining output columns in the current pass
//   v0-v8   the nine filter taps, widened to 16 bits
//   v9-v18  input pixels (v9-v23 in the single-row section)
//   v28.8h  input offset, v29.8h output offset; both are temporarily reused
//           during requantization and rebuilt afterwards
//   v30, v31 output multipliers
asm volatile(
// Setup: broadcast the offsets, load the multipliers and the nine filter
// taps. The flags set by this cmp are consumed by the blt after the filter
// loads; nothing in between modifies them.
"ldr w0, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
"cmp %w[output_window_height], #2\n"
"dup v28.8h, w0\n"
"ldr w2, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
"ldr w3, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
"dup v29.8h, w2\n"
"ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
"ldr x5, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
"ldr x19, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n"
"ld1 {v30.4s, v31.4s}, [%[output_multiplier_ptr]]\n"
"add x10, %[bias_ptr], #16\n"
// Each tap is 8 bytes; sshll by 0 sign-extends it to eight 16-bit lanes.
"ld1 {v0.8b}, [%[filter_ptr]], x5\n"
"ld1 {v1.8b}, [%[filter_ptr]], x5\n"
"sshll v0.8h, v0.8b, #0\n"
"ld1 {v2.8b}, [%[filter_ptr]], x5\n"
"sshll v1.8h, v1.8b, #0\n"
"ld1 {v3.8b}, [%[filter_ptr]], x5\n"
"sshll v2.8h, v2.8b, #0\n"
"ld1 {v4.8b}, [%[filter_ptr]], x5\n"
"sshll v3.8h, v3.8b, #0\n"
"ld1 {v5.8b}, [%[filter_ptr]], x5\n"
"sshll v4.8h, v4.8b, #0\n"
"ld1 {v6.8b}, [%[filter_ptr]], x5\n"
"sshll v5.8h, v5.8b, #0\n"
"ld1 {v7.8b}, [%[filter_ptr]], x5\n"
"sshll v6.8h, v6.8b, #0\n"
"ld1 {v8.8b}, [%[filter_ptr]]\n"
"sshll v7.8h, v7.8b, #0\n"
"sshll v8.8h, v8.8b, #0\n"
// Fewer than two output rows requested: skip the two-row passes.
"blt " DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "f\n"
// ---- Two output rows per pass ----
// Accumulators: v21/v22 upper-left, v23/v24 upper-right, v19/v20 lower-left,
// v25/v26 lower-right. Each pair holds channels 0-3 and 4-7 and starts from
// the bias values.
DEPTHWISECONV_LABEL_HEIGHT_2_LOOP ":\n"
// Set up the row pointers and preload the first three columns of input rows
// 0 and 1. x15 points at input row 2.
"mov x11, %[input_ptr]\n"
"mov x12, x11\n"
"add x13, x12, %[input_row_size]\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"mov w14, %w[output_window_width]\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"cmp w14, #2\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"add x15, x13, %[input_row_size]\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"mov x6, %[output_ptr]\n"
"ld1 {v15.8b}, [x13], %[input_depth]\n"
"add x7, %[output_ptr], x19\n"
"ld1 {v16.8b}, [x13], %[input_depth]\n"
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"ld1 {v22.4s}, [x10]\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
// saddw widens the int8 input to 16 bits and adds the input offset.
"saddw v9.8h, v28.8h, v9.8b\n"
"ld1 {v24.4s}, [x10]\n"
"saddw v10.8h, v28.8h, v10.8b\n"
"ld1 {v19.4s}, [%[bias_ptr]]\n"
"saddw v11.8h, v28.8h, v11.8b\n"
"ld1 {v20.4s}, [x10]\n"
"saddw v14.8h, v28.8h, v14.8b\n"
"ld1 {v25.4s}, [%[bias_ptr]]\n"
"saddw v15.8h, v28.8h, v15.8b\n"
"ld1 {v26.4s}, [x10]\n"
"saddw v16.8h, v28.8h, v16.8b\n"
// Width dispatch: exactly 2 or exactly 1 column go straight to the leftover
// code; anything else falls into the two-column loop.
"beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "f\n"
"cmp w14, #1\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n"
// Main loop: one 2x2 tile of outputs per iteration. Multiply-accumulates are
// interleaved with the loads for the rest of this tile and, in the second
// half, with the loads for the first columns of the next tile.
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"ld1 {v12.8b}, [x12], %[input_depth]\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"ld1 {v13.8b}, [x12]\n"
// x12 is repointed at input row 3.
"add x12, x15, %[input_row_size]\n"
"smlal v23.4s, v0.4h, v11.4h\n"
"ld1 {v17.8b}, [x13], %[input_depth]\n"
"smlal2 v24.4s, v0.8h, v11.8h\n"
"ld1 {v18.8b}, [x13]\n"
// x13 is repointed at input row 4.
"add x13, x12, %[input_row_size]\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"ld1 {v9.8b}, [x15], %[input_depth]\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"ld1 {v10.8b}, [x15], %[input_depth]\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"ld1 {v11.8b}, [x15], %[input_depth]\n"
"smlal v21.4s, v3.4h, v14.4h\n"
"smlal2 v22.4s, v3.8h, v14.8h\n"
"ld1 {v14.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v3.4h, v16.4h\n"
// Two columns consumed. The flags from the cmp below feed the bge at the
// bottom of the loop; no instruction in between modifies them.
"subs w14, w14, #2\n"
"smlal2 v24.4s, v3.8h, v16.8h\n"
"cmp w14, #3\n"
"smlal v21.4s, v4.4h, v15.4h\n"
"saddw v12.8h, v28.8h, v12.8b\n"
"smlal2 v22.4s, v4.8h, v15.8h\n"
"ld1 {v15.8b}, [x12], %[input_depth]\n"
"smlal v21.4s, v5.4h, v16.4h\n"
"saddw v13.8h, v28.8h, v13.8b\n"
"smlal2 v22.4s, v5.8h, v16.8h\n"
"ld1 {v16.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v1.4h, v12.4h\n"
"saddw v17.8h, v28.8h, v17.8b\n"
"smlal2 v24.4s, v1.8h, v12.8h\n"
"ld1 {v12.8b}, [x15], %[input_depth]\n"
"smlal v23.4s, v2.4h, v13.4h\n"
"saddw v18.8h, v28.8h, v18.8b\n"
"smlal2 v24.4s, v2.8h, v13.8h\n"
"ld1 {v13.8b}, [x15]\n"
"smlal v23.4s, v4.4h, v17.4h\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"smlal2 v24.4s, v4.8h, v17.8h\n"
"ld1 {v17.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v5.4h, v18.4h\n"
"saddw v10.8h, v28.8h, v10.8b\n"
"smlal2 v24.4s, v5.8h, v18.8h\n"
"ld1 {v18.8b}, [x12]\n"
// Input row 2 is the bottom filter row for the upper outputs and the top
// filter row for the lower outputs.
"smlal v21.4s, v6.4h, v9.4h\n"
"smlal2 v22.4s, v6.8h, v9.8h\n"
"smlal v19.4s, v0.4h, v9.4h\n"
"saddw v11.8h, v28.8h, v11.8b\n"
"smlal2 v20.4s, v0.8h, v9.8h\n"
"ld1 {v9.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v6.4h, v11.4h\n"
"smlal2 v24.4s, v6.8h, v11.8h\n"
"smlal v21.4s, v7.4h, v10.4h\n"
"smlal2 v22.4s, v7.8h, v10.8h\n"
"saddw v12.8h, v28.8h, v12.8b\n"
"smlal v19.4s, v1.4h, v10.4h\n"
"smlal2 v20.4s, v1.8h, v10.8h\n"
"ld1 {v10.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v7.4h, v12.4h\n"
"smlal2 v24.4s, v7.8h, v12.8h\n"
"smlal v25.4s, v1.4h, v12.4h\n"
"smlal2 v26.4s, v1.8h, v12.8h\n"
"smlal v21.4s, v8.4h, v11.4h\n"
"smlal2 v22.4s, v8.8h, v11.8h\n"
// Advance the tile base by four input pixels and start rebuilding the row
// pointers for the next tile.
"add x11, x11, %[input_width_increment]\n"
"smlal v19.4s, v2.4h, v11.4h\n"
"mov x12, x11\n"
"smlal2 v20.4s, v2.8h, v11.8h\n"
"saddw v13.8h, v28.8h, v13.8b\n"
"smlal v25.4s, v0.4h, v11.4h\n"
"smlal2 v26.4s, v0.8h, v11.8h\n"
"ld1 {v11.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v8.4h, v13.4h\n"
"ld1 {v12.8b}, [x13], %[input_depth]\n"
"smlal2 v24.4s, v8.8h, v13.8h\n"
"smlal v25.4s, v2.4h, v13.4h\n"
"smlal2 v26.4s, v2.8h, v13.8h\n"
"ld1 {v13.8b}, [x13]\n"
"add x13, x12, %[input_row_size]\n"
"add x15, x13, %[input_row_size]\n"
// Requantize the two upper outputs: multiply by the per-channel multiplier,
// apply the per-channel shift, narrow to 16 bits, add the output offset,
// narrow to 8 bits and clamp. v27/v28 temporarily hold the shifts, so the
// input offset in v28 is rebuilt from w0 right after.
"ld1 {v27.4s, v28.4s}, [%[output_shift_ptr]]\n"
"sqrdmulh v21.4s, v21.4s, v30.4s\n"
"sqrdmulh v22.4s, v22.4s, v31.4s\n"
"sqrdmulh v23.4s, v23.4s, v30.4s\n"
"sqrdmulh v24.4s, v24.4s, v31.4s\n"
"sqrshl v21.4s, v21.4s, v27.4s\n"
"sqrshl v22.4s, v22.4s, v28.4s\n"
"sqrshl v23.4s, v23.4s, v27.4s\n"
"sqrshl v24.4s, v24.4s, v28.4s\n"
"dup v28.8h, w0\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtn v21.8b, v21.8h\n"
"sqxtn2 v21.16b, v23.8h\n"
// v27/v29 temporarily hold activation min/max; the output offset in v29 is
// rebuilt from w2 after the clamp.
"dup v27.16b, w3\n"
"dup v29.16b, w4\n"
"ld1 {v22.4s}, [x10]\n"
"smax v21.16b, v21.16b, v27.16b\n"
"smin v21.16b, v21.16b, v29.16b\n"
"ld1 {v24.4s}, [x10]\n"
"dup v29.8h, w2\n"
"saddw v9.8h, v28.8h, v9.8b\n"
// Store the upper-left pixel, then move the upper 8 bytes (upper-right pixel)
// into a low half and store them one output pixel further on.
"st1 {v21.8b}, [x6], x5\n"
"saddw v10.8h, v28.8h, v10.8b\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [x6], x5\n"
"saddw v11.8h, v28.8h, v11.8b\n"
// Finish the two lower outputs with input rows 4 and 3 while preloading the
// first three columns of rows 0 and 1 of the next tile.
"smlal v19.4s, v6.4h, v9.4h\n"
"smlal2 v20.4s, v6.8h, v9.8h\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"smlal v25.4s, v6.4h, v11.4h\n"
"smlal2 v26.4s, v6.8h, v11.8h\n"
"smlal v19.4s, v7.4h, v10.4h\n"
"saddw v12.8h, v28.8h, v12.8b\n"
"smlal2 v20.4s, v7.8h, v10.8h\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"smlal v25.4s, v7.4h, v12.4h\n"
"smlal2 v26.4s, v7.8h, v12.8h\n"
"smlal v19.4s, v8.4h, v11.4h\n"
"saddw v13.8h, v28.8h, v13.8b\n"
"smlal2 v20.4s, v8.8h, v11.8h\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"smlal v25.4s, v8.4h, v13.4h\n"
"saddw v14.8h, v28.8h, v14.8b\n"
"smlal2 v26.4s, v8.8h, v13.8h\n"
"saddw v16.8h, v28.8h, v16.8b\n"
"smlal v19.4s, v3.4h, v14.4h\n"
"saddw v15.8h, v28.8h, v15.8b\n"
"smlal2 v20.4s, v3.8h, v14.8h\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"smlal v25.4s, v3.4h, v16.4h\n"
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"smlal2 v26.4s, v3.8h, v16.8h\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
"smlal v19.4s, v4.4h, v15.4h\n"
"saddw v17.8h, v28.8h, v17.8b\n"
"smlal2 v20.4s, v4.8h, v15.8h\n"
"ld1 {v15.8b}, [x13], %[input_depth]\n"
"smlal v25.4s, v4.4h, v17.4h\n"
"smlal2 v26.4s, v4.8h, v17.8h\n"
"smlal v19.4s, v5.4h, v16.4h\n"
"saddw v18.8h, v28.8h, v18.8b\n"
"smlal2 v20.4s, v5.8h, v16.8h\n"
"ld1 {v16.8b}, [x13], %[input_depth]\n"
"smlal v25.4s, v5.4h, v18.4h\n"
"smlal2 v26.4s, v5.8h, v18.8h\n"
// Requantize, clamp and store the two lower outputs, using the same
// temporary reuse of v27/v28/v29 as for the upper outputs.
"ld1 {v27.4s, v28.4s}, [%[output_shift_ptr]]\n"
"sqrdmulh v19.4s, v19.4s, v30.4s\n"
"sqrdmulh v20.4s, v20.4s, v31.4s\n"
"sqrdmulh v25.4s, v25.4s, v30.4s\n"
"sqrdmulh v26.4s, v26.4s, v31.4s\n"
"sqrshl v19.4s, v19.4s, v27.4s\n"
"sqrshl v20.4s, v20.4s, v28.4s\n"
"sqrshl v25.4s, v25.4s, v27.4s\n"
"sqrshl v26.4s, v26.4s, v28.4s\n"
"dup v28.8h, w0\n"
"sqxtn v19.4h, v19.4s\n"
"sqxtn2 v19.8h, v20.4s\n"
"sqxtn v25.4h, v25.4s\n"
"sqxtn2 v25.8h, v26.4s\n"
"sqadd v19.8h, v19.8h, v29.8h\n"
"sqadd v25.8h, v25.8h, v29.8h\n"
"sqxtn v19.8b, v19.8h\n"
"sqxtn2 v19.16b, v25.8h\n"
"dup v27.16b, w3\n"
"dup v29.16b, w4\n"
"ld1 {v20.4s}, [x10]\n"
"smax v19.16b, v19.16b, v27.16b\n"
"smin v19.16b, v19.16b, v29.16b\n"
"ld1 {v26.4s}, [x10]\n"
"dup v29.8h, w2\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"st1 {v19.8b}, [x7], x5\n"
"saddw v10.8h, v28.8h, v10.8b\n"
"mov v25.d[0], v19.d[1]\n"
"st1 {v25.8b}, [x7], x5\n"
"saddw v11.8h, v28.8h, v11.8b\n"
"ld1 {v19.4s}, [%[bias_ptr]]\n"
"saddw v14.8h, v28.8h, v14.8b\n"
"ld1 {v25.4s}, [%[bias_ptr]]\n"
"saddw v15.8h, v28.8h, v15.8b\n"
"saddw v16.8h, v28.8h, v16.8b\n"
// Continue while at least 3 columns remain (flags from the cmp against 3
// above); otherwise exactly 2 or 1 columns are left.
"bge " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "b\n"
"cmp w14, #2\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n"
// Final 2x2 tile of the pass: same arithmetic as the loop body, but without
// loading data for a following tile and without advancing x11.
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"ld1 {v12.8b}, [x12], %[input_depth]\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"ld1 {v13.8b}, [x12]\n"
"add x12, x15, %[input_row_size]\n"
"smlal v23.4s, v0.4h, v11.4h\n"
"ld1 {v17.8b}, [x13], %[input_depth]\n"
"smlal2 v24.4s, v0.8h, v11.8h\n"
"ld1 {v18.8b}, [x13]\n"
"add x13, x12, %[input_row_size]\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"ld1 {v9.8b}, [x15], %[input_depth]\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"ld1 {v10.8b}, [x15], %[input_depth]\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"ld1 {v11.8b}, [x15], %[input_depth]\n"
"smlal v21.4s, v3.4h, v14.4h\n"
"smlal2 v22.4s, v3.8h, v14.8h\n"
"ld1 {v14.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v3.4h, v16.4h\n"
"smlal2 v24.4s, v3.8h, v16.8h\n"
"smlal v21.4s, v4.4h, v15.4h\n"
"saddw v12.8h, v28.8h, v12.8b\n"
"smlal2 v22.4s, v4.8h, v15.8h\n"
"ld1 {v15.8b}, [x12], %[input_depth]\n"
"smlal v21.4s, v5.4h, v16.4h\n"
"saddw v13.8h, v28.8h, v13.8b\n"
"smlal2 v22.4s, v5.8h, v16.8h\n"
"ld1 {v16.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v1.4h, v12.4h\n"
"saddw v17.8h, v28.8h, v17.8b\n"
"smlal2 v24.4s, v1.8h, v12.8h\n"
"ld1 {v12.8b}, [x15], %[input_depth]\n"
"smlal v23.4s, v2.4h, v13.4h\n"
"saddw v18.8h, v28.8h, v18.8b\n"
"smlal2 v24.4s, v2.8h, v13.8h\n"
"ld1 {v13.8b}, [x15]\n"
"smlal v23.4s, v4.4h, v17.4h\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"smlal2 v24.4s, v4.8h, v17.8h\n"
"ld1 {v17.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v5.4h, v18.4h\n"
"saddw v10.8h, v28.8h, v10.8b\n"
"smlal2 v24.4s, v5.8h, v18.8h\n"
"ld1 {v18.8b}, [x12]\n"
"smlal v21.4s, v6.4h, v9.4h\n"
"smlal2 v22.4s, v6.8h, v9.8h\n"
"smlal v19.4s, v0.4h, v9.4h\n"
"saddw v11.8h, v28.8h, v11.8b\n"
"smlal2 v20.4s, v0.8h, v9.8h\n"
"ld1 {v9.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v6.4h, v11.4h\n"
"smlal2 v24.4s, v6.8h, v11.8h\n"
"smlal v21.4s, v7.4h, v10.4h\n"
"smlal2 v22.4s, v7.8h, v10.8h\n"
"saddw v12.8h, v28.8h, v12.8b\n"
"smlal v19.4s, v1.4h, v10.4h\n"
"smlal2 v20.4s, v1.8h, v10.8h\n"
"ld1 {v10.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v7.4h, v12.4h\n"
"smlal2 v24.4s, v7.8h, v12.8h\n"
"smlal v25.4s, v1.4h, v12.4h\n"
"smlal2 v26.4s, v1.8h, v12.8h\n"
"smlal v21.4s, v8.4h, v11.4h\n"
"smlal2 v22.4s, v8.8h, v11.8h\n"
"smlal v19.4s, v2.4h, v11.4h\n"
"smlal2 v20.4s, v2.8h, v11.8h\n"
"saddw v13.8h, v28.8h, v13.8b\n"
"smlal v25.4s, v0.4h, v11.4h\n"
"smlal2 v26.4s, v0.8h, v11.8h\n"
"ld1 {v11.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v8.4h, v13.4h\n"
"ld1 {v12.8b}, [x13], %[input_depth]\n"
"smlal2 v24.4s, v8.8h, v13.8h\n"
"smlal v25.4s, v2.4h, v13.4h\n"
"smlal2 v26.4s, v2.8h, v13.8h\n"
"ld1 {v13.8b}, [x13]\n"
// Requantize, clamp and store the two upper outputs.
"ld1 {v27.4s, v28.4s}, [%[output_shift_ptr]]\n"
"sqrdmulh v21.4s, v21.4s, v30.4s\n"
"sqrdmulh v22.4s, v22.4s, v31.4s\n"
"sqrdmulh v23.4s, v23.4s, v30.4s\n"
"sqrdmulh v24.4s, v24.4s, v31.4s\n"
"sqrshl v21.4s, v21.4s, v27.4s\n"
"sqrshl v22.4s, v22.4s, v28.4s\n"
"sqrshl v23.4s, v23.4s, v27.4s\n"
"sqrshl v24.4s, v24.4s, v28.4s\n"
"dup v28.8h, w0\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtn v21.8b, v21.8h\n"
"sqxtn2 v21.16b, v23.8h\n"
"dup v27.16b, w3\n"
"dup v29.16b, w4\n"
"ld1 {v22.4s}, [x10]\n"
"smax v21.16b, v21.16b, v27.16b\n"
"smin v21.16b, v21.16b, v29.16b\n"
"ld1 {v24.4s}, [x10]\n"
"dup v29.8h, w2\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"st1 {v21.8b}, [x6], x5\n"
"saddw v10.8h, v28.8h, v10.8b\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [x6]\n"
"saddw v11.8h, v28.8h, v11.8b\n"
// Finish the two lower outputs with input rows 4 and 3.
"smlal v19.4s, v6.4h, v9.4h\n"
"smlal2 v20.4s, v6.8h, v9.8h\n"
"smlal v25.4s, v6.4h, v11.4h\n"
"smlal2 v26.4s, v6.8h, v11.8h\n"
"smlal v19.4s, v7.4h, v10.4h\n"
"saddw v12.8h, v28.8h, v12.8b\n"
"smlal2 v20.4s, v7.8h, v10.8h\n"
"smlal v25.4s, v7.4h, v12.4h\n"
"smlal2 v26.4s, v7.8h, v12.8h\n"
"smlal v19.4s, v8.4h, v11.4h\n"
"saddw v13.8h, v28.8h, v13.8b\n"
"smlal2 v20.4s, v8.8h, v11.8h\n"
"smlal v25.4s, v8.4h, v13.4h\n"
"saddw v14.8h, v28.8h, v14.8b\n"
"smlal2 v26.4s, v8.8h, v13.8h\n"
"saddw v16.8h, v28.8h, v16.8b\n"
"smlal v19.4s, v3.4h, v14.4h\n"
"saddw v15.8h, v28.8h, v15.8b\n"
"smlal2 v20.4s, v3.8h, v14.8h\n"
"smlal v25.4s, v3.4h, v16.4h\n"
"smlal2 v26.4s, v3.8h, v16.8h\n"
"smlal v19.4s, v4.4h, v15.4h\n"
"saddw v17.8h, v28.8h, v17.8b\n"
"smlal2 v20.4s, v4.8h, v15.8h\n"
"smlal v25.4s, v4.4h, v17.4h\n"
"smlal2 v26.4s, v4.8h, v17.8h\n"
"smlal v19.4s, v5.4h, v16.4h\n"
"saddw v18.8h, v28.8h, v18.8b\n"
"smlal2 v20.4s, v5.8h, v16.8h\n"
"smlal v25.4s, v5.4h, v18.4h\n"
"smlal2 v26.4s, v5.8h, v18.8h\n"
// Requantize, clamp and store the two lower outputs, then restore the input
// offset in v28 and the output offset in v29 for the following code.
"ld1 {v27.4s, v28.4s}, [%[output_shift_ptr]]\n"
"sqrdmulh v19.4s, v19.4s, v30.4s\n"
"sqrdmulh v20.4s, v20.4s, v31.4s\n"
"sqrdmulh v25.4s, v25.4s, v30.4s\n"
"sqrdmulh v26.4s, v26.4s, v31.4s\n"
"sqrshl v19.4s, v19.4s, v27.4s\n"
"sqrshl v20.4s, v20.4s, v28.4s\n"
"sqrshl v25.4s, v25.4s, v27.4s\n"
"sqrshl v26.4s, v26.4s, v28.4s\n"
"dup v28.8h, w0\n"
"sqxtn v19.4h, v19.4s\n"
"sqxtn2 v19.8h, v20.4s\n"
"sqxtn v25.4h, v25.4s\n"
"sqxtn2 v25.8h, v26.4s\n"
"sqadd v19.8h, v19.8h, v29.8h\n"
"sqadd v25.8h, v25.8h, v29.8h\n"
"dup v27.16b, w3\n"
"dup v29.16b, w4\n"
"sqxtn v19.8b, v19.8h\n"
"sqxtn2 v19.16b, v25.8h\n"
"smax v19.16b, v19.16b, v27.16b\n"
"smin v19.16b, v19.16b, v29.16b\n"
"st1 {v19.8b}, [x7], x5\n"
"dup v29.8h, w2\n"
"mov v25.d[0], v19.d[1]\n"
"st1 {v25.8b}, [x7]\n"
"b " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "f\n"
// Final single column of a two-row pass: the upper output accumulates in
// v21/v22 and the lower output in v23/v24. Only three input columns of rows
// 2, 3 and 4 are loaded.
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER ":\n"
"add x12, x15, %[input_row_size]\n"
"add x13, x12, %[input_row_size]\n"
"ld1 {v12.8b}, [x15], %[input_depth]\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"ld1 {v13.8b}, [x15], %[input_depth]\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"ld1 {v17.8b}, [x15]\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"ld1 {v11.8b}, [x12]\n"
"smlal v21.4s, v3.4h, v14.4h\n"
"smlal2 v22.4s, v3.8h, v14.8h\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"smlal v21.4s, v4.4h, v15.4h\n"
"smlal2 v22.4s, v4.8h, v15.8h\n"
"ld1 {v15.8b}, [x13], %[input_depth]\n"
"smlal v21.4s, v5.4h, v16.4h\n"
"saddw v12.8h, v28.8h, v12.8b\n"
"smlal2 v22.4s, v5.8h, v16.8h\n"
"saddw v13.8h, v28.8h, v13.8b\n"
"ld1 {v16.8b}, [x13]\n"
"smlal v21.4s, v6.4h, v12.4h\n"
"smlal2 v22.4s, v6.8h, v12.8h\n"
"smlal v23.4s, v0.4h, v12.4h\n"
"saddw v17.8h, v28.8h, v17.8b\n"
"smlal2 v24.4s, v0.8h, v12.8h\n"
"smlal v21.4s, v7.4h, v13.4h\n"
"smlal2 v22.4s, v7.8h, v13.8h\n"
"smlal v23.4s, v1.4h, v13.4h\n"
"smlal2 v24.4s, v1.8h, v13.8h\n"
"smlal v21.4s, v8.4h, v17.4h\n"
"smlal2 v22.4s, v8.8h, v17.8h\n"
"smlal v23.4s, v2.4h, v17.4h\n"
"smlal2 v24.4s, v2.8h, v17.8h\n"
// Requantize the upper output. This path uses v26/v27 for the shifts and
// for activation min/max, so v28 and v29 keep the input and output offsets.
"ld1 {v26.4s, v27.4s}, [%[output_shift_ptr]]\n"
"sqrdmulh v21.4s, v21.4s, v30.4s\n"
"sqrdmulh v22.4s, v22.4s, v31.4s\n"
"sqrshl v21.4s, v21.4s, v26.4s\n"
"sqrshl v22.4s, v22.4s, v27.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"dup v26.16b, w3\n"
"dup v27.16b, w4\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqxtn v21.8b, v21.8h\n"
"smax v21.8b, v21.8b, v26.8b\n"
"smin v21.8b, v21.8b, v27.8b\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"st1 {v21.8b}, [x6]\n"
"saddw v10.8h, v28.8h, v10.8b\n"
"smlal v23.4s, v3.4h, v9.4h\n"
"saddw v11.8h, v28.8h, v11.8b\n"
"smlal2 v24.4s, v3.8h, v9.8h\n"
"saddw v14.8h, v28.8h, v14.8b\n"
"smlal v23.4s, v4.4h, v10.4h\n"
"saddw v15.8h, v28.8h, v15.8b\n"
"smlal2 v24.4s, v4.8h, v10.8h\n"
"saddw v16.8h, v28.8h, v16.8b\n"
"smlal v23.4s, v5.4h, v11.4h\n"
"smlal2 v24.4s, v5.8h, v11.8h\n"
"smlal v23.4s, v6.4h, v14.4h\n"
"smlal2 v24.4s, v6.8h, v14.8h\n"
"smlal v23.4s, v7.4h, v15.4h\n"
"smlal2 v24.4s, v7.8h, v15.8h\n"
"smlal v23.4s, v8.4h, v16.4h\n"
"smlal2 v24.4s, v8.8h, v16.8h\n"
// Requantize, clamp and store the lower output.
"ld1 {v26.4s, v27.4s}, [%[output_shift_ptr]]\n"
"sqrdmulh v23.4s, v23.4s, v30.4s\n"
"sqrdmulh v24.4s, v24.4s, v31.4s\n"
"sqrshl v23.4s, v23.4s, v26.4s\n"
"sqrshl v24.4s, v24.4s, v27.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"dup v26.16b, w3\n"
"dup v27.16b, w4\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtn v23.8b, v23.8h\n"
"smax v23.8b, v23.8b, v26.8b\n"
"smin v23.8b, v23.8b, v27.8b\n"
"st1 {v23.8b}, [x7]\n"
// End of a two-row pass: two fewer rows remain, the input moves down four
// rows and the output two rows. Repeat while at least two rows remain.
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP ":\n"
"subs %w[output_window_height], %w[output_window_height], #2\n"
"add %[input_ptr], %[input_ptr], %[input_height_increment]\n"
"cmp %w[output_window_height], #2\n"
"add %[output_ptr], %[output_ptr], %[output_height_increment]\n"
"bge " DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "b\n"
// Nothing left to do unless exactly one output row remains.
DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP ":\n"
"cmp %w[output_window_height], #1\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
// ---- Single remaining output row ----
// Only three input rows are read (x12, x13, x15). Two outputs per iteration:
// the left one accumulates in v24/v25 and the right one in v26/v27.
DEPTHWISECONV_LABEL_HEIGHT_1 ":\n"
"mov x11, %[input_ptr]\n"
"mov x12, x11\n"
"add x13, x12, %[input_row_size]\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"add x15, x13, %[input_row_size]\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"mov x6, %[output_ptr]\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"mov w14, %w[output_window_width]\n"
"cmp w14, #2\n"
"ld1 {v12.8b}, [x13], %[input_depth]\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"ld1 {v15.8b}, [x15], %[input_depth]\n"
"ld1 {v16.8b}, [x15], %[input_depth]\n"
"ld1 {v17.8b}, [x15], %[input_depth]\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"ld1 {v24.4s}, [%[bias_ptr]]\n"
"saddw v10.8h, v28.8h, v10.8b\n"
"ld1 {v25.4s}, [x10]\n"
"saddw v11.8h, v28.8h, v11.8b\n"
"ld1 {v26.4s}, [%[bias_ptr]]\n"
"ld1 {v27.4s}, [x10]\n"
"saddw v12.8h, v28.8h, v12.8b\n"
"saddw v13.8h, v28.8h, v13.8b\n"
"saddw v14.8h, v28.8h, v14.8b\n"
"saddw v15.8h, v28.8h, v15.8b\n"
"saddw v16.8h, v28.8h, v16.8b\n"
"saddw v17.8h, v28.8h, v17.8b\n"
// Same width dispatch as in the two-row section.
"beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER "f\n"
"cmp w14, #1\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n"
// Main loop for the single row: columns 3 and 4 of each input row go into
// v18-v23 while v9-v17 are reloaded with columns 0-2 of the next tile.
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP ":\n"
"smlal v24.4s, v0.4h, v9.4h\n"
"ld1 {v18.8b}, [x12], %[input_depth]\n"
"smlal2 v25.4s, v0.8h, v9.8h\n"
"ld1 {v19.8b}, [x12]\n"
"smlal v26.4s, v0.4h, v11.4h\n"
"ld1 {v20.8b}, [x13], %[input_depth]\n"
"smlal2 v27.4s, v0.8h, v11.8h\n"
"ld1 {v21.8b}, [x13]\n"
"smlal v24.4s, v1.4h, v10.4h\n"
"ld1 {v22.8b}, [x15], %[input_depth]\n"
"smlal2 v25.4s, v1.8h, v10.8h\n"
"ld1 {v23.8b}, [x15]\n"
"smlal v24.4s, v2.4h, v11.4h\n"
// Flags from the cmp against 3 are consumed by the bge at the loop bottom.
"subs w14, w14, #2\n"
"smlal2 v25.4s, v2.8h, v11.8h\n"
"cmp w14, #3\n"
"smlal v24.4s, v3.4h, v12.4h\n"
"add x11, x11, %[input_width_increment]\n"
"smlal2 v25.4s, v3.8h, v12.8h\n"
"mov x12, x11\n"
"smlal v26.4s, v3.4h, v14.4h\n"
"add x13, x12, %[input_row_size]\n"
"smlal2 v27.4s, v3.8h, v14.8h\n"
"add x15, x13, %[input_row_size]\n"
"smlal v24.4s, v4.4h, v13.4h\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"smlal2 v25.4s, v4.8h, v13.8h\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"smlal v24.4s, v5.4h, v14.4h\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"smlal2 v25.4s, v5.8h, v14.8h\n"
"ld1 {v12.8b}, [x13], %[input_depth]\n"
"smlal v24.4s, v6.4h, v15.4h\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"smlal2 v25.4s, v6.8h, v15.8h\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"smlal v26.4s, v6.4h, v17.4h\n"
"ld1 {v15.8b}, [x15], %[input_depth]\n"
"smlal2 v27.4s, v6.8h, v17.8h\n"
"smlal v24.4s, v7.4h, v16.4h\n"
"smlal2 v25.4s, v7.8h, v16.8h\n"
"ld1 {v16.8b}, [x15], %[input_depth]\n"
"smlal v24.4s, v8.4h, v17.4h\n"
"saddw v18.8h, v28.8h, v18.8b\n"
"smlal2 v25.4s, v8.8h, v17.8h\n"
"ld1 {v17.8b}, [x15], %[input_depth]\n"
"saddw v19.8h, v28.8h, v19.8b\n"
"smlal v26.4s, v1.4h, v18.4h\n"
"saddw v20.8h, v28.8h, v20.8b\n"
"smlal2 v27.4s, v1.8h, v18.8h\n"
"smlal v26.4s, v2.4h, v19.4h\n"
"saddw v21.8h, v28.8h, v21.8b\n"
"smlal2 v27.4s, v2.8h, v19.8h\n"
"smlal v26.4s, v4.4h, v20.4h\n"
"smlal v26.4s, v5.4h, v21.4h\n"
"smlal2 v27.4s, v4.8h, v20.8h\n"
"saddw v22.8h, v28.8h, v22.8b\n"
"smlal2 v27.4s, v5.8h, v21.8h\n"
"saddw v23.8h, v28.8h, v23.8b\n"
"smlal v26.4s, v7.4h, v22.4h\n"
"smlal2 v27.4s, v7.8h, v22.8h\n"
"smlal v26.4s, v8.4h, v23.4h\n"
"smlal2 v27.4s, v8.8h, v23.8h\n"
// Requantize both outputs. In this section v28/v29 hold the shifts, then
// v28 holds the output offset, and finally v28 is rebuilt as the input
// offset. v29 ends up holding the activation max and is not restored here.
"ld1 {v28.4s, v29.4s}, [%[output_shift_ptr]]\n"
"sqrdmulh v24.4s, v24.4s, v30.4s\n"
"sqrdmulh v25.4s, v25.4s, v31.4s\n"
"sqrdmulh v26.4s, v26.4s, v30.4s\n"
"sqrdmulh v27.4s, v27.4s, v31.4s\n"
"sqrshl v24.4s, v24.4s, v28.4s\n"
"sqrshl v25.4s, v25.4s, v29.4s\n"
"sqrshl v26.4s, v26.4s, v28.4s\n"
"sqrshl v27.4s, v27.4s, v29.4s\n"
"dup v28.8h, w2\n"
"sqxtn v24.4h, v24.4s\n"
"sqxtn2 v24.8h, v25.4s\n"
"sqxtn v26.4h, v26.4s\n"
"sqxtn2 v26.8h, v27.4s\n"
"sqadd v24.8h, v24.8h, v28.8h\n"
"sqadd v26.8h, v26.8h, v28.8h\n"
"sqxtn v24.8b, v24.8h\n"
"sqxtn2 v24.16b, v26.8h\n"
"dup v28.8h, w0\n"
"dup v27.16b, w3\n"
"dup v29.16b, w4\n"
"ld1 {v25.4s}, [x10]\n"
"smax v24.16b, v24.16b, v27.16b\n"
"smin v24.16b, v24.16b, v29.16b\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"st1 {v24.8b}, [x6], x5\n"
"ld1 {v27.4s}, [x10]\n"
"saddw v10.8h, v28.8h, v10.8b\n"
"mov v26.d[0], v24.d[1]\n"
"st1 {v26.8b}, [x6], x5\n"
"saddw v11.8h, v28.8h, v11.8b\n"
"saddw v12.8h, v28.8h, v12.8b\n"
"saddw v13.8h, v28.8h, v13.8b\n"
"saddw v14.8h, v28.8h, v14.8b\n"
"ld1 {v24.4s}, [%[bias_ptr]]\n"
"saddw v15.8h, v28.8h, v15.8b\n"
"ld1 {v26.4s}, [%[bias_ptr]]\n"
"saddw v16.8h, v28.8h, v16.8b\n"
"saddw v17.8h, v28.8h, v17.8b\n"
"bge " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "b\n"
"cmp w14, #2\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n"
// Final two columns of the single row: same arithmetic as the loop body
// without preloading a following tile.
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER ":\n"
"smlal v24.4s, v0.4h, v9.4h\n"
"ld1 {v18.8b}, [x12], %[input_depth]\n"
"smlal2 v25.4s, v0.8h, v9.8h\n"
"ld1 {v19.8b}, [x12]\n"
"smlal v26.4s, v0.4h, v11.4h\n"
"ld1 {v20.8b}, [x13], %[input_depth]\n"
"smlal2 v27.4s, v0.8h, v11.8h\n"
"ld1 {v21.8b}, [x13]\n"
"smlal v24.4s, v1.4h, v10.4h\n"
"ld1 {v22.8b}, [x15], %[input_depth]\n"
"smlal2 v25.4s, v1.8h, v10.8h\n"
"ld1 {v23.8b}, [x15]\n"
"smlal v24.4s, v2.4h, v11.4h\n"
"smlal2 v25.4s, v2.8h, v11.8h\n"
"smlal v24.4s, v3.4h, v12.4h\n"
"smlal2 v25.4s, v3.8h, v12.8h\n"
"smlal v26.4s, v3.4h, v14.4h\n"
"smlal2 v27.4s, v3.8h, v14.8h\n"
"smlal v24.4s, v4.4h, v13.4h\n"
"smlal2 v25.4s, v4.8h, v13.8h\n"
"smlal v24.4s, v5.4h, v14.4h\n"
"smlal2 v25.4s, v5.8h, v14.8h\n"
"smlal v24.4s, v6.4h, v15.4h\n"
"smlal2 v25.4s, v6.8h, v15.8h\n"
"smlal v26.4s, v6.4h, v17.4h\n"
"smlal2 v27.4s, v6.8h, v17.8h\n"
"smlal v24.4s, v7.4h, v16.4h\n"
"smlal2 v25.4s, v7.8h, v16.8h\n"
"smlal v24.4s, v8.4h, v17.4h\n"
"saddw v18.8h, v28.8h, v18.8b\n"
"smlal2 v25.4s, v8.8h, v17.8h\n"
"saddw v19.8h, v28.8h, v19.8b\n"
"smlal v26.4s, v1.4h, v18.4h\n"
"saddw v20.8h, v28.8h, v20.8b\n"
"smlal2 v27.4s, v1.8h, v18.8h\n"
"smlal v26.4s, v2.4h, v19.4h\n"
"saddw v21.8h, v28.8h, v21.8b\n"
"smlal2 v27.4s, v2.8h, v19.8h\n"
"smlal v26.4s, v4.4h, v20.4h\n"
"smlal v26.4s, v5.4h, v21.4h\n"
"smlal2 v27.4s, v4.8h, v20.8h\n"
"saddw v22.8h, v28.8h, v22.8b\n"
"smlal2 v27.4s, v5.8h, v21.8h\n"
"saddw v23.8h, v28.8h, v23.8b\n"
"smlal v26.4s, v7.4h, v22.4h\n"
"smlal2 v27.4s, v7.8h, v22.8h\n"
"smlal v26.4s, v8.4h, v23.4h\n"
"smlal2 v27.4s, v8.8h, v23.8h\n"
// Requantize, clamp and store. Here v28/v29 hold the shifts, then v28 the
// output offset, then v28/v29 the activation min/max.
"ld1 {v28.4s, v29.4s}, [%[output_shift_ptr]]\n"
"sqrdmulh v24.4s, v24.4s, v30.4s\n"
"sqrdmulh v25.4s, v25.4s, v31.4s\n"
"sqrdmulh v26.4s, v26.4s, v30.4s\n"
"sqrdmulh v27.4s, v27.4s, v31.4s\n"
"sqrshl v24.4s, v24.4s, v28.4s\n"
"sqrshl v25.4s, v25.4s, v29.4s\n"
"sqrshl v26.4s, v26.4s, v28.4s\n"
"sqrshl v27.4s, v27.4s, v29.4s\n"
"dup v28.8h, w2\n"
"sqxtn v24.4h, v24.4s\n"
"sqxtn2 v24.8h, v25.4s\n"
"sqxtn v26.4h, v26.4s\n"
"sqxtn2 v26.8h, v27.4s\n"
"sqadd v24.8h, v24.8h, v28.8h\n"
"sqadd v26.8h, v26.8h, v28.8h\n"
"sqxtn v24.8b, v24.8h\n"
"dup v28.16b, w3\n"
"dup v29.16b, w4\n"
"sqxtn2 v24.16b, v26.8h\n"
"smax v24.16b, v24.16b, v28.16b\n"
"smin v24.16b, v24.16b, v29.16b\n"
"st1 {v24.8b}, [x6], x5\n"
"mov v26.d[0], v24.d[1]\n"
"st1 {v26.8b}, [x6]\n"
"dup v28.8h, w0\n"
"b " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
// Final single column of the single row. v29 may hold the activation max
// when arriving from the loop above, so the output offset is rebuilt first.
// All nine input vectors are already loaded and offset-adjusted.
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER ":\n"
"dup v29.8h, w2\n"
"smlal v24.4s, v0.4h, v9.4h\n"
"smlal2 v25.4s, v0.8h, v9.8h\n"
"smlal v24.4s, v1.4h, v10.4h\n"
"smlal2 v25.4s, v1.8h, v10.8h\n"
"smlal v24.4s, v2.4h, v11.4h\n"
"smlal2 v25.4s, v2.8h, v11.8h\n"
"smlal v24.4s, v3.4h, v12.4h\n"
"smlal2 v25.4s, v3.8h, v12.8h\n"
"smlal v24.4s, v4.4h, v13.4h\n"
"smlal2 v25.4s, v4.8h, v13.8h\n"
"smlal v24.4s, v5.4h, v14.4h\n"
"smlal2 v25.4s, v5.8h, v14.8h\n"
"smlal v24.4s, v6.4h, v15.4h\n"
"smlal2 v25.4s, v6.8h, v15.8h\n"
"smlal v24.4s, v7.4h, v16.4h\n"
"smlal2 v25.4s, v7.8h, v16.8h\n"
"smlal v24.4s, v8.4h, v17.4h\n"
"smlal2 v25.4s, v8.8h, v17.8h\n"
"ld1 {v26.4s, v27.4s}, [%[output_shift_ptr]]\n"
"sqrdmulh v24.4s, v24.4s, v30.4s\n"
"sqrdmulh v25.4s, v25.4s, v31.4s\n"
"sqrshl v24.4s, v24.4s, v26.4s\n"
"sqrshl v25.4s, v25.4s, v27.4s\n"
"sqxtn v24.4h, v24.4s\n"
"sqxtn2 v24.8h, v25.4s\n"
"dup v26.16b, w3\n"
"dup v27.16b, w4\n"
"sqadd v24.8h, v24.8h, v29.8h\n"
"sqxtn v24.8b, v24.8h\n"
"smax v24.8b, v24.8b, v26.8b\n"
"smin v24.8b, v24.8b, v27.8b\n"
"st1 {v24.8b}, [x6]\n"
DEPTHWISECONV_LABEL_HEIGHT_1_END ":\n"
:
// Read-write operands: these are modified by the asm. They are by-value
// parameters of Run(), so the changes are local to this call.
[filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
[output_ptr] "+r"(output_ptr),
[output_window_height] "+r"(output_window_height)
:
// Read-only operands.
[output_multiplier_ptr] "r"(output_multiplier_ptr),
[output_shift_ptr] "r"(output_shift_ptr),
[bias_ptr] "r"(bias_ptr), [input_row_size] "r"(input_row_size),
[input_depth] "r"(input_depth),
[output_window_width] "r"(output_window_width),
[input_width_increment] "r"(input_width_increment),
[input_height_increment] "r"(input_height_increment),
[output_height_increment] "r"(output_height_increment),
[params_ptr] "r"(params_ptr)
:
// Clobbers: condition flags, memory, and the registers named in the asm.
// NOTE(review): x20 is listed here but is not referenced in the asm body.
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
"v30", "v31",
"x0", "x2", "x3", "x4", "x5", "x6", "x7",
"x10", "x11", "x12", "x13", "x14", "x15",
"x19", "x20");
#undef DEPTHWISECONV_LABEL_HEIGHT_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_1
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_1_END
}
};
// Specialization of DepthwiseConvPartialPerChannel for kUpward output
// rounding, EdgeType::kCenter and template arguments 1, 1.
// NOTE(review): the trailing 1, 1 are presumably the pad width and pad
// height, and kCenter presumably the case where only one filter tap overlaps
// the input - confirm against the primary template, which is not visible in
// this part of the file. The asm below multiplies exactly one input pixel by
// one filter tap per channel.
template <>
struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
EdgeType::kCenter, 1, 1> {
// Produces one int8 output pixel across the full output depth, 8 channels at
// a time.
//
// output_multiplier_ptr: int32 multipliers, one per channel.
// output_shift_ptr:      int32 shifts, one per channel.
// input_ptr:             int8 input values for the pixel, one per channel.
// filter_ptr:            int8 filter values for the tap, one per channel.
// bias_ptr:              int32 bias values, one per channel.
// output_ptr:            int8 output values, one per channel.
// params_ptr:            supplies input/output offsets, activation min/max
//                        and the output depth used as the channel count.
//
// NOTE(review): channels are consumed in groups of 8 and the first group is
// loaded unconditionally, so the output depth is presumably a positive
// multiple of 8 - confirm with the callers.
static inline void Run(const int32* output_multiplier_ptr,
const int32* output_shift_ptr, const int8* input_ptr,
const int8* filter_ptr, const int32* bias_ptr,
int8* output_ptr,
const DepthwiseConvParams* params_ptr) {
// The asm never reads a filter offset, so it is only valid when it is zero.
TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
// Local labels used as branch targets inside the asm below.
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP …
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP …
// Register usage:
//   w9, w10  scratch for scalar parameters
//   x11      remaining channel count (starts at the output depth)
//   v0       filter values, v8 input values (both widened to 16 bits)
//   v6, v7   output multipliers, v10, v11 output shifts
//   v16, v17 32-bit accumulators, seeded with the bias
//   v26      input offset, v28 output offset
//   v30, v31 activation min and max
asm volatile(
// Setup: broadcast the scalar parameters and preload the first group of 8
// channels. The flags from the cmp against 16 are consumed by the blt below.
"ld1 {v8.8b}, [%[input_ptr]], #8\n"
"ldr w9, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
"ldr x11, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
"dup v26.8h, w9\n"
"ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
"ld1 {v0.8b}, [%[filter_ptr]], #8\n"
"cmp x11, #16\n"
"dup v28.8h, w9\n"
"ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
"ldr w10, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v30.16b, w9\n"
"dup v31.16b, w10\n"
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
// Widen the input to 16 bits and add the input offset.
"saddw v8.8h, v26.8h, v8.8b\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
// Sign-extend the filter to 16 bits.
"sshll v0.8h, v0.8b, #0\n"
"ld1 {v6.4s}, [%[output_multiplier_ptr]], #16\n"
"ld1 {v10.4s}, [%[output_shift_ptr]], #16\n"
"ld1 {v7.4s}, [%[output_multiplier_ptr]], #16\n"
"ld1 {v11.4s}, [%[output_shift_ptr]], #16\n"
// Fewer than 16 channels: only the already loaded group needs processing.
"blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
// Loop: finish the current group of 8 channels while loading the next one.
DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"subs x11, x11, #8\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"ld1 {v8.8b}, [%[input_ptr]], #8\n"
// Flags from this cmp are consumed by the bge at the bottom of the loop.
"cmp x11, #16\n"
"ld1 {v0.8b}, [%[filter_ptr]], #8\n"
// Requantize: multiplier, shift, narrow to 16 bits, add the output offset,
// narrow to 8 bits, clamp to the activation range, store.
"sqrdmulh v16.4s, v16.4s, v6.4s\n"
"sqrdmulh v17.4s, v17.4s, v7.4s\n"
"sqrshl v16.4s, v16.4s, v10.4s\n"
"sqrshl v17.4s, v17.4s, v11.4s\n"
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqadd v16.8h, v16.8h, v28.8h\n"
"sqxtn v16.8b, v16.8h\n"
"smax v16.8b, v16.8b, v30.8b\n"
"smin v16.8b, v16.8b, v31.8b\n"
"st1 {v16.8b}, [%[output_ptr]], #8\n"
// Prepare the next group: offset-adjusted input, widened filter, bias,
// multipliers and shifts.
"saddw v8.8h, v26.8h, v8.8b\n"
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"sshll v0.8h, v0.8b, #0\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"ld1 {v6.4s}, [%[output_multiplier_ptr]], #16\n"
"ld1 {v10.4s}, [%[output_shift_ptr]], #16\n"
"ld1 {v7.4s}, [%[output_multiplier_ptr]], #16\n"
"ld1 {v11.4s}, [%[output_shift_ptr]], #16\n"
"bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"
// Tail: process the last loaded group without loading anything further.
DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"sqrdmulh v16.4s, v16.4s, v6.4s\n"
"sqrdmulh v17.4s, v17.4s, v7.4s\n"
"sqrshl v16.4s, v16.4s, v10.4s\n"
"sqrshl v17.4s, v17.4s, v11.4s\n"
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqadd v16.8h, v16.8h, v28.8h\n"
"sqxtn v16.8b, v16.8h\n"
"smax v16.8b, v16.8b, v30.8b\n"
"smin v16.8b, v16.8b, v31.8b\n"
"st1 {v16.8b}, [%[output_ptr]]\n"
:
// Read-write operands: every pointer is post-incremented by the asm. They
// are by-value parameters of Run(), so the changes are local to this call.
[filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
[output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr),
[output_multiplier_ptr] "+r"(output_multiplier_ptr),
[output_shift_ptr] "+r"(output_shift_ptr)
:
// Read-only operand.
[params_ptr] "r"(params_ptr)
:
// Clobbers: condition flags, memory, and the registers named in the asm.
// NOTE(review): v18 and v19 are listed here but are not referenced in the
// asm body.
"cc", "memory",
"v0", "v6", "v7", "v8", "v10", "v11", "v16", "v17", "v18", "v19",
"v26", "v28", "v30", "v31",
"x9", "x10", "x11");
#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP
}
};
// Specialization for EdgeType::kCorner (kUpward rounding; the trailing `1, 1`
// template arguments are presumably the stride - confirm against the primary
// template).
//
// Accumulates a 2x2 window: two adjacent pixels on each of two adjacent input
// rows are multiplied with four filter taps laid out the same way (two taps on
// each of two filter rows). The int32 accumulators start from the bias, are
// scaled by the per-channel multiplier and shift, offset by output_offset,
// narrowed to int8, clamped to the activation range and stored.
//
// Channels are handled in groups of 8. The group after the loop always runs,
// so at least 8 channels are read and written.
// NOTE(review): presumably output_depth is a positive multiple of 8 - confirm
// with the callers.
//
// Parameters:
//   output_multiplier_ptr  per-channel multipliers, 8 int32 read per group.
//   output_shift_ptr       per-channel shifts, 8 int32 read per group.
//   input_ptr              first (lowest-address) input pixel of the window.
//   filter_ptr             first filter tap to apply.
//   bias_ptr               per-channel bias, 8 int32 read per group.
//   output_ptr             destination of the int8 results.
//   params_ptr             supplies output_depth, input_row_size,
//                          filter_row_size, input_offset, output_offset and
//                          the activation min/max. filter_offset must be 0.
template <>
struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
EdgeType::kCorner, 1, 1> {
static inline void Run(const int32* output_multiplier_ptr,
const int32* output_shift_ptr, const int8* input_ptr,
const int8* filter_ptr, const int32* bias_ptr,
int8* output_ptr,
const DepthwiseConvParams* params_ptr) {
// The assembly below only sign-extends the filter values; no filter offset is
// ever added, hence the requirement that it is zero.
TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP …
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP …
asm volatile(
// x15 = output_depth: remaining-channel counter and also the byte step
// between horizontally adjacent pixels / filter taps.
// x9 = input_row_size (x9 is reused as a filter pointer further down).
"ldr x15, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
"ldr x9, [%[params_ptr], #" STR(OFFSET_INPUT_ROW_SIZE) "]\n"
// Flags set here are consumed by the "blt" just before the loop label.
"cmp x15, #16\n"
// Input pointers: %[input_ptr] and x12 on the first row, x13 and x14 on the
// next row. Each load fetches 8 channels and advances its pointer by 8.
"add x12, %[input_ptr], x15\n"
"add x13, %[input_ptr], x9\n"
"ld1 {v8.8b}, [%[input_ptr]], #8\n"
"add x14, x13, x15\n"
"ld1 {v9.8b}, [x12], #8\n"
// Filter pointers: %[filter_ptr] and x9 on the first filter row, x10 and x11
// on the row filter_row_size bytes further.
"ldr x6, [%[params_ptr], #" STR(OFFSET_FILTER_ROW_SIZE) "]\n"
"add x9, %[filter_ptr], x15\n"
"ld1 {v10.8b}, [x13], #8\n"
"add x10, %[filter_ptr], x6\n"
"ld1 {v11.8b}, [x14], #8\n"
"ld1 {v0.8b}, [%[filter_ptr]], #8\n"
"add x11, x10, x15\n"
"ld1 {v1.8b}, [x9], #8\n"
"ld1 {v2.8b}, [x10], #8\n"
"ld1 {v3.8b}, [x11], #8\n"
// Broadcast constants: v26 = input_offset, v28 = output_offset (16-bit
// lanes); v30 / v31 = activation min / max (8-bit lanes).
"ldr w6, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
"dup v26.8h, w6\n"
"ldr w6, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
"dup v28.8h, w6\n"
"ldr w6, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
"ldr w7, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v30.16b, w6\n"
"dup v31.16b, w7\n"
// Per-channel multipliers (v4, v5) and shifts (v6, v7) for the first 8
// channels.
"ld1 {v4.4s}, [%[output_multiplier_ptr]], #16\n"
"ld1 {v6.4s}, [%[output_shift_ptr]], #16\n"
"ld1 {v5.4s}, [%[output_multiplier_ptr]], #16\n"
"ld1 {v7.4s}, [%[output_shift_ptr]], #16\n"
// Widen the inputs to 16 bits while adding input_offset; the bias is loaded
// straight into the accumulators v16 / v17.
"saddw v8.8h, v26.8h, v8.8b\n"
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n"
// Sign-extend the filter taps to 16 bits.
"sshll v0.8h, v0.8b, #0\n"
"sshll v1.8h, v1.8b, #0\n"
"sshll v2.8h, v2.8b, #0\n"
"sshll v3.8h, v3.8b, #0\n"
// Fewer than 16 channels: skip the loop and process the single final group.
"blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n"
// Loop body: multiply-accumulate the current group of 8 channels while the
// loads for the next group are interleaved with the arithmetic.
"smlal v16.4s, v0.4h, v8.4h\n"
"subs x15, x15, #8\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"ld1 {v8.8b}, [%[input_ptr]], #8\n"
// This compare decides the "bge" at the bottom of the loop: iterate again
// only while at least 16 channels remain.
"cmp x15, #16\n"
"ld1 {v0.8b}, [%[filter_ptr]], #8\n"
"smlal v16.4s, v1.4h, v9.4h\n"
"smlal2 v17.4s, v1.8h, v9.8h\n"
"ld1 {v9.8b}, [x12], #8\n"
"smlal v16.4s, v2.4h, v10.4h\n"
"ld1 {v1.8b}, [x9], #8\n"
"smlal2 v17.4s, v2.8h, v10.8h\n"
"ld1 {v10.8b}, [x13], #8\n"
"smlal v16.4s, v3.4h, v11.4h\n"
"ld1 {v2.8b}, [x10], #8\n"
"smlal2 v17.4s, v3.8h, v11.8h\n"
"ld1 {v11.8b}, [x14], #8\n"
"ld1 {v3.8b}, [x11], #8\n"
// Requantize: saturating rounding doubling multiply-high by the per-channel
// multiplier, saturating rounding shift by the per-channel shift register,
// narrow to 16 bits, add output_offset, narrow to 8 bits, clamp, store.
"sqrdmulh v16.4s, v16.4s, v4.4s\n"
"sqrdmulh v17.4s, v17.4s, v5.4s\n"
"sqrshl v16.4s, v16.4s, v6.4s\n"
"sqrshl v17.4s, v17.4s, v7.4s\n"
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqadd v16.8h, v16.8h, v28.8h\n"
"sqxtn v16.8b, v16.8h\n"
"smax v16.8b, v16.8b, v30.8b\n"
"smin v16.8b, v16.8b, v31.8b\n"
"st1 {v16.8b}, [%[output_ptr]], #8\n"
// Prepare the next group: widen inputs and filters, reload bias, multipliers
// and shifts.
"saddw v8.8h, v26.8h, v8.8b\n"
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"sshll v0.8h, v0.8b, #0\n"
"sshll v1.8h, v1.8b, #0\n"
"sshll v2.8h, v2.8b, #0\n"
"sshll v3.8h, v3.8b, #0\n"
"ld1 {v4.4s}, [%[output_multiplier_ptr]], #16\n"
"ld1 {v6.4s}, [%[output_shift_ptr]], #16\n"
"ld1 {v5.4s}, [%[output_multiplier_ptr]], #16\n"
"ld1 {v7.4s}, [%[output_shift_ptr]], #16\n"
"bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"
DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n"
// Final group of 8 channels: same arithmetic, nothing further is loaded and
// the store does not advance output_ptr.
"smlal v16.4s, v0.4h, v8.4h\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"smlal v16.4s, v1.4h, v9.4h\n"
"smlal2 v17.4s, v1.8h, v9.8h\n"
"smlal v16.4s, v2.4h, v10.4h\n"
"smlal2 v17.4s, v2.8h, v10.8h\n"
"smlal v16.4s, v3.4h, v11.4h\n"
"smlal2 v17.4s, v3.8h, v11.8h\n"
"sqrdmulh v16.4s, v16.4s, v4.4s\n"
"sqrdmulh v17.4s, v17.4s, v5.4s\n"
"sqrshl v16.4s, v16.4s, v6.4s\n"
"sqrshl v17.4s, v17.4s, v7.4s\n"
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqadd v16.8h, v16.8h, v28.8h\n"
"sqxtn v16.8b, v16.8h\n"
"smax v16.8b, v16.8b, v30.8b\n"
"smin v16.8b, v16.8b, v31.8b\n"
"st1 {v16.8b}, [%[output_ptr]]\n"
:
// Outputs: every data pointer is post-incremented by the assembly.
[filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
[output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr),
[output_multiplier_ptr] "+r"(output_multiplier_ptr),
[output_shift_ptr] "+r"(output_shift_ptr)
:
// Inputs.
[params_ptr] "r"(params_ptr)
:
// Clobbers (v18 / v19 are listed although this kernel does not touch them).
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "v16", "v17","v18", "v19", "v26", "v28", "v30", "v31",
"x6", "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP
}
};
// Specialization for EdgeType::kHorizontal (kUpward rounding; the trailing
// `1, 1` template arguments are presumably the stride - confirm against the
// primary template).
//
// Accumulates a window that is 3 pixels wide and 2 rows high: three
// consecutive pixels (input_depth bytes apart) on each of two adjacent input
// rows are multiplied with six filter taps laid out the same way (three taps
// on each of two filter rows). The int32 accumulators start from the bias,
// are scaled by the per-channel multiplier and shift, offset by
// output_offset, narrowed to int8, clamped to the activation range and
// stored.
//
// Channels are handled in groups of 8. The group after the loop always runs,
// so at least 8 channels are read and written.
// NOTE(review): presumably output_depth is a positive multiple of 8 - confirm
// with the callers.
//
// Parameters:
//   output_multiplier_ptr  per-channel multipliers, 8 int32 read per group.
//   output_shift_ptr       per-channel shifts, 8 int32 read per group.
//   input_ptr              first (lowest-address) input pixel of the window.
//   filter_ptr             first filter tap to apply.
//   bias_ptr               per-channel bias, 8 int32 read per group.
//   output_ptr             destination of the int8 results.
//   params_ptr             supplies input_depth, input_row_size,
//                          filter_row_size, output_depth, input_offset,
//                          output_offset and the activation min/max.
//                          filter_offset must be 0.
template <>
struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
EdgeType::kHorizontal, 1, 1> {
static inline void Run(const int32* output_multiplier_ptr,
const int32* output_shift_ptr, const int8* input_ptr,
const int8* filter_ptr, const int32* bias_ptr,
int8* output_ptr,
const DepthwiseConvParams* params_ptr) {
// The assembly below only sign-extends the filter values; no filter offset is
// ever added, hence the requirement that it is zero.
TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP …
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP …
asm volatile(
// x7 = input_depth: byte step between horizontally adjacent pixels and
// between adjacent filter taps. x11 = input_row_size, x14 = filter_row_size,
// x15 = output_depth (remaining-channel counter).
// x12 / x13 walk the two input rows, x9 / x10 the two filter rows.
"ldr x7, [%[params_ptr], #" STR(OFFSET_INPUT_DEPTH) "]\n"
"mov x12, %[input_ptr]\n"
"ldr x11, [%[params_ptr], #" STR(OFFSET_INPUT_ROW_SIZE) "]\n"
"mov x9, %[filter_ptr]\n"
"ldr x14, [%[params_ptr], #" STR(OFFSET_FILTER_ROW_SIZE) "]\n"
"add x13, x12, x11\n"
"ldr x15, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
"ld1 {v8.8b}, [x12], x7\n"
"add x10, x9, x14\n"
"ld1 {v9.8b}, [x12], x7\n"
// Flags set here are consumed by the "blt" just before the loop label.
"cmp x15, #16\n"
"ld1 {v10.8b}, [x12]\n"
// Advance the base pointers to the next group of 8 channels.
"add %[input_ptr], %[input_ptr], #8\n"
"ld1 {v11.8b}, [x13], x7\n"
"add %[filter_ptr], %[filter_ptr], #8\n"
"ld1 {v12.8b}, [x13], x7\n"
"ld1 {v13.8b}, [x13]\n"
"ld1 {v0.8b}, [x9], x7\n"
"ld1 {v1.8b}, [x9], x7\n"
"ld1 {v2.8b}, [x9]\n"
"ld1 {v3.8b}, [x10], x7\n"
"ld1 {v4.8b}, [x10], x7\n"
"ld1 {v5.8b}, [x10]\n"
// x12 / x13 are reused as scratch registers from here until the loop
// recomputes them. Broadcast constants: v26 = input_offset,
// v28 = output_offset (16-bit lanes); v30 / v31 = activation min / max.
"ldr w12, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
"dup v26.8h, w12\n"
"ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
"dup v28.8h, w12\n"
"ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
"ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v30.8b, w12\n"
"dup v31.8b, w13\n"
// Per-channel multipliers (v6, v7) and shifts (v14, v15) for the first 8
// channels.
"ld1 {v6.4s}, [%[output_multiplier_ptr]], #16\n"
"ld1 {v14.4s}, [%[output_shift_ptr]], #16\n"
"ld1 {v7.4s}, [%[output_multiplier_ptr]], #16\n"
"ld1 {v15.4s}, [%[output_shift_ptr]], #16\n"
// Widen the inputs to 16 bits while adding input_offset; the bias is loaded
// straight into the accumulators v16 / v17.
"saddw v8.8h, v26.8h, v8.8b\n"
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"saddw v13.8h, v26.8h, v13.8b\n"
// Sign-extend the filter taps to 16 bits.
"sshll v0.8h, v0.8b, #0\n"
"sshll v1.8h, v1.8b, #0\n"
"sshll v2.8h, v2.8b, #0\n"
"sshll v3.8h, v3.8b, #0\n"
"sshll v4.8h, v4.8b, #0\n"
"sshll v5.8h, v5.8b, #0\n"
// Fewer than 16 channels: skip the loop and process the single final group.
"blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n"
// Loop body: re-derive the row pointers from the already advanced base
// pointers, multiply-accumulate the current group and interleave the loads
// for the next group.
"mov x12, %[input_ptr]\n"
"subs x15, x15, #8\n"
"add x13, x12, x11\n"
// This compare decides the "bge" at the bottom of the loop: iterate again
// only while at least 16 channels remain.
"cmp x15, #16\n"
"add %[input_ptr], %[input_ptr], #8\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"mov x9, %[filter_ptr]\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"ld1 {v8.8b}, [x12], x7\n"
"smlal v16.4s, v1.4h, v9.4h\n"
"add x10, x9, x14\n"
"smlal2 v17.4s, v1.8h, v9.8h\n"
"ld1 {v9.8b}, [x12], x7\n"
"smlal v16.4s, v2.4h, v10.4h\n"
"add %[filter_ptr], %[filter_ptr], #8\n"
"smlal2 v17.4s, v2.8h, v10.8h\n"
"ld1 {v10.8b}, [x12]\n"
"smlal v16.4s, v3.4h, v11.4h\n"
"ld1 {v0.8b}, [x9], x7\n"
"smlal2 v17.4s, v3.8h, v11.8h\n"
"ld1 {v11.8b}, [x13], x7\n"
"smlal v16.4s, v4.4h, v12.4h\n"
"ld1 {v1.8b}, [x9], x7\n"
"smlal2 v17.4s, v4.8h, v12.8h\n"
"ld1 {v12.8b}, [x13], x7\n"
"smlal v16.4s, v5.4h, v13.4h\n"
"ld1 {v2.8b}, [x9]\n"
"smlal2 v17.4s, v5.8h, v13.8h\n"
"ld1 {v13.8b}, [x13]\n"
// Requantize: saturating rounding doubling multiply-high by the per-channel
// multiplier, saturating rounding shift by the per-channel shift register,
// narrow to 16 bits, add output_offset, narrow to 8 bits, clamp, store.
"sqrdmulh v16.4s, v16.4s, v6.4s\n"
"ld1 {v3.8b}, [x10], x7\n"
"sqrdmulh v17.4s, v17.4s, v7.4s\n"
"ld1 {v4.8b}, [x10], x7\n"
"sqrshl v16.4s, v16.4s, v14.4s\n"
"ld1 {v5.8b}, [x10]\n"
"sqrshl v17.4s, v17.4s, v15.4s\n"
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqadd v16.8h, v16.8h, v28.8h\n"
"sqxtn v16.8b, v16.8h\n"
"smax v16.8b, v16.8b, v30.8b\n"
"smin v16.8b, v16.8b, v31.8b\n"
"saddw v8.8h, v26.8h, v8.8b\n"
"st1 {v16.8b}, [%[output_ptr]], #8\n"
// Prepare the next group: widen inputs and filters, reload bias, multipliers
// and shifts.
"saddw v9.8h, v26.8h, v9.8b\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"saddw v13.8h, v26.8h, v13.8b\n"
"sshll v0.8h, v0.8b, #0\n"
"sshll v1.8h, v1.8b, #0\n"
"sshll v2.8h, v2.8b, #0\n"
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"sshll v3.8h, v3.8b, #0\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"sshll v4.8h, v4.8b, #0\n"
"sshll v5.8h, v5.8b, #0\n"
"ld1 {v6.4s}, [%[output_multiplier_ptr]], #16\n"
"ld1 {v14.4s}, [%[output_shift_ptr]], #16\n"
"ld1 {v7.4s}, [%[output_multiplier_ptr]], #16\n"
"ld1 {v15.4s}, [%[output_shift_ptr]], #16\n"
"bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"
DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n"
// Final group of 8 channels: same arithmetic, nothing further is loaded and
// the store does not advance output_ptr.
"smlal v16.4s, v0.4h, v8.4h\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"smlal v16.4s, v1.4h, v9.4h\n"
"smlal2 v17.4s, v1.8h, v9.8h\n"
"smlal v16.4s, v2.4h, v10.4h\n"
"smlal2 v17.4s, v2.8h, v10.8h\n"
"smlal v16.4s, v3.4h, v11.4h\n"
"smlal2 v17.4s, v3.8h, v11.8h\n"
"smlal v16.4s, v4.4h, v12.4h\n"
"smlal2 v17.4s, v4.8h, v12.8h\n"
"smlal v16.4s, v5.4h, v13.4h\n"
"smlal2 v17.4s, v5.8h, v13.8h\n"
"sqrdmulh v16.4s, v16.4s, v6.4s\n"
"sqrdmulh v17.4s, v17.4s, v7.4s\n"
"sqrshl v16.4s, v16.4s, v14.4s\n"
"sqrshl v17.4s, v17.4s, v15.4s\n"
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqadd v16.8h, v16.8h, v28.8h\n"
"sqxtn v16.8b, v16.8h\n"
"smax v16.8b, v16.8b, v30.8b\n"
"smin v16.8b, v16.8b, v31.8b\n"
"st1 {v16.8b}, [%[output_ptr]]\n"
:
// Outputs: every data pointer is advanced by the assembly.
[filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
[output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr),
[output_multiplier_ptr] "+r"(output_multiplier_ptr),
[output_shift_ptr] "+r"(output_shift_ptr)
:
// Inputs.
[params_ptr] "r"(params_ptr)
:
// Clobbers (v18 / v19 are listed although this kernel does not touch them).
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v26", "v28", "v30", "v31",
"x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP
}
};
// Specialization for EdgeType::kVertical (kUpward rounding; the trailing
// `1, 1` template arguments are presumably the stride - confirm against the
// primary template).
//
// Accumulates a window that is 2 pixels wide and 3 rows high: two adjacent
// pixels (input_depth bytes apart) on each of three consecutive input rows
// are multiplied with six filter taps laid out the same way (two taps on each
// of three filter rows). The int32 accumulators start from the bias, are
// scaled by the per-channel multiplier and shift, offset by output_offset,
// narrowed to int8, clamped to the activation range and stored.
//
// Channels are handled in groups of 8. The group after the loop always runs,
// so at least 8 channels are read and written.
// NOTE(review): presumably output_depth is a positive multiple of 8 - confirm
// with the callers.
//
// Parameters:
//   output_multiplier_ptr  per-channel multipliers, 8 int32 read per group.
//   output_shift_ptr       per-channel shifts, 8 int32 read per group.
//   input_ptr              first (lowest-address) input pixel of the window.
//   filter_ptr             first filter tap to apply.
//   bias_ptr               per-channel bias, 8 int32 read per group.
//   output_ptr             destination of the int8 results.
//   params_ptr             supplies input_depth, input_row_size,
//                          filter_row_size, output_depth, input_offset,
//                          output_offset and the activation min/max.
//                          filter_offset must be 0.
template <>
struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
EdgeType::kVertical, 1, 1> {
static inline void Run(const int32* output_multiplier_ptr,
const int32* output_shift_ptr, const int8* input_ptr,
const int8* filter_ptr, const int32* bias_ptr,
int8* output_ptr,
const DepthwiseConvParams* params_ptr) {
// The assembly below only sign-extends the filter values; no filter offset is
// ever added, hence the requirement that it is zero.
TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP …
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP …
asm volatile(
// x6 = input_depth: byte step between horizontally adjacent pixels and
// between adjacent filter taps. x11 = input_row_size, x5 = filter_row_size,
// x15 = output_depth (remaining-channel counter).
// x12 / x13 / x14 walk the three input rows, x7 / x9 / x10 the three filter
// rows.
"ldr x6, [%[params_ptr], #" STR(OFFSET_INPUT_DEPTH) "]\n"
"mov x12, %[input_ptr]\n"
"ldr x11, [%[params_ptr], #" STR(OFFSET_INPUT_ROW_SIZE) "]\n"
"mov x7, %[filter_ptr]\n"
"ldr x5, [%[params_ptr], #" STR(OFFSET_FILTER_ROW_SIZE) "]\n"
"add x13, x12, x11\n"
"ldr x15, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
"add x14, x13, x11\n"
"ld1 {v8.8b}, [x12], x6\n"
"add x9, x7, x5\n"
"ld1 {v9.8b}, [x12]\n"
// Flags set here are consumed by the "blt" just before the loop label.
"cmp x15, #16\n"
"add x10, x9, x5\n"
"ld1 {v10.8b}, [x13], x6\n"
// Advance the base pointers to the next group of 8 channels.
"add %[input_ptr], %[input_ptr], #8\n"
"ld1 {v11.8b}, [x13]\n"
"add %[filter_ptr], %[filter_ptr], #8\n"
"ld1 {v12.8b}, [x14], x6\n"
"ld1 {v13.8b}, [x14]\n"
"ld1 {v0.8b}, [x7], x6\n"
"ld1 {v1.8b}, [x7]\n"
"ld1 {v2.8b}, [x9], x6\n"
"ld1 {v3.8b}, [x9]\n"
"ld1 {v4.8b}, [x10], x6\n"
"ld1 {v5.8b}, [x10]\n"
// x12 / x13 are reused as scratch registers from here until the loop
// recomputes them. Broadcast constants: v26 = input_offset,
// v28 = output_offset (16-bit lanes); v30 / v31 = activation min / max.
"ldr w12, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
"dup v26.8h, w12\n"
"ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
"dup v28.8h, w12\n"
"ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
"ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v30.8b, w12\n"
"dup v31.8b, w13\n"
// Per-channel multipliers (v6, v7) and shifts (v14, v15) for the first 8
// channels.
"ld1 {v6.4s}, [%[output_multiplier_ptr]], #16\n"
"ld1 {v14.4s}, [%[output_shift_ptr]], #16\n"
"ld1 {v7.4s}, [%[output_multiplier_ptr]], #16\n"
"ld1 {v15.4s}, [%[output_shift_ptr]], #16\n"
// Widen the inputs to 16 bits while adding input_offset; the bias is loaded
// straight into the accumulators v16 / v17.
"saddw v8.8h, v26.8h, v8.8b\n"
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"saddw v13.8h, v26.8h, v13.8b\n"
// Sign-extend the filter taps to 16 bits.
"sshll v0.8h, v0.8b, #0\n"
"sshll v1.8h, v1.8b, #0\n"
"sshll v2.8h, v2.8b, #0\n"
"sshll v3.8h, v3.8b, #0\n"
"sshll v4.8h, v4.8b, #0\n"
"sshll v5.8h, v5.8b, #0\n"
// Fewer than 16 channels: skip the loop and process the single final group.
"blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n"
// Loop body: re-derive the row pointers from the already advanced base
// pointers, multiply-accumulate the current group and interleave the loads
// for the next group.
"mov x12, %[input_ptr]\n"
"subs x15, x15, #8\n"
"add x13, x12, x11\n"
// This compare decides the "bge" at the bottom of the loop: iterate again
// only while at least 16 channels remain.
"cmp x15, #16\n"
"add x14, x13, x11\n"
"add %[input_ptr], %[input_ptr], #8\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"mov x7, %[filter_ptr]\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"ld1 {v8.8b}, [x12], x6\n"
"smlal v16.4s, v1.4h, v9.4h\n"
"add x9, x7, x5\n"
"smlal2 v17.4s, v1.8h, v9.8h\n"
"add x10, x9, x5\n"
"ld1 {v9.8b}, [x12]\n"
"smlal v16.4s, v2.4h, v10.4h\n"
"add %[filter_ptr], %[filter_ptr], #8\n"
"smlal2 v17.4s, v2.8h, v10.8h\n"
"ld1 {v10.8b}, [x13], x6\n"
"smlal v16.4s, v3.4h, v11.4h\n"
"ld1 {v0.8b}, [x7], x6\n"
"smlal2 v17.4s, v3.8h, v11.8h\n"
"ld1 {v11.8b}, [x13]\n"
"smlal v16.4s, v4.4h, v12.4h\n"
"ld1 {v1.8b}, [x7]\n"
"smlal2 v17.4s, v4.8h, v12.8h\n"
"ld1 {v12.8b}, [x14], x6\n"
"smlal v16.4s, v5.4h, v13.4h\n"
"ld1 {v2.8b}, [x9], x6\n"
"smlal2 v17.4s, v5.8h, v13.8h\n"
"ld1 {v13.8b}, [x14]\n"
// Requantize: saturating rounding doubling multiply-high by the per-channel
// multiplier, saturating rounding shift by the per-channel shift register,
// narrow to 16 bits, add output_offset, narrow to 8 bits, clamp, store.
"sqrdmulh v16.4s, v16.4s, v6.4s\n"
"ld1 {v3.8b}, [x9]\n"
"sqrdmulh v17.4s, v17.4s, v7.4s\n"
"ld1 {v4.8b}, [x10], x6\n"
"sqrshl v16.4s, v16.4s, v14.4s\n"
"ld1 {v5.8b}, [x10]\n"
"sqrshl v17.4s, v17.4s, v15.4s\n"
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqadd v16.8h, v16.8h, v28.8h\n"
"sqxtn v16.8b, v16.8h\n"
"smax v16.8b, v16.8b, v30.8b\n"
"smin v16.8b, v16.8b, v31.8b\n"
"saddw v8.8h, v26.8h, v8.8b\n"
"st1 {v16.8b}, [%[output_ptr]], #8\n"
// Prepare the next group: widen inputs and filters, reload bias, multipliers
// and shifts.
"saddw v9.8h, v26.8h, v9.8b\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"saddw v13.8h, v26.8h, v13.8b\n"
"sshll v0.8h, v0.8b, #0\n"
"sshll v1.8h, v1.8b, #0\n"
"sshll v2.8h, v2.8b, #0\n"
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"sshll v3.8h, v3.8b, #0\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"sshll v4.8h, v4.8b, #0\n"
"sshll v5.8h, v5.8b, #0\n"
"ld1 {v6.4s}, [%[output_multiplier_ptr]], #16\n"
"ld1 {v14.4s}, [%[output_shift_ptr]], #16\n"
"ld1 {v7.4s}, [%[output_multiplier_ptr]], #16\n"
"ld1 {v15.4s}, [%[output_shift_ptr]], #16\n"
"bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"
DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n"
// Final group of 8 channels: same arithmetic, nothing further is loaded and
// the store does not advance output_ptr.
"smlal v16.4s, v0.4h, v8.4h\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"smlal v16.4s, v1.4h, v9.4h\n"
"smlal2 v17.4s, v1.8h, v9.8h\n"
"smlal v16.4s, v2.4h, v10.4h\n"
"smlal2 v17.4s, v2.8h, v10.8h\n"
"smlal v16.4s, v3.4h, v11.4h\n"
"smlal2 v17.4s, v3.8h, v11.8h\n"
"smlal v16.4s, v4.4h, v12.4h\n"
"smlal2 v17.4s, v4.8h, v12.8h\n"
"smlal v16.4s, v5.4h, v13.4h\n"
"smlal2 v17.4s, v5.8h, v13.8h\n"
"sqrdmulh v16.4s, v16.4s, v6.4s\n"
"sqrdmulh v17.4s, v17.4s, v7.4s\n"
"sqrshl v16.4s, v16.4s, v14.4s\n"
"sqrshl v17.4s, v17.4s, v15.4s\n"
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqadd v16.8h, v16.8h, v28.8h\n"
"sqxtn v16.8b, v16.8h\n"
"smax v16.8b, v16.8b, v30.8b\n"
"smin v16.8b, v16.8b, v31.8b\n"
"st1 {v16.8b}, [%[output_ptr]]\n"
:
// Outputs: every data pointer is advanced by the assembly.
[filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
[output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr),
[output_multiplier_ptr] "+r"(output_multiplier_ptr),
[output_shift_ptr] "+r"(output_shift_ptr)
:
// Inputs.
[params_ptr] "r"(params_ptr)
:
// Clobbers (v18 / v19 are listed although this kernel does not touch them).
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v26", "v28", "v30", "v31",
"x5", "x6", "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP
}
};
// The DepthwiseConvParams field-offset macros are only needed by the inline
// assembly above. Undefine every one of them - including
// OFFSET_FILTER_ROW_SIZE, OFFSET_STRIDE_WIDTH and OFFSET_STRIDE_HEIGHT - so
// that none leaks into translation units that include this header.
#undef OFFSET_INPUT_DEPTH
#undef OFFSET_INPUT_ROW_SIZE
#undef OFFSET_OUTPUT_DEPTH
#undef OFFSET_OUTPUT_ROW_SIZE
#undef OFFSET_FILTER_ROW_SIZE
#undef OFFSET_INPUT_OFFSET
#undef OFFSET_OUTPUT_OFFSET
#undef OFFSET_OUTPUT_MULTIPLIER
#undef OFFSET_OUTPUT_ACTIVATION_MIN
#undef OFFSET_OUTPUT_ACTIVATION_MAX
#undef OFFSET_OUTPUT_RIGHT_SHIFT
#undef OFFSET_INPUT_WIDTH
#undef OFFSET_INPUT_HEIGHT
#undef OFFSET_STRIDE_WIDTH
#undef OFFSET_STRIDE_HEIGHT
#undef OFFSET_OUTPUT_WIDTH
#undef OFFSET_OUTPUT_HEIGHT
// Runs the 8-channel window kernel over a range of channels.
//
// Starting at start_depth, DepthwiseConvWindowPerChannel<..., 8, ...>::Run is
// invoked once for every complete slice of 8 channels that fits below
// end_depth; all data pointers are advanced by 8 elements between calls. A
// trailing remainder of fewer than 8 channels is not processed here.
//
// Parameters:
//   output_multiplier_ptr, output_shift_ptr  per-channel requantization data
//                                            for the first channel handled.
//   input_ptr, filter_ptr, bias_ptr          inputs for the first channel
//                                            handled.
//   output_ptr                               destination for the first
//                                            channel handled.
//   start_depth, end_depth                   channel range [start, end).
//   input_depth, input_row_size              forwarded unchanged to the
//                                            window kernel.
//   output_window_height, output_window_width
//                                            forwarded unchanged to the
//                                            window kernel.
//   params                                   passed to the window kernel by
//                                            address.
template <DepthwiseConvOutputRounding output_rounding, int32 kStrideWidth,
          int32 kStrideHeight>
struct DepthwiseConvThroughDepthPerChannel {
  static void __attribute__((noinline))
  Run(const int32* output_multiplier_ptr, const int32* output_shift_ptr,
      const int8* input_ptr, const int8* filter_ptr, const int32* bias_ptr,
      int8* output_ptr, int64_t start_depth, int64_t end_depth,
      int64_t input_depth, int64_t input_row_size, int32 output_window_height,
      int32 output_window_width, const DepthwiseConvParams& params) {
    for (; start_depth <= end_depth - 8; start_depth += 8) {
      // The window kernel expects a pointer to the parameter block.
      DepthwiseConvWindowPerChannel<output_rounding, 8, kStrideWidth,
                                    kStrideHeight>::Run(output_multiplier_ptr,
                                                        output_shift_ptr,
                                                        input_ptr, filter_ptr,
                                                        bias_ptr, output_ptr,
                                                        input_depth,
                                                        input_row_size,
                                                        output_window_height,
                                                        output_window_width,
                                                        &params);
      input_ptr += 8;
      output_ptr += 8;
      filter_ptr += 8;
      bias_ptr += 8;
      output_multiplier_ptr += 8;
      output_shift_ptr += 8;
    }
  }
};
// Convolves the output columns [start_x, end_x) of a band of
// shuffle_params.output_height output rows.
//
// When output_depth > 64 or input_width > 150, the band is walked in tiles of
// shuffle_params.output_width columns. Within a tile, every complete block of
// 64 channels is first copied into shuffle_workspace with ShuffleInput and
// convolved from there; the channels left over after the last complete block
// are convolved straight from the input. Columns that do not fill a whole
// tile (or all columns, when the tiled path is not taken) are convolved
// straight from the input in a single call.
//
// Parameters:
//   output_multiplier, output_shift  per-channel requantization data.
//   input_data                       input of the first column to process.
//   start_x, end_x                   output column range [start_x, end_x).
//   filter_data, bias_data           filter and bias for channel 0.
//   output_data                      output of the first column to process.
//   params                           convolution geometry and quantization.
//   shuffle_params                   tile geometry; must be consistent with
//                                    the template strides (checked below).
//   shuffle_workspace                scratch buffer; must hold at least
//                                    64 * input_width * input_height bytes of
//                                    a tile (checked below).
template <DepthwiseConvOutputRounding output_rounding, int32 kStrideWidth,
          int32 kStrideHeight>
struct DepthwiseConvMultiRowPerChannel {
  using ConvKernel =
      DepthwiseConvThroughDepthPerChannel<output_rounding, kStrideWidth,
                                          kStrideHeight>;

  static inline void Run(const int32* output_multiplier,
                         const int32* output_shift, const int8* input_data,
                         int32 start_x, int32 end_x, const int8* filter_data,
                         const int32* bias_data, int8* output_data,
                         const DepthwiseConvParams& params,
                         const ShuffleParams& shuffle_params,
                         int8* shuffle_workspace) {
    TFLITE_DCHECK(
        shuffle_params.input_height ==
        get_shuffle_input_size(kStrideHeight, shuffle_params.output_height));
    TFLITE_DCHECK(
        shuffle_params.input_width ==
        get_shuffle_input_size(kStrideWidth, shuffle_params.output_width));
    TFLITE_DCHECK_LE(
        64 * shuffle_params.input_width * shuffle_params.input_height,
        kDepthwiseConvScratchWorkspaceSize);

    // Issues optimized_ops_preload_l1_keep for every pixel of the tile-sized
    // input window that starts at window_start.
    const auto preload_window = [&](const int8* window_start) {
      const int8* row_start = window_start;
      for (int32 row = 0; row < shuffle_params.input_height; row++) {
        const int8* pixel = row_start;
        for (int32 col = 0; col < shuffle_params.input_width; col++) {
          optimized_ops_preload_l1_keep(pixel);
          pixel += params.input_depth;
        }
        row_start += params.input_row_size;
      }
    };

    int32 out_x = start_x;
    const bool use_tiled_path =
        params.output_depth > 64 || params.input_width > 150;
    if (use_tiled_path) {
      // Row size, in bytes, of one 64-channel block inside the workspace.
      const int64_t shuffle_row_size = 64 * shuffle_params.input_width;
      while (out_x <= (end_x - shuffle_params.output_width)) {
        // Complete 64-channel blocks go through the shuffle workspace.
        int64_t depth = 0;
        while (depth <= params.output_depth - 64) {
          const int8* block_input = input_data + depth;
          preload_window(block_input);
          ShuffleInput(block_input, params.input_depth, params.input_width,
                       params.input_height, 64, shuffle_params.input_width,
                       shuffle_params.input_height, shuffle_workspace);
          ConvKernel::Run(output_multiplier + depth, output_shift + depth,
                          shuffle_workspace, filter_data + depth,
                          bias_data + depth, output_data + depth, 0, 64, 64,
                          shuffle_row_size, shuffle_params.output_height,
                          shuffle_params.output_width, params);
          depth += 64;
        }

        // Channels beyond the last complete block are read from the input
        // directly.
        const int8* tail_input = input_data + depth;
        preload_window(tail_input);
        ConvKernel::Run(output_multiplier + depth, output_shift + depth,
                        tail_input, filter_data + depth, bias_data + depth,
                        output_data + depth, depth, params.output_depth,
                        params.input_depth, params.input_row_size,
                        shuffle_params.output_height,
                        shuffle_params.output_width, params);

        // Step to the next tile of output columns.
        input_data +=
            shuffle_params.output_width * kStrideWidth * params.input_depth;
        output_data += shuffle_params.output_width * params.output_depth;
        out_x += shuffle_params.output_width;
      }
    }

    // Whatever columns remain are handled in one direct call.
    const int32 remaining_width = end_x - out_x;
    if (remaining_width > 0) {
      ConvKernel::Run(output_multiplier, output_shift, input_data, filter_data,
                      bias_data, output_data, 0, params.output_depth,
                      params.input_depth, params.input_row_size,
                      shuffle_params.output_height, remaining_width, params);
    }
  }
};
template <DepthwiseConvOutputRounding output_rounding>
inline void DepthwiseConvHandlePaddingPerChannel(
const int32* output_multiplier_ptr, const int32* output_shift_ptr,
const int8* input_data, const int8* filter_data, const int32* bias_data,
int8* output_data, const DepthwiseConvParams& params) {
if (params.input_width == 1 && params.input_height == 1) {
const int8* filter_ptr =
filter_data + params.filter_row_size + params.output_depth;
DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kCenter, 1,
1>::Run(output_multiplier_ptr,
output_shift_ptr, input_data,
filter_ptr, bias_data, output_data,
¶ms);
return;
}
const int32 out_x_start_corner = 0;
const int32 out_x_end_corner = params.output_width - 1;
const int32 out_y_start_corner = 0;
const int32 out_y_end_corner = params.output_height - 1;
const int8* input_ptr = input_data;
const int8* filter_ptr =
filter_data + params.filter_row_size + params.output_depth;
int8* output_ptr = output_data;
DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kCorner, 1, 1>::Run(
output_multiplier_ptr, output_shift_ptr, input_ptr, filter_ptr, bias_data,
output_ptr, ¶ms);
input_ptr += (params.stride_width - 1) * params.input_depth;
filter_ptr = filter_data + params.filter_row_size;
output_ptr += params.output_depth;
for (int32 out_x = out_x_start_corner + 1; out_x < out_x_end_corner;
out_x++) {
DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kHorizontal, 1,
1>::Run(output_multiplier_ptr,
output_shift_ptr, input_ptr,
filter_ptr, bias_data, output_ptr,
¶ms);
input_ptr += params.stride_width * params.input_depth;
output_ptr += params.output_depth;
}
DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kCorner, 1, 1>::Run(
output_multiplier_ptr, output_shift_ptr, input_ptr, filter_ptr, bias_data,
output_ptr, ¶ms);
input_ptr = input_data + (params.stride_width - 1) * params.input_row_size;
filter_ptr = filter_data + params.input_depth;
output_ptr = output_data + params.output_row_size;
for (int32 out_y = out_y_start_corner + 1; out_y < out_y_end_corner;
out_y++) {
DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kVertical, 1,
1>::Run(output_multiplier_ptr,
output_shift_ptr, input_ptr,
filter_ptr, bias_data, output_ptr,
¶ms);
input_ptr += params.stride_width * params.input_row_size;
output_ptr += params.output_row_size;
}
input_ptr = input_data + (params.input_width - 2) * params.input_depth +
(params.stride_width - 1) * params.input_row_size;
filter_ptr = filter_data;
output_ptr = output_data + params.output_row_size +
(params.output_width - 1) * params.output_depth;
for (int32 out_y = out_y_start_corner + 1; out_y < out_y_end_corner;
out_y++) {
DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kVertical, 1,
1>::Run(output_multiplier_ptr,
output_shift_ptr, input_ptr,
filter_ptr, bias_data, output_ptr,
¶ms);
input_ptr += params.stride_width * params.input_row_size;
output_ptr += params.output_row_size;
}
input_ptr = input_data + (params.input_height - 2) * params.input_row_size;
filter_ptr = filter_data + params.output_depth;
output_ptr =
output_data + (params.output_height - 1) * params.output_row_size;
DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kCorner, 1, 1>::Run(
output_multiplier_ptr, output_shift_ptr, input_ptr, filter_ptr, bias_data,
output_ptr, ¶ms);
input_ptr += (params.stride_width == 1) ? 0 : params.input_depth;
filter_ptr = filter_data;
output_ptr += params.output_depth;
for (int32 out_x = out_x_start_corner + 1; out_x < out_x_end_corner;
out_x++) {
DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kHorizontal, 1,
1>::Run(output_multiplier_ptr,
output_shift_ptr, input_ptr,
filter_ptr, bias_data, output_ptr,
¶ms);
input_ptr += params.stride_width * params.input_depth;
output_ptr += params.output_depth;
}
DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kCorner, 1, 1>::Run(
output_multiplier_ptr, output_shift_ptr, input_ptr, filter_ptr, bias_data,
output_ptr, ¶ms);
}
template <DepthwiseConvOutputRounding output_rounding>
inline void DepthwiseConv3x3FilterPerChannel(
const DepthwiseParams& rt_params, const int32* output_multiplier_ptr,
const int32* output_shift_ptr, const RuntimeShape& input_shape,
const int8* input_data, const RuntimeShape& filter_shape,
const int8* filter_data, const RuntimeShape& bias_shape,
const int32* bias_data, const RuntimeShape& output_shape, int8* output_data,
int thread_start, int thread_end, int thread_dim) {
DepthwiseConvParams params;
const int32 stride_width = rt_params.stride_width;
const int32 stride_height = rt_params.stride_height;
const int32 pad_width = rt_params.padding_values.width;
const int32 pad_height = rt_params.padding_values.height;
const int32 depth_multiplier = rt_params.depth_multiplier;
const int32 output_activation_min = rt_params.quantized_activation_min;
const int32 output_activation_max = rt_params.quantized_activation_max;
const int32 input_offset = rt_params.input_offset;
const int32 filter_offset = rt_params.weights_offset;
const int32 output_offset = rt_params.output_offset;
params.input_depth = input_shape.Dims(3);
params.input_width = input_shape.Dims(2);
params.input_height = input_shape.Dims(1);
params.input_row_size = params.input_depth * params.input_width;
params.input_offset = input_offset;
params.stride_width = stride_width;
params.stride_height = stride_height;
params.output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
params.output_width = output_shape.Dims(2);
params.output_height = output_shape.Dims(1);
params.output_row_size = params.output_depth * params.output_width;
params.output_offset = output_offset;
params.filter_offset = filter_offset;
params.output_activation_min = output_activation_min;
params.output_activation_max = output_activation_max;
const int32 filter_height = filter_shape.Dims(1);
const int32 filter_width = filter_shape.Dims(2);
params.filter_row_size = params.output_depth * filter_width;
TFLITE_DCHECK(params.output_depth == params.input_depth * depth_multiplier);
TFLITE_DCHECK(depth_multiplier == 1);
TFLITE_DCHECK(filter_height == 3);
TFLITE_DCHECK(filter_width == 3);
TFLITE_DCHECK(stride_height == 1 || stride_height == 2);
TFLITE_DCHECK(stride_width == 1 || stride_width == 2);
TFLITE_DCHECK(stride_width == stride_height);
TFLITE_DCHECK(pad_height == 0 || pad_height == 1);
TFLITE_DCHECK(pad_width == 0 || pad_width == 1);
TFLITE_DCHECK(pad_width == pad_height);
TFLITE_DCHECK(thread_dim == 0 || thread_dim == 1);
const int32 batches = MatchingDim(input_shape, 0, output_shape, 0);
const int64_t input_batch_size = params.input_row_size * params.input_height;
const int64_t output_batch_size =
params.output_row_size * params.output_height;
ShuffleParams one_row_shuffle_params, two_row_shuffle_params,
four_row_shuffle_params, eight_row_shuffle_params;
if (stride_width == 1) {
one_row_shuffle_params = ShuffleParams(30, 1, 1, 1);
two_row_shuffle_params = ShuffleParams(22, 2, 1, 1);
four_row_shuffle_params = ShuffleParams(14, 4, 1, 1);
eight_row_shuffle_params = ShuffleParams(8, 8, 1, 1);
} else {
one_row_shuffle_params = ShuffleParams(14, 1, 2, 2);
two_row_shuffle_params = ShuffleParams(8, 2, 2, 2);
four_row_shuffle_params = ShuffleParams(4, 4, 2, 2);
eight_row_shuffle_params = ShuffleParams(2, 8, 2, 2);
}
using conv_multirow_func_t =
decltype(&DepthwiseConvMultiRowPerChannel<output_rounding, 1, 1>::Run);
conv_multirow_func_t conv_multirow_func =
DepthwiseConvMultiRowPerChannel<output_rounding, 1, 1>::Run;
if (stride_width == 2) {
conv_multirow_func =
DepthwiseConvMultiRowPerChannel<output_rounding, 2, 2>::Run;
}
int8 shuffle_workspace[kDepthwiseConvScratchWorkspaceSize];
int batch_start = 0;
int batch_end = batches;
int row_start = 0;
int row_end = params.output_height;
switch (thread_dim) {
case 0:
TFLITE_DCHECK_GE(thread_start, 0);
TFLITE_DCHECK_LE(thread_end, batches);
batch_start = thread_start;
batch_end = thread_end;
break;
case 1:
TFLITE_DCHECK_GE(thread_start, 0);
TFLITE_DCHECK_LE(thread_end, params.output_height);
row_start = thread_start;
row_end = thread_end;
break;
}
for (int32 b = batch_start; b < batch_end; ++b) {
const int8* input_ptr = input_data + b * input_batch_size;
int8* output_ptr = output_data + b * output_batch_size;
int32 out_x = 0;
int32 out_y = row_start;
int32 end_x = params.output_width;
int32 end_y = row_end;
if (pad_width == 1 && pad_height == 1) {
DepthwiseConvHandlePaddingPerChannel<output_rounding>(
output_multiplier_ptr, output_shift_ptr, input_ptr, filter_data,
bias_data, output_ptr, params);
out_x = 1;
end_x = params.output_width - 1;
out_y = std::max(1, out_y);
end_y = std::min(params.output_height - 1, end_y);
}
const int in_x = (out_x * stride_width) - pad_width;
const int in_y = (out_y * stride_height) - pad_height;
input_ptr += in_y * params.input_row_size + in_x * params.input_depth;
output_ptr += out_y * params.output_row_size + out_x * params.output_depth;
if (params.input_width < four_row_shuffle_params.input_width) {
for (; out_y <= end_y - 8; out_y += 8) {
conv_multirow_func(output_multiplier_ptr, output_shift_ptr, input_ptr,
out_x, end_x, filter_data, bias_data, output_ptr,
params, eight_row_shuffle_params, shuffle_workspace);
input_ptr += 8 * stride_height * params.input_row_size;
output_ptr += 8 * params.output_row_size;
}
}
if (params.input_width < two_row_shuffle_params.input_width) {
for (; out_y <= end_y - 4; out_y += 4) {
conv_multirow_func(output_multiplier_ptr, output_shift_ptr, input_ptr,
out_x, end_x, filter_data, bias_data, output_ptr,
params, four_row_shuffle_params, shuffle_workspace);
input_ptr += 4 * stride_height * params.input_row_size;
output_ptr += 4 * params.output_row_size;
}
}
for (; out_y <= end_y - 2; out_y += 2) {
conv_multirow_func(output_multiplier_ptr, output_shift_ptr, input_ptr,
out_x, end_x, filter_data, bias_data, output_ptr,
params, two_row_shuffle_params, shuffle_workspace);
input_ptr += 2 * stride_height * params.input_row_size;
output_ptr += 2 * params.output_row_size;
}
for (; out_y < end_y; out_y++) {
conv_multirow_func(output_multiplier_ptr, output_shift_ptr, input_ptr,
out_x, end_x, filter_data, bias_data, output_ptr,
params, one_row_shuffle_params, shuffle_workspace);
input_ptr += stride_height * params.input_row_size;
output_ptr += params.output_row_size;
}
}
}
#endif
#undef STR
#undef STR_UNEXPANDED
}
}
}
#endif