#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_HYBRID_3X3_FILTER_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_HYBRID_3X3_FILTER_H_
#include <stddef.h>
#include <memory>
#include "ruy/profiler/instrumentation.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace optimized_ops {
namespace depthwise_conv {
#define STR …
#define STR_UNEXPANDED …
#if defined(__aarch64__) && !defined(GOOGLE_L4T)
#define OFFSET_INPUT_DEPTH …
#define OFFSET_INPUT_ROW_SIZE …
#define OFFSET_OUTPUT_DEPTH …
#define OFFSET_OUTPUT_ROW_SIZE …
#define OFFSET_FILTER_ROW_SIZE …
#define OFFSET_INPUT_OFFSET …
#define OFFSET_OUTPUT_OFFSET …
#define OFFSET_OUTPUT_MULTIPLIER …
#define OFFSET_OUTPUT_ACTIVATION_MIN …
#define OFFSET_OUTPUT_ACTIVATION_MAX …
#define OFFSET_OUTPUT_RIGHT_SHIFT …
#define OFFSET_INPUT_WIDTH …
#define OFFSET_INPUT_HEIGHT …
#define OFFSET_STRIDE_WIDTH …
#define OFFSET_STRIDE_HEIGHT …
#define OFFSET_OUTPUT_WIDTH …
#define OFFSET_OUTPUT_HEIGHT …
#define OFFSET_FLOAT_OUTPUT_ACTIVATION_MIN …
#define OFFSET_FLOAT_OUTPUT_ACTIVATION_MAX …
static_assert(offsetof(DepthwiseConvParams, input_depth) == OFFSET_INPUT_DEPTH,
"");
static_assert(offsetof(DepthwiseConvParams, input_row_size) ==
OFFSET_INPUT_ROW_SIZE,
"");
static_assert(offsetof(DepthwiseConvParams, output_depth) ==
OFFSET_OUTPUT_DEPTH,
"");
static_assert(offsetof(DepthwiseConvParams, output_row_size) ==
OFFSET_OUTPUT_ROW_SIZE,
"");
static_assert(offsetof(DepthwiseConvParams, filter_row_size) ==
OFFSET_FILTER_ROW_SIZE,
"");
static_assert(offsetof(DepthwiseConvParams, input_offset) ==
OFFSET_INPUT_OFFSET,
"");
static_assert(offsetof(DepthwiseConvParams, output_offset) ==
OFFSET_OUTPUT_OFFSET,
"");
static_assert(offsetof(DepthwiseConvParams, output_multiplier) ==
OFFSET_OUTPUT_MULTIPLIER,
"");
static_assert(offsetof(DepthwiseConvParams, output_activation_min) ==
OFFSET_OUTPUT_ACTIVATION_MIN,
"");
static_assert(offsetof(DepthwiseConvParams, output_activation_max) ==
OFFSET_OUTPUT_ACTIVATION_MAX,
"");
static_assert(offsetof(DepthwiseConvParams, output_right_shift) ==
OFFSET_OUTPUT_RIGHT_SHIFT,
"");
static_assert(offsetof(DepthwiseConvParams, input_width) == OFFSET_INPUT_WIDTH,
"");
static_assert(offsetof(DepthwiseConvParams, input_height) ==
OFFSET_INPUT_HEIGHT,
"");
static_assert(offsetof(DepthwiseConvParams, stride_width) ==
OFFSET_STRIDE_WIDTH,
"");
static_assert(offsetof(DepthwiseConvParams, stride_height) ==
OFFSET_STRIDE_HEIGHT,
"");
static_assert(offsetof(DepthwiseConvParams, output_width) ==
OFFSET_OUTPUT_WIDTH,
"");
static_assert(offsetof(DepthwiseConvParams, output_height) ==
OFFSET_OUTPUT_HEIGHT,
"");
static_assert(offsetof(DepthwiseConvParams, float_output_activation_min) ==
OFFSET_FLOAT_OUTPUT_ACTIVATION_MIN,
"");
static_assert(offsetof(DepthwiseConvParams, float_output_activation_max) ==
OFFSET_FLOAT_OUTPUT_ACTIVATION_MAX,
"");
template <DepthwiseConvOutputRounding output_rounding, int32 kDepth,
int32 kStrideWidth, int32 kStrideHeight>
struct DepthwiseConvHybridWindowPerChannel {};
template <DepthwiseConvOutputRounding output_rounding, EdgeType kEdgeType,
int kPadWidth, int kPadHeight>
struct DepthwiseConvHybridPartialPerChannel {};
template <>
struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
8, 1, 1> {
public:
static inline void Run(const float* input_scale,
const int8* input_ptr,
const int8* filter_ptr, const float* bias_ptr,
float* output_ptr, int64_t input_depth,
int64_t input_row_size, int32 output_window_height,
int32 output_window_width,
const float* per_channel_scales,
const DepthwiseConvParams* params_ptr) {
const int64_t input_width_increment = 2 * input_depth;
const int64_t input_height_increment = 2 * input_row_size;
const int64_t output_height_increment = 2 * 4 * params_ptr->output_row_size;
TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
#define DEPTHWISECONV_LABEL_HEIGHT_2_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_1 …
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_1_END …
asm volatile(
"ldr w9, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
"ldr x3, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
"cmp %w[output_window_height], #2\n"
"ldr w4, [%[params_ptr], #" STR(OFFSET_FLOAT_OUTPUT_ACTIVATION_MIN) "]\n"
"ldr w0, [%[params_ptr], #" STR(OFFSET_FLOAT_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v25.4s, w4\n"
"dup v29.4s, w0\n"
"ldr x1, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n"
"mov x4, #4\n"
"mul x1, x1, x4\n"
"mul x4, x4, x3\n"
"ldr w2, [%[input_scale]]\n"
"ld1 {v27.4s, v28.4s}, [%[per_channel_scales]]\n"
"ld1 {v30.4s, v31.4s}, [%[bias_ptr]]\n"
"dup v26.4s, w2\n"
"fmul v27.4s, v27.4s, v26.4s\n"
"fmul v28.4s, v28.4s, v26.4s\n"
"dup v26.8h, w9\n"
"ld1 {v0.8b}, [%[filter_ptr]], x3\n"
"ld1 {v1.8b}, [%[filter_ptr]], x3\n"
"sshll v0.8h, v0.8b, #0\n"
"ld1 {v2.8b}, [%[filter_ptr]], x3\n"
"sshll v1.8h, v1.8b, #0\n"
"ld1 {v3.8b}, [%[filter_ptr]], x3\n"
"sshll v2.8h, v2.8b, #0\n"
"ld1 {v4.8b}, [%[filter_ptr]], x3\n"
"sshll v3.8h, v3.8b, #0\n"
"ld1 {v5.8b}, [%[filter_ptr]], x3\n"
"sshll v4.8h, v4.8b, #0\n"
"ld1 {v6.8b}, [%[filter_ptr]], x3\n"
"sshll v5.8h, v5.8b, #0\n"
"ld1 {v7.8b}, [%[filter_ptr]], x3\n"
"sshll v6.8h, v6.8b, #0\n"
"ld1 {v8.8b}, [%[filter_ptr]], x3\n"
"sshll v7.8h, v7.8b, #0\n"
"sshll v8.8h, v8.8b, #0\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "f\n"
DEPTHWISECONV_LABEL_HEIGHT_2_LOOP ":\n"
"mov x11, %[input_ptr]\n"
"mov x12, x11\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"add x13, x11, %[input_row_size]\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"add x14, x13, %[input_row_size]\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"add x15, x14, %[input_row_size]\n"
"ld1 {v12.8b}, [x13], %[input_depth]\n"
"mov w5, %w[output_window_width]\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"mov x6, %[output_ptr]\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"add x7, %[output_ptr], x1\n"
"ld1 {v15.8b}, [x14], %[input_depth]\n"
"cmp w5, #2\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"ld1 {v16.8b}, [x14], %[input_depth]\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"ld1 {v17.8b}, [x14], %[input_depth]\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"ld1 {v18.8b}, [x15], %[input_depth]\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"ld1 {v19.8b}, [x15], %[input_depth]\n"
"saddw v13.8h, v26.8h, v13.8b\n"
"ld1 {v20.8b}, [x15], %[input_depth]\n"
"saddw v14.8h, v26.8h, v14.8b\n"
"movi v21.4s, #0\n"
"saddw v15.8h, v26.8h, v15.8b\n"
"movi v22.4s, #0\n"
"saddw v16.8h, v26.8h, v16.8b\n"
"movi v23.4s, #0\n"
"saddw v17.8h, v26.8h, v17.8b\n"
"movi v24.4s, #0\n"
"saddw v18.8h, v26.8h, v18.8b\n"
"saddw v19.8h, v26.8h, v19.8b\n"
"saddw v20.8h, v26.8h, v20.8b\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "f\n"
"cmp w5, #1\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n"
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"subs w5, w5, #2\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"cmp w5, #3\n"
"smlal v23.4s, v0.4h, v12.4h\n"
"ld1 {v9.8b}, [x12]\n"
"smlal2 v24.4s, v0.8h, v12.8h\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"smlal v23.4s, v1.4h, v13.4h\n"
"smlal2 v24.4s, v1.8h, v13.8h\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"smlal v23.4s, v2.4h, v14.4h\n"
"smlal2 v24.4s, v2.8h, v14.8h\n"
"smlal v21.4s, v3.4h, v12.4h\n"
"smlal2 v22.4s, v3.8h, v12.8h\n"
"ld1 {v12.8b}, [x13]\n"
"smlal v23.4s, v3.4h, v15.4h\n"
"smlal2 v24.4s, v3.8h, v15.8h\n"
"smlal v21.4s, v4.4h, v13.4h\n"
"smlal2 v22.4s, v4.8h, v13.8h\n"
"smlal v23.4s, v4.4h, v16.4h\n"
"smlal2 v24.4s, v4.8h, v16.8h\n"
"smlal v21.4s, v5.4h, v14.4h\n"
"smlal2 v22.4s, v5.8h, v14.8h\n"
"smlal v23.4s, v5.4h, v17.4h\n"
"smlal2 v24.4s, v5.8h, v17.8h\n"
"smlal v21.4s, v6.4h, v15.4h\n"
"smlal2 v22.4s, v6.8h, v15.8h\n"
"ld1 {v15.8b}, [x14]\n"
"smlal v23.4s, v6.4h, v18.4h\n"
"smlal2 v24.4s, v6.8h, v18.8h\n"
"ld1 {v18.8b}, [x15]\n"
"smlal v21.4s, v7.4h, v16.4h\n"
"smlal2 v22.4s, v7.8h, v16.8h\n"
"smlal v23.4s, v7.4h, v19.4h\n"
"smlal2 v24.4s, v7.8h, v19.8h\n"
"smlal v21.4s, v8.4h, v17.4h\n"
"smlal2 v22.4s, v8.8h, v17.8h\n"
"smlal v23.4s, v8.4h, v20.4h\n"
"smlal2 v24.4s, v8.8h, v20.8h\n"
"scvtf v21.4s, v21.4s\n"
"scvtf v22.4s, v22.4s\n"
"scvtf v23.4s, v23.4s\n"
"scvtf v24.4s, v24.4s\n"
"fmul v21.4s, v21.4s, v27.4s\n"
"fmul v22.4s, v22.4s, v28.4s\n"
"fmul v23.4s, v23.4s, v27.4s\n"
"fmul v24.4s, v24.4s, v28.4s\n"
"fadd v21.4s, v21.4s, v30.4s\n"
"fadd v22.4s, v22.4s, v31.4s\n"
"fadd v23.4s, v23.4s, v30.4s\n"
"fadd v24.4s, v24.4s, v31.4s\n"
"fmax v21.4s, v21.4s, v25.4s\n"
"fmin v21.4s, v21.4s, v29.4s\n"
"fmax v22.4s, v22.4s, v25.4s\n"
"fmin v22.4s, v22.4s, v29.4s\n"
"fmax v23.4s, v23.4s, v25.4s\n"
"fmin v23.4s, v23.4s, v29.4s\n"
"fmax v24.4s, v24.4s, v25.4s\n"
"fmin v24.4s, v24.4s, v29.4s\n"
"st1 {v21.4s, v22.4s}, [x6], x4\n"
"st1 {v23.4s, v24.4s}, [x7], x4\n"
"fcvtms v21.4s, v21.4s\n"
"fcvtms v22.4s, v22.4s\n"
"fcvtms v23.4s, v23.4s\n"
"fcvtms v24.4s, v24.4s\n"
"movi v22.4s, #0\n"
"movi v24.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"saddw v15.8h, v26.8h, v15.8b\n"
"movi v21.4s, #0\n"
"saddw v18.8h, v26.8h, v18.8b\n"
"movi v23.4s, #0\n"
"smlal v21.4s, v0.4h, v10.4h\n"
"add x11, x11, %[input_width_increment]\n"
"smlal2 v22.4s, v0.8h, v10.8h\n"
"mov x12, x11\n"
"smlal v23.4s, v0.4h, v13.4h\n"
"add x13, x11, %[input_row_size]\n"
"smlal2 v24.4s, v0.8h, v13.8h\n"
"add x14, x13, %[input_row_size]\n"
"smlal v21.4s, v1.4h, v11.4h\n"
"add x15, x14, %[input_row_size]\n"
"smlal2 v22.4s, v1.8h, v11.8h\n"
"smlal v23.4s, v1.4h, v14.4h\n"
"smlal2 v24.4s, v1.8h, v14.8h\n"
"smlal v21.4s, v2.4h, v9.4h\n"
"smlal2 v22.4s, v2.8h, v9.8h\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v2.4h, v12.4h\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"smlal2 v24.4s, v2.8h, v12.8h\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"smlal v21.4s, v3.4h, v13.4h\n"
"smlal2 v22.4s, v3.8h, v13.8h\n"
"smlal v23.4s, v3.4h, v16.4h\n"
"smlal2 v24.4s, v3.8h, v16.8h\n"
"smlal v21.4s, v4.4h, v14.4h\n"
"smlal2 v22.4s, v4.8h, v14.8h\n"
"smlal v23.4s, v4.4h, v17.4h\n"
"smlal2 v24.4s, v4.8h, v17.8h\n"
"smlal v21.4s, v5.4h, v12.4h\n"
"smlal2 v22.4s, v5.8h, v12.8h\n"
"ld1 {v12.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v5.4h, v15.4h\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"smlal2 v24.4s, v5.8h, v15.8h\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"smlal v21.4s, v6.4h, v16.4h\n"
"smlal2 v22.4s, v6.8h, v16.8h\n"
"smlal v23.4s, v6.4h, v19.4h\n"
"smlal2 v24.4s, v6.8h, v19.8h\n"
"smlal v21.4s, v7.4h, v17.4h\n"
"smlal2 v22.4s, v7.8h, v17.8h\n"
"smlal v23.4s, v7.4h, v20.4h\n"
"smlal2 v24.4s, v7.8h, v20.8h\n"
"smlal v21.4s, v8.4h, v15.4h\n"
"smlal2 v22.4s, v8.8h, v15.8h\n"
"ld1 {v15.8b}, [x14], %[input_depth]\n"
"smlal v23.4s, v8.4h, v18.4h\n"
"ld1 {v16.8b}, [x14], %[input_depth]\n"
"smlal2 v24.4s, v8.8h, v18.8h\n"
"ld1 {v17.8b}, [x14], %[input_depth]\n"
"ld1 {v18.8b}, [x15], %[input_depth]\n"
"ld1 {v19.8b}, [x15], %[input_depth]\n"
"ld1 {v20.8b}, [x15], %[input_depth]\n"
"scvtf v21.4s, v21.4s\n"
"scvtf v22.4s, v22.4s\n"
"scvtf v23.4s, v23.4s\n"
"scvtf v24.4s, v24.4s\n"
"fmul v21.4s, v21.4s, v27.4s\n"
"fmul v22.4s, v22.4s, v28.4s\n"
"fmul v23.4s, v23.4s, v27.4s\n"
"fmul v24.4s, v24.4s, v28.4s\n"
"fadd v21.4s, v21.4s, v30.4s\n"
"fadd v22.4s, v22.4s, v31.4s\n"
"fadd v23.4s, v23.4s, v30.4s\n"
"fadd v24.4s, v24.4s, v31.4s\n"
"fmax v21.4s, v21.4s, v25.4s\n"
"fmin v21.4s, v21.4s, v29.4s\n"
"fmax v22.4s, v22.4s, v25.4s\n"
"fmin v22.4s, v22.4s, v29.4s\n"
"fmax v23.4s, v23.4s, v25.4s\n"
"fmin v23.4s, v23.4s, v29.4s\n"
"fmax v24.4s, v24.4s, v25.4s\n"
"fmin v24.4s, v24.4s, v29.4s\n"
"st1 {v21.4s, v22.4s}, [x6], x4\n"
"st1 {v23.4s, v24.4s}, [x7], x4\n"
"fcvtms v21.4s, v21.4s\n"
"fcvtms v22.4s, v22.4s\n"
"fcvtms v23.4s, v23.4s\n"
"fcvtms v24.4s, v24.4s\n"
"movi v22.4s, #0\n"
"movi v24.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"saddw v13.8h, v26.8h, v13.8b\n"
"saddw v14.8h, v26.8h, v14.8b\n"
"saddw v15.8h, v26.8h, v15.8b\n"
"movi v21.4s, #0\n"
"saddw v16.8h, v26.8h, v16.8b\n"
"movi v23.4s, #0\n"
"saddw v17.8h, v26.8h, v17.8b\n"
"saddw v18.8h, v26.8h, v18.8b\n"
"saddw v19.8h, v26.8h, v19.8b\n"
"saddw v20.8h, v26.8h, v20.8b\n"
"bge " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "b\n"
"cmp w5, #2\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n"
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"smlal v23.4s, v0.4h, v12.4h\n"
"ld1 {v9.8b}, [x12]\n"
"smlal2 v24.4s, v0.8h, v12.8h\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"smlal v23.4s, v1.4h, v13.4h\n"
"smlal2 v24.4s, v1.8h, v13.8h\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"smlal v23.4s, v2.4h, v14.4h\n"
"smlal2 v24.4s, v2.8h, v14.8h\n"
"smlal v21.4s, v3.4h, v12.4h\n"
"smlal2 v22.4s, v3.8h, v12.8h\n"
"ld1 {v12.8b}, [x13]\n"
"smlal v23.4s, v3.4h, v15.4h\n"
"smlal2 v24.4s, v3.8h, v15.8h\n"
"smlal v21.4s, v4.4h, v13.4h\n"
"smlal2 v22.4s, v4.8h, v13.8h\n"
"smlal v23.4s, v4.4h, v16.4h\n"
"smlal2 v24.4s, v4.8h, v16.8h\n"
"smlal v21.4s, v5.4h, v14.4h\n"
"smlal2 v22.4s, v5.8h, v14.8h\n"
"smlal v23.4s, v5.4h, v17.4h\n"
"smlal2 v24.4s, v5.8h, v17.8h\n"
"smlal v21.4s, v6.4h, v15.4h\n"
"smlal2 v22.4s, v6.8h, v15.8h\n"
"ld1 {v15.8b}, [x14]\n"
"smlal v23.4s, v6.4h, v18.4h\n"
"smlal2 v24.4s, v6.8h, v18.8h\n"
"ld1 {v18.8b}, [x15]\n"
"smlal v21.4s, v7.4h, v16.4h\n"
"smlal2 v22.4s, v7.8h, v16.8h\n"
"smlal v23.4s, v7.4h, v19.4h\n"
"smlal2 v24.4s, v7.8h, v19.8h\n"
"smlal v21.4s, v8.4h, v17.4h\n"
"smlal2 v22.4s, v8.8h, v17.8h\n"
"smlal v23.4s, v8.4h, v20.4h\n"
"smlal2 v24.4s, v8.8h, v20.8h\n"
"scvtf v21.4s, v21.4s\n"
"scvtf v22.4s, v22.4s\n"
"scvtf v23.4s, v23.4s\n"
"scvtf v24.4s, v24.4s\n"
"fmul v21.4s, v21.4s, v27.4s\n"
"fmul v22.4s, v22.4s, v28.4s\n"
"fmul v23.4s, v23.4s, v27.4s\n"
"fmul v24.4s, v24.4s, v28.4s\n"
"fadd v21.4s, v21.4s, v30.4s\n"
"fadd v22.4s, v22.4s, v31.4s\n"
"fadd v23.4s, v23.4s, v30.4s\n"
"fadd v24.4s, v24.4s, v31.4s\n"
"fmax v21.4s, v21.4s, v25.4s\n"
"fmin v21.4s, v21.4s, v29.4s\n"
"fmax v22.4s, v22.4s, v25.4s\n"
"fmin v22.4s, v22.4s, v29.4s\n"
"fmax v23.4s, v23.4s, v25.4s\n"
"fmin v23.4s, v23.4s, v29.4s\n"
"fmax v24.4s, v24.4s, v25.4s\n"
"fmin v24.4s, v24.4s, v29.4s\n"
"st1 {v21.4s, v22.4s}, [x6], x4\n"
"st1 {v23.4s, v24.4s}, [x7], x4\n"
"fcvtms v21.4s, v21.4s\n"
"fcvtms v22.4s, v22.4s\n"
"fcvtms v23.4s, v23.4s\n"
"fcvtms v24.4s, v24.4s\n"
"movi v22.4s, #0\n"
"movi v24.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"saddw v15.8h, v26.8h, v15.8b\n"
"movi v21.4s, #0\n"
"saddw v18.8h, v26.8h, v18.8b\n"
"movi v23.4s, #0\n"
"smlal v21.4s, v0.4h, v10.4h\n"
"smlal2 v22.4s, v0.8h, v10.8h\n"
"smlal v23.4s, v0.4h, v13.4h\n"
"smlal2 v24.4s, v0.8h, v13.8h\n"
"smlal v21.4s, v1.4h, v11.4h\n"
"smlal2 v22.4s, v1.8h, v11.8h\n"
"smlal v23.4s, v1.4h, v14.4h\n"
"smlal2 v24.4s, v1.8h, v14.8h\n"
"smlal v21.4s, v2.4h, v9.4h\n"
"smlal2 v22.4s, v2.8h, v9.8h\n"
"smlal v23.4s, v2.4h, v12.4h\n"
"smlal2 v24.4s, v2.8h, v12.8h\n"
"smlal v21.4s, v3.4h, v13.4h\n"
"smlal2 v22.4s, v3.8h, v13.8h\n"
"smlal v23.4s, v3.4h, v16.4h\n"
"smlal2 v24.4s, v3.8h, v16.8h\n"
"smlal v21.4s, v4.4h, v14.4h\n"
"smlal2 v22.4s, v4.8h, v14.8h\n"
"smlal v23.4s, v4.4h, v17.4h\n"
"smlal2 v24.4s, v4.8h, v17.8h\n"
"smlal v21.4s, v5.4h, v12.4h\n"
"smlal2 v22.4s, v5.8h, v12.8h\n"
"smlal v23.4s, v5.4h, v15.4h\n"
"smlal2 v24.4s, v5.8h, v15.8h\n"
"smlal v21.4s, v6.4h, v16.4h\n"
"smlal2 v22.4s, v6.8h, v16.8h\n"
"smlal v23.4s, v6.4h, v19.4h\n"
"smlal2 v24.4s, v6.8h, v19.8h\n"
"smlal v21.4s, v7.4h, v17.4h\n"
"smlal2 v22.4s, v7.8h, v17.8h\n"
"smlal v23.4s, v7.4h, v20.4h\n"
"smlal2 v24.4s, v7.8h, v20.8h\n"
"smlal v21.4s, v8.4h, v15.4h\n"
"smlal2 v22.4s, v8.8h, v15.8h\n"
"smlal v23.4s, v8.4h, v18.4h\n"
"smlal2 v24.4s, v8.8h, v18.8h\n"
"scvtf v21.4s, v21.4s\n"
"scvtf v22.4s, v22.4s\n"
"scvtf v23.4s, v23.4s\n"
"scvtf v24.4s, v24.4s\n"
"fmul v21.4s, v21.4s, v27.4s\n"
"fmul v22.4s, v22.4s, v28.4s\n"
"fmul v23.4s, v23.4s, v27.4s\n"
"fmul v24.4s, v24.4s, v28.4s\n"
"fadd v21.4s, v21.4s, v30.4s\n"
"fadd v22.4s, v22.4s, v31.4s\n"
"fadd v23.4s, v23.4s, v30.4s\n"
"fadd v24.4s, v24.4s, v31.4s\n"
"fmax v21.4s, v21.4s, v25.4s\n"
"fmin v21.4s, v21.4s, v29.4s\n"
"fmax v22.4s, v22.4s, v25.4s\n"
"fmin v22.4s, v22.4s, v29.4s\n"
"fmax v23.4s, v23.4s, v25.4s\n"
"fmin v23.4s, v23.4s, v29.4s\n"
"fmax v24.4s, v24.4s, v25.4s\n"
"fmin v24.4s, v24.4s, v29.4s\n"
"st1 {v21.4s, v22.4s}, [x6], x4\n"
"st1 {v23.4s, v24.4s}, [x7], x4\n"
"fcvtms v21.4s, v21.4s\n"
"fcvtms v22.4s, v22.4s\n"
"fcvtms v23.4s, v23.4s\n"
"fcvtms v24.4s, v24.4s\n"
"b " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "f\n"
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"smlal v23.4s, v0.4h, v12.4h\n"
"smlal2 v24.4s, v0.8h, v12.8h\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"smlal v23.4s, v1.4h, v13.4h\n"
"smlal2 v24.4s, v1.8h, v13.8h\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"smlal v23.4s, v2.4h, v14.4h\n"
"smlal2 v24.4s, v2.8h, v14.8h\n"
"smlal v21.4s, v3.4h, v12.4h\n"
"smlal2 v22.4s, v3.8h, v12.8h\n"
"smlal v23.4s, v3.4h, v15.4h\n"
"smlal2 v24.4s, v3.8h, v15.8h\n"
"smlal v21.4s, v4.4h, v13.4h\n"
"smlal2 v22.4s, v4.8h, v13.8h\n"
"smlal v23.4s, v4.4h, v16.4h\n"
"smlal2 v24.4s, v4.8h, v16.8h\n"
"smlal v21.4s, v5.4h, v14.4h\n"
"smlal2 v22.4s, v5.8h, v14.8h\n"
"smlal v23.4s, v5.4h, v17.4h\n"
"smlal2 v24.4s, v5.8h, v17.8h\n"
"smlal v21.4s, v6.4h, v15.4h\n"
"smlal2 v22.4s, v6.8h, v15.8h\n"
"smlal v23.4s, v6.4h, v18.4h\n"
"smlal2 v24.4s, v6.8h, v18.8h\n"
"smlal v21.4s, v7.4h, v16.4h\n"
"smlal2 v22.4s, v7.8h, v16.8h\n"
"smlal v23.4s, v7.4h, v19.4h\n"
"smlal2 v24.4s, v7.8h, v19.8h\n"
"smlal v21.4s, v8.4h, v17.4h\n"
"smlal2 v22.4s, v8.8h, v17.8h\n"
"smlal v23.4s, v8.4h, v20.4h\n"
"smlal2 v24.4s, v8.8h, v20.8h\n"
"scvtf v21.4s, v21.4s\n"
"scvtf v22.4s, v22.4s\n"
"scvtf v23.4s, v23.4s\n"
"scvtf v24.4s, v24.4s\n"
"fmul v21.4s, v21.4s, v27.4s\n"
"fmul v22.4s, v22.4s, v28.4s\n"
"fmul v23.4s, v23.4s, v27.4s\n"
"fmul v24.4s, v24.4s, v28.4s\n"
"fadd v21.4s, v21.4s, v30.4s\n"
"fadd v22.4s, v22.4s, v31.4s\n"
"fadd v23.4s, v23.4s, v30.4s\n"
"fadd v24.4s, v24.4s, v31.4s\n"
"fmax v21.4s, v21.4s, v25.4s\n"
"fmin v21.4s, v21.4s, v29.4s\n"
"fmax v22.4s, v22.4s, v25.4s\n"
"fmin v22.4s, v22.4s, v29.4s\n"
"fmax v23.4s, v23.4s, v25.4s\n"
"fmin v23.4s, v23.4s, v29.4s\n"
"fmax v24.4s, v24.4s, v25.4s\n"
"fmin v24.4s, v24.4s, v29.4s\n"
"st1 {v21.4s, v22.4s}, [x6], x4\n"
"st1 {v23.4s, v24.4s}, [x7], x4\n"
"fcvtms v21.4s, v21.4s\n"
"fcvtms v22.4s, v22.4s\n"
"fcvtms v23.4s, v23.4s\n"
"fcvtms v24.4s, v24.4s\n"
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP ":\n"
"subs %w[output_window_height], %w[output_window_height], #2\n"
"add %[input_ptr], %[input_ptr], %[input_height_increment]\n"
"cmp %w[output_window_height], #2\n"
"add %[output_ptr], %[output_ptr], %[output_height_increment]\n"
"bge " DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "b\n"
DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP ":\n"
"cmp %w[output_window_height], #1\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
DEPTHWISECONV_LABEL_HEIGHT_1 ":\n"
"mov x12, %[input_ptr]\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"add x13, %[input_ptr], %[input_row_size]\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"add x14, x13, %[input_row_size]\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"add x15, x14, %[input_row_size]\n"
"mov w5, %w[output_window_width]\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"mov x6, %[output_ptr]\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"add x7, %[output_ptr], x1\n"
"ld1 {v15.8b}, [x13], %[input_depth]\n"
"cmp w5, #2\n"
"ld1 {v17.8b}, [x14], %[input_depth]\n"
"ld1 {v18.8b}, [x14], %[input_depth]\n"
"ld1 {v19.8b}, [x14], %[input_depth]\n"
"movi v21.4s, #0\n"
"movi v22.4s, #0\n"
"movi v23.4s, #0\n"
"movi v24.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"saddw v13.8h, v26.8h, v13.8b\n"
"saddw v14.8h, v26.8h, v14.8b\n"
"saddw v15.8h, v26.8h, v15.8b\n"
"saddw v17.8h, v26.8h, v17.8b\n"
"saddw v18.8h, v26.8h, v18.8b\n"
"saddw v19.8h, v26.8h, v19.8b\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER "f\n"
"cmp w5, #1\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n"
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"ld1 {v12.8b}, [x12]\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"ld1 {v16.8b}, [x13]\n"
"smlal v23.4s, v0.4h, v10.4h\n"
"ld1 {v20.8b}, [x14]\n"
"smlal2 v24.4s, v0.8h, v10.8h\n"
"subs w5, w5, #2\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"cmp w5, #3\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"add %[input_ptr], %[input_ptr], %[input_width_increment]\n"
"smlal v23.4s, v1.4h, v11.4h\n"
"mov x12, %[input_ptr]\n"
"smlal2 v24.4s, v1.8h, v11.8h\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"add x13, %[input_ptr], %[input_row_size]\n"
"smlal v23.4s, v2.4h, v12.4h\n"
"add x14, x13, %[input_row_size]\n"
"smlal2 v24.4s, v2.8h, v12.8h\n"
"smlal v21.4s, v3.4h, v13.4h\n"
"add x15, x14, %[input_row_size]\n"
"smlal2 v22.4s, v3.8h, v13.8h\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v3.4h, v14.4h\n"
"smlal2 v24.4s, v3.8h, v14.8h\n"
"smlal v21.4s, v4.4h, v14.4h\n"
"smlal2 v22.4s, v4.8h, v14.8h\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v4.4h, v15.4h\n"
"smlal2 v24.4s, v4.8h, v15.8h\n"
"smlal v21.4s, v5.4h, v15.4h\n"
"saddw v16.8h, v26.8h, v16.8b\n"
"smlal2 v22.4s, v5.8h, v15.8h\n"
"ld1 {v15.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v5.4h, v16.4h\n"
"smlal2 v24.4s, v5.8h, v16.8h\n"
"smlal v21.4s, v6.4h, v17.4h\n"
"smlal2 v22.4s, v6.8h, v17.8h\n"
"ld1 {v17.8b}, [x14], %[input_depth]\n"
"smlal v23.4s, v6.4h, v18.4h\n"
"smlal2 v24.4s, v6.8h, v18.8h\n"
"smlal v21.4s, v7.4h, v18.4h\n"
"smlal2 v22.4s, v7.8h, v18.8h\n"
"ld1 {v18.8b}, [x14], %[input_depth]\n"
"smlal v23.4s, v7.4h, v19.4h\n"
"smlal2 v24.4s, v7.8h, v19.8h\n"
"smlal v21.4s, v8.4h, v19.4h\n"
"saddw v20.8h, v26.8h, v20.8b\n"
"smlal2 v22.4s, v8.8h, v19.8h\n"
"ld1 {v19.8b}, [x14], %[input_depth]\n"
"smlal v23.4s, v8.4h, v20.4h\n"
"smlal2 v24.4s, v8.8h, v20.8h\n"
"scvtf v21.4s, v21.4s\n"
"scvtf v22.4s, v22.4s\n"
"scvtf v23.4s, v23.4s\n"
"scvtf v24.4s, v24.4s\n"
"fmul v21.4s, v21.4s, v27.4s\n"
"fmul v22.4s, v22.4s, v28.4s\n"
"fmul v23.4s, v23.4s, v27.4s\n"
"fmul v24.4s, v24.4s, v28.4s\n"
"fadd v21.4s, v21.4s, v30.4s\n"
"fadd v22.4s, v22.4s, v31.4s\n"
"fadd v23.4s, v23.4s, v30.4s\n"
"fadd v24.4s, v24.4s, v31.4s\n"
"fmax v21.4s, v21.4s, v25.4s\n"
"fmin v21.4s, v21.4s, v29.4s\n"
"fmax v22.4s, v22.4s, v25.4s\n"
"fmin v22.4s, v22.4s, v29.4s\n"
"fmax v23.4s, v23.4s, v25.4s\n"
"fmin v23.4s, v23.4s, v29.4s\n"
"fmax v24.4s, v24.4s, v25.4s\n"
"fmin v24.4s, v24.4s, v29.4s\n"
"st1 {v21.4s, v22.4s}, [%[output_ptr]], x4\n"
"st1 {v23.4s, v24.4s}, [%[output_ptr]], x4\n"
"fcvtms v21.4s, v21.4s\n"
"fcvtms v22.4s, v22.4s\n"
"fcvtms v23.4s, v23.4s\n"
"fcvtms v24.4s, v24.4s\n"
"movi v22.4s, #0\n"
"movi v24.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"saddw v13.8h, v26.8h, v13.8b\n"
"saddw v14.8h, v26.8h, v14.8b\n"
"saddw v15.8h, v26.8h, v15.8b\n"
"movi v21.4s, #0\n"
"saddw v16.8h, v26.8h, v16.8b\n"
"movi v23.4s, #0\n"
"saddw v17.8h, v26.8h, v17.8b\n"
"saddw v18.8h, v26.8h, v18.8b\n"
"saddw v19.8h, v26.8h, v19.8b\n"
"saddw v20.8h, v26.8h, v20.8b\n"
"bge " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "b\n"
"cmp w5, #2\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n"
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"ld1 {v12.8b}, [x12], %[input_depth]\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"ld1 {v16.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v0.4h, v10.4h\n"
"ld1 {v20.8b}, [x14], %[input_depth]\n"
"smlal2 v24.4s, v0.8h, v10.8h\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"smlal v23.4s, v1.4h, v11.4h\n"
"smlal2 v24.4s, v1.8h, v11.8h\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"smlal v23.4s, v2.4h, v12.4h\n"
"smlal2 v24.4s, v2.8h, v12.8h\n"
"smlal v21.4s, v3.4h, v13.4h\n"
"smlal2 v22.4s, v3.8h, v13.8h\n"
"smlal v23.4s, v3.4h, v14.4h\n"
"smlal2 v24.4s, v3.8h, v14.8h\n"
"smlal v21.4s, v4.4h, v14.4h\n"
"smlal2 v22.4s, v4.8h, v14.8h\n"
"smlal v23.4s, v4.4h, v15.4h\n"
"smlal2 v24.4s, v4.8h, v15.8h\n"
"smlal v21.4s, v5.4h, v15.4h\n"
"saddw v16.8h, v26.8h, v16.8b\n"
"smlal2 v22.4s, v5.8h, v15.8h\n"
"smlal v23.4s, v5.4h, v16.4h\n"
"smlal2 v24.4s, v5.8h, v16.8h\n"
"smlal v21.4s, v6.4h, v17.4h\n"
"smlal2 v22.4s, v6.8h, v17.8h\n"
"smlal v23.4s, v6.4h, v18.4h\n"
"smlal2 v24.4s, v6.8h, v18.8h\n"
"smlal v21.4s, v7.4h, v18.4h\n"
"smlal2 v22.4s, v7.8h, v18.8h\n"
"smlal v23.4s, v7.4h, v19.4h\n"
"smlal2 v24.4s, v7.8h, v19.8h\n"
"smlal v21.4s, v8.4h, v19.4h\n"
"saddw v20.8h, v26.8h, v20.8b\n"
"smlal2 v22.4s, v8.8h, v19.8h\n"
"smlal v23.4s, v8.4h, v20.4h\n"
"smlal2 v24.4s, v8.8h, v20.8h\n"
"scvtf v21.4s, v21.4s\n"
"scvtf v22.4s, v22.4s\n"
"scvtf v23.4s, v23.4s\n"
"scvtf v24.4s, v24.4s\n"
"fmul v21.4s, v21.4s, v27.4s\n"
"fmul v22.4s, v22.4s, v28.4s\n"
"fmul v23.4s, v23.4s, v27.4s\n"
"fmul v24.4s, v24.4s, v28.4s\n"
"fadd v21.4s, v21.4s, v30.4s\n"
"fadd v22.4s, v22.4s, v31.4s\n"
"fadd v23.4s, v23.4s, v30.4s\n"
"fadd v24.4s, v24.4s, v31.4s\n"
"fmax v21.4s, v21.4s, v25.4s\n"
"fmin v21.4s, v21.4s, v29.4s\n"
"fmax v22.4s, v22.4s, v25.4s\n"
"fmin v22.4s, v22.4s, v29.4s\n"
"fmax v23.4s, v23.4s, v25.4s\n"
"fmin v23.4s, v23.4s, v29.4s\n"
"fmax v24.4s, v24.4s, v25.4s\n"
"fmin v24.4s, v24.4s, v29.4s\n"
"st1 {v21.4s, v22.4s}, [%[output_ptr]], x4\n"
"st1 {v23.4s, v24.4s}, [%[output_ptr]], x4\n"
"fcvtms v21.4s, v21.4s\n"
"fcvtms v22.4s, v22.4s\n"
"fcvtms v23.4s, v23.4s\n"
"fcvtms v24.4s, v24.4s\n"
"b " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"smlal v21.4s, v3.4h, v13.4h\n"
"smlal2 v22.4s, v3.8h, v13.8h\n"
"smlal v21.4s, v4.4h, v14.4h\n"
"smlal2 v22.4s, v4.8h, v14.8h\n"
"smlal v21.4s, v5.4h, v15.4h\n"
"smlal2 v22.4s, v5.8h, v15.8h\n"
"smlal v21.4s, v6.4h, v17.4h\n"
"smlal2 v22.4s, v6.8h, v17.8h\n"
"smlal v21.4s, v7.4h, v18.4h\n"
"smlal2 v22.4s, v7.8h, v18.8h\n"
"smlal v21.4s, v8.4h, v19.4h\n"
"smlal2 v22.4s, v8.8h, v19.8h\n"
"scvtf v21.4s, v21.4s\n"
"scvtf v22.4s, v22.4s\n"
"fmul v21.4s, v21.4s, v27.4s\n"
"fmul v22.4s, v22.4s, v28.4s\n"
"fadd v21.4s, v21.4s, v30.4s\n"
"fadd v22.4s, v22.4s, v31.4s\n"
"fmax v21.4s, v21.4s, v25.4s\n"
"fmin v21.4s, v21.4s, v29.4s\n"
"fmax v22.4s, v22.4s, v25.4s\n"
"fmin v22.4s, v22.4s, v29.4s\n"
"st1 {v21.4s, v22.4s}, [%[output_ptr]]\n"
DEPTHWISECONV_LABEL_HEIGHT_1_END ":\n"
:
[filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
[output_ptr] "+r"(output_ptr),
[output_window_height] "+r"(output_window_height),
[per_channel_scales] "+r"(per_channel_scales)
:
[input_scale] "r"(input_scale),
[bias_ptr] "r"(bias_ptr), [input_row_size] "r"(input_row_size),
[input_depth] "r"(input_depth),
[output_window_width] "r"(output_window_width),
[input_width_increment] "r"(input_width_increment),
[input_height_increment] "r"(input_height_increment),
[output_height_increment] "r"(output_height_increment),
[params_ptr] "r"(params_ptr)
:
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
"v30", "v31",
"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
"x9", "x10", "x11", "x12", "x13", "x14", "x15");
#undef DEPTHWISECONV_LABEL_HEIGHT_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_1
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_1_END
}
};
template <>
struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
8, 2, 2> {
static inline void Run(const float* input_scale, const int8* input_ptr,
const int8* filter_ptr, const float* bias_ptr,
float* output_ptr, int64_t input_depth,
int64_t input_row_size, int32 output_window_height,
int32 output_window_width,
const float* per_channel_scales,
const DepthwiseConvParams* params_ptr) {
const int64_t input_width_increment = 4 * input_depth;
const int64_t input_height_increment = 4 * input_row_size;
const int64_t output_height_increment = 2 * 4 * params_ptr->output_row_size;
TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
#define DEPTHWISECONV_LABEL_HEIGHT_2_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_1 …
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP …
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER …
#define DEPTHWISECONV_LABEL_HEIGHT_1_END …
asm volatile(
"ldr w0, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
"cmp %w[output_window_height], #2\n"
"ldr x5, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
"ldr x19, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n"
"mov x4, #4\n"
"mul x19, x19, x4\n"
"mul x4, x4, x5\n"
"ldr w2, [%[input_scale]]\n"
"dup v28.4s, w2\n"
"ldr w3, [%[params_ptr], #" STR(OFFSET_FLOAT_OUTPUT_ACTIVATION_MIN) "]\n"
"ldr w2, [%[params_ptr], #" STR(OFFSET_FLOAT_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v29.4s, w2\n"
"ld1 {v30.4s, v31.4s}, [%[per_channel_scales]]\n"
"fmul v30.4s, v30.4s, v28.4s\n"
"fmul v31.4s, v31.4s, v28.4s\n"
"dup v28.8h, w0\n"
"ld1 {v0.8b}, [%[filter_ptr]], x5\n"
"ld1 {v1.8b}, [%[filter_ptr]], x5\n"
"sshll v0.8h, v0.8b, #0\n"
"ld1 {v2.8b}, [%[filter_ptr]], x5\n"
"sshll v1.8h, v1.8b, #0\n"
"ld1 {v3.8b}, [%[filter_ptr]], x5\n"
"sshll v2.8h, v2.8b, #0\n"
"ld1 {v4.8b}, [%[filter_ptr]], x5\n"
"sshll v3.8h, v3.8b, #0\n"
"ld1 {v5.8b}, [%[filter_ptr]], x5\n"
"sshll v4.8h, v4.8b, #0\n"
"ld1 {v6.8b}, [%[filter_ptr]], x5\n"
"sshll v5.8h, v5.8b, #0\n"
"ld1 {v7.8b}, [%[filter_ptr]], x5\n"
"sshll v6.8h, v6.8b, #0\n"
"ld1 {v8.8b}, [%[filter_ptr]]\n"
"sshll v7.8h, v7.8b, #0\n"
"sshll v8.8h, v8.8b, #0\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "f\n"
DEPTHWISECONV_LABEL_HEIGHT_2_LOOP ":\n"
"mov x11, %[input_ptr]\n"
"mov x12, x11\n"
"add x13, x12, %[input_row_size]\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"mov w14, %w[output_window_width]\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"cmp w14, #2\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"add x15, x13, %[input_row_size]\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"mov x6, %[output_ptr]\n"
"ld1 {v15.8b}, [x13], %[input_depth]\n"
"add x7, %[output_ptr], x19\n"
"ld1 {v16.8b}, [x13], %[input_depth]\n"
"movi v21.4s, #0\n"
"movi v22.4s, #0\n"
"movi v23.4s, #0\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"movi v24.4s, #0\n"
"saddw v10.8h, v28.8h, v10.8b\n"
"movi v19.4s, #0\n"
"saddw v11.8h, v28.8h, v11.8b\n"
"movi v20.4s, #0\n"
"saddw v14.8h, v28.8h, v14.8b\n"
"movi v25.4s, #0\n"
"saddw v15.8h, v28.8h, v15.8b\n"
"movi v26.4s, #0\n"
"saddw v16.8h, v28.8h, v16.8b\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "f\n"
"cmp w14, #1\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n"
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"ld1 {v12.8b}, [x12], %[input_depth]\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"ld1 {v13.8b}, [x12]\n"
"add x12, x15, %[input_row_size]\n"
"smlal v23.4s, v0.4h, v11.4h\n"
"ld1 {v17.8b}, [x13], %[input_depth]\n"
"smlal2 v24.4s, v0.8h, v11.8h\n"
"ld1 {v18.8b}, [x13]\n"
"add x13, x12, %[input_row_size]\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"ld1 {v9.8b}, [x15], %[input_depth]\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"ld1 {v10.8b}, [x15], %[input_depth]\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"ld1 {v11.8b}, [x15], %[input_depth]\n"
"smlal v21.4s, v3.4h, v14.4h\n"
"smlal2 v22.4s, v3.8h, v14.8h\n"
"ld1 {v14.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v3.4h, v16.4h\n"
"subs w14, w14, #2\n"
"smlal2 v24.4s, v3.8h, v16.8h\n"
"cmp w14, #3\n"
"smlal v21.4s, v4.4h, v15.4h\n"
"saddw v12.8h, v28.8h, v12.8b\n"
"smlal2 v22.4s, v4.8h, v15.8h\n"
"ld1 {v15.8b}, [x12], %[input_depth]\n"
"smlal v21.4s, v5.4h, v16.4h\n"
"saddw v13.8h, v28.8h, v13.8b\n"
"smlal2 v22.4s, v5.8h, v16.8h\n"
"ld1 {v16.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v1.4h, v12.4h\n"
"saddw v17.8h, v28.8h, v17.8b\n"
"smlal2 v24.4s, v1.8h, v12.8h\n"
"ld1 {v12.8b}, [x15], %[input_depth]\n"
"smlal v23.4s, v2.4h, v13.4h\n"
"saddw v18.8h, v28.8h, v18.8b\n"
"smlal2 v24.4s, v2.8h, v13.8h\n"
"ld1 {v13.8b}, [x15]\n"
"smlal v23.4s, v4.4h, v17.4h\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"smlal2 v24.4s, v4.8h, v17.8h\n"
"ld1 {v17.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v5.4h, v18.4h\n"
"saddw v10.8h, v28.8h, v10.8b\n"
"smlal2 v24.4s, v5.8h, v18.8h\n"
"ld1 {v18.8b}, [x12]\n"
"smlal v21.4s, v6.4h, v9.4h\n"
"smlal2 v22.4s, v6.8h, v9.8h\n"
"smlal v19.4s, v0.4h, v9.4h\n"
"saddw v11.8h, v28.8h, v11.8b\n"
"smlal2 v20.4s, v0.8h, v9.8h\n"
"ld1 {v9.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v6.4h, v11.4h\n"
"smlal2 v24.4s, v6.8h, v11.8h\n"
"smlal v21.4s, v7.4h, v10.4h\n"
"smlal2 v22.4s, v7.8h, v10.8h\n"
"saddw v12.8h, v28.8h, v12.8b\n"
"smlal v19.4s, v1.4h, v10.4h\n"
"smlal2 v20.4s, v1.8h, v10.8h\n"
"ld1 {v10.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v7.4h, v12.4h\n"
"smlal2 v24.4s, v7.8h, v12.8h\n"
"smlal v25.4s, v1.4h, v12.4h\n"
"smlal2 v26.4s, v1.8h, v12.8h\n"
"smlal v21.4s, v8.4h, v11.4h\n"
"smlal2 v22.4s, v8.8h, v11.8h\n"
"add x11, x11, %[input_width_increment]\n"
"smlal v19.4s, v2.4h, v11.4h\n"
"mov x12, x11\n"
"smlal2 v20.4s, v2.8h, v11.8h\n"
"saddw v13.8h, v28.8h, v13.8b\n"
"smlal v25.4s, v0.4h, v11.4h\n"
"smlal2 v26.4s, v0.8h, v11.8h\n"
"ld1 {v11.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v8.4h, v13.4h\n"
"ld1 {v12.8b}, [x13], %[input_depth]\n"
"smlal2 v24.4s, v8.8h, v13.8h\n"
"smlal v25.4s, v2.4h, v13.4h\n"
"smlal2 v26.4s, v2.8h, v13.8h\n"
"ld1 {v13.8b}, [x13]\n"
"add x13, x12, %[input_row_size]\n"
"add x15, x13, %[input_row_size]\n"
"ld1 {v27.4s, v28.4s}, [%[bias_ptr]]\n"
"scvtf v21.4s, v21.4s\n"
"scvtf v22.4s, v22.4s\n"
"scvtf v23.4s, v23.4s\n"
"scvtf v24.4s, v24.4s\n"
"fmul v21.4s, v21.4s, v30.4s\n"
"fmul v22.4s, v22.4s, v31.4s\n"
"fmul v23.4s, v23.4s, v30.4s\n"
"fmul v24.4s, v24.4s, v31.4s\n"
"fadd v21.4s, v21.4s, v27.4s\n"
"fadd v22.4s, v22.4s, v28.4s\n"
"fadd v23.4s, v23.4s, v27.4s\n"
"fadd v24.4s, v24.4s, v28.4s\n"
"dup v28.8h, w0\n"
"dup v27.4s, w3\n"
"fmax v21.4s, v21.4s, v27.4s\n"
"fmin v21.4s, v21.4s, v29.4s\n"
"fmax v22.4s, v22.4s, v27.4s\n"
"fmin v22.4s, v22.4s, v29.4s\n"
"fmax v23.4s, v23.4s, v27.4s\n"
"fmin v23.4s, v23.4s, v29.4s\n"
"fmax v24.4s, v24.4s, v27.4s\n"
"fmin v24.4s, v24.4s, v29.4s\n"
"st1 {v21.4s, v22.4s}, [x6], x4\n"
"st1 {v23.4s, v24.4s}, [x6], x4\n"
"fcvtms v21.4s, v21.4s\n"
"fcvtms v22.4s, v22.4s\n"
"fcvtms v23.4s, v23.4s\n"
"fcvtms v24.4s, v24.4s\n"
"movi v22.4s, #0\n"
"movi v24.4s, #0\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"saddw v10.8h, v28.8h, v10.8b\n"
"saddw v11.8h, v28.8h, v11.8b\n"
"smlal v19.4s, v6.4h, v9.4h\n"
"smlal2 v20.4s, v6.8h, v9.8h\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"smlal v25.4s, v6.4h, v11.4h\n"
"smlal2 v26.4s, v6.8h, v11.8h\n"
"smlal v19.4s, v7.4h, v10.4h\n"
"saddw v12.8h, v28.8h, v12.8b\n"
"smlal2 v20.4s, v7.8h, v10.8h\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"smlal v25.4s, v7.4h, v12.4h\n"
"smlal2 v26.4s, v7.8h, v12.8h\n"
"smlal v19.4s, v8.4h, v11.4h\n"
"saddw v13.8h, v28.8h, v13.8b\n"
"smlal2 v20.4s, v8.8h, v11.8h\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"smlal v25.4s, v8.4h, v13.4h\n"
"saddw v14.8h, v28.8h, v14.8b\n"
"smlal2 v26.4s, v8.8h, v13.8h\n"
"saddw v16.8h, v28.8h, v16.8b\n"
"smlal v19.4s, v3.4h, v14.4h\n"
"saddw v15.8h, v28.8h, v15.8b\n"
"smlal2 v20.4s, v3.8h, v14.8h\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"smlal v25.4s, v3.4h, v16.4h\n"
"movi v21.4s, #0\n"
"smlal2 v26.4s, v3.8h, v16.8h\n"
"movi v23.4s, #0\n"
"smlal v19.4s, v4.4h, v15.4h\n"
"saddw v17.8h, v28.8h, v17.8b\n"
"smlal2 v20.4s, v4.8h, v15.8h\n"
"ld1 {v15.8b}, [x13], %[input_depth]\n"
"smlal v25.4s, v4.4h, v17.4h\n"
"smlal2 v26.4s, v4.8h, v17.8h\n"
"smlal v19.4s, v5.4h, v16.4h\n"
"saddw v18.8h, v28.8h, v18.8b\n"
"smlal2 v20.4s, v5.8h, v16.8h\n"
"ld1 {v16.8b}, [x13], %[input_depth]\n"
"smlal v25.4s, v5.4h, v18.4h\n"
"smlal2 v26.4s, v5.8h, v18.8h\n"
"ld1 {v27.4s, v28.4s}, [%[bias_ptr]]\n"
"scvtf v19.4s, v19.4s\n"
"scvtf v20.4s, v20.4s\n"
"scvtf v25.4s, v25.4s\n"
"scvtf v26.4s, v26.4s\n"
"fmul v19.4s, v19.4s, v30.4s\n"
"fmul v20.4s, v20.4s, v31.4s\n"
"fmul v25.4s, v25.4s, v30.4s\n"
"fmul v26.4s, v26.4s, v31.4s\n"
"fadd v19.4s, v19.4s, v27.4s\n"
"fadd v20.4s, v20.4s, v28.4s\n"
"fadd v25.4s, v25.4s, v27.4s\n"
"fadd v26.4s, v26.4s, v28.4s\n"
"dup v27.4s, w3\n"
"fmax v19.4s, v19.4s, v27.4s\n"
"fmin v19.4s, v19.4s, v29.4s\n"
"fmax v20.4s, v20.4s, v27.4s\n"
"fmin v20.4s, v20.4s, v29.4s\n"
"fmax v25.4s, v25.4s, v27.4s\n"
"fmin v25.4s, v25.4s, v29.4s\n"
"fmax v26.4s, v26.4s, v27.4s\n"
"fmin v26.4s, v26.4s, v29.4s\n"
"dup v28.8h, w0\n"
"st1 {v19.4s, v20.4s}, [x7], x4\n"
"st1 {v25.4s, v26.4s}, [x7], x4\n"
"fcvtms v19.4s, v19.4s\n"
"fcvtms v20.4s, v20.4s\n"
"fcvtms v25.4s, v25.4s\n"
"fcvtms v26.4s, v26.4s\n"
"movi v20.4s, #0\n"
"movi v26.4s, #0\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"saddw v10.8h, v28.8h, v10.8b\n"
"saddw v11.8h, v28.8h, v11.8b\n"
"movi v19.4s, #0\n"
"saddw v14.8h, v28.8h, v14.8b\n"
"movi v25.4s, #0\n"
"saddw v15.8h, v28.8h, v15.8b\n"
"saddw v16.8h, v28.8h, v16.8b\n"
"bge " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "b\n"
"cmp w14, #2\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n"
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"ld1 {v12.8b}, [x12], %[input_depth]\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"ld1 {v13.8b}, [x12]\n"
"add x12, x15, %[input_row_size]\n"
"smlal v23.4s, v0.4h, v11.4h\n"
"ld1 {v17.8b}, [x13], %[input_depth]\n"
"smlal2 v24.4s, v0.8h, v11.8h\n"
"ld1 {v18.8b}, [x13]\n"
"add x13, x12, %[input_row_size]\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"ld1 {v9.8b}, [x15], %[input_depth]\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"ld1 {v10.8b}, [x15], %[input_depth]\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"ld1 {v11.8b}, [x15], %[input_depth]\n"
"smlal v21.4s, v3.4h, v14.4h\n"
"smlal2 v22.4s, v3.8h, v14.8h\n"
"ld1 {v14.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v3.4h, v16.4h\n"
"smlal2 v24.4s, v3.8h, v16.8h\n"
"smlal v21.4s, v4.4h, v15.4h\n"
"saddw v12.8h, v28.8h, v12.8b\n"
"smlal2 v22.4s, v4.8h, v15.8h\n"
"ld1 {v15.8b}, [x12], %[input_depth]\n"
"smlal v21.4s, v5.4h, v16.4h\n"
"saddw v13.8h, v28.8h, v13.8b\n"
"smlal2 v22.4s, v5.8h, v16.8h\n"
"ld1 {v16.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v1.4h, v12.4h\n"
"saddw v17.8h, v28.8h, v17.8b\n"
"smlal2 v24.4s, v1.8h, v12.8h\n"
"ld1 {v12.8b}, [x15], %[input_depth]\n"
"smlal v23.4s, v2.4h, v13.4h\n"
"saddw v18.8h, v28.8h, v18.8b\n"
"smlal2 v24.4s, v2.8h, v13.8h\n"
"ld1 {v13.8b}, [x15]\n"
"smlal v23.4s, v4.4h, v17.4h\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"smlal2 v24.4s, v4.8h, v17.8h\n"
"ld1 {v17.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v5.4h, v18.4h\n"
"saddw v10.8h, v28.8h, v10.8b\n"
"smlal2 v24.4s, v5.8h, v18.8h\n"
"ld1 {v18.8b}, [x12]\n"
"smlal v21.4s, v6.4h, v9.4h\n"
"smlal2 v22.4s, v6.8h, v9.8h\n"
"smlal v19.4s, v0.4h, v9.4h\n"
"saddw v11.8h, v28.8h, v11.8b\n"
"smlal2 v20.4s, v0.8h, v9.8h\n"
"ld1 {v9.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v6.4h, v11.4h\n"
"smlal2 v24.4s, v6.8h, v11.8h\n"
"smlal v21.4s, v7.4h, v10.4h\n"
"smlal2 v22.4s, v7.8h, v10.8h\n"
"saddw v12.8h, v28.8h, v12.8b\n"
"smlal v19.4s, v1.4h, v10.4h\n"
"smlal2 v20.4s, v1.8h, v10.8h\n"
"ld1 {v10.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v7.4h, v12.4h\n"
"smlal2 v24.4s, v7.8h, v12.8h\n"
"smlal v25.4s, v1.4h, v12.4h\n"
"smlal2 v26.4s, v1.8h, v12.8h\n"
"smlal v21.4s, v8.4h, v11.4h\n"
"smlal2 v22.4s, v8.8h, v11.8h\n"
"smlal v19.4s, v2.4h, v11.4h\n"
"smlal2 v20.4s, v2.8h, v11.8h\n"
"saddw v13.8h, v28.8h, v13.8b\n"
"smlal v25.4s, v0.4h, v11.4h\n"
"smlal2 v26.4s, v0.8h, v11.8h\n"
"ld1 {v11.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v8.4h, v13.4h\n"
"ld1 {v12.8b}, [x13], %[input_depth]\n"
"smlal2 v24.4s, v8.8h, v13.8h\n"
"smlal v25.4s, v2.4h, v13.4h\n"
"smlal2 v26.4s, v2.8h, v13.8h\n"
"ld1 {v13.8b}, [x13]\n"
"ld1 {v27.4s, v28.4s}, [%[bias_ptr]]\n"
"scvtf v21.4s, v21.4s\n"
"scvtf v22.4s, v22.4s\n"
"scvtf v23.4s, v23.4s\n"
"scvtf v24.4s, v24.4s\n"
"fmul v21.4s, v21.4s, v30.4s\n"
"fmul v22.4s, v22.4s, v31.4s\n"
"fmul v23.4s, v23.4s, v30.4s\n"
"fmul v24.4s, v24.4s, v31.4s\n"
"fadd v21.4s, v21.4s, v27.4s\n"
"fadd v22.4s, v22.4s, v28.4s\n"
"fadd v23.4s, v23.4s, v27.4s\n"
"fadd v24.4s, v24.4s, v28.4s\n"
"dup v28.8h, w0\n"
"dup v27.4s, w3\n"
"fmax v21.4s, v21.4s, v27.4s\n"
"fmin v21.4s, v21.4s, v29.4s\n"
"fmax v22.4s, v22.4s, v27.4s\n"
"fmin v22.4s, v22.4s, v29.4s\n"
"fmax v23.4s, v23.4s, v27.4s\n"
"fmin v23.4s, v23.4s, v29.4s\n"
"fmax v24.4s, v24.4s, v27.4s\n"
"fmin v24.4s, v24.4s, v29.4s\n"
"st1 {v21.4s, v22.4s}, [x6], x4\n"
"st1 {v23.4s, v24.4s}, [x6]\n"
"fcvtms v21.4s, v21.4s\n"
"fcvtms v22.4s, v22.4s\n"
"fcvtms v23.4s, v23.4s\n"
"fcvtms v24.4s, v24.4s\n"
"movi v22.4s, #0\n"
"movi v24.4s, #0\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"saddw v10.8h, v28.8h, v10.8b\n"
"saddw v11.8h, v28.8h, v11.8b\n"
"smlal v19.4s, v6.4h, v9.4h\n"
"smlal2 v20.4s, v6.8h, v9.8h\n"
"smlal v25.4s, v6.4h, v11.4h\n"
"smlal2 v26.4s, v6.8h, v11.8h\n"
"smlal v19.4s, v7.4h, v10.4h\n"
"saddw v12.8h, v28.8h, v12.8b\n"
"smlal2 v20.4s, v7.8h, v10.8h\n"
"smlal v25.4s, v7.4h, v12.4h\n"
"smlal2 v26.4s, v7.8h, v12.8h\n"
"smlal v19.4s, v8.4h, v11.4h\n"
"saddw v13.8h, v28.8h, v13.8b\n"
"smlal2 v20.4s, v8.8h, v11.8h\n"
"smlal v25.4s, v8.4h, v13.4h\n"
"saddw v14.8h, v28.8h, v14.8b\n"
"smlal2 v26.4s, v8.8h, v13.8h\n"
"saddw v16.8h, v28.8h, v16.8b\n"
"smlal v19.4s, v3.4h, v14.4h\n"
"saddw v15.8h, v28.8h, v15.8b\n"
"smlal2 v20.4s, v3.8h, v14.8h\n"
"smlal v25.4s, v3.4h, v16.4h\n"
"smlal2 v26.4s, v3.8h, v16.8h\n"
"smlal v19.4s, v4.4h, v15.4h\n"
"saddw v17.8h, v28.8h, v17.8b\n"
"smlal2 v20.4s, v4.8h, v15.8h\n"
"smlal v25.4s, v4.4h, v17.4h\n"
"smlal2 v26.4s, v4.8h, v17.8h\n"
"smlal v19.4s, v5.4h, v16.4h\n"
"saddw v18.8h, v28.8h, v18.8b\n"
"smlal2 v20.4s, v5.8h, v16.8h\n"
"smlal v25.4s, v5.4h, v18.4h\n"
"smlal2 v26.4s, v5.8h, v18.8h\n"
"ld1 {v27.4s, v28.4s}, [%[bias_ptr]]\n"
"scvtf v19.4s, v19.4s\n"
"scvtf v20.4s, v20.4s\n"
"scvtf v25.4s, v25.4s\n"
"scvtf v26.4s, v26.4s\n"
"fmul v19.4s, v19.4s, v30.4s\n"
"fmul v20.4s, v20.4s, v31.4s\n"
"fmul v25.4s, v25.4s, v30.4s\n"
"fmul v26.4s, v26.4s, v31.4s\n"
"fadd v19.4s, v19.4s, v27.4s\n"
"fadd v20.4s, v20.4s, v28.4s\n"
"fadd v25.4s, v25.4s, v27.4s\n"
"fadd v26.4s, v26.4s, v28.4s\n"
"dup v28.8h, w0\n"
"dup v27.4s, w3\n"
"fmax v19.4s, v19.4s, v27.4s\n"
"fmin v19.4s, v19.4s, v29.4s\n"
"fmax v20.4s, v20.4s, v27.4s\n"
"fmin v20.4s, v20.4s, v29.4s\n"
"fmax v25.4s, v25.4s, v27.4s\n"
"fmin v25.4s, v25.4s, v29.4s\n"
"fmax v26.4s, v26.4s, v27.4s\n"
"fmin v26.4s, v26.4s, v29.4s\n"
"dup v28.8h, w0\n"
"st1 {v19.4s, v20.4s}, [x7], x4\n"
"st1 {v25.4s, v26.4s}, [x7]\n"
"fcvtms v19.4s, v19.4s\n"
"fcvtms v20.4s, v20.4s\n"
"fcvtms v25.4s, v25.4s\n"
"fcvtms v26.4s, v26.4s\n"
"b " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "f\n"
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER ":\n"
"add x12, x15, %[input_row_size]\n"
"add x13, x12, %[input_row_size]\n"
"ld1 {v12.8b}, [x15], %[input_depth]\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"ld1 {v13.8b}, [x15], %[input_depth]\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"ld1 {v17.8b}, [x15]\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"ld1 {v11.8b}, [x12]\n"
"smlal v21.4s, v3.4h, v14.4h\n"
"smlal2 v22.4s, v3.8h, v14.8h\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"smlal v21.4s, v4.4h, v15.4h\n"
"smlal2 v22.4s, v4.8h, v15.8h\n"
"ld1 {v15.8b}, [x13], %[input_depth]\n"
"smlal v21.4s, v5.4h, v16.4h\n"
"saddw v12.8h, v28.8h, v12.8b\n"
"smlal2 v22.4s, v5.8h, v16.8h\n"
"saddw v13.8h, v28.8h, v13.8b\n"
"ld1 {v16.8b}, [x13]\n"
"smlal v21.4s, v6.4h, v12.4h\n"
"smlal2 v22.4s, v6.8h, v12.8h\n"
"smlal v23.4s, v0.4h, v12.4h\n"
"saddw v17.8h, v28.8h, v17.8b\n"
"smlal2 v24.4s, v0.8h, v12.8h\n"
"smlal v21.4s, v7.4h, v13.4h\n"
"smlal2 v22.4s, v7.8h, v13.8h\n"
"smlal v23.4s, v1.4h, v13.4h\n"
"smlal2 v24.4s, v1.8h, v13.8h\n"
"smlal v21.4s, v8.4h, v17.4h\n"
"smlal2 v22.4s, v8.8h, v17.8h\n"
"smlal v23.4s, v2.4h, v17.4h\n"
"smlal2 v24.4s, v2.8h, v17.8h\n"
"ld1 {v26.4s, v27.4s}, [%[bias_ptr]]\n"
"scvtf v21.4s, v21.4s\n"
"scvtf v22.4s, v22.4s\n"
"fmul v21.4s, v21.4s, v30.4s\n"
"fmul v22.4s, v22.4s, v31.4s\n"
"fadd v21.4s, v21.4s, v26.4s\n"
"fadd v22.4s, v22.4s, v27.4s\n"
"dup v26.4s, w3\n"
"fmax v21.4s, v21.4s, v26.4s\n"
"fmin v21.4s, v21.4s, v29.4s\n"
"fmax v22.4s, v22.4s, v26.4s\n"
"fmin v22.4s, v22.4s, v29.4s\n"
"st1 {v21.4s, v22.4s}, [x6]\n"
"fcvtms v21.4s, v21.4s\n"
"fcvtms v22.4s, v22.4s\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"saddw v10.8h, v28.8h, v10.8b\n"
"smlal v23.4s, v3.4h, v9.4h\n"
"saddw v11.8h, v28.8h, v11.8b\n"
"smlal2 v24.4s, v3.8h, v9.8h\n"
"saddw v14.8h, v28.8h, v14.8b\n"
"smlal v23.4s, v4.4h, v10.4h\n"
"saddw v15.8h, v28.8h, v15.8b\n"
"smlal2 v24.4s, v4.8h, v10.8h\n"
"saddw v16.8h, v28.8h, v16.8b\n"
"smlal v23.4s, v5.4h, v11.4h\n"
"smlal2 v24.4s, v5.8h, v11.8h\n"
"smlal v23.4s, v6.4h, v14.4h\n"
"smlal2 v24.4s, v6.8h, v14.8h\n"
"smlal v23.4s, v7.4h, v15.4h\n"
"smlal2 v24.4s, v7.8h, v15.8h\n"
"smlal v23.4s, v8.4h, v16.4h\n"
"smlal2 v24.4s, v8.8h, v16.8h\n"
"ld1 {v26.4s, v27.4s}, [%[bias_ptr]]\n"
"scvtf v23.4s, v23.4s\n"
"scvtf v24.4s, v24.4s\n"
"fmul v23.4s, v23.4s, v30.4s\n"
"fmul v24.4s, v24.4s, v31.4s\n"
"fadd v23.4s, v23.4s, v26.4s\n"
"fadd v24.4s, v24.4s, v27.4s\n"
"dup v26.4s, w3\n"
"fmax v23.4s, v23.4s, v26.4s\n"
"fmin v23.4s, v23.4s, v29.4s\n"
"fmax v24.4s, v24.4s, v26.4s\n"
"fmin v24.4s, v24.4s, v29.4s\n"
"st1 {v23.4s, v24.4s}, [x7]\n"
"fcvtms v23.4s, v23.4s\n"
"fcvtms v24.4s, v24.4s\n"
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP ":\n"
"subs %w[output_window_height], %w[output_window_height], #2\n"
"add %[input_ptr], %[input_ptr], %[input_height_increment]\n"
"cmp %w[output_window_height], #2\n"
"add %[output_ptr], %[output_ptr], %[output_height_increment]\n"
"bge " DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "b\n"
DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP ":\n"
"cmp %w[output_window_height], #1\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
DEPTHWISECONV_LABEL_HEIGHT_1 ":\n"
"mov x11, %[input_ptr]\n"
"mov x12, x11\n"
"add x13, x12, %[input_row_size]\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"add x15, x13, %[input_row_size]\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"mov x6, %[output_ptr]\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"mov w14, %w[output_window_width]\n"
"cmp w14, #2\n"
"ld1 {v12.8b}, [x13], %[input_depth]\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"ld1 {v15.8b}, [x15], %[input_depth]\n"
"ld1 {v16.8b}, [x15], %[input_depth]\n"
"ld1 {v17.8b}, [x15], %[input_depth]\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"movi v24.4s, #0\n"
"saddw v10.8h, v28.8h, v10.8b\n"
"movi v25.4s, #0\n"
"saddw v11.8h, v28.8h, v11.8b\n"
"movi v26.4s, #0\n"
"movi v27.4s, #0\n"
"saddw v12.8h, v28.8h, v12.8b\n"
"saddw v13.8h, v28.8h, v13.8b\n"
"saddw v14.8h, v28.8h, v14.8b\n"
"saddw v15.8h, v28.8h, v15.8b\n"
"saddw v16.8h, v28.8h, v16.8b\n"
"saddw v17.8h, v28.8h, v17.8b\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER "f\n"
"cmp w14, #1\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n"
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP ":\n"
"smlal v24.4s, v0.4h, v9.4h\n"
"ld1 {v18.8b}, [x12], %[input_depth]\n"
"smlal2 v25.4s, v0.8h, v9.8h\n"
"ld1 {v19.8b}, [x12]\n"
"smlal v26.4s, v0.4h, v11.4h\n"
"ld1 {v20.8b}, [x13], %[input_depth]\n"
"smlal2 v27.4s, v0.8h, v11.8h\n"
"ld1 {v21.8b}, [x13]\n"
"smlal v24.4s, v1.4h, v10.4h\n"
"ld1 {v22.8b}, [x15], %[input_depth]\n"
"smlal2 v25.4s, v1.8h, v10.8h\n"
"ld1 {v23.8b}, [x15]\n"
"smlal v24.4s, v2.4h, v11.4h\n"
"subs w14, w14, #2\n"
"smlal2 v25.4s, v2.8h, v11.8h\n"
"cmp w14, #3\n"
"smlal v24.4s, v3.4h, v12.4h\n"
"add x11, x11, %[input_width_increment]\n"
"smlal2 v25.4s, v3.8h, v12.8h\n"
"mov x12, x11\n"
"smlal v26.4s, v3.4h, v14.4h\n"
"add x13, x12, %[input_row_size]\n"
"smlal2 v27.4s, v3.8h, v14.8h\n"
"add x15, x13, %[input_row_size]\n"
"smlal v24.4s, v4.4h, v13.4h\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"smlal2 v25.4s, v4.8h, v13.8h\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"smlal v24.4s, v5.4h, v14.4h\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"smlal2 v25.4s, v5.8h, v14.8h\n"
"ld1 {v12.8b}, [x13], %[input_depth]\n"
"smlal v24.4s, v6.4h, v15.4h\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"smlal2 v25.4s, v6.8h, v15.8h\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"smlal v26.4s, v6.4h, v17.4h\n"
"ld1 {v15.8b}, [x15], %[input_depth]\n"
"smlal2 v27.4s, v6.8h, v17.8h\n"
"smlal v24.4s, v7.4h, v16.4h\n"
"smlal2 v25.4s, v7.8h, v16.8h\n"
"ld1 {v16.8b}, [x15], %[input_depth]\n"
"smlal v24.4s, v8.4h, v17.4h\n"
"saddw v18.8h, v28.8h, v18.8b\n"
"smlal2 v25.4s, v8.8h, v17.8h\n"
"ld1 {v17.8b}, [x15], %[input_depth]\n"
"saddw v19.8h, v28.8h, v19.8b\n"
"smlal v26.4s, v1.4h, v18.4h\n"
"saddw v20.8h, v28.8h, v20.8b\n"
"smlal2 v27.4s, v1.8h, v18.8h\n"
"smlal v26.4s, v2.4h, v19.4h\n"
"saddw v21.8h, v28.8h, v21.8b\n"
"smlal2 v27.4s, v2.8h, v19.8h\n"
"smlal v26.4s, v4.4h, v20.4h\n"
"smlal v26.4s, v5.4h, v21.4h\n"
"smlal2 v27.4s, v4.8h, v20.8h\n"
"saddw v22.8h, v28.8h, v22.8b\n"
"smlal2 v27.4s, v5.8h, v21.8h\n"
"saddw v23.8h, v28.8h, v23.8b\n"
"smlal v26.4s, v7.4h, v22.4h\n"
"smlal2 v27.4s, v7.8h, v22.8h\n"
"smlal v26.4s, v8.4h, v23.4h\n"
"smlal2 v27.4s, v8.8h, v23.8h\n"
"ld1 {v28.4s, v29.4s}, [%[bias_ptr]]\n"
"scvtf v24.4s, v24.4s\n"
"scvtf v25.4s, v25.4s\n"
"scvtf v26.4s, v26.4s\n"
"scvtf v27.4s, v27.4s\n"
"fmul v24.4s, v24.4s, v30.4s\n"
"fmul v25.4s, v25.4s, v31.4s\n"
"fmul v26.4s, v26.4s, v30.4s\n"
"fmul v27.4s, v27.4s, v31.4s\n"
"fadd v24.4s, v24.4s, v28.4s\n"
"fadd v25.4s, v25.4s, v29.4s\n"
"fadd v26.4s, v26.4s, v28.4s\n"
"fadd v27.4s, v27.4s, v29.4s\n"
"dup v28.4s, w3\n"
"dup v29.4s, w2\n"
"fmax v24.4s, v24.4s, v28.4s\n"
"fmin v24.4s, v24.4s, v29.4s\n"
"fmax v25.4s, v25.4s, v28.4s\n"
"fmin v25.4s, v25.4s, v29.4s\n"
"fmax v26.4s, v26.4s, v28.4s\n"
"fmin v26.4s, v26.4s, v29.4s\n"
"fmax v27.4s, v27.4s, v28.4s\n"
"fmin v27.4s, v27.4s, v29.4s\n"
"dup v28.8h, w0\n"
"st1 {v24.4s, v25.4s}, [x6], x4\n"
"st1 {v26.4s, v27.4s}, [x6], x4\n"
"fcvtms v24.4s, v24.4s\n"
"fcvtms v25.4s, v25.4s\n"
"fcvtms v26.4s, v26.4s\n"
"fcvtms v27.4s, v27.4s\n"
"movi v25.4s, #0\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"movi v27.4s, #0\n"
"saddw v10.8h, v28.8h, v10.8b\n"
"saddw v11.8h, v28.8h, v11.8b\n"
"saddw v12.8h, v28.8h, v12.8b\n"
"saddw v13.8h, v28.8h, v13.8b\n"
"saddw v14.8h, v28.8h, v14.8b\n"
"movi v24.4s, #0\n"
"saddw v15.8h, v28.8h, v15.8b\n"
"movi v26.4s, #0\n"
"saddw v16.8h, v28.8h, v16.8b\n"
"saddw v17.8h, v28.8h, v17.8b\n"
"bge " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "b\n"
"cmp w14, #2\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n"
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER ":\n"
"smlal v24.4s, v0.4h, v9.4h\n"
"ld1 {v18.8b}, [x12], %[input_depth]\n"
"smlal2 v25.4s, v0.8h, v9.8h\n"
"ld1 {v19.8b}, [x12]\n"
"smlal v26.4s, v0.4h, v11.4h\n"
"ld1 {v20.8b}, [x13], %[input_depth]\n"
"smlal2 v27.4s, v0.8h, v11.8h\n"
"ld1 {v21.8b}, [x13]\n"
"smlal v24.4s, v1.4h, v10.4h\n"
"ld1 {v22.8b}, [x15], %[input_depth]\n"
"smlal2 v25.4s, v1.8h, v10.8h\n"
"ld1 {v23.8b}, [x15]\n"
"smlal v24.4s, v2.4h, v11.4h\n"
"smlal2 v25.4s, v2.8h, v11.8h\n"
"smlal v24.4s, v3.4h, v12.4h\n"
"smlal2 v25.4s, v3.8h, v12.8h\n"
"smlal v26.4s, v3.4h, v14.4h\n"
"smlal2 v27.4s, v3.8h, v14.8h\n"
"smlal v24.4s, v4.4h, v13.4h\n"
"smlal2 v25.4s, v4.8h, v13.8h\n"
"smlal v24.4s, v5.4h, v14.4h\n"
"smlal2 v25.4s, v5.8h, v14.8h\n"
"smlal v24.4s, v6.4h, v15.4h\n"
"smlal2 v25.4s, v6.8h, v15.8h\n"
"smlal v26.4s, v6.4h, v17.4h\n"
"smlal2 v27.4s, v6.8h, v17.8h\n"
"smlal v24.4s, v7.4h, v16.4h\n"
"smlal2 v25.4s, v7.8h, v16.8h\n"
"smlal v24.4s, v8.4h, v17.4h\n"
"saddw v18.8h, v28.8h, v18.8b\n"
"smlal2 v25.4s, v8.8h, v17.8h\n"
"saddw v19.8h, v28.8h, v19.8b\n"
"smlal v26.4s, v1.4h, v18.4h\n"
"saddw v20.8h, v28.8h, v20.8b\n"
"smlal2 v27.4s, v1.8h, v18.8h\n"
"smlal v26.4s, v2.4h, v19.4h\n"
"saddw v21.8h, v28.8h, v21.8b\n"
"smlal2 v27.4s, v2.8h, v19.8h\n"
"smlal v26.4s, v4.4h, v20.4h\n"
"smlal v26.4s, v5.4h, v21.4h\n"
"smlal2 v27.4s, v4.8h, v20.8h\n"
"saddw v22.8h, v28.8h, v22.8b\n"
"smlal2 v27.4s, v5.8h, v21.8h\n"
"saddw v23.8h, v28.8h, v23.8b\n"
"smlal v26.4s, v7.4h, v22.4h\n"
"smlal2 v27.4s, v7.8h, v22.8h\n"
"smlal v26.4s, v8.4h, v23.4h\n"
"smlal2 v27.4s, v8.8h, v23.8h\n"
"ld1 {v28.4s, v29.4s}, [%[bias_ptr]]\n"
"scvtf v24.4s, v24.4s\n"
"scvtf v25.4s, v25.4s\n"
"scvtf v26.4s, v26.4s\n"
"scvtf v27.4s, v27.4s\n"
"fmul v24.4s, v24.4s, v30.4s\n"
"fmul v25.4s, v25.4s, v31.4s\n"
"fmul v26.4s, v26.4s, v30.4s\n"
"fmul v27.4s, v27.4s, v31.4s\n"
"fadd v24.4s, v24.4s, v28.4s\n"
"fadd v25.4s, v25.4s, v29.4s\n"
"fadd v26.4s, v26.4s, v28.4s\n"
"fadd v27.4s, v27.4s, v29.4s\n"
"dup v28.4s, w3\n"
"dup v29.4s, w2\n"
"fmax v24.4s, v24.4s, v28.4s\n"
"fmin v24.4s, v24.4s, v29.4s\n"
"fmax v25.4s, v25.4s, v28.4s\n"
"fmin v25.4s, v25.4s, v29.4s\n"
"fmax v26.4s, v26.4s, v28.4s\n"
"fmin v26.4s, v26.4s, v29.4s\n"
"fmax v27.4s, v27.4s, v28.4s\n"
"fmin v27.4s, v27.4s, v29.4s\n"
"dup v28.8h, w0\n"
"st1 {v24.4s, v25.4s}, [x6], x4\n"
"st1 {v26.4s, v27.4s}, [x6]\n"
"fcvtms v24.4s, v24.4s\n"
"fcvtms v25.4s, v25.4s\n"
"fcvtms v26.4s, v26.4s\n"
"fcvtms v27.4s, v27.4s\n"
"b " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER ":\n"
"dup v29.8h, w2\n"
"smlal v24.4s, v0.4h, v9.4h\n"
"smlal2 v25.4s, v0.8h, v9.8h\n"
"smlal v24.4s, v1.4h, v10.4h\n"
"smlal2 v25.4s, v1.8h, v10.8h\n"
"smlal v24.4s, v2.4h, v11.4h\n"
"smlal2 v25.4s, v2.8h, v11.8h\n"
"smlal v24.4s, v3.4h, v12.4h\n"
"smlal2 v25.4s, v3.8h, v12.8h\n"
"smlal v24.4s, v4.4h, v13.4h\n"
"smlal2 v25.4s, v4.8h, v13.8h\n"
"smlal v24.4s, v5.4h, v14.4h\n"
"smlal2 v25.4s, v5.8h, v14.8h\n"
"smlal v24.4s, v6.4h, v15.4h\n"
"smlal2 v25.4s, v6.8h, v15.8h\n"
"smlal v24.4s, v7.4h, v16.4h\n"
"smlal2 v25.4s, v7.8h, v16.8h\n"
"smlal v24.4s, v8.4h, v17.4h\n"
"smlal2 v25.4s, v8.8h, v17.8h\n"
"ld1 {v26.4s, v27.4s}, [%[bias_ptr]]\n"
"scvtf v24.4s, v24.4s\n"
"scvtf v25.4s, v25.4s\n"
"fmul v24.4s, v24.4s, v30.4s\n"
"fmul v25.4s, v25.4s, v31.4s\n"
"fadd v24.4s, v24.4s, v26.4s\n"
"fadd v25.4s, v25.4s, v27.4s\n"
"dup v26.4s, w3\n"
"dup v27.4s, w2\n"
"fmax v24.4s, v24.4s, v26.4s\n"
"fmin v24.4s, v24.4s, v27.4s\n"
"fmax v25.4s, v25.4s, v26.4s\n"
"fmin v25.4s, v25.4s, v27.4s\n"
"st1 {v24.4s, v25.4s}, [x6]\n"
"fcvtms v24.4s, v24.4s\n"
"fcvtms v25.4s, v25.4s\n"
DEPTHWISECONV_LABEL_HEIGHT_1_END ":\n"
:
[filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
[output_ptr] "+r"(output_ptr),
[output_window_height] "+r"(output_window_height)
:
[input_scale] "r"(input_scale),
[bias_ptr] "r"(bias_ptr), [input_row_size] "r"(input_row_size),
[input_depth] "r"(input_depth),
[output_window_width] "r"(output_window_width),
[input_width_increment] "r"(input_width_increment),
[input_height_increment] "r"(input_height_increment),
[output_height_increment] "r"(output_height_increment),
[per_channel_scales] "r"(per_channel_scales),
[params_ptr] "r"(params_ptr)
:
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
"v30", "v31",
"x0", "x2", "x3", "x4", "x5", "x6", "x7",
"x10", "x11", "x12", "x13", "x14", "x15",
"x19", "x20");
#undef DEPTHWISECONV_LABEL_HEIGHT_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_1
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_1_END
}
};
template <>
struct DepthwiseConvHybridPartialPerChannel<
DepthwiseConvOutputRounding::kUpward, EdgeType::kCenter, 1, 1> {
static inline void Run(const float* input_scale, const int8* input_ptr,
const int8* filter_ptr, const float* bias_ptr,
float* output_ptr, const float* per_channel_scales,
const DepthwiseConvParams* params_ptr) {
TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP …
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP …
asm volatile(
"ld1 {v8.8b}, [%[input_ptr]], #8\n"
"ldr w9, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
"ldr x11, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
"ld1 {v0.8b}, [%[filter_ptr]], #8\n"
"dup v26.8h, w9\n"
"ldr w9, [%[input_scale]]\n"
"cmp x11, #16\n"
"dup v28.4s, w9\n"
"ldr w9, [%[params_ptr], #" STR(OFFSET_FLOAT_OUTPUT_ACTIVATION_MIN) "]\n"
"ldr w10, [%[params_ptr], #" STR(OFFSET_FLOAT_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v30.4s, w9\n"
"dup v31.4s, w10\n"
"movi v16.4s, #0\n"
"saddw v8.8h, v26.8h, v8.8b\n"
"movi v17.4s, #0\n"
"sshll v0.8h, v0.8b, #0\n"
"ld1 {v6.4s}, [%[per_channel_scales]], #16\n"
"fmul v6.4s, v6.4s, v28.4s\n"
"ld1 {v10.4s}, [%[bias_ptr]], #16\n"
"ld1 {v7.4s}, [%[per_channel_scales]], #16\n"
"fmul v7.4s, v7.4s, v28.4s\n"
"ld1 {v11.4s}, [%[bias_ptr]], #16\n"
"blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"subs x11, x11, #8\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"ld1 {v8.8b}, [%[input_ptr]], #8\n"
"cmp x11, #16\n"
"ld1 {v0.8b}, [%[filter_ptr]], #8\n"
"scvtf v16.4s, v16.4s\n"
"scvtf v17.4s, v17.4s\n"
"fmul v16.4s, v16.4s, v6.4s\n"
"fmul v17.4s, v17.4s, v7.4s\n"
"fadd v16.4s, v16.4s, v10.4s\n"
"fadd v17.4s, v17.4s, v11.4s\n"
"fmax v16.4s, v16.4s, v30.4s\n"
"fmin v16.4s, v16.4s, v31.4s\n"
"fmax v17.4s, v17.4s, v30.4s\n"
"fmin v17.4s, v17.4s, v31.4s\n"
"st1 {v16.4s, v17.4s}, [%[output_ptr]], #32\n"
"fcvtms v16.4s, v16.4s\n"
"fcvtms v17.4s, v17.4s\n"
"saddw v8.8h, v26.8h, v8.8b\n"
"movi v16.4s, #0\n"
"sshll v0.8h, v0.8b, #0\n"
"movi v17.4s, #0\n"
"ld1 {v6.4s}, [%[per_channel_scales]], #16\n"
"ld1 {v10.4s}, [%[bias_ptr]], #16\n"
"ld1 {v7.4s}, [%[per_channel_scales]], #16\n"
"ld1 {v11.4s}, [%[bias_ptr]], #16\n"
"fmul v6.4s, v6.4s, v28.4s\n"
"fmul v7.4s, v7.4s, v28.4s\n"
"bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"
DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"scvtf v16.4s, v16.4s\n"
"scvtf v17.4s, v17.4s\n"
"fmul v16.4s, v16.4s, v6.4s\n"
"fmul v17.4s, v17.4s, v7.4s\n"
"fadd v16.4s, v16.4s, v10.4s\n"
"fadd v17.4s, v17.4s, v11.4s\n"
"fmax v16.4s, v16.4s, v30.4s\n"
"fmin v16.4s, v16.4s, v31.4s\n"
"fmax v17.4s, v17.4s, v30.4s\n"
"fmin v17.4s, v17.4s, v31.4s\n"
"st1 {v16.4s, v17.4s}, [%[output_ptr]]\n"
"fcvtms v16.4s, v16.4s\n"
"fcvtms v17.4s, v17.4s\n"
:
[filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
[output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr),
[per_channel_scales] "+r"(per_channel_scales)
:
[params_ptr] "r"(params_ptr), [input_scale] "r"(input_scale)
:
"cc", "memory",
"v0", "v6", "v7", "v8", "v10", "v11", "v16", "v17", "v18", "v19",
"v26", "v28", "v30", "v31",
"x9", "x10", "x11");
#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP
}
};
template <>
struct DepthwiseConvHybridPartialPerChannel<
DepthwiseConvOutputRounding::kUpward, EdgeType::kCorner, 1, 1> {
static inline void Run(const float* input_scale, const int8* input_ptr,
const int8* filter_ptr, const float* bias_ptr,
float* output_ptr, const float* per_channel_scales,
const DepthwiseConvParams* params_ptr) {
TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP …
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP …
asm volatile(
"ldr x15, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
"ldr x9, [%[params_ptr], #" STR(OFFSET_INPUT_ROW_SIZE) "]\n"
"cmp x15, #16\n"
"add x12, %[input_ptr], x15\n"
"add x13, %[input_ptr], x9\n"
"ld1 {v8.8b}, [%[input_ptr]], #8\n"
"add x14, x13, x15\n"
"ld1 {v9.8b}, [x12], #8\n"
"ldr x6, [%[params_ptr], #" STR(OFFSET_FILTER_ROW_SIZE) "]\n"
"add x9, %[filter_ptr], x15\n"
"ld1 {v10.8b}, [x13], #8\n"
"add x10, %[filter_ptr], x6\n"
"ld1 {v11.8b}, [x14], #8\n"
"ld1 {v0.8b}, [%[filter_ptr]], #8\n"
"add x11, x10, x15\n"
"ld1 {v1.8b}, [x9], #8\n"
"ld1 {v2.8b}, [x10], #8\n"
"ld1 {v3.8b}, [x11], #8\n"
"ldr w6, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
"dup v26.8h, w6\n"
"ldr w6, [%[input_scale]]\n"
"dup v28.4s, w6\n"
"ldr w6, [%[params_ptr], #" STR(OFFSET_FLOAT_OUTPUT_ACTIVATION_MIN) "]\n"
"ldr w7, [%[params_ptr], #" STR(OFFSET_FLOAT_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v30.4s, w6\n"
"dup v31.4s, w7\n"
"ld1 {v4.4s}, [%[bias_ptr]], #16\n"
"ld1 {v6.4s}, [%[per_channel_scales]], #16\n"
"ld1 {v5.4s}, [%[bias_ptr]], #16\n"
"ld1 {v7.4s}, [%[per_channel_scales]], #16\n"
"fmul v6.4s, v6.4s, v28.4s\n"
"fmul v7.4s, v7.4s, v28.4s\n"
"saddw v8.8h, v26.8h, v8.8b\n"
"movi v16.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"movi v17.4s, #0\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"sshll v0.8h, v0.8b, #0\n"
"sshll v1.8h, v1.8b, #0\n"
"sshll v2.8h, v2.8b, #0\n"
"sshll v3.8h, v3.8b, #0\n"
"blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"subs x15, x15, #8\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"ld1 {v8.8b}, [%[input_ptr]], #8\n"
"cmp x15, #16\n"
"ld1 {v0.8b}, [%[filter_ptr]], #8\n"
"smlal v16.4s, v1.4h, v9.4h\n"
"smlal2 v17.4s, v1.8h, v9.8h\n"
"ld1 {v9.8b}, [x12], #8\n"
"smlal v16.4s, v2.4h, v10.4h\n"
"ld1 {v1.8b}, [x9], #8\n"
"smlal2 v17.4s, v2.8h, v10.8h\n"
"ld1 {v10.8b}, [x13], #8\n"
"smlal v16.4s, v3.4h, v11.4h\n"
"ld1 {v2.8b}, [x10], #8\n"
"smlal2 v17.4s, v3.8h, v11.8h\n"
"ld1 {v11.8b}, [x14], #8\n"
"ld1 {v3.8b}, [x11], #8\n"
"scvtf v16.4s, v16.4s\n"
"scvtf v17.4s, v17.4s\n"
"fmul v16.4s, v16.4s, v6.4s\n"
"fmul v17.4s, v17.4s, v7.4s\n"
"fadd v16.4s, v16.4s, v4.4s\n"
"fadd v17.4s, v17.4s, v5.4s\n"
"fmax v16.4s, v16.4s, v30.4s\n"
"fmin v16.4s, v16.4s, v31.4s\n"
"fmax v17.4s, v17.4s, v30.4s\n"
"fmin v17.4s, v17.4s, v31.4s\n"
"st1 {v16.4s, v17.4s}, [%[output_ptr]], #32\n"
"fcvtms v16.4s, v16.4s\n"
"fcvtms v17.4s, v17.4s\n"
"saddw v8.8h, v26.8h, v8.8b\n"
"movi v16.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"movi v17.4s, #0\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"sshll v0.8h, v0.8b, #0\n"
"sshll v1.8h, v1.8b, #0\n"
"sshll v2.8h, v2.8b, #0\n"
"sshll v3.8h, v3.8b, #0\n"
"ld1 {v4.4s}, [%[bias_ptr]], #16\n"
"ld1 {v6.4s}, [%[per_channel_scales]], #16\n"
"ld1 {v5.4s}, [%[bias_ptr]], #16\n"
"ld1 {v7.4s}, [%[per_channel_scales]], #16\n"
"fmul v6.4s, v6.4s, v28.4s\n"
"fmul v7.4s, v7.4s, v28.4s\n"
"bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"
DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"smlal v16.4s, v1.4h, v9.4h\n"
"smlal2 v17.4s, v1.8h, v9.8h\n"
"smlal v16.4s, v2.4h, v10.4h\n"
"smlal2 v17.4s, v2.8h, v10.8h\n"
"smlal v16.4s, v3.4h, v11.4h\n"
"smlal2 v17.4s, v3.8h, v11.8h\n"
"scvtf v16.4s, v16.4s\n"
"scvtf v17.4s, v17.4s\n"
"fmul v16.4s, v16.4s, v6.4s\n"
"fmul v17.4s, v17.4s, v7.4s\n"
"fadd v16.4s, v16.4s, v4.4s\n"
"fadd v17.4s, v17.4s, v5.4s\n"
"fmax v16.4s, v16.4s, v30.4s\n"
"fmin v16.4s, v16.4s, v31.4s\n"
"fmax v17.4s, v17.4s, v30.4s\n"
"fmin v17.4s, v17.4s, v31.4s\n"
"st1 {v16.4s, v17.4s}, [%[output_ptr]]\n"
"fcvtms v16.4s, v16.4s\n"
"fcvtms v17.4s, v17.4s\n"
:
[filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
[output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr),
[per_channel_scales] "+r"(per_channel_scales)
:
[input_scale] "r"(input_scale),
[params_ptr] "r"(params_ptr)
:
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "v16", "v17","v18", "v19", "v26", "v28", "v30", "v31",
"x6", "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP
}
};
template <>
struct DepthwiseConvHybridPartialPerChannel<
DepthwiseConvOutputRounding::kUpward, EdgeType::kHorizontal, 1, 1> {
static inline void Run(const float* input_scale, const int8* input_ptr,
const int8* filter_ptr, const float* bias_ptr,
float* output_ptr, const float* per_channel_scales,
const DepthwiseConvParams* params_ptr) {
TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP …
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP …
asm volatile(
"ldr x7, [%[params_ptr], #" STR(OFFSET_INPUT_DEPTH) "]\n"
"mov x12, %[input_ptr]\n"
"ldr x11, [%[params_ptr], #" STR(OFFSET_INPUT_ROW_SIZE) "]\n"
"mov x9, %[filter_ptr]\n"
"ldr x14, [%[params_ptr], #" STR(OFFSET_FILTER_ROW_SIZE) "]\n"
"add x13, x12, x11\n"
"ldr x15, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
"ld1 {v8.8b}, [x12], x7\n"
"add x10, x9, x14\n"
"ld1 {v9.8b}, [x12], x7\n"
"cmp x15, #16\n"
"ld1 {v10.8b}, [x12]\n"
"add %[input_ptr], %[input_ptr], #8\n"
"ld1 {v11.8b}, [x13], x7\n"
"add %[filter_ptr], %[filter_ptr], #8\n"
"ld1 {v12.8b}, [x13], x7\n"
"ld1 {v13.8b}, [x13]\n"
"ld1 {v0.8b}, [x9], x7\n"
"ld1 {v1.8b}, [x9], x7\n"
"ld1 {v2.8b}, [x9]\n"
"ld1 {v3.8b}, [x10], x7\n"
"ld1 {v4.8b}, [x10], x7\n"
"ld1 {v5.8b}, [x10]\n"
"ldr w12, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
"dup v26.8h, w12\n"
"ldr w12, [%[input_scale]]\n"
"dup v28.4s, w12\n"
"ldr w12, [%[params_ptr], #" STR(OFFSET_FLOAT_OUTPUT_ACTIVATION_MIN) "]\n"
"ldr w13, [%[params_ptr], #" STR(OFFSET_FLOAT_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v30.4s, w12\n"
"dup v31.4s, w13\n"
"ld1 {v6.4s}, [%[bias_ptr]], #16\n"
"ld1 {v14.4s}, [%[per_channel_scales]], #16\n"
"fmul v14.4s, v14.4s, v28.4s\n"
"ld1 {v7.4s}, [%[bias_ptr]], #16\n"
"ld1 {v15.4s}, [%[per_channel_scales]], #16\n"
"fmul v15.4s, v15.4s, v28.4s\n"
"saddw v8.8h, v26.8h, v8.8b\n"
"movi v16.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"movi v17.4s, #0\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"saddw v13.8h, v26.8h, v13.8b\n"
"sshll v0.8h, v0.8b, #0\n"
"sshll v1.8h, v1.8b, #0\n"
"sshll v2.8h, v2.8b, #0\n"
"sshll v3.8h, v3.8b, #0\n"
"sshll v4.8h, v4.8b, #0\n"
"sshll v5.8h, v5.8b, #0\n"
"blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n"
"mov x12, %[input_ptr]\n"
"subs x15, x15, #8\n"
"add x13, x12, x11\n"
"cmp x15, #16\n"
"add %[input_ptr], %[input_ptr], #8\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"mov x9, %[filter_ptr]\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"ld1 {v8.8b}, [x12], x7\n"
"smlal v16.4s, v1.4h, v9.4h\n"
"add x10, x9, x14\n"
"smlal2 v17.4s, v1.8h, v9.8h\n"
"ld1 {v9.8b}, [x12], x7\n"
"smlal v16.4s, v2.4h, v10.4h\n"
"add %[filter_ptr], %[filter_ptr], #8\n"
"smlal2 v17.4s, v2.8h, v10.8h\n"
"ld1 {v10.8b}, [x12]\n"
"smlal v16.4s, v3.4h, v11.4h\n"
"ld1 {v0.8b}, [x9], x7\n"
"smlal2 v17.4s, v3.8h, v11.8h\n"
"ld1 {v11.8b}, [x13], x7\n"
"smlal v16.4s, v4.4h, v12.4h\n"
"ld1 {v1.8b}, [x9], x7\n"
"smlal2 v17.4s, v4.8h, v12.8h\n"
"ld1 {v12.8b}, [x13], x7\n"
"smlal v16.4s, v5.4h, v13.4h\n"
"ld1 {v2.8b}, [x9]\n"
"smlal2 v17.4s, v5.8h, v13.8h\n"
"ld1 {v13.8b}, [x13]\n"
"scvtf v16.4s, v16.4s\n"
"fmul v16.4s, v16.4s, v14.4s\n"
"ld1 {v3.8b}, [x10], x7\n"
"scvtf v17.4s, v17.4s\n"
"fmul v17.4s, v17.4s, v15.4s\n"
"ld1 {v4.8b}, [x10], x7\n"
"fadd v16.4s, v16.4s, v6.4s\n"
"ld1 {v5.8b}, [x10]\n"
"fadd v17.4s, v17.4s, v7.4s\n"
"fmax v16.4s, v16.4s, v30.4s\n"
"fmin v16.4s, v16.4s, v31.4s\n"
"fmax v17.4s, v17.4s, v30.4s\n"
"fmin v17.4s, v17.4s, v31.4s\n"
"saddw v8.8h, v26.8h, v8.8b\n"
"st1 {v16.4s, v17.4s}, [%[output_ptr]], #32\n"
"fcvtms v16.4s, v16.4s\n"
"fcvtms v17.4s, v17.4s\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"saddw v13.8h, v26.8h, v13.8b\n"
"sshll v0.8h, v0.8b, #0\n"
"sshll v1.8h, v1.8b, #0\n"
"sshll v2.8h, v2.8b, #0\n"
"movi v16.4s, #0\n"
"sshll v3.8h, v3.8b, #0\n"
"movi v17.4s, #0\n"
"sshll v4.8h, v4.8b, #0\n"
"sshll v5.8h, v5.8b, #0\n"
"ld1 {v6.4s}, [%[bias_ptr]], #16\n"
"ld1 {v14.4s}, [%[per_channel_scales]], #16\n"
"ld1 {v7.4s}, [%[bias_ptr]], #16\n"
"ld1 {v15.4s}, [%[per_channel_scales]], #16\n"
"fmul v14.4s, v14.4s, v28.4s\n"
"fmul v15.4s, v15.4s, v28.4s\n"
"bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"
DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"smlal v16.4s, v1.4h, v9.4h\n"
"smlal2 v17.4s, v1.8h, v9.8h\n"
"smlal v16.4s, v2.4h, v10.4h\n"
"smlal2 v17.4s, v2.8h, v10.8h\n"
"smlal v16.4s, v3.4h, v11.4h\n"
"smlal2 v17.4s, v3.8h, v11.8h\n"
"smlal v16.4s, v4.4h, v12.4h\n"
"smlal2 v17.4s, v4.8h, v12.8h\n"
"smlal v16.4s, v5.4h, v13.4h\n"
"smlal2 v17.4s, v5.8h, v13.8h\n"
"scvtf v16.4s, v16.4s\n"
"scvtf v17.4s, v17.4s\n"
"fmul v16.4s, v16.4s, v14.4s\n"
"fmul v17.4s, v17.4s, v15.4s\n"
"fadd v16.4s, v16.4s, v6.4s\n"
"fadd v17.4s, v17.4s, v7.4s\n"
"fmax v16.4s, v16.4s, v30.4s\n"
"fmin v16.4s, v16.4s, v31.4s\n"
"fmax v17.4s, v17.4s, v30.4s\n"
"fmin v17.4s, v17.4s, v31.4s\n"
"st1 {v16.4s, v17.4s}, [%[output_ptr]]\n"
"fcvtms v16.4s, v16.4s\n"
"fcvtms v17.4s, v17.4s\n"
:
[filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
[output_ptr] "+r"(output_ptr),
[per_channel_scales] "+r"(per_channel_scales),
[bias_ptr] "+r"(bias_ptr)
:
[input_scale] "r"(input_scale), [params_ptr] "r"(params_ptr)
:
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v26", "v28", "v30", "v31",
"x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP
}
};
template <>
struct DepthwiseConvHybridPartialPerChannel<
DepthwiseConvOutputRounding::kUpward, EdgeType::kVertical, 1, 1> {
static inline void Run(const float* input_scale, const int8* input_ptr,
const int8* filter_ptr, const float* bias_ptr,
float* output_ptr, const float* per_channel_scales,
const DepthwiseConvParams* params_ptr) {
TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP …
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP …
asm volatile(
"ldr x6, [%[params_ptr], #" STR(OFFSET_INPUT_DEPTH) "]\n"
"mov x12, %[input_ptr]\n"
"ldr x11, [%[params_ptr], #" STR(OFFSET_INPUT_ROW_SIZE) "]\n"
"mov x7, %[filter_ptr]\n"
"ldr x5, [%[params_ptr], #" STR(OFFSET_FILTER_ROW_SIZE) "]\n"
"add x13, x12, x11\n"
"ldr x15, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
"add x14, x13, x11\n"
"ld1 {v8.8b}, [x12], x6\n"
"add x9, x7, x5\n"
"ld1 {v9.8b}, [x12]\n"
"cmp x15, #16\n"
"add x10, x9, x5\n"
"ld1 {v10.8b}, [x13], x6\n"
"add %[input_ptr], %[input_ptr], #8\n"
"ld1 {v11.8b}, [x13]\n"
"add %[filter_ptr], %[filter_ptr], #8\n"
"ld1 {v12.8b}, [x14], x6\n"
"ld1 {v13.8b}, [x14]\n"
"ld1 {v0.8b}, [x7], x6\n"
"ld1 {v1.8b}, [x7]\n"
"ld1 {v2.8b}, [x9], x6\n"
"ld1 {v3.8b}, [x9]\n"
"ld1 {v4.8b}, [x10], x6\n"
"ld1 {v5.8b}, [x10]\n"
"ldr w12, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
"dup v26.8h, w12\n"
"ldr w12, [%[input_scale]]\n"
"dup v28.4s, w12\n"
"ldr w12, [%[params_ptr], #" STR(OFFSET_FLOAT_OUTPUT_ACTIVATION_MIN) "]\n"
"ldr w13, [%[params_ptr], #" STR(OFFSET_FLOAT_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v30.4s, w12\n"
"dup v31.4s, w13\n"
"ld1 {v6.4s}, [%[bias_ptr]], #16\n"
"ld1 {v14.4s}, [%[per_channel_scales]], #16\n"
"ld1 {v7.4s}, [%[bias_ptr]], #16\n"
"ld1 {v15.4s}, [%[per_channel_scales]], #16\n"
"fmul v14.4s, v14.4s, v28.4s\n"
"fmul v15.4s, v15.4s, v28.4s\n"
"saddw v8.8h, v26.8h, v8.8b\n"
"movi v16.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"movi v17.4s, #0\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"saddw v13.8h, v26.8h, v13.8b\n"
"sshll v0.8h, v0.8b, #0\n"
"sshll v1.8h, v1.8b, #0\n"
"sshll v2.8h, v2.8b, #0\n"
"sshll v3.8h, v3.8b, #0\n"
"sshll v4.8h, v4.8b, #0\n"
"sshll v5.8h, v5.8b, #0\n"
"blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n"
"mov x12, %[input_ptr]\n"
"subs x15, x15, #8\n"
"add x13, x12, x11\n"
"cmp x15, #16\n"
"add x14, x13, x11\n"
"add %[input_ptr], %[input_ptr], #8\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"mov x7, %[filter_ptr]\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"ld1 {v8.8b}, [x12], x6\n"
"smlal v16.4s, v1.4h, v9.4h\n"
"add x9, x7, x5\n"
"smlal2 v17.4s, v1.8h, v9.8h\n"
"add x10, x9, x5\n"
"ld1 {v9.8b}, [x12]\n"
"smlal v16.4s, v2.4h, v10.4h\n"
"add %[filter_ptr], %[filter_ptr], #8\n"
"smlal2 v17.4s, v2.8h, v10.8h\n"
"ld1 {v10.8b}, [x13], x6\n"
"smlal v16.4s, v3.4h, v11.4h\n"
"ld1 {v0.8b}, [x7], x6\n"
"smlal2 v17.4s, v3.8h, v11.8h\n"
"ld1 {v11.8b}, [x13]\n"
"smlal v16.4s, v4.4h, v12.4h\n"
"ld1 {v1.8b}, [x7]\n"
"smlal2 v17.4s, v4.8h, v12.8h\n"
"ld1 {v12.8b}, [x14], x6\n"
"smlal v16.4s, v5.4h, v13.4h\n"
"ld1 {v2.8b}, [x9], x6\n"
"smlal2 v17.4s, v5.8h, v13.8h\n"
"ld1 {v13.8b}, [x14]\n"
"scvtf v16.4s, v16.4s\n"
"fmul v16.4s, v16.4s, v14.4s\n"
"ld1 {v3.8b}, [x9]\n"
"scvtf v17.4s, v17.4s\n"
"fmul v17.4s, v17.4s, v15.4s\n"
"ld1 {v4.8b}, [x10], x6\n"
"fadd v16.4s, v16.4s, v6.4s\n"
"ld1 {v5.8b}, [x10]\n"
"fadd v17.4s, v17.4s, v7.4s\n"
"fmax v16.4s, v16.4s, v30.4s\n"
"fmin v16.4s, v16.4s, v31.4s\n"
"fmax v17.4s, v17.4s, v30.4s\n"
"fmin v17.4s, v17.4s, v31.4s\n"
"st1 {v16.4s, v17.4s}, [%[output_ptr]], #32\n"
"fcvtms v16.4s, v16.4s\n"
"fcvtms v17.4s, v17.4s\n"
"saddw v8.8h, v26.8h, v8.8b\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"saddw v13.8h, v26.8h, v13.8b\n"
"sshll v0.8h, v0.8b, #0\n"
"sshll v1.8h, v1.8b, #0\n"
"sshll v2.8h, v2.8b, #0\n"
"movi v16.4s, #0\n"
"sshll v3.8h, v3.8b, #0\n"
"movi v17.4s, #0\n"
"sshll v4.8h, v4.8b, #0\n"
"sshll v5.8h, v5.8b, #0\n"
"ld1 {v6.4s}, [%[bias_ptr]], #16\n"
"ld1 {v14.4s}, [%[per_channel_scales]], #16\n"
"ld1 {v7.4s}, [%[bias_ptr]], #16\n"
"ld1 {v15.4s}, [%[per_channel_scales]], #16\n"
"fmul v14.4s, v14.4s, v28.4s\n"
"fmul v15.4s, v15.4s, v28.4s\n"
"bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"
DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n"
"smlal v16.4s, v0.4h, v8.4h\n"
"smlal2 v17.4s, v0.8h, v8.8h\n"
"smlal v16.4s, v1.4h, v9.4h\n"
"smlal2 v17.4s, v1.8h, v9.8h\n"
"smlal v16.4s, v2.4h, v10.4h\n"
"smlal2 v17.4s, v2.8h, v10.8h\n"
"smlal v16.4s, v3.4h, v11.4h\n"
"smlal2 v17.4s, v3.8h, v11.8h\n"
"smlal v16.4s, v4.4h, v12.4h\n"
"smlal2 v17.4s, v4.8h, v12.8h\n"
"smlal v16.4s, v5.4h, v13.4h\n"
"smlal2 v17.4s, v5.8h, v13.8h\n"
"scvtf v16.4s, v16.4s\n"
"scvtf v17.4s, v17.4s\n"
"fmul v16.4s, v16.4s, v14.4s\n"
"fmul v17.4s, v17.4s, v15.4s\n"
"fadd v16.4s, v16.4s, v6.4s\n"
"fadd v17.4s, v17.4s, v7.4s\n"
"fmax v16.4s, v16.4s, v30.4s\n"
"fmin v16.4s, v16.4s, v31.4s\n"
"fmax v17.4s, v17.4s, v30.4s\n"
"fmin v17.4s, v17.4s, v31.4s\n"
"st1 {v16.4s, v17.4s}, [%[output_ptr]]\n"
"fcvtms v16.4s, v16.4s\n"
"fcvtms v17.4s, v17.4s\n"
:
[filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
[output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr),
[per_channel_scales] "+r"(per_channel_scales)
:
[input_scale] "r"(input_scale),
[params_ptr] "r"(params_ptr)
:
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v26", "v28", "v30", "v31",
"x5", "x6", "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP
}
};
#undef OFFSET_INPUT_DEPTH
#undef OFFSET_INPUT_ROW_SIZE
#undef OFFSET_OUTPUT_DEPTH
#undef OFFSET_OUTPUT_ROW_SIZE
#undef OFFSET_INPUT_OFFSET
#undef OFFSET_OUTPUT_OFFSET
#undef OFFSET_OUTPUT_MULTIPLIER
#undef OFFSET_OUTPUT_ACTIVATION_MIN
#undef OFFSET_OUTPUT_ACTIVATION_MAX
#undef OFFSET_OUTPUT_RIGHT_SHIFT
#undef OFFSET_INPUT_WIDTH
#undef OFFSET_INPUT_HEIGHT
#undef OFFSET_OUTPUT_WIDTH
#undef OFFSET_OUTPUT_HEIGHT
#undef OFFSET_OUTPUT_FLOAT_ACTIVATION_MIN
#undef OFFSET_OUTPUT_FLOAT_ACTIVATION_MAX
template <DepthwiseConvOutputRounding output_rounding, int32 kStrideWidth,
int32 kStrideHeight>
struct DepthwiseConvHybridThroughDepthPerChannel {
static void __attribute__((noinline))
Run(const float* input_scale, const int8* input_ptr, const int8* filter_ptr,
const float* bias_ptr, float* output_ptr, int64_t start_depth,
int64_t end_depth, int64_t input_depth, int64_t input_row_size,
int32 output_window_height, int32 output_window_width,
const float* per_channel_scales, const DepthwiseConvParams& params) {
for (; start_depth <= end_depth - 8; start_depth += 8) {
DepthwiseConvHybridWindowPerChannel<output_rounding, 8, kStrideWidth,
kStrideHeight>::Run(input_scale,
input_ptr, filter_ptr,
bias_ptr, output_ptr,
input_depth,
input_row_size,
output_window_height,
output_window_width,
per_channel_scales,
¶ms);
input_ptr += 8;
output_ptr += 8;
filter_ptr += 8;
bias_ptr += 8;
per_channel_scales += 8;
}
}
};
template <DepthwiseConvOutputRounding output_rounding, int32 kStrideWidth,
int32 kStrideHeight>
struct DepthwiseConvHybridMultiRowPerChannel {
using ConvKernel =
DepthwiseConvHybridThroughDepthPerChannel<output_rounding, kStrideWidth,
kStrideHeight>;
static inline void Run(const float* input_scale, const int8* input_data,
int32 start_x, int32 end_x, const int8* filter_data,
const float* bias_data, float* output_data,
const float* per_channel_scales,
const DepthwiseConvParams& params,
const ShuffleParams& shuffle_params,
int8* shuffle_workspace) {
TFLITE_DCHECK(
shuffle_params.input_height ==
get_shuffle_input_size(kStrideHeight, shuffle_params.output_height));
TFLITE_DCHECK(
shuffle_params.input_width ==
get_shuffle_input_size(kStrideWidth, shuffle_params.output_width));
TFLITE_DCHECK_LE(
64 * shuffle_params.input_width * shuffle_params.input_height,
kDepthwiseConvScratchWorkspaceSize);
int32 out_x = start_x;
if (params.output_depth > 64 ||
(params.output_depth <= 64 && params.input_width > 150)) {
for (; out_x <= (end_x - shuffle_params.output_width);
out_x += shuffle_params.output_width) {
const int8* input_ptr = input_data;
const float* bias_ptr = bias_data;
const int8* filter_ptr = filter_data;
const float* per_channel_scales_ptr = per_channel_scales;
float* output_ptr = output_data;
int64_t depth = 0;
const int64_t shuffle_row_size = 64 * shuffle_params.input_width;
for (; depth <= params.output_depth - 64; depth += 64) {
const int8* h_ptr = input_ptr;
for (int32 i = 0; i < shuffle_params.input_height; i++) {
const int8* ptr = h_ptr;
for (int32 j = 0; j < shuffle_params.input_width; j++) {
optimized_ops_preload_l1_keep(ptr);
ptr += params.input_depth;
}
h_ptr += params.input_row_size;
}
ShuffleInput(input_ptr, params.input_depth, params.input_width,
params.input_height, 64, shuffle_params.input_width,
shuffle_params.input_height, shuffle_workspace);
ConvKernel::Run(input_scale,
shuffle_workspace, filter_ptr, bias_ptr, output_ptr,
0, 64, 64, shuffle_row_size,
shuffle_params.output_height,
shuffle_params.output_width, per_channel_scales_ptr,
params);
input_ptr += 64;
output_ptr += 64;
filter_ptr += 64;
bias_ptr += 64;
per_channel_scales_ptr += 64;
}
const int8* h_ptr = input_ptr;
for (int32 i = 0; i < shuffle_params.input_height; i++) {
const int8* ptr = h_ptr;
for (int32 j = 0; j < shuffle_params.input_width; j++) {
optimized_ops_preload_l1_keep(ptr);
ptr += params.input_depth;
}
h_ptr += params.input_row_size;
}
ConvKernel::Run(input_scale, input_ptr,
filter_ptr, bias_ptr, output_ptr, depth,
params.output_depth, params.input_depth,
params.input_row_size, shuffle_params.output_height,
shuffle_params.output_width, per_channel_scales_ptr,
params);
input_data +=
shuffle_params.output_width * kStrideWidth * params.input_depth;
output_data += shuffle_params.output_width * params.output_depth;
}
}
const int32 output_leftover_width = end_x - out_x;
if (output_leftover_width > 0) {
ConvKernel::Run(input_scale, input_data, filter_data,
bias_data, output_data, 0, params.output_depth,
params.input_depth, params.input_row_size,
shuffle_params.output_height, output_leftover_width,
per_channel_scales, params);
}
}
};
template <DepthwiseConvOutputRounding output_rounding>
inline void DepthwiseConvHybridHandlePaddingPerChannel(
const float* input_scale, const int8* input_data,
const int8* filter_data, const float* bias_data, float* output_data,
const float* per_channel_scales, const DepthwiseConvParams& params) {
if (params.input_width == 1 && params.input_height == 1) {
const int8* filter_ptr =
filter_data + params.filter_row_size + params.output_depth;
DepthwiseConvHybridPartialPerChannel<output_rounding, EdgeType::kCenter, 1,
1>::Run(input_scale, input_data,
filter_ptr, bias_data, output_data,
per_channel_scales, ¶ms);
return;
}
const int32 out_x_start_corner = 0;
const int32 out_x_end_corner = params.output_width - 1;
const int32 out_y_start_corner = 0;
const int32 out_y_end_corner = params.output_height - 1;
const int8* input_ptr = input_data;
const int8* filter_ptr =
filter_data + params.filter_row_size + params.output_depth;
float* output_ptr = output_data;
DepthwiseConvHybridPartialPerChannel<
output_rounding, EdgeType::kCorner, 1, 1>::Run(
input_scale, input_ptr, filter_ptr, bias_data,
output_ptr, per_channel_scales, ¶ms);
input_ptr += (params.stride_width - 1) * params.input_depth;
filter_ptr = filter_data + params.filter_row_size;
output_ptr += params.output_depth;
for (int32 out_x = out_x_start_corner + 1; out_x < out_x_end_corner;
out_x++) {
DepthwiseConvHybridPartialPerChannel<
output_rounding, EdgeType::kHorizontal, 1, 1>::Run(
input_scale, input_ptr, filter_ptr, bias_data, output_ptr,
per_channel_scales, ¶ms);
input_ptr += params.stride_width * params.input_depth;
output_ptr += params.output_depth;
}
DepthwiseConvHybridPartialPerChannel<
output_rounding, EdgeType::kCorner, 1, 1>::Run(
input_scale, input_ptr, filter_ptr, bias_data, output_ptr,
per_channel_scales, ¶ms);
input_ptr = input_data + (params.stride_width - 1) * params.input_row_size;
filter_ptr = filter_data + params.input_depth;
output_ptr = output_data + params.output_row_size;
for (int32 out_y = out_y_start_corner + 1; out_y < out_y_end_corner;
out_y++) {
DepthwiseConvHybridPartialPerChannel<
output_rounding, EdgeType::kVertical, 1, 1>::Run(
input_scale, input_ptr, filter_ptr, bias_data, output_ptr,
per_channel_scales, ¶ms);
input_ptr += params.stride_width * params.input_row_size;
output_ptr += params.output_row_size;
}
input_ptr = input_data + (params.input_width - 2) * params.input_depth +
(params.stride_width - 1) * params.input_row_size;
filter_ptr = filter_data;
output_ptr = output_data + params.output_row_size +
(params.output_width - 1) * params.output_depth;
for (int32 out_y = out_y_start_corner + 1; out_y < out_y_end_corner;
out_y++) {
DepthwiseConvHybridPartialPerChannel<
output_rounding, EdgeType::kVertical, 1, 1>::Run(
input_scale, input_ptr, filter_ptr, bias_data, output_ptr,
per_channel_scales, ¶ms);
input_ptr += params.stride_width * params.input_row_size;
output_ptr += params.output_row_size;
}
input_ptr = input_data + (params.input_height - 2) * params.input_row_size;
filter_ptr = filter_data + params.output_depth;
output_ptr =
output_data + (params.output_height - 1) * params.output_row_size;
DepthwiseConvHybridPartialPerChannel<
output_rounding, EdgeType::kCorner, 1, 1>::Run(
input_scale, input_ptr, filter_ptr, bias_data, output_ptr,
per_channel_scales, ¶ms);
input_ptr += (params.stride_width == 1) ? 0 : params.input_depth;
filter_ptr = filter_data;
output_ptr += params.output_depth;
for (int32 out_x = out_x_start_corner + 1; out_x < out_x_end_corner;
out_x++) {
DepthwiseConvHybridPartialPerChannel<
output_rounding, EdgeType::kHorizontal, 1, 1>::Run(
input_scale, input_ptr, filter_ptr, bias_data, output_ptr,
per_channel_scales, ¶ms);
input_ptr += params.stride_width * params.input_depth;
output_ptr += params.output_depth;
}
DepthwiseConvHybridPartialPerChannel<
output_rounding, EdgeType::kCorner, 1, 1>::Run(
input_scale, input_ptr, filter_ptr, bias_data, output_ptr,
per_channel_scales, ¶ms);
}
template <DepthwiseConvOutputRounding output_rounding>
inline void DepthwiseConvHybrid3x3FilterPerChannel(
const DepthwiseParams& rt_params, const float* input_scales,
const RuntimeShape& input_shape, const int8* input_data,
const RuntimeShape& filter_shape, const int8* filter_data,
const RuntimeShape& bias_shape, const float* bias_data,
const RuntimeShape& output_shape, float* output_data,
const float* per_channel_scales, const int32* input_offsets,
int thread_start, int thread_end, int thread_dim) {
DepthwiseConvParams params;
const int32 stride_width = rt_params.stride_width;
const int32 stride_height = rt_params.stride_height;
const int32 pad_width = rt_params.padding_values.width;
const int32 pad_height = rt_params.padding_values.height;
const int32 depth_multiplier = rt_params.depth_multiplier;
const float output_activation_min = rt_params.float_activation_min;
const float output_activation_max = rt_params.float_activation_max;
const int32 filter_offset = rt_params.weights_offset;
params.input_depth = input_shape.Dims(3);
params.input_width = input_shape.Dims(2);
params.input_height = input_shape.Dims(1);
params.input_row_size = params.input_depth * params.input_width;
params.stride_width = stride_width; params.stride_height = stride_height;
params.output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
params.output_width = output_shape.Dims(2);
params.output_height = output_shape.Dims(1);
params.output_row_size = params.output_depth * params.output_width;
params.filter_offset = filter_offset;
params.float_output_activation_min = output_activation_min;
params.float_output_activation_max = output_activation_max;
const int32 filter_height = filter_shape.Dims(1);
const int32 filter_width = filter_shape.Dims(2);
params.filter_row_size = params.output_depth * filter_width;
TFLITE_DCHECK(params.output_depth == params.input_depth * depth_multiplier);
TFLITE_DCHECK(depth_multiplier == 1);
TFLITE_DCHECK(filter_height == 3);
TFLITE_DCHECK(filter_width == 3);
TFLITE_DCHECK(stride_height == 1 || stride_height == 2);
TFLITE_DCHECK(stride_width == 1 || stride_width == 2);
TFLITE_DCHECK(stride_width == stride_height);
TFLITE_DCHECK(pad_height == 0 || pad_height == 1);
TFLITE_DCHECK(pad_width == 0 || pad_width == 1);
TFLITE_DCHECK(pad_width == pad_height);
TFLITE_DCHECK(thread_dim == 0 || thread_dim == 1);
const int32 batches = MatchingDim(input_shape, 0, output_shape, 0);
const int64_t input_batch_size = params.input_row_size * params.input_height;
const int64_t output_batch_size =
params.output_row_size * params.output_height;
ShuffleParams one_row_shuffle_params, two_row_shuffle_params,
four_row_shuffle_params, eight_row_shuffle_params;
if (stride_width == 1) {
one_row_shuffle_params = ShuffleParams(30, 1, 1, 1);
two_row_shuffle_params = ShuffleParams(22, 2, 1, 1);
four_row_shuffle_params = ShuffleParams(14, 4, 1, 1);
eight_row_shuffle_params = ShuffleParams(8, 8, 1, 1);
} else {
one_row_shuffle_params = ShuffleParams(14, 1, 2, 2);
two_row_shuffle_params = ShuffleParams(8, 2, 2, 2);
four_row_shuffle_params = ShuffleParams(4, 4, 2, 2);
eight_row_shuffle_params = ShuffleParams(2, 8, 2, 2);
}
using conv_multirow_func_t =
decltype(
&DepthwiseConvHybridMultiRowPerChannel<output_rounding, 1, 1>::Run);
conv_multirow_func_t conv_multirow_func =
DepthwiseConvHybridMultiRowPerChannel<output_rounding, 1, 1>::Run;
if (stride_width == 2) {
conv_multirow_func =
DepthwiseConvHybridMultiRowPerChannel<output_rounding, 2, 2>::Run;
}
int8 shuffle_workspace[kDepthwiseConvScratchWorkspaceSize];
int batch_start = 0;
int batch_end = batches;
int row_start = 0;
int row_end = params.output_height;
switch (thread_dim) {
case 0:
TFLITE_DCHECK_GE(thread_start, 0);
TFLITE_DCHECK_LE(thread_end, batches);
batch_start = thread_start;
batch_end = thread_end;
break;
case 1:
TFLITE_DCHECK_GE(thread_start, 0);
TFLITE_DCHECK_LE(thread_end, params.output_height);
row_start = thread_start;
row_end = thread_end;
break;
}
for (int32 b = batch_start; b < batch_end; ++b) {
const int8* input_ptr = input_data + b * input_batch_size;
float* output_ptr = output_data + b * output_batch_size;
params.input_offset = -input_offsets[b];
int32 out_x = 0;
int32 out_y = row_start;
int32 end_x = params.output_width;
int32 end_y = row_end;
if (pad_width == 1 && pad_height == 1) {
DepthwiseConvHybridHandlePaddingPerChannel<output_rounding>(
input_scales + b, input_ptr, filter_data,
bias_data, output_ptr, per_channel_scales, params);
out_x = 1;
end_x = params.output_width - 1;
out_y = std::max(1, out_y);
end_y = std::min(params.output_height - 1, end_y);
}
const int in_x = (out_x * stride_width) - pad_width;
const int in_y = (out_y * stride_height) - pad_height;
input_ptr += in_y * params.input_row_size + in_x * params.input_depth;
output_ptr += out_y * params.output_row_size + out_x * params.output_depth;
if (params.input_width < four_row_shuffle_params.input_width) {
for (; out_y <= end_y - 8; out_y += 8) {
conv_multirow_func(input_scales + b, input_ptr,
out_x, end_x, filter_data, bias_data, output_ptr,
per_channel_scales, params, eight_row_shuffle_params,
shuffle_workspace);
input_ptr += 8 * stride_height * params.input_row_size;
output_ptr += 8 * params.output_row_size;
}
}
if (params.input_width < two_row_shuffle_params.input_width) {
for (; out_y <= end_y - 4; out_y += 4) {
conv_multirow_func(input_scales + b, input_ptr,
out_x, end_x, filter_data, bias_data, output_ptr,
per_channel_scales, params, four_row_shuffle_params,
shuffle_workspace);
input_ptr += 4 * stride_height * params.input_row_size;
output_ptr += 4 * params.output_row_size;
}
}
for (; out_y <= end_y - 2; out_y += 2) {
conv_multirow_func(input_scales + b, input_ptr,
out_x, end_x, filter_data, bias_data, output_ptr,
per_channel_scales, params, two_row_shuffle_params,
shuffle_workspace);
input_ptr += 2 * stride_height * params.input_row_size;
output_ptr += 2 * params.output_row_size;
}
for (; out_y < end_y; out_y++) {
conv_multirow_func(input_scales + b, input_ptr,
out_x, end_x, filter_data, bias_data, output_ptr,
per_channel_scales, params, one_row_shuffle_params,
shuffle_workspace);
input_ptr += stride_height * params.input_row_size;
output_ptr += params.output_row_size;
}
}
}
#endif
#undef STR
#undef STR_UNEXPANDED
}
}
}
#endif