#include "ruy/kernel_arm.h"
#include "ruy/opt_set.h"
#include "ruy/platform.h"
#include "ruy/profiler/instrumentation.h"
namespace ruy {
#if RUY_PLATFORM_NEON_32 && RUY_OPT(ASM)
// Local assembler labels for the per-destination-type store paths. The 8-bit
// kernels branch to them as "<label>f" (forward reference) through RUY_STR.
#define RUY_ASM_LABEL_STORE_UINT8 …
#define RUY_ASM_LABEL_STORE_INT8 …
#define RUY_ASM_LABEL_STORE_INT16 …
#define RUY_ASM_LABEL_STORE_INT32 …
#define RUY_ASM_LABEL_AFTER_STORE …
// Offsets of the kernel-params fields, as used by the `[%[params], #offset]`
// loads in the float kernel asm below. They must match the actual struct
// layout; see the static_asserts in CheckOffsetsInKernelParamsFloat32.
#define RUY_OFFSET_LHS_BASE_PTR …
#define RUY_OFFSET_RHS_BASE_PTR …
#define RUY_OFFSET_DST_BASE_PTR …
#define RUY_OFFSET_BIAS …
#define RUY_OFFSET_START_ROW …
#define RUY_OFFSET_START_COL …
#define RUY_OFFSET_LAST_ROW …
#define RUY_OFFSET_LAST_COL …
#define RUY_OFFSET_DST_ROWS …
#define RUY_OFFSET_DST_COLS …
#define RUY_OFFSET_LHS_STRIDE …
#define RUY_OFFSET_RHS_STRIDE …
#define RUY_OFFSET_DST_STRIDE …
#define RUY_OFFSET_DEPTH …
#define RUY_OFFSET_CLAMP_MIN …
#define RUY_OFFSET_CLAMP_MAX …
#define RUY_OFFSET_FLAGS …
// Layout of the scratch area that the asm reserves on the stack with
// `sub sp, sp, #RUY_STACK_OFFSET_SIZE`. Each RUY_STACK_OFFSET_* is the offset
// from sp of one spilled piece of loop state (current row/col and the
// current LHS/RHS/destination pointers).
#define RUY_STACK_OFFSET_SIZE …
#define RUY_STACK_OFFSET_DST_COL_PTR …
#define RUY_STACK_OFFSET_DST_PTR …
#define RUY_STACK_OFFSET_ROW …
#define RUY_STACK_OFFSET_COL …
#define RUY_STACK_OFFSET_LHS_COL_PTR …
#define RUY_STACK_OFFSET_RHS_COL_PTR …
template <typename Params>
void CheckOffsetsInKernelParamsFloat32(const Params&) {
static_assert(offsetof(Params, lhs_base_ptr) == RUY_OFFSET_LHS_BASE_PTR, "");
static_assert(offsetof(Params, rhs_base_ptr) == RUY_OFFSET_RHS_BASE_PTR, "");
static_assert(offsetof(Params, dst_base_ptr) == RUY_OFFSET_DST_BASE_PTR, "");
static_assert(offsetof(Params, bias) == RUY_OFFSET_BIAS, "");
static_assert(offsetof(Params, start_row) == RUY_OFFSET_START_ROW, "");
static_assert(offsetof(Params, start_col) == RUY_OFFSET_START_COL, "");
static_assert(offsetof(Params, last_row) == RUY_OFFSET_LAST_ROW, "");
static_assert(offsetof(Params, last_col) == RUY_OFFSET_LAST_COL, "");
static_assert(offsetof(Params, dst_rows) == RUY_OFFSET_DST_ROWS, "");
static_assert(offsetof(Params, lhs_stride) == RUY_OFFSET_LHS_STRIDE, "");
static_assert(offsetof(Params, rhs_stride) == RUY_OFFSET_RHS_STRIDE, "");
static_assert(offsetof(Params, dst_stride) == RUY_OFFSET_DST_STRIDE, "");
static_assert(offsetof(Params, depth) == RUY_OFFSET_DEPTH, "");
static_assert(offsetof(Params, clamp_min) == RUY_OFFSET_CLAMP_MIN, "");
static_assert(offsetof(Params, clamp_max) == RUY_OFFSET_CLAMP_MAX, "");
static_assert(offsetof(Params, flags) == RUY_OFFSET_FLAGS, "");
}
// Float kernel for 32-bit ARM NEON, written as a single inline-asm block.
//
// The kernel walks destination blocks of 8 rows x 4 columns, starting at
// (start_row, start_col). Rows advance by 8 until the current row equals
// last_row, then the row resets to start_row and the column advances by 4;
// the walk ends once the column exceeds last_col. For each block it
// accumulates over `depth`, adds the bias, clamps to [clamp_min, clamp_max]
// and stores the result.
//
// Register usage inside the asm:
//   q0, q1      8 LHS values for the current depth level (rows 0-3, 4-7).
//   q2 (d4,d5)  4 RHS values for the current depth level (one per column).
//   q3..q10     accumulators: (q3,q4) = column 0 rows 0-3 / 4-7,
//               (q5,q6) = column 1, (q7,q8) = column 2, (q9,q10) = column 3.
//   q11..q15    temporaries for bias and clamp values.
//   r1          depth counter inside the accumulation loop, general scratch
//               elsewhere; r2..r6, r8, r10 are scratch.
// Loop state that does not fit in registers (row, col, LHS/RHS/dst pointers)
// lives in a scratch area carved out of the stack (RUY_STACK_OFFSET_*).
//
// RUY_MAKE_ZERO(reg) is used throughout to clear a q register; its definition
// is not visible here, so the comments below assume it does nothing else (in
// particular that it leaves the condition flags untouched).
void KernelFloat32Neon(const KernelParamsFloat<8, 4>& params) {
  CheckOffsetsInKernelParamsFloat32(params);
  profiler::ScopeLabel label("Kernel (kNeon)");
  const float* lhs_ptr = params.lhs_base_ptr;
  const float* rhs_ptr = params.rhs_base_ptr;
  asm volatile(
#define RUY_MAKE_ZERO …
      // Preload the first depth level: 8 LHS floats into q0/q1 and 4 RHS
      // floats into q2.
      "vld1.32 {d0, d1}, [%[lhs_ptr]]!\n"
      "vld1.32 {d2, d3}, [%[lhs_ptr]]!\n"
      RUY_PREFETCH_LOAD("pld [%[lhs_ptr]]\n")
      "vld1.32 {d4, d5}, [%[rhs_ptr]]!\n"
      RUY_PREFETCH_LOAD("pld [%[rhs_ptr]]\n")

      // Reserve the stack scratch area and initialise the spilled loop state
      // from the params struct.
      "sub sp, sp, #" RUY_STR(RUY_STACK_OFFSET_SIZE) "\n"
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_BASE_PTR) "]\n"
      "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_COL_PTR) "]\n"
      "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_BASE_PTR) "]\n"
      "str r2, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
      "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_START_COL) "]\n"
      "str r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
      "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_RHS_BASE_PTR) "]\n"
      "str r2, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"

      // Clear the eight accumulators.
      RUY_MAKE_ZERO(q3)
      RUY_MAKE_ZERO(q4)
      RUY_MAKE_ZERO(q5)
      RUY_MAKE_ZERO(q6)
      RUY_MAKE_ZERO(q7)
      RUY_MAKE_ZERO(q8)
      RUY_MAKE_ZERO(q9)
      RUY_MAKE_ZERO(q10)

      // r1 counts depth levels already loaded; one level was preloaded above.
      "mov r1, #1\n"

      // Outer loop: one iteration per destination block.
      "1:\n"
      // r2 = depth. If only the preloaded level remains, skip the inner loop.
      "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"
      "cmp r1, r2\n"
      "beq 79f\n"

      // Inner loop: multiply-accumulate the loaded level while loading the
      // next one.
      "2:\n"
      "vmla.f32 q3, q0, d4[0]\n"
      "vmla.f32 q5, q0, d4[1]\n"
      "vmla.f32 q7, q0, d5[0]\n"
      "vmla.f32 q9, q0, d5[1]\n"
      "vld1.32 {d0, d1}, [%[lhs_ptr]]!\n"
      "vmla.f32 q4, q1, d4[0]\n"
      "vmla.f32 q6, q1, d4[1]\n"
      "vmla.f32 q8, q1, d5[0]\n"
      "vmla.f32 q10, q1, d5[1]\n"
      "vld1.32 {d2, d3}, [%[lhs_ptr]]!\n"
      RUY_PREFETCH_LOAD("pld [%[lhs_ptr]]\n")
      "vld1.32 {d4, d5}, [%[rhs_ptr]]!\n"
      RUY_PREFETCH_LOAD("pld [%[rhs_ptr]]\n")
      "add r1, r1, #1\n"
      "cmp r1, r2\n"
      "blt 2b\n"

      // Last depth level: multiply-accumulate without loading further data.
      "79:\n"
      "vmla.f32 q3, q0, d4[0]\n"
      "vmla.f32 q5, q0, d4[1]\n"
      "vmla.f32 q7, q0, d5[0]\n"
      "vmla.f32 q9, q0, d5[1]\n"
      "vmla.f32 q4, q1, d4[0]\n"
      "vmla.f32 q6, q1, d4[1]\n"
      "vmla.f32 q8, q1, d5[0]\n"
      "vmla.f32 q10, q1, d5[1]\n"

      // Compute the LHS/RHS source pointers for the NEXT block. The row/col
      // values on the stack still describe the current block here; they are
      // only advanced after the store further down.
      "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
      "ldr r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "cmp r1, r3\n"
      "bge 4f\n"
      // More rows remain in this column of blocks: advance the LHS pointer
      // by 8 * lhs_stride.
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
      "add r4, r4, r1, lsl #3\n"
      "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
      "b 5f\n"
      "4:\n"
      // Last row block reached: rewind the LHS pointer to its base and, if
      // more columns remain, advance the RHS pointer by 4 * rhs_stride.
      "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "str r5, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
      "ldr r4, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
      "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "cmp r8, r4\n"
      "bge 5f\n"
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
      "ldr r10, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
      "add r10, r10, r1, lsl #2\n"
      "str r10, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
      "5:\n"

      // Point the streaming pointers at the next block's data.
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
      "mov %[lhs_ptr], r4\n"
      "ldr r5, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
      "mov %[rhs_ptr], r5\n"

      // r4 = flags, r1 = bias pointer. r8 becomes the channel index of the
      // current block: the current column if the CHANNEL_DIMENSION_IS_COL
      // flag is set, otherwise the current row.
      "ldrb r4, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
      "tst r4, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
      "bne 1000f\n"
      "mov r8, #" RUY_STR(RUY_STACK_OFFSET_ROW) "\n"
      "b 1001f\n"
      "1000:\n"
      "mov r8, #" RUY_STR(RUY_STACK_OFFSET_COL) "\n"
      "1001:\n"
      "ldr r8, [sp, r8]\n"
      // Only offset the bias pointer by the channel index (4 bytes per
      // value) when the HAS_BIAS flag is set; otherwise the pointer is used
      // as loaded.
      "tst r4, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
      "beq 1002f\n"
      "add r1, r1, r8, lsl #2\n"
      "1002:\n"
      // q12 = first four bias values.
      "vld1.32 {d24, d25}, [r1]!\n"

      // With the accumulation finished, q0-q2 are free: start loading the
      // first depth level of the next block.
      "vld1.32 {d0, d1, d2, d3}, [%[lhs_ptr]]!\n"
      RUY_PREFETCH_LOAD("pld [%[lhs_ptr]]\n")
      "vld1.32 {d4, d5}, [%[rhs_ptr]]!\n"
      RUY_PREFETCH_LOAD("pld [%[rhs_ptr]]\n")

      "tst r4, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
      "bne 6f\n"
      // Channels are rows: load four more bias values into q13, then add
      // q12 to rows 0-3 and q13 to rows 4-7 of every column.
      "vld1.32 {d26, d27}, [r1]\n"
      "vadd.f32 q3, q3, q12\n"
      "vadd.f32 q5, q5, q12\n"
      "vadd.f32 q7, q7, q12\n"
      "vadd.f32 q9, q9, q12\n"
      "vadd.f32 q4, q4, q13\n"
      "vadd.f32 q6, q6, q13\n"
      "vadd.f32 q8, q8, q13\n"
      "vadd.f32 q10, q10, q13\n"
      "b 7f\n"
      "6:\n"
      // Channels are columns: broadcast one bias value per column and add it
      // to both accumulators of that column.
      "vdup.32 q11, d24[0]\n"
      "vdup.32 q13, d24[1]\n"
      "vdup.32 q14, d25[0]\n"
      "vdup.32 q15, d25[1]\n"
      "vadd.f32 q3, q3, q11\n"
      "vadd.f32 q4, q4, q11\n"
      "vadd.f32 q5, q5, q13\n"
      "vadd.f32 q6, q6, q13\n"
      "vadd.f32 q7, q7, q14\n"
      "vadd.f32 q8, q8, q14\n"
      "vadd.f32 q9, q9, q15\n"
      "vadd.f32 q10, q10, q15\n"
      "7:\n"

      // Clamp: q12 = clamp_min and q13 = clamp_max broadcast to all lanes
      // (the 32-bit fields are loaded as raw words), then max followed by
      // min on every accumulator.
      "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
      "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
      "vdup.32 q12, r2\n"
      "vdup.32 q13, r3\n"
      "vmax.f32 q3, q3, q12\n"
      "vmax.f32 q4, q4, q12\n"
      "vmax.f32 q5, q5, q12\n"
      "vmax.f32 q6, q6, q12\n"
      "vmax.f32 q7, q7, q12\n"
      "vmax.f32 q8, q8, q12\n"
      "vmax.f32 q9, q9, q12\n"
      "vmax.f32 q10, q10, q12\n"
      "vmin.f32 q3, q3, q13\n"
      "vmin.f32 q4, q4, q13\n"
      "vmin.f32 q5, q5, q13\n"
      "vmin.f32 q6, q6, q13\n"
      "vmin.f32 q7, q7, q13\n"
      "vmin.f32 q8, q8, q13\n"
      "vmin.f32 q9, q9, q13\n"
      "vmin.f32 q10, q10, q13\n"

      // r1 = min(dst_rows - row, 8), r2 = min(dst_cols - col, 4): the part
      // of the 8x4 block that actually lies inside the destination.
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_ROWS) "]\n"
      "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "sub r1, r1, r8\n"
      "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_COLS) "]\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "sub r2, r2, r4\n"
      "mov r3, #8\n"
      "mov r5, #4\n"
      "cmp r1, #8\n"
      "it gt\n"
      "movgt r1, r3\n"
      "cmp r2, #4\n"
      "it gt\n"
      "movgt r2, r5\n"
      // Flags end up EQ only if the whole 8x4 block fits.
      "cmp r1, r3\n"
      "it eq\n"
      "cmpeq r2, r5\n"
      "beq 30f\n"
      // Partial block: store into dst_tmp_buf with a 32-byte (8 floats)
      // column stride and copy the valid part out afterwards.
      "mov r3, %[dst_tmp_buf]\n"
      "mov r4, #32\n"
      "b 31f\n"
      "30:\n"
      // Full block: store straight to the destination using dst_stride.
      "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
      "ldr r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "mov r4, r5\n"
      "31:\n"

      // Store the four columns (8 floats each) and clear each pair of
      // accumulators as soon as it has been written.
      "vst1.32 {d6, d7, d8, d9}, [r3]\n"
      "add r3, r3, r4\n"
      RUY_MAKE_ZERO(q3)
      RUY_MAKE_ZERO(q4)
      "vst1.32 {d10, d11, d12, d13}, [r3]\n"
      "add r3, r3, r4\n"
      RUY_MAKE_ZERO(q5)
      RUY_MAKE_ZERO(q6)
      "vst1.32 {d14, d15, d16, d17}, [r3]\n"
      "add r3, r3, r4\n"
      RUY_MAKE_ZERO(q7)
      RUY_MAKE_ZERO(q8)
      "vst1.32 {d18, d19, d20, d21}, [r3]\n"
      "add r3, r3, r4\n"
      RUY_MAKE_ZERO(q9)
      RUY_MAKE_ZERO(q10)

      // This branch reuses the condition flags of the "block fits" test
      // above; nothing in between is meant to modify them.
      "beq 41f\n"
      // Partial block: copy r1 rows x r2 columns, one 32-bit word at a time,
      // from dst_tmp_buf (column stride 32 bytes) to the destination (column
      // stride dst_stride). r6 counts columns, r5 counts rows.
      "ldr r8, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
      "mov r3, %[dst_tmp_buf]\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "mov r6, #0\n"
      "50:\n"
      "mov r5, #0\n"
      "51:\n"
      "ldr r10, [r3, r5, lsl #2]\n"
      "str r10, [r4, r5, lsl #2]\n"
      "add r5, r5, #1\n"
      "cmp r5, r1\n"
      "blt 51b\n"
      "add r6, r6, #1\n"
      "add r3, r3, #32\n"
      "add r4, r4, r8\n"
      "cmp r6, r2\n"
      "blt 50b\n"
      "41:\n"
      // Advance the destination pointer by one block of rows (8 floats).
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "add r4, r4, #32\n"
      "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"

      // Advance (row, col) to the next block.
      "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "ldr r6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
      "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
      "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "cmp r8, r3\n"
      "beq 20f\n"
      // Not the last row block: row += 8.
      "add r8, r8, #8\n"
      "str r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "b 21f\n"
      "20:\n"
      // Last row block: row = start_row, col += 4, and both destination
      // pointers move to the top of the next column of blocks
      // (dst_col_ptr += 4 * dst_stride).
      "str r6, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "add r4, r4, #4\n"
      "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "ldr r8, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
      "ldr r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_COL_PTR) "]\n"
      "add r1, r1, r8, lsl #2\n"
      "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_COL_PTR) "]\n"
      "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "21:\n"

      // Continue while col <= last_col. r1 is reset to 1 because the first
      // depth level of the next block has already been loaded (mov leaves
      // the flags of the cmp intact).
      "ldr r4, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
      "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "cmp r8, r4\n"
      "mov r1, #1\n"
      "ble 1b\n"

      // Release the stack scratch area.
      "add sp, sp, #" RUY_STR(RUY_STACK_OFFSET_SIZE) "\n"
      : [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr)
      // The asm dereferences %[params] as a pointer to the params struct, so
      // the operand is the address of `params`.
      : [ params ] "r"(&params), [dst_tmp_buf] "r"(params.dst_tmp_buf)
      // Every q register written by the asm is listed, including q11, q14
      // and q15, which the channels-are-columns bias path uses as
      // temporaries.
      : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r8", "r10", "cc",
        "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
// Drop the float kernel's helper macro, stack layout and params offsets so
// that the 8-bit kernels below can define their own values under the same
// names.
#undef RUY_MAKE_ZERO
#undef RUY_STACK_OFFSET_SIZE
#undef RUY_STACK_OFFSET_DST_COL_PTR
#undef RUY_STACK_OFFSET_DST_PTR
#undef RUY_STACK_OFFSET_ROW
#undef RUY_STACK_OFFSET_COL
#undef RUY_STACK_OFFSET_LHS_COL_PTR
#undef RUY_STACK_OFFSET_RHS_COL_PTR
#undef RUY_OFFSET_LHS_BASE_PTR
#undef RUY_OFFSET_RHS_BASE_PTR
#undef RUY_OFFSET_DST_BASE_PTR
#undef RUY_OFFSET_BIAS
#undef RUY_OFFSET_START_ROW
#undef RUY_OFFSET_START_COL
#undef RUY_OFFSET_LAST_ROW
#undef RUY_OFFSET_LAST_COL
#undef RUY_OFFSET_DST_ROWS
#undef RUY_OFFSET_DST_COLS
#undef RUY_OFFSET_LHS_STRIDE
#undef RUY_OFFSET_RHS_STRIDE
#undef RUY_OFFSET_DST_STRIDE
#undef RUY_OFFSET_DEPTH
#undef RUY_OFFSET_CLAMP_MIN
#undef RUY_OFFSET_CLAMP_MAX
#undef RUY_OFFSET_FLAGS
// Offsets of the 8-bit kernel-params fields, as used by the
// `[%[params], #offset]` loads in the 8-bit kernel asm below. They must match
// the actual struct layout; see the static_asserts in
// CheckOffsetsInKernelParams8bit.
#define RUY_OFFSET_BIAS …
#define RUY_OFFSET_LHS_SUMS …
#define RUY_OFFSET_RHS_SUMS …
#define RUY_OFFSET_LHS_BASE_PTR …
#define RUY_OFFSET_MULTIPLIER_FIXEDPOINT …
#define RUY_OFFSET_MULTIPLIER_EXPONENT …
#define RUY_OFFSET_RHS_BASE_PTR …
#define RUY_OFFSET_DST_BASE_PTR …
#define RUY_OFFSET_LHS_ZERO_POINT …
#define RUY_OFFSET_RHS_ZERO_POINT …
#define RUY_OFFSET_DST_ZERO_POINT …
#define RUY_OFFSET_PROD_ZP_DEPTH …
#define RUY_OFFSET_START_ROW …
#define RUY_OFFSET_START_COL …
#define RUY_OFFSET_LAST_ROW …
#define RUY_OFFSET_LAST_COL …
#define RUY_OFFSET_DST_ROWS …
#define RUY_OFFSET_DST_COLS …
#define RUY_OFFSET_LHS_STRIDE …
#define RUY_OFFSET_RHS_STRIDE …
#define RUY_OFFSET_DST_STRIDE …
#define RUY_OFFSET_DEPTH …
#define RUY_OFFSET_CLAMP_MIN …
#define RUY_OFFSET_CLAMP_MAX …
#define RUY_OFFSET_FLAGS …
#define RUY_OFFSET_DST_TYPE_ID …
// Layout of the scratch area that the 8-bit kernels reserve on the stack with
// `sub sp, sp, #RUY_STACK_OFFSET_SIZE`: offsets from sp of the spilled loop
// state (current row/col and the current LHS/RHS/destination pointers).
#define RUY_STACK_OFFSET_SIZE …
#define RUY_STACK_OFFSET_DST_COL_PTR …
#define RUY_STACK_OFFSET_DST_PTR …
#define RUY_STACK_OFFSET_ROW …
#define RUY_STACK_OFFSET_COL …
#define RUY_STACK_OFFSET_LHS_COL_PTR …
#define RUY_STACK_OFFSET_RHS_COL_PTR …
template <typename Params>
void CheckOffsetsInKernelParams8bit(const Params&) {
static_assert(offsetof(Params, lhs_zero_point) == RUY_OFFSET_LHS_ZERO_POINT,
"");
static_assert(offsetof(Params, rhs_zero_point) == RUY_OFFSET_RHS_ZERO_POINT,
"");
static_assert(offsetof(Params, dst_zero_point) == RUY_OFFSET_DST_ZERO_POINT,
"");
static_assert(offsetof(Params, prod_zp_depth) == RUY_OFFSET_PROD_ZP_DEPTH,
"");
static_assert(offsetof(Params, multiplier_fixedpoint) ==
RUY_OFFSET_MULTIPLIER_FIXEDPOINT,
"");
static_assert(
offsetof(Params, multiplier_exponent) == RUY_OFFSET_MULTIPLIER_EXPONENT,
"");
static_assert(offsetof(Params, clamp_min) == RUY_OFFSET_CLAMP_MIN, "");
static_assert(offsetof(Params, clamp_max) == RUY_OFFSET_CLAMP_MAX, "");
static_assert(offsetof(Params, bias) == RUY_OFFSET_BIAS, "");
static_assert(offsetof(Params, lhs_sums) == RUY_OFFSET_LHS_SUMS, "");
static_assert(offsetof(Params, rhs_sums) == RUY_OFFSET_RHS_SUMS, "");
static_assert(offsetof(Params, flags) == RUY_OFFSET_FLAGS, "");
static_assert(offsetof(Params, lhs_base_ptr) == RUY_OFFSET_LHS_BASE_PTR, "");
static_assert(offsetof(Params, start_row) == RUY_OFFSET_START_ROW, "");
static_assert(offsetof(Params, last_row) == RUY_OFFSET_LAST_ROW, "");
static_assert(offsetof(Params, last_col) == RUY_OFFSET_LAST_COL, "");
static_assert(offsetof(Params, lhs_stride) == RUY_OFFSET_LHS_STRIDE, "");
static_assert(offsetof(Params, rhs_stride) == RUY_OFFSET_RHS_STRIDE, "");
static_assert(offsetof(Params, dst_stride) == RUY_OFFSET_DST_STRIDE, "");
static_assert(offsetof(Params, depth) == RUY_OFFSET_DEPTH, "");
}
// 8-bit kernel for 32-bit ARM NEON, written as a single inline-asm block.
//
// The kernel walks destination blocks of 4 rows x 2 columns, starting at
// (start_row, start_col). Rows advance by 4 until the current row equals
// last_row, then the row resets to start_row and the column advances by 2;
// the walk ends once the column exceeds last_col. The depth dimension is
// consumed 16 int8 values at a time.
//
// For every block the asm:
//   1. accumulates int8 x int8 products into int32 lanes,
//   2. reduces them to one int32 per destination entry
//      (q14 = column 0 rows 0-3, q15 = column 1 rows 0-3),
//   3. adds bias and prod_zp_depth, subtracts the zero-point correction
//      terms built from rhs_sums / lhs_sums,
//   4. unless the destination is int32, rescales with the fixed-point
//      multiplier and exponent, adds dst_zero_point, narrows and clamps,
//   5. stores the block, going through dst_tmp_buf when the 4x2 block does
//      not fit entirely inside the destination.
//
// Accumulator registers: q6..q9 hold column 0, rows 0..3 and q10..q13 hold
// column 1, rows 0..3 (four partial int32 sums each). q0/q1 carry LHS data,
// q4/q5 carry RHS data, q2/q3/q14/q15 are 16-bit product temporaries.
//
// RUY_MAKE_ZERO(reg) is used throughout to clear a q register; its definition
// is not visible here, so the comments below assume it does nothing else (in
// particular that it leaves the condition flags untouched).
void Kernel8bitNeon(const KernelParams8bit<4, 2>& params) {
  profiler::ScopeLabel label("Kernel (kNeon)");
  CheckOffsetsInKernelParams8bit(params);
  const std::int8_t* lhs_col_ptr = params.lhs_base_ptr;
  const std::int8_t* rhs_col_ptr =
      static_cast<const std::int8_t*>(params.rhs_base_ptr);
  const std::int8_t* lhs_ptr = lhs_col_ptr;
  const std::int8_t* rhs_ptr = rhs_col_ptr;
  asm volatile(
#define RUY_MAKE_ZERO …
      // Preload LHS rows 0-1 (16 bytes each) and both RHS columns (16 bytes
      // each), interleaved with clearing the accumulators and setting up the
      // spilled loop state on the stack.
      "vld1.8 {d0, d1}, [%[lhs_ptr]]!\n"
      RUY_MAKE_ZERO(q6)
      RUY_MAKE_ZERO(q7)
      "vld1.8 {d2, d3}, [%[lhs_ptr]]!\n"
      RUY_MAKE_ZERO(q8)
      RUY_MAKE_ZERO(q9)
      RUY_MAKE_ZERO(q10)
      "vld1.8 {d8, d9}, [%[rhs_ptr]]!\n"
      RUY_MAKE_ZERO(q11)
      "vld1.8 {d10, d11}, [%[rhs_ptr]]!\n"
      "sub sp, sp, #" RUY_STR(RUY_STACK_OFFSET_SIZE) "\n"
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_BASE_PTR) "]\n"
      RUY_MAKE_ZERO(q12)
      "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_COL_PTR) "]\n"
      "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_BASE_PTR) "]\n"
      RUY_MAKE_ZERO(q13)
      "str r2, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
      RUY_MAKE_ZERO(q14)
      "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_START_COL) "]\n"
      RUY_MAKE_ZERO(q15)
      "str r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
      "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_RHS_BASE_PTR) "]\n"
      "str r2, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"

      // r1 counts depth values already loaded; 16 were preloaded above.
      "mov r1, #16\n"

      // Outer loop: one iteration per destination block.
      "1:\n"
      // r10 = depth. If only the preloaded 16 levels remain, skip the inner
      // loop.
      "ldr r10, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"
      "cmp r1, r10\n"
      "beq 79f\n"

      // Inner loop: consume 16 depth levels per iteration.
      "2:\n"
      // Rows 0-1 against both columns: 8-bit products widened to 16 bit.
      "vmull.s8 q14, d0, d8\n"
      "vmull.s8 q2, d0, d10\n"
      "vmull.s8 q15, d2, d8\n"
      "vmull.s8 q3, d2, d10\n"
      "vmlal.s8 q14, d1, d9\n"
      "vmlal.s8 q2, d1, d11\n"
      "vmlal.s8 q15, d3, d9\n"
      "vmlal.s8 q3, d3, d11\n"
      // Load LHS rows 2-3.
      "vld1.8 {d0, d1, d2, d3}, [%[lhs_ptr]]!\n"
      // Pairwise-add the 16-bit products into the 32-bit accumulators.
      "vpadal.s16 q6, q14\n"
      "vpadal.s16 q7, q15\n"
      "vpadal.s16 q10, q2\n"
      "vpadal.s16 q11, q3\n"
      // Rows 2-3 against both columns.
      "vmull.s8 q14, d0, d8\n"
      "vmull.s8 q2, d0, d10\n"
      "vmull.s8 q15, d2, d8\n"
      "vmull.s8 q3, d2, d10\n"
      "vmlal.s8 q14, d1, d9\n"
      "vmlal.s8 q2, d1, d11\n"
      "vmlal.s8 q15, d3, d9\n"
      "vmlal.s8 q3, d3, d11\n"
      // Load LHS rows 0-1 and both RHS columns for the next 16 levels.
      "vld1.8 {d0, d1, d2, d3}, [%[lhs_ptr]]!\n"
      "vpadal.s16 q8, q14\n"
      "vld1.8 {d8, d9, d10, d11}, [%[rhs_ptr]]!\n"
      "vpadal.s16 q9, q15\n"
      "vpadal.s16 q12, q2\n"
      "vpadal.s16 q13, q3\n"
      RUY_PREFETCH_LOAD("pld [%[lhs_ptr]]\n")
      RUY_PREFETCH_LOAD("pld [%[rhs_ptr]]\n")
      "add r1, r1, #16\n"
      "cmp r1, r10\n"
      "blt 2b\n"

      // Last 16 depth levels: same arithmetic, but only LHS rows 2-3 remain
      // to be loaded.
      "79:\n"
      "vmull.s8 q14, d0, d8\n"
      "vmull.s8 q2, d0, d10\n"
      "vmull.s8 q15, d2, d8\n"
      "vmull.s8 q3, d2, d10\n"
      "vmlal.s8 q14, d1, d9\n"
      "vmlal.s8 q2, d1, d11\n"
      "vmlal.s8 q15, d3, d9\n"
      "vmlal.s8 q3, d3, d11\n"
      "vld1.8 {d0, d1, d2, d3}, [%[lhs_ptr]]!\n"
      "vpadal.s16 q6, q14\n"
      "vpadal.s16 q7, q15\n"
      "vpadal.s16 q10, q2\n"
      "vpadal.s16 q11, q3\n"
      "vmull.s8 q14, d0, d8\n"
      "vmull.s8 q2, d0, d10\n"
      "vmull.s8 q15, d2, d8\n"
      "vmull.s8 q3, d2, d10\n"
      "vmlal.s8 q14, d1, d9\n"
      "vmlal.s8 q2, d1, d11\n"
      "vmlal.s8 q15, d3, d9\n"
      "vmlal.s8 q3, d3, d11\n"
      "vpadal.s16 q8, q14\n"
      "vpadal.s16 q9, q15\n"
      "vpadal.s16 q12, q2\n"
      "vpadal.s16 q13, q3\n"

      // Horizontal reduction: each of q6..q13 holds four partial sums for
      // one destination entry. Two rounds of pairwise adds collapse them so
      // that q14 = column 0 (rows 0-3) and q15 = column 1 (rows 0-3).
      "vpadd.i32 d0, d12, d13\n"
      "vpadd.i32 d1, d14, d15\n"
      "vpadd.i32 d2, d16, d17\n"
      "vpadd.i32 d3, d18, d19\n"
      "vpadd.i32 d4, d20, d21\n"
      "vpadd.i32 d5, d22, d23\n"
      "vpadd.i32 d6, d24, d25\n"
      "vpadd.i32 d7, d26, d27\n"
      "vpadd.i32 d28, d0, d1\n"
      "vpadd.i32 d29, d2, d3\n"
      "vpadd.i32 d30, d4, d5\n"
      "vpadd.i32 d31, d6, d7\n"

      // Compute the LHS/RHS source pointers for the NEXT block. The row/col
      // values on the stack still describe the current block here; they are
      // only advanced after the store.
      "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
      "ldr r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "cmp r1, r3\n"
      "bge 4f\n"
      // More rows remain: advance the LHS pointer by 4 * lhs_stride.
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
      "add r4, r4, r1, lsl #2\n"
      "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
      "b 5f\n"
      "4:\n"
      // Last row block reached: rewind the LHS pointer to its base and, if
      // more columns remain, advance the RHS pointer by 2 * rhs_stride.
      "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "str r5, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
      "ldr r4, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
      "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "cmp r8, r4\n"
      "bge 5f\n"
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
      "ldr r10, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
      "add r10, r10, r1, lsl #1\n"
      "str r10, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
      "5:\n"

      // Point the streaming pointers at the next block's data.
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
      "mov %[lhs_ptr], r4\n"
      "ldr r5, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
      "mov %[rhs_ptr], r5\n"

      // r4 = flags, r1 = bias pointer. r8 becomes the channel index of the
      // current block: the current column if the CHANNEL_DIMENSION_IS_COL
      // flag is set, otherwise the current row. r8 is reused later for the
      // per-channel multiplier lookup.
      "ldrb r4, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
      "tst r4, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
      "bne 1000f\n"
      "mov r8, #" RUY_STR(RUY_STACK_OFFSET_ROW) "\n"
      "b 1001f\n"
      "1000:\n"
      "mov r8, #" RUY_STR(RUY_STACK_OFFSET_COL) "\n"
      "1001:\n"
      "ldr r8, [sp, r8]\n"
      // Only offset the bias pointer by the channel index (4 bytes per
      // value) when the HAS_BIAS flag is set.
      "tst r4, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
      "beq 1002f\n"
      "add r1, r1, r8, lsl #2\n"
      "1002:\n"
      // d24 = first two bias values.
      "vld1.32 {d24}, [r1]!\n"

      // The LHS/RHS registers are free now: start loading the first 16
      // depth levels of the next block (LHS rows 0-1, both RHS columns).
      "vld1.8 {d0, d1, d2, d3}, [%[lhs_ptr]]!\n"
      RUY_PREFETCH_LOAD("pld [%[lhs_ptr]]\n")
      "vld1.8 {d8, d9, d10, d11}, [%[rhs_ptr]]!\n"
      RUY_PREFETCH_LOAD("pld [%[rhs_ptr]]\n")

      // Fold prod_zp_depth into the bias vector (q9 = broadcast of it).
      "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_PROD_ZP_DEPTH) "]\n"
      "vdup.32 q9, r3\n"
      "vadd.i32 d24, d24, d18\n"
      "tst r4, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
      "bne 6f\n"
      // Channels are rows: load two more bias values so that q12 covers all
      // four rows, then add it to both columns.
      "vld1.32 {d25}, [r1]\n"
      "vadd.i32 d25, d25, d19\n"
      "vadd.i32 q14, q14, q12\n"
      "vadd.i32 q15, q15, q12\n"
      "b 7f\n"
      "6:\n"
      // Channels are columns: broadcast one bias value per column.
      "vdup.32 q10, d24[0]\n"
      "vdup.32 q11, d24[1]\n"
      "vadd.i32 q14, q14, q10\n"
      "vadd.i32 q15, q15, q11\n"
      "7:\n"

      // If HAS_RHS_SUMS: subtract lhs_zero_point * rhs_sums[col + j] from
      // column j of the block.
      "ldrb r6, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
      "tst r6, #" RUY_STR(RUY_ASM_FLAG_HAS_RHS_SUMS) "\n"
      "beq 401f\n"
      "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_RHS_SUMS) "]\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "add r3, r3, r4, lsl #2\n"
      "vld1.32 { d12 }, [r3]\n"
      "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_ZERO_POINT) "]\n"
      "vdup.32 q10, r5\n"
      "vmls.i32 q14, q10, d12[0]\n"
      "vmls.i32 q15, q10, d12[1]\n"
      "401:\n"

      // If HAS_LHS_SUMS: subtract rhs_zero_point * lhs_sums[row + i] from
      // row i of both columns.
      "ldrb r6, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
      "tst r6, #" RUY_STR(RUY_ASM_FLAG_HAS_LHS_SUMS) "\n"
      "beq 402f\n"
      "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_LHS_SUMS) "]\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "add r2, r2, r4, lsl #2\n"
      "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_RHS_ZERO_POINT) "]\n"
      "vld1.32 {d22, d23}, [r2]\n"
      "vdup.32 d13, r5\n"
      "vmul.i32 q11, q11, d13[1]\n"
      "vsub.s32 q14, q14, q11\n"
      "vsub.s32 q15, q15, q11\n"
      // An int32 destination takes the raw accumulators: skip the rescaling.
      // NOTE(review): this check sits before label 402, so it is bypassed
      // whenever HAS_LHS_SUMS is clear - confirm that int32 destinations
      // always arrive with that flag set.
      "ldrb r10, [%[params], #" RUY_STR(RUY_OFFSET_DST_TYPE_ID) "]\n"
      "cmp r10, #" RUY_STR(RUY_ASM_TYPE_ID_INT32) "\n"
      "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT32) "f\n"
      "402:\n"

      // Rescaling. r1 / r2 = exponent / fixed-point multiplier pointers,
      // offset by the channel index (r8) only when HAS_PERCHANNEL is set.
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_EXPONENT) "]\n"
      "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_FIXEDPOINT) "]\n"
      "tst r6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
      "beq 1003f\n"
      "add r1, r1, r8, lsl #2\n"
      "add r2, r2, r8, lsl #2\n"
      "1003:\n"
      // d20 = first two exponents, d12 = first two multipliers.
      "vld1.32 {d20}, [r1]!\n"
      "vld1.32 {d12}, [r2]!\n"
      "tst r6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
      // q8 = -1 in every lane (vmvn does not touch the flags of the tst).
      "vmvn.i32 q8, #0\n"
      "bne 8f\n"
      // Channels are rows: complete q10 (exponents) and q6 (multipliers) to
      // four values. Each exponent is split into q11 = min(exponent, -1)
      // and q10 = exponent - q11 (which is >= 0). The accumulators are
      // shifted left by q10, multiplied (doubling, high half) by the
      // fixed-point multiplier, then shifted with rounding by q11.
      "vld1.32 {d21}, [r1]\n"
      "vld1.32 {d13}, [r2]\n"
      "vmin.s32 q11, q10, q8\n"
      "vsub.s32 q10, q10, q11\n"
      "vshl.s32 q14, q14, q10\n"
      "vshl.s32 q15, q15, q10\n"
      "vqdmulh.s32 q14, q14, q6\n"
      "vqdmulh.s32 q15, q15, q6\n"
      "vrshl.s32 q14, q14, q11\n"
      "vrshl.s32 q15, q15, q11\n"
      "b 9f\n"
      "8:\n"
      // Channels are columns: same computation with one exponent/multiplier
      // pair per column, broadcast across the four rows.
      "vmin.s32 d22, d20, d16\n"
      "vsub.s32 d20, d20, d22\n"
      "vdup.32 q12, d20[0]\n"
      "vdup.32 q13, d20[1]\n"
      "vshl.s32 q14, q14, q12\n"
      "vshl.s32 q15, q15, q13\n"
      "vqdmulh.s32 q14, q14, d12[0]\n"
      "vqdmulh.s32 q15, q15, d12[1]\n"
      "vdup.32 q12, d22[0]\n"
      "vdup.32 q13, d22[1]\n"
      "vrshl.s32 q14, q14, q12\n"
      "vrshl.s32 q15, q15, q13\n"
      "9:\n"

      // Dispatch on the destination type; uint8 is the fall-through case.
      "ldrb r10, [%[params], #" RUY_STR(RUY_OFFSET_DST_TYPE_ID) "]\n"
      "cmp r10, #" RUY_STR(RUY_ASM_TYPE_ID_INT16) "\n"
      "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT16) "f\n"
      "cmp r10, #" RUY_STR(RUY_ASM_TYPE_ID_INT8) "\n"
      "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT8) "f\n"

      // ---------------- uint8 destination ----------------
      RUY_STR(RUY_ASM_LABEL_STORE_UINT8) ":\n"
      // Saturating-narrow both columns to int16: q14 = 8 values.
      "vqmovn.s32 d28, q14\n"
      "vqmovn.s32 d29, q15\n"
      // Clear the accumulators for the next block.
      RUY_MAKE_ZERO(q6)
      RUY_MAKE_ZERO(q7)
      RUY_MAKE_ZERO(q8)
      RUY_MAKE_ZERO(q9)
      RUY_MAKE_ZERO(q10)
      RUY_MAKE_ZERO(q11)
      RUY_MAKE_ZERO(q12)
      RUY_MAKE_ZERO(q13)
      RUY_MAKE_ZERO(q15)
      // Add dst_zero_point with saturation, then narrow to unsigned 8 bit:
      // d30 = column 0 rows 0-3 followed by column 1 rows 0-3.
      "ldr r4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
      "vdup.16 q13, r4\n"
      "vqadd.s16 q14, q14, q13\n"
      "vqmovun.s16 d30, q14\n"
      // Clamp using the low byte of clamp_min / clamp_max.
      "ldrb r2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
      "ldrb r3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
      "vdup.8 d28, r2\n"
      "vdup.8 d29, r3\n"
      "vmax.u8 d30, d30, d28\n"
      "vmin.u8 d30, d30, d29\n"
      // r1 = min(dst_rows - row, 4), r2 = min(dst_cols - col, 2).
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_ROWS) "]\n"
      "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "sub r1, r1, r8\n"
      "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_COLS) "]\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "sub r2, r2, r4\n"
      "mov r3, #4\n"
      "mov r5, #2\n"
      "cmp r1, #4\n"
      "it gt\n"
      "movgt r1, r3\n"
      "cmp r2, #2\n"
      "it gt\n"
      "movgt r2, r5\n"
      // Flags end up EQ only if the whole 4x2 block fits (the two ldr
      // instructions below do not alter them).
      "cmp r1, r3\n"
      "it eq\n"
      "cmpeq r2, r5\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
      "beq 30f\n"
      // Partial block: spill d30 to dst_tmp_buf, then copy r1 rows x r2
      // columns byte by byte (source column stride 4, destination column
      // stride dst_stride). r6 counts columns, r8 counts rows.
      "mov r3, %[dst_tmp_buf]\n"
      "vst1.8 {d30}, [r3]\n"
      "mov r6, #0\n"
      "50:\n"
      "mov r8, #0\n"
      "51:\n"
      "ldrb r10, [r3, r8]\n"
      "strb r10, [r4, r8]\n"
      "add r8, r8, #1\n"
      "cmp r8, r1\n"
      "blt 51b\n"
      "add r6, r6, #1\n"
      "add r3, r3, #4\n"
      "add r4, r4, r5\n"
      "cmp r6, r2\n"
      "blt 50b\n"
      "b 31f\n"
      "30:\n"
      // Full block: store column 0 (4 bytes) at dst_ptr and column 1 at
      // dst_ptr + dst_stride.
      "ldr r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "mov r4, r3\n"
      "mov r6, #1\n"
      "vst1.32 {d30[0]}, [r3]\n"
      "add r4, r4, r5\n"
      "mov r3, r4\n"
      "vst1.32 {d30[1]}, [r3]\n"
      "31:\n"
      // Advance the destination pointer by 4 one-byte rows.
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "add r4, r4, #4\n"
      "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      RUY_MAKE_ZERO(q13)
      RUY_MAKE_ZERO(q14)
      RUY_MAKE_ZERO(q15)
      "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"

      // ---------------- int8 destination ----------------
      // Identical to the uint8 path except that the final narrowing and the
      // clamp are signed.
      RUY_STR(RUY_ASM_LABEL_STORE_INT8) ":\n"
      "vqmovn.s32 d28, q14\n"
      "vqmovn.s32 d29, q15\n"
      RUY_MAKE_ZERO(q6)
      RUY_MAKE_ZERO(q7)
      RUY_MAKE_ZERO(q8)
      RUY_MAKE_ZERO(q9)
      RUY_MAKE_ZERO(q10)
      RUY_MAKE_ZERO(q11)
      RUY_MAKE_ZERO(q12)
      RUY_MAKE_ZERO(q13)
      RUY_MAKE_ZERO(q15)
      "ldr r4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
      "vdup.16 q13, r4\n"
      "vqadd.s16 q14, q14, q13\n"
      "vqmovn.s16 d30, q14\n"
      "ldrb r2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
      "ldrb r3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
      "vdup.8 d28, r2\n"
      "vdup.8 d29, r3\n"
      "vmax.s8 d30, d30, d28\n"
      "vmin.s8 d30, d30, d29\n"
      // r1 = min(dst_rows - row, 4), r2 = min(dst_cols - col, 2).
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_ROWS) "]\n"
      "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "sub r1, r1, r8\n"
      "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_COLS) "]\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "sub r2, r2, r4\n"
      "mov r3, #4\n"
      "mov r5, #2\n"
      "cmp r1, #4\n"
      "it gt\n"
      "movgt r1, r3\n"
      "cmp r2, #2\n"
      "it gt\n"
      "movgt r2, r5\n"
      "cmp r1, r3\n"
      "it eq\n"
      "cmpeq r2, r5\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
      "beq 30f\n"
      // Partial block: byte-wise copy through dst_tmp_buf.
      "mov r3, %[dst_tmp_buf]\n"
      "vst1.8 {d30}, [r3]\n"
      "mov r6, #0\n"
      "50:\n"
      "mov r8, #0\n"
      "51:\n"
      "ldrb r10, [r3, r8]\n"
      "strb r10, [r4, r8]\n"
      "add r8, r8, #1\n"
      "cmp r8, r1\n"
      "blt 51b\n"
      "add r6, r6, #1\n"
      "add r3, r3, #4\n"
      "add r4, r4, r5\n"
      "cmp r6, r2\n"
      "blt 50b\n"
      "b 31f\n"
      "30:\n"
      // Full block: one 4-byte store per column.
      "ldr r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "mov r4, r3\n"
      "mov r6, #1\n"
      "vst1.32 {d30[0]}, [r3]\n"
      "add r4, r4, r5\n"
      "mov r3, r4\n"
      "vst1.32 {d30[1]}, [r3]\n"
      "31:\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "add r4, r4, #4\n"
      "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      RUY_MAKE_ZERO(q13)
      RUY_MAKE_ZERO(q14)
      RUY_MAKE_ZERO(q15)
      "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"

      // ---------------- int16 destination ----------------
      RUY_STR(RUY_ASM_LABEL_STORE_INT16) ":\n"
      // Add dst_zero_point (loaded as a signed halfword) in 32 bit, then
      // saturating-narrow to int16: d28 = column 0, d29 = column 1.
      "ldrsh r4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
      "vdup.32 q13, r4\n"
      "vadd.s32 q14, q14, q13\n"
      "vadd.s32 q15, q15, q13\n"
      "vqmovn.s32 d28, q14\n"
      "vqmovn.s32 d29, q15\n"
      RUY_MAKE_ZERO(q6)
      RUY_MAKE_ZERO(q7)
      RUY_MAKE_ZERO(q8)
      RUY_MAKE_ZERO(q9)
      RUY_MAKE_ZERO(q10)
      RUY_MAKE_ZERO(q11)
      RUY_MAKE_ZERO(q15)
      // Clamp using the low halfword of clamp_min / clamp_max.
      "ldrh r2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
      "ldrh r3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
      "vdup.16 q12, r2\n"
      "vdup.16 q13, r3\n"
      "vmax.s16 q14, q14, q12\n"
      "vmin.s16 q14, q14, q13\n"
      RUY_MAKE_ZERO(q12)
      RUY_MAKE_ZERO(q13)
      // r1 = min(dst_rows - row, 4), r2 = min(dst_cols - col, 2).
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_ROWS) "]\n"
      "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "sub r1, r1, r8\n"
      "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_COLS) "]\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "sub r2, r2, r4\n"
      "mov r3, #4\n"
      "mov r5, #2\n"
      "cmp r1, #4\n"
      "it gt\n"
      "movgt r1, r3\n"
      "cmp r2, #2\n"
      "it gt\n"
      "movgt r2, r5\n"
      "cmp r1, r3\n"
      "it eq\n"
      "cmpeq r2, r5\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
      "beq 30f\n"
      // Partial block: spill q14 to dst_tmp_buf and copy halfword by
      // halfword. r8 is the row index; it is temporarily doubled to form
      // the byte offset. Source column stride is 8 bytes.
      "mov r3, %[dst_tmp_buf]\n"
      "vst1.16 {q14}, [r3]\n"
      "mov r6, #0\n"
      "50:\n"
      "mov r8, #0\n"
      "51:\n"
      "lsl r8, r8, #1\n"
      "ldrh r10, [r3, r8]\n"
      "strh r10, [r4, r8]\n"
      "lsr r8, r8, #1\n"
      "add r8, r8, #1\n"
      "cmp r8, r1\n"
      "blt 51b\n"
      "add r6, r6, #1\n"
      "add r3, r3, #8\n"
      "add r4, r4, r5\n"
      "cmp r6, r2\n"
      "blt 50b\n"
      "b 31f\n"
      "30:\n"
      // Full block: store the lanes one at a time, post-incrementing the
      // address by r6 = 2 bytes; r4 tracks the start of column 1.
      "ldr r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "mov r4, r3\n"
      "mov r6, #2\n"
      "vst1.16 {d28[0]}, [r3], r6\n"
      "add r4, r4, r5\n"
      "vst1.16 {d28[1]}, [r3], r6\n"
      "vst1.16 {d28[2]}, [r3], r6\n"
      "vst1.16 {d28[3]}, [r3], r6\n"
      "mov r3, r4\n"
      "vst1.16 {d29[0]}, [r3], r6\n"
      "vst1.16 {d29[1]}, [r3], r6\n"
      "vst1.16 {d29[2]}, [r3], r6\n"
      "vst1.16 {d29[3]}, [r3], r6\n"
      "31:\n"
      // Advance the destination pointer by 4 two-byte rows.
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "add r4, r4, #8\n"
      "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      RUY_MAKE_ZERO(q14)
      "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"

      // ---------------- int32 destination ----------------
      // The raw accumulators in q14 / q15 are stored without rescaling.
      RUY_STR(RUY_ASM_LABEL_STORE_INT32) ":\n"
      RUY_MAKE_ZERO(q6)
      RUY_MAKE_ZERO(q7)
      RUY_MAKE_ZERO(q8)
      RUY_MAKE_ZERO(q9)
      RUY_MAKE_ZERO(q10)
      RUY_MAKE_ZERO(q11)
      RUY_MAKE_ZERO(q12)
      RUY_MAKE_ZERO(q13)
      // r1 = min(dst_rows - row, 4), r2 = min(dst_cols - col, 2).
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_ROWS) "]\n"
      "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "sub r1, r1, r8\n"
      "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_COLS) "]\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "sub r2, r2, r4\n"
      "mov r3, #4\n"
      "mov r5, #2\n"
      "cmp r1, #4\n"
      "it gt\n"
      "movgt r1, r3\n"
      "cmp r2, #2\n"
      "it gt\n"
      "movgt r2, r5\n"
      "cmp r1, r3\n"
      "it eq\n"
      "cmpeq r2, r5\n"
      "beq 30f\n"
      // Partial block: store into dst_tmp_buf with a 16-byte column stride.
      "mov r3, %[dst_tmp_buf]\n"
      "mov r4, #16\n"
      "b 31f\n"
      "30:\n"
      // Full block: store straight to the destination using dst_stride.
      "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
      "ldr r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "mov r4, r5\n"
      "31:\n"
      "vst1.32 {d28, d29}, [r3]\n"
      "add r3, r3, r4\n"
      "vst1.32 {d30, d31}, [r3]\n"
      // This branch reuses the condition flags of the "block fits" test
      // above; the stores and the add in between do not modify them.
      "beq 41f\n"
      // Partial block: copy r1 rows x r2 columns word by word from
      // dst_tmp_buf to the destination. r6 counts columns, r5 counts rows.
      "ldr r8, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
      "mov r3, %[dst_tmp_buf]\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "mov r6, #0\n"
      "50:\n"
      "mov r5, #0\n"
      "51:\n"
      "ldr r10, [r3, r5, lsl #2]\n"
      "str r10, [r4, r5, lsl #2]\n"
      "add r5, r5, #1\n"
      "cmp r5, r1\n"
      "blt 51b\n"
      "add r6, r6, #1\n"
      "add r3, r3, #16\n"
      "add r4, r4, r8\n"
      "cmp r6, r2\n"
      "blt 50b\n"
      "41:\n"
      // Advance the destination pointer by 4 four-byte rows.
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "add r4, r4, #16\n"
      "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      RUY_MAKE_ZERO(q10)
      RUY_MAKE_ZERO(q11)
      "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"

      // ---------------- common tail ----------------
      RUY_STR(RUY_ASM_LABEL_AFTER_STORE) ":\n"
      // Advance (row, col) to the next block.
      "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "ldr r6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
      "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
      "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "cmp r8, r3\n"
      "beq 20f\n"
      // Not the last row block: row += 4.
      "add r8, r8, #4\n"
      "str r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "b 21f\n"
      "20:\n"
      // Last row block: row = start_row, col += 2, and both destination
      // pointers move to the top of the next column of blocks
      // (dst_col_ptr += 2 * dst_stride).
      "str r6, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "add r4, r4, #2\n"
      "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "ldr r8, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
      "ldr r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_COL_PTR) "]\n"
      "add r1, r1, r8, lsl #1\n"
      "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_COL_PTR) "]\n"
      "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "21:\n"

      // Continue while col <= last_col. r1 is reset to 16 because the first
      // 16 depth levels of the next block have already been loaded (mov
      // leaves the flags of the cmp intact).
      "ldr r4, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
      "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "cmp r8, r4\n"
      "mov r1, #16\n"
      "ble 1b\n"

      // Release the stack scratch area.
      "add sp, sp, #" RUY_STR(RUY_STACK_OFFSET_SIZE) "\n"
      : [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr)
      // The asm dereferences %[params] as a pointer to the params struct, so
      // the operand is the address of `params`.
      : [ params ] "r"(&params), [dst_tmp_buf] "r"(params.dst_tmp_buf)
      // Every q register written by the asm is listed, including q11, which
      // serves both as an accumulator and as a temporary.
      : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r8", "r10", "cc",
        "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
// 8-bit quantized kernel for 32-bit ARM NEON, single-destination-column
// variant.
//
// Each pass of the main loop produces one block of 4 destination values:
//   * Depth loop: per 16 levels of depth it reads 64 bytes of LHS (four
//     16-byte groups in d0-d7) and 16 bytes of RHS (d8-d9), then skips the
//     next 16 RHS bytes. Presumably the RHS is packed two columns wide and
//     only the first column is consumed here -- confirm against the packing
//     code.
//   * Products are accumulated as int32 in q6-q9 and reduced to 4 int32
//     values in q14 (d28, d29).
//   * Bias, prod_zp_depth and the optional lhs_sums / rhs_sums zero-point
//     corrections are applied to q14.
//   * Unless the destination type is int32, the value is rescaled by the
//     fixed-point multiplier / exponent, the destination zero point is added
//     and the result is clamped to [clamp_min, clamp_max].
//   * The 4 values are stored contiguously at dst_ptr. When fewer than 4 rows
//     remain, they are staged in params.dst_tmp_buf and only the rows that
//     fit are copied out.
//
// Loop state (dst_col_ptr, dst_ptr, row, col, lhs_col_ptr, rhs_col_ptr) lives
// in a stack frame of RUY_STACK_OFFSET_SIZE bytes that the asm reserves on
// entry and releases on exit.
//
// params: kernel parameters; the asm reads its fields through the
//         RUY_OFFSET_* byte offsets. The RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL
//         flag must not be set (checked by RUY_DCHECK below).
void Kernel8bitNeon1Col(const KernelParams8bit<4, 2>& params) {
  profiler::ScopeLabel label("Kernel (kNeon)");

  // Presumably static-asserts that the RUY_OFFSET_* macros match the struct
  // layout, like the Float32 variant at the top of the file -- confirm.
  CheckOffsetsInKernelParams8bit(params);

  const std::int8_t* lhs_col_ptr = params.lhs_base_ptr;
  const std::int8_t* rhs_col_ptr =
      static_cast<const int8_t*>(params.rhs_base_ptr);
  const std::int8_t* lhs_ptr = lhs_col_ptr;
  const std::int8_t* rhs_ptr = rhs_col_ptr;

  // Bias / multiplier pointers below are always offset by the current row.
  RUY_DCHECK(!(params.flags & RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL));

  asm volatile(
#define RUY_MAKE_ZERO …
      // Load the first 16 levels of depth: 64 bytes of LHS, 16 bytes of RHS.
      "vld1.8 {d0, d1}, [%[lhs_ptr]]!\n"
      "vld1.8 {d2, d3}, [%[lhs_ptr]]!\n"
      "vld1.8 {d4, d5}, [%[lhs_ptr]]!\n"
      "vld1.8 {d6, d7}, [%[lhs_ptr]]!\n"
      "vld1.8 {d8, d9}, [%[rhs_ptr]]!\n"
      // Skip the 16 RHS bytes that follow the ones just loaded.
      "add %[rhs_ptr], %[rhs_ptr], #16\n"

      // Reserve the stack frame that holds the loop state.
      "sub sp, sp, #" RUY_STR(RUY_STACK_OFFSET_SIZE) "\n"

      // dst_col_ptr = dst_ptr = dst_base_ptr.
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_BASE_PTR) "]\n"
      "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_COL_PTR) "]\n"
      "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_BASE_PTR) "]\n"
      "str r2, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"

      // row = start_row, col = start_col.
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
      "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_START_COL) "]\n"
      "str r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"

      // lhs_col_ptr = lhs_base_ptr, rhs_col_ptr = rhs_base_ptr.
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
      "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_RHS_BASE_PTR) "]\n"
      "str r2, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"

      // Clear the accumulators and scratch registers.
      RUY_MAKE_ZERO(q6)
      RUY_MAKE_ZERO(q7)
      RUY_MAKE_ZERO(q8)
      RUY_MAKE_ZERO(q9)
      RUY_MAKE_ZERO(q10)
      RUY_MAKE_ZERO(q11)
      RUY_MAKE_ZERO(q12)
      RUY_MAKE_ZERO(q13)
      RUY_MAKE_ZERO(q14)
      RUY_MAKE_ZERO(q15)

      // r1 = levels of depth already loaded (16, from the loads above).
      "mov r1, #16\n"

      // ---- Main loop over destination blocks. ----
      "1:\n"

      // r10 = total depth. If everything is already loaded, skip the loop.
      "ldr r10, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"
      "cmp r1, r10\n"
      "beq 79f\n"

      // ---- Depth loop: 16 levels per iteration. ----
      "2:\n"

      // Widening int8 multiplies into int16 lanes (q14, q15), then pairwise
      // add-accumulate into the int32 accumulators q6, q7.
      "vmull.s8 q14, d0, d8\n"
      "vmull.s8 q15, d2, d8\n"
      "vmlal.s8 q14, d1, d9\n"
      "vmlal.s8 q15, d3, d9\n"
      "vpadal.s16 q6, q14\n"
      "vpadal.s16 q7, q15\n"

      // Same for the third and fourth LHS groups, into q8, q9.
      "vmull.s8 q14, d4, d8\n"
      "vmull.s8 q15, d6, d8\n"
      "vmlal.s8 q14, d5, d9\n"
      "vmlal.s8 q15, d7, d9\n"
      "vpadal.s16 q8, q14\n"
      "vpadal.s16 q9, q15\n"

      // Load the next 16 levels of depth.
      "vld1.8 {d0, d1}, [%[lhs_ptr]]!\n"
      "vld1.8 {d2, d3}, [%[lhs_ptr]]!\n"
      "vld1.8 {d4, d5}, [%[lhs_ptr]]!\n"
      "vld1.8 {d6, d7}, [%[lhs_ptr]]!\n"
      RUY_PREFETCH_LOAD("pld [%[lhs_ptr]]\n")
      "vld1.8 {d8, d9}, [%[rhs_ptr]]!\n"
      "add %[rhs_ptr], %[rhs_ptr], #16\n"
      RUY_PREFETCH_LOAD("pld [%[rhs_ptr]]\n")

      // Advance the depth counter and loop while r1 < depth.
      "add r1, r1, #16\n"
      "cmp r1, r10\n"
      "blt 2b\n"

      "79:\n"

      // Multiply-accumulate the last 16 levels that were loaded.
      "vmull.s8 q14, d0, d8\n"
      "vmull.s8 q15, d2, d8\n"
      "vmlal.s8 q14, d1, d9\n"
      "vmlal.s8 q15, d3, d9\n"
      "vpadal.s16 q6, q14\n"
      "vpadal.s16 q7, q15\n"
      "vmull.s8 q14, d4, d8\n"
      "vmull.s8 q15, d6, d8\n"
      "vmlal.s8 q14, d5, d9\n"
      "vmlal.s8 q15, d7, d9\n"
      "vpadal.s16 q8, q14\n"
      "vpadal.s16 q9, q15\n"

      // Horizontal reduction: each of q6..q9 (4 x int32) collapses to a
      // single int32; the four results end up in q14 (d28, d29).
      "vpadd.i32 d0, d12, d13\n"
      "vpadd.i32 d1, d14, d15\n"
      "vpadd.i32 d2, d16, d17\n"
      "vpadd.i32 d3, d18, d19\n"
      "vpadd.i32 d28, d0, d1\n"
      "vpadd.i32 d29, d2, d3\n"

      // Compute the LHS/RHS data pointers for the next block. The row/col
      // counters themselves are only updated after the store.
      "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
      "ldr r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "cmp r1, r3\n"
      "bge 4f\n"
      // row < last_row: lhs_col_ptr += 4 * lhs_stride.
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
      "add r4, r4, r1, lsl #2\n"
      "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
      "b 5f\n"
      "4:\n"
      // row >= last_row: rewind lhs_col_ptr to lhs_base_ptr.
      "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "str r5, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
      // If col >= last_col, leave rhs_col_ptr unchanged.
      "ldr r4, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
      "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "cmp r8, r4\n"
      "bge 5f\n"
      // Otherwise rhs_col_ptr += 2 * rhs_stride.
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
      "ldr r10, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
      "add r10, r10, r1, lsl #1\n"
      "str r10, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
      "5:\n"

      // Point lhs_ptr / rhs_ptr at the data for the next block.
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
      "mov %[lhs_ptr], r4\n"
      "ldr r5, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
      "mov %[rhs_ptr], r5\n"

      // r4 = flags, r1 = bias pointer, r8 = current row.
      "ldrb r4, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
      "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      // With HAS_BIAS the pointer is offset by row * 4 bytes; without it the
      // base pointer is read as-is (presumably a zero-filled buffer --
      // confirm against the caller).
      "tst r4, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
      "beq 1000f\n"
      "add r1, r1, r8, lsl #2\n"
      "1000:\n"

      // q12 = 4 int32 bias values.
      "vld1.32 {d24, d25}, [r1]\n"

      // d0-d9 are free now: start loading the first 16 levels of depth of
      // the next block.
      "vld1.8 {d0, d1}, [%[lhs_ptr]]!\n"
      "vld1.8 {d2, d3}, [%[lhs_ptr]]!\n"
      "vld1.8 {d4, d5}, [%[lhs_ptr]]!\n"
      "vld1.8 {d6, d7}, [%[lhs_ptr]]!\n"
      RUY_PREFETCH_LOAD("pld [%[lhs_ptr]]\n")
      "vld1.8 {d8, d9}, [%[rhs_ptr]]!\n"
      "add %[rhs_ptr], %[rhs_ptr], #16\n"
      RUY_PREFETCH_LOAD("pld [%[rhs_ptr]]\n")

      // bias += prod_zp_depth; accumulators += bias.
      "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_PROD_ZP_DEPTH) "]\n"
      "vdup.32 q9, r3\n"
      "vadd.i32 q12, q12, q9\n"
      "vadd.i32 q14, q14, q12\n"

      // With HAS_RHS_SUMS: accumulators -= lhs_zero_point * rhs_sums[col].
      "ldrb r6, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
      "tst r6, #" RUY_STR(RUY_ASM_FLAG_HAS_RHS_SUMS) "\n"
      "beq 401f\n"
      "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_RHS_SUMS) "]\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "add r3, r3, r4, lsl #2\n"
      "vld1.32 { d12 }, [r3]\n"
      "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_ZERO_POINT) "]\n"
      "vdup.32 q10, r5\n"
      "vmls.i32 q14, q10, d12[0]\n"
      "401:\n"

      // With HAS_LHS_SUMS: accumulators -= rhs_zero_point * lhs_sums[row..].
      "ldrb r6, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
      "tst r6, #" RUY_STR(RUY_ASM_FLAG_HAS_LHS_SUMS) "\n"
      "beq 402f\n"
      "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_LHS_SUMS) "]\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "add r2, r2, r4, lsl #2\n"
      "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_RHS_ZERO_POINT) "]\n"
      "vld1.32 {d22, d23}, [r2]\n"
      "vdup.32 d13, r5\n"
      "vmul.i32 q11, q11, d13[1]\n"
      "vsub.s32 q14, q14, q11\n"

      // An int32 destination receives the raw accumulators: skip rescaling.
      "ldrb r10, [%[params], #" RUY_STR(RUY_OFFSET_DST_TYPE_ID) "]\n"
      "cmp r10, #" RUY_STR(RUY_ASM_TYPE_ID_INT32) "\n"
      "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT32) "f\n"

      // NOTE(review): when HAS_LHS_SUMS is clear, the branch above lands
      // here directly and the int32 destination check is skipped. Confirm
      // that flag is always set whenever the destination type is int32.
      "402:\n"

      // r1 = multiplier_exponent pointer, offset by row * 4 bytes when
      // HAS_PERCHANNEL is set (r6 = flags, r4 = row).
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_EXPONENT) "]\n"
      "tst r6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "beq 1001f\n"
      "add r1, r1, r4, lsl #2\n"
      "1001:\n"

      // q10 = exponents. Split each into q13 = min(exponent, -1), applied
      // below as a rounding shift, and q12 = exponent - q13 (>= 0), applied
      // as a plain left shift.
      "vld1.32 {q10}, [r1]\n"
      "vmvn.i32 q8, #0\n"
      "vmin.s32 q13, q10, q8\n"
      "vsub.s32 q12, q10, q13\n"
      "vshl.s32 q14, q14, q12\n"

      // r1 = multiplier_fixedpoint pointer, with the same per-channel offset.
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_FIXEDPOINT) "]\n"
      "tst r6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
      "beq 1002f\n"
      "add r1, r1, r4, lsl #2\n"
      "1002:\n"
      "vld1.32 {q10}, [r1]\n"

      // Saturating doubling high multiply by the fixed-point multiplier,
      // then the rounding shift by q13.
      "vqdmulh.s32 q14, q14, q10\n"
      "vrshl.s32 q14, q14, q13\n"

      // Dispatch on the destination type; uint8 is the fall-through case.
      "ldrb r10, [%[params], #" RUY_STR(RUY_OFFSET_DST_TYPE_ID) "]\n"
      "cmp r10, #" RUY_STR(RUY_ASM_TYPE_ID_INT16) "\n"
      "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT16) "f\n"
      "cmp r10, #" RUY_STR(RUY_ASM_TYPE_ID_INT8) "\n"
      "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT8) "f\n"

      // ---- uint8 destination. ----
      RUY_STR(RUY_ASM_LABEL_STORE_UINT8) ":\n"

      // Saturating narrow int32 -> int16; the 4 values are now in d28.
      "vqmovn.s32 d28, q14\n"

      // Clear accumulators / scratch for the next block.
      RUY_MAKE_ZERO(q6)
      RUY_MAKE_ZERO(q7)
      RUY_MAKE_ZERO(q8)
      RUY_MAKE_ZERO(q9)
      RUY_MAKE_ZERO(q10)
      RUY_MAKE_ZERO(q11)
      RUY_MAKE_ZERO(q12)
      RUY_MAKE_ZERO(q13)
      RUY_MAKE_ZERO(q15)

      // Add dst_zero_point (saturating, int16 lanes).
      "ldr r4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
      "vdup.16 q13, r4\n"
      "vqadd.s16 q14, q14, q13\n"

      // Saturating narrow int16 -> uint8 into d30.
      "vqmovun.s16 d30, q14\n"

      // Clamp to [clamp_min, clamp_max] as unsigned bytes.
      "ldrb r2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
      "ldrb r3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
      "vdup.8 d28, r2\n"
      "vdup.8 d29, r3\n"
      "vmax.u8 d30, d30, d28\n"
      "vmin.u8 d30, d30, d29\n"

      // r1 = min(dst_rows - row, 4): number of rows of this block that fit.
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_ROWS) "]\n"
      "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "sub r1, r1, r8\n"
      "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_COLS) "]\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "sub r2, r2, r4\n"
      "mov r3, #4\n"
      "mov r5, #2\n"
      "cmp r1, #4\n"
      "it gt\n"
      "movgt r1, r3\n"
      // All 4 rows fit -> fast path at 30.
      "cmp r1, r3\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
      "beq 30f\n"
      // Partial block: stage in dst_tmp_buf, then copy r1 bytes to dst.
      "mov r3, %[dst_tmp_buf]\n"
      "vst1.8 {d30}, [r3]\n"
      "50:\n"
      "mov r8, #0\n"
      "51:\n"
      "ldrb r10, [r3, r8]\n"
      "strb r10, [r4, r8]\n"
      "add r8, r8, #1\n"
      "cmp r8, r1\n"
      "blt 51b\n"
      "b 31f\n"
      "30:\n"
      // Full block: store 4 consecutive bytes at dst_ptr.
      "ldr r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "mov r4, r3\n"
      "mov r6, #1\n"
      "vst1.8 {d30[0]}, [r3], r6\n"
      "vst1.8 {d30[1]}, [r3], r6\n"
      "vst1.8 {d30[2]}, [r3], r6\n"
      "vst1.8 {d30[3]}, [r3], r6\n"
      "31:\n"

      // dst_ptr += 4 bytes (4 one-byte values).
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "add r4, r4, #4\n"
      "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"

      RUY_MAKE_ZERO(q13)
      RUY_MAKE_ZERO(q14)
      RUY_MAKE_ZERO(q15)

      "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"

      // ---- int8 destination. ----
      RUY_STR(RUY_ASM_LABEL_STORE_INT8) ":\n"

      // Saturating narrow int32 -> int16; the 4 values are now in d28.
      "vqmovn.s32 d28, q14\n"

      // Clear accumulators / scratch for the next block.
      RUY_MAKE_ZERO(q6)
      RUY_MAKE_ZERO(q7)
      RUY_MAKE_ZERO(q8)
      RUY_MAKE_ZERO(q9)
      RUY_MAKE_ZERO(q10)
      RUY_MAKE_ZERO(q11)
      RUY_MAKE_ZERO(q12)
      RUY_MAKE_ZERO(q13)
      RUY_MAKE_ZERO(q15)

      // Add dst_zero_point (saturating, int16 lanes).
      "ldr r4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
      "vdup.16 q13, r4\n"
      "vqadd.s16 q14, q14, q13\n"

      // Saturating narrow int16 -> int8 into d30.
      "vqmovn.s16 d30, q14\n"

      // Clamp to [clamp_min, clamp_max] as signed bytes.
      "ldrb r2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
      "ldrb r3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
      "vdup.8 d28, r2\n"
      "vdup.8 d29, r3\n"
      "vmax.s8 d30, d30, d28\n"
      "vmin.s8 d30, d30, d29\n"

      // r1 = min(dst_rows - row, 4): number of rows of this block that fit.
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_ROWS) "]\n"
      "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "sub r1, r1, r8\n"
      "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_COLS) "]\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "sub r2, r2, r4\n"
      "mov r3, #4\n"
      "mov r5, #2\n"
      "cmp r1, #4\n"
      "it gt\n"
      "movgt r1, r3\n"
      // All 4 rows fit -> fast path at 30.
      "cmp r1, r3\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
      "beq 30f\n"
      // Partial block: stage in dst_tmp_buf, then copy r1 bytes to dst.
      "mov r3, %[dst_tmp_buf]\n"
      "vst1.8 {d30}, [r3]\n"
      "50:\n"
      "mov r8, #0\n"
      "51:\n"
      "ldrb r10, [r3, r8]\n"
      "strb r10, [r4, r8]\n"
      "add r8, r8, #1\n"
      "cmp r8, r1\n"
      "blt 51b\n"
      "b 31f\n"
      "30:\n"
      // Full block: store 4 consecutive bytes at dst_ptr.
      "ldr r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "mov r4, r3\n"
      "mov r6, #1\n"
      "vst1.8 {d30[0]}, [r3], r6\n"
      "vst1.8 {d30[1]}, [r3], r6\n"
      "vst1.8 {d30[2]}, [r3], r6\n"
      "vst1.8 {d30[3]}, [r3], r6\n"
      "31:\n"

      // dst_ptr += 4 bytes (4 one-byte values).
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "add r4, r4, #4\n"
      "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"

      RUY_MAKE_ZERO(q13)
      RUY_MAKE_ZERO(q14)
      RUY_MAKE_ZERO(q15)

      "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"

      // ---- int16 destination. ----
      RUY_STR(RUY_ASM_LABEL_STORE_INT16) ":\n"

      // Add dst_zero_point (sign-extended halfword) in int32 lanes.
      "ldrsh r4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
      "vdup.32 q13, r4\n"
      "vadd.s32 q14, q14, q13\n"

      // Saturating narrow int32 -> int16; the 4 values are now in d28.
      "vqmovn.s32 d28, q14\n"

      // Clear accumulators / scratch for the next block.
      RUY_MAKE_ZERO(q6)
      RUY_MAKE_ZERO(q7)
      RUY_MAKE_ZERO(q8)
      RUY_MAKE_ZERO(q9)
      RUY_MAKE_ZERO(q10)
      RUY_MAKE_ZERO(q11)
      RUY_MAKE_ZERO(q15)

      // Clamp to [clamp_min, clamp_max] as signed halfwords.
      "ldrh r2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
      "ldrh r3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
      "vdup.16 d24, r2\n"
      "vdup.16 d26, r3\n"
      "vmax.s16 d28, d28, d24\n"
      "vmin.s16 d28, d28, d26\n"

      RUY_MAKE_ZERO(q12)
      RUY_MAKE_ZERO(q13)

      // r1 = min(dst_rows - row, 4): number of rows of this block that fit.
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_ROWS) "]\n"
      "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "sub r1, r1, r8\n"
      "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_COLS) "]\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "sub r2, r2, r4\n"
      "mov r3, #4\n"
      "mov r5, #2\n"
      "cmp r1, #4\n"
      "it gt\n"
      "movgt r1, r3\n"
      // All 4 rows fit -> fast path at 30.
      "cmp r1, r3\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
      "beq 30f\n"
      // Partial block: stage in dst_tmp_buf, then copy r1 halfwords to dst.
      "mov r3, %[dst_tmp_buf]\n"
      "vst1.16 {d28}, [r3]\n"
      "50:\n"
      "mov r8, #0\n"
      "51:\n"
      // r8 is an element index: scale it to a byte offset for the halfword
      // load/store, then scale it back before incrementing.
      "lsl r8, r8, #1\n"
      "ldrh r10, [r3, r8]\n"
      "strh r10, [r4, r8]\n"
      "lsr r8, r8, #1\n"
      "add r8, r8, #1\n"
      "cmp r8, r1\n"
      "blt 51b\n"
      "b 31f\n"
      "30:\n"
      // Full block: store 4 consecutive halfwords at dst_ptr.
      "ldr r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "mov r4, r3\n"
      "mov r6, #2\n"
      "vst1.16 {d28[0]}, [r3], r6\n"
      "vst1.16 {d28[1]}, [r3], r6\n"
      "vst1.16 {d28[2]}, [r3], r6\n"
      "vst1.16 {d28[3]}, [r3], r6\n"
      "31:\n"

      // dst_ptr += 8 bytes (4 two-byte values).
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "add r4, r4, #8\n"
      "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"

      RUY_MAKE_ZERO(q14)

      "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"

      // ---- int32 destination: raw accumulators, no narrowing or clamp. ----
      RUY_STR(RUY_ASM_LABEL_STORE_INT32) ":\n"

      // Clear accumulators / scratch for the next block.
      RUY_MAKE_ZERO(q6)
      RUY_MAKE_ZERO(q7)
      RUY_MAKE_ZERO(q8)
      RUY_MAKE_ZERO(q9)
      RUY_MAKE_ZERO(q10)
      RUY_MAKE_ZERO(q11)
      RUY_MAKE_ZERO(q12)
      RUY_MAKE_ZERO(q13)

      // r1 = min(dst_rows - row, 4): number of rows of this block that fit.
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_ROWS) "]\n"
      "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "sub r1, r1, r8\n"
      "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_COLS) "]\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "sub r2, r2, r4\n"
      "mov r3, #4\n"
      "mov r5, #2\n"
      "cmp r1, #4\n"
      "it gt\n"
      "movgt r1, r3\n"
      // All 4 rows fit -> write straight to dst (30). The flags from this
      // compare are reused by the "beq 41f" below; nothing in between
      // modifies them.
      "cmp r1, r3\n"
      "beq 30f\n"
      // Partial block: write target is dst_tmp_buf.
      "mov r3, %[dst_tmp_buf]\n"
      "mov r4, #16\n"
      "b 31f\n"
      "30:\n"
      // Full block: write target is dst_ptr.
      "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
      "ldr r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "mov r4, r5\n"
      "31:\n"

      // Store the 4 int32 values to the selected target.
      "vst1.32 {d28, d29}, [r3]\n"

      // Full block: already written to dst, skip the copy loop.
      "beq 41f\n"
      // Partial block: copy r1 words from dst_tmp_buf to dst.
      "ldr r8, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
      "mov r3, %[dst_tmp_buf]\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "50:\n"
      "mov r5, #0\n"
      "51:\n"
      "ldr r10, [r3, r5, lsl #2]\n"
      "str r10, [r4, r5, lsl #2]\n"
      "add r5, r5, #1\n"
      "cmp r5, r1\n"
      "blt 51b\n"
      "41:\n"

      // dst_ptr += 16 bytes (4 four-byte values).
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "add r4, r4, #16\n"
      "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"

      RUY_MAKE_ZERO(q10)
      RUY_MAKE_ZERO(q11)

      "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"

      // ---- Advance row / col to the next destination block. ----
      RUY_STR(RUY_ASM_LABEL_AFTER_STORE) ":\n"

      "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "ldr r6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
      "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"

      // row == last_row? Then wrap to the next column (20).
      "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "cmp r8, r3\n"
      "beq 20f\n"
      // Otherwise row += 4.
      "add r8, r8, #4\n"
      "str r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "b 21f\n"
      "20:\n"
      // row = start_row; col += 2.
      "str r6, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "add r4, r4, #2\n"
      "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      // dst_col_ptr += dst_stride (one stride, although col advanced by 2);
      // dst_ptr = dst_col_ptr.
      "ldr r8, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
      "ldr r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_COL_PTR) "]\n"
      "add r1, r1, r8\n"
      "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_COL_PTR) "]\n"
      "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "21:\n"

      // Continue the main loop while col <= last_col.
      "ldr r4, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
      "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "cmp r8, r4\n"
      // r1 = 16 levels of depth already loaded for the next block ("mov"
      // does not alter the flags set by the compare above).
      "mov r1, #16\n"
      "ble 1b\n"

      // Release the stack frame.
      "add sp, sp, #" RUY_STR(RUY_STACK_OFFSET_SIZE) "\n"

      : [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr)
      // The asm dereferences %[params] as a pointer to the params struct, so
      // the operand must be the address of `params`.
      : [ params ] "r"(&params), [dst_tmp_buf] "r"(params.dst_tmp_buf)
      // q11 is written above (RUY_MAKE_ZERO(q11), the {d22, d23} load and the
      // vmul in the LHS-sums path), so it must be declared as clobbered like
      // every other q register the asm touches.
      : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r8", "r10", "cc",
        "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
// Remove the kernel-params field offset macros (RUY_OFFSET_*) and the asm
// stack-frame slot macros (RUY_STACK_OFFSET_*) so they are no longer defined
// past this point.
#undef RUY_OFFSET_BIAS
#undef RUY_OFFSET_LHS_SUMS
#undef RUY_OFFSET_RHS_SUMS
#undef RUY_OFFSET_LHS_BASE_PTR
#undef RUY_OFFSET_MULTIPLIER_FIXEDPOINT
#undef RUY_OFFSET_MULTIPLIER_EXPONENT
#undef RUY_OFFSET_RHS_BASE_PTR
#undef RUY_OFFSET_DST_BASE_PTR
#undef RUY_OFFSET_LHS_ZERO_POINT
#undef RUY_OFFSET_RHS_ZERO_POINT
#undef RUY_OFFSET_DST_ZERO_POINT
#undef RUY_OFFSET_PROD_ZP_DEPTH
#undef RUY_OFFSET_START_ROW
#undef RUY_OFFSET_START_COL
#undef RUY_OFFSET_LAST_ROW
#undef RUY_OFFSET_LAST_COL
#undef RUY_OFFSET_DST_ROWS
#undef RUY_OFFSET_DST_COLS
#undef RUY_OFFSET_LHS_STRIDE
#undef RUY_OFFSET_RHS_STRIDE
#undef RUY_OFFSET_DST_STRIDE
#undef RUY_OFFSET_DEPTH
#undef RUY_OFFSET_CLAMP_MIN
#undef RUY_OFFSET_CLAMP_MAX
#undef RUY_OFFSET_FLAGS
#undef RUY_OFFSET_DST_TYPE_ID
#undef RUY_STACK_OFFSET_SIZE
#undef RUY_STACK_OFFSET_DST_COL_PTR
#undef RUY_STACK_OFFSET_DST_PTR
#undef RUY_STACK_OFFSET_ROW
#undef RUY_STACK_OFFSET_COL
#undef RUY_STACK_OFFSET_LHS_COL_PTR
#undef RUY_STACK_OFFSET_RHS_COL_PTR
#endif
}