#include <cstdint>
#include "ruy/asm_helpers.h"
#include "ruy/check_macros.h"
#include "ruy/kernel_arm.h"
#include "ruy/opt_set.h"
#include "ruy/platform.h"
#include "ruy/profiler/instrumentation.h"
namespace ruy {
#if RUY_PLATFORM_NEON_64 && RUY_OPT(ASM)
// Label names for the per-destination-type store paths inside the asm
// kernels below, plus the label where those paths rejoin. They are referenced
// as branch targets with an "f" (forward) suffix, i.e. as assembler local
// labels.
#define RUY_ASM_LABEL_STORE_UINT8 …
#define RUY_ASM_LABEL_STORE_INT8 …
#define RUY_ASM_LABEL_STORE_INT16 …
#define RUY_ASM_LABEL_STORE_INT32 …
#define RUY_ASM_LABEL_AFTER_STORE …
// Offsets of the kernel-params fields, used by the asm kernels as immediate
// offsets from the params pointer ("ldr ..., [%[params], #OFFSET]").
// CheckOffsetsInKernelParams8bit() static_asserts most of these against
// offsetof(); in the visible code RUY_OFFSET_RHS_BASE_PTR,
// RUY_OFFSET_DST_BASE_PTR, RUY_OFFSET_START_COL, RUY_OFFSET_DST_ROWS and
// RUY_OFFSET_DST_COLS are not covered by those asserts.
#define RUY_OFFSET_BIAS …
#define RUY_OFFSET_LHS_SUMS …
#define RUY_OFFSET_RHS_SUMS …
#define RUY_OFFSET_LHS_BASE_PTR …
#define RUY_OFFSET_MULTIPLIER_FIXEDPOINT …
#define RUY_OFFSET_MULTIPLIER_EXPONENT …
#define RUY_OFFSET_RHS_BASE_PTR …
#define RUY_OFFSET_DST_BASE_PTR …
#define RUY_OFFSET_LHS_ZERO_POINT …
#define RUY_OFFSET_RHS_ZERO_POINT …
#define RUY_OFFSET_DST_ZERO_POINT …
#define RUY_OFFSET_PROD_ZP_DEPTH …
#define RUY_OFFSET_START_ROW …
#define RUY_OFFSET_START_COL …
#define RUY_OFFSET_LAST_ROW …
#define RUY_OFFSET_LAST_COL …
#define RUY_OFFSET_DST_ROWS …
#define RUY_OFFSET_DST_COLS …
#define RUY_OFFSET_LHS_STRIDE …
#define RUY_OFFSET_RHS_STRIDE …
#define RUY_OFFSET_DST_STRIDE …
#define RUY_OFFSET_DEPTH …
#define RUY_OFFSET_CLAMP_MIN …
#define RUY_OFFSET_CLAMP_MAX …
#define RUY_OFFSET_FLAGS …
// Compile-time guard for the hand-written asm kernels: every RUY_OFFSET_*
// constant that the asm uses as an immediate offset from the params pointer
// must equal the real offset of the corresponding field of `Params`.
// The function has no runtime effect; instantiating it is what triggers the
// checks. The (unnamed) argument only serves to deduce `Params`.
template <typename Params>
void CheckOffsetsInKernelParams8bit(const Params&) {
  // Pointer-typed fields.
  static_assert(offsetof(Params, bias) == RUY_OFFSET_BIAS, "");
  static_assert(offsetof(Params, lhs_sums) == RUY_OFFSET_LHS_SUMS, "");
  static_assert(offsetof(Params, rhs_sums) == RUY_OFFSET_RHS_SUMS, "");
  static_assert(offsetof(Params, lhs_base_ptr) == RUY_OFFSET_LHS_BASE_PTR, "");
  static_assert(offsetof(Params, multiplier_fixedpoint) ==
                    RUY_OFFSET_MULTIPLIER_FIXEDPOINT,
                "");
  static_assert(
      offsetof(Params, multiplier_exponent) == RUY_OFFSET_MULTIPLIER_EXPONENT,
      "");

  // Zero points and the precomputed zero-point product term.
  static_assert(offsetof(Params, lhs_zero_point) == RUY_OFFSET_LHS_ZERO_POINT,
                "");
  static_assert(offsetof(Params, rhs_zero_point) == RUY_OFFSET_RHS_ZERO_POINT,
                "");
  static_assert(offsetof(Params, dst_zero_point) == RUY_OFFSET_DST_ZERO_POINT,
                "");
  static_assert(offsetof(Params, prod_zp_depth) == RUY_OFFSET_PROD_ZP_DEPTH,
                "");

  // Block bounds.
  static_assert(offsetof(Params, start_row) == RUY_OFFSET_START_ROW, "");
  static_assert(offsetof(Params, last_row) == RUY_OFFSET_LAST_ROW, "");
  static_assert(offsetof(Params, last_col) == RUY_OFFSET_LAST_COL, "");

  // Strides and depth.
  static_assert(offsetof(Params, lhs_stride) == RUY_OFFSET_LHS_STRIDE, "");
  static_assert(offsetof(Params, rhs_stride) == RUY_OFFSET_RHS_STRIDE, "");
  static_assert(offsetof(Params, dst_stride) == RUY_OFFSET_DST_STRIDE, "");
  static_assert(offsetof(Params, depth) == RUY_OFFSET_DEPTH, "");

  // Output clamping bounds and the flags byte.
  static_assert(offsetof(Params, clamp_min) == RUY_OFFSET_CLAMP_MIN, "");
  static_assert(offsetof(Params, clamp_max) == RUY_OFFSET_CLAMP_MAX, "");
  static_assert(offsetof(Params, flags) == RUY_OFFSET_FLAGS, "");
}
// 8-bit kernel for ARM64 NEON, implemented as a single inline-asm block.
//
// The asm walks over 4x4 destination blocks, starting at
// (params.start_row, params.start_col) and finishing once `col` exceeds
// params.last_col. For each block it:
//   1. accumulates int8*int8 products over the depth dimension into the
//      int32 accumulators v16..v31 (16 bytes of each LHS/RHS register per
//      depth step of 16),
//   2. reduces those to one int32 per destination element (v16..v19),
//   3. adds bias + prod_zp_depth and subtracts the rhs_sums/lhs_sums terms,
//   4. unless the destination is int32, applies the fixed-point multiplier
//      and exponent, adds dst_zero_point, narrows and clamps,
//   5. stores the block, going through params.dst_tmp_buf when fewer than
//      4 rows or 4 columns of the block fit in the destination.
//
// Long-lived general-purpose registers set up at the top of the asm:
//   x5 = lhs_base_ptr, w6 = start_row, w7 = last_row, w8 = last_col,
//   w9 = lhs_stride, w10 = rhs_stride, w11 = dst_stride, w12 = depth.
// x5, w6 and w7 are reused as scratch in the per-block epilogue and are
// reloaded after each store.
//
// NOTE(review): the "is the destination int32?" test sits before label 402,
// so it is skipped when RUY_ASM_FLAG_HAS_LHS_SUMS is clear. Confirm callers
// always set that flag for int32 destinations.
void Kernel8bitNeon(const KernelParams8bit<4, 4>& params) {
  profiler::ScopeLabel label("Kernel (kNeon)");
  // The asm below hard-codes field offsets; verify them at compile time.
  CheckOffsetsInKernelParams8bit(params);
  const std::int8_t* lhs_col_ptr = params.lhs_base_ptr;
  const std::int8_t* rhs_col_ptr =
      static_cast<const std::int8_t*>(params.rhs_base_ptr);
  const std::int8_t* lhs_ptr = lhs_col_ptr;
  const std::int8_t* rhs_ptr = rhs_col_ptr;
  void* dst_col_ptr = params.dst_base_ptr;
  void* dst_ptr = dst_col_ptr;
  int row = params.start_row;
  int col = params.start_col;
  asm volatile(
#define RUY_MAKE_ZERO …
      // Load loop-invariant parameters into registers.
      "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
      "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
      "ldr w8, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
      "ldr w9, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
      "ldr w10, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
      "ldr w11, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
      "ldr w12, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"

      // Load the first 64 bytes of LHS (v0..v3) and of RHS (v4..v7).
      "ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
      "ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
      "ld1 {v2.16b}, [%[lhs_ptr]], #16\n"
      "ld1 {v3.16b}, [%[lhs_ptr]], #16\n"
      "ld1 {v4.16b}, [%[rhs_ptr]], #16\n"
      "ld1 {v5.16b}, [%[rhs_ptr]], #16\n"
      "ld1 {v6.16b}, [%[rhs_ptr]], #16\n"
      "ld1 {v7.16b}, [%[rhs_ptr]], #16\n"

      // Clear the int32 accumulators v16..v31.
      RUY_MAKE_ZERO(v16)
      RUY_MAKE_ZERO(v17)
      RUY_MAKE_ZERO(v18)
      RUY_MAKE_ZERO(v19)
      RUY_MAKE_ZERO(v20)
      RUY_MAKE_ZERO(v21)
      RUY_MAKE_ZERO(v22)
      RUY_MAKE_ZERO(v23)
      RUY_MAKE_ZERO(v24)
      RUY_MAKE_ZERO(v25)
      RUY_MAKE_ZERO(v26)
      RUY_MAKE_ZERO(v27)
      RUY_MAKE_ZERO(v28)
      RUY_MAKE_ZERO(v29)
      RUY_MAKE_ZERO(v30)
      RUY_MAKE_ZERO(v31)

      // w1 is the depth counter compared against depth (w12); it starts at 16
      // to account for the data loaded above and advances by 16 per loop pass.
      "mov w1, #16\n"

      // First round of products for RHS registers v4 and v5: low 8 bytes via
      // smull, then high 8 bytes accumulated into the same int16 lanes via
      // smlal2. Results are pending in v8..v15.
      "smull v8.8h, v0.8b, v4.8b\n"
      "smull v9.8h, v1.8b, v4.8b\n"
      "smull v10.8h, v2.8b, v4.8b\n"
      "smull v11.8h, v3.8b, v4.8b\n"
      "smull v12.8h, v0.8b, v5.8b\n"
      "smull v13.8h, v1.8b, v5.8b\n"
      "smull v14.8h, v2.8b, v5.8b\n"
      "smull v15.8h, v3.8b, v5.8b\n"
      "smlal2 v8.8h, v0.16b, v4.16b\n"
      "smlal2 v9.8h, v1.16b, v4.16b\n"
      "smlal2 v10.8h, v2.16b, v4.16b\n"
      "smlal2 v11.8h, v3.16b, v4.16b\n"
      "smlal2 v12.8h, v0.16b, v5.16b\n"
      "smlal2 v13.8h, v1.16b, v5.16b\n"
      "smlal2 v14.8h, v2.16b, v5.16b\n"
      "smlal2 v15.8h, v3.16b, v5.16b\n"

      // Entry point for each destination block.
      "1:\n"
      // If all depth levels are already loaded (w1 == depth), skip the loop.
      "cmp w1, w12\n"
      "beq 79f\n"

      // Main depth loop.
      "2:\n"
      // Fold the pending int16 products for v4/v5 into v16..v23 (pairwise,
      // widening to int32) while computing the products for v6/v7 and
      // loading the next RHS data into v4/v5.
      "sadalp v16.4s, v8.8h\n"
      "ld1 {v4.16b}, [%[rhs_ptr]], #16\n"
      "smull v8.8h, v0.8b, v6.8b\n"
      "sadalp v17.4s, v9.8h\n"
      "ld1 {v5.16b}, [%[rhs_ptr]], #16\n"
      "smull v9.8h, v1.8b, v6.8b\n"
      "sadalp v18.4s, v10.8h\n"
      "smull v10.8h, v2.8b, v6.8b\n"
      "sadalp v19.4s, v11.8h\n"
      "smull v11.8h, v3.8b, v6.8b\n"
      "sadalp v20.4s, v12.8h\n"
      "smull v12.8h, v0.8b, v7.8b\n"
      "sadalp v21.4s, v13.8h\n"
      "smull v13.8h, v1.8b, v7.8b\n"
      "sadalp v22.4s, v14.8h\n"
      "smull v14.8h, v2.8b, v7.8b\n"
      "sadalp v23.4s, v15.8h\n"
      "smull v15.8h, v3.8b, v7.8b\n"
      // High halves for v6/v7, interleaved with loading the next LHS data
      // (v0..v3) and the next v6 once the old values are no longer needed.
      "smlal2 v8.8h, v0.16b, v6.16b\n"
      "smlal2 v9.8h, v1.16b, v6.16b\n"
      "smlal2 v10.8h, v2.16b, v6.16b\n"
      "smlal2 v11.8h, v3.16b, v6.16b\n"
      "ld1 {v6.16b}, [%[rhs_ptr]], #16\n"
      "smlal2 v12.8h, v0.16b, v7.16b\n"
      "ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
      "smlal2 v13.8h, v1.16b, v7.16b\n"
      "ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
      "smlal2 v14.8h, v2.16b, v7.16b\n"
      "ld1 {v2.16b}, [%[lhs_ptr]], #16\n"
      "smlal2 v15.8h, v3.16b, v7.16b\n"
      "ld1 {v3.16b}, [%[lhs_ptr]], #16\n"
      // Fold the v6/v7 products into v24..v31 and start the next round of
      // v4/v5 products on the freshly loaded data.
      "sadalp v24.4s, v8.8h\n"
      "smull v8.8h, v0.8b, v4.8b\n"
      "sadalp v25.4s, v9.8h\n"
      "ld1 {v7.16b}, [%[rhs_ptr]], #16\n"
      "smull v9.8h, v1.8b, v4.8b\n"
      "sadalp v26.4s, v10.8h\n"
      "smull v10.8h, v2.8b, v4.8b\n"
      "sadalp v27.4s, v11.8h\n"
      "smull v11.8h, v3.8b, v4.8b\n"
      "sadalp v28.4s, v12.8h\n"
      "smull v12.8h, v0.8b, v5.8b\n"
      "sadalp v29.4s, v13.8h\n"
      "smull v13.8h, v1.8b, v5.8b\n"
      "sadalp v30.4s, v14.8h\n"
      "smull v14.8h, v2.8b, v5.8b\n"
      "sadalp v31.4s, v15.8h\n"
      "smull v15.8h, v3.8b, v5.8b\n"
      "smlal2 v8.8h, v0.16b, v4.16b\n"
      "smlal2 v9.8h, v1.16b, v4.16b\n"
      "smlal2 v10.8h, v2.16b, v4.16b\n"
      "smlal2 v11.8h, v3.16b, v4.16b\n"
      "smlal2 v12.8h, v0.16b, v5.16b\n"
      "smlal2 v13.8h, v1.16b, v5.16b\n"
      "smlal2 v14.8h, v2.16b, v5.16b\n"
      "smlal2 v15.8h, v3.16b, v5.16b\n"
      // Advance the depth counter and loop while w1 < depth.
      "add w1, w1, #16\n"
      "cmp w1, w12\n"
      "blt 2b\n"

      // End of the depth loop: finish the last round without further loads.
      "79:\n"
      "sadalp v16.4s, v8.8h\n"
      "smull v8.8h, v0.8b, v6.8b\n"
      "sadalp v17.4s, v9.8h\n"
      "smull v9.8h, v1.8b, v6.8b\n"
      "sadalp v18.4s, v10.8h\n"
      "smull v10.8h, v2.8b, v6.8b\n"
      "sadalp v19.4s, v11.8h\n"
      "smull v11.8h, v3.8b, v6.8b\n"
      "sadalp v20.4s, v12.8h\n"
      "smull v12.8h, v0.8b, v7.8b\n"
      "sadalp v21.4s, v13.8h\n"
      "smull v13.8h, v1.8b, v7.8b\n"
      "sadalp v22.4s, v14.8h\n"
      "smull v14.8h, v2.8b, v7.8b\n"
      "sadalp v23.4s, v15.8h\n"
      "smull v15.8h, v3.8b, v7.8b\n"
      "smlal2 v8.8h, v0.16b, v6.16b\n"
      "smlal2 v9.8h, v1.16b, v6.16b\n"
      "smlal2 v10.8h, v2.16b, v6.16b\n"
      "smlal2 v11.8h, v3.16b, v6.16b\n"
      "smlal2 v12.8h, v0.16b, v7.16b\n"
      "smlal2 v13.8h, v1.16b, v7.16b\n"
      "smlal2 v14.8h, v2.16b, v7.16b\n"
      "smlal2 v15.8h, v3.16b, v7.16b\n"
      "sadalp v24.4s, v8.8h\n"
      "sadalp v25.4s, v9.8h\n"
      "sadalp v26.4s, v10.8h\n"
      "sadalp v27.4s, v11.8h\n"
      "sadalp v28.4s, v12.8h\n"
      "sadalp v29.4s, v13.8h\n"
      "sadalp v30.4s, v14.8h\n"
      "sadalp v31.4s, v15.8h\n"

      // Horizontal reduction: two levels of pairwise adds collapse each of
      // the 16 accumulators to a single int32 lane, leaving the 4x4 block in
      // v16..v19 (4 values per register).
      "addp v16.4s, v16.4s, v17.4s\n"
      "addp v18.4s, v18.4s, v19.4s\n"
      "addp v20.4s, v20.4s, v21.4s\n"
      "addp v22.4s, v22.4s, v23.4s\n"
      "addp v24.4s, v24.4s, v25.4s\n"
      "addp v26.4s, v26.4s, v27.4s\n"
      "addp v28.4s, v28.4s, v29.4s\n"
      "addp v30.4s, v30.4s, v31.4s\n"
      "addp v16.4s, v16.4s, v18.4s\n"
      "addp v17.4s, v20.4s, v22.4s\n"
      "addp v18.4s, v24.4s, v26.4s\n"
      "addp v19.4s, v28.4s, v30.4s\n"

      // Pick the LHS/RHS data for the *next* block so its loads can start
      // before the current block is stored.
      "cmp %w[row], w7\n"
      "bge 4f\n"
      // Not yet at last_row: advance the LHS pointer by 4 * lhs_stride.
      "add %[lhs_col_ptr], %[lhs_col_ptr], x9, lsl #2\n"
      "b 5f\n"
      "4:\n"
      // At last_row: rewind LHS to its base, and advance RHS by
      // 4 * rhs_stride unless this was also the last column.
      "mov %[lhs_col_ptr], x5\n"
      "cmp %w[col], w8\n"
      "bge 5f\n"
      "add %[rhs_col_ptr], %[rhs_col_ptr], x10, lsl #2\n"
      "5:\n"
      "mov %[lhs_ptr], %[lhs_col_ptr]\n"
      "mov %[rhs_ptr], %[rhs_col_ptr]\n"

      // Load the parameters needed to finish the current block.
      // v8 = all-ones, i.e. -1 in every int32 lane (used for the exponent).
      "mvni v8.4s, #0\n"
      "ldr w4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
      "ldr w3, [%[params], #" RUY_STR(RUY_OFFSET_PROD_ZP_DEPTH) "]\n"
      // Park dst_zero_point in v13.h[4] until the store paths need it.
      "ins v13.h[4], w4\n"
      "ldr x4, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_FIXEDPOINT) "]\n"
      // w6 now holds the flags byte (start_row is reloaded after the store).
      "ldrb w6, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
      "dup v9.4s, w3\n"
      "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
      // Channel index w3: row when the channel dimension is not the column
      // dimension, col otherwise.
      "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
      "csel w3, %w[row], %w[col], eq\n"
      // Offset the bias pointer by the channel index only when HAS_BIAS is
      // set; otherwise read from the unoffset pointer.
      "add x5, x1, x3, lsl #2\n"
      "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
      "csel x1, x1, x5, eq\n"
      "ld1 {v14.4s}, [x1]\n"
      // Same for multiplier_fixedpoint, gated on HAS_PERCHANNEL.
      "add x5, x4, x3, lsl #2\n"
      "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
      "csel x4, x4, x5, eq\n"
      "ld1 {v15.4s}, [x4]\n"

      // Start loading the next block's LHS/RHS data; v0..v7 are no longer
      // needed for the current block.
      "ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
      "ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
      "ld1 {v2.16b}, [%[lhs_ptr]], #16\n"
      "ld1 {v3.16b}, [%[lhs_ptr]], #16\n"
      "ld1 {v4.16b}, [%[rhs_ptr]], #16\n"
      "ld1 {v5.16b}, [%[rhs_ptr]], #16\n"
      "ld1 {v6.16b}, [%[rhs_ptr]], #16\n"
      "ld1 {v7.16b}, [%[rhs_ptr]], #16\n"

      // Fold prod_zp_depth into the bias values, then add them to the block.
      "add v14.4s, v14.4s, v9.4s\n"
      "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
      "bne 6f\n"
      // Channels are rows: the same 4 bias values apply to every register.
      "add v16.4s, v16.4s, v14.4s\n"
      "add v17.4s, v17.4s, v14.4s\n"
      "add v18.4s, v18.4s, v14.4s\n"
      "add v19.4s, v19.4s, v14.4s\n"
      "b 7f\n"
      "6:\n"
      // Channels are columns: broadcast bias lane k to register k.
      "dup v20.4s, v14.s[0]\n"
      "dup v21.4s, v14.s[1]\n"
      "dup v22.4s, v14.s[2]\n"
      "dup v23.4s, v14.s[3]\n"
      "add v16.4s, v16.4s, v20.4s\n"
      "add v17.4s, v17.4s, v21.4s\n"
      "add v18.4s, v18.4s, v22.4s\n"
      "add v19.4s, v19.4s, v23.4s\n"
      "7:\n"

      // If HAS_RHS_SUMS: subtract lhs_zero_point * rhs_sums[col + k] from
      // register k.
      "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_RHS_SUMS) "\n"
      "beq 401f\n"
      "ldr x3, [%[params], #" RUY_STR(RUY_OFFSET_RHS_SUMS) "]\n"
      "add x3, x3, %x[col], lsl #2\n"
      "ld1 {v14.4s}, [x3]\n"
      "ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_ZERO_POINT) "]\n"
      "dup v10.4s, w5\n"
      "mls v16.4s, v10.4s, v14.s[0]\n"
      "mls v17.4s, v10.4s, v14.s[1]\n"
      "mls v18.4s, v10.4s, v14.s[2]\n"
      "mls v19.4s, v10.4s, v14.s[3]\n"
      "401:\n"

      // If HAS_LHS_SUMS: subtract rhs_zero_point * lhs_sums[row .. row+3]
      // from every register.
      "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_LHS_SUMS) "\n"
      "beq 402f\n"
      "ldr x2, [%[params], #" RUY_STR(RUY_OFFSET_LHS_SUMS) "]\n"
      "add x2, x2, %x[row], lsl #2\n"
      "ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_RHS_ZERO_POINT) "]\n"
      "ld1 {v11.4s}, [x2]\n"
      "ins v13.s[1], w5\n"
      "mul v11.4s, v11.4s, v13.s[1]\n"
      "sub v16.4s, v16.4s, v11.4s\n"
      "sub v17.4s, v17.4s, v11.4s\n"
      "sub v18.4s, v18.4s, v11.4s\n"
      "sub v19.4s, v19.4s, v11.4s\n"
      // int32 destination: store the raw accumulators, skipping the
      // multiplier/clamp stage. (See the NOTE(review) in the function header
      // about this test being inside the HAS_LHS_SUMS branch.)
      "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT32) "\n"
      "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT32) "f\n"
      "402:\n"

      // Apply the multiplier. Load the exponent(s), offset by the channel
      // index when HAS_PERCHANNEL is set.
      "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_EXPONENT) "]\n"
      "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
      "csel w3, %w[row], %w[col], eq\n"
      "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
      "add x5, x1, x3, lsl #2\n"
      "csel x1, x1, x5, eq\n"
      "ld1 {v14.4s}, [x1]\n"
      // v11 = min(exponent, -1) is the (negative) right-shift amount and
      // v12 = exponent - v11 (>= 0) is the left-shift amount.
      "smin v11.4s, v8.4s, v14.4s\n"
      "sub v12.4s, v14.4s, v11.4s\n"
      "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
      "bne 8f\n"
      // Channels are rows: lane-wise shift-left, saturating doubling
      // multiply-high by the fixed-point multiplier, rounding shift-right.
      "sshl v16.4s, v16.4s, v12.4s\n"
      "sshl v17.4s, v17.4s, v12.4s\n"
      "sshl v18.4s, v18.4s, v12.4s\n"
      "sshl v19.4s, v19.4s, v12.4s\n"
      "sqdmulh v16.4s, v16.4s, v15.4s\n"
      "sqdmulh v17.4s, v17.4s, v15.4s\n"
      "sqdmulh v18.4s, v18.4s, v15.4s\n"
      "sqdmulh v19.4s, v19.4s, v15.4s\n"
      "srshl v16.4s, v16.4s, v11.4s\n"
      "srshl v17.4s, v17.4s, v11.4s\n"
      "srshl v18.4s, v18.4s, v11.4s\n"
      "srshl v19.4s, v19.4s, v11.4s\n"
      "b 9f\n"
      "8:\n"
      // Channels are columns: same three steps, with lane k of the shift and
      // multiplier vectors applied to register k.
      "dup v20.4s, v12.s[0]\n"
      "dup v21.4s, v12.s[1]\n"
      "dup v22.4s, v12.s[2]\n"
      "dup v23.4s, v12.s[3]\n"
      "sshl v16.4s, v16.4s, v20.4s\n"
      "sshl v17.4s, v17.4s, v21.4s\n"
      "sshl v18.4s, v18.4s, v22.4s\n"
      "sshl v19.4s, v19.4s, v23.4s\n"
      "sqdmulh v16.4s, v16.4s, v15.s[0]\n"
      "sqdmulh v17.4s, v17.4s, v15.s[1]\n"
      "sqdmulh v18.4s, v18.4s, v15.s[2]\n"
      "sqdmulh v19.4s, v19.4s, v15.s[3]\n"
      "dup v20.4s, v11.s[0]\n"
      "dup v21.4s, v11.s[1]\n"
      "dup v22.4s, v11.s[2]\n"
      "dup v23.4s, v11.s[3]\n"
      "srshl v16.4s, v16.4s, v20.4s\n"
      "srshl v17.4s, v17.4s, v21.4s\n"
      "srshl v18.4s, v18.4s, v22.4s\n"
      "srshl v19.4s, v19.4s, v23.4s\n"
      "9:\n"

      // Dispatch on destination type; uint8 is the fall-through case.
      "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT16) "\n"
      "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT16) "f\n"
      "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT8) "\n"
      "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT8) "f\n"

      // ---- uint8 store path ----
      RUY_STR(RUY_ASM_LABEL_STORE_UINT8) ":\n"
      // Saturating narrow int32 -> int16; the block now lives in v16, v17.
      "sqxtn v16.4h, v16.4s\n"
      "sqxtn2 v16.8h, v17.4s\n"
      "sqxtn v17.4h, v18.4s\n"
      "sqxtn2 v17.8h, v19.4s\n"
      // v18..v31 are free now: clear them for the next block.
      RUY_MAKE_ZERO(v18)
      RUY_MAKE_ZERO(v19)
      RUY_MAKE_ZERO(v20)
      RUY_MAKE_ZERO(v21)
      RUY_MAKE_ZERO(v22)
      RUY_MAKE_ZERO(v23)
      RUY_MAKE_ZERO(v24)
      RUY_MAKE_ZERO(v25)
      RUY_MAKE_ZERO(v26)
      RUY_MAKE_ZERO(v27)
      RUY_MAKE_ZERO(v28)
      RUY_MAKE_ZERO(v29)
      RUY_MAKE_ZERO(v30)
      RUY_MAKE_ZERO(v31)
      // Add dst_zero_point (saturating), then saturating narrow to uint8.
      "dup v14.8h, v13.h[4]\n"
      "sqadd v16.8h, v16.8h, v14.8h\n"
      "sqadd v17.8h, v17.8h, v14.8h\n"
      "sqxtun v16.8b, v16.8h\n"
      "sqxtun2 v16.16b, v17.8h\n"
      // Clamp to [clamp_min, clamp_max] (unsigned compare).
      "ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
      "ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
      "dup v14.16b, w2\n"
      "dup v15.16b, w3\n"
      "umax v16.16b, v16.16b, v14.16b\n"
      "umin v16.16b, v16.16b, v15.16b\n"
      // w1 = min(dst_rows - row, 4), w2 = min(dst_cols - col, 4): how much
      // of the 4x4 block fits in the destination.
      "sub w1, %w[dst_rows], %w[row]\n"
      "sub w2, %w[dst_cols], %w[col]\n"
      "mov w3, #4\n"
      "cmp w1, #4\n"
      "csel w1, w1, w3, le\n"
      "cmp w2, #4\n"
      "csel w2, w2, w3, le\n"
      // Take the fast path only if w1 == 4 && w2 == 4.
      "cmp w1, w3\n"
      "ccmp w2, w3, 0, eq\n"
      "mov x4, %[dst_ptr]\n"
      "beq 30f\n"
      // Partial block: spill to dst_tmp_buf, then copy w1 elements for each
      // of w2 columns (tmp column pitch 4 bytes, dst column pitch x11).
      "st1 {v16.16b}, [%[dst_tmp_buf]]\n"
      "mov x3, %[dst_tmp_buf]\n"
      "mov w6, #0\n"
      "50:\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov w5, #0\n"
      "51:\n"
      "ldrb w7, [x3, w5, uxtw]\n"
      "strb w7, [x4, w5, uxtw]\n"
      "add w5, w5, #1\n"
      "cmp w5, w1\n"
      "blt 51b\n"
      "add w6, w6, #1\n"
      "add x3, x3, #4\n"
      "add x4, x4, x11\n"
      "cmp w6, w2\n"
      "blt 50b\n"
      "b 31f\n"
      "30:\n"
      // Full block: store 4 bytes per column directly, stepping x4 by x11.
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v16.b}[0], [x3], #1\n"
      "add x4, x4, x11\n"
      "st1 {v16.b}[1], [x3], #1\n"
      "st1 {v16.b}[2], [x3], #1\n"
      "st1 {v16.b}[3], [x3], #1\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v16.b}[4], [x3], #1\n"
      "add x4, x4, x11\n"
      "st1 {v16.b}[5], [x3], #1\n"
      "st1 {v16.b}[6], [x3], #1\n"
      "st1 {v16.b}[7], [x3], #1\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v16.b}[8], [x3], #1\n"
      "add x4, x4, x11\n"
      "st1 {v16.b}[9], [x3], #1\n"
      "st1 {v16.b}[10], [x3], #1\n"
      "st1 {v16.b}[11], [x3], #1\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v16.b}[12], [x3], #1\n"
      "add x4, x4, x11\n"
      "st1 {v16.b}[13], [x3], #1\n"
      "st1 {v16.b}[14], [x3], #1\n"
      "st1 {v16.b}[15], [x3], #1\n"
      "31:\n"
      // Advance dst_ptr by 4 one-byte elements (next block of rows).
      "add %[dst_ptr], %[dst_ptr], #4\n"
      RUY_MAKE_ZERO(v16)
      RUY_MAKE_ZERO(v17)
      "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"

      // ---- int8 store path (same shape as uint8, signed narrow/clamp) ----
      RUY_STR(RUY_ASM_LABEL_STORE_INT8) ":\n"
      "sqxtn v16.4h, v16.4s\n"
      "sqxtn2 v16.8h, v17.4s\n"
      "sqxtn v17.4h, v18.4s\n"
      "sqxtn2 v17.8h, v19.4s\n"
      RUY_MAKE_ZERO(v18)
      RUY_MAKE_ZERO(v19)
      RUY_MAKE_ZERO(v20)
      RUY_MAKE_ZERO(v21)
      RUY_MAKE_ZERO(v22)
      RUY_MAKE_ZERO(v23)
      RUY_MAKE_ZERO(v24)
      RUY_MAKE_ZERO(v25)
      RUY_MAKE_ZERO(v26)
      RUY_MAKE_ZERO(v27)
      RUY_MAKE_ZERO(v28)
      RUY_MAKE_ZERO(v29)
      RUY_MAKE_ZERO(v30)
      RUY_MAKE_ZERO(v31)
      // Add dst_zero_point, then saturating narrow int16 -> int8.
      "dup v14.8h, v13.h[4]\n"
      "sqadd v16.8h, v16.8h, v14.8h\n"
      "sqadd v17.8h, v17.8h, v14.8h\n"
      "sqxtn v16.8b, v16.8h\n"
      "sqxtn2 v16.16b, v17.8h\n"
      // Clamp to [clamp_min, clamp_max] (signed compare).
      "ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
      "ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
      "dup v14.16b, w2\n"
      "dup v15.16b, w3\n"
      "smax v16.16b, v16.16b, v14.16b\n"
      "smin v16.16b, v16.16b, v15.16b\n"
      // w1/w2 = rows/cols of the block that fit; fast path iff both are 4.
      "sub w1, %w[dst_rows], %w[row]\n"
      "sub w2, %w[dst_cols], %w[col]\n"
      "mov w3, #4\n"
      "cmp w1, #4\n"
      "csel w1, w1, w3, le\n"
      "cmp w2, #4\n"
      "csel w2, w2, w3, le\n"
      "cmp w1, w3\n"
      "ccmp w2, w3, 0, eq\n"
      "mov x4, %[dst_ptr]\n"
      "beq 30f\n"
      // Partial block: go through dst_tmp_buf.
      "st1 {v16.16b}, [%[dst_tmp_buf]]\n"
      "mov x3, %[dst_tmp_buf]\n"
      "mov w6, #0\n"
      "50:\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov w5, #0\n"
      "51:\n"
      "ldrb w7, [x3, w5, uxtw]\n"
      "strb w7, [x4, w5, uxtw]\n"
      "add w5, w5, #1\n"
      "cmp w5, w1\n"
      "blt 51b\n"
      "add w6, w6, #1\n"
      "add x3, x3, #4\n"
      "add x4, x4, x11\n"
      "cmp w6, w2\n"
      "blt 50b\n"
      "b 31f\n"
      "30:\n"
      // Full block: direct stores.
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v16.b}[0], [x3], #1\n"
      "add x4, x4, x11\n"
      "st1 {v16.b}[1], [x3], #1\n"
      "st1 {v16.b}[2], [x3], #1\n"
      "st1 {v16.b}[3], [x3], #1\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v16.b}[4], [x3], #1\n"
      "add x4, x4, x11\n"
      "st1 {v16.b}[5], [x3], #1\n"
      "st1 {v16.b}[6], [x3], #1\n"
      "st1 {v16.b}[7], [x3], #1\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v16.b}[8], [x3], #1\n"
      "add x4, x4, x11\n"
      "st1 {v16.b}[9], [x3], #1\n"
      "st1 {v16.b}[10], [x3], #1\n"
      "st1 {v16.b}[11], [x3], #1\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v16.b}[12], [x3], #1\n"
      "add x4, x4, x11\n"
      "st1 {v16.b}[13], [x3], #1\n"
      "st1 {v16.b}[14], [x3], #1\n"
      "st1 {v16.b}[15], [x3], #1\n"
      "31:\n"
      "add %[dst_ptr], %[dst_ptr], #4\n"
      RUY_MAKE_ZERO(v16)
      RUY_MAKE_ZERO(v17)
      "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"

      // ---- int16 store path ----
      RUY_STR(RUY_ASM_LABEL_STORE_INT16) ":\n"
      // Add dst_zero_point in int32 (widening add), then saturating narrow
      // int32 -> int16 into v16, v17.
      "dup v14.4h, v13.h[4]\n"
      "saddw v16.4s, v16.4s, v14.4h\n"
      "saddw v17.4s, v17.4s, v14.4h\n"
      "saddw v18.4s, v18.4s, v14.4h\n"
      "saddw v19.4s, v19.4s, v14.4h\n"
      "sqxtn v16.4h, v16.4s\n"
      "sqxtn2 v16.8h, v17.4s\n"
      "sqxtn v17.4h, v18.4s\n"
      "sqxtn2 v17.8h, v19.4s\n"
      RUY_MAKE_ZERO(v18)
      RUY_MAKE_ZERO(v19)
      RUY_MAKE_ZERO(v20)
      RUY_MAKE_ZERO(v21)
      RUY_MAKE_ZERO(v22)
      RUY_MAKE_ZERO(v23)
      RUY_MAKE_ZERO(v24)
      RUY_MAKE_ZERO(v25)
      RUY_MAKE_ZERO(v26)
      RUY_MAKE_ZERO(v27)
      RUY_MAKE_ZERO(v28)
      RUY_MAKE_ZERO(v29)
      RUY_MAKE_ZERO(v30)
      RUY_MAKE_ZERO(v31)
      // Clamp using the low 16 bits of clamp_min / clamp_max.
      "ldrh w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
      "ldrh w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
      "dup v14.8h, w2\n"
      "dup v15.8h, w3\n"
      "smax v16.8h, v16.8h, v14.8h\n"
      "smax v17.8h, v17.8h, v14.8h\n"
      "smin v16.8h, v16.8h, v15.8h\n"
      "smin v17.8h, v17.8h, v15.8h\n"
      // w1/w2 = rows/cols of the block that fit; fast path iff both are 4.
      "sub w1, %w[dst_rows], %w[row]\n"
      "sub w2, %w[dst_cols], %w[col]\n"
      "mov w3, #4\n"
      "cmp w1, #4\n"
      "csel w1, w1, w3, le\n"
      "cmp w2, #4\n"
      "csel w2, w2, w3, le\n"
      "cmp w1, w3\n"
      "ccmp w2, w3, 0, eq\n"
      "mov x4, %[dst_ptr]\n"
      "beq 30f\n"
      // Partial block: spill 32 bytes to dst_tmp_buf, copy 16-bit elements
      // (tmp column pitch 8 bytes).
      "str q16, [%[dst_tmp_buf], #0]\n"
      "str q17, [%[dst_tmp_buf], #16]\n"
      "mov x3, %[dst_tmp_buf]\n"
      "mov w6, #0\n"
      "50:\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov w5, #0\n"
      "51:\n"
      "ldrh w7, [x3, x5, lsl #1]\n"
      "strh w7, [x4, x5, lsl #1]\n"
      "add w5, w5, #1\n"
      "cmp w5, w1\n"
      "blt 51b\n"
      "add w6, w6, #1\n"
      "add x3, x3, #8\n"
      "add x4, x4, x11\n"
      "cmp w6, w2\n"
      "blt 50b\n"
      "b 31f\n"
      "30:\n"
      // Full block: direct stores, 4 halfwords per column.
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v16.h}[0], [x3], #2\n"
      "add x4, x4, x11\n"
      "st1 {v16.h}[1], [x3], #2\n"
      "st1 {v16.h}[2], [x3], #2\n"
      "st1 {v16.h}[3], [x3], #2\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v16.h}[4], [x3], #2\n"
      "add x4, x4, x11\n"
      "st1 {v16.h}[5], [x3], #2\n"
      "st1 {v16.h}[6], [x3], #2\n"
      "st1 {v16.h}[7], [x3], #2\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v17.h}[0], [x3], #2\n"
      "add x4, x4, x11\n"
      "st1 {v17.h}[1], [x3], #2\n"
      "st1 {v17.h}[2], [x3], #2\n"
      "st1 {v17.h}[3], [x3], #2\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v17.h}[4], [x3], #2\n"
      "add x4, x4, x11\n"
      "st1 {v17.h}[5], [x3], #2\n"
      "st1 {v17.h}[6], [x3], #2\n"
      "st1 {v17.h}[7], [x3], #2\n"
      "31:\n"
      // Advance dst_ptr by 4 two-byte elements.
      "add %[dst_ptr], %[dst_ptr], #8\n"
      RUY_MAKE_ZERO(v16)
      RUY_MAKE_ZERO(v17)
      "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"

      // ---- int32 store path: raw accumulators, no multiplier or clamp ----
      RUY_STR(RUY_ASM_LABEL_STORE_INT32) ":\n"
      // v16..v19 still hold the block, so only v20..v31 are cleared here.
      RUY_MAKE_ZERO(v20)
      RUY_MAKE_ZERO(v21)
      RUY_MAKE_ZERO(v22)
      RUY_MAKE_ZERO(v23)
      RUY_MAKE_ZERO(v24)
      RUY_MAKE_ZERO(v25)
      RUY_MAKE_ZERO(v26)
      RUY_MAKE_ZERO(v27)
      RUY_MAKE_ZERO(v28)
      RUY_MAKE_ZERO(v29)
      RUY_MAKE_ZERO(v30)
      RUY_MAKE_ZERO(v31)
      // w1/w2 = rows/cols of the block that fit; fast path iff both are 4.
      "sub w1, %w[dst_rows], %w[row]\n"
      "sub w2, %w[dst_cols], %w[col]\n"
      "mov w3, #4\n"
      "cmp w1, #4\n"
      "csel w1, w1, w3, le\n"
      "cmp w2, #4\n"
      "csel w2, w2, w3, le\n"
      "cmp w1, w3\n"
      "ccmp w2, w3, 0, eq\n"
      "mov x4, %[dst_ptr]\n"
      "beq 30f\n"
      // Partial block: spill 64 bytes to dst_tmp_buf, copy 32-bit elements
      // (tmp column pitch 16 bytes).
      "str q16, [%[dst_tmp_buf], #0]\n"
      "str q17, [%[dst_tmp_buf], #16]\n"
      "str q18, [%[dst_tmp_buf], #32]\n"
      "str q19, [%[dst_tmp_buf], #48]\n"
      "mov x3, %[dst_tmp_buf]\n"
      "mov w6, #0\n"
      "50:\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov w5, #0\n"
      "51:\n"
      "ldr w7, [x3, x5, lsl #2]\n"
      "str w7, [x4, x5, lsl #2]\n"
      "add w5, w5, #1\n"
      "cmp w5, w1\n"
      "blt 51b\n"
      "add w6, w6, #1\n"
      "add x3, x3, #16\n"
      "add x4, x4, x11\n"
      "cmp w6, w2\n"
      "blt 50b\n"
      "b 31f\n"
      "30:\n"
      // Full block: one register per column, 4 words each.
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v16.s}[0], [x3], #4\n"
      "add x4, x4, x11\n"
      "st1 {v16.s}[1], [x3], #4\n"
      "st1 {v16.s}[2], [x3], #4\n"
      "st1 {v16.s}[3], [x3], #4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v17.s}[0], [x3], #4\n"
      "add x4, x4, x11\n"
      "st1 {v17.s}[1], [x3], #4\n"
      "st1 {v17.s}[2], [x3], #4\n"
      "st1 {v17.s}[3], [x3], #4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v18.s}[0], [x3], #4\n"
      "add x4, x4, x11\n"
      "st1 {v18.s}[1], [x3], #4\n"
      "st1 {v18.s}[2], [x3], #4\n"
      "st1 {v18.s}[3], [x3], #4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v19.s}[0], [x3], #4\n"
      "add x4, x4, x11\n"
      "st1 {v19.s}[1], [x3], #4\n"
      "st1 {v19.s}[2], [x3], #4\n"
      "st1 {v19.s}[3], [x3], #4\n"
      "31:\n"
      // Advance dst_ptr by 4 four-byte elements.
      "add %[dst_ptr], %[dst_ptr], #16\n"
      RUY_MAKE_ZERO(v16)
      RUY_MAKE_ZERO(v17)
      RUY_MAKE_ZERO(v18)
      RUY_MAKE_ZERO(v19)

      // ---- common tail: all store paths rejoin here ----
      RUY_STR(RUY_ASM_LABEL_AFTER_STORE) ":\n"
      // Start the first round of products for the next block, using the
      // LHS/RHS data that was loaded before the store.
      "smull v8.8h, v0.8b, v4.8b\n"
      "smull v9.8h, v1.8b, v4.8b\n"
      "smull v10.8h, v2.8b, v4.8b\n"
      "smull v11.8h, v3.8b, v4.8b\n"
      "smull v12.8h, v0.8b, v5.8b\n"
      "smull v13.8h, v1.8b, v5.8b\n"
      "smull v14.8h, v2.8b, v5.8b\n"
      "smull v15.8h, v3.8b, v5.8b\n"
      "smlal2 v8.8h, v0.16b, v4.16b\n"
      "smlal2 v9.8h, v1.16b, v4.16b\n"
      "smlal2 v10.8h, v2.16b, v4.16b\n"
      "smlal2 v11.8h, v3.16b, v4.16b\n"
      "smlal2 v12.8h, v0.16b, v5.16b\n"
      "smlal2 v13.8h, v1.16b, v5.16b\n"
      "smlal2 v14.8h, v2.16b, v5.16b\n"
      "smlal2 v15.8h, v3.16b, v5.16b\n"
      // Reload x5/w6/w7, which were reused as scratch above.
      "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
      "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
      // Move to the next block: next 4 rows, or back to start_row of the
      // next 4 columns when row == last_row.
      "cmp %w[row], w7\n"
      "beq 20f\n"
      "add %w[row], %w[row], #4\n"
      "b 21f\n"
      "20:\n"
      "mov %w[row], w6\n"
      "add %w[col], %w[col], #4\n"
      "add %[dst_col_ptr], %[dst_col_ptr], x11, lsl #2\n"
      "mov %[dst_ptr], %[dst_col_ptr]\n"
      "21:\n"
      // Continue while col <= last_col. w1 is reset to 16 because the first
      // 16 depth levels of the next block are already loaded (mov does not
      // affect the flags set by cmp).
      "cmp %w[col], w8\n"
      "mov w1, #16\n"
      "ble 1b\n"
      : [ lhs_col_ptr ] "+r"(lhs_col_ptr), [rhs_col_ptr] "+r"(rhs_col_ptr),
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [dst_col_ptr] "+r"(dst_col_ptr), [dst_ptr] "+r"(dst_ptr), [row] "+r"(row), [col] "+r"(col)
      // The asm addresses fields as [%[params], #offset], so it needs the
      // address of the params struct.
      : [ params ] "r"(&params), [dst_rows] "r"(params.dst_rows),
        [dst_cols] "r"(params.dst_cols), [dst_tmp_buf] "r"(params.dst_tmp_buf),
        [dst_type_id] "r"(params.dst_type_id)
      : "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc",
        "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
        "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
        "v26", "v27", "v28", "v29", "v30", "v31");
}
// Variant of Kernel8bitNeon that computes a single destination column per
// block (4 rows x 1 column).
//
// Differences from the 4x4 kernel that are visible in the code:
//   - only 16 of every 64 RHS bytes are consumed (one register, v4); the
//     remaining 48 bytes are skipped by bumping rhs_ptr,
//   - only four int32 accumulators (v16..v19) are used, reduced to one
//     register (v16) holding the 4 results,
//   - the channel dimension must not be the column dimension (checked with
//     RUY_DCHECK), so bias / multiplier / exponent are always indexed by row.
//
// Long-lived general-purpose registers set up at the top of the asm:
//   x5 = lhs_base_ptr, w6 = start_row, w7 = last_row, w8 = last_col,
//   w9 = lhs_stride, w10 = rhs_stride, w11 = dst_stride, w12 = depth.
//
// NOTE(review): as in Kernel8bitNeon, the int32-destination test sits before
// label 402 and is therefore skipped when RUY_ASM_FLAG_HAS_LHS_SUMS is
// clear. Confirm callers always set that flag for int32 destinations.
void Kernel8bitNeon1Col(const KernelParams8bit<4, 4>& params) {
  profiler::ScopeLabel label("Kernel (kNeon)");
  // The asm below hard-codes field offsets; verify them at compile time.
  CheckOffsetsInKernelParams8bit(params);
  const std::int8_t* lhs_col_ptr = params.lhs_base_ptr;
  const std::int8_t* rhs_col_ptr =
      static_cast<const std::int8_t*>(params.rhs_base_ptr);
  const std::int8_t* lhs_ptr = lhs_col_ptr;
  const std::int8_t* rhs_ptr = rhs_col_ptr;
  void* dst_col_ptr = params.dst_base_ptr;
  void* dst_ptr = dst_col_ptr;
  int row = params.start_row;
  int col = params.start_col;
  // This kernel has no code path for per-column channels.
  RUY_DCHECK(!(params.flags & RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL));
  asm volatile(
#define RUY_MAKE_ZERO …
      // Load loop-invariant parameters into registers.
      "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
      "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
      "ldr w8, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
      "ldr w9, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
      "ldr w10, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
      "ldr w11, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
      "ldr w12, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"

      // Load the first 64 bytes of LHS (v0..v3) and 16 bytes of RHS (v4),
      // then skip the 48 RHS bytes this kernel does not use.
      "ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
      "ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
      "ld1 {v2.16b}, [%[lhs_ptr]], #16\n"
      "ld1 {v3.16b}, [%[lhs_ptr]], #16\n"
      "ld1 {v4.16b}, [%[rhs_ptr]], #16\n"
      "add %[rhs_ptr], %[rhs_ptr], #48\n"

      // Clear the int32 accumulators.
      RUY_MAKE_ZERO(v16)
      RUY_MAKE_ZERO(v17)
      RUY_MAKE_ZERO(v18)
      RUY_MAKE_ZERO(v19)

      // w1 is the depth counter compared against depth (w12); it starts at 16
      // to account for the data loaded above.
      "mov w1, #16\n"

      // First round of products: low 8 bytes via smull, high 8 bytes
      // accumulated into the same int16 lanes via smlal2 (pending in v8..v11).
      "smull v8.8h, v0.8b, v4.8b\n"
      "smull v9.8h, v1.8b, v4.8b\n"
      "smull v10.8h, v2.8b, v4.8b\n"
      "smull v11.8h, v3.8b, v4.8b\n"
      "smlal2 v8.8h, v0.16b, v4.16b\n"
      "smlal2 v9.8h, v1.16b, v4.16b\n"
      "smlal2 v10.8h, v2.16b, v4.16b\n"
      "smlal2 v11.8h, v3.16b, v4.16b\n"

      // Entry point for each destination block.
      "1:\n"
      // If all depth levels are already loaded (w1 == depth), skip the loop.
      "cmp w1, w12\n"
      "beq 79f\n"

      // Main depth loop: fold pending products into v16..v19, load the next
      // LHS/RHS data, compute the next round of products.
      "2:\n"
      "sadalp v16.4s, v8.8h\n"
      "ld1 {v4.16b}, [%[rhs_ptr]], #16\n"
      "add %[rhs_ptr], %[rhs_ptr], #48\n"
      "sadalp v17.4s, v9.8h\n"
      "sadalp v18.4s, v10.8h\n"
      "sadalp v19.4s, v11.8h\n"
      "ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
      "ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
      "ld1 {v2.16b}, [%[lhs_ptr]], #16\n"
      "ld1 {v3.16b}, [%[lhs_ptr]], #16\n"
      "smull v8.8h, v0.8b, v4.8b\n"
      "smull v9.8h, v1.8b, v4.8b\n"
      "smull v10.8h, v2.8b, v4.8b\n"
      "smull v11.8h, v3.8b, v4.8b\n"
      "smlal2 v8.8h, v0.16b, v4.16b\n"
      "smlal2 v9.8h, v1.16b, v4.16b\n"
      "smlal2 v10.8h, v2.16b, v4.16b\n"
      "smlal2 v11.8h, v3.16b, v4.16b\n"
      // Advance the depth counter and loop while w1 < depth.
      "add w1, w1, #16\n"
      "cmp w1, w12\n"
      "blt 2b\n"

      // End of the depth loop: fold the last pending products.
      "79:\n"
      "sadalp v16.4s, v8.8h\n"
      "sadalp v17.4s, v9.8h\n"
      "sadalp v18.4s, v10.8h\n"
      "sadalp v19.4s, v11.8h\n"
      // Horizontal reduction: v16 ends up with one int32 sum per original
      // accumulator (v16, v17, v18, v19), i.e. the 4 results of this block.
      "addp v16.4s, v16.4s, v17.4s\n"
      "addp v18.4s, v18.4s, v19.4s\n"
      "addp v16.4s, v16.4s, v18.4s\n"

      // Pick the LHS/RHS data for the *next* block.
      "cmp %w[row], w7\n"
      "bge 4f\n"
      // Not yet at last_row: advance the LHS pointer by 4 * lhs_stride.
      "add %[lhs_col_ptr], %[lhs_col_ptr], x9, lsl #2\n"
      "b 5f\n"
      "4:\n"
      // At last_row: rewind LHS to its base, and advance RHS by
      // 4 * rhs_stride unless this was also the last column.
      "mov %[lhs_col_ptr], x5\n"
      "cmp %w[col], w8\n"
      "bge 5f\n"
      "add %[rhs_col_ptr], %[rhs_col_ptr], x10, lsl #2\n"
      "5:\n"
      "mov %[lhs_ptr], %[lhs_col_ptr]\n"
      "mov %[rhs_ptr], %[rhs_col_ptr]\n"

      // Load the parameters needed to finish the current block.
      // v8 = all-ones, i.e. -1 in every int32 lane (used for the exponent).
      "mvni v8.4s, #0\n"
      "ldr w4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
      "ldr w3, [%[params], #" RUY_STR(RUY_OFFSET_PROD_ZP_DEPTH) "]\n"
      // Park dst_zero_point in v13.h[4] until the store paths need it.
      "ins v13.h[4], w4\n"
      "ldr x4, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_FIXEDPOINT) "]\n"
      // w6 now holds the flags byte (start_row is reloaded after the store).
      "ldrb w6, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
      "dup v9.4s, w3\n"
      // multiplier_fixedpoint: offset by row only when HAS_PERCHANNEL is set.
      "add x5, x4, %x[row], lsl #2\n"
      "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
      "csel x4, x4, x5, eq\n"
      "ld1 {v15.4s}, [x4]\n"
      // bias: offset by row only when HAS_BIAS is set.
      "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
      "add x5, x1, %x[row], lsl #2\n"
      "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
      "csel x1, x1, x5, eq\n"
      "ld1 {v14.4s}, [x1]\n"

      // Start loading the next block's LHS/RHS data.
      "ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
      "ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
      "ld1 {v2.16b}, [%[lhs_ptr]], #16\n"
      "ld1 {v3.16b}, [%[lhs_ptr]], #16\n"
      "ld1 {v4.16b}, [%[rhs_ptr]], #16\n"
      "add %[rhs_ptr], %[rhs_ptr], #48\n"

      // Fold prod_zp_depth into the bias values and add them to the results.
      "add v14.4s, v14.4s, v9.4s\n"
      "add v16.4s, v16.4s, v14.4s\n"

      // If HAS_RHS_SUMS: subtract lhs_zero_point * rhs_sums[col].
      "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_RHS_SUMS) "\n"
      "beq 401f\n"
      "ldr x3, [%[params], #" RUY_STR(RUY_OFFSET_RHS_SUMS) "]\n"
      "add x3, x3, %x[col], lsl #2\n"
      "ld1 {v14.4s}, [x3]\n"
      "ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_ZERO_POINT) "]\n"
      "dup v10.4s, w5\n"
      "mls v16.4s, v10.4s, v14.s[0]\n"
      "401:\n"

      // If HAS_LHS_SUMS: subtract rhs_zero_point * lhs_sums[row .. row+3].
      "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_LHS_SUMS) "\n"
      "beq 402f\n"
      "ldr x2, [%[params], #" RUY_STR(RUY_OFFSET_LHS_SUMS) "]\n"
      "add x2, x2, %x[row], lsl #2\n"
      "ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_RHS_ZERO_POINT) "]\n"
      "ld1 {v11.4s}, [x2]\n"
      "ins v13.s[1], w5\n"
      "mul v11.4s, v11.4s, v13.s[1]\n"
      "sub v16.4s, v16.4s, v11.4s\n"
      // int32 destination: store the raw accumulators, skipping the
      // multiplier/clamp stage. (See the NOTE(review) in the function header.)
      "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT32) "\n"
      "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT32) "f\n"
      "402:\n"

      // Apply the multiplier. Exponent is offset by row when HAS_PERCHANNEL.
      "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_EXPONENT) "]\n"
      "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
      "add x5, x1, %x[row], lsl #2\n"
      "csel x1, x1, x5, eq\n"
      "ld1 {v14.4s}, [x1]\n"
      // v11 = min(exponent, -1) is the (negative) right-shift amount and
      // v12 = exponent - v11 (>= 0) is the left-shift amount.
      "smin v11.4s, v8.4s, v14.4s\n"
      "sub v12.4s, v14.4s, v11.4s\n"
      // Shift-left, saturating doubling multiply-high, rounding shift-right.
      "sshl v16.4s, v16.4s, v12.4s\n"
      "sqdmulh v16.4s, v16.4s, v15.4s\n"
      "srshl v16.4s, v16.4s, v11.4s\n"

      // Dispatch on destination type; uint8 is the fall-through case.
      "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT16) "\n"
      "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT16) "f\n"
      "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT8) "\n"
      "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT8) "f\n"

      // ---- uint8 store path ----
      RUY_STR(RUY_ASM_LABEL_STORE_UINT8) ":\n"
      // Saturating narrow int32 -> int16 (results in the low half of v16).
      "sqxtn v16.4h, v16.4s\n"
      RUY_MAKE_ZERO(v18)
      RUY_MAKE_ZERO(v19)
      // Add dst_zero_point (saturating), then saturating narrow to uint8.
      "dup v14.8h, v13.h[4]\n"
      "sqadd v16.8h, v16.8h, v14.8h\n"
      "sqxtun v16.8b, v16.8h\n"
      // Clamp to [clamp_min, clamp_max] (unsigned compare).
      "ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
      "ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
      "dup v14.16b, w2\n"
      "dup v15.16b, w3\n"
      "umax v16.16b, v16.16b, v14.16b\n"
      "umin v16.16b, v16.16b, v15.16b\n"
      // w1 = min(dst_rows - row, 4); fast path iff all 4 rows fit.
      "sub w1, %w[dst_rows], %w[row]\n"
      "mov w3, #4\n"
      "cmp w1, #4\n"
      "csel w1, w1, w3, le\n"
      "cmp w1, w3\n"
      "mov x4, %[dst_ptr]\n"
      "beq 30f\n"
      // Partial block: spill to dst_tmp_buf and copy w1 bytes.
      "st1 {v16.16b}, [%[dst_tmp_buf]]\n"
      "mov x3, %[dst_tmp_buf]\n"
      "mov w6, #0\n"
      "50:\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov w5, #0\n"
      "51:\n"
      "ldrb w7, [x3, w5, uxtw]\n"
      "strb w7, [x4, w5, uxtw]\n"
      "add w5, w5, #1\n"
      "cmp w5, w1\n"
      "blt 51b\n"
      "b 31f\n"
      "30:\n"
      // Full block: store the 4 bytes directly.
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v16.b}[0], [x3], #1\n"
      "st1 {v16.b}[1], [x3], #1\n"
      "st1 {v16.b}[2], [x3], #1\n"
      "st1 {v16.b}[3], [x3], #1\n"
      "31:\n"
      // Advance dst_ptr by 4 one-byte elements.
      "add %[dst_ptr], %[dst_ptr], #4\n"
      RUY_MAKE_ZERO(v16)
      RUY_MAKE_ZERO(v17)
      "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"

      // ---- int8 store path ----
      RUY_STR(RUY_ASM_LABEL_STORE_INT8) ":\n"
      "sqxtn v16.4h, v16.4s\n"
      RUY_MAKE_ZERO(v18)
      RUY_MAKE_ZERO(v19)
      // Add dst_zero_point, then saturating narrow int16 -> int8.
      "dup v14.8h, v13.h[4]\n"
      "sqadd v16.8h, v16.8h, v14.8h\n"
      "sqxtn v16.8b, v16.8h\n"
      // Clamp to [clamp_min, clamp_max] (signed compare).
      "ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
      "ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
      "dup v14.16b, w2\n"
      "dup v15.16b, w3\n"
      "smax v16.16b, v16.16b, v14.16b\n"
      "smin v16.16b, v16.16b, v15.16b\n"
      // w1 = min(dst_rows - row, 4); w2 = dst_cols - col (not clamped here).
      "sub w1, %w[dst_rows], %w[row]\n"
      "sub w2, %w[dst_cols], %w[col]\n"
      "mov w3, #4\n"
      "cmp w1, #4\n"
      "csel w1, w1, w3, le\n"
      // NOTE(review): the flags from this compare are overwritten by the next
      // instruction without being read.
      "cmp w2, #4\n"
      // NOTE(review): because w2 is unclamped, the fast path is taken only
      // when w1 == 4 and dst_cols - col == 4 exactly; every other case uses
      // the tmp-buffer copy. Confirm whether that is intended for a one-column
      // kernel.
      "cmp w1, w3\n"
      "ccmp w2, w3, 0, eq\n"
      "mov x4, %[dst_ptr]\n"
      "beq 30f\n"
      // Spill to dst_tmp_buf and copy w1 bytes.
      "st1 {v16.16b}, [%[dst_tmp_buf]]\n"
      "mov x3, %[dst_tmp_buf]\n"
      "mov w6, #0\n"
      "50:\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov w5, #0\n"
      "51:\n"
      "ldrb w7, [x3, w5, uxtw]\n"
      "strb w7, [x4, w5, uxtw]\n"
      "add w5, w5, #1\n"
      "cmp w5, w1\n"
      "blt 51b\n"
      "b 31f\n"
      "30:\n"
      // Store the 4 bytes directly.
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v16.b}[0], [x3], #1\n"
      "st1 {v16.b}[1], [x3], #1\n"
      "st1 {v16.b}[2], [x3], #1\n"
      "st1 {v16.b}[3], [x3], #1\n"
      "31:\n"
      "add %[dst_ptr], %[dst_ptr], #4\n"
      RUY_MAKE_ZERO(v16)
      RUY_MAKE_ZERO(v17)
      "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"

      // ---- int16 store path ----
      RUY_STR(RUY_ASM_LABEL_STORE_INT16) ":\n"
      // Add dst_zero_point in int32 (widening add), then saturating narrow
      // int32 -> int16.
      "dup v14.4h, v13.h[4]\n"
      "saddw v16.4s, v16.4s, v14.4h\n"
      "sqxtn v16.4h, v16.4s\n"
      RUY_MAKE_ZERO(v18)
      RUY_MAKE_ZERO(v19)
      // Clamp using the low 16 bits of clamp_min / clamp_max.
      "ldrh w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
      "ldrh w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
      "dup v14.8h, w2\n"
      "dup v15.8h, w3\n"
      "smax v16.8h, v16.8h, v14.8h\n"
      "smin v16.8h, v16.8h, v15.8h\n"
      // w1 = min(dst_rows - row, 4); fast path iff all 4 rows fit.
      "sub w1, %w[dst_rows], %w[row]\n"
      "sub w2, %w[dst_cols], %w[col]\n"
      "mov w3, #4\n"
      "cmp w1, #4\n"
      "csel w1, w1, w3, le\n"
      // NOTE(review): the flags from this compare are overwritten by the next
      // instruction without being read; w2 is otherwise unused on this path.
      "cmp w2, #4\n"
      "cmp w1, w3\n"
      "mov x4, %[dst_ptr]\n"
      "beq 30f\n"
      // Partial block: spill to dst_tmp_buf and copy w1 halfwords.
      "str q16, [%[dst_tmp_buf], #0]\n"
      "mov x3, %[dst_tmp_buf]\n"
      "mov w6, #0\n"
      "50:\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov w5, #0\n"
      "51:\n"
      "ldrh w7, [x3, x5, lsl #1]\n"
      "strh w7, [x4, x5, lsl #1]\n"
      "add w5, w5, #1\n"
      "cmp w5, w1\n"
      "blt 51b\n"
      // NOTE(review): this branch can never be taken: it tests the same
      // flags that just made "blt 51b" fall through.
      "blt 50b\n"
      "b 31f\n"
      "30:\n"
      // Full block: store the 4 halfwords directly.
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v16.h}[0], [x3], #2\n"
      "st1 {v16.h}[1], [x3], #2\n"
      "st1 {v16.h}[2], [x3], #2\n"
      "st1 {v16.h}[3], [x3], #2\n"
      "31:\n"
      // Advance dst_ptr by 4 two-byte elements.
      "add %[dst_ptr], %[dst_ptr], #8\n"
      RUY_MAKE_ZERO(v16)
      RUY_MAKE_ZERO(v17)
      "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"

      // ---- int32 store path: raw accumulators, no multiplier or clamp ----
      RUY_STR(RUY_ASM_LABEL_STORE_INT32) ":\n"
      // w1 = min(dst_rows - row, 4); w2 = dst_cols - col (not clamped here).
      "sub w1, %w[dst_rows], %w[row]\n"
      "sub w2, %w[dst_cols], %w[col]\n"
      "mov w3, #4\n"
      "cmp w1, #4\n"
      "csel w1, w1, w3, le\n"
      // NOTE(review): dead compare, and the ccmp below uses the unclamped w2,
      // exactly as on the int8 path above.
      "cmp w2, #4\n"
      "cmp w1, w3\n"
      "ccmp w2, w3, 0, eq\n"
      "mov x4, %[dst_ptr]\n"
      "beq 30f\n"
      // Spill to dst_tmp_buf and copy w1 words.
      "str q16, [%[dst_tmp_buf], #0]\n"
      "mov x3, %[dst_tmp_buf]\n"
      "mov w6, #0\n"
      "50:\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov w5, #0\n"
      "51:\n"
      "ldr w7, [x3, x5, lsl #2]\n"
      "str w7, [x4, x5, lsl #2]\n"
      "add w5, w5, #1\n"
      "cmp w5, w1\n"
      "blt 51b\n"
      "b 31f\n"
      "30:\n"
      // Store the 4 words directly.
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v16.s}[0], [x3], #4\n"
      "st1 {v16.s}[1], [x3], #4\n"
      "st1 {v16.s}[2], [x3], #4\n"
      "st1 {v16.s}[3], [x3], #4\n"
      "31:\n"
      // Advance dst_ptr by 4 four-byte elements.
      "add %[dst_ptr], %[dst_ptr], #16\n"
      RUY_MAKE_ZERO(v16)
      RUY_MAKE_ZERO(v17)
      RUY_MAKE_ZERO(v18)
      RUY_MAKE_ZERO(v19)

      // ---- common tail: all store paths rejoin here ----
      RUY_STR(RUY_ASM_LABEL_AFTER_STORE) ":\n"
      // Start the first round of products for the next block, using the
      // LHS/RHS data that was loaded before the store.
      "smull v8.8h, v0.8b, v4.8b\n"
      "smull v9.8h, v1.8b, v4.8b\n"
      "smull v10.8h, v2.8b, v4.8b\n"
      "smull v11.8h, v3.8b, v4.8b\n"
      "smlal2 v8.8h, v0.16b, v4.16b\n"
      "smlal2 v9.8h, v1.16b, v4.16b\n"
      "smlal2 v10.8h, v2.16b, v4.16b\n"
      "smlal2 v11.8h, v3.16b, v4.16b\n"
      // Reload x5/w6/w7, which were reused as scratch above.
      "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
      "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
      // Move to the next block: next 4 rows, or back to start_row with
      // col += 4 and dst_col_ptr += 4 * dst_stride when row == last_row.
      "cmp %w[row], w7\n"
      "beq 20f\n"
      "add %w[row], %w[row], #4\n"
      "b 21f\n"
      "20:\n"
      "mov %w[row], w6\n"
      "add %w[col], %w[col], #4\n"
      "add %[dst_col_ptr], %[dst_col_ptr], x11, lsl #2\n"
      "mov %[dst_ptr], %[dst_col_ptr]\n"
      "21:\n"
      // Continue while col <= last_col. w1 is reset to 16 because the first
      // 16 depth levels of the next block are already loaded (mov does not
      // affect the flags set by cmp).
      "cmp %w[col], w8\n"
      "mov w1, #16\n"
      "ble 1b\n"
      : [ lhs_col_ptr ] "+r"(lhs_col_ptr), [rhs_col_ptr] "+r"(rhs_col_ptr),
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [dst_col_ptr] "+r"(dst_col_ptr), [dst_ptr] "+r"(dst_ptr), [row] "+r"(row), [col] "+r"(col)
      // The asm addresses fields as [%[params], #offset], so it needs the
      // address of the params struct.
      : [ params ] "r"(&params), [dst_rows] "r"(params.dst_rows),
        [dst_cols] "r"(params.dst_cols), [dst_tmp_buf] "r"(params.dst_tmp_buf),
        [dst_type_id] "r"(params.dst_type_id)
      : "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc",
        "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
        "v13", "v14", "v15", "v16", "v17", "v18", "v19");
}
// Quantized 8-bit GEMM kernel working on 4x4 destination blocks, hand-scheduled
// (per its profiler label) for in-order cores.
//
// Overall structure of the asm below:
//   * An outer loop (label 1) walks the destination in 4x4 blocks, starting at
//     (params.start_row, params.start_col); rows advance first, then columns,
//     until `col` passes the last_col value loaded from params.
//   * An inner loop (label 2) consumes 16 levels of depth per iteration:
//     8-bit products are summed pairwise in int16 (smull + smlal2) and then
//     folded into the int32 accumulators v16 -- v31 (sadalp).
//   * After the accumulation (label 79) the accumulators are reduced to one
//     4x4 block in v16 -- v19, bias and zero-point corrections are applied,
//     and, unless the destination is int32, the values are rescaled by the
//     fixed-point multiplier/exponent, offset by dst_zero_point, clamped and
//     narrowed before being stored.
//   * Blocks that do not fully fit in the destination are first written to
//     params.dst_tmp_buf and then copied element by element.
//
// Register roles (several are reused as scratch and reloaded when needed):
//   x5 = lhs_base_ptr, w6 = start_row, w7 = last_row, w8 = last_col,
//   w9 = lhs_stride, w10 = rhs_stride, w11 = dst_stride, w12 = depth,
//   w1 = levels of depth already loaded during accumulation,
//   v0 -- v3 = LHS data, v4 -- v7 = RHS data, v8 -- v15 = int16 partial sums.
//
// RUY_MAKE_ZERO's definition is not visible here; presumably it zeroes the
// named vector register -- TODO confirm against the macro body.
void Kernel8bitNeonA55ish(const KernelParams8bit<4, 4>& params) {
  profiler::ScopeLabel label("Kernel (kNeon, optimized for in-order cores)");
  // Compile-time check that the RUY_OFFSET_* constants used below match the
  // layout of the params struct.
  CheckOffsetsInKernelParams8bit(params);
  const std::int8_t* lhs_col_ptr = params.lhs_base_ptr;
  const std::int8_t* rhs_col_ptr =
      static_cast<const int8_t*>(params.rhs_base_ptr);
  const std::int8_t* lhs_ptr = lhs_col_ptr;
  const std::int8_t* rhs_ptr = rhs_col_ptr;
  void* dst_col_ptr = params.dst_base_ptr;
  void* dst_ptr = dst_col_ptr;
  int row = params.start_row;
  int col = params.start_col;
  asm volatile(
#define RUY_MAKE_ZERO …
      // Load some parameters into registers, interleaved with the
      // initialization of the accumulators v16 -- v23.
      "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      RUY_MAKE_ZERO(v16)
      "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
      RUY_MAKE_ZERO(v17)
      "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
      RUY_MAKE_ZERO(v18)
      "ldr w8, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
      RUY_MAKE_ZERO(v19)
      "ldr w9, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
      RUY_MAKE_ZERO(v20)
      "ldr w10, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
      RUY_MAKE_ZERO(v21)
      "ldr w11, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
      RUY_MAKE_ZERO(v22)
      "ldr w12, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"
      RUY_MAKE_ZERO(v23)
      // Load the first 64 bytes of LHS (v0 -- v3) and of RHS (v4 -- v7),
      // post-incrementing the data pointers, while v24 -- v31 are initialized.
      "ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
      RUY_MAKE_ZERO(v24)
      "ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
      RUY_MAKE_ZERO(v25)
      "ld1 {v2.16b}, [%[lhs_ptr]], #16\n"
      RUY_MAKE_ZERO(v26)
      "ld1 {v3.16b}, [%[lhs_ptr]], #16\n"
      RUY_MAKE_ZERO(v27)
      "ld1 {v4.16b}, [%[rhs_ptr]], #16\n"
      RUY_MAKE_ZERO(v28)
      "ld1 {v5.16b}, [%[rhs_ptr]], #16\n"
      RUY_MAKE_ZERO(v29)
      "ld1 {v6.16b}, [%[rhs_ptr]], #16\n"
      RUY_MAKE_ZERO(v30)
      "ld1 {v7.16b}, [%[rhs_ptr]], #16\n"
      RUY_MAKE_ZERO(v31)
      // w1 counts the levels of depth for which data has already been loaded;
      // the ld1 instructions above loaded 16 of them.
      "mov w1, #16\n"
      // First multiply-adds on the data already loaded: LHS v0 -- v3 times
      // RHS v4, v5, low halves (smull) then high halves (smlal2), summed in
      // the int16 registers v8 -- v15.
      "smull v8.8h, v0.8b, v4.8b\n"
      "smull v9.8h, v1.8b, v4.8b\n"
      "smull v10.8h, v2.8b, v4.8b\n"
      "smull v11.8h, v3.8b, v4.8b\n"
      "smull v12.8h, v0.8b, v5.8b\n"
      "smull v13.8h, v1.8b, v5.8b\n"
      "smull v14.8h, v2.8b, v5.8b\n"
      "smull v15.8h, v3.8b, v5.8b\n"
      "smlal2 v8.8h, v0.16b, v4.16b\n"
      "smlal2 v9.8h, v1.16b, v4.16b\n"
      "smlal2 v10.8h, v2.16b, v4.16b\n"
      "smlal2 v11.8h, v3.16b, v4.16b\n"
      "smlal2 v12.8h, v0.16b, v5.16b\n"
      "smlal2 v13.8h, v1.16b, v5.16b\n"
      "smlal2 v14.8h, v2.16b, v5.16b\n"
      "smlal2 v15.8h, v3.16b, v5.16b\n"
      // Main loop of the whole GEMM, over the 4x4 blocks of the destination.
      "1:\n"
      // If all of the depth (w12) has already been loaded (w1), skip the
      // accumulation loop and go straight to its tail.
      "cmp w1, w12\n"
      "beq 79f\n"
      // Accumulation loop: each iteration handles 16 levels of depth.
      // The next LHS/RHS data is fetched with 64-bit loads: low halves go
      // directly into d registers, high halves go through x3 -- x10 and are
      // inserted with `ins`. This clobbers x5 -- x10, which is why the
      // corresponding parameters are reloaded after the loop.
      "2:\n"
      "sadalp v16.4s, v8.8h\n"
      "ldr d4, [%[rhs_ptr], #0]\n"
      "smull v8.8h, v0.8b, v6.8b\n"
      "ldr x7, [%[rhs_ptr], #8]\n"
      "sadalp v17.4s, v9.8h\n"
      "ldr d5, [%[rhs_ptr], #16]\n"
      "smull v9.8h, v1.8b, v6.8b\n"
      "ldr x8, [%[rhs_ptr], #24]\n"
      "sadalp v18.4s, v10.8h\n"
      "smull v10.8h, v2.8b, v6.8b\n"
      "sadalp v19.4s, v11.8h\n"
      "add %[lhs_ptr], %[lhs_ptr], #64\n"
      "smull v11.8h, v3.8b, v6.8b\n"
      "add %[rhs_ptr], %[rhs_ptr], #64\n"
      "sadalp v20.4s, v12.8h\n"
      // Advance the depth counter by the 16 levels handled per iteration.
      "add w1, w1, #16\n"
      "smull v12.8h, v0.8b, v7.8b\n"
      // Loop termination test; the flags are consumed by the `blt 2b` at the
      // bottom of the loop (no instruction in between writes the flags).
      "cmp w1, w12\n"
      "sadalp v21.4s, v13.8h\n"
      "ldr x3, [%[lhs_ptr], #-56]\n"
      "smull v13.8h, v1.8b, v7.8b\n"
      "ldr x4, [%[lhs_ptr], #-40]\n"
      "sadalp v22.4s, v14.8h\n"
      "ldr x5, [%[lhs_ptr], #-24]\n"
      "smull v14.8h, v2.8b, v7.8b\n"
      "ldr x6, [%[lhs_ptr], #-8]\n"
      "sadalp v23.4s, v15.8h\n"
      "smull v15.8h, v3.8b, v7.8b\n"
      // Second halves of the products against RHS v6, v7, accumulated into
      // the same int16 registers.
      "smlal2 v8.8h, v0.16b, v6.16b\n"
      "smlal2 v9.8h, v1.16b, v6.16b\n"
      "smlal2 v10.8h, v2.16b, v6.16b\n"
      "ldr x9, [%[rhs_ptr], #-24]\n"
      "smlal2 v11.8h, v3.16b, v6.16b\n"
      "ldr d6, [%[rhs_ptr], #-32]\n"
      "smlal2 v12.8h, v0.16b, v7.16b\n"
      "ldr d0, [%[lhs_ptr], #-64]\n"
      "smlal2 v13.8h, v1.16b, v7.16b\n"
      "ldr d1, [%[lhs_ptr], #-48]\n"
      "smlal2 v14.8h, v2.16b, v7.16b\n"
      "ins v4.d[1], x7\n"
      "smlal2 v15.8h, v3.16b, v7.16b\n"
      "ins v5.d[1], x8\n"
      "ldr d2, [%[lhs_ptr], #-32]\n"
      "ins v0.d[1], x3\n"
      "sadalp v24.4s, v8.8h\n"
      "ldr d3, [%[lhs_ptr], #-16]\n"
      "ins v1.d[1], x4\n"
      // Start the products of the freshly loaded LHS against RHS v4, v5.
      "smull v8.8h, v0.8b, v4.8b\n"
      "ins v2.d[1], x5\n"
      "sadalp v25.4s, v9.8h\n"
      "ins v3.d[1], x6\n"
      "smull v9.8h, v1.8b, v4.8b\n"
      "ldr d7, [%[rhs_ptr], #-16]\n"
      "sadalp v26.4s, v10.8h\n"
      "ldr x10, [%[rhs_ptr], #-8]\n"
      "smull v10.8h, v2.8b, v4.8b\n"
      "sadalp v27.4s, v11.8h\n"
      "smull v11.8h, v3.8b, v4.8b\n"
      "sadalp v28.4s, v12.8h\n"
      "smull v12.8h, v0.8b, v5.8b\n"
      "sadalp v29.4s, v13.8h\n"
      "smull v13.8h, v1.8b, v5.8b\n"
      "sadalp v30.4s, v14.8h\n"
      "smull v14.8h, v2.8b, v5.8b\n"
      "sadalp v31.4s, v15.8h\n"
      "smull v15.8h, v3.8b, v5.8b\n"
      "smlal2 v8.8h, v0.16b, v4.16b\n"
      "smlal2 v9.8h, v1.16b, v4.16b\n"
      "smlal2 v10.8h, v2.16b, v4.16b\n"
      "smlal2 v11.8h, v3.16b, v4.16b\n"
      "smlal2 v12.8h, v0.16b, v5.16b\n"
      "smlal2 v13.8h, v1.16b, v5.16b\n"
      "ins v6.d[1], x9\n"
      "smlal2 v14.8h, v2.16b, v5.16b\n"
      "ins v7.d[1], x10\n"
      "smlal2 v15.8h, v3.16b, v5.16b\n"
      "blt 2b\n"
      // Tail of the accumulation: finish the products for the data that is
      // already in registers, without loading any further LHS/RHS data.
      "79:\n"
      "sadalp v16.4s, v8.8h\n"
      "smull v8.8h, v0.8b, v6.8b\n"
      "sadalp v17.4s, v9.8h\n"
      "smull v9.8h, v1.8b, v6.8b\n"
      "sadalp v18.4s, v10.8h\n"
      "smull v10.8h, v2.8b, v6.8b\n"
      "sadalp v19.4s, v11.8h\n"
      "smull v11.8h, v3.8b, v6.8b\n"
      "sadalp v20.4s, v12.8h\n"
      "smull v12.8h, v0.8b, v7.8b\n"
      "sadalp v21.4s, v13.8h\n"
      "smull v13.8h, v1.8b, v7.8b\n"
      "sadalp v22.4s, v14.8h\n"
      "smull v14.8h, v2.8b, v7.8b\n"
      "sadalp v23.4s, v15.8h\n"
      "smull v15.8h, v3.8b, v7.8b\n"
      "smlal2 v8.8h, v0.16b, v6.16b\n"
      "smlal2 v9.8h, v1.16b, v6.16b\n"
      "smlal2 v10.8h, v2.16b, v6.16b\n"
      "smlal2 v11.8h, v3.16b, v6.16b\n"
      "smlal2 v12.8h, v0.16b, v7.16b\n"
      "smlal2 v13.8h, v1.16b, v7.16b\n"
      "smlal2 v14.8h, v2.16b, v7.16b\n"
      "smlal2 v15.8h, v3.16b, v7.16b\n"
      // Final int16 -> int32 accumulation, interleaved with reloading the
      // parameters whose registers (x5 -- x10) were used as scratch above.
      "sadalp v24.4s, v8.8h\n"
      "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "sadalp v25.4s, v9.8h\n"
      "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
      "sadalp v26.4s, v10.8h\n"
      "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
      "sadalp v27.4s, v11.8h\n"
      "ldr w8, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
      "sadalp v28.4s, v12.8h\n"
      "ldr w9, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
      "sadalp v29.4s, v13.8h\n"
      "ldr w10, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
      "sadalp v30.4s, v14.8h\n"
      "sadalp v31.4s, v15.8h\n"
      // Reduce the int32 accumulators horizontally, first pass.
      "addp v16.4s, v16.4s, v17.4s\n"
      "addp v18.4s, v18.4s, v19.4s\n"
      "addp v20.4s, v20.4s, v21.4s\n"
      "addp v22.4s, v22.4s, v23.4s\n"
      "addp v24.4s, v24.4s, v25.4s\n"
      "addp v26.4s, v26.4s, v27.4s\n"
      "addp v28.4s, v28.4s, v29.4s\n"
      "addp v30.4s, v30.4s, v31.4s\n"
      // Second pass: the 4x4 block of int32 sums now lives in v16 -- v19.
      "addp v16.4s, v16.4s, v18.4s\n"
      "addp v17.4s, v20.4s, v22.4s\n"
      "addp v18.4s, v24.4s, v26.4s\n"
      "addp v19.4s, v28.4s, v30.4s\n"
      // Pick the LHS/RHS column pointers for the next block, so that its data
      // can start loading while the current block is being finished.
      // Was this the last row?
      "cmp %w[row], w7\n"
      "bge 4f\n"
      // Not the last row: advance the LHS by 4 * lhs_stride.
      "add %[lhs_col_ptr], %[lhs_col_ptr], x9, lsl #2\n"
      "b 5f\n"
      "4:\n"
      // Last row: rewind the LHS to its base pointer ...
      "mov %[lhs_col_ptr], x5\n"
      // ... and, unless this was also the last column, advance the RHS by
      // 4 * rhs_stride.
      "cmp %w[col], w8\n"
      "bge 5f\n"
      "add %[rhs_col_ptr], %[rhs_col_ptr], x10, lsl #2\n"
      "5:\n"
      // Point the data pointers at the start of the selected columns.
      "mov %[lhs_ptr], %[lhs_col_ptr]\n"
      "mov %[rhs_ptr], %[rhs_col_ptr]\n"
      // Load parameters needed for the end work on the current block.
      // v8 = all-ones in each 32-bit lane, i.e. -1.
      "mvni v8.4s, #0\n"
      "ldr w4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
      "ldr w3, [%[params], #" RUY_STR(RUY_OFFSET_PROD_ZP_DEPTH) "]\n"
      // Keep dst_zero_point in lane h[4] of v13 until the store code uses it.
      "ins v13.h[4], w4\n"
      "ldr x4, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_FIXEDPOINT) "]\n"
      // w6 holds the flags byte from here on.
      "ldrb w6, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
      // v9 = prod_zp_depth broadcast to all lanes.
      "dup v9.4s, w3\n"
      "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
      // Channel index w3: col if CHANNEL_DIMENSION_IS_COL is set, else row.
      "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
      "csel w3, %w[row], %w[col], eq\n"
      // Offset the bias pointer by the channel index only if HAS_BIAS is set.
      "add x5, x1, x3, lsl #2\n"
      "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
      "csel x1, x1, x5, eq\n"
      // Load 4 bias values.
      "ld1 {v14.4s}, [x1]\n"
      // Offset the multiplier_fixedpoint pointer only if HAS_PERCHANNEL is
      // set, then load 4 multiplier_fixedpoint values into v15.
      "add x5, x4, x3, lsl #2\n"
      "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
      "csel x4, x4, x5, eq\n"
      "ld1 {v15.4s}, [x4]\n"
      // Fold prod_zp_depth into the bias values.
      "add v14.4s, v14.4s, v9.4s\n"
      // v0 -- v7 are no longer needed for the current block: from here on the
      // low 64 bits of the next block's LHS/RHS data are loaded into them,
      // interleaved with the end work.
      "ldr d0, [%[lhs_ptr], #0]\n"
      // Bias addition, depending on the channel dimension.
      "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
      "bne 6f\n"
      // Channels are rows: the same 4 bias values are added to each of
      // v16 -- v19.
      "add v16.4s, v16.4s, v14.4s\n"
      "ldr d1, [%[lhs_ptr], #16]\n"
      "add v17.4s, v17.4s, v14.4s\n"
      "ldr d2, [%[lhs_ptr], #32]\n"
      "add v18.4s, v18.4s, v14.4s\n"
      "ldr d3, [%[lhs_ptr], #48]\n"
      "add v19.4s, v19.4s, v14.4s\n"
      "ldr d4, [%[rhs_ptr], #0]\n"
      "ldr d5, [%[rhs_ptr], #16]\n"
      "ldr d6, [%[rhs_ptr], #32]\n"
      "ldr d7, [%[rhs_ptr], #48]\n"
      "b 7f\n"
      "6:\n"
      // Channels are columns: each of v16 -- v19 gets its own bias value,
      // broadcast to all lanes.
      "dup v20.4s, v14.s[0]\n"
      "ldr d1, [%[lhs_ptr], #16]\n"
      "dup v21.4s, v14.s[1]\n"
      "ldr d2, [%[lhs_ptr], #32]\n"
      "dup v22.4s, v14.s[2]\n"
      "ldr d3, [%[lhs_ptr], #48]\n"
      "dup v23.4s, v14.s[3]\n"
      "ldr d4, [%[rhs_ptr], #0]\n"
      "add v16.4s, v16.4s, v20.4s\n"
      "ldr d5, [%[rhs_ptr], #16]\n"
      "add v17.4s, v17.4s, v21.4s\n"
      "ldr d6, [%[rhs_ptr], #32]\n"
      "add v18.4s, v18.4s, v22.4s\n"
      "ldr d7, [%[rhs_ptr], #48]\n"
      "add v19.4s, v19.4s, v23.4s\n"
      "7:\n"
      // If HAS_RHS_SUMS is set, subtract lhs_zero_point * rhs_sums[col + i]
      // from the i-th register of v16 -- v19.
      "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_RHS_SUMS) "\n"
      "beq 401f\n"
      "ldr x3, [%[params], #" RUY_STR(RUY_OFFSET_RHS_SUMS) "]\n"
      "add x3, x3, %x[col], lsl #2\n"
      "ld1 {v14.4s}, [x3]\n"
      "ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_ZERO_POINT) "]\n"
      "dup v10.4s, w5\n"
      "mls v16.4s, v10.4s, v14.s[0]\n"
      "mls v17.4s, v10.4s, v14.s[1]\n"
      "mls v18.4s, v10.4s, v14.s[2]\n"
      "mls v19.4s, v10.4s, v14.s[3]\n"
      "401:\n"
      // If HAS_LHS_SUMS is set, subtract rhs_zero_point * lhs_sums[row ...]
      // from every register of v16 -- v19.
      "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_LHS_SUMS) "\n"
      "beq 402f\n"
      "ldr x2, [%[params], #" RUY_STR(RUY_OFFSET_LHS_SUMS) "]\n"
      "add x2, x2, %x[row], lsl #2\n"
      "ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_RHS_ZERO_POINT) "]\n"
      "ld1 {v11.4s}, [x2]\n"
      "ins v13.s[1], w5\n"
      "mul v11.4s, v11.4s, v13.s[1]\n"
      "sub v16.4s, v16.4s, v11.4s\n"
      "sub v17.4s, v17.4s, v11.4s\n"
      "sub v18.4s, v18.4s, v11.4s\n"
      "sub v19.4s, v19.4s, v11.4s\n"
      // An int32 destination receives the raw accumulators: skip the
      // rescaling and go directly to the int32 store.
      // NOTE(review): the `beq 402f` above lands after this check, so the
      // int32 path is only reached when HAS_LHS_SUMS is set -- confirm that
      // callers always set that flag for int32 destinations.
      "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT32) "\n"
      "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT32) "f\n"
      "402:\n"
      // Rescale the int32 values by the multiplier (fixed-point part in v15,
      // exponent loaded below).
      // Channel index again: col if CHANNEL_DIMENSION_IS_COL is set, else row.
      "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
      "csel w3, %w[row], %w[col], eq\n"
      // Offset the multiplier_exponent pointer only if HAS_PERCHANNEL is set,
      // then load 4 exponents into v14.
      "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_EXPONENT) "]\n"
      "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
      "add x5, x1, x3, lsl #2\n"
      "csel x1, x1, x5, eq\n"
      "ld1 {v14.4s}, [x1]\n"
      // Split the exponent: v11 = min(exponent, -1) is applied after the
      // multiplication (srshl), v12 = exponent - v11 is applied before it
      // (sshl).
      "smin v11.4s, v8.4s, v14.4s\n"
      // x1 -- x4 now carry the high 64 bits of the next block's LHS/RHS data.
      "ldr x1, [%[lhs_ptr], #8]\n"
      "sub v12.4s, v14.4s, v11.4s\n"
      "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
      "bne 8f\n"
      // Channels are rows: the multiplier vectors apply lane-wise to each of
      // v16 -- v19.
      "sshl v16.4s, v16.4s, v12.4s\n"
      "ldr x2, [%[lhs_ptr], #24]\n"
      "sshl v17.4s, v17.4s, v12.4s\n"
      "ldr x3, [%[lhs_ptr], #40]\n"
      "sshl v18.4s, v18.4s, v12.4s\n"
      "ldr x4, [%[lhs_ptr], #56]\n"
      "sshl v19.4s, v19.4s, v12.4s\n"
      "ins v0.d[1], x1\n"
      "ldr x1, [%[rhs_ptr], #8]\n"
      "sqdmulh v16.4s, v16.4s, v15.4s\n"
      "ins v1.d[1], x2\n"
      "ldr x2, [%[rhs_ptr], #24]\n"
      "sqdmulh v17.4s, v17.4s, v15.4s\n"
      "ins v2.d[1], x3\n"
      "ldr x3, [%[rhs_ptr], #40]\n"
      "sqdmulh v18.4s, v18.4s, v15.4s\n"
      "ins v3.d[1], x4\n"
      "ldr x4, [%[rhs_ptr], #56]\n"
      "sqdmulh v19.4s, v19.4s, v15.4s\n"
      "srshl v16.4s, v16.4s, v11.4s\n"
      "srshl v17.4s, v17.4s, v11.4s\n"
      "srshl v18.4s, v18.4s, v11.4s\n"
      "srshl v19.4s, v19.4s, v11.4s\n"
      "b 9f\n"
      "8:\n"
      // Channels are columns: each of v16 -- v19 uses its own multiplier
      // lane, broadcast through v20 -- v23.
      "dup v20.4s, v12.s[0]\n"
      "ldr x2, [%[lhs_ptr], #24]\n"
      "ldr x3, [%[lhs_ptr], #40]\n"
      "dup v21.4s, v12.s[1]\n"
      "ldr x4, [%[lhs_ptr], #56]\n"
      "dup v22.4s, v12.s[2]\n"
      "ins v0.d[1], x1\n"
      "dup v23.4s, v12.s[3]\n"
      "ldr x1, [%[rhs_ptr], #8]\n"
      "sshl v16.4s, v16.4s, v20.4s\n"
      "ins v1.d[1], x2\n"
      "sshl v17.4s, v17.4s, v21.4s\n"
      "ldr x2, [%[rhs_ptr], #24]\n"
      "sshl v18.4s, v18.4s, v22.4s\n"
      "ins v2.d[1], x3\n"
      "sshl v19.4s, v19.4s, v23.4s\n"
      "ldr x3, [%[rhs_ptr], #40]\n"
      "sqdmulh v16.4s, v16.4s, v15.s[0]\n"
      "ins v3.d[1], x4\n"
      "sqdmulh v17.4s, v17.4s, v15.s[1]\n"
      "ldr x4, [%[rhs_ptr], #56]\n"
      "sqdmulh v18.4s, v18.4s, v15.s[2]\n"
      "dup v20.4s, v11.s[0]\n"
      "sqdmulh v19.4s, v19.4s, v15.s[3]\n"
      "dup v21.4s, v11.s[1]\n"
      "srshl v16.4s, v16.4s, v20.4s\n"
      "dup v22.4s, v11.s[2]\n"
      "srshl v17.4s, v17.4s, v21.4s\n"
      "dup v23.4s, v11.s[3]\n"
      "srshl v18.4s, v18.4s, v22.4s\n"
      "srshl v19.4s, v19.4s, v23.4s\n"
      "9:\n"
      // Dispatch on the destination type; uint8 is the fall-through case.
      "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT16) "\n"
      "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT16) "f\n"
      "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT8) "\n"
      "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT8) "f\n"
      // ---- uint8 destination ----
      RUY_STR(RUY_ASM_LABEL_STORE_UINT8) ":\n"
      // Saturating narrow int32 -> int16 (v16, v17), interleaved with
      // inserting the high halves of the next block's RHS data and with
      // re-initializing the accumulators v18 -- v31 for the next block.
      "ins v4.d[1], x1\n"
      "sqxtn v16.4h, v16.4s\n"
      "ins v5.d[1], x2\n"
      "sqxtn2 v16.8h, v17.4s\n"
      "ins v6.d[1], x3\n"
      "sqxtn v17.4h, v18.4s\n"
      "ins v7.d[1], x4\n"
      RUY_MAKE_ZERO(v18)
      "sqxtn2 v17.8h, v19.4s\n"
      RUY_MAKE_ZERO(v19)
      // Step the data pointers past the 64 bytes just loaded for the next
      // block.
      "add %[lhs_ptr], %[lhs_ptr], #64\n"
      // Add the destination zero point (kept in v13.h[4]).
      "dup v14.8h, v13.h[4]\n"
      RUY_MAKE_ZERO(v20)
      "add %[rhs_ptr], %[rhs_ptr], #64\n"
      "sqadd v16.8h, v16.8h, v14.8h\n"
      RUY_MAKE_ZERO(v21)
      "sqadd v17.8h, v17.8h, v14.8h\n"
      RUY_MAKE_ZERO(v22)
      // Saturating narrow int16 -> uint8; all 16 values end up in v16.
      "sqxtun v16.8b, v16.8h\n"
      RUY_MAKE_ZERO(v23)
      "sqxtun2 v16.16b, v17.8h\n"
      RUY_MAKE_ZERO(v24)
      // Load and apply the clamp_min / clamp_max bounds (unsigned compare).
      "ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
      RUY_MAKE_ZERO(v25)
      "ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
      RUY_MAKE_ZERO(v26)
      "dup v14.16b, w2\n"
      RUY_MAKE_ZERO(v27)
      "dup v15.16b, w3\n"
      RUY_MAKE_ZERO(v28)
      "umax v16.16b, v16.16b, v14.16b\n"
      RUY_MAKE_ZERO(v29)
      "umin v16.16b, v16.16b, v15.16b\n"
      RUY_MAKE_ZERO(v30)
      // w1 = min(dst_rows - row, 4), w2 = min(dst_cols - col, 4): how much of
      // the 4x4 block fits in the destination.
      "sub w1, %w[dst_rows], %w[row]\n"
      RUY_MAKE_ZERO(v31)
      "sub w2, %w[dst_cols], %w[col]\n"
      "mov w3, #4\n"
      "cmp w1, #4\n"
      "csel w1, w1, w3, le\n"
      "cmp w2, #4\n"
      "csel w2, w2, w3, le\n"
      // Flags end up "eq" only if w1 == 4 && w2 == 4 (whole block fits).
      "cmp w1, w3\n"
      "ccmp w2, w3, 0, eq\n"
      "mov x4, %[dst_ptr]\n"
      "beq 30f\n"
      // Partial block: write it to dst_tmp_buf, then copy w1 elements for
      // each of w2 columns. The temporary buffer has 4 bytes per column; the
      // destination advances by x11 (dst_stride) per column.
      "st1 {v16.16b}, [%[dst_tmp_buf]]\n"
      "mov x3, %[dst_tmp_buf]\n"
      "mov w6, #0\n"
      "50:\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov w5, #0\n"
      "51:\n"
      "ldrb w7, [x3, w5, uxtw]\n"
      "strb w7, [x4, w5, uxtw]\n"
      "add w5, w5, #1\n"
      "cmp w5, w1\n"
      "blt 51b\n"
      "add w6, w6, #1\n"
      "add x3, x3, #4\n"
      "add x4, x4, x11\n"
      "cmp w6, w2\n"
      "blt 50b\n"
      "b 31f\n"
      "30:\n"
      // Full block: store 4 columns of 4 bytes directly to the destination.
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v16.b}[0], [x3], #1\n"
      "add x4, x4, x11\n"
      "st1 {v16.b}[1], [x3], #1\n"
      "st1 {v16.b}[2], [x3], #1\n"
      "st1 {v16.b}[3], [x3], #1\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v16.b}[4], [x3], #1\n"
      "add x4, x4, x11\n"
      "st1 {v16.b}[5], [x3], #1\n"
      "st1 {v16.b}[6], [x3], #1\n"
      "st1 {v16.b}[7], [x3], #1\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v16.b}[8], [x3], #1\n"
      "add x4, x4, x11\n"
      "st1 {v16.b}[9], [x3], #1\n"
      "st1 {v16.b}[10], [x3], #1\n"
      "st1 {v16.b}[11], [x3], #1\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v16.b}[12], [x3], #1\n"
      "add x4, x4, x11\n"
      "st1 {v16.b}[13], [x3], #1\n"
      "st1 {v16.b}[14], [x3], #1\n"
      "st1 {v16.b}[15], [x3], #1\n"
      "31:\n"
      // Advance dst_ptr by 4 one-byte elements (next block of rows).
      "add %[dst_ptr], %[dst_ptr], #4\n"
      RUY_MAKE_ZERO(v16)
      RUY_MAKE_ZERO(v17)
      "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
      // ---- int8 destination ----
      // Same sequence as the uint8 case, except that the final narrowing and
      // the clamping are signed (sqxtn / smax / smin).
      RUY_STR(RUY_ASM_LABEL_STORE_INT8) ":\n"
      "ins v4.d[1], x1\n"
      "sqxtn v16.4h, v16.4s\n"
      "ins v5.d[1], x2\n"
      "sqxtn2 v16.8h, v17.4s\n"
      "ins v6.d[1], x3\n"
      "sqxtn v17.4h, v18.4s\n"
      "ins v7.d[1], x4\n"
      RUY_MAKE_ZERO(v18)
      "sqxtn2 v17.8h, v19.4s\n"
      RUY_MAKE_ZERO(v19)
      "add %[lhs_ptr], %[lhs_ptr], #64\n"
      // Add the destination zero point (kept in v13.h[4]).
      "dup v14.8h, v13.h[4]\n"
      RUY_MAKE_ZERO(v20)
      "add %[rhs_ptr], %[rhs_ptr], #64\n"
      "sqadd v16.8h, v16.8h, v14.8h\n"
      RUY_MAKE_ZERO(v21)
      "sqadd v17.8h, v17.8h, v14.8h\n"
      RUY_MAKE_ZERO(v22)
      // Saturating narrow int16 -> int8; all 16 values end up in v16.
      "sqxtn v16.8b, v16.8h\n"
      RUY_MAKE_ZERO(v23)
      "sqxtn2 v16.16b, v17.8h\n"
      RUY_MAKE_ZERO(v24)
      // Load and apply the clamp_min / clamp_max bounds (signed compare).
      "ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
      RUY_MAKE_ZERO(v25)
      "ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
      RUY_MAKE_ZERO(v26)
      "dup v14.16b, w2\n"
      RUY_MAKE_ZERO(v27)
      "dup v15.16b, w3\n"
      RUY_MAKE_ZERO(v28)
      "smax v16.16b, v16.16b, v14.16b\n"
      RUY_MAKE_ZERO(v29)
      "smin v16.16b, v16.16b, v15.16b\n"
      RUY_MAKE_ZERO(v30)
      // w1 = min(dst_rows - row, 4), w2 = min(dst_cols - col, 4).
      "sub w1, %w[dst_rows], %w[row]\n"
      RUY_MAKE_ZERO(v31)
      "sub w2, %w[dst_cols], %w[col]\n"
      "mov w3, #4\n"
      "cmp w1, #4\n"
      "csel w1, w1, w3, le\n"
      "cmp w2, #4\n"
      "csel w2, w2, w3, le\n"
      // Flags end up "eq" only if the whole 4x4 block fits.
      "cmp w1, w3\n"
      "ccmp w2, w3, 0, eq\n"
      "mov x4, %[dst_ptr]\n"
      "beq 30f\n"
      // Partial block: go through dst_tmp_buf and copy byte by byte.
      "st1 {v16.16b}, [%[dst_tmp_buf]]\n"
      "mov x3, %[dst_tmp_buf]\n"
      "mov w6, #0\n"
      "50:\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov w5, #0\n"
      "51:\n"
      "ldrb w7, [x3, w5, uxtw]\n"
      "strb w7, [x4, w5, uxtw]\n"
      "add w5, w5, #1\n"
      "cmp w5, w1\n"
      "blt 51b\n"
      "add w6, w6, #1\n"
      "add x3, x3, #4\n"
      "add x4, x4, x11\n"
      "cmp w6, w2\n"
      "blt 50b\n"
      "b 31f\n"
      "30:\n"
      // Full block: store 4 columns of 4 bytes directly to the destination.
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v16.b}[0], [x3], #1\n"
      "add x4, x4, x11\n"
      "st1 {v16.b}[1], [x3], #1\n"
      "st1 {v16.b}[2], [x3], #1\n"
      "st1 {v16.b}[3], [x3], #1\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v16.b}[4], [x3], #1\n"
      "add x4, x4, x11\n"
      "st1 {v16.b}[5], [x3], #1\n"
      "st1 {v16.b}[6], [x3], #1\n"
      "st1 {v16.b}[7], [x3], #1\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v16.b}[8], [x3], #1\n"
      "add x4, x4, x11\n"
      "st1 {v16.b}[9], [x3], #1\n"
      "st1 {v16.b}[10], [x3], #1\n"
      "st1 {v16.b}[11], [x3], #1\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v16.b}[12], [x3], #1\n"
      "add x4, x4, x11\n"
      "st1 {v16.b}[13], [x3], #1\n"
      "st1 {v16.b}[14], [x3], #1\n"
      "st1 {v16.b}[15], [x3], #1\n"
      "31:\n"
      // Advance dst_ptr by 4 one-byte elements.
      "add %[dst_ptr], %[dst_ptr], #4\n"
      RUY_MAKE_ZERO(v16)
      RUY_MAKE_ZERO(v17)
      "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
      // ---- int16 destination ----
      RUY_STR(RUY_ASM_LABEL_STORE_INT16) ":\n"
      // Add the destination zero point (kept in v13.h[4]) while the values
      // are still int32, then narrow with saturation to int16 in v16, v17.
      "dup v14.4h, v13.h[4]\n"
      "saddw v16.4s, v16.4s, v14.4h\n"
      "saddw v17.4s, v17.4s, v14.4h\n"
      "saddw v18.4s, v18.4s, v14.4h\n"
      "saddw v19.4s, v19.4s, v14.4h\n"
      "ins v4.d[1], x1\n"
      "sqxtn v16.4h, v16.4s\n"
      "ins v5.d[1], x2\n"
      "sqxtn2 v16.8h, v17.4s\n"
      "ins v6.d[1], x3\n"
      "sqxtn v17.4h, v18.4s\n"
      "ins v7.d[1], x4\n"
      RUY_MAKE_ZERO(v18)
      "sqxtn2 v17.8h, v19.4s\n"
      RUY_MAKE_ZERO(v19)
      "add %[lhs_ptr], %[lhs_ptr], #64\n"
      RUY_MAKE_ZERO(v20)
      "add %[rhs_ptr], %[rhs_ptr], #64\n"
      RUY_MAKE_ZERO(v21)
      RUY_MAKE_ZERO(v22)
      RUY_MAKE_ZERO(v23)
      RUY_MAKE_ZERO(v24)
      // Load and apply the clamp_min / clamp_max bounds, read as 16-bit
      // values and compared as signed.
      "ldrh w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
      RUY_MAKE_ZERO(v25)
      "ldrh w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
      RUY_MAKE_ZERO(v26)
      "dup v14.8h, w2\n"
      RUY_MAKE_ZERO(v27)
      "dup v15.8h, w3\n"
      RUY_MAKE_ZERO(v28)
      "smax v16.8h, v16.8h, v14.8h\n"
      "smax v17.8h, v17.8h, v14.8h\n"
      RUY_MAKE_ZERO(v29)
      "smin v16.8h, v16.8h, v15.8h\n"
      "smin v17.8h, v17.8h, v15.8h\n"
      RUY_MAKE_ZERO(v30)
      // w1 = min(dst_rows - row, 4), w2 = min(dst_cols - col, 4).
      "sub w1, %w[dst_rows], %w[row]\n"
      RUY_MAKE_ZERO(v31)
      "sub w2, %w[dst_cols], %w[col]\n"
      "mov w3, #4\n"
      "cmp w1, #4\n"
      "csel w1, w1, w3, le\n"
      "cmp w2, #4\n"
      "csel w2, w2, w3, le\n"
      // Flags end up "eq" only if the whole 4x4 block fits.
      "cmp w1, w3\n"
      "ccmp w2, w3, 0, eq\n"
      "mov x4, %[dst_ptr]\n"
      "beq 30f\n"
      // Partial block: go through dst_tmp_buf (8 bytes per column) and copy
      // one 16-bit element at a time.
      "str q16, [%[dst_tmp_buf], #0]\n"
      "str q17, [%[dst_tmp_buf], #16]\n"
      "mov x3, %[dst_tmp_buf]\n"
      "mov w6, #0\n"
      "50:\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov w5, #0\n"
      "51:\n"
      "ldrh w7, [x3, x5, lsl #1]\n"
      "strh w7, [x4, x5, lsl #1]\n"
      "add w5, w5, #1\n"
      "cmp w5, w1\n"
      "blt 51b\n"
      "add w6, w6, #1\n"
      "add x3, x3, #8\n"
      "add x4, x4, x11\n"
      "cmp w6, w2\n"
      "blt 50b\n"
      "b 31f\n"
      "30:\n"
      // Full block: store 4 columns of 4 int16 values directly.
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v16.h}[0], [x3], #2\n"
      "add x4, x4, x11\n"
      "st1 {v16.h}[1], [x3], #2\n"
      "st1 {v16.h}[2], [x3], #2\n"
      "st1 {v16.h}[3], [x3], #2\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v16.h}[4], [x3], #2\n"
      "add x4, x4, x11\n"
      "st1 {v16.h}[5], [x3], #2\n"
      "st1 {v16.h}[6], [x3], #2\n"
      "st1 {v16.h}[7], [x3], #2\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v17.h}[0], [x3], #2\n"
      "add x4, x4, x11\n"
      "st1 {v17.h}[1], [x3], #2\n"
      "st1 {v17.h}[2], [x3], #2\n"
      "st1 {v17.h}[3], [x3], #2\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v17.h}[4], [x3], #2\n"
      "add x4, x4, x11\n"
      "st1 {v17.h}[5], [x3], #2\n"
      "st1 {v17.h}[6], [x3], #2\n"
      "st1 {v17.h}[7], [x3], #2\n"
      "31:\n"
      // Advance dst_ptr by 4 two-byte elements.
      "add %[dst_ptr], %[dst_ptr], #8\n"
      RUY_MAKE_ZERO(v16)
      RUY_MAKE_ZERO(v17)
      "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
      // ---- int32 destination ----
      // The raw accumulators in v16 -- v19 are stored without rescaling.
      RUY_STR(RUY_ASM_LABEL_STORE_INT32) ":\n"
      // This path skipped the rescaling code, so the high 64 bits of the
      // next block's LHS/RHS data are loaded and inserted here instead.
      "ldr x1, [%[lhs_ptr], #8]\n"
      "ldr x2, [%[lhs_ptr], #24]\n"
      "ldr x3, [%[lhs_ptr], #40]\n"
      "ldr x4, [%[lhs_ptr], #56]\n"
      "ins v0.d[1], x1\n"
      "ldr x1, [%[rhs_ptr], #8]\n"
      "ins v1.d[1], x2\n"
      "ldr x2, [%[rhs_ptr], #24]\n"
      "ins v2.d[1], x3\n"
      "ldr x3, [%[rhs_ptr], #40]\n"
      "ins v3.d[1], x4\n"
      "ldr x4, [%[rhs_ptr], #56]\n"
      "ins v4.d[1], x1\n"
      "ins v5.d[1], x2\n"
      "ins v6.d[1], x3\n"
      "ins v7.d[1], x4\n"
      RUY_MAKE_ZERO(v20)
      "add %[lhs_ptr], %[lhs_ptr], #64\n"
      RUY_MAKE_ZERO(v21)
      "add %[rhs_ptr], %[rhs_ptr], #64\n"
      RUY_MAKE_ZERO(v22)
      RUY_MAKE_ZERO(v23)
      RUY_MAKE_ZERO(v24)
      RUY_MAKE_ZERO(v25)
      RUY_MAKE_ZERO(v26)
      RUY_MAKE_ZERO(v27)
      RUY_MAKE_ZERO(v28)
      RUY_MAKE_ZERO(v29)
      RUY_MAKE_ZERO(v30)
      // w1 = min(dst_rows - row, 4), w2 = min(dst_cols - col, 4).
      "sub w1, %w[dst_rows], %w[row]\n"
      RUY_MAKE_ZERO(v31)
      "sub w2, %w[dst_cols], %w[col]\n"
      "mov w3, #4\n"
      "cmp w1, #4\n"
      "csel w1, w1, w3, le\n"
      "cmp w2, #4\n"
      "csel w2, w2, w3, le\n"
      // Flags end up "eq" only if the whole 4x4 block fits.
      "cmp w1, w3\n"
      "ccmp w2, w3, 0, eq\n"
      "mov x4, %[dst_ptr]\n"
      "beq 30f\n"
      // Partial block: go through dst_tmp_buf (16 bytes per column) and copy
      // one 32-bit element at a time.
      "str q16, [%[dst_tmp_buf], #0]\n"
      "str q17, [%[dst_tmp_buf], #16]\n"
      "str q18, [%[dst_tmp_buf], #32]\n"
      "str q19, [%[dst_tmp_buf], #48]\n"
      "mov x3, %[dst_tmp_buf]\n"
      "mov w6, #0\n"
      "50:\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov w5, #0\n"
      "51:\n"
      "ldr w7, [x3, x5, lsl #2]\n"
      "str w7, [x4, x5, lsl #2]\n"
      "add w5, w5, #1\n"
      "cmp w5, w1\n"
      "blt 51b\n"
      "add w6, w6, #1\n"
      "add x3, x3, #16\n"
      "add x4, x4, x11\n"
      "cmp w6, w2\n"
      "blt 50b\n"
      "b 31f\n"
      "30:\n"
      // Full block: store 4 columns of 4 int32 values directly.
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v16.s}[0], [x3], #4\n"
      "add x4, x4, x11\n"
      "st1 {v16.s}[1], [x3], #4\n"
      "st1 {v16.s}[2], [x3], #4\n"
      "st1 {v16.s}[3], [x3], #4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v17.s}[0], [x3], #4\n"
      "add x4, x4, x11\n"
      "st1 {v17.s}[1], [x3], #4\n"
      "st1 {v17.s}[2], [x3], #4\n"
      "st1 {v17.s}[3], [x3], #4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v18.s}[0], [x3], #4\n"
      "add x4, x4, x11\n"
      "st1 {v18.s}[1], [x3], #4\n"
      "st1 {v18.s}[2], [x3], #4\n"
      "st1 {v18.s}[3], [x3], #4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov x3, x4\n"
      "st1 {v19.s}[0], [x3], #4\n"
      "add x4, x4, x11\n"
      "st1 {v19.s}[1], [x3], #4\n"
      "st1 {v19.s}[2], [x3], #4\n"
      "st1 {v19.s}[3], [x3], #4\n"
      "31:\n"
      // Advance dst_ptr by 4 four-byte elements.
      "add %[dst_ptr], %[dst_ptr], #16\n"
      RUY_MAKE_ZERO(v16)
      RUY_MAKE_ZERO(v17)
      RUY_MAKE_ZERO(v18)
      RUY_MAKE_ZERO(v19)
      // ---- common continuation after any store ----
      RUY_STR(RUY_ASM_LABEL_AFTER_STORE) ":\n"
      // For the next block: first multiply-adds on the data already loaded,
      // interleaved with reloading x5 -- x7, which were used as scratch since
      // they were last loaded.
      "smull v8.8h, v0.8b, v4.8b\n"
      "smull v9.8h, v1.8b, v4.8b\n"
      "smull v10.8h, v2.8b, v4.8b\n"
      "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "smull v11.8h, v3.8b, v4.8b\n"
      "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
      "smull v12.8h, v0.8b, v5.8b\n"
      "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
      "smull v13.8h, v1.8b, v5.8b\n"
      "smull v14.8h, v2.8b, v5.8b\n"
      "smull v15.8h, v3.8b, v5.8b\n"
      // Was this the last row? The flags are consumed by `beq 20f` below.
      "cmp %w[row], w7\n"
      "smlal2 v8.8h, v0.16b, v4.16b\n"
      "smlal2 v9.8h, v1.16b, v4.16b\n"
      "smlal2 v10.8h, v2.16b, v4.16b\n"
      "smlal2 v11.8h, v3.16b, v4.16b\n"
      "smlal2 v12.8h, v0.16b, v5.16b\n"
      "smlal2 v13.8h, v1.16b, v5.16b\n"
      "smlal2 v14.8h, v2.16b, v5.16b\n"
      "smlal2 v15.8h, v3.16b, v5.16b\n"
      "beq 20f\n"
      // Not the last row: move down by one block of 4 rows.
      "add %w[row], %w[row], #4\n"
      "b 21f\n"
      "20:\n"
      // Last row: go back to start_row and move to the next block of 4
      // columns, advancing the destination column pointer by 4 * dst_stride.
      "mov %w[row], w6\n"
      "add %w[col], %w[col], #4\n"
      "add %[dst_col_ptr], %[dst_col_ptr], x11, lsl #2\n"
      "mov %[dst_ptr], %[dst_col_ptr]\n"
      "21:\n"
      // Main loop exit condition: continue while col <= last_col (w8).
      "cmp %w[col], w8\n"
      // 16 levels of depth are already loaded for the next block.
      "mov w1, #16\n"
      "ble 1b\n"
      : [ lhs_col_ptr ] "+r"(lhs_col_ptr), [rhs_col_ptr] "+r"(rhs_col_ptr),
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [dst_col_ptr] "+r"(dst_col_ptr), [dst_ptr] "+r"(dst_ptr), [row] "+r"(row), [col] "+r"(col)
      // The asm addresses every parameter field relative to the address of
      // `params`, so the operand must be `&params`.
      : [ params ] "r"(&params), [dst_rows] "r"(params.dst_rows),
        [dst_cols] "r"(params.dst_cols), [dst_tmp_buf] "r"(params.dst_tmp_buf),
        [dst_type_id] "r"(params.dst_type_id)
      : "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc",
        "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
        "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
        "v26", "v27", "v28", "v29", "v30", "v31");
}
void Kernel8bitNeonDotprod(const KernelParams8bit<8, 8>& params) {
profiler::ScopeLabel label("Kernel (kNeonDotprod)");
CheckOffsetsInKernelParams8bit(params);
const std::int8_t* lhs_col_ptr = params.lhs_base_ptr;
const std::int8_t* rhs_col_ptr =
static_cast<const int8_t*>(params.rhs_base_ptr);
const std::int8_t* lhs_ptr = lhs_col_ptr;
const std::int8_t* rhs_ptr = rhs_col_ptr;
void* dst_col_ptr = params.dst_base_ptr;
void* dst_ptr = dst_col_ptr;
int row = params.start_row;
int col = params.start_col;
asm volatile(
#define RUY_MAKE_ZERO …
"ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
"ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
"ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
"ldr w8, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
"ldr w9, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
"ldr w10, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
"ldr w11, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
"ldr w12, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"
"ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
"ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
"ld1 {v2.16b}, [%[rhs_ptr]], #16\n"
"ld1 {v3.16b}, [%[rhs_ptr]], #16\n"
RUY_MAKE_ZERO(v16)
RUY_MAKE_ZERO(v17)
RUY_MAKE_ZERO(v18)
RUY_MAKE_ZERO(v19)
RUY_MAKE_ZERO(v20)
RUY_MAKE_ZERO(v21)
RUY_MAKE_ZERO(v22)
RUY_MAKE_ZERO(v23)
RUY_MAKE_ZERO(v24)
RUY_MAKE_ZERO(v25)
RUY_MAKE_ZERO(v26)
RUY_MAKE_ZERO(v27)
RUY_MAKE_ZERO(v28)
RUY_MAKE_ZERO(v29)
RUY_MAKE_ZERO(v30)
RUY_MAKE_ZERO(v31)
"mov w1, #4\n"
".word 0x4f82e010 // sdot v16.4s, v0.16b, v2.4b[0]\n"
".word 0x4fa2e012 // sdot v18.4s, v0.16b, v2.4b[1]\n"
".word 0x4f82e814 // sdot v20.4s, v0.16b, v2.4b[2]\n"
".word 0x4fa2e816 // sdot v22.4s, v0.16b, v2.4b[3]\n"
"1:\n"
#if RUY_OPT(MAX_STREAMING)
"cmp w12, #32\n"
"blt 78f\n"
"ld1 {v4.16b}, [%[lhs_ptr]], #16\n"
"ld1 {v5.16b}, [%[lhs_ptr]], #16\n"
"ld1 {v6.16b}, [%[rhs_ptr]], #16\n"
"ld1 {v7.16b}, [%[rhs_ptr]], #16\n"
"ld1 {v8.16b}, [%[lhs_ptr]], #16\n"
"ld1 {v9.16b}, [%[lhs_ptr]], #16\n"
"ld1 {v10.16b}, [%[rhs_ptr]], #16\n"
"ld1 {v11.16b}, [%[rhs_ptr]], #16\n"
"ld1 {v12.16b}, [%[lhs_ptr]], #16\n"
"ld1 {v13.16b}, [%[lhs_ptr]], #16\n"
"ld1 {v14.16b}, [%[rhs_ptr]], #16\n"
"ld1 {v15.16b}, [%[rhs_ptr]], #16\n"
"mov w1, #16\n"
"and w3, w12, #-16\n"
"81:\n"
"add w1, w1, #16\n"
".word 0x4f83e018 // sdot v24.4s, v0.16b, v3.4b[0]\n"
".word 0x4fa3e01a // sdot v26.4s, v0.16b, v3.4b[1]\n"
".word 0x4f83e81c // sdot v28.4s, v0.16b, v3.4b[2]\n"
".word 0x4fa3e81e // sdot v30.4s, v0.16b, v3.4b[3]\n"
"ldr q0, [%[lhs_ptr], #0]\n"
".word 0x4f82e031 // sdot v17.4s, v1.16b, v2.4b[0]\n"
".word 0x4fa2e033 // sdot v19.4s, v1.16b, v2.4b[1]\n"
".word 0x4f82e835 // sdot v21.4s, v1.16b, v2.4b[2]\n"
".word 0x4fa2e837 // sdot v23.4s, v1.16b, v2.4b[3]\n"
"ldr q2, [%[rhs_ptr], #0]\n"
".word 0x4f83e039 // sdot v25.4s, v1.16b, v3.4b[0]\n"
".word 0x4fa3e03b // sdot v27.4s, v1.16b, v3.4b[1]\n"
".word 0x4f83e83d // sdot v29.4s, v1.16b, v3.4b[2]\n"
".word 0x4fa3e83f // sdot v31.4s, v1.16b, v3.4b[3]\n"
"ldr q1, [%[lhs_ptr], #16]\n"
".word 0x4f87e098 // sdot v24.4s, v4.16b, v7.4b[0]\n"
".word 0x4fa7e09a // sdot v26.4s, v4.16b, v7.4b[1]\n"
"ldr q3, [%[rhs_ptr], #16]\n"
".word 0x4f87e89c // sdot v28.4s, v4.16b, v7.4b[2]\n"
".word 0x4fa7e89e // sdot v30.4s, v4.16b, v7.4b[3]\n"
".word 0x4f86e0b1 // sdot v17.4s, v5.16b, v6.4b[0]\n"
".word 0x4fa6e0b3 // sdot v19.4s, v5.16b, v6.4b[1]\n"
".word 0x4f86e8b5 // sdot v21.4s, v5.16b, v6.4b[2]\n"
".word 0x4fa6e8b7 // sdot v23.4s, v5.16b, v6.4b[3]\n"
".word 0x4f87e0b9 // sdot v25.4s, v5.16b, v7.4b[0]\n"
".word 0x4fa7e0bb // sdot v27.4s, v5.16b, v7.4b[1]\n"
".word 0x4f87e8bd // sdot v29.4s, v5.16b, v7.4b[2]\n"
".word 0x4fa7e8bf // sdot v31.4s, v5.16b, v7.4b[3]\n"
"ldr q5, [%[lhs_ptr], #48]\n"
".word 0x4f86e090 // sdot v16.4s, v4.16b, v6.4b[0]\n"
".word 0x4fa6e092 // sdot v18.4s, v4.16b, v6.4b[1]\n"
"ldr q7, [%[rhs_ptr], #48]\n"
".word 0x4f86e894 // sdot v20.4s, v4.16b, v6.4b[2]\n"
".word 0x4fa6e896 // sdot v22.4s, v4.16b, v6.4b[3]\n"
"ldr q4, [%[lhs_ptr], #32]\n"
".word 0x4f8be118 // sdot v24.4s, v8.16b, v11.4b[0]\n"
".word 0x4fabe11a // sdot v26.4s, v8.16b, v11.4b[1]\n"
"ldr q6, [%[rhs_ptr], #32]\n"
".word 0x4f8be91c // sdot v28.4s, v8.16b, v11.4b[2]\n"
".word 0x4fabe91e // sdot v30.4s, v8.16b, v11.4b[3]\n"
".word 0x4f8ae131 // sdot v17.4s, v9.16b, v10.4b[0]\n"
".word 0x4faae133 // sdot v19.4s, v9.16b, v10.4b[1]\n"
".word 0x4f8ae935 // sdot v21.4s, v9.16b, v10.4b[2]\n"
".word 0x4faae937 // sdot v23.4s, v9.16b, v10.4b[3]\n"
".word 0x4f8be139 // sdot v25.4s, v9.16b, v11.4b[0]\n"
".word 0x4fabe13b // sdot v27.4s, v9.16b, v11.4b[1]\n"
".word 0x4f8be93d // sdot v29.4s, v9.16b, v11.4b[2]\n"
".word 0x4fabe93f // sdot v31.4s, v9.16b, v11.4b[3]\n"
"ldr q9, [%[lhs_ptr], #80]\n"
".word 0x4f8ae110 // sdot v16.4s, v8.16b, v10.4b[0]\n"
".word 0x4faae112 // sdot v18.4s, v8.16b, v10.4b[1]\n"
"ldr q11, [%[rhs_ptr], #80]\n"
".word 0x4f8ae914 // sdot v20.4s, v8.16b, v10.4b[2]\n"
".word 0x4faae916 // sdot v22.4s, v8.16b, v10.4b[3]\n"
"ldr q8, [%[lhs_ptr], #64]\n"
".word 0x4f8fe198 // sdot v24.4s, v12.16b, v15.4b[0]\n"
".word 0x4fafe19a // sdot v26.4s, v12.16b, v15.4b[1]\n"
"ldr q10, [%[rhs_ptr], #64]\n"
".word 0x4f8fe99c // sdot v28.4s, v12.16b, v15.4b[2]\n"
".word 0x4fafe99e // sdot v30.4s, v12.16b, v15.4b[3]\n"
"add %[lhs_ptr], %[lhs_ptr], #128\n"
".word 0x4f8ee1b1 // sdot v17.4s, v13.16b, v14.4b[0]\n"
".word 0x4faee1b3 // sdot v19.4s, v13.16b, v14.4b[1]\n"
"add %[rhs_ptr], %[rhs_ptr], #128\n"
".word 0x4f8ee9b5 // sdot v21.4s, v13.16b, v14.4b[2]\n"
".word 0x4faee9b7 // sdot v23.4s, v13.16b, v14.4b[3]\n"
".word 0x4f8fe1b9 // sdot v25.4s, v13.16b, v15.4b[0]\n"
".word 0x4fafe1bb // sdot v27.4s, v13.16b, v15.4b[1]\n"
"cmp w1, w3\n"
".word 0x4f8fe9bd // sdot v29.4s, v13.16b, v15.4b[2]\n"
".word 0x4fafe9bf // sdot v31.4s, v13.16b, v15.4b[3]\n"
"ldr q13, [%[lhs_ptr], #-16]\n"
".word 0x4f8ee190 // sdot v16.4s, v12.16b, v14.4b[0]\n"
".word 0x4faee192 // sdot v18.4s, v12.16b, v14.4b[1]\n"
"ldr q15, [%[rhs_ptr], #-16]\n"
".word 0x4f8ee994 // sdot v20.4s, v12.16b, v14.4b[2]\n"
".word 0x4faee996 // sdot v22.4s, v12.16b, v14.4b[3]\n"
"ldr q12, [%[lhs_ptr], #-32]\n"
".word 0x4f82e010 // sdot v16.4s, v0.16b, v2.4b[0]\n"
".word 0x4fa2e012 // sdot v18.4s, v0.16b, v2.4b[1]\n"
"ldr q14, [%[rhs_ptr], #-32]\n"
".word 0x4f82e814 // sdot v20.4s, v0.16b, v2.4b[2]\n"
".word 0x4fa2e816 // sdot v22.4s, v0.16b, v2.4b[3]\n"
"blt 81b\n"
".word 0x4f87e098 // sdot v24.4s, v4.16b, v7.4b[0]\n"
".word 0x4fa7e09a // sdot v26.4s, v4.16b, v7.4b[1]\n"
".word 0x4f87e89c // sdot v28.4s, v4.16b, v7.4b[2]\n"
".word 0x4fa7e89e // sdot v30.4s, v4.16b, v7.4b[3]\n"
".word 0x4f86e0b1 // sdot v17.4s, v5.16b, v6.4b[0]\n"
".word 0x4fa6e0b3 // sdot v19.4s, v5.16b, v6.4b[1]\n"
".word 0x4f86e8b5 // sdot v21.4s, v5.16b, v6.4b[2]\n"
".word 0x4fa6e8b7 // sdot v23.4s, v5.16b, v6.4b[3]\n"
".word 0x4f87e0b9 // sdot v25.4s, v5.16b, v7.4b[0]\n"
".word 0x4fa7e0bb // sdot v27.4s, v5.16b, v7.4b[1]\n"
".word 0x4f87e8bd // sdot v29.4s, v5.16b, v7.4b[2]\n"
".word 0x4fa7e8bf // sdot v31.4s, v5.16b, v7.4b[3]\n"
".word 0x4f86e090 // sdot v16.4s, v4.16b, v6.4b[0]\n"
".word 0x4fa6e092 // sdot v18.4s, v4.16b, v6.4b[1]\n"
".word 0x4f86e894 // sdot v20.4s, v4.16b, v6.4b[2]\n"
".word 0x4fa6e896 // sdot v22.4s, v4.16b, v6.4b[3]\n"
".word 0x4f8be118 // sdot v24.4s, v8.16b, v11.4b[0]\n"
".word 0x4fabe11a // sdot v26.4s, v8.16b, v11.4b[1]\n"
".word 0x4f8be91c // sdot v28.4s, v8.16b, v11.4b[2]\n"
".word 0x4fabe91e // sdot v30.4s, v8.16b, v11.4b[3]\n"
".word 0x4f8ae131 // sdot v17.4s, v9.16b, v10.4b[0]\n"
".word 0x4faae133 // sdot v19.4s, v9.16b, v10.4b[1]\n"
".word 0x4f8ae935 // sdot v21.4s, v9.16b, v10.4b[2]\n"
".word 0x4faae937 // sdot v23.4s, v9.16b, v10.4b[3]\n"
".word 0x4f8be139 // sdot v25.4s, v9.16b, v11.4b[0]\n"
".word 0x4fabe13b // sdot v27.4s, v9.16b, v11.4b[1]\n"
".word 0x4f8be93d // sdot v29.4s, v9.16b, v11.4b[2]\n"
".word 0x4fabe93f // sdot v31.4s, v9.16b, v11.4b[3]\n"
".word 0x4f8ae110 // sdot v16.4s, v8.16b, v10.4b[0]\n"
".word 0x4faae112 // sdot v18.4s, v8.16b, v10.4b[1]\n"
".word 0x4f8ae914 // sdot v20.4s, v8.16b, v10.4b[2]\n"
".word 0x4faae916 // sdot v22.4s, v8.16b, v10.4b[3]\n"
".word 0x4f8fe198 // sdot v24.4s, v12.16b, v15.4b[0]\n"
".word 0x4fafe19a // sdot v26.4s, v12.16b, v15.4b[1]\n"
".word 0x4f8fe99c // sdot v28.4s, v12.16b, v15.4b[2]\n"
".word 0x4fafe99e // sdot v30.4s, v12.16b, v15.4b[3]\n"
".word 0x4f8ee1b1 // sdot v17.4s, v13.16b, v14.4b[0]\n"
".word 0x4faee1b3 // sdot v19.4s, v13.16b, v14.4b[1]\n"
".word 0x4f8ee9b5 // sdot v21.4s, v13.16b, v14.4b[2]\n"
".word 0x4faee9b7 // sdot v23.4s, v13.16b, v14.4b[3]\n"
".word 0x4f8fe1b9 // sdot v25.4s, v13.16b, v15.4b[0]\n"
".word 0x4fafe1bb // sdot v27.4s, v13.16b, v15.4b[1]\n"
".word 0x4f8fe9bd // sdot v29.4s, v13.16b, v15.4b[2]\n"
".word 0x4fafe9bf // sdot v31.4s, v13.16b, v15.4b[3]\n"
".word 0x4f8ee190 // sdot v16.4s, v12.16b, v14.4b[0]\n"
".word 0x4faee192 // sdot v18.4s, v12.16b, v14.4b[1]\n"
".word 0x4f8ee994 // sdot v20.4s, v12.16b, v14.4b[2]\n"
".word 0x4faee996 // sdot v22.4s, v12.16b, v14.4b[3]\n"
"78:\n"
#endif
"cmp w1, w12\n"
"beq 79f\n"
"2:\n"
".word 0x4f83e018 // sdot v24.4s, v0.16b, v3.4b[0]\n"
".word 0x4fa3e01a // sdot v26.4s, v0.16b, v3.4b[1]\n"
"add w1, w1, #4\n"
".word 0x4f83e81c // sdot v28.4s, v0.16b, v3.4b[2]\n"
".word 0x4fa3e81e // sdot v30.4s, v0.16b, v3.4b[3]\n"
"ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
".word 0x4f82e031 // sdot v17.4s, v1.16b, v2.4b[0]\n"
".word 0x4fa2e033 // sdot v19.4s, v1.16b, v2.4b[1]\n"
"cmp w1, w12\n"
".word 0x4f82e835 // sdot v21.4s, v1.16b, v2.4b[2]\n"
".word 0x4fa2e837 // sdot v23.4s, v1.16b, v2.4b[3]\n"
"ld1 {v2.16b}, [%[rhs_ptr]], #16\n"
".word 0x4f83e039 // sdot v25.4s, v1.16b, v3.4b[0]\n"
".word 0x4fa3e03b // sdot v27.4s, v1.16b, v3.4b[1]\n"
".word 0x4f83e83d // sdot v29.4s, v1.16b, v3.4b[2]\n"
".word 0x4fa3e83f // sdot v31.4s, v1.16b, v3.4b[3]\n"
"ld1 {v3.16b}, [%[rhs_ptr]], #16\n"
".word 0x4f82e010 // sdot v16.4s, v0.16b, v2.4b[0]\n"
".word 0x4fa2e012 // sdot v18.4s, v0.16b, v2.4b[1]\n"
".word 0x4f82e814 // sdot v20.4s, v0.16b, v2.4b[2]\n"
".word 0x4fa2e816 // sdot v22.4s, v0.16b, v2.4b[3]\n"
"ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
"blt 2b\n"
"79:\n"
".word 0x4f83e018 // sdot v24.4s, v0.16b, v3.4b[0]\n"
".word 0x4fa3e01a // sdot v26.4s, v0.16b, v3.4b[1]\n"
".word 0x4f83e81c // sdot v28.4s, v0.16b, v3.4b[2]\n"
".word 0x4fa3e81e // sdot v30.4s, v0.16b, v3.4b[3]\n"
".word 0x4f82e031 // sdot v17.4s, v1.16b, v2.4b[0]\n"
".word 0x4fa2e033 // sdot v19.4s, v1.16b, v2.4b[1]\n"
".word 0x4f82e835 // sdot v21.4s, v1.16b, v2.4b[2]\n"
".word 0x4fa2e837 // sdot v23.4s, v1.16b, v2.4b[3]\n"
".word 0x4f83e039 // sdot v25.4s, v1.16b, v3.4b[0]\n"
".word 0x4fa3e03b // sdot v27.4s, v1.16b, v3.4b[1]\n"
".word 0x4f83e83d // sdot v29.4s, v1.16b, v3.4b[2]\n"
".word 0x4fa3e83f // sdot v31.4s, v1.16b, v3.4b[3]\n"
"cmp %w[row], w7\n"
"bge 4f\n"
"add %[lhs_col_ptr], %[lhs_col_ptr], x9, lsl #3\n"
"b 5f\n"
"4:\n"
"mov %[lhs_col_ptr], x5\n"
"cmp %w[col], w8\n"
"bge 5f\n"
"add %[rhs_col_ptr], %[rhs_col_ptr], x10, lsl #3\n"
"5:\n"
"mov %[lhs_ptr], %[lhs_col_ptr]\n"
"mov %[rhs_ptr], %[rhs_col_ptr]\n"
"mvni v8.4s, #0\n"
"ldr w3, [%[params], #" RUY_STR(RUY_OFFSET_PROD_ZP_DEPTH) "]\n"
"ldrb w6, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
"dup v9.4s, w3\n"
"ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
"tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
"csel w3, %w[row], %w[col], eq\n"
"add x5, x1, x3, lsl #2\n"
"tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
"csel x1, x1, x5, eq\n"
"ld1 {v14.4s}, [x1], #16\n"
"ld1 {v15.4s}, [x1]\n"
"ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
"ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
"ld1 {v2.16b}, [%[rhs_ptr]], #16\n"
"ld1 {v3.16b}, [%[rhs_ptr]], #16\n"
"add v14.4s, v14.4s, v9.4s\n"
"add v15.4s, v15.4s, v9.4s\n"
"tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
"bne 6f\n"
"add v16.4s, v16.4s, v14.4s\n"
"add v17.4s, v17.4s, v15.4s\n"
"add v18.4s, v18.4s, v14.4s\n"
"add v19.4s, v19.4s, v15.4s\n"
"add v20.4s, v20.4s, v14.4s\n"
"add v21.4s, v21.4s, v15.4s\n"
"add v22.4s, v22.4s, v14.4s\n"
"add v23.4s, v23.4s, v15.4s\n"
"add v24.4s, v24.4s, v14.4s\n"
"add v25.4s, v25.4s, v15.4s\n"
"add v26.4s, v26.4s, v14.4s\n"
"add v27.4s, v27.4s, v15.4s\n"
"add v28.4s, v28.4s, v14.4s\n"
"add v29.4s, v29.4s, v15.4s\n"
"add v30.4s, v30.4s, v14.4s\n"
"add v31.4s, v31.4s, v15.4s\n"
"b 7f\n"
"6:\n"
"dup v10.4s, v14.s[0]\n"
"dup v11.4s, v14.s[1]\n"
"dup v12.4s, v14.s[2]\n"
"dup v13.4s, v14.s[3]\n"
"add v16.4s, v16.4s, v10.4s\n"
"add v17.4s, v17.4s, v10.4s\n"
"add v18.4s, v18.4s, v11.4s\n"
"add v19.4s, v19.4s, v11.4s\n"
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
"add v22.4s, v22.4s, v13.4s\n"
"add v23.4s, v23.4s, v13.4s\n"
"dup v10.4s, v15.s[0]\n"
"dup v11.4s, v15.s[1]\n"
"dup v12.4s, v15.s[2]\n"
"dup v13.4s, v15.s[3]\n"
"add v24.4s, v24.4s, v10.4s\n"
"add v25.4s, v25.4s, v10.4s\n"
"add v26.4s, v26.4s, v11.4s\n"
"add v27.4s, v27.4s, v11.4s\n"
"add v28.4s, v28.4s, v12.4s\n"
"add v29.4s, v29.4s, v12.4s\n"
"add v30.4s, v30.4s, v13.4s\n"
"add v31.4s, v31.4s, v13.4s\n"
"7:\n"
"tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_RHS_SUMS) "\n"
"beq 401f\n"
"ldr x3, [%[params], #" RUY_STR(RUY_OFFSET_RHS_SUMS) "]\n"
"add x3, x3, %x[col], lsl #2\n"
"ld1 {v14.4s}, [x3], #16\n"
"ld1 {v15.4s}, [x3]\n"
"ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_ZERO_POINT) "]\n"
"dup v10.4s, w5\n"
"mls v16.4s, v10.4s, v14.s[0]\n"
"mls v17.4s, v10.4s, v14.s[0]\n"
"mls v18.4s, v10.4s, v14.s[1]\n"
"mls v19.4s, v10.4s, v14.s[1]\n"
"mls v20.4s, v10.4s, v14.s[2]\n"
"mls v21.4s, v10.4s, v14.s[2]\n"
"mls v22.4s, v10.4s, v14.s[3]\n"
"mls v23.4s, v10.4s, v14.s[3]\n"
"mls v24.4s, v10.4s, v15.s[0]\n"
"mls v25.4s, v10.4s, v15.s[0]\n"
"mls v26.4s, v10.4s, v15.s[1]\n"
"mls v27.4s, v10.4s, v15.s[1]\n"
"mls v28.4s, v10.4s, v15.s[2]\n"
"mls v29.4s, v10.4s, v15.s[2]\n"
"mls v30.4s, v10.4s, v15.s[3]\n"
"mls v31.4s, v10.4s, v15.s[3]\n"
"401:\n"
"tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_LHS_SUMS) "\n"
"beq 402f\n"
"ldr x2, [%[params], #" RUY_STR(RUY_OFFSET_LHS_SUMS) "]\n"
"add x2, x2, %x[row], lsl #2\n"
"ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_RHS_ZERO_POINT) "]\n"
"ld1 {v11.4s}, [x2], #16\n"
"ld1 {v12.4s}, [x2]\n"
"ins v13.s[1], w5\n"
"mul v11.4s, v11.4s, v13.s[1]\n"
"mul v12.4s, v12.4s, v13.s[1]\n"
"sub v16.4s, v16.4s, v11.4s\n"
"sub v17.4s, v17.4s, v12.4s\n"
"sub v18.4s, v18.4s, v11.4s\n"
"sub v19.4s, v19.4s, v12.4s\n"
"sub v20.4s, v20.4s, v11.4s\n"
"sub v21.4s, v21.4s, v12.4s\n"
"sub v22.4s, v22.4s, v11.4s\n"
"sub v23.4s, v23.4s, v12.4s\n"
"sub v24.4s, v24.4s, v11.4s\n"
"sub v25.4s, v25.4s, v12.4s\n"
"sub v26.4s, v26.4s, v11.4s\n"
"sub v27.4s, v27.4s, v12.4s\n"
"sub v28.4s, v28.4s, v11.4s\n"
"sub v29.4s, v29.4s, v12.4s\n"
"sub v30.4s, v30.4s, v11.4s\n"
"sub v31.4s, v31.4s, v12.4s\n"
"cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT32) "\n"
"beq " RUY_STR(RUY_ASM_LABEL_STORE_INT32) "f\n"
"402:\n"
"ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_EXPONENT) "]\n"
"tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
"csel w3, %w[row], %w[col], eq\n"
"tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
"add x5, x1, x3, lsl #2\n"
"csel x1, x1, x5, eq\n"
"ldr q9, [x1]\n"
"ldr q10, [x1, #16]\n"
"smin v11.4s, v8.4s, v9.4s\n"
"smin v12.4s, v8.4s, v10.4s\n"
"sub v9.4s, v9.4s, v11.4s\n"
"sub v10.4s, v10.4s, v12.4s\n"
"ldr x4, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_FIXEDPOINT) "]\n"
"add x5, x4, x3, lsl #2\n"
"csel x4, x4, x5, eq\n"
"ldr q14, [x4]\n"
"ldr q15, [x4, #16]\n"
"tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
"bne 8f\n"
"sshl v16.4s, v16.4s, v9.4s\n"
"sshl v17.4s, v17.4s, v10.4s\n"
"sshl v18.4s, v18.4s, v9.4s\n"
"sshl v19.4s, v19.4s, v10.4s\n"
"sshl v20.4s, v20.4s, v9.4s\n"
"sshl v21.4s, v21.4s, v10.4s\n"
"sshl v22.4s, v22.4s, v9.4s\n"
"sshl v23.4s, v23.4s, v10.4s\n"
"sshl v24.4s, v24.4s, v9.4s\n"
"sshl v25.4s, v25.4s, v10.4s\n"
"sshl v26.4s, v26.4s, v9.4s\n"
"sshl v27.4s, v27.4s, v10.4s\n"
"sshl v28.4s, v28.4s, v9.4s\n"
"sshl v29.4s, v29.4s, v10.4s\n"
"sshl v30.4s, v30.4s, v9.4s\n"
"sshl v31.4s, v31.4s, v10.4s\n"
"10:\n"
"sqdmulh v16.4s, v16.4s, v14.4s\n"
"sqdmulh v17.4s, v17.4s, v15.4s\n"
"sqdmulh v18.4s, v18.4s, v14.4s\n"
"sqdmulh v19.4s, v19.4s, v15.4s\n"
"sqdmulh v20.4s, v20.4s, v14.4s\n"
"sqdmulh v21.4s, v21.4s, v15.4s\n"
"sqdmulh v22.4s, v22.4s, v14.4s\n"
"sqdmulh v23.4s, v23.4s, v15.4s\n"
"sqdmulh v24.4s, v24.4s, v14.4s\n"
"sqdmulh v25.4s, v25.4s, v15.4s\n"
"sqdmulh v26.4s, v26.4s, v14.4s\n"
"sqdmulh v27.4s, v27.4s, v15.4s\n"
"sqdmulh v28.4s, v28.4s, v14.4s\n"
"sqdmulh v29.4s, v29.4s, v15.4s\n"
"sqdmulh v30.4s, v30.4s, v14.4s\n"
"sqdmulh v31.4s, v31.4s, v15.4s\n"
"srshl v16.4s, v16.4s, v11.4s\n"
"srshl v17.4s, v17.4s, v12.4s\n"
"srshl v18.4s, v18.4s, v11.4s\n"
"srshl v19.4s, v19.4s, v12.4s\n"
"srshl v20.4s, v20.4s, v11.4s\n"
"srshl v21.4s, v21.4s, v12.4s\n"
"srshl v22.4s, v22.4s, v11.4s\n"
"srshl v23.4s, v23.4s, v12.4s\n"
"srshl v24.4s, v24.4s, v11.4s\n"
"srshl v25.4s, v25.4s, v12.4s\n"
"srshl v26.4s, v26.4s, v11.4s\n"
"srshl v27.4s, v27.4s, v12.4s\n"
"srshl v28.4s, v28.4s, v11.4s\n"
"srshl v29.4s, v29.4s, v12.4s\n"
"srshl v30.4s, v30.4s, v11.4s\n"
"srshl v31.4s, v31.4s, v12.4s\n"
"b 9f\n"
"8:\n"
"dup v4.4s, v9.s[0]\n"
"dup v5.4s, v9.s[1]\n"
"dup v6.4s, v9.s[2]\n"
"dup v7.4s, v9.s[3]\n"
"sshl v16.4s, v16.4s, v4.4s\n"
"sshl v17.4s, v17.4s, v4.4s\n"
"sshl v18.4s, v18.4s, v5.4s\n"
"sshl v19.4s, v19.4s, v5.4s\n"
"sshl v20.4s, v20.4s, v6.4s\n"
"sshl v21.4s, v21.4s, v6.4s\n"
"sshl v22.4s, v22.4s, v7.4s\n"
"sshl v23.4s, v23.4s, v7.4s\n"
"dup v4.4s, v10.s[0]\n"
"dup v5.4s, v10.s[1]\n"
"dup v6.4s, v10.s[2]\n"
"dup v7.4s, v10.s[3]\n"
"sshl v24.4s, v24.4s, v4.4s\n"
"sshl v25.4s, v25.4s, v4.4s\n"
"sshl v26.4s, v26.4s, v5.4s\n"
"sshl v27.4s, v27.4s, v5.4s\n"
"sshl v28.4s, v28.4s, v6.4s\n"
"sshl v29.4s, v29.4s, v6.4s\n"
"sshl v30.4s, v30.4s, v7.4s\n"
"sshl v31.4s, v31.4s, v7.4s\n"
"11:\n"
"sqdmulh v16.4s, v16.4s, v14.s[0]\n"
"sqdmulh v17.4s, v17.4s, v14.s[0]\n"
"sqdmulh v18.4s, v18.4s, v14.s[1]\n"
"sqdmulh v19.4s, v19.4s, v14.s[1]\n"
"sqdmulh v20.4s, v20.4s, v14.s[2]\n"
"sqdmulh v21.4s, v21.4s, v14.s[2]\n"
"sqdmulh v22.4s, v22.4s, v14.s[3]\n"
"sqdmulh v23.4s, v23.4s, v14.s[3]\n"
"sqdmulh v24.4s, v24.4s, v15.s[0]\n"
"sqdmulh v25.4s, v25.4s, v15.s[0]\n"
"sqdmulh v26.4s, v26.4s, v15.s[1]\n"
"sqdmulh v27.4s, v27.4s, v15.s[1]\n"
"sqdmulh v28.4s, v28.4s, v15.s[2]\n"
"sqdmulh v29.4s, v29.4s, v15.s[2]\n"
"sqdmulh v30.4s, v30.4s, v15.s[3]\n"
"sqdmulh v31.4s, v31.4s, v15.s[3]\n"
"dup v4.4s, v11.s[0]\n"
"dup v5.4s, v11.s[1]\n"
"dup v6.4s, v11.s[2]\n"
"dup v7.4s, v11.s[3]\n"
"srshl v16.4s, v16.4s, v4.4s\n"
"srshl v17.4s, v17.4s, v4.4s\n"
"srshl v18.4s, v18.4s, v5.4s\n"
"srshl v19.4s, v19.4s, v5.4s\n"
"srshl v20.4s, v20.4s, v6.4s\n"
"srshl v21.4s, v21.4s, v6.4s\n"
"srshl v22.4s, v22.4s, v7.4s\n"
"srshl v23.4s, v23.4s, v7.4s\n"
"dup v4.4s, v12.s[0]\n"
"dup v5.4s, v12.s[1]\n"
"dup v6.4s, v12.s[2]\n"
"dup v7.4s, v12.s[3]\n"
"srshl v24.4s, v24.4s, v4.4s\n"
"srshl v25.4s, v25.4s, v4.4s\n"
"srshl v26.4s, v26.4s, v5.4s\n"
"srshl v27.4s, v27.4s, v5.4s\n"
"srshl v28.4s, v28.4s, v6.4s\n"
"srshl v29.4s, v29.4s, v6.4s\n"
"srshl v30.4s, v30.4s, v7.4s\n"
"srshl v31.4s, v31.4s, v7.4s\n"
"9:\n"
"ldr w4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
"ins v13.h[4], w4\n"
"cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT16) "\n"
"beq " RUY_STR(RUY_ASM_LABEL_STORE_INT16) "f\n"
"cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT8) "\n"
"beq " RUY_STR(RUY_ASM_LABEL_STORE_INT8) "f\n"
RUY_STR(RUY_ASM_LABEL_STORE_UINT8) ":\n"
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqxtn v17.4h, v18.4s\n"
"sqxtn2 v17.8h, v19.4s\n"
"sqxtn v18.4h, v20.4s\n"
"sqxtn2 v18.8h, v21.4s\n"
"sqxtn v19.4h, v22.4s\n"
"sqxtn2 v19.8h, v23.4s\n"
"sqxtn v20.4h, v24.4s\n"
"sqxtn2 v20.8h, v25.4s\n"
"sqxtn v21.4h, v26.4s\n"
"sqxtn2 v21.8h, v27.4s\n"
"sqxtn v22.4h, v28.4s\n"
"sqxtn2 v22.8h, v29.4s\n"
"sqxtn v23.4h, v30.4s\n"
"sqxtn2 v23.8h, v31.4s\n"
RUY_MAKE_ZERO(v24)
RUY_MAKE_ZERO(v25)
RUY_MAKE_ZERO(v26)
RUY_MAKE_ZERO(v27)
RUY_MAKE_ZERO(v28)
RUY_MAKE_ZERO(v29)
RUY_MAKE_ZERO(v30)
RUY_MAKE_ZERO(v31)
"dup v14.8h, v13.h[4]\n"
"sqadd v16.8h, v16.8h, v14.8h\n"
"sqadd v17.8h, v17.8h, v14.8h\n"
"sqadd v18.8h, v18.8h, v14.8h\n"
"sqadd v19.8h, v19.8h, v14.8h\n"
"sqadd v20.8h, v20.8h, v14.8h\n"
"sqadd v21.8h, v21.8h, v14.8h\n"
"sqadd v22.8h, v22.8h, v14.8h\n"
"sqadd v23.8h, v23.8h, v14.8h\n"
"sqxtun v16.8b, v16.8h\n"
"sqxtun2 v16.16b, v17.8h\n"
"sqxtun v17.8b, v18.8h\n"
"sqxtun2 v17.16b, v19.8h\n"
"sqxtun v18.8b, v20.8h\n"
"sqxtun2 v18.16b, v21.8h\n"
"sqxtun v19.8b, v22.8h\n"
"sqxtun2 v19.16b, v23.8h\n"
"ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
"ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
"dup v14.16b, w2\n"
"dup v15.16b, w3\n"
"umax v16.16b, v16.16b, v14.16b\n"
"umax v17.16b, v17.16b, v14.16b\n"
"umax v18.16b, v18.16b, v14.16b\n"
"umax v19.16b, v19.16b, v14.16b\n"
"umin v16.16b, v16.16b, v15.16b\n"
"umin v17.16b, v17.16b, v15.16b\n"
"umin v18.16b, v18.16b, v15.16b\n"
"umin v19.16b, v19.16b, v15.16b\n"
"dup d20, v16.d[1]\n"
"dup d21, v17.d[1]\n"
"dup d22, v18.d[1]\n"
"dup d23, v19.d[1]\n"
"sub w1, %w[dst_rows], %w[row]\n"
"sub w2, %w[dst_cols], %w[col]\n"
"mov w3, #8\n"
"cmp w1, #8\n"
"csel w1, w1, w3, le\n"
"cmp w2, #8\n"
"csel w2, w2, w3, le\n"
"cmp w1, w3\n"
"ccmp w2, w3, 0, eq\n"
"beq 30f\n"
"mov x3, %[dst_tmp_buf]\n"
"mov x4, #8\n"
"b 31f\n"
"30:\n"
"mov x3, %[dst_ptr]\n"
"mov x4, x11\n"
"31:\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v16.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v16)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v20.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v20)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v17.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v17)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v21.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v21)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v18.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v18)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v22.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v22)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v19.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v19)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v23.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v23)
".word 0x4f82e010 // sdot v16.4s, v0.16b, v2.4b[0]\n"
".word 0x4fa2e012 // sdot v18.4s, v0.16b, v2.4b[1]\n"
".word 0x4f82e814 // sdot v20.4s, v0.16b, v2.4b[2]\n"
".word 0x4fa2e816 // sdot v22.4s, v0.16b, v2.4b[3]\n"
"beq 41f\n"
"mov x3, %[dst_tmp_buf]\n"
"mov x4, %[dst_ptr]\n"
"mov w6, #0\n"
"50:\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov w5, #0\n"
"51:\n"
"ldrb w7, [x3, w5, uxtw]\n"
"strb w7, [x4, w5, uxtw]\n"
"add w5, w5, #1\n"
"cmp w5, w1\n"
"blt 51b\n"
"add w6, w6, #1\n"
"add x3, x3, #8\n"
"add x4, x4, x11\n"
"cmp w6, w2\n"
"blt 50b\n"
"41:\n"
"add %[dst_ptr], %[dst_ptr], #8\n"
"b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
RUY_STR(RUY_ASM_LABEL_STORE_INT8) ":\n"
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqxtn v17.4h, v18.4s\n"
"sqxtn2 v17.8h, v19.4s\n"
"sqxtn v18.4h, v20.4s\n"
"sqxtn2 v18.8h, v21.4s\n"
"sqxtn v19.4h, v22.4s\n"
"sqxtn2 v19.8h, v23.4s\n"
"sqxtn v20.4h, v24.4s\n"
"sqxtn2 v20.8h, v25.4s\n"
"sqxtn v21.4h, v26.4s\n"
"sqxtn2 v21.8h, v27.4s\n"
"sqxtn v22.4h, v28.4s\n"
"sqxtn2 v22.8h, v29.4s\n"
"sqxtn v23.4h, v30.4s\n"
"sqxtn2 v23.8h, v31.4s\n"
RUY_MAKE_ZERO(v24)
RUY_MAKE_ZERO(v25)
RUY_MAKE_ZERO(v26)
RUY_MAKE_ZERO(v27)
RUY_MAKE_ZERO(v28)
RUY_MAKE_ZERO(v29)
RUY_MAKE_ZERO(v30)
RUY_MAKE_ZERO(v31)
"dup v14.8h, v13.h[4]\n"
"sqadd v16.8h, v16.8h, v14.8h\n"
"sqadd v17.8h, v17.8h, v14.8h\n"
"sqadd v18.8h, v18.8h, v14.8h\n"
"sqadd v19.8h, v19.8h, v14.8h\n"
"sqadd v20.8h, v20.8h, v14.8h\n"
"sqadd v21.8h, v21.8h, v14.8h\n"
"sqadd v22.8h, v22.8h, v14.8h\n"
"sqadd v23.8h, v23.8h, v14.8h\n"
"sqxtn v16.8b, v16.8h\n"
"sqxtn2 v16.16b, v17.8h\n"
"sqxtn v17.8b, v18.8h\n"
"sqxtn2 v17.16b, v19.8h\n"
"sqxtn v18.8b, v20.8h\n"
"sqxtn2 v18.16b, v21.8h\n"
"sqxtn v19.8b, v22.8h\n"
"sqxtn2 v19.16b, v23.8h\n"
"ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
"ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
"dup v14.16b, w2\n"
"dup v15.16b, w3\n"
"smax v16.16b, v16.16b, v14.16b\n"
"smax v17.16b, v17.16b, v14.16b\n"
"smax v18.16b, v18.16b, v14.16b\n"
"smax v19.16b, v19.16b, v14.16b\n"
"smin v16.16b, v16.16b, v15.16b\n"
"smin v17.16b, v17.16b, v15.16b\n"
"smin v18.16b, v18.16b, v15.16b\n"
"smin v19.16b, v19.16b, v15.16b\n"
"dup d20, v16.d[1]\n"
"dup d21, v17.d[1]\n"
"dup d22, v18.d[1]\n"
"dup d23, v19.d[1]\n"
"sub w1, %w[dst_rows], %w[row]\n"
"sub w2, %w[dst_cols], %w[col]\n"
"mov w3, #8\n"
"cmp w1, #8\n"
"csel w1, w1, w3, le\n"
"cmp w2, #8\n"
"csel w2, w2, w3, le\n"
"cmp w1, w3\n"
"ccmp w2, w3, 0, eq\n"
"beq 130f\n"
"mov x3, %[dst_tmp_buf]\n"
"mov x4, #8\n"
"b 131f\n"
"130:\n"
"mov x3, %[dst_ptr]\n"
"mov x4, x11\n"
"131:\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v16.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v16)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v20.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v20)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v17.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v17)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v21.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v21)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v18.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v18)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v22.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v22)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v19.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v19)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v23.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v23)
".word 0x4f82e010 // sdot v16.4s, v0.16b, v2.4b[0]\n"
".word 0x4fa2e012 // sdot v18.4s, v0.16b, v2.4b[1]\n"
".word 0x4f82e814 // sdot v20.4s, v0.16b, v2.4b[2]\n"
".word 0x4fa2e816 // sdot v22.4s, v0.16b, v2.4b[3]\n"
"beq 141f\n"
"mov x3, %[dst_tmp_buf]\n"
"mov x4, %[dst_ptr]\n"
"mov w6, #0\n"
"150:\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov w5, #0\n"
"151:\n"
"ldrb w7, [x3, w5, uxtw]\n"
"strb w7, [x4, w5, uxtw]\n"
"add w5, w5, #1\n"
"cmp w5, w1\n"
"blt 151b\n"
"add w6, w6, #1\n"
"add x3, x3, #8\n"
"add x4, x4, x11\n"
"cmp w6, w2\n"
"blt 150b\n"
"141:\n"
"add %[dst_ptr], %[dst_ptr], #8\n"
"b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
RUY_STR(RUY_ASM_LABEL_STORE_INT16) ":\n"
"dup v14.8h, v13.h[4]\n"
"saddw v16.4s, v16.4s, v14.4h\n"
"saddw v17.4s, v17.4s, v14.4h\n"
"saddw v18.4s, v18.4s, v14.4h\n"
"saddw v19.4s, v19.4s, v14.4h\n"
"saddw v20.4s, v20.4s, v14.4h\n"
"saddw v21.4s, v21.4s, v14.4h\n"
"saddw v22.4s, v22.4s, v14.4h\n"
"saddw v23.4s, v23.4s, v14.4h\n"
"saddw v24.4s, v24.4s, v14.4h\n"
"saddw v25.4s, v25.4s, v14.4h\n"
"saddw v26.4s, v26.4s, v14.4h\n"
"saddw v27.4s, v27.4s, v14.4h\n"
"saddw v28.4s, v28.4s, v14.4h\n"
"saddw v29.4s, v29.4s, v14.4h\n"
"saddw v30.4s, v30.4s, v14.4h\n"
"saddw v31.4s, v31.4s, v14.4h\n"
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqxtn v17.4h, v18.4s\n"
"sqxtn2 v17.8h, v19.4s\n"
"sqxtn v18.4h, v20.4s\n"
"sqxtn2 v18.8h, v21.4s\n"
"sqxtn v19.4h, v22.4s\n"
"sqxtn2 v19.8h, v23.4s\n"
"sqxtn v20.4h, v24.4s\n"
"sqxtn2 v20.8h, v25.4s\n"
"sqxtn v21.4h, v26.4s\n"
"sqxtn2 v21.8h, v27.4s\n"
"sqxtn v22.4h, v28.4s\n"
"sqxtn2 v22.8h, v29.4s\n"
"sqxtn v23.4h, v30.4s\n"
"sqxtn2 v23.8h, v31.4s\n"
RUY_MAKE_ZERO(v24)
RUY_MAKE_ZERO(v25)
RUY_MAKE_ZERO(v26)
RUY_MAKE_ZERO(v27)
RUY_MAKE_ZERO(v28)
RUY_MAKE_ZERO(v29)
RUY_MAKE_ZERO(v30)
RUY_MAKE_ZERO(v31)
"ldrsh w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
"ldrsh w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
"dup v14.8h, w2\n"
"dup v15.8h, w3\n"
"smax v16.8h, v16.8h, v14.8h\n"
"smax v17.8h, v17.8h, v14.8h\n"
"smax v18.8h, v18.8h, v14.8h\n"
"smax v19.8h, v19.8h, v14.8h\n"
"smax v20.8h, v20.8h, v14.8h\n"
"smax v21.8h, v21.8h, v14.8h\n"
"smax v22.8h, v22.8h, v14.8h\n"
"smax v23.8h, v23.8h, v14.8h\n"
"smin v16.8h, v16.8h, v15.8h\n"
"smin v17.8h, v17.8h, v15.8h\n"
"smin v18.8h, v18.8h, v15.8h\n"
"smin v19.8h, v19.8h, v15.8h\n"
"smin v20.8h, v20.8h, v15.8h\n"
"smin v21.8h, v21.8h, v15.8h\n"
"smin v22.8h, v22.8h, v15.8h\n"
"smin v23.8h, v23.8h, v15.8h\n"
"sub w1, %w[dst_rows], %w[row]\n"
"sub w2, %w[dst_cols], %w[col]\n"
"mov w3, #8\n"
"cmp w1, #8\n"
"csel w1, w1, w3, le\n"
"cmp w2, #8\n"
"csel w2, w2, w3, le\n"
"cmp w1, w3\n"
"ccmp w2, w3, 0, eq\n"
"beq 230f\n"
"mov x3, %[dst_tmp_buf]\n"
"mov x4, #16\n"
"b 231f\n"
"230:\n"
"mov x3, %[dst_ptr]\n"
"mov x4, x11\n"
"231:\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v16.8h}, [x3], x4\n"
RUY_MAKE_ZERO(v16)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v17.8h}, [x3], x4\n"
RUY_MAKE_ZERO(v17)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v18.8h}, [x3], x4\n"
RUY_MAKE_ZERO(v18)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v19.8h}, [x3], x4\n"
RUY_MAKE_ZERO(v19)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v20.8h}, [x3], x4\n"
RUY_MAKE_ZERO(v20)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v21.8h}, [x3], x4\n"
RUY_MAKE_ZERO(v21)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v22.8h}, [x3], x4\n"
RUY_MAKE_ZERO(v22)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v23.8h}, [x3], x4\n"
RUY_MAKE_ZERO(v23)
".word 0x4f82e010 // sdot v16.4s, v0.16b, v2.4b[0]\n"
".word 0x4fa2e012 // sdot v18.4s, v0.16b, v2.4b[1]\n"
".word 0x4f82e814 // sdot v20.4s, v0.16b, v2.4b[2]\n"
".word 0x4fa2e816 // sdot v22.4s, v0.16b, v2.4b[3]\n"
"beq 241f\n"
"mov x3, %[dst_tmp_buf]\n"
"mov x4, %[dst_ptr]\n"
"mov w6, #0\n"
"250:\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov w5, #0\n"
"251:\n"
"ldrsh w7, [x3, x5, lsl #1]\n"
"strh w7, [x4, x5, lsl #1]\n"
"add w5, w5, #1\n"
"cmp w5, w1\n"
"blt 251b\n"
"add w6, w6, #1\n"
"add x3, x3, #16\n"
"add x4, x4, x11\n"
"cmp w6, w2\n"
"blt 250b\n"
"241:\n"
"add %[dst_ptr], %[dst_ptr], #16\n"
"b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
RUY_STR(RUY_ASM_LABEL_STORE_INT32) ":\n"
"sub w1, %w[dst_rows], %w[row]\n"
"sub w2, %w[dst_cols], %w[col]\n"
"mov w3, #8\n"
"cmp w1, #8\n"
"csel w1, w1, w3, le\n"
"cmp w2, #8\n"
"csel w2, w2, w3, le\n"
"cmp w1, w3\n"
"ccmp w2, w3, 0, eq\n"
"beq 330f\n"
"mov x3, %[dst_tmp_buf]\n"
"st1 {v16.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v16)
"st1 {v17.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v17)
"st1 {v18.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v18)
"st1 {v19.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v19)
"st1 {v20.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v20)
"st1 {v21.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v21)
"st1 {v22.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v22)
"st1 {v23.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v23)
"st1 {v24.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v24)
"st1 {v25.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v25)
"st1 {v26.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v26)
"st1 {v27.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v27)
"st1 {v28.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v28)
"st1 {v29.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v29)
"st1 {v30.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v30)
"st1 {v31.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v31)
"b 331f\n"
"330:\n"
"mov x4, %[dst_ptr]\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov x3, x4\n"
"st1 {v16.4s, v17.4s}, [x3], #32\n"
RUY_MAKE_ZERO(v16)
RUY_MAKE_ZERO(v17)
"add x4, x4, x11\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov x3, x4\n"
"st1 {v18.4s, v19.4s}, [x3], #32\n"
RUY_MAKE_ZERO(v18)
RUY_MAKE_ZERO(v19)
"add x4, x4, x11\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov x3, x4\n"
"st1 {v20.4s, v21.4s}, [x3], #32\n"
RUY_MAKE_ZERO(v20)
RUY_MAKE_ZERO(v21)
"add x4, x4, x11\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov x3, x4\n"
"st1 {v22.4s, v23.4s}, [x3], #32\n"
RUY_MAKE_ZERO(v22)
RUY_MAKE_ZERO(v23)
"add x4, x4, x11\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov x3, x4\n"
"st1 {v24.4s, v25.4s}, [x3], #32\n"
RUY_MAKE_ZERO(v24)
RUY_MAKE_ZERO(v25)
"add x4, x4, x11\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov x3, x4\n"
"st1 {v26.4s, v27.4s}, [x3], #32\n"
RUY_MAKE_ZERO(v26)
RUY_MAKE_ZERO(v27)
"add x4, x4, x11\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov x3, x4\n"
"st1 {v28.4s, v29.4s}, [x3], #32\n"
RUY_MAKE_ZERO(v28)
RUY_MAKE_ZERO(v29)
"add x4, x4, x11\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov x3, x4\n"
"st1 {v30.4s, v31.4s}, [x3], #32\n"
RUY_MAKE_ZERO(v30)
RUY_MAKE_ZERO(v31)
"331:\n"
".word 0x4f82e010 // sdot v16.4s, v0.16b, v2.4b[0]\n"
".word 0x4fa2e012 // sdot v18.4s, v0.16b, v2.4b[1]\n"
".word 0x4f82e814 // sdot v20.4s, v0.16b, v2.4b[2]\n"
".word 0x4fa2e816 // sdot v22.4s, v0.16b, v2.4b[3]\n"
"beq 341f\n"
"mov x3, %[dst_tmp_buf]\n"
"mov x4, %[dst_ptr]\n"
"mov w6, #0\n"
"350:\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov w5, #0\n"
"351:\n"
"ldr w7, [x3, x5, lsl #2]\n"
"str w7, [x4, x5, lsl #2]\n"
"add w5, w5, #1\n"
"cmp w5, w1\n"
"blt 351b\n"
"add w6, w6, #1\n"
"add x3, x3, #32\n"
"add x4, x4, x11\n"
"cmp w6, w2\n"
"blt 350b\n"
"341:\n"
"add %[dst_ptr], %[dst_ptr], #32\n"
RUY_STR(RUY_ASM_LABEL_AFTER_STORE) ":\n"
"ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
"ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
"ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
"cmp %w[row], w7\n"
"beq 20f\n"
"add %w[row], %w[row], #8\n"
"b 21f\n"
"20:\n"
"mov %w[row], w6\n"
"add %w[col], %w[col], #8\n"
"add %[dst_col_ptr], %[dst_col_ptr], x11, lsl #3\n"
"mov %[dst_ptr], %[dst_col_ptr]\n"
"21:\n"
"cmp %w[col], w8\n"
"mov w1, #4\n"
"ble 1b\n"
: [ lhs_col_ptr ] "+r"(lhs_col_ptr), [rhs_col_ptr] "+r"(rhs_col_ptr),
[lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
[dst_col_ptr] "+r"(dst_col_ptr), [dst_ptr] "+r"(dst_ptr), [row] "+r"(row), [col] "+r"(col)
: [ params ] "r"(¶ms), [dst_rows] "r"(params.dst_rows),
[dst_cols] "r"(params.dst_cols), [dst_tmp_buf] "r"(params.dst_tmp_buf),
[dst_type_id] "r"(params.dst_type_id)
: "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc",
"memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
"v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
"v26", "v27", "v28", "v29", "v30", "v31");
}
// 8-bit GEMM kernel for KernelParams8bit<8, 8>, written as one inline-asm
// block that uses the sdot (signed 8-bit dot product) instruction, emitted as
// raw .word encodings.
//
// Each pass of the outer loop (label 1) produces one 8x8 destination block:
//   1. accumulate LHS*RHS along the depth dimension into the sixteen int32
//      accumulators v16..v31,
//   2. add bias + prod_zp_depth and subtract the zero-point correction terms
//      built from rhs_sums / lhs_sums,
//   3. either store the raw int32 accumulators, or rescale them with the
//      fixed-point multiplier and exponent, add the destination zero point,
//      clamp, and store as uint8 / int8 / int16 depending on dst_type_id.
// A block that does not fully fit inside the destination is first written to
// params.dst_tmp_buf and then copied element by element.
//
// Accumulator layout: v(16+2j) holds rows 0..3 and v(17+2j) holds rows 4..7
// of block column j (j = 0..7).
//
// NOTE(review): the "X1" suffix presumably denotes a CPU-specific tuning of
// the dotprod kernel; nothing visible in this function establishes that --
// confirm against the kernel dispatch code.
void Kernel8bitNeonDotprodX1(const KernelParams8bit<8, 8>& params) {
profiler::ScopeLabel label("Kernel (kNeonDotprod)");
// Static checks that the RUY_OFFSET_* constants used by the asm below match
// the real field offsets of the params struct.
CheckOffsetsInKernelParams8bit(params);
// The *_col_ptr variables mark the start of the current block's data; the
// plain *_ptr variables are advanced by the loads / stores inside the asm.
const std::int8_t* lhs_col_ptr = params.lhs_base_ptr;
const std::int8_t* rhs_col_ptr =
static_cast<const int8_t*>(params.rhs_base_ptr);
const std::int8_t* lhs_ptr = lhs_col_ptr;
const std::int8_t* rhs_ptr = rhs_col_ptr;
void* dst_col_ptr = params.dst_base_ptr;
void* dst_ptr = dst_col_ptr;
int row = params.start_row;
int col = params.start_col;
// General-purpose register usage inside the asm block:
//   x5  = lhs_base_ptr   w6  = start_row    w7  = last_row
//   w8  = last_col       w9  = lhs_stride   w10 = rhs_stride
//   w11 = dst_stride     w12 = depth        w1  = depth counter
// x1..x7 are also reused as scratch during the post-accumulation work, which
// is why x5/w6/w7 are reloaded at RUY_ASM_LABEL_AFTER_STORE.
asm volatile(
#define RUY_MAKE_ZERO …
// RUY_MAKE_ZERO(reg) is used below to clear a vector register; its definition
// is not visible in this excerpt.
// Load the loop-invariant parameters into their dedicated registers.
"ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
"ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
"ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
"ldr w8, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
"ldr w9, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
"ldr w10, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
"ldr w11, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
"ldr w12, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"
// Load the first 32 bytes of LHS (v0, v1) and RHS (v2, v3) data.
"ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
"ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
"ld1 {v2.16b}, [%[rhs_ptr]], #16\n"
"ld1 {v3.16b}, [%[rhs_ptr]], #16\n"
// Clear all sixteen accumulators.
RUY_MAKE_ZERO(v16)
RUY_MAKE_ZERO(v17)
RUY_MAKE_ZERO(v18)
RUY_MAKE_ZERO(v19)
RUY_MAKE_ZERO(v20)
RUY_MAKE_ZERO(v21)
RUY_MAKE_ZERO(v22)
RUY_MAKE_ZERO(v23)
RUY_MAKE_ZERO(v24)
RUY_MAKE_ZERO(v25)
RUY_MAKE_ZERO(v26)
RUY_MAKE_ZERO(v27)
RUY_MAKE_ZERO(v28)
RUY_MAKE_ZERO(v29)
RUY_MAKE_ZERO(v30)
RUY_MAKE_ZERO(v31)
// w1 counts the depth levels already loaded; every 32-byte load pair covers
// 4 levels of depth.
"mov w1, #4\n"
// The first four sdot of every depth step (v0 x v2 lanes) are issued ahead of
// the loop: here for the very first block, inside the loop body for later
// steps, and at the end of each store path for the following block.
".word 0x4f82e010 // sdot v16.4s, v0.16b, v2.4b[0]\n"
".word 0x4fa2e012 // sdot v18.4s, v0.16b, v2.4b[1]\n"
".word 0x4f82e814 // sdot v20.4s, v0.16b, v2.4b[2]\n"
".word 0x4fa2e816 // sdot v22.4s, v0.16b, v2.4b[3]\n"
// Outer loop entry: one iteration per 8x8 destination block.
"1:\n"
// If depth == 4 there is nothing left to load; go straight to the tail.
"cmp w1, w12\n"
"beq 79f\n"
// Inner loop over depth: finish the remaining 12 sdot of the current step
// while loading the operands of the next step, then issue the next step's
// first four sdot.
"2:\n"
".word 0x4f83e018 // sdot v24.4s, v0.16b, v3.4b[0]\n"
".word 0x4fa3e01a // sdot v26.4s, v0.16b, v3.4b[1]\n"
"add w1, w1, #4\n"
".word 0x4f83e81c // sdot v28.4s, v0.16b, v3.4b[2]\n"
".word 0x4fa3e81e // sdot v30.4s, v0.16b, v3.4b[3]\n"
"ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
".word 0x4f82e031 // sdot v17.4s, v1.16b, v2.4b[0]\n"
".word 0x4fa2e033 // sdot v19.4s, v1.16b, v2.4b[1]\n"
// This compare feeds the "blt 2b" at the bottom of the loop; nothing in
// between modifies the condition flags.
"cmp w1, w12\n"
".word 0x4f82e835 // sdot v21.4s, v1.16b, v2.4b[2]\n"
".word 0x4fa2e837 // sdot v23.4s, v1.16b, v2.4b[3]\n"
"ld1 {v2.16b}, [%[rhs_ptr]], #16\n"
".word 0x4f83e039 // sdot v25.4s, v1.16b, v3.4b[0]\n"
".word 0x4fa3e03b // sdot v27.4s, v1.16b, v3.4b[1]\n"
".word 0x4f83e83d // sdot v29.4s, v1.16b, v3.4b[2]\n"
".word 0x4fa3e83f // sdot v31.4s, v1.16b, v3.4b[3]\n"
"ld1 {v3.16b}, [%[rhs_ptr]], #16\n"
".word 0x4f82e010 // sdot v16.4s, v0.16b, v2.4b[0]\n"
".word 0x4fa2e012 // sdot v18.4s, v0.16b, v2.4b[1]\n"
".word 0x4f82e814 // sdot v20.4s, v0.16b, v2.4b[2]\n"
".word 0x4fa2e816 // sdot v22.4s, v0.16b, v2.4b[3]\n"
"ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
"blt 2b\n"
// Tail of the depth loop: the last 12 sdot of the final step, no more loads.
"79:\n"
".word 0x4f83e018 // sdot v24.4s, v0.16b, v3.4b[0]\n"
".word 0x4fa3e01a // sdot v26.4s, v0.16b, v3.4b[1]\n"
".word 0x4f83e81c // sdot v28.4s, v0.16b, v3.4b[2]\n"
".word 0x4fa3e81e // sdot v30.4s, v0.16b, v3.4b[3]\n"
".word 0x4f82e031 // sdot v17.4s, v1.16b, v2.4b[0]\n"
".word 0x4fa2e033 // sdot v19.4s, v1.16b, v2.4b[1]\n"
".word 0x4f82e835 // sdot v21.4s, v1.16b, v2.4b[2]\n"
".word 0x4fa2e837 // sdot v23.4s, v1.16b, v2.4b[3]\n"
".word 0x4f83e039 // sdot v25.4s, v1.16b, v3.4b[0]\n"
".word 0x4fa3e03b // sdot v27.4s, v1.16b, v3.4b[1]\n"
".word 0x4f83e83d // sdot v29.4s, v1.16b, v3.4b[2]\n"
".word 0x4fa3e83f // sdot v31.4s, v1.16b, v3.4b[3]\n"
// Move the LHS/RHS block pointers to the NEXT block so that its first loads
// can be issued before the current block has been stored. If rows remain in
// this column of blocks, advance the LHS by 8 strides; otherwise rewind the
// LHS to its base and, if columns remain, advance the RHS by 8 strides.
"cmp %w[row], w7\n"
"bge 4f\n"
"add %[lhs_col_ptr], %[lhs_col_ptr], x9, lsl #3\n"
"b 5f\n"
"4:\n"
"mov %[lhs_col_ptr], x5\n"
"cmp %w[col], w8\n"
"bge 5f\n"
"add %[rhs_col_ptr], %[rhs_col_ptr], x10, lsl #3\n"
"5:\n"
"mov %[lhs_ptr], %[lhs_col_ptr]\n"
"mov %[rhs_ptr], %[rhs_col_ptr]\n"
// v8 = all-ones (-1 in every int32 lane); used further down to split the
// multiplier exponent into its left-shift and right-shift parts.
"mvni v8.4s, #0\n"
"ldr w3, [%[params], #" RUY_STR(RUY_OFFSET_PROD_ZP_DEPTH) "]\n"
// From here on w6 holds the flags byte instead of start_row.
"ldrb w6, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
"dup v9.4s, w3\n"
"ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
// w3 = channel index of this block: row, or col when the channel dimension
// is the column dimension.
"tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
"csel w3, %w[row], %w[col], eq\n"
// Offset the bias pointer by the channel index only when HAS_BIAS is set.
// The 8 bias values are loaded either way.
// NOTE(review): without HAS_BIAS the pointer presumably refers to a buffer of
// zeros -- confirm where params.bias is set up.
"add x5, x1, x3, lsl #2\n"
"tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
"csel x1, x1, x5, eq\n"
"ld1 {v14.4s}, [x1], #16\n"
"ld1 {v15.4s}, [x1]\n"
// The accumulators still hold the current block, but v0..v3 are free now:
// load the first LHS/RHS data of the next block.
"ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
"ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
"ld1 {v2.16b}, [%[rhs_ptr]], #16\n"
"ld1 {v3.16b}, [%[rhs_ptr]], #16\n"
// Fold prod_zp_depth into the bias vector so both are added in one step.
"add v14.4s, v14.4s, v9.4s\n"
"add v15.4s, v15.4s, v9.4s\n"
"tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
"bne 6f\n"
// Channels are rows: v14 applies to rows 0..3 and v15 to rows 4..7 of every
// column.
"add v16.4s, v16.4s, v14.4s\n"
"add v17.4s, v17.4s, v15.4s\n"
"add v18.4s, v18.4s, v14.4s\n"
"add v19.4s, v19.4s, v15.4s\n"
"add v20.4s, v20.4s, v14.4s\n"
"add v21.4s, v21.4s, v15.4s\n"
"add v22.4s, v22.4s, v14.4s\n"
"add v23.4s, v23.4s, v15.4s\n"
"add v24.4s, v24.4s, v14.4s\n"
"add v25.4s, v25.4s, v15.4s\n"
"add v26.4s, v26.4s, v14.4s\n"
"add v27.4s, v27.4s, v15.4s\n"
"add v28.4s, v28.4s, v14.4s\n"
"add v29.4s, v29.4s, v15.4s\n"
"add v30.4s, v30.4s, v14.4s\n"
"add v31.4s, v31.4s, v15.4s\n"
"b 7f\n"
// Channels are columns: broadcast one bias lane per column and add it to both
// halves of that column.
"6:\n"
"dup v10.4s, v14.s[0]\n"
"dup v11.4s, v14.s[1]\n"
"dup v12.4s, v14.s[2]\n"
"dup v13.4s, v14.s[3]\n"
"add v16.4s, v16.4s, v10.4s\n"
"add v17.4s, v17.4s, v10.4s\n"
"add v18.4s, v18.4s, v11.4s\n"
"add v19.4s, v19.4s, v11.4s\n"
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
"add v22.4s, v22.4s, v13.4s\n"
"add v23.4s, v23.4s, v13.4s\n"
"dup v10.4s, v15.s[0]\n"
"dup v11.4s, v15.s[1]\n"
"dup v12.4s, v15.s[2]\n"
"dup v13.4s, v15.s[3]\n"
"add v24.4s, v24.4s, v10.4s\n"
"add v25.4s, v25.4s, v10.4s\n"
"add v26.4s, v26.4s, v11.4s\n"
"add v27.4s, v27.4s, v11.4s\n"
"add v28.4s, v28.4s, v12.4s\n"
"add v29.4s, v29.4s, v12.4s\n"
"add v30.4s, v30.4s, v13.4s\n"
"add v31.4s, v31.4s, v13.4s\n"
"7:\n"
// Optional correction: acc -= lhs_zero_point * rhs_sums[col + j], one
// rhs_sums entry per block column.
"tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_RHS_SUMS) "\n"
"beq 401f\n"
"ldr x3, [%[params], #" RUY_STR(RUY_OFFSET_RHS_SUMS) "]\n"
"add x3, x3, %x[col], lsl #2\n"
"ld1 {v14.4s}, [x3], #16\n"
"ld1 {v15.4s}, [x3]\n"
"ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_ZERO_POINT) "]\n"
"dup v10.4s, w5\n"
"mls v16.4s, v10.4s, v14.s[0]\n"
"mls v17.4s, v10.4s, v14.s[0]\n"
"mls v18.4s, v10.4s, v14.s[1]\n"
"mls v19.4s, v10.4s, v14.s[1]\n"
"mls v20.4s, v10.4s, v14.s[2]\n"
"mls v21.4s, v10.4s, v14.s[2]\n"
"mls v22.4s, v10.4s, v14.s[3]\n"
"mls v23.4s, v10.4s, v14.s[3]\n"
"mls v24.4s, v10.4s, v15.s[0]\n"
"mls v25.4s, v10.4s, v15.s[0]\n"
"mls v26.4s, v10.4s, v15.s[1]\n"
"mls v27.4s, v10.4s, v15.s[1]\n"
"mls v28.4s, v10.4s, v15.s[2]\n"
"mls v29.4s, v10.4s, v15.s[2]\n"
"mls v30.4s, v10.4s, v15.s[3]\n"
"mls v31.4s, v10.4s, v15.s[3]\n"
"401:\n"
// Optional correction: acc -= rhs_zero_point * lhs_sums[row + i], one
// lhs_sums entry per block row (v11 = rows 0..3, v12 = rows 4..7).
"tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_LHS_SUMS) "\n"
"beq 402f\n"
"ldr x2, [%[params], #" RUY_STR(RUY_OFFSET_LHS_SUMS) "]\n"
"add x2, x2, %x[row], lsl #2\n"
"ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_RHS_ZERO_POINT) "]\n"
"ld1 {v11.4s}, [x2], #16\n"
"ld1 {v12.4s}, [x2]\n"
// rhs_zero_point is parked in v13.s[1] so it can be used as a by-element
// multiplier.
"ins v13.s[1], w5\n"
"mul v11.4s, v11.4s, v13.s[1]\n"
"mul v12.4s, v12.4s, v13.s[1]\n"
"sub v16.4s, v16.4s, v11.4s\n"
"sub v17.4s, v17.4s, v12.4s\n"
"sub v18.4s, v18.4s, v11.4s\n"
"sub v19.4s, v19.4s, v12.4s\n"
"sub v20.4s, v20.4s, v11.4s\n"
"sub v21.4s, v21.4s, v12.4s\n"
"sub v22.4s, v22.4s, v11.4s\n"
"sub v23.4s, v23.4s, v12.4s\n"
"sub v24.4s, v24.4s, v11.4s\n"
"sub v25.4s, v25.4s, v12.4s\n"
"sub v26.4s, v26.4s, v11.4s\n"
"sub v27.4s, v27.4s, v12.4s\n"
"sub v28.4s, v28.4s, v11.4s\n"
"sub v29.4s, v29.4s, v12.4s\n"
"sub v30.4s, v30.4s, v11.4s\n"
"sub v31.4s, v31.4s, v12.4s\n"
// An int32 destination receives the raw accumulators: skip the rescaling.
// NOTE(review): this check sits before label 402, so it is bypassed whenever
// HAS_LHS_SUMS is clear ("beq 402f" above) -- confirm that the flag is always
// set when dst_type_id is int32.
"cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT32) "\n"
"beq " RUY_STR(RUY_ASM_LABEL_STORE_INT32) "f\n"
"402:\n"
// Rescaling. Load the 8 multiplier exponents (v9, v10); the pointer is offset
// by the channel index (w3) only when HAS_PERCHANNEL is set.
"ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_EXPONENT) "]\n"
"tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
"csel w3, %w[row], %w[col], eq\n"
// The flags set by this tst are consumed by the csel just below AND by the
// csel on x4 after the fixed-point pointer load; no instruction in between
// modifies them.
"tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
"add x5, x1, x3, lsl #2\n"
"csel x1, x1, x5, eq\n"
"ldr q9, [x1]\n"
"ldr q10, [x1, #16]\n"
// Split each exponent e: v11/v12 = min(e, -1) is the (negative) amount used
// by the rounding shift after the multiply, v9/v10 = e - min(e, -1) >= 0 is
// the left shift applied before the multiply.
"smin v11.4s, v8.4s, v9.4s\n"
"smin v12.4s, v8.4s, v10.4s\n"
"sub v9.4s, v9.4s, v11.4s\n"
"sub v10.4s, v10.4s, v12.4s\n"
// Load the 8 fixed-point multipliers (v14, v15), offset like the exponents.
"ldr x4, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_FIXEDPOINT) "]\n"
"add x5, x4, x3, lsl #2\n"
"csel x4, x4, x5, eq\n"
"ldr q14, [x4]\n"
"ldr q15, [x4, #16]\n"
"tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
"bne 8f\n"
// Channels are rows: the per-lane shift and multiplier vectors line up
// directly with the accumulator lanes.
"sshl v16.4s, v16.4s, v9.4s\n"
"sshl v17.4s, v17.4s, v10.4s\n"
"sshl v18.4s, v18.4s, v9.4s\n"
"sshl v19.4s, v19.4s, v10.4s\n"
"sshl v20.4s, v20.4s, v9.4s\n"
"sshl v21.4s, v21.4s, v10.4s\n"
"sshl v22.4s, v22.4s, v9.4s\n"
"sshl v23.4s, v23.4s, v10.4s\n"
"sshl v24.4s, v24.4s, v9.4s\n"
"sshl v25.4s, v25.4s, v10.4s\n"
"sshl v26.4s, v26.4s, v9.4s\n"
"sshl v27.4s, v27.4s, v10.4s\n"
"sshl v28.4s, v28.4s, v9.4s\n"
"sshl v29.4s, v29.4s, v10.4s\n"
"sshl v30.4s, v30.4s, v9.4s\n"
"sshl v31.4s, v31.4s, v10.4s\n"
"10:\n"
// Saturating doubling multiply (high half) by the fixed-point multiplier.
"sqdmulh v16.4s, v16.4s, v14.4s\n"
"sqdmulh v17.4s, v17.4s, v15.4s\n"
"sqdmulh v18.4s, v18.4s, v14.4s\n"
"sqdmulh v19.4s, v19.4s, v15.4s\n"
"sqdmulh v20.4s, v20.4s, v14.4s\n"
"sqdmulh v21.4s, v21.4s, v15.4s\n"
"sqdmulh v22.4s, v22.4s, v14.4s\n"
"sqdmulh v23.4s, v23.4s, v15.4s\n"
"sqdmulh v24.4s, v24.4s, v14.4s\n"
"sqdmulh v25.4s, v25.4s, v15.4s\n"
"sqdmulh v26.4s, v26.4s, v14.4s\n"
"sqdmulh v27.4s, v27.4s, v15.4s\n"
"sqdmulh v28.4s, v28.4s, v14.4s\n"
"sqdmulh v29.4s, v29.4s, v15.4s\n"
"sqdmulh v30.4s, v30.4s, v14.4s\n"
"sqdmulh v31.4s, v31.4s, v15.4s\n"
// Rounding shift by the negative part of the exponent (a right shift).
"srshl v16.4s, v16.4s, v11.4s\n"
"srshl v17.4s, v17.4s, v12.4s\n"
"srshl v18.4s, v18.4s, v11.4s\n"
"srshl v19.4s, v19.4s, v12.4s\n"
"srshl v20.4s, v20.4s, v11.4s\n"
"srshl v21.4s, v21.4s, v12.4s\n"
"srshl v22.4s, v22.4s, v11.4s\n"
"srshl v23.4s, v23.4s, v12.4s\n"
"srshl v24.4s, v24.4s, v11.4s\n"
"srshl v25.4s, v25.4s, v12.4s\n"
"srshl v26.4s, v26.4s, v11.4s\n"
"srshl v27.4s, v27.4s, v12.4s\n"
"srshl v28.4s, v28.4s, v11.4s\n"
"srshl v29.4s, v29.4s, v12.4s\n"
"srshl v30.4s, v30.4s, v11.4s\n"
"srshl v31.4s, v31.4s, v12.4s\n"
"b 9f\n"
// Channels are columns: the same three steps, but each column uses one lane
// of the shift / multiplier vectors, broadcast through v4..v7.
"8:\n"
"dup v4.4s, v9.s[0]\n"
"dup v5.4s, v9.s[1]\n"
"dup v6.4s, v9.s[2]\n"
"dup v7.4s, v9.s[3]\n"
"sshl v16.4s, v16.4s, v4.4s\n"
"sshl v17.4s, v17.4s, v4.4s\n"
"sshl v18.4s, v18.4s, v5.4s\n"
"sshl v19.4s, v19.4s, v5.4s\n"
"sshl v20.4s, v20.4s, v6.4s\n"
"sshl v21.4s, v21.4s, v6.4s\n"
"sshl v22.4s, v22.4s, v7.4s\n"
"sshl v23.4s, v23.4s, v7.4s\n"
"dup v4.4s, v10.s[0]\n"
"dup v5.4s, v10.s[1]\n"
"dup v6.4s, v10.s[2]\n"
"dup v7.4s, v10.s[3]\n"
"sshl v24.4s, v24.4s, v4.4s\n"
"sshl v25.4s, v25.4s, v4.4s\n"
"sshl v26.4s, v26.4s, v5.4s\n"
"sshl v27.4s, v27.4s, v5.4s\n"
"sshl v28.4s, v28.4s, v6.4s\n"
"sshl v29.4s, v29.4s, v6.4s\n"
"sshl v30.4s, v30.4s, v7.4s\n"
"sshl v31.4s, v31.4s, v7.4s\n"
"11:\n"
"sqdmulh v16.4s, v16.4s, v14.s[0]\n"
"sqdmulh v17.4s, v17.4s, v14.s[0]\n"
"sqdmulh v18.4s, v18.4s, v14.s[1]\n"
"sqdmulh v19.4s, v19.4s, v14.s[1]\n"
"sqdmulh v20.4s, v20.4s, v14.s[2]\n"
"sqdmulh v21.4s, v21.4s, v14.s[2]\n"
"sqdmulh v22.4s, v22.4s, v14.s[3]\n"
"sqdmulh v23.4s, v23.4s, v14.s[3]\n"
"sqdmulh v24.4s, v24.4s, v15.s[0]\n"
"sqdmulh v25.4s, v25.4s, v15.s[0]\n"
"sqdmulh v26.4s, v26.4s, v15.s[1]\n"
"sqdmulh v27.4s, v27.4s, v15.s[1]\n"
"sqdmulh v28.4s, v28.4s, v15.s[2]\n"
"sqdmulh v29.4s, v29.4s, v15.s[2]\n"
"sqdmulh v30.4s, v30.4s, v15.s[3]\n"
"sqdmulh v31.4s, v31.4s, v15.s[3]\n"
"dup v4.4s, v11.s[0]\n"
"dup v5.4s, v11.s[1]\n"
"dup v6.4s, v11.s[2]\n"
"dup v7.4s, v11.s[3]\n"
"srshl v16.4s, v16.4s, v4.4s\n"
"srshl v17.4s, v17.4s, v4.4s\n"
"srshl v18.4s, v18.4s, v5.4s\n"
"srshl v19.4s, v19.4s, v5.4s\n"
"srshl v20.4s, v20.4s, v6.4s\n"
"srshl v21.4s, v21.4s, v6.4s\n"
"srshl v22.4s, v22.4s, v7.4s\n"
"srshl v23.4s, v23.4s, v7.4s\n"
"dup v4.4s, v12.s[0]\n"
"dup v5.4s, v12.s[1]\n"
"dup v6.4s, v12.s[2]\n"
"dup v7.4s, v12.s[3]\n"
"srshl v24.4s, v24.4s, v4.4s\n"
"srshl v25.4s, v25.4s, v4.4s\n"
"srshl v26.4s, v26.4s, v5.4s\n"
"srshl v27.4s, v27.4s, v5.4s\n"
"srshl v28.4s, v28.4s, v6.4s\n"
"srshl v29.4s, v29.4s, v6.4s\n"
"srshl v30.4s, v30.4s, v7.4s\n"
"srshl v31.4s, v31.4s, v7.4s\n"
"9:\n"
// Park the destination zero point in v13.h[4] and dispatch on the destination
// type; uint8 is the fall-through case.
"ldr w4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
"ins v13.h[4], w4\n"
"cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT16) "\n"
"beq " RUY_STR(RUY_ASM_LABEL_STORE_INT16) "f\n"
"cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT8) "\n"
"beq " RUY_STR(RUY_ASM_LABEL_STORE_INT8) "f\n"
// ---- uint8 destination ----
RUY_STR(RUY_ASM_LABEL_STORE_UINT8) ":\n"
// Saturating narrow int32 -> int16; afterwards v(16+j) holds all 8 rows of
// block column j.
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqxtn v17.4h, v18.4s\n"
"sqxtn2 v17.8h, v19.4s\n"
"sqxtn v18.4h, v20.4s\n"
"sqxtn2 v18.8h, v21.4s\n"
"sqxtn v19.4h, v22.4s\n"
"sqxtn2 v19.8h, v23.4s\n"
"sqxtn v20.4h, v24.4s\n"
"sqxtn2 v20.8h, v25.4s\n"
"sqxtn v21.4h, v26.4s\n"
"sqxtn2 v21.8h, v27.4s\n"
"sqxtn v22.4h, v28.4s\n"
"sqxtn2 v22.8h, v29.4s\n"
"sqxtn v23.4h, v30.4s\n"
"sqxtn2 v23.8h, v31.4s\n"
// v24..v31 have been consumed; clear them for the next block.
RUY_MAKE_ZERO(v24)
RUY_MAKE_ZERO(v25)
RUY_MAKE_ZERO(v26)
RUY_MAKE_ZERO(v27)
RUY_MAKE_ZERO(v28)
RUY_MAKE_ZERO(v29)
RUY_MAKE_ZERO(v30)
RUY_MAKE_ZERO(v31)
// Add the destination zero point with int16 saturation.
"dup v14.8h, v13.h[4]\n"
"sqadd v16.8h, v16.8h, v14.8h\n"
"sqadd v17.8h, v17.8h, v14.8h\n"
"sqadd v18.8h, v18.8h, v14.8h\n"
"sqadd v19.8h, v19.8h, v14.8h\n"
"sqadd v20.8h, v20.8h, v14.8h\n"
"sqadd v21.8h, v21.8h, v14.8h\n"
"sqadd v22.8h, v22.8h, v14.8h\n"
"sqadd v23.8h, v23.8h, v14.8h\n"
// Saturating narrow int16 -> uint8; two block columns per register
// (v16 = columns 0,1; v17 = 2,3; v18 = 4,5; v19 = 6,7).
"sqxtun v16.8b, v16.8h\n"
"sqxtun2 v16.16b, v17.8h\n"
"sqxtun v17.8b, v18.8h\n"
"sqxtun2 v17.16b, v19.8h\n"
"sqxtun v18.8b, v20.8h\n"
"sqxtun2 v18.16b, v21.8h\n"
"sqxtun v19.8b, v22.8h\n"
"sqxtun2 v19.16b, v23.8h\n"
// Clamp to [clamp_min, clamp_max] as unsigned bytes.
"ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
"ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
"dup v14.16b, w2\n"
"dup v15.16b, w3\n"
"umax v16.16b, v16.16b, v14.16b\n"
"umax v17.16b, v17.16b, v14.16b\n"
"umax v18.16b, v18.16b, v14.16b\n"
"umax v19.16b, v19.16b, v14.16b\n"
"umin v16.16b, v16.16b, v15.16b\n"
"umin v17.16b, v17.16b, v15.16b\n"
"umin v18.16b, v18.16b, v15.16b\n"
"umin v19.16b, v19.16b, v15.16b\n"
// Move the odd columns (upper halves) into the low halves of v20..v23 so that
// every column can be written with an 8-byte store.
"dup d20, v16.d[1]\n"
"dup d21, v17.d[1]\n"
"dup d22, v18.d[1]\n"
"dup d23, v19.d[1]\n"
// w1 = min(dst_rows - row, 8), w2 = min(dst_cols - col, 8): the part of the
// block that lies inside the destination.
"sub w1, %w[dst_rows], %w[row]\n"
"sub w2, %w[dst_cols], %w[col]\n"
"mov w3, #8\n"
"cmp w1, #8\n"
"csel w1, w1, w3, le\n"
"cmp w2, #8\n"
"csel w2, w2, w3, le\n"
// eq <=> the whole 8x8 block fits. These flags stay live until the "beq 41f"
// after the stores; nothing in between modifies them.
"cmp w1, w3\n"
"ccmp w2, w3, 0, eq\n"
"beq 30f\n"
// Partial block: write to dst_tmp_buf with an 8-byte column stride.
"mov x3, %[dst_tmp_buf]\n"
"mov x4, #8\n"
"b 31f\n"
"30:\n"
// Full block: write straight to the destination using its stride.
"mov x3, %[dst_ptr]\n"
"mov x4, x11\n"
"31:\n"
// Store one column per st1, clearing each register once it has been written.
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v16.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v16)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v20.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v20)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v17.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v17)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v21.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v21)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v18.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v18)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v22.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v22)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v19.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v19)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v23.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v23)
// Start accumulating the next block with the operands loaded earlier.
".word 0x4f82e010 // sdot v16.4s, v0.16b, v2.4b[0]\n"
".word 0x4fa2e012 // sdot v18.4s, v0.16b, v2.4b[1]\n"
".word 0x4f82e814 // sdot v20.4s, v0.16b, v2.4b[2]\n"
".word 0x4fa2e816 // sdot v22.4s, v0.16b, v2.4b[3]\n"
// Full block already stored: skip the copy.
"beq 41f\n"
// Partial block: copy w1 rows x w2 columns from the temporary buffer to the
// destination, one byte at a time (w6 = column index, w5 = row index).
"mov x3, %[dst_tmp_buf]\n"
"mov x4, %[dst_ptr]\n"
"mov w6, #0\n"
"50:\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov w5, #0\n"
"51:\n"
"ldrb w7, [x3, w5, uxtw]\n"
"strb w7, [x4, w5, uxtw]\n"
"add w5, w5, #1\n"
"cmp w5, w1\n"
"blt 51b\n"
"add w6, w6, #1\n"
"add x3, x3, #8\n"
"add x4, x4, x11\n"
"cmp w6, w2\n"
"blt 50b\n"
"41:\n"
// Advance by 8 one-byte elements to the next row block.
"add %[dst_ptr], %[dst_ptr], #8\n"
"b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
// ---- int8 destination: same flow as uint8, with signed narrowing/clamping.
RUY_STR(RUY_ASM_LABEL_STORE_INT8) ":\n"
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqxtn v17.4h, v18.4s\n"
"sqxtn2 v17.8h, v19.4s\n"
"sqxtn v18.4h, v20.4s\n"
"sqxtn2 v18.8h, v21.4s\n"
"sqxtn v19.4h, v22.4s\n"
"sqxtn2 v19.8h, v23.4s\n"
"sqxtn v20.4h, v24.4s\n"
"sqxtn2 v20.8h, v25.4s\n"
"sqxtn v21.4h, v26.4s\n"
"sqxtn2 v21.8h, v27.4s\n"
"sqxtn v22.4h, v28.4s\n"
"sqxtn2 v22.8h, v29.4s\n"
"sqxtn v23.4h, v30.4s\n"
"sqxtn2 v23.8h, v31.4s\n"
RUY_MAKE_ZERO(v24)
RUY_MAKE_ZERO(v25)
RUY_MAKE_ZERO(v26)
RUY_MAKE_ZERO(v27)
RUY_MAKE_ZERO(v28)
RUY_MAKE_ZERO(v29)
RUY_MAKE_ZERO(v30)
RUY_MAKE_ZERO(v31)
// Add the destination zero point with int16 saturation.
"dup v14.8h, v13.h[4]\n"
"sqadd v16.8h, v16.8h, v14.8h\n"
"sqadd v17.8h, v17.8h, v14.8h\n"
"sqadd v18.8h, v18.8h, v14.8h\n"
"sqadd v19.8h, v19.8h, v14.8h\n"
"sqadd v20.8h, v20.8h, v14.8h\n"
"sqadd v21.8h, v21.8h, v14.8h\n"
"sqadd v22.8h, v22.8h, v14.8h\n"
"sqadd v23.8h, v23.8h, v14.8h\n"
// Saturating narrow int16 -> int8, two block columns per register.
"sqxtn v16.8b, v16.8h\n"
"sqxtn2 v16.16b, v17.8h\n"
"sqxtn v17.8b, v18.8h\n"
"sqxtn2 v17.16b, v19.8h\n"
"sqxtn v18.8b, v20.8h\n"
"sqxtn2 v18.16b, v21.8h\n"
"sqxtn v19.8b, v22.8h\n"
"sqxtn2 v19.16b, v23.8h\n"
// Clamp to [clamp_min, clamp_max] as signed bytes.
"ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
"ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
"dup v14.16b, w2\n"
"dup v15.16b, w3\n"
"smax v16.16b, v16.16b, v14.16b\n"
"smax v17.16b, v17.16b, v14.16b\n"
"smax v18.16b, v18.16b, v14.16b\n"
"smax v19.16b, v19.16b, v14.16b\n"
"smin v16.16b, v16.16b, v15.16b\n"
"smin v17.16b, v17.16b, v15.16b\n"
"smin v18.16b, v18.16b, v15.16b\n"
"smin v19.16b, v19.16b, v15.16b\n"
// Split the odd columns out into v20..v23 for 8-byte stores.
"dup d20, v16.d[1]\n"
"dup d21, v17.d[1]\n"
"dup d22, v18.d[1]\n"
"dup d23, v19.d[1]\n"
// w1 / w2 = rows / columns of this block inside the destination (max 8);
// eq after the ccmp <=> the full 8x8 block fits.
"sub w1, %w[dst_rows], %w[row]\n"
"sub w2, %w[dst_cols], %w[col]\n"
"mov w3, #8\n"
"cmp w1, #8\n"
"csel w1, w1, w3, le\n"
"cmp w2, #8\n"
"csel w2, w2, w3, le\n"
"cmp w1, w3\n"
"ccmp w2, w3, 0, eq\n"
"beq 130f\n"
// Partial block: temporary buffer, 8-byte column stride.
"mov x3, %[dst_tmp_buf]\n"
"mov x4, #8\n"
"b 131f\n"
"130:\n"
// Full block: destination, using its stride.
"mov x3, %[dst_ptr]\n"
"mov x4, x11\n"
"131:\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v16.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v16)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v20.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v20)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v17.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v17)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v21.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v21)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v18.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v18)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v22.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v22)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v19.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v19)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v23.8b}, [x3], x4\n"
RUY_MAKE_ZERO(v23)
// Start accumulating the next block with the operands loaded earlier.
".word 0x4f82e010 // sdot v16.4s, v0.16b, v2.4b[0]\n"
".word 0x4fa2e012 // sdot v18.4s, v0.16b, v2.4b[1]\n"
".word 0x4f82e814 // sdot v20.4s, v0.16b, v2.4b[2]\n"
".word 0x4fa2e816 // sdot v22.4s, v0.16b, v2.4b[3]\n"
// Flags still come from the ccmp above: skip the copy for a full block.
"beq 141f\n"
// Partial block: byte-wise copy of w1 rows x w2 columns.
"mov x3, %[dst_tmp_buf]\n"
"mov x4, %[dst_ptr]\n"
"mov w6, #0\n"
"150:\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov w5, #0\n"
"151:\n"
"ldrb w7, [x3, w5, uxtw]\n"
"strb w7, [x4, w5, uxtw]\n"
"add w5, w5, #1\n"
"cmp w5, w1\n"
"blt 151b\n"
"add w6, w6, #1\n"
"add x3, x3, #8\n"
"add x4, x4, x11\n"
"cmp w6, w2\n"
"blt 150b\n"
"141:\n"
// Advance by 8 one-byte elements to the next row block.
"add %[dst_ptr], %[dst_ptr], #8\n"
"b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
// ---- int16 destination ----
RUY_STR(RUY_ASM_LABEL_STORE_INT16) ":\n"
// Add the destination zero point in int32 (widening add of its int16 lanes)
// before narrowing.
"dup v14.8h, v13.h[4]\n"
"saddw v16.4s, v16.4s, v14.4h\n"
"saddw v17.4s, v17.4s, v14.4h\n"
"saddw v18.4s, v18.4s, v14.4h\n"
"saddw v19.4s, v19.4s, v14.4h\n"
"saddw v20.4s, v20.4s, v14.4h\n"
"saddw v21.4s, v21.4s, v14.4h\n"
"saddw v22.4s, v22.4s, v14.4h\n"
"saddw v23.4s, v23.4s, v14.4h\n"
"saddw v24.4s, v24.4s, v14.4h\n"
"saddw v25.4s, v25.4s, v14.4h\n"
"saddw v26.4s, v26.4s, v14.4h\n"
"saddw v27.4s, v27.4s, v14.4h\n"
"saddw v28.4s, v28.4s, v14.4h\n"
"saddw v29.4s, v29.4s, v14.4h\n"
"saddw v30.4s, v30.4s, v14.4h\n"
"saddw v31.4s, v31.4s, v14.4h\n"
// Saturating narrow int32 -> int16; v(16+j) then holds block column j.
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"sqxtn v17.4h, v18.4s\n"
"sqxtn2 v17.8h, v19.4s\n"
"sqxtn v18.4h, v20.4s\n"
"sqxtn2 v18.8h, v21.4s\n"
"sqxtn v19.4h, v22.4s\n"
"sqxtn2 v19.8h, v23.4s\n"
"sqxtn v20.4h, v24.4s\n"
"sqxtn2 v20.8h, v25.4s\n"
"sqxtn v21.4h, v26.4s\n"
"sqxtn2 v21.8h, v27.4s\n"
"sqxtn v22.4h, v28.4s\n"
"sqxtn2 v22.8h, v29.4s\n"
"sqxtn v23.4h, v30.4s\n"
"sqxtn2 v23.8h, v31.4s\n"
RUY_MAKE_ZERO(v24)
RUY_MAKE_ZERO(v25)
RUY_MAKE_ZERO(v26)
RUY_MAKE_ZERO(v27)
RUY_MAKE_ZERO(v28)
RUY_MAKE_ZERO(v29)
RUY_MAKE_ZERO(v30)
RUY_MAKE_ZERO(v31)
// Clamp bounds are read as signed 16-bit values for this destination type.
"ldrsh w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
"ldrsh w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
"dup v14.8h, w2\n"
"dup v15.8h, w3\n"
"smax v16.8h, v16.8h, v14.8h\n"
"smax v17.8h, v17.8h, v14.8h\n"
"smax v18.8h, v18.8h, v14.8h\n"
"smax v19.8h, v19.8h, v14.8h\n"
"smax v20.8h, v20.8h, v14.8h\n"
"smax v21.8h, v21.8h, v14.8h\n"
"smax v22.8h, v22.8h, v14.8h\n"
"smax v23.8h, v23.8h, v14.8h\n"
"smin v16.8h, v16.8h, v15.8h\n"
"smin v17.8h, v17.8h, v15.8h\n"
"smin v18.8h, v18.8h, v15.8h\n"
"smin v19.8h, v19.8h, v15.8h\n"
"smin v20.8h, v20.8h, v15.8h\n"
"smin v21.8h, v21.8h, v15.8h\n"
"smin v22.8h, v22.8h, v15.8h\n"
"smin v23.8h, v23.8h, v15.8h\n"
// w1 / w2 = rows / columns of this block inside the destination (max 8);
// eq after the ccmp <=> the full 8x8 block fits.
"sub w1, %w[dst_rows], %w[row]\n"
"sub w2, %w[dst_cols], %w[col]\n"
"mov w3, #8\n"
"cmp w1, #8\n"
"csel w1, w1, w3, le\n"
"cmp w2, #8\n"
"csel w2, w2, w3, le\n"
"cmp w1, w3\n"
"ccmp w2, w3, 0, eq\n"
"beq 230f\n"
// Partial block: temporary buffer, 16-byte column stride (8 x int16).
"mov x3, %[dst_tmp_buf]\n"
"mov x4, #16\n"
"b 231f\n"
"230:\n"
// Full block: destination, using its stride.
"mov x3, %[dst_ptr]\n"
"mov x4, x11\n"
"231:\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v16.8h}, [x3], x4\n"
RUY_MAKE_ZERO(v16)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v17.8h}, [x3], x4\n"
RUY_MAKE_ZERO(v17)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v18.8h}, [x3], x4\n"
RUY_MAKE_ZERO(v18)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v19.8h}, [x3], x4\n"
RUY_MAKE_ZERO(v19)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v20.8h}, [x3], x4\n"
RUY_MAKE_ZERO(v20)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v21.8h}, [x3], x4\n"
RUY_MAKE_ZERO(v21)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v22.8h}, [x3], x4\n"
RUY_MAKE_ZERO(v22)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v23.8h}, [x3], x4\n"
RUY_MAKE_ZERO(v23)
// Start accumulating the next block with the operands loaded earlier.
".word 0x4f82e010 // sdot v16.4s, v0.16b, v2.4b[0]\n"
".word 0x4fa2e012 // sdot v18.4s, v0.16b, v2.4b[1]\n"
".word 0x4f82e814 // sdot v20.4s, v0.16b, v2.4b[2]\n"
".word 0x4fa2e816 // sdot v22.4s, v0.16b, v2.4b[3]\n"
// Flags still come from the ccmp above: skip the copy for a full block.
"beq 241f\n"
// Partial block: copy w1 rows x w2 columns, one int16 at a time.
"mov x3, %[dst_tmp_buf]\n"
"mov x4, %[dst_ptr]\n"
"mov w6, #0\n"
"250:\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov w5, #0\n"
"251:\n"
"ldrsh w7, [x3, x5, lsl #1]\n"
"strh w7, [x4, x5, lsl #1]\n"
"add w5, w5, #1\n"
"cmp w5, w1\n"
"blt 251b\n"
"add w6, w6, #1\n"
"add x3, x3, #16\n"
"add x4, x4, x11\n"
"cmp w6, w2\n"
"blt 250b\n"
"241:\n"
// Advance by 8 two-byte elements to the next row block.
"add %[dst_ptr], %[dst_ptr], #16\n"
"b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
// ---- int32 destination: raw accumulators, no zero point and no clamping.
RUY_STR(RUY_ASM_LABEL_STORE_INT32) ":\n"
// w1 / w2 = rows / columns of this block inside the destination (max 8);
// eq after the ccmp <=> the full 8x8 block fits.
"sub w1, %w[dst_rows], %w[row]\n"
"sub w2, %w[dst_cols], %w[col]\n"
"mov w3, #8\n"
"cmp w1, #8\n"
"csel w1, w1, w3, le\n"
"cmp w2, #8\n"
"csel w2, w2, w3, le\n"
"cmp w1, w3\n"
"ccmp w2, w3, 0, eq\n"
"beq 330f\n"
// Partial block: dump all 16 accumulators contiguously into the temporary
// buffer (32 bytes per column).
"mov x3, %[dst_tmp_buf]\n"
"st1 {v16.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v16)
"st1 {v17.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v17)
"st1 {v18.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v18)
"st1 {v19.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v19)
"st1 {v20.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v20)
"st1 {v21.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v21)
"st1 {v22.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v22)
"st1 {v23.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v23)
"st1 {v24.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v24)
"st1 {v25.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v25)
"st1 {v26.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v26)
"st1 {v27.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v27)
"st1 {v28.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v28)
"st1 {v29.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v29)
"st1 {v30.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v30)
"st1 {v31.4s}, [x3], #16\n"
RUY_MAKE_ZERO(v31)
"b 331f\n"
"330:\n"
// Full block: store each column (a register pair, 32 bytes) directly to the
// destination; x4 tracks the start of the current destination column.
"mov x4, %[dst_ptr]\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov x3, x4\n"
"st1 {v16.4s, v17.4s}, [x3], #32\n"
RUY_MAKE_ZERO(v16)
RUY_MAKE_ZERO(v17)
"add x4, x4, x11\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov x3, x4\n"
"st1 {v18.4s, v19.4s}, [x3], #32\n"
RUY_MAKE_ZERO(v18)
RUY_MAKE_ZERO(v19)
"add x4, x4, x11\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov x3, x4\n"
"st1 {v20.4s, v21.4s}, [x3], #32\n"
RUY_MAKE_ZERO(v20)
RUY_MAKE_ZERO(v21)
"add x4, x4, x11\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov x3, x4\n"
"st1 {v22.4s, v23.4s}, [x3], #32\n"
RUY_MAKE_ZERO(v22)
RUY_MAKE_ZERO(v23)
"add x4, x4, x11\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov x3, x4\n"
"st1 {v24.4s, v25.4s}, [x3], #32\n"
RUY_MAKE_ZERO(v24)
RUY_MAKE_ZERO(v25)
"add x4, x4, x11\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov x3, x4\n"
"st1 {v26.4s, v27.4s}, [x3], #32\n"
RUY_MAKE_ZERO(v26)
RUY_MAKE_ZERO(v27)
"add x4, x4, x11\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov x3, x4\n"
"st1 {v28.4s, v29.4s}, [x3], #32\n"
RUY_MAKE_ZERO(v28)
RUY_MAKE_ZERO(v29)
"add x4, x4, x11\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov x3, x4\n"
"st1 {v30.4s, v31.4s}, [x3], #32\n"
RUY_MAKE_ZERO(v30)
RUY_MAKE_ZERO(v31)
"331:\n"
// Start accumulating the next block with the operands loaded earlier.
".word 0x4f82e010 // sdot v16.4s, v0.16b, v2.4b[0]\n"
".word 0x4fa2e012 // sdot v18.4s, v0.16b, v2.4b[1]\n"
".word 0x4f82e814 // sdot v20.4s, v0.16b, v2.4b[2]\n"
".word 0x4fa2e816 // sdot v22.4s, v0.16b, v2.4b[3]\n"
// Flags still come from the ccmp above: skip the copy for a full block.
"beq 341f\n"
// Partial block: copy w1 rows x w2 columns, one int32 at a time.
"mov x3, %[dst_tmp_buf]\n"
"mov x4, %[dst_ptr]\n"
"mov w6, #0\n"
"350:\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov w5, #0\n"
"351:\n"
"ldr w7, [x3, x5, lsl #2]\n"
"str w7, [x4, x5, lsl #2]\n"
"add w5, w5, #1\n"
"cmp w5, w1\n"
"blt 351b\n"
"add w6, w6, #1\n"
"add x3, x3, #32\n"
"add x4, x4, x11\n"
"cmp w6, w2\n"
"blt 350b\n"
"341:\n"
// Advance by 8 four-byte elements to the next row block.
"add %[dst_ptr], %[dst_ptr], #32\n"
// ---- common epilogue of every store path ----
RUY_STR(RUY_ASM_LABEL_AFTER_STORE) ":\n"
// x5, w6 and w7 were used as scratch above; restore their loop-level values.
"ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
"ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
"ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
// Advance to the next block: 8 rows down, or, after the last row block, back
// to start_row and 8 columns to the right.
"cmp %w[row], w7\n"
"beq 20f\n"
"add %w[row], %w[row], #8\n"
"b 21f\n"
"20:\n"
"mov %w[row], w6\n"
"add %w[col], %w[col], #8\n"
"add %[dst_col_ptr], %[dst_col_ptr], x11, lsl #3\n"
"mov %[dst_ptr], %[dst_col_ptr]\n"
"21:\n"
// Continue while col <= last_col. w1 is reset to 4 because the next block's
// first depth step has already been loaded and partially accumulated.
"cmp %w[col], w8\n"
"mov w1, #4\n"
"ble 1b\n"
// Read-write operands: the pointers and block coordinates updated by the asm.
: [ lhs_col_ptr ] "+r"(lhs_col_ptr), [rhs_col_ptr] "+r"(rhs_col_ptr),
[lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
[dst_col_ptr] "+r"(dst_col_ptr), [dst_ptr] "+r"(dst_ptr), [row] "+r"(row), [col] "+r"(col)
// Read-only operands. The asm addresses every field as [%[params], #offset],
// so the operand has to be the ADDRESS of the params struct.
: [ params ] "r"(&params), [dst_rows] "r"(params.dst_rows),
[dst_cols] "r"(params.dst_cols), [dst_tmp_buf] "r"(params.dst_tmp_buf),
[dst_type_id] "r"(params.dst_type_id)
// Clobbers: scratch general-purpose registers, condition flags, memory, and
// all vector registers.
: "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc",
"memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
"v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
"v26", "v27", "v28", "v29", "v30", "v31");
}
void Kernel8bitNeonDotprod1Col(const KernelParams8bit<8, 8>& params) {
profiler::ScopeLabel label("Kernel (kNeonDotprod)");
CheckOffsetsInKernelParams8bit(params);
const std::int8_t* lhs_col_ptr = params.lhs_base_ptr;
const std::int8_t* rhs_col_ptr =
static_cast<const int8_t*>(params.rhs_base_ptr);
const std::int8_t* lhs_ptr = lhs_col_ptr;
const std::int8_t* rhs_ptr = rhs_col_ptr;
void* dst_col_ptr = params.dst_base_ptr;
void* dst_ptr = dst_col_ptr;
int row = params.start_row;
int col = params.start_col;
RUY_DCHECK(!(params.flags & RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL));
asm volatile(
#define RUY_MAKE_ZERO …
"ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
"ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
"ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
"ldr w8, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
"ldr w9, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
"ldr w10, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
"ldr w11, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
"ldr w12, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"
"ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
"ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
"ld1 {v2.8b}, [%[rhs_ptr]]\n"
"add %[rhs_ptr], %[rhs_ptr], #32\n"
RUY_MAKE_ZERO(v16)
RUY_MAKE_ZERO(v17)
"mov w1, #4\n"
".word 0x4f82e010 // sdot v16.4s, v0.16b, v2.4b[0]\n"
"1:\n"
"cmp w1, w12\n"
"beq 79f\n"
"2:\n"
"add w1, w1, #4\n"
"ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
".word 0x4f82e031 // sdot v17.4s, v1.16b, v2.4b[0]\n"
"cmp w1, w12\n"
"ld1 {v2.8b}, [%[rhs_ptr]]\n"
"add %[rhs_ptr], %[rhs_ptr], #32\n"
".word 0x4f82e010 // sdot v16.4s, v0.16b, v2.4b[0]\n"
"ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
"blt 2b\n"
"79:\n"
".word 0x4f82e031 // sdot v17.4s, v1.16b, v2.4b[0]\n"
"cmp %w[row], w7\n"
"bge 4f\n"
"add %[lhs_col_ptr], %[lhs_col_ptr], x9, lsl #3\n"
"b 5f\n"
"4:\n"
"mov %[lhs_col_ptr], x5\n"
"cmp %w[col], w8\n"
"bge 5f\n"
"add %[rhs_col_ptr], %[rhs_col_ptr], x10, lsl #3\n"
"5:\n"
"mov %[lhs_ptr], %[lhs_col_ptr]\n"
"mov %[rhs_ptr], %[rhs_col_ptr]\n"
"mvni v8.4s, #0\n"
"ldr w4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
"ldr w3, [%[params], #" RUY_STR(RUY_OFFSET_PROD_ZP_DEPTH) "]\n"
"ins v13.h[4], w4\n"
"ldr x4, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_FIXEDPOINT) "]\n"
"ldrb w6, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
"dup v9.4s, w3\n"
"add x5, x4, %x[row], lsl #2\n"
"tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
"csel x4, x4, x5, eq\n"
"ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
"add x5, x1, %x[row], lsl #2\n"
"tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
"csel x1, x1, x5, eq\n"
"ld1 {v14.4s}, [x1], #16\n"
"ld1 {v15.4s}, [x1]\n"
"ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
"ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
"ld1 {v2.8b}, [%[rhs_ptr]]\n"
"add %[rhs_ptr], %[rhs_ptr], #32\n"
"add v14.4s, v14.4s, v9.4s\n"
"add v15.4s, v15.4s, v9.4s\n"
"add v16.4s, v16.4s, v14.4s\n"
"add v17.4s, v17.4s, v15.4s\n"
"tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_RHS_SUMS) "\n"
"beq 401f\n"
"ldr x3, [%[params], #" RUY_STR(RUY_OFFSET_RHS_SUMS) "]\n"
"add x3, x3, %x[col], lsl #2\n"
"ld1 {v14.4s}, [x3], #16\n"
"ld1 {v15.4s}, [x3]\n"
"ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_ZERO_POINT) "]\n"
"dup v10.4s, w5\n"
"mls v16.4s, v10.4s, v14.s[0]\n"
"mls v17.4s, v10.4s, v14.s[0]\n"
"401:\n"
"tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_LHS_SUMS) "\n"
"beq 402f\n"
"ldr x2, [%[params], #" RUY_STR(RUY_OFFSET_LHS_SUMS) "]\n"
"add x2, x2, %x[row], lsl #2\n"
"ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_RHS_ZERO_POINT) "]\n"
"ld1 {v11.4s}, [x2], #16\n"
"ld1 {v12.4s}, [x2]\n"
"ins v13.s[1], w5\n"
"mul v11.4s, v11.4s, v13.s[1]\n"
"mul v12.4s, v12.4s, v13.s[1]\n"
"sub v16.4s, v16.4s, v11.4s\n"
"sub v17.4s, v17.4s, v12.4s\n"
"cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT32) "\n"
"beq " RUY_STR(RUY_ASM_LABEL_STORE_INT32) "f\n"
"402:\n"
"ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_EXPONENT) "]\n"
"tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
"add x5, x1, %x[row], lsl #2\n"
"csel x1, x1, x5, eq\n"
"ldr q9, [x1]\n"
"ldr q10, [x1, #16]\n"
"smin v11.4s, v8.4s, v9.4s\n"
"smin v12.4s, v8.4s, v10.4s\n"
"sub v9.4s, v9.4s, v11.4s\n"
"sub v10.4s, v10.4s, v12.4s\n"
"sshl v16.4s, v16.4s, v9.4s\n"
"sshl v17.4s, v17.4s, v10.4s\n"
"403:\n"
"ldr q14, [x4]\n"
"ldr q15, [x4, #16]\n"
"sqdmulh v16.4s, v16.4s, v14.4s\n"
"sqdmulh v17.4s, v17.4s, v15.4s\n"
"srshl v16.4s, v16.4s, v11.4s\n"
"srshl v17.4s, v17.4s, v12.4s\n"
"cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT16) "\n"
"beq " RUY_STR(RUY_ASM_LABEL_STORE_INT16) "f\n"
"cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT8) "\n"
"beq " RUY_STR(RUY_ASM_LABEL_STORE_INT8) "f\n"
RUY_STR(RUY_ASM_LABEL_STORE_UINT8) ":\n"
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"dup v14.8h, v13.h[4]\n"
"sqadd v16.8h, v16.8h, v14.8h\n"
"sqxtun v16.8b, v16.8h\n"
"ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
"ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
"dup v14.16b, w2\n"
"dup v15.16b, w3\n"
"umax v16.16b, v16.16b, v14.16b\n"
"umin v16.16b, v16.16b, v15.16b\n"
"dup d20, v16.d[1]\n"
"sub w1, %w[dst_rows], %w[row]\n"
"sub w2, %w[dst_cols], %w[col]\n"
"mov w3, #8\n"
"cmp w1, #8\n"
"csel w1, w1, w3, le\n"
"cmp w2, #8\n"
"cmp w1, w3\n"
"beq 30f\n"
"mov x3, %[dst_tmp_buf]\n"
"mov x4, #8\n"
"b 31f\n"
"30:\n"
"mov x3, %[dst_ptr]\n"
"mov x4, x11\n"
"31:\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"st1 {v16.8b}, [x3]\n"
RUY_MAKE_ZERO(v16)
RUY_MAKE_ZERO(v17)
".word 0x4f82e010 // sdot v16.4s, v0.16b, v2.4b[0]\n"
"beq 41f\n"
"mov x3, %[dst_tmp_buf]\n"
"mov x4, %[dst_ptr]\n"
"mov w6, #0\n"
"50:\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov w5, #0\n"
"51:\n"
"ldrb w7, [x3, w5, uxtw]\n"
"strb w7, [x4, w5, uxtw]\n"
"add w5, w5, #1\n"
"cmp w5, w1\n"
"blt 51b\n"
"41:\n"
"add %[dst_ptr], %[dst_ptr], #8\n"
"b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
RUY_STR(RUY_ASM_LABEL_STORE_INT8) ":\n"
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"dup v14.8h, v13.h[4]\n"
"sqadd v16.8h, v16.8h, v14.8h\n"
"sqxtn v16.8b, v16.8h\n"
"ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
"ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
"dup v14.16b, w2\n"
"dup v15.16b, w3\n"
"smax v16.16b, v16.16b, v14.16b\n"
"smin v16.16b, v16.16b, v15.16b\n"
"dup d20, v16.d[1]\n"
"sub w1, %w[dst_rows], %w[row]\n"
"sub w2, %w[dst_cols], %w[col]\n"
"mov w3, #8\n"
"cmp w1, #8\n"
"csel w1, w1, w3, le\n"
"cmp w2, #8\n"
"cmp w1, w3\n"
"beq 130f\n"
"mov x3, %[dst_tmp_buf]\n"
"mov x4, #8\n"
"b 131f\n"
"130:\n"
"mov x3, %[dst_ptr]\n"
"mov x4, x11\n"
"131:\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"st1 {v16.8b}, [x3]\n"
RUY_MAKE_ZERO(v16)
RUY_MAKE_ZERO(v17)
".word 0x4f82e010 // sdot v16.4s, v0.16b, v2.4b[0]\n"
"beq 141f\n"
"mov x3, %[dst_tmp_buf]\n"
"mov x4, %[dst_ptr]\n"
"mov w6, #0\n"
"150:\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov w5, #0\n"
"151:\n"
"ldrb w7, [x3, w5, uxtw]\n"
"strb w7, [x4, w5, uxtw]\n"
"add w5, w5, #1\n"
"cmp w5, w1\n"
"blt 151b\n"
"141:\n"
"add %[dst_ptr], %[dst_ptr], #8\n"
"b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
RUY_STR(RUY_ASM_LABEL_STORE_INT16) ":\n"
"dup v14.8h, v13.h[4]\n"
"saddw v16.4s, v16.4s, v14.4h\n"
"saddw v17.4s, v17.4s, v14.4h\n"
"sqxtn v16.4h, v16.4s\n"
"sqxtn2 v16.8h, v17.4s\n"
"ldrsh w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
"ldrsh w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
"dup v14.8h, w2\n"
"dup v15.8h, w3\n"
"smax v16.8h, v16.8h, v14.8h\n"
"smin v16.8h, v16.8h, v15.8h\n"
"sub w1, %w[dst_rows], %w[row]\n"
"sub w2, %w[dst_cols], %w[col]\n"
"mov w3, #8\n"
"cmp w1, #8\n"
"csel w1, w1, w3, le\n"
"cmp w2, #8\n"
"cmp w1, w3\n"
"beq 230f\n"
"mov x3, %[dst_tmp_buf]\n"
"mov x4, #16\n"
"b 231f\n"
"230:\n"
"mov x3, %[dst_ptr]\n"
"mov x4, x11\n"
"231:\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"st1 {v16.8h}, [x3]\n"
RUY_MAKE_ZERO(v16)
RUY_MAKE_ZERO(v17)
".word 0x4f82e010 // sdot v16.4s, v0.16b, v2.4b[0]\n"
"beq 241f\n"
"mov x3, %[dst_tmp_buf]\n"
"mov x4, %[dst_ptr]\n"
"mov w6, #0\n"
"250:\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
"mov w5, #0\n"
"251:\n"
"ldrsh w7, [x3, x5, lsl #1]\n"
"strh w7, [x4, x5, lsl #1]\n"
"add w5, w5, #1\n"
"cmp w5, w1\n"
"blt 251b\n"
"241:\n"
"add %[dst_ptr], %[dst_ptr], #16\n"
"b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
RUY_STR(RUY_ASM_LABEL_STORE_INT32) ":\n"
"sub w1, %w[dst_rows], %w[row]\n"
"sub w2, %w[dst_cols], %w[col]\n"
"mov w3, #8\n"
"cmp w1, #8\n"
"csel w1, w1, w3, le\n"
"cmp w2, #8\n"
"csel w2, w2, w3, le\n"
"cmp w1, w3\n"
"beq 330f\n"
"mov x3, %[dst_tmp_buf]\n"
"mov x4, #16\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v16.4s}, [x3], x4\n"
RUY_MAKE_ZERO(v16)
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v17.4s}, [x3], x4\n"
RUY_MAKE_ZERO(v17)
"b 331f\n"
"330:\n"
"mov x4, %[dst_ptr]\n"
"mov x3, x4\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"st1 {v16.4s, v17.4s}, [x3], #32\n"
RUY_MAKE_ZERO(v16)
RUY_MAKE_ZERO(v17)
"331:\n"
".word 0x4f82e010 // sdot v16.4s, v0.16b, v2.4b[0]\n"
"beq 341f\n"
"mov x3, %[dst_tmp_buf]\n"
"mov x4, %[dst_ptr]\n"
"mov w6, #0\n"
"350:\n"
RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
"mov w5, #0\n"
"351:\n"
"ldr w7, [x3, x5, lsl #2]\n"
"str w7, [x4, x5, lsl #2]\n"
"add w5, w5, #1\n"
"cmp w5, w1\n"
"blt 351b\n"
"341:\n"
"add %[dst_ptr], %[dst_ptr], #32\n"
RUY_STR(RUY_ASM_LABEL_AFTER_STORE) ":\n"
"ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
"ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
"ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
"cmp %w[row], w7\n"
"beq 20f\n"
"add %w[row], %w[row], #8\n"
"b 21f\n"
"20:\n"
"mov %w[row], w6\n"
"add %w[col], %w[col], #8\n"
"add %[dst_col_ptr], %[dst_col_ptr], x11, lsl #3\n"
"mov %[dst_ptr], %[dst_col_ptr]\n"
"21:\n"
"cmp %w[col], w8\n"
"mov w1, #4\n"
"ble 1b\n"
: [ lhs_col_ptr ] "+r"(lhs_col_ptr), [rhs_col_ptr] "+r"(rhs_col_ptr),
[lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
[dst_col_ptr] "+r"(dst_col_ptr), [dst_ptr] "+r"(dst_ptr), [row] "+r"(row), [col] "+r"(col)
: [ params ] "r"(¶ms), [dst_rows] "r"(params.dst_rows),
[dst_cols] "r"(params.dst_cols), [dst_tmp_buf] "r"(params.dst_tmp_buf),
[dst_type_id] "r"(params.dst_type_id)
: "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc",
"memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
"v13", "v14", "v15", "v16", "v17");
}
// Quantized 8-bit GEMM kernel producing 8x8 destination blocks with the NEON
// dot-product instruction (emitted as raw `.word` encodings of `sdot`). Per
// the profiler label below, this variant is ordered for in-order cores: scalar
// loads, `ins` lane inserts and stores are interleaved with the arithmetic
// instructions.
//
// `params` supplies everything the assembly reads: base pointers, strides,
// depth, the [start_row, last_row] x [start_col, last_col] range to compute,
// zero points, optional bias / sums / multipliers (selected by `flags`),
// clamp bounds, the destination type id and a temporary buffer used for
// blocks that only partially fit in the destination.
//
// Register conventions inside the asm (established by the loads below):
//   w6 = start_row, w7 = last_row, w8 = last_col, x9 = lhs_stride,
//   x10 = rhs_stride, x11 = dst_stride, x12 = depth.
//   v0, v1 hold LHS data, v2, v3 hold RHS data.
//   v16..v31 are the int32 accumulators; each (even, odd) pair such as
//   (v16, v17) holds the 8 row entries of one destination column.
//   x1..x7 and v4..v15 are scratch and are reused with different meanings in
//   different sections.
void Kernel8bitNeonDotprodA55ish(const KernelParams8bit<8, 8>& params) {
  profiler::ScopeLabel label(
      "Kernel (kNeonDotprod, optimized for in-order cores)");
  // Compile-time check that the RUY_OFFSET_* constants used by the asm match
  // the actual layout of the params struct.
  CheckOffsetsInKernelParams8bit(params);
  const std::int8_t* lhs_col_ptr = params.lhs_base_ptr;
  const std::int8_t* rhs_col_ptr =
      static_cast<const std::int8_t*>(params.rhs_base_ptr);
  const std::int8_t* lhs_ptr = lhs_col_ptr;
  const std::int8_t* rhs_ptr = rhs_col_ptr;
  void* dst_col_ptr = params.dst_base_ptr;
  void* dst_ptr = dst_col_ptr;
  int row = params.start_row;
  int col = params.start_col;
  asm volatile(
#define RUY_MAKE_ZERO …
        // Load loop parameters into scalar registers, interleaved with
        // RUY_MAKE_ZERO on the first accumulators.
        "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
        RUY_MAKE_ZERO(v16)
        "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
        RUY_MAKE_ZERO(v17)
        "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
        RUY_MAKE_ZERO(v18)
        "ldr w8, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
        RUY_MAKE_ZERO(v19)
        "ldr w9, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
        RUY_MAKE_ZERO(v20)
        "ldr w10, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
        RUY_MAKE_ZERO(v21)
        "ldr w11, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
        RUY_MAKE_ZERO(v22)
        "ldr w12, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"

        // Load the first 32 bytes of LHS (v0, v1) and of RHS (v2, v3).
        "ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
        "ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
        "ld1 {v2.16b}, [%[rhs_ptr]], #16\n"
        "ld1 {v3.16b}, [%[rhs_ptr]], #16\n"

        // Remaining accumulators, interleaved with the first four
        // multiply-accumulates on the data already loaded.
        RUY_MAKE_ZERO(v23)
        RUY_MAKE_ZERO(v24)
        RUY_MAKE_ZERO(v25)
        RUY_MAKE_ZERO(v26)
        RUY_MAKE_ZERO(v27)
        ".word 0x4f82e010 // sdot v16.4s, v0.16b, v2.4b[0]\n"
        RUY_MAKE_ZERO(v28)
        ".word 0x4fa2e012 // sdot v18.4s, v0.16b, v2.4b[1]\n"
        RUY_MAKE_ZERO(v29)
        ".word 0x4f82e814 // sdot v20.4s, v0.16b, v2.4b[2]\n"
        RUY_MAKE_ZERO(v30)
        ".word 0x4fa2e816 // sdot v22.4s, v0.16b, v2.4b[3]\n"
        RUY_MAKE_ZERO(v31)

        // Outer loop over destination blocks: one iteration per 8x8 block.
        "1:\n"

        // x5 = value of lhs_ptr at which the accumulation loop must stop:
        // lhs_ptr + 8 * depth - 32 (32 bytes have already been loaded).
        "add x5, %[lhs_ptr], x12, lsl #3\n"
        "sub x5, x5, #32\n"
        "cmp %[lhs_ptr], x5\n"

        // Nothing further to load: go straight to the final accumulation.
        "beq 79f\n"

        // Accumulation loop. Each iteration consumes 32 bytes of LHS and of
        // RHS. The next v0..v3 are assembled from a 64-bit `ldr d` for the
        // low half plus an `ins` of a general-purpose register for the high
        // half, each placed after the last sdot that reads the old value.
        "2:\n"
        ".word 0x4f83e018 // sdot v24.4s, v0.16b, v3.4b[0]\n"
        "ldr x1, [%[lhs_ptr], #8]\n"
        ".word 0x4fa3e01a // sdot v26.4s, v0.16b, v3.4b[1]\n"
        "ldr x3, [%[rhs_ptr], #8]\n"
        ".word 0x4f83e81c // sdot v28.4s, v0.16b, v3.4b[2]\n"
        "ldr x4, [%[rhs_ptr], #24]\n"
        ".word 0x4fa3e81e // sdot v30.4s, v0.16b, v3.4b[3]\n"
        "ldr d0, [%[lhs_ptr], #0]\n"
        ".word 0x4f82e031 // sdot v17.4s, v1.16b, v2.4b[0]\n"
        "ins v0.d[1], x1\n"
        ".word 0x4fa2e033 // sdot v19.4s, v1.16b, v2.4b[1]\n"
        "ldr x2, [%[lhs_ptr], #24]\n"
        ".word 0x4f82e835 // sdot v21.4s, v1.16b, v2.4b[2]\n"
        "add %[lhs_ptr], %[lhs_ptr], #32\n"
        ".word 0x4fa2e837 // sdot v23.4s, v1.16b, v2.4b[3]\n"
        "ldr d2, [%[rhs_ptr], #0]\n"
        ".word 0x4f83e039 // sdot v25.4s, v1.16b, v3.4b[0]\n"
        "ins v2.d[1], x3\n"
        ".word 0x4fa3e03b // sdot v27.4s, v1.16b, v3.4b[1]\n"
        // Loop condition; the flags stay live until the `blt` below since
        // nothing in between writes them.
        "cmp %[lhs_ptr], x5\n"
        ".word 0x4f83e83d // sdot v29.4s, v1.16b, v3.4b[2]\n"
        "add %[rhs_ptr], %[rhs_ptr], #32\n"
        ".word 0x4fa3e83f // sdot v31.4s, v1.16b, v3.4b[3]\n"
        "ldr d3, [%[rhs_ptr], #-16]\n"
        ".word 0x4f82e010 // sdot v16.4s, v0.16b, v2.4b[0]\n"
        "ldr d1, [%[lhs_ptr], #-16]\n"
        ".word 0x4fa2e012 // sdot v18.4s, v0.16b, v2.4b[1]\n"
        "ins v3.d[1], x4\n"
        ".word 0x4f82e814 // sdot v20.4s, v0.16b, v2.4b[2]\n"
        "ins v1.d[1], x2\n"
        ".word 0x4fa2e816 // sdot v22.4s, v0.16b, v2.4b[3]\n"
        "blt 2b\n"

        // Final accumulation steps on the data already in v0..v3; no more
        // loads for this block.
        "79:\n"
        ".word 0x4f83e018 // sdot v24.4s, v0.16b, v3.4b[0]\n"
        "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
        ".word 0x4fa3e01a // sdot v26.4s, v0.16b, v3.4b[1]\n"
        // Is this the last row block? Consumed by `bge 4f` below.
        "cmp %w[row], w7\n"
        ".word 0x4f83e81c // sdot v28.4s, v0.16b, v3.4b[2]\n"
        ".word 0x4fa3e81e // sdot v30.4s, v0.16b, v3.4b[3]\n"
        ".word 0x4f82e031 // sdot v17.4s, v1.16b, v2.4b[0]\n"
        ".word 0x4fa2e033 // sdot v19.4s, v1.16b, v2.4b[1]\n"
        ".word 0x4f82e835 // sdot v21.4s, v1.16b, v2.4b[2]\n"
        ".word 0x4fa2e837 // sdot v23.4s, v1.16b, v2.4b[3]\n"
        ".word 0x4f83e039 // sdot v25.4s, v1.16b, v3.4b[0]\n"
        ".word 0x4fa3e03b // sdot v27.4s, v1.16b, v3.4b[1]\n"
        ".word 0x4f83e83d // sdot v29.4s, v1.16b, v3.4b[2]\n"
        ".word 0x4fa3e83f // sdot v31.4s, v1.16b, v3.4b[3]\n"

        // Accumulation for this block is complete. Advance the LHS/RHS column
        // pointers to the data of the next block so that its first loads can
        // be overlapped with the end work of the current block.
        "bge 4f\n"
        // Not the last row block: advance LHS by 8 * lhs_stride.
        "add %[lhs_col_ptr], %[lhs_col_ptr], x9, lsl #3\n"
        "b 5f\n"
        "4:\n"
        // Last row block: rewind LHS to its base pointer and, unless this is
        // also the last column block, advance RHS by 8 * rhs_stride.
        "mov %[lhs_col_ptr], x5\n"
        "cmp %w[col], w8\n"
        "bge 5f\n"
        "add %[rhs_col_ptr], %[rhs_col_ptr], x10, lsl #3\n"
        "5:\n"

        "mov %[lhs_ptr], %[lhs_col_ptr]\n"
        // v8 = all-ones in every lane, i.e. -1 per int32 lane; used below to
        // split the multiplier exponent.
        "mvni v8.4s, #0\n"
        "mov %[rhs_ptr], %[rhs_col_ptr]\n"
        "ldr w3, [%[params], #" RUY_STR(RUY_OFFSET_PROD_ZP_DEPTH) "]\n"
        "ldrb w6, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
        // v9 = prod_zp_depth broadcast.
        "dup v9.4s, w3\n"
        "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
        // w3 = channel index: `row` unless CHANNEL_DIMENSION_IS_COL is set,
        // in which case `col`. w3 must stay intact until the multiplier
        // pointers have been computed.
        "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
        "csel w3, %w[row], %w[col], eq\n"
        // Offset the bias pointer by the channel index only when HAS_BIAS is
        // set; otherwise the pointer is used as loaded.
        "add x5, x1, x3, lsl #2\n"
        "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
        "csel x1, x1, x5, eq\n"

        // Load 8 int32 bias values into v14, v15.
        "ld1 {v14.2s}, [x1], #8\n"
        "ldr x5, [x1], #8\n"
        "ins v14.d[1], x5\n"
        "ld1 {v15.2s}, [x1], #8\n"
        "ldr x5, [x1], #8\n"
        "ins v15.d[1], x5\n"

        // Fold prod_zp_depth into the bias values.
        "add v14.4s, v14.4s, v9.4s\n"
        "add v15.4s, v15.4s, v9.4s\n"

        // Bias addition, in one of two layouts depending on the channel
        // dimension.
        "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
        "bne 6f\n"
        // Channels are rows: the same 8 values are added to every column.
        "add v16.4s, v16.4s, v14.4s\n"
        "add v17.4s, v17.4s, v15.4s\n"
        "add v18.4s, v18.4s, v14.4s\n"
        "add v19.4s, v19.4s, v15.4s\n"
        "add v20.4s, v20.4s, v14.4s\n"
        "add v21.4s, v21.4s, v15.4s\n"
        "add v22.4s, v22.4s, v14.4s\n"
        "add v23.4s, v23.4s, v15.4s\n"
        "add v24.4s, v24.4s, v14.4s\n"
        "add v25.4s, v25.4s, v15.4s\n"
        "add v26.4s, v26.4s, v14.4s\n"
        "add v27.4s, v27.4s, v15.4s\n"
        "add v28.4s, v28.4s, v14.4s\n"
        "add v29.4s, v29.4s, v15.4s\n"
        "add v30.4s, v30.4s, v14.4s\n"
        "add v31.4s, v31.4s, v15.4s\n"
        "b 7f\n"

        "6:\n"
        // Channels are columns: one value is broadcast per accumulator pair.
        // v10..v13 are reloaded from v15 once their v14 value is consumed.
        "dup v10.4s, v14.s[0]\n"
        "dup v11.4s, v14.s[1]\n"
        "add v16.4s, v16.4s, v10.4s\n"
        "dup v12.4s, v14.s[2]\n"
        "add v17.4s, v17.4s, v10.4s\n"
        "dup v13.4s, v14.s[3]\n"
        "add v18.4s, v18.4s, v11.4s\n"
        "dup v10.4s, v15.s[0]\n"
        "add v19.4s, v19.4s, v11.4s\n"
        "dup v11.4s, v15.s[1]\n"
        "add v20.4s, v20.4s, v12.4s\n"
        "add v21.4s, v21.4s, v12.4s\n"
        "dup v12.4s, v15.s[2]\n"
        "add v22.4s, v22.4s, v13.4s\n"
        "add v23.4s, v23.4s, v13.4s\n"
        "dup v13.4s, v15.s[3]\n"
        "add v24.4s, v24.4s, v10.4s\n"
        "add v25.4s, v25.4s, v10.4s\n"
        "add v26.4s, v26.4s, v11.4s\n"
        "add v27.4s, v27.4s, v11.4s\n"
        "add v28.4s, v28.4s, v12.4s\n"
        "add v29.4s, v29.4s, v12.4s\n"
        "add v30.4s, v30.4s, v13.4s\n"
        "add v31.4s, v31.4s, v13.4s\n"
        "7:\n"

        // If HAS_RHS_SUMS: subtract lhs_zero_point * rhs_sums[col + j] from
        // column j of the block.
        "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_RHS_SUMS) "\n"
        "beq 401f\n"
        "ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_ZERO_POINT) "]\n"
        "dup v10.4s, w5\n"
        "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_RHS_SUMS) "]\n"
        "add x5, x5, %x[col], lsl #2\n"
        // Load 8 rhs_sums values into v14, v15.
        "ld1 {v14.2s}, [x5], #8\n"
        "ldr x7, [x5], #8\n"
        "ld1 {v15.2s}, [x5], #8\n"
        "ins v14.d[1], x7\n"
        "ldr x7, [x5], #8\n"
        "ins v15.d[1], x7\n"
        "mls v16.4s, v10.4s, v14.s[0]\n"
        "mls v17.4s, v10.4s, v14.s[0]\n"
        "mls v18.4s, v10.4s, v14.s[1]\n"
        "mls v19.4s, v10.4s, v14.s[1]\n"
        "mls v20.4s, v10.4s, v14.s[2]\n"
        "mls v21.4s, v10.4s, v14.s[2]\n"
        "mls v22.4s, v10.4s, v14.s[3]\n"
        "mls v23.4s, v10.4s, v14.s[3]\n"
        "mls v24.4s, v10.4s, v15.s[0]\n"
        "mls v25.4s, v10.4s, v15.s[0]\n"
        "mls v26.4s, v10.4s, v15.s[1]\n"
        "mls v27.4s, v10.4s, v15.s[1]\n"
        "mls v28.4s, v10.4s, v15.s[2]\n"
        "mls v29.4s, v10.4s, v15.s[2]\n"
        "mls v30.4s, v10.4s, v15.s[3]\n"
        "mls v31.4s, v10.4s, v15.s[3]\n"
        "401:\n"

        // If HAS_LHS_SUMS: subtract rhs_zero_point * lhs_sums[row + i] from
        // row i of the block.
        "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_LHS_SUMS) "\n"
        "beq 402f\n"
        "ldr x2, [%[params], #" RUY_STR(RUY_OFFSET_LHS_SUMS) "]\n"
        "add x2, x2, %x[row], lsl #2\n"
        "ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_RHS_ZERO_POINT) "]\n"
        // Keep rhs_zero_point in lane v13.s[1].
        "ins v13.s[1], w5\n"
        // Load 8 lhs_sums values into v11, v12.
        "ld1 {v11.2s}, [x2], #8\n"
        "ldr x4, [x2], #8\n"
        "ins v11.d[1], x4\n"
        "ld1 {v12.2s}, [x2], #8\n"
        "ldr x4, [x2], #8\n"
        "ins v12.d[1], x4\n"
        "mul v11.4s, v11.4s, v13.s[1]\n"
        "mul v12.4s, v12.4s, v13.s[1]\n"
        "sub v16.4s, v16.4s, v11.4s\n"
        "sub v17.4s, v17.4s, v12.4s\n"
        "sub v18.4s, v18.4s, v11.4s\n"
        "sub v19.4s, v19.4s, v12.4s\n"
        "sub v20.4s, v20.4s, v11.4s\n"
        "sub v21.4s, v21.4s, v12.4s\n"
        "sub v22.4s, v22.4s, v11.4s\n"
        "sub v23.4s, v23.4s, v12.4s\n"
        "sub v24.4s, v24.4s, v11.4s\n"
        "sub v25.4s, v25.4s, v12.4s\n"
        "sub v26.4s, v26.4s, v11.4s\n"
        "sub v27.4s, v27.4s, v12.4s\n"
        "sub v28.4s, v28.4s, v11.4s\n"
        "sub v29.4s, v29.4s, v12.4s\n"
        "sub v30.4s, v30.4s, v11.4s\n"
        "sub v31.4s, v31.4s, v12.4s\n"

        // The label is placed before the destination-type check so that the
        // check also runs when the LHS-sums step was skipped. Otherwise an
        // int32 destination without HAS_LHS_SUMS would fall into the
        // requantization path and the uint8 store.
        "402:\n"

        // An int32 destination receives the raw accumulators: skip the
        // multiplier and clamping.
        "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT32) "\n"
        "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT32) "f\n"

        // Requantization. x1 = exponent pointer, x4 = fixed-point multiplier
        // pointer; both are offset by the channel index (w3) only when
        // HAS_PERCHANNEL is set. The flags of the `tst` below are consumed by
        // both `csel` instructions.
        "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_EXPONENT) "]\n"
        "ldrb w6, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
        "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
        "add x5, x1, x3, lsl #2\n"
        "csel x1, x1, x5, eq\n"
        "ldr q9, [x1]\n"
        "ldr q10, [x1, #16]\n"
        // Split each exponent e into v11/v12 = min(-1, e), later applied with
        // the rounding shift srshl, and v9/v10 = e - min(-1, e), applied
        // first with sshl.
        "smin v11.4s, v8.4s, v9.4s\n"
        "smin v12.4s, v8.4s, v10.4s\n"
        "sub v9.4s, v9.4s, v11.4s\n"
        "sub v10.4s, v10.4s, v12.4s\n"
        "ldr x4, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_FIXEDPOINT) "]\n"
        "add x5, x4, x3, lsl #2\n"
        "csel x4, x4, x5, eq\n"
        // v14, v15 = 8 fixed-point multipliers.
        "ldr q14, [x4]\n"
        "ldr q15, [x4, #16]\n"

        "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
        "bne 8f\n"

        // Channels are rows: the same vectors apply to every column.
        "sshl v16.4s, v16.4s, v9.4s\n"
        "sshl v17.4s, v17.4s, v10.4s\n"
        "sshl v18.4s, v18.4s, v9.4s\n"
        "sshl v19.4s, v19.4s, v10.4s\n"
        "sshl v20.4s, v20.4s, v9.4s\n"
        "sshl v21.4s, v21.4s, v10.4s\n"
        "sshl v22.4s, v22.4s, v9.4s\n"
        "sshl v23.4s, v23.4s, v10.4s\n"
        "sshl v24.4s, v24.4s, v9.4s\n"
        "sshl v25.4s, v25.4s, v10.4s\n"
        "sshl v26.4s, v26.4s, v9.4s\n"
        "sshl v27.4s, v27.4s, v10.4s\n"
        "sshl v28.4s, v28.4s, v9.4s\n"
        "sshl v29.4s, v29.4s, v10.4s\n"
        "sshl v30.4s, v30.4s, v9.4s\n"
        "sshl v31.4s, v31.4s, v10.4s\n"
        "10:\n"
        // Multiply by the fixed-point multipliers while loading the first 32
        // bytes of LHS/RHS for the next block (low halves via ld1, high
        // halves via x1/x2/x5/x6 and the `ins` instructions further down).
        "ld1 {v0.8b}, [%[lhs_ptr]], #8\n"
        "sqdmulh v16.4s, v16.4s, v14.4s\n"
        "ldr x1, [%[lhs_ptr]], #8\n"
        "sqdmulh v17.4s, v17.4s, v15.4s\n"
        "ld1 {v1.8b}, [%[lhs_ptr]], #8\n"
        "sqdmulh v18.4s, v18.4s, v14.4s\n"
        "ldr x2, [%[lhs_ptr]], #8\n"
        "sqdmulh v19.4s, v19.4s, v15.4s\n"
        "ld1 {v2.8b}, [%[rhs_ptr]], #8\n"
        "sqdmulh v20.4s, v20.4s, v14.4s\n"
        "ldr x5, [%[rhs_ptr]], #8\n"
        "sqdmulh v21.4s, v21.4s, v15.4s\n"
        "ld1 {v3.8b}, [%[rhs_ptr]], #8\n"
        "sqdmulh v22.4s, v22.4s, v14.4s\n"
        "ldr x6, [%[rhs_ptr]], #8\n"
        "sqdmulh v23.4s, v23.4s, v15.4s\n"
        "sqdmulh v24.4s, v24.4s, v14.4s\n"
        "sqdmulh v25.4s, v25.4s, v15.4s\n"
        "sqdmulh v26.4s, v26.4s, v14.4s\n"
        "sqdmulh v27.4s, v27.4s, v15.4s\n"
        "sqdmulh v28.4s, v28.4s, v14.4s\n"
        "sqdmulh v29.4s, v29.4s, v15.4s\n"
        "sqdmulh v30.4s, v30.4s, v14.4s\n"
        "sqdmulh v31.4s, v31.4s, v15.4s\n"
        // Rounding shift by the non-positive exponent parts.
        "srshl v16.4s, v16.4s, v11.4s\n"
        "srshl v17.4s, v17.4s, v12.4s\n"
        "srshl v18.4s, v18.4s, v11.4s\n"
        "srshl v19.4s, v19.4s, v12.4s\n"
        "srshl v20.4s, v20.4s, v11.4s\n"
        "srshl v21.4s, v21.4s, v12.4s\n"
        "srshl v22.4s, v22.4s, v11.4s\n"
        "srshl v23.4s, v23.4s, v12.4s\n"
        "srshl v24.4s, v24.4s, v11.4s\n"
        "srshl v25.4s, v25.4s, v12.4s\n"
        "ldr w4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
        "srshl v26.4s, v26.4s, v11.4s\n"
        // Keep dst_zero_point in lane v13.h[4] for the store sections.
        "ins v13.h[4], w4\n"
        "srshl v27.4s, v27.4s, v12.4s\n"
        "ins v0.d[1], x1\n"
        "srshl v28.4s, v28.4s, v11.4s\n"
        "ins v1.d[1], x2\n"
        "srshl v29.4s, v29.4s, v12.4s\n"
        "ins v2.d[1], x5\n"
        "srshl v30.4s, v30.4s, v11.4s\n"
        "ins v3.d[1], x6\n"
        "srshl v31.4s, v31.4s, v12.4s\n"
        "b 9f\n"

        "8:\n"
        // Channels are columns: each accumulator pair uses one lane of the
        // exponent/multiplier vectors, broadcast through v4..v7 (free here,
        // since v0..v3 are only reloaded below).
        "dup v4.4s, v9.s[0]\n"
        "dup v5.4s, v9.s[1]\n"
        "sshl v16.4s, v16.4s, v4.4s\n"
        "dup v6.4s, v9.s[2]\n"
        "sshl v17.4s, v17.4s, v4.4s\n"
        "dup v7.4s, v9.s[3]\n"
        "sshl v18.4s, v18.4s, v5.4s\n"
        "dup v4.4s, v10.s[0]\n"
        "sshl v19.4s, v19.4s, v5.4s\n"
        "dup v5.4s, v10.s[1]\n"
        "sshl v20.4s, v20.4s, v6.4s\n"
        "sshl v21.4s, v21.4s, v6.4s\n"
        "dup v6.4s, v10.s[2]\n"
        "sshl v22.4s, v22.4s, v7.4s\n"
        "sshl v23.4s, v23.4s, v7.4s\n"
        "dup v7.4s, v10.s[3]\n"
        "sshl v24.4s, v24.4s, v4.4s\n"
        "sshl v25.4s, v25.4s, v4.4s\n"
        "sshl v26.4s, v26.4s, v5.4s\n"
        "sshl v27.4s, v27.4s, v5.4s\n"
        "sshl v28.4s, v28.4s, v6.4s\n"
        "sshl v29.4s, v29.4s, v6.4s\n"
        "sshl v30.4s, v30.4s, v7.4s\n"
        "sshl v31.4s, v31.4s, v7.4s\n"
        "11:\n"
        // Same multiply / next-block loads as above, using by-element
        // multipliers.
        "ld1 {v0.8b}, [%[lhs_ptr]], #8\n"
        "sqdmulh v16.4s, v16.4s, v14.s[0]\n"
        "ldr x1, [%[lhs_ptr]], #8\n"
        "sqdmulh v17.4s, v17.4s, v14.s[0]\n"
        "ld1 {v1.8b}, [%[lhs_ptr]], #8\n"
        "sqdmulh v18.4s, v18.4s, v14.s[1]\n"
        "ldr x2, [%[lhs_ptr]], #8\n"
        "sqdmulh v19.4s, v19.4s, v14.s[1]\n"
        "ld1 {v2.8b}, [%[rhs_ptr]], #8\n"
        "sqdmulh v20.4s, v20.4s, v14.s[2]\n"
        "ldr x5, [%[rhs_ptr]], #8\n"
        "sqdmulh v21.4s, v21.4s, v14.s[2]\n"
        "ld1 {v3.8b}, [%[rhs_ptr]], #8\n"
        "sqdmulh v22.4s, v22.4s, v14.s[3]\n"
        "ldr x6, [%[rhs_ptr]], #8\n"
        "sqdmulh v23.4s, v23.4s, v14.s[3]\n"
        "dup v4.4s, v11.s[0]\n"
        "sqdmulh v24.4s, v24.4s, v15.s[0]\n"
        "dup v5.4s, v11.s[1]\n"
        "sqdmulh v25.4s, v25.4s, v15.s[0]\n"
        "dup v6.4s, v11.s[2]\n"
        "sqdmulh v26.4s, v26.4s, v15.s[1]\n"
        "dup v7.4s, v11.s[3]\n"
        "sqdmulh v27.4s, v27.4s, v15.s[1]\n"
        "sqdmulh v28.4s, v28.4s, v15.s[2]\n"
        "sqdmulh v29.4s, v29.4s, v15.s[2]\n"
        "sqdmulh v30.4s, v30.4s, v15.s[3]\n"
        "sqdmulh v31.4s, v31.4s, v15.s[3]\n"
        "srshl v16.4s, v16.4s, v4.4s\n"
        "srshl v17.4s, v17.4s, v4.4s\n"
        "dup v4.4s, v12.s[0]\n"
        "srshl v18.4s, v18.4s, v5.4s\n"
        "srshl v19.4s, v19.4s, v5.4s\n"
        "dup v5.4s, v12.s[1]\n"
        "srshl v20.4s, v20.4s, v6.4s\n"
        "srshl v21.4s, v21.4s, v6.4s\n"
        "dup v6.4s, v12.s[2]\n"
        "srshl v22.4s, v22.4s, v7.4s\n"
        "srshl v23.4s, v23.4s, v7.4s\n"
        "dup v7.4s, v12.s[3]\n"
        "srshl v24.4s, v24.4s, v4.4s\n"
        "ldr w4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
        "srshl v25.4s, v25.4s, v4.4s\n"
        // Keep dst_zero_point in lane v13.h[4] for the store sections.
        "ins v13.h[4], w4\n"
        "srshl v26.4s, v26.4s, v5.4s\n"
        "ins v0.d[1], x1\n"
        "srshl v27.4s, v27.4s, v5.4s\n"
        "ins v1.d[1], x2\n"
        "srshl v28.4s, v28.4s, v6.4s\n"
        "ins v2.d[1], x5\n"
        "srshl v29.4s, v29.4s, v6.4s\n"
        "ins v3.d[1], x6\n"
        "srshl v30.4s, v30.4s, v7.4s\n"
        "srshl v31.4s, v31.4s, v7.4s\n"
        "9:\n"

        // Dispatch on the destination type; uint8 is the fall-through case.
        "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT16) "\n"
        "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT16) "f\n"
        "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT8) "\n"
        "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT8) "f\n"

        RUY_STR(RUY_ASM_LABEL_STORE_UINT8) ":\n"

        // Saturating-narrow int32 -> int16; v16..v23 then hold one column
        // (8 rows) each.
        "sqxtn v16.4h, v16.4s\n"
        "sqxtn2 v16.8h, v17.4s\n"
        "sqxtn v17.4h, v18.4s\n"
        "sqxtn2 v17.8h, v19.4s\n"
        "sqxtn v18.4h, v20.4s\n"
        "sqxtn2 v18.8h, v21.4s\n"
        "sqxtn v19.4h, v22.4s\n"
        "sqxtn2 v19.8h, v23.4s\n"
        "sqxtn v20.4h, v24.4s\n"
        "sqxtn2 v20.8h, v25.4s\n"
        "sqxtn v21.4h, v26.4s\n"
        "sqxtn2 v21.8h, v27.4s\n"
        "sqxtn v22.4h, v28.4s\n"
        "sqxtn2 v22.8h, v29.4s\n"
        "sqxtn v23.4h, v30.4s\n"
        "sqxtn2 v23.8h, v31.4s\n"
        // v14 = dst_zero_point broadcast to int16 lanes.
        "dup v14.8h, v13.h[4]\n"
        // v24..v31 are no longer needed for this block: prepare them as
        // accumulators for the next one.
        RUY_MAKE_ZERO(v24)
        RUY_MAKE_ZERO(v25)
        RUY_MAKE_ZERO(v26)
        RUY_MAKE_ZERO(v27)
        RUY_MAKE_ZERO(v28)
        RUY_MAKE_ZERO(v29)
        RUY_MAKE_ZERO(v30)
        RUY_MAKE_ZERO(v31)
        // Add the destination zero point (saturating).
        "sqadd v16.8h, v16.8h, v14.8h\n"
        "sqadd v17.8h, v17.8h, v14.8h\n"
        "sqadd v18.8h, v18.8h, v14.8h\n"
        "sqadd v19.8h, v19.8h, v14.8h\n"
        "sqadd v20.8h, v20.8h, v14.8h\n"
        "sqadd v21.8h, v21.8h, v14.8h\n"
        "sqadd v22.8h, v22.8h, v14.8h\n"
        "sqadd v23.8h, v23.8h, v14.8h\n"
        // Saturating-narrow int16 -> uint8 (two columns per register) while
        // loading the clamp bounds.
        "ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
        "sqxtun v16.8b, v16.8h\n"
        "ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
        "sqxtun2 v16.16b, v17.8h\n"
        "sqxtun v17.8b, v18.8h\n"
        "sqxtun2 v17.16b, v19.8h\n"
        "sqxtun v18.8b, v20.8h\n"
        "sqxtun2 v18.16b, v21.8h\n"
        "sqxtun v19.8b, v22.8h\n"
        "sqxtun2 v19.16b, v23.8h\n"
        "dup v14.16b, w2\n"
        "dup v15.16b, w3\n"

        // Clamp, interleaved with computing how much of the 8x8 block fits:
        // w1 = min(dst_rows - row, 8), w2 = min(dst_cols - col, 8).
        "sub w1, %w[dst_rows], %w[row]\n"
        "umax v16.16b, v16.16b, v14.16b\n"
        "sub w2, %w[dst_cols], %w[col]\n"
        "umax v17.16b, v17.16b, v14.16b\n"
        "mov w3, #8\n"
        "umax v18.16b, v18.16b, v14.16b\n"
        "cmp w1, #8\n"
        "umax v19.16b, v19.16b, v14.16b\n"
        "csel w1, w1, w3, le\n"
        "umin v16.16b, v16.16b, v15.16b\n"
        "cmp w2, #8\n"
        "umin v17.16b, v17.16b, v15.16b\n"
        "csel w2, w2, w3, le\n"
        "umin v18.16b, v18.16b, v15.16b\n"
        "umin v19.16b, v19.16b, v15.16b\n"

        // Move the odd columns (upper halves) into their own registers so
        // that every column can be stored with an 8-byte st1.
        "dup d20, v16.d[1]\n"
        "dup d21, v17.d[1]\n"
        "dup d22, v18.d[1]\n"
        "dup d23, v19.d[1]\n"

        // EQ iff the whole 8x8 block fits (w1 == 8 && w2 == 8). These flags
        // are also consumed by `beq 41f` after the stores.
        "cmp w1, w3\n"
        "ccmp w2, w3, 0, eq\n"
        "beq 30f\n"
        // Partial block: store to dst_tmp_buf with a column stride of 8.
        "mov x3, %[dst_tmp_buf]\n"
        "mov x4, #8\n"
        "b 31f\n"
        "30:\n"
        // Full block: store directly to the destination.
        "mov x3, %[dst_ptr]\n"
        "mov x4, x11\n"
        "31:\n"

        // Store the 8 columns; registers are re-zeroed once stored and the
        // first sdots of the next block are interleaved.
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        "st1 {v16.8b}, [x3], x4\n"
        RUY_MAKE_ZERO(v16)
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        "st1 {v20.8b}, [x3], x4\n"
        RUY_MAKE_ZERO(v20)
        ".word 0x4f82e010 // sdot v16.4s, v0.16b, v2.4b[0]\n"
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        "st1 {v17.8b}, [x3], x4\n"
        RUY_MAKE_ZERO(v17)
        ".word 0x4f82e814 // sdot v20.4s, v0.16b, v2.4b[2]\n"
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        "st1 {v21.8b}, [x3], x4\n"
        RUY_MAKE_ZERO(v21)
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        "st1 {v18.8b}, [x3], x4\n"
        RUY_MAKE_ZERO(v18)
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        "st1 {v22.8b}, [x3], x4\n"
        RUY_MAKE_ZERO(v22)
        ".word 0x4fa2e012 // sdot v18.4s, v0.16b, v2.4b[1]\n"
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        "st1 {v19.8b}, [x3], x4\n"
        RUY_MAKE_ZERO(v19)
        ".word 0x4fa2e816 // sdot v22.4s, v0.16b, v2.4b[3]\n"
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        "st1 {v23.8b}, [x3], x4\n"
        RUY_MAKE_ZERO(v23)

        // Full block: done.
        "beq 41f\n"
        // Partial block: copy the w1 x w2 fitting part from dst_tmp_buf to
        // the destination, byte by byte.
        "mov x3, %[dst_tmp_buf]\n"
        "mov x4, %[dst_ptr]\n"
        "mov w6, #0\n"
        "50:\n"
        RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
        "mov w5, #0\n"
        "51:\n"
        "ldrb w7, [x3, w5, uxtw]\n"
        "strb w7, [x4, w5, uxtw]\n"
        "add w5, w5, #1\n"
        "cmp w5, w1\n"
        "blt 51b\n"
        "add w6, w6, #1\n"
        "add x3, x3, #8\n"
        "add x4, x4, x11\n"
        "cmp w6, w2\n"
        "blt 50b\n"
        "41:\n"
        // Advance to the next row block: 8 one-byte entries.
        "add %[dst_ptr], %[dst_ptr], #8\n"

        "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"

        RUY_STR(RUY_ASM_LABEL_STORE_INT8) ":\n"

        // Same sequence as the uint8 case, with signed narrowing and signed
        // clamping.
        "sqxtn v16.4h, v16.4s\n"
        "sqxtn2 v16.8h, v17.4s\n"
        "sqxtn v17.4h, v18.4s\n"
        "sqxtn2 v17.8h, v19.4s\n"
        "sqxtn v18.4h, v20.4s\n"
        "sqxtn2 v18.8h, v21.4s\n"
        "sqxtn v19.4h, v22.4s\n"
        "sqxtn2 v19.8h, v23.4s\n"
        "sqxtn v20.4h, v24.4s\n"
        "sqxtn2 v20.8h, v25.4s\n"
        "sqxtn v21.4h, v26.4s\n"
        "sqxtn2 v21.8h, v27.4s\n"
        "sqxtn v22.4h, v28.4s\n"
        "sqxtn2 v22.8h, v29.4s\n"
        "sqxtn v23.4h, v30.4s\n"
        "sqxtn2 v23.8h, v31.4s\n"
        "dup v14.8h, v13.h[4]\n"
        RUY_MAKE_ZERO(v24)
        RUY_MAKE_ZERO(v25)
        RUY_MAKE_ZERO(v26)
        RUY_MAKE_ZERO(v27)
        RUY_MAKE_ZERO(v28)
        RUY_MAKE_ZERO(v29)
        RUY_MAKE_ZERO(v30)
        RUY_MAKE_ZERO(v31)
        "sqadd v16.8h, v16.8h, v14.8h\n"
        "sqadd v17.8h, v17.8h, v14.8h\n"
        "sqadd v18.8h, v18.8h, v14.8h\n"
        "sqadd v19.8h, v19.8h, v14.8h\n"
        "sqadd v20.8h, v20.8h, v14.8h\n"
        "sqadd v21.8h, v21.8h, v14.8h\n"
        "sqadd v22.8h, v22.8h, v14.8h\n"
        "sqadd v23.8h, v23.8h, v14.8h\n"
        "ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
        "sqxtn v16.8b, v16.8h\n"
        "ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
        "sqxtn2 v16.16b, v17.8h\n"
        "sqxtn v17.8b, v18.8h\n"
        "sqxtn2 v17.16b, v19.8h\n"
        "sqxtn v18.8b, v20.8h\n"
        "sqxtn2 v18.16b, v21.8h\n"
        "sqxtn v19.8b, v22.8h\n"
        "sqxtn2 v19.16b, v23.8h\n"
        "dup v14.16b, w2\n"
        "dup v15.16b, w3\n"

        // w1 = min(dst_rows - row, 8), w2 = min(dst_cols - col, 8).
        "sub w1, %w[dst_rows], %w[row]\n"
        "smax v16.16b, v16.16b, v14.16b\n"
        "sub w2, %w[dst_cols], %w[col]\n"
        "smax v17.16b, v17.16b, v14.16b\n"
        "mov w3, #8\n"
        "smax v18.16b, v18.16b, v14.16b\n"
        "cmp w1, #8\n"
        "smax v19.16b, v19.16b, v14.16b\n"
        "csel w1, w1, w3, le\n"
        "smin v16.16b, v16.16b, v15.16b\n"
        "cmp w2, #8\n"
        "smin v17.16b, v17.16b, v15.16b\n"
        "csel w2, w2, w3, le\n"
        "smin v18.16b, v18.16b, v15.16b\n"
        "smin v19.16b, v19.16b, v15.16b\n"

        "dup d20, v16.d[1]\n"
        "dup d21, v17.d[1]\n"
        "dup d22, v18.d[1]\n"
        "dup d23, v19.d[1]\n"

        // EQ iff the whole 8x8 block fits; also consumed by `beq 141f`.
        "cmp w1, w3\n"
        "ccmp w2, w3, 0, eq\n"
        "beq 130f\n"
        "mov x3, %[dst_tmp_buf]\n"
        "mov x4, #8\n"
        "b 131f\n"
        "130:\n"
        "mov x3, %[dst_ptr]\n"
        "mov x4, x11\n"
        "131:\n"

        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        "st1 {v16.8b}, [x3], x4\n"
        RUY_MAKE_ZERO(v16)
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        "st1 {v20.8b}, [x3], x4\n"
        RUY_MAKE_ZERO(v20)
        ".word 0x4f82e010 // sdot v16.4s, v0.16b, v2.4b[0]\n"
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        "st1 {v17.8b}, [x3], x4\n"
        RUY_MAKE_ZERO(v17)
        ".word 0x4f82e814 // sdot v20.4s, v0.16b, v2.4b[2]\n"
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        "st1 {v21.8b}, [x3], x4\n"
        RUY_MAKE_ZERO(v21)
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        "st1 {v18.8b}, [x3], x4\n"
        RUY_MAKE_ZERO(v18)
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        "st1 {v22.8b}, [x3], x4\n"
        RUY_MAKE_ZERO(v22)
        ".word 0x4fa2e012 // sdot v18.4s, v0.16b, v2.4b[1]\n"
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        "st1 {v19.8b}, [x3], x4\n"
        RUY_MAKE_ZERO(v19)
        ".word 0x4fa2e816 // sdot v22.4s, v0.16b, v2.4b[3]\n"
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        "st1 {v23.8b}, [x3], x4\n"
        RUY_MAKE_ZERO(v23)

        "beq 141f\n"
        // Partial block: copy the fitting part out of dst_tmp_buf.
        "mov x3, %[dst_tmp_buf]\n"
        "mov x4, %[dst_ptr]\n"
        "mov w6, #0\n"
        "150:\n"
        RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
        "mov w5, #0\n"
        "151:\n"
        "ldrb w7, [x3, w5, uxtw]\n"
        "strb w7, [x4, w5, uxtw]\n"
        "add w5, w5, #1\n"
        "cmp w5, w1\n"
        "blt 151b\n"
        "add w6, w6, #1\n"
        "add x3, x3, #8\n"
        "add x4, x4, x11\n"
        "cmp w6, w2\n"
        "blt 150b\n"
        "141:\n"
        "add %[dst_ptr], %[dst_ptr], #8\n"

        "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"

        RUY_STR(RUY_ASM_LABEL_STORE_INT16) ":\n"

        // Add the destination zero point in int32 (widening add), then
        // saturating-narrow to int16: v16..v23 hold one column each.
        "dup v14.8h, v13.h[4]\n"
        "saddw v16.4s, v16.4s, v14.4h\n"
        "saddw v17.4s, v17.4s, v14.4h\n"
        "saddw v18.4s, v18.4s, v14.4h\n"
        "saddw v19.4s, v19.4s, v14.4h\n"
        "saddw v20.4s, v20.4s, v14.4h\n"
        "saddw v21.4s, v21.4s, v14.4h\n"
        "saddw v22.4s, v22.4s, v14.4h\n"
        "saddw v23.4s, v23.4s, v14.4h\n"
        "saddw v24.4s, v24.4s, v14.4h\n"
        "saddw v25.4s, v25.4s, v14.4h\n"
        "saddw v26.4s, v26.4s, v14.4h\n"
        "saddw v27.4s, v27.4s, v14.4h\n"
        "saddw v28.4s, v28.4s, v14.4h\n"
        "saddw v29.4s, v29.4s, v14.4h\n"
        "saddw v30.4s, v30.4s, v14.4h\n"
        "saddw v31.4s, v31.4s, v14.4h\n"
        "sqxtn v16.4h, v16.4s\n"
        "sqxtn2 v16.8h, v17.4s\n"
        "sqxtn v17.4h, v18.4s\n"
        "sqxtn2 v17.8h, v19.4s\n"
        "sqxtn v18.4h, v20.4s\n"
        "sqxtn2 v18.8h, v21.4s\n"
        "sqxtn v19.4h, v22.4s\n"
        "sqxtn2 v19.8h, v23.4s\n"
        "sqxtn v20.4h, v24.4s\n"
        "sqxtn2 v20.8h, v25.4s\n"
        "sqxtn v21.4h, v26.4s\n"
        "sqxtn2 v21.8h, v27.4s\n"
        "sqxtn v22.4h, v28.4s\n"
        "sqxtn2 v22.8h, v29.4s\n"
        "sqxtn v23.4h, v30.4s\n"
        "sqxtn2 v23.8h, v31.4s\n"
        RUY_MAKE_ZERO(v24)
        RUY_MAKE_ZERO(v25)
        RUY_MAKE_ZERO(v26)
        RUY_MAKE_ZERO(v27)
        RUY_MAKE_ZERO(v28)
        RUY_MAKE_ZERO(v29)
        RUY_MAKE_ZERO(v30)
        RUY_MAKE_ZERO(v31)

        // Clamp to [clamp_min, clamp_max], read here as signed 16-bit values.
        "ldrsh w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
        "ldrsh w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
        "dup v14.8h, w2\n"
        "dup v15.8h, w3\n"
        "smax v16.8h, v16.8h, v14.8h\n"
        "smax v17.8h, v17.8h, v14.8h\n"
        "smax v18.8h, v18.8h, v14.8h\n"
        "smax v19.8h, v19.8h, v14.8h\n"
        "smax v20.8h, v20.8h, v14.8h\n"
        "smax v21.8h, v21.8h, v14.8h\n"
        "smax v22.8h, v22.8h, v14.8h\n"
        "smax v23.8h, v23.8h, v14.8h\n"
        "smin v16.8h, v16.8h, v15.8h\n"
        "smin v17.8h, v17.8h, v15.8h\n"
        "smin v18.8h, v18.8h, v15.8h\n"
        "smin v19.8h, v19.8h, v15.8h\n"
        "smin v20.8h, v20.8h, v15.8h\n"
        "smin v21.8h, v21.8h, v15.8h\n"
        "smin v22.8h, v22.8h, v15.8h\n"
        "smin v23.8h, v23.8h, v15.8h\n"

        // w1 = min(dst_rows - row, 8), w2 = min(dst_cols - col, 8).
        "sub w1, %w[dst_rows], %w[row]\n"
        "sub w2, %w[dst_cols], %w[col]\n"
        "mov w3, #8\n"
        "cmp w1, #8\n"
        "csel w1, w1, w3, le\n"
        "cmp w2, #8\n"
        "csel w2, w2, w3, le\n"

        // EQ iff the whole 8x8 block fits; also consumed by `beq 241f`.
        "cmp w1, w3\n"
        "ccmp w2, w3, 0, eq\n"
        "beq 230f\n"
        // Partial block: dst_tmp_buf with a column stride of 16 bytes.
        "mov x3, %[dst_tmp_buf]\n"
        "mov x4, #16\n"
        "b 231f\n"
        "230:\n"
        "mov x3, %[dst_ptr]\n"
        "mov x4, x11\n"
        "231:\n"

        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        "st1 {v16.8h}, [x3], x4\n"
        RUY_MAKE_ZERO(v16)
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        "st1 {v17.8h}, [x3], x4\n"
        RUY_MAKE_ZERO(v17)
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        "st1 {v18.8h}, [x3], x4\n"
        RUY_MAKE_ZERO(v18)
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        "st1 {v19.8h}, [x3], x4\n"
        RUY_MAKE_ZERO(v19)
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        "st1 {v20.8h}, [x3], x4\n"
        RUY_MAKE_ZERO(v20)
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        "st1 {v21.8h}, [x3], x4\n"
        RUY_MAKE_ZERO(v21)
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        "st1 {v22.8h}, [x3], x4\n"
        RUY_MAKE_ZERO(v22)
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        "st1 {v23.8h}, [x3], x4\n"
        RUY_MAKE_ZERO(v23)

        // First multiply-accumulates of the next block.
        ".word 0x4f82e010 // sdot v16.4s, v0.16b, v2.4b[0]\n"
        ".word 0x4fa2e012 // sdot v18.4s, v0.16b, v2.4b[1]\n"
        ".word 0x4f82e814 // sdot v20.4s, v0.16b, v2.4b[2]\n"
        ".word 0x4fa2e816 // sdot v22.4s, v0.16b, v2.4b[3]\n"

        "beq 241f\n"
        // Partial block: copy the fitting part out of dst_tmp_buf, one
        // 16-bit entry at a time.
        "mov x3, %[dst_tmp_buf]\n"
        "mov x4, %[dst_ptr]\n"
        "mov w6, #0\n"
        "250:\n"
        RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
        "mov w5, #0\n"
        "251:\n"
        "ldrsh w7, [x3, x5, lsl #1]\n"
        "strh w7, [x4, x5, lsl #1]\n"
        "add w5, w5, #1\n"
        "cmp w5, w1\n"
        "blt 251b\n"
        "add w6, w6, #1\n"
        "add x3, x3, #16\n"
        "add x4, x4, x11\n"
        "cmp w6, w2\n"
        "blt 250b\n"
        "241:\n"
        // Advance to the next row block: 8 two-byte entries.
        "add %[dst_ptr], %[dst_ptr], #16\n"

        "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"

        RUY_STR(RUY_ASM_LABEL_STORE_INT32) ":\n"

        // This path skipped the requantization code, so the first 32 bytes of
        // LHS/RHS for the next block are loaded here instead.
        "ld1 {v0.8b}, [%[lhs_ptr]], #8\n"
        "ldr x1, [%[lhs_ptr]], #8\n"
        "ld1 {v1.8b}, [%[lhs_ptr]], #8\n"
        "ldr x2, [%[lhs_ptr]], #8\n"
        "ld1 {v2.8b}, [%[rhs_ptr]], #8\n"
        "ldr x5, [%[rhs_ptr]], #8\n"
        "ld1 {v3.8b}, [%[rhs_ptr]], #8\n"
        "ldr x6, [%[rhs_ptr]], #8\n"
        "ins v0.d[1], x1\n"
        "ins v1.d[1], x2\n"
        "ins v2.d[1], x5\n"
        "ins v3.d[1], x6\n"

        // w1 = min(dst_rows - row, 8), w2 = min(dst_cols - col, 8).
        "sub w1, %w[dst_rows], %w[row]\n"
        "sub w2, %w[dst_cols], %w[col]\n"
        "mov w3, #8\n"
        "cmp w1, #8\n"
        "csel w1, w1, w3, le\n"
        "cmp w2, #8\n"
        "csel w2, w2, w3, le\n"

        // EQ iff the whole 8x8 block fits; also consumed by `beq 341f`.
        "cmp w1, w3\n"
        "ccmp w2, w3, 0, eq\n"
        "beq 330f\n"
        // Partial block: store the raw accumulators contiguously to
        // dst_tmp_buf (32 bytes per column).
        "mov x3, %[dst_tmp_buf]\n"
        "st1 {v16.4s}, [x3], #16\n"
        RUY_MAKE_ZERO(v16)
        "st1 {v17.4s}, [x3], #16\n"
        RUY_MAKE_ZERO(v17)
        "st1 {v18.4s}, [x3], #16\n"
        RUY_MAKE_ZERO(v18)
        "st1 {v19.4s}, [x3], #16\n"
        RUY_MAKE_ZERO(v19)
        "st1 {v20.4s}, [x3], #16\n"
        RUY_MAKE_ZERO(v20)
        "st1 {v21.4s}, [x3], #16\n"
        RUY_MAKE_ZERO(v21)
        "st1 {v22.4s}, [x3], #16\n"
        RUY_MAKE_ZERO(v22)
        "st1 {v23.4s}, [x3], #16\n"
        RUY_MAKE_ZERO(v23)
        "st1 {v24.4s}, [x3], #16\n"
        RUY_MAKE_ZERO(v24)
        "st1 {v25.4s}, [x3], #16\n"
        RUY_MAKE_ZERO(v25)
        "st1 {v26.4s}, [x3], #16\n"
        RUY_MAKE_ZERO(v26)
        "st1 {v27.4s}, [x3], #16\n"
        RUY_MAKE_ZERO(v27)
        "st1 {v28.4s}, [x3], #16\n"
        RUY_MAKE_ZERO(v28)
        "st1 {v29.4s}, [x3], #16\n"
        RUY_MAKE_ZERO(v29)
        "st1 {v30.4s}, [x3], #16\n"
        RUY_MAKE_ZERO(v30)
        "st1 {v31.4s}, [x3], #16\n"
        RUY_MAKE_ZERO(v31)

        "b 331f\n"

        "330:\n"
        // Full block: store each column pair directly, stepping by the
        // destination stride.
        "mov x4, %[dst_ptr]\n"
        RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
        "st1 {v16.4s, v17.4s}, [x4], x11\n"
        RUY_MAKE_ZERO(v16)
        RUY_MAKE_ZERO(v17)
        RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
        "st1 {v18.4s, v19.4s}, [x4], x11\n"
        RUY_MAKE_ZERO(v18)
        RUY_MAKE_ZERO(v19)
        RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
        "st1 {v20.4s, v21.4s}, [x4], x11\n"
        RUY_MAKE_ZERO(v20)
        RUY_MAKE_ZERO(v21)
        RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
        "st1 {v22.4s, v23.4s}, [x4], x11\n"
        RUY_MAKE_ZERO(v22)
        RUY_MAKE_ZERO(v23)
        RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
        "st1 {v24.4s, v25.4s}, [x4], x11\n"
        RUY_MAKE_ZERO(v24)
        RUY_MAKE_ZERO(v25)
        RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
        "st1 {v26.4s, v27.4s}, [x4], x11\n"
        RUY_MAKE_ZERO(v26)
        RUY_MAKE_ZERO(v27)
        RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
        "st1 {v28.4s, v29.4s}, [x4], x11\n"
        RUY_MAKE_ZERO(v28)
        RUY_MAKE_ZERO(v29)
        RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
        "st1 {v30.4s, v31.4s}, [x4], x11\n"
        RUY_MAKE_ZERO(v30)
        RUY_MAKE_ZERO(v31)

        "331:\n"

        // First multiply-accumulates of the next block.
        ".word 0x4f82e010 // sdot v16.4s, v0.16b, v2.4b[0]\n"
        ".word 0x4fa2e012 // sdot v18.4s, v0.16b, v2.4b[1]\n"
        ".word 0x4f82e814 // sdot v20.4s, v0.16b, v2.4b[2]\n"
        ".word 0x4fa2e816 // sdot v22.4s, v0.16b, v2.4b[3]\n"

        "beq 341f\n"
        // Partial block: copy the fitting part out of dst_tmp_buf, one
        // 32-bit entry at a time.
        "mov x3, %[dst_tmp_buf]\n"
        "mov x4, %[dst_ptr]\n"
        "mov w6, #0\n"
        "350:\n"
        RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
        "mov w5, #0\n"
        "351:\n"
        "ldr w7, [x3, x5, lsl #2]\n"
        "str w7, [x4, x5, lsl #2]\n"
        "add w5, w5, #1\n"
        "cmp w5, w1\n"
        "blt 351b\n"
        "add w6, w6, #1\n"
        "add x3, x3, #32\n"
        "add x4, x4, x11\n"
        "cmp w6, w2\n"
        "blt 350b\n"
        "341:\n"
        // Advance to the next row block: 8 four-byte entries.
        "add %[dst_ptr], %[dst_ptr], #32\n"

        RUY_STR(RUY_ASM_LABEL_AFTER_STORE) ":\n"

        // w6 and w7 were used as scratch above: reload start_row / last_row.
        "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
        "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
        // Move to the next block: next row block, or, after the last row
        // block, back to start_row of the next column block.
        "cmp %w[row], w7\n"
        "beq 20f\n"
        "add %w[row], %w[row], #8\n"
        "b 21f\n"
        "20:\n"
        "mov %w[row], w6\n"
        "add %w[col], %w[col], #8\n"
        "add %[dst_col_ptr], %[dst_col_ptr], x11, lsl #3\n"
        "mov %[dst_ptr], %[dst_col_ptr]\n"
        "21:\n"

        // Outer loop exit condition: continue while col <= last_col.
        "cmp %w[col], w8\n"
        "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
        "ble 1b\n"

        : [ lhs_col_ptr ] "+r"(lhs_col_ptr), [rhs_col_ptr] "+r"(rhs_col_ptr),
          [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
          [dst_col_ptr] "+r"(dst_col_ptr), [dst_ptr] "+r"(dst_ptr), [row] "+r"(row), [col] "+r"(col)
        : [ params ] "r"(&params), [dst_rows] "r"(params.dst_rows),
          [dst_cols] "r"(params.dst_cols), [dst_tmp_buf] "r"(params.dst_tmp_buf),
          [dst_type_id] "r"(params.dst_type_id)
        : "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc",
          "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
          "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
          "v26", "v27", "v28", "v29", "v30", "v31");
}
#undef RUY_OFFSET_BIAS
#undef RUY_OFFSET_LHS_SUMS
#undef RUY_OFFSET_RHS_SUMS
#undef RUY_OFFSET_LHS_BASE_PTR
#undef RUY_OFFSET_MULTIPLIER_FIXEDPOINT
#undef RUY_OFFSET_MULTIPLIER_EXPONENT
#undef RUY_OFFSET_RHS_BASE_PTR
#undef RUY_OFFSET_DST_BASE_PTR
#undef RUY_OFFSET_LHS_ZERO_POINT
#undef RUY_OFFSET_RHS_ZERO_POINT
#undef RUY_OFFSET_DST_ZERO_POINT
#undef RUY_OFFSET_PROD_ZP_DEPTH
#undef RUY_OFFSET_START_ROW
#undef RUY_OFFSET_START_COL
#undef RUY_OFFSET_LAST_ROW
#undef RUY_OFFSET_LAST_COL
#undef RUY_OFFSET_DST_ROWS
#undef RUY_OFFSET_DST_COLS
#undef RUY_OFFSET_LHS_STRIDE
#undef RUY_OFFSET_RHS_STRIDE
#undef RUY_OFFSET_DST_STRIDE
#undef RUY_OFFSET_DEPTH
#undef RUY_OFFSET_CLAMP_MIN
#undef RUY_OFFSET_CLAMP_MAX
#undef RUY_OFFSET_FLAGS
#define RUY_OFFSET_LHS_BASE_PTR …
#define RUY_OFFSET_RHS_BASE_PTR …
#define RUY_OFFSET_DST_BASE_PTR …
#define RUY_OFFSET_BIAS …
#define RUY_OFFSET_START_ROW …
#define RUY_OFFSET_START_COL …
#define RUY_OFFSET_LAST_ROW …
#define RUY_OFFSET_LAST_COL …
#define RUY_OFFSET_LHS_STRIDE …
#define RUY_OFFSET_RHS_STRIDE …
#define RUY_OFFSET_DST_STRIDE …
#define RUY_OFFSET_DEPTH …
#define RUY_OFFSET_CLAMP_MIN …
#define RUY_OFFSET_CLAMP_MAX …
#define RUY_OFFSET_FLAGS …
// Compile-time guard for the float kernels in this file. Their inline assembly
// addresses fields of the params struct through the hard-coded RUY_OFFSET_*
// byte offsets, so every such macro has to agree with the real offsetof() of
// the field it stands for. The argument is never read; it is only there so
// that Params can be deduced at the call site.
template <typename Params>
void CheckOffsetsInKernelParamsFloat(const Params&) {
  // Matrix base pointers and the bias vector.
  static_assert(RUY_OFFSET_LHS_BASE_PTR == offsetof(Params, lhs_base_ptr), "");
  static_assert(RUY_OFFSET_RHS_BASE_PTR == offsetof(Params, rhs_base_ptr), "");
  static_assert(RUY_OFFSET_DST_BASE_PTR == offsetof(Params, dst_base_ptr), "");
  static_assert(RUY_OFFSET_BIAS == offsetof(Params, bias), "");

  // Bounds of the destination region handled by one kernel invocation.
  static_assert(RUY_OFFSET_START_ROW == offsetof(Params, start_row), "");
  static_assert(RUY_OFFSET_START_COL == offsetof(Params, start_col), "");
  static_assert(RUY_OFFSET_LAST_ROW == offsetof(Params, last_row), "");
  static_assert(RUY_OFFSET_LAST_COL == offsetof(Params, last_col), "");

  // Strides and accumulation depth.
  static_assert(RUY_OFFSET_LHS_STRIDE == offsetof(Params, lhs_stride), "");
  static_assert(RUY_OFFSET_RHS_STRIDE == offsetof(Params, rhs_stride), "");
  static_assert(RUY_OFFSET_DST_STRIDE == offsetof(Params, dst_stride), "");
  static_assert(RUY_OFFSET_DEPTH == offsetof(Params, depth), "");

  // Output clamping bounds and the flags byte.
  static_assert(RUY_OFFSET_CLAMP_MIN == offsetof(Params, clamp_min), "");
  static_assert(RUY_OFFSET_CLAMP_MAX == offsetof(Params, clamp_max), "");
  static_assert(RUY_OFFSET_FLAGS == offsetof(Params, flags), "");
}
// Float matrix-multiplication kernel for AArch64 NEON, implemented as a single
// inline-assembly block.
//
// The kernel walks the destination in 8x8 blocks. It starts at
// (params.start_row, params.start_col), advances 8 rows at a time until
// params.last_row is reached, then wraps back to start_row and moves 8 columns
// to the right, and stops once the column index exceeds params.last_col.
//
// For each block it:
//   1. accumulates, over params.depth levels, the products of 8 LHS floats and
//      8 RHS floats per level into the 16 accumulator registers v16 -- v31
//      (each destination column is held in two registers: rows 0-3, rows 4-7);
//   2. adds the bias, either per row or per column depending on
//      RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL;
//   3. clamps to [params.clamp_min, params.clamp_max];
//   4. stores the block, going through params.dst_tmp_buf and a scalar copy
//      loop when the block only partially fits in the destination.
//
// Register roles inside the asm (loaded once unless noted):
//   x5  = lhs_base_ptr (reloaded after each block, x5 is reused as scratch)
//   w6  = start_row, w7 = last_row (both reloaded after each block)
//   w8  = last_col
//   x9  = lhs_stride, x10 = rhs_stride, x11 = dst_stride; all three are added
//         to byte addresses, x9/x10 scaled by 8 to step over 8 rows/columns
//   w12 = depth
//   w1  = number of levels of depth already loaded for the current block
void KernelFloatNeon(const KernelParamsFloat<8, 8>& params) {
  CheckOffsetsInKernelParamsFloat(params);
  profiler::ScopeLabel label("Kernel (kNeon)");
  const float* lhs_col_ptr = params.lhs_base_ptr;
  const float* rhs_col_ptr = params.rhs_base_ptr;
  const float* lhs_ptr = lhs_col_ptr;
  const float* rhs_ptr = rhs_col_ptr;
  float* dst_col_ptr = params.dst_base_ptr;
  float* dst_ptr = dst_col_ptr;
  int row = params.start_row;
  int col = params.start_col;
  asm volatile(
#define RUY_MAKE_ZERO …
      // Load the params that stay in registers for the whole kernel.
      "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
      "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
      "ldr w8, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
      "ldr w9, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
      "ldr w10, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
      "ldr w11, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
      "ldr w12, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"
      // Load the first level of depth: 8 LHS floats in v0/v1, 8 RHS floats in
      // v2/v3.
      "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
      "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
      "ld1 {v2.4s}, [%[rhs_ptr]], #16\n"
      "ld1 {v3.4s}, [%[rhs_ptr]], #16\n"
      // Clear the 16 accumulators.
      RUY_MAKE_ZERO(v16)
      RUY_MAKE_ZERO(v17)
      RUY_MAKE_ZERO(v18)
      RUY_MAKE_ZERO(v19)
      RUY_MAKE_ZERO(v20)
      RUY_MAKE_ZERO(v21)
      RUY_MAKE_ZERO(v22)
      RUY_MAKE_ZERO(v23)
      RUY_MAKE_ZERO(v24)
      RUY_MAKE_ZERO(v25)
      RUY_MAKE_ZERO(v26)
      RUY_MAKE_ZERO(v27)
      RUY_MAKE_ZERO(v28)
      RUY_MAKE_ZERO(v29)
      RUY_MAKE_ZERO(v30)
      RUY_MAKE_ZERO(v31)
      // One level of depth has been loaded so far.
      "mov w1, #1\n"
      // Main loop over destination blocks.
      "1:\n"
      // First quarter of the multiply-adds for the level held in v0 -- v3.
      "fmla v16.4s, v0.4s, v2.s[0]\n"
      "fmla v18.4s, v0.4s, v2.s[1]\n"
      "fmla v20.4s, v0.4s, v2.s[2]\n"
      "fmla v22.4s, v0.4s, v2.s[3]\n"
#if RUY_OPT(MAX_STREAMING)
      // Streaming path, 4 levels of depth per iteration. Skipped when
      // depth < 8.
      "cmp w12, #8\n"
      "blt 78f\n"
      // w2 = depth rounded down to a multiple of 4.
      "and w2, w12, #-4\n"
      // Load three more levels into v4 -- v15 (LHS in v4/5, v8/9, v12/13; RHS
      // in v6/7, v10/11, v14/15).
      "ld1 {v4.4s}, [%[lhs_ptr]], #16\n"
      "ld1 {v5.4s}, [%[lhs_ptr]], #16\n"
      "ld1 {v6.4s}, [%[rhs_ptr]], #16\n"
      "ld1 {v7.4s}, [%[rhs_ptr]], #16\n"
      "ld1 {v8.4s}, [%[lhs_ptr]], #16\n"
      "ld1 {v9.4s}, [%[lhs_ptr]], #16\n"
      "ld1 {v10.4s}, [%[rhs_ptr]], #16\n"
      "ld1 {v11.4s}, [%[rhs_ptr]], #16\n"
      "ld1 {v12.4s}, [%[lhs_ptr]], #16\n"
      "ld1 {v13.4s}, [%[lhs_ptr]], #16\n"
      "ld1 {v14.4s}, [%[rhs_ptr]], #16\n"
      "ld1 {v15.4s}, [%[rhs_ptr]], #16\n"
      // Four levels are now loaded.
      "mov w1, #4\n"
      "80:\n"
      // Advance both pointers by 4 levels (4 * 32 bytes) up front; the
      // reloads below use negative offsets relative to the new position.
      "add %[lhs_ptr], %[lhs_ptr], #128\n"
      "add %[rhs_ptr], %[rhs_ptr], #128\n"
      // Finish the level in v0 -- v3, reloading each register right after its
      // last use.
      "fmla v24.4s, v0.4s, v3.s[0]\n"
      "fmla v26.4s, v0.4s, v3.s[1]\n"
      "fmla v28.4s, v0.4s, v3.s[2]\n"
      "fmla v30.4s, v0.4s, v3.s[3]\n"
      "ldr q0, [%[lhs_ptr], #-128]\n"
      "fmla v25.4s, v1.4s, v3.s[0]\n"
      "fmla v27.4s, v1.4s, v3.s[1]\n"
      "fmla v29.4s, v1.4s, v3.s[2]\n"
      "fmla v31.4s, v1.4s, v3.s[3]\n"
      "ldr q3, [%[rhs_ptr], #-112]\n"
      "fmla v17.4s, v1.4s, v2.s[0]\n"
      "fmla v19.4s, v1.4s, v2.s[1]\n"
      "fmla v21.4s, v1.4s, v2.s[2]\n"
      "fmla v23.4s, v1.4s, v2.s[3]\n"
      "ldr q1, [%[lhs_ptr], #-112]\n"
      // Level held in v4 -- v7.
      "fmla v16.4s, v4.4s, v6.s[0]\n"
      "fmla v18.4s, v4.4s, v6.s[1]\n"
      "ldr q2, [%[rhs_ptr], #-128]\n"
      "fmla v20.4s, v4.4s, v6.s[2]\n"
      "fmla v22.4s, v4.4s, v6.s[3]\n"
      "fmla v24.4s, v4.4s, v7.s[0]\n"
      "fmla v26.4s, v4.4s, v7.s[1]\n"
      "fmla v28.4s, v4.4s, v7.s[2]\n"
      "fmla v30.4s, v4.4s, v7.s[3]\n"
      "ldr q4, [%[lhs_ptr], #-96]\n"
      "fmla v25.4s, v5.4s, v7.s[0]\n"
      "fmla v27.4s, v5.4s, v7.s[1]\n"
      "fmla v29.4s, v5.4s, v7.s[2]\n"
      "fmla v31.4s, v5.4s, v7.s[3]\n"
      "ldr q7, [%[rhs_ptr], #-80]\n"
      "fmla v17.4s, v5.4s, v6.s[0]\n"
      "fmla v19.4s, v5.4s, v6.s[1]\n"
      "fmla v21.4s, v5.4s, v6.s[2]\n"
      "fmla v23.4s, v5.4s, v6.s[3]\n"
      "ldr q5, [%[lhs_ptr], #-80]\n"
      // Level held in v8 -- v11.
      "fmla v16.4s, v8.4s, v10.s[0]\n"
      "fmla v18.4s, v8.4s, v10.s[1]\n"
      "ldr q6, [%[rhs_ptr], #-96]\n"
      "fmla v20.4s, v8.4s, v10.s[2]\n"
      "fmla v22.4s, v8.4s, v10.s[3]\n"
      "fmla v24.4s, v8.4s, v11.s[0]\n"
      "fmla v26.4s, v8.4s, v11.s[1]\n"
      "fmla v28.4s, v8.4s, v11.s[2]\n"
      "fmla v30.4s, v8.4s, v11.s[3]\n"
      "ldr q8, [%[lhs_ptr], #-64]\n"
      "fmla v25.4s, v9.4s, v11.s[0]\n"
      "fmla v27.4s, v9.4s, v11.s[1]\n"
      "fmla v29.4s, v9.4s, v11.s[2]\n"
      "fmla v31.4s, v9.4s, v11.s[3]\n"
      "ldr q11, [%[rhs_ptr], #-48]\n"
      "fmla v17.4s, v9.4s, v10.s[0]\n"
      "fmla v19.4s, v9.4s, v10.s[1]\n"
      "fmla v21.4s, v9.4s, v10.s[2]\n"
      "fmla v23.4s, v9.4s, v10.s[3]\n"
      "ldr q9, [%[lhs_ptr], #-48]\n"
      // Level held in v12 -- v15.
      "fmla v16.4s, v12.4s, v14.s[0]\n"
      "fmla v18.4s, v12.4s, v14.s[1]\n"
      "ldr q10, [%[rhs_ptr], #-64]\n"
      "fmla v20.4s, v12.4s, v14.s[2]\n"
      "fmla v22.4s, v12.4s, v14.s[3]\n"
      "fmla v24.4s, v12.4s, v15.s[0]\n"
      "fmla v26.4s, v12.4s, v15.s[1]\n"
      "fmla v28.4s, v12.4s, v15.s[2]\n"
      "fmla v30.4s, v12.4s, v15.s[3]\n"
      "ldr q12, [%[lhs_ptr], #-32]\n"
      "fmla v25.4s, v13.4s, v15.s[0]\n"
      "fmla v27.4s, v13.4s, v15.s[1]\n"
      "fmla v29.4s, v13.4s, v15.s[2]\n"
      "fmla v31.4s, v13.4s, v15.s[3]\n"
      "ldr q15, [%[rhs_ptr], #-16]\n"
      "fmla v17.4s, v13.4s, v14.s[0]\n"
      "fmla v19.4s, v13.4s, v14.s[1]\n"
      "fmla v21.4s, v13.4s, v14.s[2]\n"
      "fmla v23.4s, v13.4s, v14.s[3]\n"
      "ldr q13, [%[lhs_ptr], #-16]\n"
      // First quarter of the freshly reloaded v0 -- v3 level, mirroring the
      // four fmla at label 1.
      "fmla v16.4s, v0.4s, v2.s[0]\n"
      "fmla v18.4s, v0.4s, v2.s[1]\n"
      "ldr q14, [%[rhs_ptr], #-32]\n"
      "fmla v20.4s, v0.4s, v2.s[2]\n"
      "fmla v22.4s, v0.4s, v2.s[3]\n"
      // Four more levels have been loaded; keep streaming while w1 < w2.
      "add w1, w1, #4\n"
      "cmp w1, w2\n"
      "blt 80b\n"
      // Streaming loop done: consume the three levels still pending in
      // v4 -- v15. The level in v0 -- v3 is finished further below.
      "fmla v16.4s, v4.4s, v6.s[0]\n"
      "fmla v18.4s, v4.4s, v6.s[1]\n"
      "fmla v20.4s, v4.4s, v6.s[2]\n"
      "fmla v22.4s, v4.4s, v6.s[3]\n"
      "fmla v24.4s, v4.4s, v7.s[0]\n"
      "fmla v26.4s, v4.4s, v7.s[1]\n"
      "fmla v28.4s, v4.4s, v7.s[2]\n"
      "fmla v30.4s, v4.4s, v7.s[3]\n"
      "fmla v25.4s, v5.4s, v7.s[0]\n"
      "fmla v27.4s, v5.4s, v7.s[1]\n"
      "fmla v29.4s, v5.4s, v7.s[2]\n"
      "fmla v31.4s, v5.4s, v7.s[3]\n"
      "fmla v17.4s, v5.4s, v6.s[0]\n"
      "fmla v19.4s, v5.4s, v6.s[1]\n"
      "fmla v21.4s, v5.4s, v6.s[2]\n"
      "fmla v23.4s, v5.4s, v6.s[3]\n"
      "fmla v16.4s, v8.4s, v10.s[0]\n"
      "fmla v18.4s, v8.4s, v10.s[1]\n"
      "fmla v20.4s, v8.4s, v10.s[2]\n"
      "fmla v22.4s, v8.4s, v10.s[3]\n"
      "fmla v24.4s, v8.4s, v11.s[0]\n"
      "fmla v26.4s, v8.4s, v11.s[1]\n"
      "fmla v28.4s, v8.4s, v11.s[2]\n"
      "fmla v30.4s, v8.4s, v11.s[3]\n"
      "fmla v25.4s, v9.4s, v11.s[0]\n"
      "fmla v27.4s, v9.4s, v11.s[1]\n"
      "fmla v29.4s, v9.4s, v11.s[2]\n"
      "fmla v31.4s, v9.4s, v11.s[3]\n"
      "fmla v17.4s, v9.4s, v10.s[0]\n"
      "fmla v19.4s, v9.4s, v10.s[1]\n"
      "fmla v21.4s, v9.4s, v10.s[2]\n"
      "fmla v23.4s, v9.4s, v10.s[3]\n"
      "fmla v16.4s, v12.4s, v14.s[0]\n"
      "fmla v18.4s, v12.4s, v14.s[1]\n"
      "fmla v20.4s, v12.4s, v14.s[2]\n"
      "fmla v22.4s, v12.4s, v14.s[3]\n"
      "fmla v24.4s, v12.4s, v15.s[0]\n"
      "fmla v26.4s, v12.4s, v15.s[1]\n"
      "fmla v28.4s, v12.4s, v15.s[2]\n"
      "fmla v30.4s, v12.4s, v15.s[3]\n"
      "fmla v25.4s, v13.4s, v15.s[0]\n"
      "fmla v27.4s, v13.4s, v15.s[1]\n"
      "fmla v29.4s, v13.4s, v15.s[2]\n"
      "fmla v31.4s, v13.4s, v15.s[3]\n"
      "fmla v17.4s, v13.4s, v14.s[0]\n"
      "fmla v19.4s, v13.4s, v14.s[1]\n"
      "fmla v21.4s, v13.4s, v14.s[2]\n"
      "fmla v23.4s, v13.4s, v14.s[3]\n"
      "78:\n"
#endif
      // If every level of depth has already been loaded, skip the generic
      // loop and just finish the level in v0 -- v3.
      "cmp w1, w12\n"
      "beq 79f\n"
      // Generic loop, one level of depth per iteration: finish the level in
      // v0 -- v3 while loading the next one, then start the next level.
      "2:\n"
      "fmla v24.4s, v0.4s, v3.s[0]\n"
      "fmla v26.4s, v0.4s, v3.s[1]\n"
      // v4 temporarily holds the next level's first 4 RHS floats, because v2
      // is still needed below.
      "ld1 {v4.4s}, [%[rhs_ptr]], #16\n"
      "fmla v28.4s, v0.4s, v3.s[2]\n"
      "fmla v30.4s, v0.4s, v3.s[3]\n"
      "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
      "fmla v25.4s, v1.4s, v3.s[0]\n"
      "fmla v27.4s, v1.4s, v3.s[1]\n"
      "add w1, w1, #1\n"
      "fmla v29.4s, v1.4s, v3.s[2]\n"
      "fmla v31.4s, v1.4s, v3.s[3]\n"
      "ld1 {v3.4s}, [%[rhs_ptr]], #16\n"
      "fmla v17.4s, v1.4s, v2.s[0]\n"
      "fmla v19.4s, v1.4s, v2.s[1]\n"
      // Sets the flags for the "blt 2b" below; the instructions in between
      // do not modify them.
      "cmp w1, w12\n"
      "fmla v21.4s, v1.4s, v2.s[2]\n"
      "fmla v23.4s, v1.4s, v2.s[3]\n"
      "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
      "fmla v16.4s, v0.4s, v4.s[0]\n"
      "fmla v18.4s, v0.4s, v4.s[1]\n"
      "mov v2.16b, v4.16b\n"
      "fmla v20.4s, v0.4s, v4.s[2]\n"
      "fmla v22.4s, v0.4s, v4.s[3]\n"
      "blt 2b\n"
      "79:\n"
      // Finish the last level of depth, whose first quarter is already
      // accumulated.
      "fmla v24.4s, v0.4s, v3.s[0]\n"
      "fmla v26.4s, v0.4s, v3.s[1]\n"
      "fmla v28.4s, v0.4s, v3.s[2]\n"
      "fmla v30.4s, v0.4s, v3.s[3]\n"
      "fmla v25.4s, v1.4s, v3.s[0]\n"
      "fmla v27.4s, v1.4s, v3.s[1]\n"
      "fmla v29.4s, v1.4s, v3.s[2]\n"
      "fmla v31.4s, v1.4s, v3.s[3]\n"
      "fmla v17.4s, v1.4s, v2.s[0]\n"
      "fmla v19.4s, v1.4s, v2.s[1]\n"
      "fmla v21.4s, v1.4s, v2.s[2]\n"
      "fmla v23.4s, v1.4s, v2.s[3]\n"
      // Accumulation for this block is complete. Work out which LHS/RHS data
      // the next block needs.
      "cmp %w[row], w7\n"
      "bge 4f\n"
      // Not at the last row yet: step the LHS by 8 * lhs_stride.
      "add %[lhs_col_ptr], %[lhs_col_ptr], x9, lsl #3\n"
      "b 5f\n"
      "4:\n"
      // At the last row: rewind the LHS to its base, and step the RHS by
      // 8 * rhs_stride unless this is also the last column.
      "mov %[lhs_col_ptr], x5\n"
      "cmp %w[col], w8\n"
      "bge 5f\n"
      "add %[rhs_col_ptr], %[rhs_col_ptr], x10, lsl #3\n"
      "5:\n"
      "mov %[lhs_ptr], %[lhs_col_ptr]\n"
      "mov %[rhs_ptr], %[rhs_col_ptr]\n"
      // Load the flags byte and the bias pointer.
      "ldrb w4, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
      "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
      // w3 = channel index: row when the channel-is-col flag is clear, col
      // when it is set.
      "tst w4, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
      "csel w3, %w[row], %w[col], eq\n"
      // x5 = bias pointer offset by the channel index (4 bytes per float).
      "add x5, x1, x3, lsl #2\n"
      // Without the has-bias flag the un-offset pointer is used as is
      // (presumably it then addresses zero data -- confirm with the caller).
      "tst w4, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
      "csel x1, x1, x5, eq\n"
      // Load 8 bias values into v14/v15.
      "ld1 {v14.4s}, [x1], #16\n"
      "ld1 {v15.4s}, [x1]\n"
      // v0 -- v3 are no longer needed for this block, so preload the first
      // level of depth of the next block now.
      "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
      "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
      "ld1 {v2.4s}, [%[rhs_ptr]], #16\n"
      "ld1 {v3.4s}, [%[rhs_ptr]], #16\n"
      // Bias addition, branching on the channel dimension.
      "tst w4, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
      "bne 6f\n"
      // Channels are rows: v14 applies to rows 0-3, v15 to rows 4-7 of every
      // column.
      "fadd v16.4s, v16.4s, v14.4s\n"
      "fadd v17.4s, v17.4s, v15.4s\n"
      "fadd v18.4s, v18.4s, v14.4s\n"
      "fadd v19.4s, v19.4s, v15.4s\n"
      "fadd v20.4s, v20.4s, v14.4s\n"
      "fadd v21.4s, v21.4s, v15.4s\n"
      "fadd v22.4s, v22.4s, v14.4s\n"
      "fadd v23.4s, v23.4s, v15.4s\n"
      "fadd v24.4s, v24.4s, v14.4s\n"
      "fadd v25.4s, v25.4s, v15.4s\n"
      "fadd v26.4s, v26.4s, v14.4s\n"
      "fadd v27.4s, v27.4s, v15.4s\n"
      "fadd v28.4s, v28.4s, v14.4s\n"
      "fadd v29.4s, v29.4s, v15.4s\n"
      "fadd v30.4s, v30.4s, v14.4s\n"
      "fadd v31.4s, v31.4s, v15.4s\n"
      "b 7f\n"
      "6:\n"
      // Channels are columns: broadcast each of the 8 bias values into its
      // own register (v8 -- v15) and add it to both halves of its column.
      "dup v8.4s, v14.s[0]\n"
      "dup v9.4s, v14.s[1]\n"
      "dup v10.4s, v14.s[2]\n"
      "dup v11.4s, v14.s[3]\n"
      "dup v12.4s, v15.s[0]\n"
      "dup v13.4s, v15.s[1]\n"
      "dup v14.4s, v15.s[2]\n"
      "dup v15.4s, v15.s[3]\n"
      "fadd v16.4s, v16.4s, v8.4s\n"
      "fadd v17.4s, v17.4s, v8.4s\n"
      "fadd v18.4s, v18.4s, v9.4s\n"
      "fadd v19.4s, v19.4s, v9.4s\n"
      "fadd v20.4s, v20.4s, v10.4s\n"
      "fadd v21.4s, v21.4s, v10.4s\n"
      "fadd v22.4s, v22.4s, v11.4s\n"
      "fadd v23.4s, v23.4s, v11.4s\n"
      "fadd v24.4s, v24.4s, v12.4s\n"
      "fadd v25.4s, v25.4s, v12.4s\n"
      "fadd v26.4s, v26.4s, v13.4s\n"
      "fadd v27.4s, v27.4s, v13.4s\n"
      "fadd v28.4s, v28.4s, v14.4s\n"
      "fadd v29.4s, v29.4s, v14.4s\n"
      "fadd v30.4s, v30.4s, v15.4s\n"
      "fadd v31.4s, v31.4s, v15.4s\n"
      "7:\n"
      // Broadcast clamp_min into v14 and clamp_max into v15.
      "ldr w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
      "ldr w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
      "dup v14.4s, w2\n"
      "dup v15.4s, w3\n"
      // Apply the lower bound.
      "fmax v16.4s, v16.4s, v14.4s\n"
      "fmax v17.4s, v17.4s, v14.4s\n"
      "fmax v18.4s, v18.4s, v14.4s\n"
      "fmax v19.4s, v19.4s, v14.4s\n"
      "fmax v20.4s, v20.4s, v14.4s\n"
      "fmax v21.4s, v21.4s, v14.4s\n"
      "fmax v22.4s, v22.4s, v14.4s\n"
      "fmax v23.4s, v23.4s, v14.4s\n"
      "fmax v24.4s, v24.4s, v14.4s\n"
      "fmax v25.4s, v25.4s, v14.4s\n"
      "fmax v26.4s, v26.4s, v14.4s\n"
      "fmax v27.4s, v27.4s, v14.4s\n"
      "fmax v28.4s, v28.4s, v14.4s\n"
      "fmax v29.4s, v29.4s, v14.4s\n"
      "fmax v30.4s, v30.4s, v14.4s\n"
      "fmax v31.4s, v31.4s, v14.4s\n"
      // Apply the upper bound.
      "fmin v16.4s, v16.4s, v15.4s\n"
      "fmin v17.4s, v17.4s, v15.4s\n"
      "fmin v18.4s, v18.4s, v15.4s\n"
      "fmin v19.4s, v19.4s, v15.4s\n"
      "fmin v20.4s, v20.4s, v15.4s\n"
      "fmin v21.4s, v21.4s, v15.4s\n"
      "fmin v22.4s, v22.4s, v15.4s\n"
      "fmin v23.4s, v23.4s, v15.4s\n"
      "fmin v24.4s, v24.4s, v15.4s\n"
      "fmin v25.4s, v25.4s, v15.4s\n"
      "fmin v26.4s, v26.4s, v15.4s\n"
      "fmin v27.4s, v27.4s, v15.4s\n"
      "fmin v28.4s, v28.4s, v15.4s\n"
      "fmin v29.4s, v29.4s, v15.4s\n"
      "fmin v30.4s, v30.4s, v15.4s\n"
      "fmin v31.4s, v31.4s, v15.4s\n"
      // w1 = min(dst_rows - row, 8), w2 = min(dst_cols - col, 8): how much of
      // the 8x8 block fits in the destination.
      "sub w1, %w[dst_rows], %w[row]\n"
      "sub w2, %w[dst_cols], %w[col]\n"
      "mov w3, #8\n"
      "cmp w1, #8\n"
      "csel w1, w1, w3, le\n"
      "cmp w2, #8\n"
      "csel w2, w2, w3, le\n"
      // Flags end up "eq" only when w1 == 8 && w2 == 8, i.e. the whole block
      // fits.
      "cmp w1, w3\n"
      "ccmp w2, w3, 0, eq\n"
      "beq 30f\n"
      // Partial block: store into dst_tmp_buf, 32 bytes per column.
      "mov x3, %[dst_tmp_buf]\n"
      "mov x4, #32\n"
      "b 31f\n"
      "30:\n"
      // Full block: store straight to the destination, dst_stride apart.
      "mov x3, %[dst_ptr]\n"
      "mov x4, x11\n"
      "31:\n"
      // Store the 8 columns to (x3 address, x4 stride), clearing each
      // accumulator pair for the next block once it has been stored. None of
      // these instructions touch the flags set by the cmp/ccmp above.
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      "str q16, [x3, #0]\n"
      "str q17, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v16)
      RUY_MAKE_ZERO(v17)
      "str q18, [x3, #0]\n"
      "str q19, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v18)
      RUY_MAKE_ZERO(v19)
      "str q20, [x3, #0]\n"
      "str q21, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v20)
      RUY_MAKE_ZERO(v21)
      "str q22, [x3, #0]\n"
      "str q23, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v22)
      RUY_MAKE_ZERO(v23)
      "str q24, [x3, #0]\n"
      "str q25, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v24)
      RUY_MAKE_ZERO(v25)
      "str q26, [x3, #0]\n"
      "str q27, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v26)
      RUY_MAKE_ZERO(v27)
      "str q28, [x3, #0]\n"
      "str q29, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v28)
      RUY_MAKE_ZERO(v29)
      "str q30, [x3, #0]\n"
      "str q31, [x3, #16]\n"
      RUY_MAKE_ZERO(v30)
      RUY_MAKE_ZERO(v31)
      // Same flags as at "beq 30f": a full block is already in place.
      "beq 41f\n"
      // Partial block: copy the w1 x w2 part that fits from dst_tmp_buf to
      // the destination, one 32-bit value at a time.
      "mov x3, %[dst_tmp_buf]\n"
      "mov x4, %[dst_ptr]\n"
      // w6 = column counter.
      "mov w6, #0\n"
      "50:\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      // w5 = row counter within the current column.
      "mov w5, #0\n"
      "51:\n"
      "ldr w7, [x3, x5, lsl #2]\n"
      "str w7, [x4, x5, lsl #2]\n"
      "add w5, w5, #1\n"
      "cmp w5, w1\n"
      "blt 51b\n"
      "add w6, w6, #1\n"
      "add x3, x3, #32\n"
      "add x4, x4, x11\n"
      "cmp w6, w2\n"
      "blt 50b\n"
      "41:\n"
      // Step dst_ptr down by 8 rows (8 floats = 32 bytes).
      "add %[dst_ptr], %[dst_ptr], #32\n"
      // x5 -- x7 were used as scratch above; reload their params.
      "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
      "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
      // Advance (row, col) to the next block; lhs_col_ptr / rhs_col_ptr were
      // already updated before the bias addition.
      "cmp %w[row], w7\n"
      "beq 20f\n"
      "add %w[row], %w[row], #8\n"
      "b 21f\n"
      "20:\n"
      // Was at the last row: back to start_row, 8 columns to the right.
      "mov %w[row], w6\n"
      "add %w[col], %w[col], #8\n"
      "add %[dst_col_ptr], %[dst_col_ptr], x11, lsl #3\n"
      "mov %[dst_ptr], %[dst_col_ptr]\n"
      "21:\n"
      // Loop while col <= last_col. The mov in between does not alter the
      // flags; it resets w1 to 1 to account for the level preloaded above.
      "cmp %w[col], w8\n"
      "mov w1, #1\n"
      "ble 1b\n"
      : [ lhs_col_ptr ] "+r"(lhs_col_ptr), [rhs_col_ptr] "+r"(rhs_col_ptr),
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [dst_col_ptr] "+r"(dst_col_ptr), [dst_ptr] "+r"(dst_ptr), [row] "+r"(row), [col] "+r"(col)
      // The asm dereferences %[params] as the address of the params struct.
      : [ params ] "r"(&params), [dst_rows] "r"(params.dst_rows),
        [dst_cols] "r"(params.dst_cols), [dst_tmp_buf] "r"(params.dst_tmp_buf)
      : "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc",
        "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
        "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
        "v26", "v27", "v28", "v29", "v30", "v31");
}
// Variant of the float NEON kernel whose depth loop always handles exactly one
// level of depth per iteration (there is no 4-level streaming path here).
//
// The kernel walks the destination in 8x8 blocks. It starts at
// (params.start_row, params.start_col), advances 8 rows at a time until
// params.last_row is reached, then wraps back to start_row and moves 8 columns
// to the right, and stops once the column index exceeds params.last_col.
//
// For each block it accumulates over params.depth levels into v16 -- v31 (each
// destination column in two registers: rows 0-3 and rows 4-7), adds the bias
// per row or per column depending on RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL,
// clamps to [params.clamp_min, params.clamp_max], and stores the block, going
// through params.dst_tmp_buf and a scalar copy loop when the block only
// partially fits in the destination.
//
// Register roles inside the asm (loaded once unless noted):
//   x5  = lhs_base_ptr (reloaded after each block, x5 is reused as scratch)
//   w6  = start_row, w7 = last_row (both reloaded after each block)
//   w8  = last_col
//   x9  = lhs_stride, x10 = rhs_stride, x11 = dst_stride; all three are added
//         to byte addresses, x9/x10 scaled by 8 to step over 8 rows/columns
//   w12 = depth
//   w1  = number of levels of depth already loaded for the current block
void KernelFloatNeonX1(const KernelParamsFloat<8, 8>& params) {
  CheckOffsetsInKernelParamsFloat(params);
  profiler::ScopeLabel label("Kernel (kNeon) X1");
  const float* lhs_col_ptr = params.lhs_base_ptr;
  const float* rhs_col_ptr = params.rhs_base_ptr;
  const float* lhs_ptr = lhs_col_ptr;
  const float* rhs_ptr = rhs_col_ptr;
  float* dst_col_ptr = params.dst_base_ptr;
  float* dst_ptr = dst_col_ptr;
  int row = params.start_row;
  int col = params.start_col;
  asm volatile(
#define RUY_MAKE_ZERO …
      // Load the params that stay in registers for the whole kernel.
      "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
      "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
      "ldr w8, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
      "ldr w9, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
      "ldr w10, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
      "ldr w11, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
      "ldr w12, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"
      // Load the first level of depth: 8 LHS floats in v0/v1, 8 RHS floats in
      // v2/v3.
      "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
      "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
      "ld1 {v2.4s}, [%[rhs_ptr]], #16\n"
      "ld1 {v3.4s}, [%[rhs_ptr]], #16\n"
      // Clear the 16 accumulators.
      RUY_MAKE_ZERO(v16)
      RUY_MAKE_ZERO(v17)
      RUY_MAKE_ZERO(v18)
      RUY_MAKE_ZERO(v19)
      RUY_MAKE_ZERO(v20)
      RUY_MAKE_ZERO(v21)
      RUY_MAKE_ZERO(v22)
      RUY_MAKE_ZERO(v23)
      RUY_MAKE_ZERO(v24)
      RUY_MAKE_ZERO(v25)
      RUY_MAKE_ZERO(v26)
      RUY_MAKE_ZERO(v27)
      RUY_MAKE_ZERO(v28)
      RUY_MAKE_ZERO(v29)
      RUY_MAKE_ZERO(v30)
      RUY_MAKE_ZERO(v31)
      // One level of depth has been loaded so far.
      "mov w1, #1\n"
      // Main loop over destination blocks.
      "1:\n"
      // First quarter of the multiply-adds for the level held in v0 -- v3.
      "fmla v16.4s, v0.4s, v2.s[0]\n"
      "fmla v18.4s, v0.4s, v2.s[1]\n"
      "fmla v20.4s, v0.4s, v2.s[2]\n"
      "fmla v22.4s, v0.4s, v2.s[3]\n"
      // If every level of depth has already been loaded (depth == 1), skip
      // the loop and just finish the level in v0 -- v3.
      "cmp w1, w12\n"
      "beq 79f\n"
      // Depth loop, one level per iteration: finish the level in v0 -- v3
      // while loading the next one, then start the next level.
      "2:\n"
      "fmla v24.4s, v0.4s, v3.s[0]\n"
      "fmla v26.4s, v0.4s, v3.s[1]\n"
      // v4 temporarily holds the next level's first 4 RHS floats, because v2
      // is still needed below.
      "ld1 {v4.4s}, [%[rhs_ptr]], #16\n"
      "fmla v28.4s, v0.4s, v3.s[2]\n"
      "fmla v30.4s, v0.4s, v3.s[3]\n"
      "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
      "fmla v25.4s, v1.4s, v3.s[0]\n"
      "fmla v27.4s, v1.4s, v3.s[1]\n"
      "add w1, w1, #1\n"
      "fmla v29.4s, v1.4s, v3.s[2]\n"
      "fmla v31.4s, v1.4s, v3.s[3]\n"
      "ld1 {v3.4s}, [%[rhs_ptr]], #16\n"
      "fmla v17.4s, v1.4s, v2.s[0]\n"
      "fmla v19.4s, v1.4s, v2.s[1]\n"
      // Sets the flags for the "blt 2b" below; the instructions in between
      // do not modify them.
      "cmp w1, w12\n"
      "fmla v21.4s, v1.4s, v2.s[2]\n"
      "fmla v23.4s, v1.4s, v2.s[3]\n"
      "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
      "fmla v16.4s, v0.4s, v4.s[0]\n"
      "fmla v18.4s, v0.4s, v4.s[1]\n"
      "mov v2.16b, v4.16b\n"
      "fmla v20.4s, v0.4s, v4.s[2]\n"
      "fmla v22.4s, v0.4s, v4.s[3]\n"
      "blt 2b\n"
      "79:\n"
      // Finish the last level of depth, whose first quarter is already
      // accumulated.
      "fmla v24.4s, v0.4s, v3.s[0]\n"
      "fmla v26.4s, v0.4s, v3.s[1]\n"
      "fmla v28.4s, v0.4s, v3.s[2]\n"
      "fmla v30.4s, v0.4s, v3.s[3]\n"
      "fmla v25.4s, v1.4s, v3.s[0]\n"
      "fmla v27.4s, v1.4s, v3.s[1]\n"
      "fmla v29.4s, v1.4s, v3.s[2]\n"
      "fmla v31.4s, v1.4s, v3.s[3]\n"
      "fmla v17.4s, v1.4s, v2.s[0]\n"
      "fmla v19.4s, v1.4s, v2.s[1]\n"
      "fmla v21.4s, v1.4s, v2.s[2]\n"
      "fmla v23.4s, v1.4s, v2.s[3]\n"
      // Accumulation for this block is complete. Work out which LHS/RHS data
      // the next block needs.
      "cmp %w[row], w7\n"
      "bge 4f\n"
      // Not at the last row yet: step the LHS by 8 * lhs_stride.
      "add %[lhs_col_ptr], %[lhs_col_ptr], x9, lsl #3\n"
      "b 5f\n"
      "4:\n"
      // At the last row: rewind the LHS to its base, and step the RHS by
      // 8 * rhs_stride unless this is also the last column.
      "mov %[lhs_col_ptr], x5\n"
      "cmp %w[col], w8\n"
      "bge 5f\n"
      "add %[rhs_col_ptr], %[rhs_col_ptr], x10, lsl #3\n"
      "5:\n"
      "mov %[lhs_ptr], %[lhs_col_ptr]\n"
      "mov %[rhs_ptr], %[rhs_col_ptr]\n"
      // Load the flags byte and the bias pointer.
      "ldrb w4, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
      "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
      // w3 = channel index: row when the channel-is-col flag is clear, col
      // when it is set.
      "tst w4, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
      "csel w3, %w[row], %w[col], eq\n"
      // x5 = bias pointer offset by the channel index (4 bytes per float).
      "add x5, x1, x3, lsl #2\n"
      // Without the has-bias flag the un-offset pointer is used as is
      // (presumably it then addresses zero data -- confirm with the caller).
      "tst w4, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
      "csel x1, x1, x5, eq\n"
      // Load 8 bias values into v14/v15.
      "ld1 {v14.4s}, [x1], #16\n"
      "ld1 {v15.4s}, [x1]\n"
      // v0 -- v3 are no longer needed for this block, so preload the first
      // level of depth of the next block now.
      "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
      "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
      "ld1 {v2.4s}, [%[rhs_ptr]], #16\n"
      "ld1 {v3.4s}, [%[rhs_ptr]], #16\n"
      // Bias addition, branching on the channel dimension.
      "tst w4, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
      "bne 6f\n"
      // Channels are rows: v14 applies to rows 0-3, v15 to rows 4-7 of every
      // column.
      "fadd v16.4s, v16.4s, v14.4s\n"
      "fadd v17.4s, v17.4s, v15.4s\n"
      "fadd v18.4s, v18.4s, v14.4s\n"
      "fadd v19.4s, v19.4s, v15.4s\n"
      "fadd v20.4s, v20.4s, v14.4s\n"
      "fadd v21.4s, v21.4s, v15.4s\n"
      "fadd v22.4s, v22.4s, v14.4s\n"
      "fadd v23.4s, v23.4s, v15.4s\n"
      "fadd v24.4s, v24.4s, v14.4s\n"
      "fadd v25.4s, v25.4s, v15.4s\n"
      "fadd v26.4s, v26.4s, v14.4s\n"
      "fadd v27.4s, v27.4s, v15.4s\n"
      "fadd v28.4s, v28.4s, v14.4s\n"
      "fadd v29.4s, v29.4s, v15.4s\n"
      "fadd v30.4s, v30.4s, v14.4s\n"
      "fadd v31.4s, v31.4s, v15.4s\n"
      "b 7f\n"
      "6:\n"
      // Channels are columns: broadcast each of the 8 bias values into its
      // own register (v8 -- v15) and add it to both halves of its column.
      "dup v8.4s, v14.s[0]\n"
      "dup v9.4s, v14.s[1]\n"
      "dup v10.4s, v14.s[2]\n"
      "dup v11.4s, v14.s[3]\n"
      "dup v12.4s, v15.s[0]\n"
      "dup v13.4s, v15.s[1]\n"
      "dup v14.4s, v15.s[2]\n"
      "dup v15.4s, v15.s[3]\n"
      "fadd v16.4s, v16.4s, v8.4s\n"
      "fadd v17.4s, v17.4s, v8.4s\n"
      "fadd v18.4s, v18.4s, v9.4s\n"
      "fadd v19.4s, v19.4s, v9.4s\n"
      "fadd v20.4s, v20.4s, v10.4s\n"
      "fadd v21.4s, v21.4s, v10.4s\n"
      "fadd v22.4s, v22.4s, v11.4s\n"
      "fadd v23.4s, v23.4s, v11.4s\n"
      "fadd v24.4s, v24.4s, v12.4s\n"
      "fadd v25.4s, v25.4s, v12.4s\n"
      "fadd v26.4s, v26.4s, v13.4s\n"
      "fadd v27.4s, v27.4s, v13.4s\n"
      "fadd v28.4s, v28.4s, v14.4s\n"
      "fadd v29.4s, v29.4s, v14.4s\n"
      "fadd v30.4s, v30.4s, v15.4s\n"
      "fadd v31.4s, v31.4s, v15.4s\n"
      "7:\n"
      // Broadcast clamp_min into v14 and clamp_max into v15.
      "ldr w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
      "ldr w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
      "dup v14.4s, w2\n"
      "dup v15.4s, w3\n"
      // Apply the lower bound.
      "fmax v16.4s, v16.4s, v14.4s\n"
      "fmax v17.4s, v17.4s, v14.4s\n"
      "fmax v18.4s, v18.4s, v14.4s\n"
      "fmax v19.4s, v19.4s, v14.4s\n"
      "fmax v20.4s, v20.4s, v14.4s\n"
      "fmax v21.4s, v21.4s, v14.4s\n"
      "fmax v22.4s, v22.4s, v14.4s\n"
      "fmax v23.4s, v23.4s, v14.4s\n"
      "fmax v24.4s, v24.4s, v14.4s\n"
      "fmax v25.4s, v25.4s, v14.4s\n"
      "fmax v26.4s, v26.4s, v14.4s\n"
      "fmax v27.4s, v27.4s, v14.4s\n"
      "fmax v28.4s, v28.4s, v14.4s\n"
      "fmax v29.4s, v29.4s, v14.4s\n"
      "fmax v30.4s, v30.4s, v14.4s\n"
      "fmax v31.4s, v31.4s, v14.4s\n"
      // Apply the upper bound.
      "fmin v16.4s, v16.4s, v15.4s\n"
      "fmin v17.4s, v17.4s, v15.4s\n"
      "fmin v18.4s, v18.4s, v15.4s\n"
      "fmin v19.4s, v19.4s, v15.4s\n"
      "fmin v20.4s, v20.4s, v15.4s\n"
      "fmin v21.4s, v21.4s, v15.4s\n"
      "fmin v22.4s, v22.4s, v15.4s\n"
      "fmin v23.4s, v23.4s, v15.4s\n"
      "fmin v24.4s, v24.4s, v15.4s\n"
      "fmin v25.4s, v25.4s, v15.4s\n"
      "fmin v26.4s, v26.4s, v15.4s\n"
      "fmin v27.4s, v27.4s, v15.4s\n"
      "fmin v28.4s, v28.4s, v15.4s\n"
      "fmin v29.4s, v29.4s, v15.4s\n"
      "fmin v30.4s, v30.4s, v15.4s\n"
      "fmin v31.4s, v31.4s, v15.4s\n"
      // w1 = min(dst_rows - row, 8), w2 = min(dst_cols - col, 8): how much of
      // the 8x8 block fits in the destination.
      "sub w1, %w[dst_rows], %w[row]\n"
      "sub w2, %w[dst_cols], %w[col]\n"
      "mov w3, #8\n"
      "cmp w1, #8\n"
      "csel w1, w1, w3, le\n"
      "cmp w2, #8\n"
      "csel w2, w2, w3, le\n"
      // Flags end up "eq" only when w1 == 8 && w2 == 8, i.e. the whole block
      // fits.
      "cmp w1, w3\n"
      "ccmp w2, w3, 0, eq\n"
      "beq 30f\n"
      // Partial block: store into dst_tmp_buf, 32 bytes per column.
      "mov x3, %[dst_tmp_buf]\n"
      "mov x4, #32\n"
      "b 31f\n"
      "30:\n"
      // Full block: store straight to the destination, dst_stride apart.
      "mov x3, %[dst_ptr]\n"
      "mov x4, x11\n"
      "31:\n"
      // Store the 8 columns to (x3 address, x4 stride), clearing each
      // accumulator pair for the next block once it has been stored. None of
      // these instructions touch the flags set by the cmp/ccmp above.
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      "str q16, [x3, #0]\n"
      "str q17, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v16)
      RUY_MAKE_ZERO(v17)
      "str q18, [x3, #0]\n"
      "str q19, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v18)
      RUY_MAKE_ZERO(v19)
      "str q20, [x3, #0]\n"
      "str q21, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v20)
      RUY_MAKE_ZERO(v21)
      "str q22, [x3, #0]\n"
      "str q23, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v22)
      RUY_MAKE_ZERO(v23)
      "str q24, [x3, #0]\n"
      "str q25, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v24)
      RUY_MAKE_ZERO(v25)
      "str q26, [x3, #0]\n"
      "str q27, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v26)
      RUY_MAKE_ZERO(v27)
      "str q28, [x3, #0]\n"
      "str q29, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v28)
      RUY_MAKE_ZERO(v29)
      "str q30, [x3, #0]\n"
      "str q31, [x3, #16]\n"
      RUY_MAKE_ZERO(v30)
      RUY_MAKE_ZERO(v31)
      // Same flags as at "beq 30f": a full block is already in place.
      "beq 41f\n"
      // Partial block: copy the w1 x w2 part that fits from dst_tmp_buf to
      // the destination, one 32-bit value at a time.
      "mov x3, %[dst_tmp_buf]\n"
      "mov x4, %[dst_ptr]\n"
      // w6 = column counter.
      "mov w6, #0\n"
      "50:\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      // w5 = row counter within the current column.
      "mov w5, #0\n"
      "51:\n"
      "ldr w7, [x3, x5, lsl #2]\n"
      "str w7, [x4, x5, lsl #2]\n"
      "add w5, w5, #1\n"
      "cmp w5, w1\n"
      "blt 51b\n"
      "add w6, w6, #1\n"
      "add x3, x3, #32\n"
      "add x4, x4, x11\n"
      "cmp w6, w2\n"
      "blt 50b\n"
      "41:\n"
      // Step dst_ptr down by 8 rows (8 floats = 32 bytes).
      "add %[dst_ptr], %[dst_ptr], #32\n"
      // x5 -- x7 were used as scratch above; reload their params.
      "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
      "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
      // Advance (row, col) to the next block; lhs_col_ptr / rhs_col_ptr were
      // already updated before the bias addition.
      "cmp %w[row], w7\n"
      "beq 20f\n"
      "add %w[row], %w[row], #8\n"
      "b 21f\n"
      "20:\n"
      // Was at the last row: back to start_row, 8 columns to the right.
      "mov %w[row], w6\n"
      "add %w[col], %w[col], #8\n"
      "add %[dst_col_ptr], %[dst_col_ptr], x11, lsl #3\n"
      "mov %[dst_ptr], %[dst_col_ptr]\n"
      "21:\n"
      // Loop while col <= last_col. The mov in between does not alter the
      // flags; it resets w1 to 1 to account for the level preloaded above.
      "cmp %w[col], w8\n"
      "mov w1, #1\n"
      "ble 1b\n"
      : [ lhs_col_ptr ] "+r"(lhs_col_ptr), [rhs_col_ptr] "+r"(rhs_col_ptr),
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [dst_col_ptr] "+r"(dst_col_ptr), [dst_ptr] "+r"(dst_ptr), [row] "+r"(row), [col] "+r"(col)
      // The asm dereferences %[params] as the address of the params struct.
      : [ params ] "r"(&params), [dst_rows] "r"(params.dst_rows),
        [dst_cols] "r"(params.dst_cols), [dst_tmp_buf] "r"(params.dst_tmp_buf)
      : "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc",
        "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
        "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
        "v26", "v27", "v28", "v29", "v30", "v31");
}
// Variant of the float NEON kernel labelled as optimized for in-order cores.
//
// It differs from the plain kernel in its depth loop: each 128-bit operand is
// loaded as two 64-bit halves (an "ldr d" for the low half, an "ldr x" into a
// general register plus "ins" for the high half), loads are interleaved with
// the multiply-adds, and prefetches are issued 256 bytes ahead. The loop
// counter w1 counts the remaining levels of depth down to zero.
// NOTE(review): the split loads are presumably there to suit the issue rules
// of in-order cores (see the profiler label) -- confirm before relying on it.
//
// The kernel walks the destination in 8x8 blocks. It starts at
// (params.start_row, params.start_col), advances 8 rows at a time until
// params.last_row is reached, then wraps back to start_row and moves 8 columns
// to the right, and stops once the column index exceeds params.last_col.
//
// For each block it accumulates over params.depth levels into v16 -- v31 (each
// destination column in two registers: rows 0-3 and rows 4-7), adds the bias
// per row or per column depending on RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL,
// clamps to [params.clamp_min, params.clamp_max], and stores the block, going
// through params.dst_tmp_buf and a scalar copy loop when the block only
// partially fits in the destination.
//
// Register roles inside the asm (loaded once unless noted):
//   x5  = lhs_base_ptr (clobbered by the depth loop, reloaded when needed)
//   w6  = start_row, w7 = last_row (both reloaded after each block)
//   w8  = last_col
//   x9  = lhs_stride, x10 = rhs_stride, x11 = dst_stride; all three are added
//         to byte addresses, x9/x10 scaled by 8 to step over 8 rows/columns
//   w12 = depth
//   w1  = number of levels of depth still to be loaded for the current block
void KernelFloatNeonA55ish(const KernelParamsFloat<8, 8>& params) {
  profiler::ScopeLabel label("Kernel (kNeon, optimized for in-order cores)");
  CheckOffsetsInKernelParamsFloat(params);
  const float* lhs_col_ptr = params.lhs_base_ptr;
  const float* rhs_col_ptr = params.rhs_base_ptr;
  const float* lhs_ptr = lhs_col_ptr;
  const float* rhs_ptr = rhs_col_ptr;
  float* dst_col_ptr = params.dst_base_ptr;
  float* dst_ptr = dst_col_ptr;
  int row = params.start_row;
  int col = params.start_col;
  asm volatile(
#define RUY_MAKE_ZERO …
      // Load the params that stay in registers for the whole kernel.
      "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
      "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
      "ldr w8, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
      "ldr w9, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
      "ldr w10, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
      "ldr w11, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
      "ldr w12, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"
      // Clear the 16 accumulators, interleaved with the loads of the first
      // level of depth (LHS in v0/v1, RHS in v2/v3) and with prefetches of
      // upcoming LHS/RHS data.
      RUY_MAKE_ZERO(v16)
      "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
      RUY_MAKE_ZERO(v17)
      "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
      RUY_MAKE_ZERO(v18)
      "ld1 {v2.4s}, [%[rhs_ptr]], #16\n"
      RUY_MAKE_ZERO(v19)
      "ld1 {v3.4s}, [%[rhs_ptr]], #16\n"
      RUY_MAKE_ZERO(v20)
      RUY_PREFETCH_LOAD("prfm pldl1keep, [%[lhs_ptr], #64]\n")
      RUY_MAKE_ZERO(v21)
      RUY_PREFETCH_LOAD("prfm pldl1keep, [%[rhs_ptr], #64]\n")
      RUY_MAKE_ZERO(v22)
      RUY_PREFETCH_LOAD("prfm pldl1keep, [%[lhs_ptr], #128]\n")
      RUY_MAKE_ZERO(v23)
      RUY_PREFETCH_LOAD("prfm pldl1keep, [%[rhs_ptr], #128]\n")
      RUY_MAKE_ZERO(v24)
      RUY_PREFETCH_LOAD("prfm pldl1keep, [%[lhs_ptr], #192]\n")
      RUY_MAKE_ZERO(v25)
      RUY_PREFETCH_LOAD("prfm pldl1keep, [%[rhs_ptr], #192]\n")
      RUY_MAKE_ZERO(v26)
      RUY_PREFETCH_LOAD("prfm pldl1keep, [%[lhs_ptr], #256]\n")
      RUY_MAKE_ZERO(v27)
      RUY_PREFETCH_LOAD("prfm pldl1keep, [%[rhs_ptr], #256]\n")
      RUY_MAKE_ZERO(v28)
      RUY_MAKE_ZERO(v29)
      RUY_MAKE_ZERO(v30)
      RUY_MAKE_ZERO(v31)
      // w1 = levels of depth still to load (one is already in v0 -- v3).
      "sub w1, w12, #1\n"
      // Main loop over destination blocks.
      "1:\n"
      // Sets the flags for "beq 79f"; the fmla in between leave them alone.
      "cmp w1, #0\n"
      // First quarter of the multiply-adds for the level held in v0 -- v3.
      "fmla v16.4s, v0.4s, v2.s[0]\n"
      "fmla v18.4s, v0.4s, v2.s[1]\n"
      "fmla v20.4s, v0.4s, v2.s[2]\n"
      "fmla v22.4s, v0.4s, v2.s[3]\n"
      // Nothing left to load (depth == 1): finish the level in v0 -- v3.
      "beq 79f\n"
      // Depth loop, one level per iteration: finish the level in v0 -- v3
      // while loading the next one in 64-bit halves, then start that level.
      "2:\n"
      "fmla v24.4s, v0.4s, v3.s[0]\n"
      // x2 / x3 = high halves of the next v0 / v1.
      "ldr x2, [%[lhs_ptr], #8]\n"
      "fmla v26.4s, v0.4s, v3.s[1]\n"
      "ldr x3, [%[lhs_ptr], #24]\n"
      "fmla v28.4s, v0.4s, v3.s[2]\n"
      // x5 / x4 = high halves of the next v3 / v4. This overwrites the
      // lhs_base_ptr kept in x5, which is why it is reloaded after the loop.
      "ldr x5, [%[rhs_ptr], #24]\n"
      "fmla v30.4s, v0.4s, v3.s[3]\n"
      "ldr x4, [%[rhs_ptr], #8]\n"
      "fmla v25.4s, v1.4s, v3.s[0]\n"
      // Decrement the remaining-levels counter; these flags drive the
      // "bne 2b" at the bottom of the loop.
      "subs w1, w1, #1\n"
      "ldr d0, [%[lhs_ptr]], #32\n"
      "fmla v27.4s, v1.4s, v3.s[1]\n"
      "fmla v29.4s, v1.4s, v3.s[2]\n"
      "fmla v31.4s, v1.4s, v3.s[3]\n"
      "ins v0.d[1], x2\n"
      "ldr d3, [%[rhs_ptr], #16]\n"
      "fmla v17.4s, v1.4s, v2.s[0]\n"
      "fmla v19.4s, v1.4s, v2.s[1]\n"
      "ins v3.d[1], x5\n"
      // v4 temporarily holds the next level's first 4 RHS floats, because v2
      // is still needed below.
      "ldr d4, [%[rhs_ptr]], #32\n"
      "fmla v21.4s, v1.4s, v2.s[2]\n"
      "fmla v23.4s, v1.4s, v2.s[3]\n"
      // Only lanes 0 and 1 of v4 are used before its high half is inserted.
      "fmla v16.4s, v0.4s, v4.s[0]\n"
      "ins v4.d[1], x4\n"
      // lhs_ptr has already been advanced by 32, hence the negative offset.
      "ldr d1, [%[lhs_ptr], #-16]\n"
      "fmla v18.4s, v0.4s, v4.s[1]\n"
      "fmla v20.4s, v0.4s, v4.s[2]\n"
      "ins v1.d[1], x3\n"
      RUY_PREFETCH_LOAD("prfm pldl1keep, [%[lhs_ptr], #256]\n")
      "mov v2.16b, v4.16b\n"
      RUY_PREFETCH_LOAD("prfm pldl1keep, [%[rhs_ptr], #256]\n")
      "fmla v22.4s, v0.4s, v4.s[3]\n"
      "bne 2b\n"
      "79:\n"
      // Finish the last level of depth, whose first quarter is already
      // accumulated.
      "fmla v24.4s, v0.4s, v3.s[0]\n"
      "fmla v26.4s, v0.4s, v3.s[1]\n"
      "fmla v28.4s, v0.4s, v3.s[2]\n"
      "fmla v30.4s, v0.4s, v3.s[3]\n"
      "fmla v25.4s, v1.4s, v3.s[0]\n"
      "fmla v27.4s, v1.4s, v3.s[1]\n"
      "fmla v29.4s, v1.4s, v3.s[2]\n"
      "fmla v31.4s, v1.4s, v3.s[3]\n"
      // Restore lhs_base_ptr in x5; the depth loop used x5 as scratch.
      "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "fmla v17.4s, v1.4s, v2.s[0]\n"
      "fmla v19.4s, v1.4s, v2.s[1]\n"
      "fmla v21.4s, v1.4s, v2.s[2]\n"
      "fmla v23.4s, v1.4s, v2.s[3]\n"
      // Accumulation for this block is complete. Work out which LHS/RHS data
      // the next block needs.
      "cmp %w[row], w7\n"
      "bge 4f\n"
      // Not at the last row yet: step the LHS by 8 * lhs_stride.
      "add %[lhs_col_ptr], %[lhs_col_ptr], x9, lsl #3\n"
      "b 5f\n"
      "4:\n"
      // At the last row: rewind the LHS to its base, and step the RHS by
      // 8 * rhs_stride unless this is also the last column.
      "mov %[lhs_col_ptr], x5\n"
      "cmp %w[col], w8\n"
      "bge 5f\n"
      "add %[rhs_col_ptr], %[rhs_col_ptr], x10, lsl #3\n"
      "5:\n"
      "mov %[lhs_ptr], %[lhs_col_ptr]\n"
      "mov %[rhs_ptr], %[rhs_col_ptr]\n"
      // Load the flags byte and the bias pointer.
      "ldrb w4, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
      "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
      // w3 = channel index: row when the channel-is-col flag is clear, col
      // when it is set.
      "tst w4, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
      "csel w3, %w[row], %w[col], eq\n"
      // x5 = bias pointer offset by the channel index (4 bytes per float).
      "add x5, x1, x3, lsl #2\n"
      // Without the has-bias flag the un-offset pointer is used as is
      // (presumably it then addresses zero data -- confirm with the caller).
      "tst w4, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
      "csel x1, x1, x5, eq\n"
      // Load 8 bias values into v14/v15.
      "ld1 {v14.4s}, [x1], #16\n"
      "ld1 {v15.4s}, [x1]\n"
      // v0 -- v3 are no longer needed for this block, so preload the first
      // level of depth of the next block now.
      "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
      "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
      "ld1 {v2.4s}, [%[rhs_ptr]], #16\n"
      "ld1 {v3.4s}, [%[rhs_ptr]], #16\n"
      // Bias addition, branching on the channel dimension.
      "tst w4, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
      "bne 6f\n"
      // Channels are rows: v14 applies to rows 0-3, v15 to rows 4-7 of every
      // column.
      "fadd v16.4s, v16.4s, v14.4s\n"
      "fadd v17.4s, v17.4s, v15.4s\n"
      "fadd v18.4s, v18.4s, v14.4s\n"
      "fadd v19.4s, v19.4s, v15.4s\n"
      "fadd v20.4s, v20.4s, v14.4s\n"
      "fadd v21.4s, v21.4s, v15.4s\n"
      "fadd v22.4s, v22.4s, v14.4s\n"
      "fadd v23.4s, v23.4s, v15.4s\n"
      "fadd v24.4s, v24.4s, v14.4s\n"
      "fadd v25.4s, v25.4s, v15.4s\n"
      "fadd v26.4s, v26.4s, v14.4s\n"
      "fadd v27.4s, v27.4s, v15.4s\n"
      "fadd v28.4s, v28.4s, v14.4s\n"
      "fadd v29.4s, v29.4s, v15.4s\n"
      "fadd v30.4s, v30.4s, v14.4s\n"
      "fadd v31.4s, v31.4s, v15.4s\n"
      "b 7f\n"
      "6:\n"
      // Channels are columns: broadcast each of the 8 bias values into its
      // own register (v8 -- v15) and add it to both halves of its column.
      // The broadcasts are interleaved with the first additions; v14 and v15
      // are only overwritten after their last lane has been read.
      "dup v8.4s, v14.s[0]\n"
      "dup v9.4s, v14.s[1]\n"
      "fadd v16.4s, v16.4s, v8.4s\n"
      "dup v10.4s, v14.s[2]\n"
      "fadd v17.4s, v17.4s, v8.4s\n"
      "dup v11.4s, v14.s[3]\n"
      "fadd v18.4s, v18.4s, v9.4s\n"
      "dup v12.4s, v15.s[0]\n"
      "fadd v19.4s, v19.4s, v9.4s\n"
      "dup v13.4s, v15.s[1]\n"
      "fadd v20.4s, v20.4s, v10.4s\n"
      "dup v14.4s, v15.s[2]\n"
      "fadd v21.4s, v21.4s, v10.4s\n"
      "dup v15.4s, v15.s[3]\n"
      "fadd v22.4s, v22.4s, v11.4s\n"
      "fadd v23.4s, v23.4s, v11.4s\n"
      "fadd v24.4s, v24.4s, v12.4s\n"
      "fadd v25.4s, v25.4s, v12.4s\n"
      "fadd v26.4s, v26.4s, v13.4s\n"
      "fadd v27.4s, v27.4s, v13.4s\n"
      "fadd v28.4s, v28.4s, v14.4s\n"
      "fadd v29.4s, v29.4s, v14.4s\n"
      "fadd v30.4s, v30.4s, v15.4s\n"
      "fadd v31.4s, v31.4s, v15.4s\n"
      "7:\n"
      // Broadcast clamp_min into v14 and clamp_max into v15.
      "ldr w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
      "ldr w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
      "dup v14.4s, w2\n"
      "dup v15.4s, w3\n"
      // Apply the lower bound.
      "fmax v16.4s, v16.4s, v14.4s\n"
      "fmax v17.4s, v17.4s, v14.4s\n"
      "fmax v18.4s, v18.4s, v14.4s\n"
      "fmax v19.4s, v19.4s, v14.4s\n"
      "fmax v20.4s, v20.4s, v14.4s\n"
      "fmax v21.4s, v21.4s, v14.4s\n"
      "fmax v22.4s, v22.4s, v14.4s\n"
      "fmax v23.4s, v23.4s, v14.4s\n"
      "fmax v24.4s, v24.4s, v14.4s\n"
      "fmax v25.4s, v25.4s, v14.4s\n"
      "fmax v26.4s, v26.4s, v14.4s\n"
      "fmax v27.4s, v27.4s, v14.4s\n"
      "fmax v28.4s, v28.4s, v14.4s\n"
      "fmax v29.4s, v29.4s, v14.4s\n"
      "fmax v30.4s, v30.4s, v14.4s\n"
      "fmax v31.4s, v31.4s, v14.4s\n"
      // Apply the upper bound.
      "fmin v16.4s, v16.4s, v15.4s\n"
      "fmin v17.4s, v17.4s, v15.4s\n"
      "fmin v18.4s, v18.4s, v15.4s\n"
      "fmin v19.4s, v19.4s, v15.4s\n"
      "fmin v20.4s, v20.4s, v15.4s\n"
      "fmin v21.4s, v21.4s, v15.4s\n"
      "fmin v22.4s, v22.4s, v15.4s\n"
      "fmin v23.4s, v23.4s, v15.4s\n"
      "fmin v24.4s, v24.4s, v15.4s\n"
      "fmin v25.4s, v25.4s, v15.4s\n"
      "fmin v26.4s, v26.4s, v15.4s\n"
      "fmin v27.4s, v27.4s, v15.4s\n"
      "fmin v28.4s, v28.4s, v15.4s\n"
      "fmin v29.4s, v29.4s, v15.4s\n"
      "fmin v30.4s, v30.4s, v15.4s\n"
      "fmin v31.4s, v31.4s, v15.4s\n"
      // w1 = min(dst_rows - row, 8), w2 = min(dst_cols - col, 8): how much of
      // the 8x8 block fits in the destination.
      "sub w1, %w[dst_rows], %w[row]\n"
      "sub w2, %w[dst_cols], %w[col]\n"
      "mov w3, #8\n"
      "cmp w1, #8\n"
      "csel w1, w1, w3, le\n"
      "cmp w2, #8\n"
      "csel w2, w2, w3, le\n"
      // Flags end up "eq" only when w1 == 8 && w2 == 8, i.e. the whole block
      // fits.
      "cmp w1, w3\n"
      "ccmp w2, w3, 0, eq\n"
      "beq 30f\n"
      // Partial block: store into dst_tmp_buf, 32 bytes per column.
      "mov x3, %[dst_tmp_buf]\n"
      "mov x4, #32\n"
      "b 31f\n"
      "30:\n"
      // Full block: store straight to the destination, dst_stride apart.
      "mov x3, %[dst_ptr]\n"
      "mov x4, x11\n"
      "31:\n"
      // Store the 8 columns to (x3 address, x4 stride), clearing each
      // accumulator pair for the next block once it has been stored. None of
      // these instructions touch the flags set by the cmp/ccmp above.
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      "str q16, [x3, #0]\n"
      "str q17, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v16)
      RUY_MAKE_ZERO(v17)
      "str q18, [x3, #0]\n"
      "str q19, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v18)
      RUY_MAKE_ZERO(v19)
      "str q20, [x3, #0]\n"
      "str q21, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v20)
      RUY_MAKE_ZERO(v21)
      "str q22, [x3, #0]\n"
      "str q23, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v22)
      RUY_MAKE_ZERO(v23)
      "str q24, [x3, #0]\n"
      "str q25, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v24)
      RUY_MAKE_ZERO(v25)
      "str q26, [x3, #0]\n"
      "str q27, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v26)
      RUY_MAKE_ZERO(v27)
      "str q28, [x3, #0]\n"
      "str q29, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v28)
      RUY_MAKE_ZERO(v29)
      "str q30, [x3, #0]\n"
      "str q31, [x3, #16]\n"
      RUY_MAKE_ZERO(v30)
      RUY_MAKE_ZERO(v31)
      // Same flags as at "beq 30f": a full block is already in place.
      "beq 41f\n"
      // Partial block: copy the w1 x w2 part that fits from dst_tmp_buf to
      // the destination, one 32-bit value at a time.
      "mov x3, %[dst_tmp_buf]\n"
      "mov x4, %[dst_ptr]\n"
      // w6 = column counter.
      "mov w6, #0\n"
      "50:\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      // w5 = row counter within the current column.
      "mov w5, #0\n"
      "51:\n"
      "ldr w7, [x3, x5, lsl #2]\n"
      "str w7, [x4, x5, lsl #2]\n"
      "add w5, w5, #1\n"
      "cmp w5, w1\n"
      "blt 51b\n"
      "add w6, w6, #1\n"
      "add x3, x3, #32\n"
      "add x4, x4, x11\n"
      "cmp w6, w2\n"
      "blt 50b\n"
      "41:\n"
      // Step dst_ptr down by 8 rows (8 floats = 32 bytes).
      "add %[dst_ptr], %[dst_ptr], #32\n"
      // x5 -- x7 were used as scratch above; reload their params.
      "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
      "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
      // Advance (row, col) to the next block; lhs_col_ptr / rhs_col_ptr were
      // already updated before the bias addition.
      "cmp %w[row], w7\n"
      "beq 20f\n"
      "add %w[row], %w[row], #8\n"
      "b 21f\n"
      "20:\n"
      // Was at the last row: back to start_row, 8 columns to the right.
      "mov %w[row], w6\n"
      "add %w[col], %w[col], #8\n"
      "add %[dst_col_ptr], %[dst_col_ptr], x11, lsl #3\n"
      "mov %[dst_ptr], %[dst_col_ptr]\n"
      "21:\n"
      // Loop while col <= last_col. The sub in between does not alter the
      // flags; it resets w1 to depth - 1 to account for the level preloaded
      // above.
      "cmp %w[col], w8\n"
      "sub w1, w12, #1\n"
      "ble 1b\n"
      : [ lhs_col_ptr ] "+r"(lhs_col_ptr), [rhs_col_ptr] "+r"(rhs_col_ptr),
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [dst_col_ptr] "+r"(dst_col_ptr), [dst_ptr] "+r"(dst_ptr), [row] "+r"(row), [col] "+r"(col)
      // The asm dereferences %[params] as the address of the params struct.
      : [ params ] "r"(&params), [dst_rows] "r"(params.dst_rows),
        [dst_cols] "r"(params.dst_cols), [dst_tmp_buf] "r"(params.dst_tmp_buf)
      : "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc",
        "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
        "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
        "v26", "v27", "v28", "v29", "v30", "v31");
}
// Float kernel working on 8x8 destination blocks, written as AArch64 NEON
// inline assembly and labelled as the variant "optimized for in-order cores".
//
// Starting at (params.start_row, params.start_col), the asm walks the
// destination in steps of 8 rows; after the block at params.last_row it wraps
// back to start_row and advances by 8 columns, and it stops once the column
// index exceeds params.last_col. For every block it:
//   1. accumulates the products over `depth` levels into v16..v31,
//   2. adds 8 bias values (per row or per column, selected by params.flags),
//   3. clamps to [clamp_min, clamp_max],
//   4. stores the block, going through params.dst_tmp_buf when fewer than
//      8 rows or 8 columns of the block lie inside dst_rows x dst_cols.
//
// All fields of `params` other than the four passed as explicit operands are
// read from memory at the RUY_OFFSET_* byte offsets, so the struct is passed
// to the asm by address.
void KernelFloatNeonDotprodA55ish(const KernelParamsFloat<8, 8>& params) {
  profiler::ScopeLabel label(
      "Kernel (kNeonDotprod, optimized for in-order cores)");
  // Defined elsewhere; presumably static_asserts that the RUY_OFFSET_*
  // constants match the struct layout, like the 8-bit counterpart does.
  CheckOffsetsInKernelParamsFloat(params);

  // *_col_ptr: start of the LHS/RHS/destination data for the current block.
  // *_ptr: running pointers advanced while the block is being processed.
  const float* lhs_col_ptr = params.lhs_base_ptr;
  const float* rhs_col_ptr = params.rhs_base_ptr;
  const float* lhs_ptr = lhs_col_ptr;
  const float* rhs_ptr = rhs_col_ptr;
  float* dst_col_ptr = params.dst_base_ptr;
  float* dst_ptr = dst_col_ptr;
  int row = params.start_row;
  int col = params.start_col;

  // Register usage established by the code below:
  //   x5  lhs_base_ptr (also used as scratch, hence reloaded several times)
  //   w6  start_row        w7  last_row        w8  last_col
  //   w9  lhs_stride       w10 rhs_stride      w11 dst_stride
  //   w12 depth            w1  remaining depth levels / scratch
  //   v0,v1   8 LHS floats of the current depth level
  //   v2,v3   8 RHS floats of the current depth level
  //   v16..v31 accumulators; the pair (v16+2j, v17+2j) holds the 8 values
  //            that are multiplied by RHS lane j and stored as one 32-byte
  //            run of the destination.
  asm volatile(
      // RUY_MAKE_ZERO(reg) is used throughout to reset an accumulator; its
      // definition is not visible here (assumed to emit one flag-preserving
      // instruction that zeroes the vector register -- TODO confirm).
#define RUY_MAKE_ZERO …
      // Load the loop-invariant parameters into general-purpose registers.
      "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
      "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
      "ldr w8, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
      "ldr w9, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
      "ldr w10, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
      "ldr w11, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
      "ldr w12, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"

      // Clear the accumulators, interleaved with loading the first depth
      // level of LHS (v0, v1) and RHS (v2, v3) and with prefetches of the
      // data that follows.
      RUY_MAKE_ZERO(v16)
      "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
      RUY_MAKE_ZERO(v17)
      "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
      RUY_MAKE_ZERO(v18)
      "ld1 {v2.4s}, [%[rhs_ptr]], #16\n"
      RUY_MAKE_ZERO(v19)
      "ld1 {v3.4s}, [%[rhs_ptr]], #16\n"
      RUY_MAKE_ZERO(v20)
      RUY_PREFETCH_LOAD("prfm pldl1keep, [%[lhs_ptr], #64]\n")
      RUY_MAKE_ZERO(v21)
      RUY_PREFETCH_LOAD("prfm pldl1keep, [%[rhs_ptr], #64]\n")
      RUY_MAKE_ZERO(v22)
      RUY_PREFETCH_LOAD("prfm pldl1keep, [%[lhs_ptr], #128]\n")
      RUY_MAKE_ZERO(v23)
      RUY_PREFETCH_LOAD("prfm pldl1keep, [%[rhs_ptr], #128]\n")
      RUY_MAKE_ZERO(v24)
      RUY_PREFETCH_LOAD("prfm pldl1keep, [%[lhs_ptr], #192]\n")
      RUY_MAKE_ZERO(v25)
      RUY_PREFETCH_LOAD("prfm pldl1keep, [%[rhs_ptr], #192]\n")
      RUY_MAKE_ZERO(v26)
      RUY_PREFETCH_LOAD("prfm pldl1keep, [%[lhs_ptr], #256]\n")
      RUY_MAKE_ZERO(v27)
      RUY_PREFETCH_LOAD("prfm pldl1keep, [%[rhs_ptr], #256]\n")
      RUY_MAKE_ZERO(v28)
      RUY_MAKE_ZERO(v29)
      RUY_MAKE_ZERO(v30)
      RUY_MAKE_ZERO(v31)

      // w1 = depth - 1: depth levels still to be loaded (one is already in
      // v0..v3).
      "sub w1, w12, #1\n"

      // ---- Main loop: one iteration per 8x8 destination block. ----
      "1:\n"
      // The first four multiply-accumulates of a depth level are issued
      // here; the flags from this cmp are consumed by the beq below (fmla
      // does not touch them).
      "cmp w1, #0\n"
      "fmla v16.4s, v0.4s, v2.s[0]\n"
      "fmla v18.4s, v0.4s, v2.s[1]\n"
      "fmla v20.4s, v0.4s, v2.s[2]\n"
      "fmla v22.4s, v0.4s, v2.s[3]\n"
      // Nothing left to load (depth == 1): go straight to the tail at 79.
      "beq 79f\n"

      // ---- Accumulation loop: one iteration per remaining depth level. ----
      // Each iteration finishes the 12 outstanding fmla of the current level,
      // loads the next level, and issues the first 4 fmla of that next level.
      // The next level is loaded as 64-bit halves (ldr d / ldr x + ins)
      // interleaved between the fmla instructions.
      "2:\n"
      RUY_PREFETCH_LOAD("prfm pldl1keep, [%[lhs_ptr], #256]\n")
      "fmla v24.4s, v0.4s, v3.s[0]\n"
      "ldr x2, [%[lhs_ptr], #8]\n"    // upper half of next v0
      "fmla v26.4s, v0.4s, v3.s[1]\n"
      "ldr x3, [%[lhs_ptr], #24]\n"   // upper half of next v1
      "fmla v28.4s, v0.4s, v3.s[2]\n"
      "ldr x5, [%[rhs_ptr], #24]\n"   // upper half of next v3 (clobbers x5)
      "fmla v30.4s, v0.4s, v3.s[3]\n"
      // Last use of the old v0 was the fmla above, so it can be overwritten.
      "ldr d0, [%[lhs_ptr]], #32\n"   // lower half of next v0; lhs_ptr += 32
      "fmla v25.4s, v1.4s, v3.s[0]\n"
      "ldr x4, [%[rhs_ptr], #8]\n"    // upper half of next v2 (via v4)
      "fmla v27.4s, v1.4s, v3.s[1]\n"
      // Sets the flags tested by "bne 2b" at the bottom of the loop.
      "subs w1, w1, #1\n"
      "fmla v29.4s, v1.4s, v3.s[2]\n"
      "ins v0.d[1], x2\n"
      "fmla v31.4s, v1.4s, v3.s[3]\n"
      // Last use of the old v3 was the fmla above.
      "ldr d3, [%[rhs_ptr], #16]\n"   // lower half of next v3
      "fmla v17.4s, v1.4s, v2.s[0]\n"
      "ins v3.d[1], x5\n"
      "fmla v19.4s, v1.4s, v2.s[1]\n"
      // The next first RHS vector goes to v4 because v2 is still being read.
      "ldr d4, [%[rhs_ptr]], #32\n"   // lower half of next v2; rhs_ptr += 32
      "fmla v21.4s, v1.4s, v2.s[2]\n"
      "ins v4.d[1], x4\n"
      "fmla v23.4s, v1.4s, v2.s[3]\n"
      RUY_PREFETCH_LOAD("prfm pldl1keep, [%[rhs_ptr], #256]\n")
      // First four fmla of the next depth level, using the new v0 and v4.
      "fmla v16.4s, v0.4s, v4.s[0]\n"
      // lhs_ptr was already advanced by 32, so #-16 addresses the second
      // LHS vector of the level just loaded.
      "ldr d1, [%[lhs_ptr], #-16]\n"  // lower half of next v1
      "fmla v18.4s, v0.4s, v4.s[1]\n"
      "ins v1.d[1], x3\n"
      "fmla v20.4s, v0.4s, v4.s[2]\n"
      "mov v2.16b, v4.16b\n"          // v2 now holds the new RHS vector
      "fmla v22.4s, v0.4s, v4.s[3]\n"
      "bne 2b\n"

      // ---- Tail: the 12 outstanding fmla of the last depth level. ----
      "79:\n"
      "fmla v24.4s, v0.4s, v3.s[0]\n"
      "fmla v26.4s, v0.4s, v3.s[1]\n"
      "fmla v28.4s, v0.4s, v3.s[2]\n"
      "fmla v30.4s, v0.4s, v3.s[3]\n"
      "fmla v25.4s, v1.4s, v3.s[0]\n"
      "fmla v27.4s, v1.4s, v3.s[1]\n"
      "fmla v29.4s, v1.4s, v3.s[2]\n"
      "fmla v31.4s, v1.4s, v3.s[3]\n"
      // x5 was used as scratch inside the accumulation loop; restore it.
      "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "fmla v17.4s, v1.4s, v2.s[0]\n"
      "fmla v19.4s, v1.4s, v2.s[1]\n"
      "fmla v21.4s, v1.4s, v2.s[2]\n"
      "fmla v23.4s, v1.4s, v2.s[3]\n"

      // ---- Advance the LHS/RHS block pointers for the NEXT block. ----
      // (row/col themselves are only updated after the store, further down.)
      "cmp %w[row], w7\n"
      "bge 4f\n"
      // Not at the last row yet: move the LHS pointer by 8 * lhs_stride.
      "add %[lhs_col_ptr], %[lhs_col_ptr], x9, lsl #3\n"
      "b 5f\n"
      "4:\n"
      // At the last row: rewind the LHS to its base and, unless this is also
      // the last column, move the RHS pointer by 8 * rhs_stride.
      "mov %[lhs_col_ptr], x5\n"
      "cmp %w[col], w8\n"
      "bge 5f\n"
      "add %[rhs_col_ptr], %[rhs_col_ptr], x10, lsl #3\n"
      "5:\n"
      "mov %[lhs_ptr], %[lhs_col_ptr]\n"
      "mov %[rhs_ptr], %[rhs_col_ptr]\n"

      // ---- Bias. ----
      "ldrb w4, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
      "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
      // w3 = channel index: row when the CHANNEL_DIMENSION_IS_COL bit is
      // clear, col when it is set.
      "tst w4, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
      "csel w3, %w[row], %w[col], eq\n"
      // x5 = &bias[channel index] (4 bytes per float).
      "add x5, x1, x3, lsl #2\n"
      // Use the offset pointer only when HAS_BIAS is set; otherwise read from
      // the unoffset bias pointer (presumably a buffer of zeros in that case
      // -- verify against the caller).
      "tst w4, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
      "csel x1, x1, x5, eq\n"
      // v14, v15 = 8 consecutive bias values.
      "ld1 {v14.4s}, [x1], #16\n"
      "ld1 {v15.4s}, [x1]\n"

      // v0..v3 are free now: preload the first depth level of the next block.
      "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
      "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
      "ld1 {v2.4s}, [%[rhs_ptr]], #16\n"
      "ld1 {v3.4s}, [%[rhs_ptr]], #16\n"

      "tst w4, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
      "bne 6f\n"
      // Channel dimension is the row: the same 8 bias values (v14 for the
      // even accumulators, v15 for the odd ones) are added to every pair.
      "fadd v16.4s, v16.4s, v14.4s\n"
      "fadd v17.4s, v17.4s, v15.4s\n"
      "fadd v18.4s, v18.4s, v14.4s\n"
      "fadd v19.4s, v19.4s, v15.4s\n"
      "fadd v20.4s, v20.4s, v14.4s\n"
      "fadd v21.4s, v21.4s, v15.4s\n"
      "fadd v22.4s, v22.4s, v14.4s\n"
      "fadd v23.4s, v23.4s, v15.4s\n"
      "fadd v24.4s, v24.4s, v14.4s\n"
      "fadd v25.4s, v25.4s, v15.4s\n"
      "fadd v26.4s, v26.4s, v14.4s\n"
      "fadd v27.4s, v27.4s, v15.4s\n"
      "fadd v28.4s, v28.4s, v14.4s\n"
      "fadd v29.4s, v29.4s, v15.4s\n"
      "fadd v30.4s, v30.4s, v14.4s\n"
      "fadd v31.4s, v31.4s, v15.4s\n"
      "b 7f\n"
      "6:\n"
      // Channel dimension is the column: bias value j is broadcast and added
      // to both accumulators of pair j. v8..v13 receive lanes 0..5; v14 and
      // v15 are overwritten in place with lanes 6 and 7, which is safe
      // because all lanes of v14 have been consumed by then and v15.s[3] is
      // read last.
      "dup v8.4s, v14.s[0]\n"
      "dup v9.4s, v14.s[1]\n"
      "fadd v16.4s, v16.4s, v8.4s\n"
      "dup v10.4s, v14.s[2]\n"
      "fadd v17.4s, v17.4s, v8.4s\n"
      "dup v11.4s, v14.s[3]\n"
      "fadd v18.4s, v18.4s, v9.4s\n"
      "dup v12.4s, v15.s[0]\n"
      "fadd v19.4s, v19.4s, v9.4s\n"
      "dup v13.4s, v15.s[1]\n"
      "fadd v20.4s, v20.4s, v10.4s\n"
      "dup v14.4s, v15.s[2]\n"
      "fadd v21.4s, v21.4s, v10.4s\n"
      "dup v15.4s, v15.s[3]\n"
      "fadd v22.4s, v22.4s, v11.4s\n"
      "fadd v23.4s, v23.4s, v11.4s\n"
      "fadd v24.4s, v24.4s, v12.4s\n"
      "fadd v25.4s, v25.4s, v12.4s\n"
      "fadd v26.4s, v26.4s, v13.4s\n"
      "fadd v27.4s, v27.4s, v13.4s\n"
      "fadd v28.4s, v28.4s, v14.4s\n"
      "fadd v29.4s, v29.4s, v14.4s\n"
      "fadd v30.4s, v30.4s, v15.4s\n"
      "fadd v31.4s, v31.4s, v15.4s\n"
      "7:\n"

      // ---- Clamp: v14 = clamp_min, v15 = clamp_max in every lane. ----
      "ldr w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
      "ldr w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
      "dup v14.4s, w2\n"
      "dup v15.4s, w3\n"
      // Lower bound.
      "fmax v16.4s, v16.4s, v14.4s\n"
      "fmax v17.4s, v17.4s, v14.4s\n"
      "fmax v18.4s, v18.4s, v14.4s\n"
      "fmax v19.4s, v19.4s, v14.4s\n"
      "fmax v20.4s, v20.4s, v14.4s\n"
      "fmax v21.4s, v21.4s, v14.4s\n"
      "fmax v22.4s, v22.4s, v14.4s\n"
      "fmax v23.4s, v23.4s, v14.4s\n"
      "fmax v24.4s, v24.4s, v14.4s\n"
      "fmax v25.4s, v25.4s, v14.4s\n"
      "fmax v26.4s, v26.4s, v14.4s\n"
      "fmax v27.4s, v27.4s, v14.4s\n"
      "fmax v28.4s, v28.4s, v14.4s\n"
      "fmax v29.4s, v29.4s, v14.4s\n"
      "fmax v30.4s, v30.4s, v14.4s\n"
      "fmax v31.4s, v31.4s, v14.4s\n"
      // Upper bound.
      "fmin v16.4s, v16.4s, v15.4s\n"
      "fmin v17.4s, v17.4s, v15.4s\n"
      "fmin v18.4s, v18.4s, v15.4s\n"
      "fmin v19.4s, v19.4s, v15.4s\n"
      "fmin v20.4s, v20.4s, v15.4s\n"
      "fmin v21.4s, v21.4s, v15.4s\n"
      "fmin v22.4s, v22.4s, v15.4s\n"
      "fmin v23.4s, v23.4s, v15.4s\n"
      "fmin v24.4s, v24.4s, v15.4s\n"
      "fmin v25.4s, v25.4s, v15.4s\n"
      "fmin v26.4s, v26.4s, v15.4s\n"
      "fmin v27.4s, v27.4s, v15.4s\n"
      "fmin v28.4s, v28.4s, v15.4s\n"
      "fmin v29.4s, v29.4s, v15.4s\n"
      "fmin v30.4s, v30.4s, v15.4s\n"
      "fmin v31.4s, v31.4s, v15.4s\n"

      // ---- Decide where to store. ----
      // w1 = min(dst_rows - row, 8), w2 = min(dst_cols - col, 8): the part
      // of this block that lies inside the destination.
      "sub w1, %w[dst_rows], %w[row]\n"
      "sub w2, %w[dst_cols], %w[col]\n"
      "mov w3, #8\n"
      "cmp w1, #8\n"
      "csel w1, w1, w3, le\n"
      "cmp w2, #8\n"
      "csel w2, w2, w3, le\n"
      // Flags end up "eq" only when w1 == 8 and w2 == 8. These flags are
      // tested twice: by the beq right below and by "beq 41f" after the
      // stores, so nothing in between may modify them.
      "cmp w1, w3\n"
      "ccmp w2, w3, 0, eq\n"
      "beq 30f\n"
      // Partial block: store to dst_tmp_buf with a 32-byte stride.
      "mov x3, %[dst_tmp_buf]\n"
      "mov x4, #32\n"
      "b 31f\n"
      "30:\n"
      // Full block: store directly to the destination with dst_stride.
      "mov x3, %[dst_ptr]\n"
      "mov x4, x11\n"
      "31:\n"

      // ---- Store the 8 accumulator pairs (x3 = address, x4 = stride), ----
      // ---- resetting each pair for the next block right after its store. --
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      "str q16, [x3, #0]\n"
      "str q17, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v16)
      RUY_MAKE_ZERO(v17)
      "str q18, [x3, #0]\n"
      "str q19, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v18)
      RUY_MAKE_ZERO(v19)
      "str q20, [x3, #0]\n"
      "str q21, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v20)
      RUY_MAKE_ZERO(v21)
      "str q22, [x3, #0]\n"
      "str q23, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v22)
      RUY_MAKE_ZERO(v23)
      "str q24, [x3, #0]\n"
      "str q25, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v24)
      RUY_MAKE_ZERO(v25)
      "str q26, [x3, #0]\n"
      "str q27, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v26)
      RUY_MAKE_ZERO(v27)
      "str q28, [x3, #0]\n"
      "str q29, [x3, #16]\n"
      "add x3, x3, x4\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
      RUY_MAKE_ZERO(v28)
      RUY_MAKE_ZERO(v29)
      "str q30, [x3, #0]\n"
      "str q31, [x3, #16]\n"
      RUY_MAKE_ZERO(v30)
      RUY_MAKE_ZERO(v31)

      // Full block (flags still "eq" from the ccmp above): already written
      // to the destination, skip the copy.
      "beq 41f\n"

      // ---- Partial block: copy w1 x w2 floats from dst_tmp_buf. ----
      // x3 = source run (32-byte stride), x4 = destination run (dst_stride),
      // w6 = outer counter (up to w2), w5 = inner counter (up to w1).
      "mov x3, %[dst_tmp_buf]\n"
      "mov x4, %[dst_ptr]\n"
      "mov w6, #0\n"
      "50:\n"
      RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
      "mov w5, #0\n"
      "51:\n"
      "ldr w7, [x3, x5, lsl #2]\n"
      "str w7, [x4, x5, lsl #2]\n"
      "add w5, w5, #1\n"
      "cmp w5, w1\n"
      "blt 51b\n"
      "add w6, w6, #1\n"
      "add x3, x3, #32\n"
      "add x4, x4, x11\n"
      "cmp w6, w2\n"
      "blt 50b\n"
      "41:\n"

      // ---- Move on to the next block. ----
      // 32 bytes = 8 floats further along the current destination run.
      "add %[dst_ptr], %[dst_ptr], #32\n"
      // x5, w6 and w7 were used as scratch above; reload them.
      "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
      "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
      "cmp %w[row], w7\n"
      "beq 20f\n"
      // Not at the last row: next block is 8 rows further.
      "add %w[row], %w[row], #8\n"
      "b 21f\n"
      "20:\n"
      // At the last row: wrap to start_row and step 8 columns, moving the
      // destination pointers by 8 * dst_stride.
      "mov %w[row], w6\n"
      "add %w[col], %w[col], #8\n"
      "add %[dst_col_ptr], %[dst_col_ptr], x11, lsl #3\n"
      "mov %[dst_ptr], %[dst_col_ptr]\n"
      "21:\n"
      // Loop while col <= last_col. The sub between cmp and ble does not
      // alter the flags; it re-arms w1 = depth - 1 for the next block, whose
      // first depth level was already preloaded into v0..v3.
      "cmp %w[col], w8\n"
      "sub w1, w12, #1\n"
      "ble 1b\n"
      // Read-write operands: pointers and block coordinates updated by the
      // asm.
      : [ lhs_col_ptr ] "+r"(lhs_col_ptr), [rhs_col_ptr] "+r"(rhs_col_ptr),
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [dst_col_ptr] "+r"(dst_col_ptr), [dst_ptr] "+r"(dst_ptr), [row] "+r"(row), [col] "+r"(col)
      // Read-only operands. `params` must be passed by ADDRESS: every field
      // access above has the form [%[params], #RUY_OFFSET_*].
      : [ params ] "r"(&params), [dst_rows] "r"(params.dst_rows),
        [dst_cols] "r"(params.dst_cols), [dst_tmp_buf] "r"(params.dst_tmp_buf)
      // Clobbers: the scratch general-purpose registers, the condition
      // flags, memory (destination and dst_tmp_buf are written), and all 32
      // vector registers.
      : "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc",
        "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
        "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
        "v26", "v27", "v28", "v29", "v30", "v31");
}
#undef RUY_OFFSET_BIAS
#undef RUY_OFFSET_FLAGS
#undef RUY_OFFSET_LHS_BASE_PTR
#undef RUY_OFFSET_CLAMP_MIN
#undef RUY_OFFSET_CLAMP_MAX
#undef RUY_OFFSET_START_ROW
#undef RUY_OFFSET_LAST_ROW
#undef RUY_OFFSET_LAST_COL
#undef RUY_OFFSET_LHS_STRIDE
#undef RUY_OFFSET_RHS_STRIDE
#undef RUY_OFFSET_DST_STRIDE
#undef RUY_OFFSET_DEPTH
#undef RUY_OFFSET_START_COL
#undef RUY_OFFSET_RHS_BASE_PTR
#undef RUY_OFFSET_DST_BASE_PTR
#endif
}