; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64 -mattr=+sve2,+i8mm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-I8MM
; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM
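
; These tests check how llvm.experimental.vector.partial.reduce.add of a
; widening multiply is lowered with +sve2, both with and without +i8mm.

; Both operands zero-extended from i8: the partial reduction lowers to a
; single UDOT accumulating into .s lanes.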
define <vscale x 4 x i32> @udot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: udot:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: udot z0.s, z1.b, z2.b
; CHECK-NEXT: ret
entry:
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
%b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
%mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
ret <vscale x 4 x i32> %partial.reduce
}
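
; Zero-extends from i16: lowers to UDOT accumulating into .d lanes.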
define <vscale x 2 x i64> @udot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: udot_wide:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: udot z0.d, z1.h, z2.h
; CHECK-NEXT: ret
entry:
%a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
%b.wide = zext <vscale x 8 x i16> %b to <vscale x 8 x i64>
%mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
ret <vscale x 2 x i64> %partial.reduce
}
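
; Both operands sign-extended from i8: lowers to SDOT into .s lanes.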
define <vscale x 4 x i32> @sdot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: sdot:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sdot z0.s, z1.b, z2.b
; CHECK-NEXT: ret
entry:
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
%b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
%mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
ret <vscale x 4 x i32> %partial.reduce
}
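
; Sign-extends from i16: lowers to SDOT into .d lanes.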
define <vscale x 2 x i64> @sdot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: sdot_wide:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sdot z0.d, z1.h, z2.h
; CHECK-NEXT: ret
entry:
%a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
%b.wide = sext <vscale x 8 x i16> %b to <vscale x 8 x i64>
%mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
ret <vscale x 2 x i64> %partial.reduce
}
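
; zext * sext needs USDOT, which requires +i8mm. Without it the reduction is
; expanded into unpacks, a MUL and MLAs.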
define <vscale x 4 x i32> @usdot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-I8MM-LABEL: usdot:
; CHECK-I8MM: // %bb.0: // %entry
; CHECK-I8MM-NEXT: usdot z0.s, z1.b, z2.b
; CHECK-I8MM-NEXT: ret
;
; CHECK-NOI8MM-LABEL: usdot:
; CHECK-NOI8MM: // %bb.0: // %entry
; CHECK-NOI8MM-NEXT: uunpklo z3.h, z1.b
; CHECK-NOI8MM-NEXT: sunpklo z4.h, z2.b
; CHECK-NOI8MM-NEXT: uunpkhi z1.h, z1.b
; CHECK-NOI8MM-NEXT: sunpkhi z2.h, z2.b
; CHECK-NOI8MM-NEXT: ptrue p0.s
; CHECK-NOI8MM-NEXT: uunpklo z5.s, z3.h
; CHECK-NOI8MM-NEXT: uunpkhi z3.s, z3.h
; CHECK-NOI8MM-NEXT: sunpklo z6.s, z4.h
; CHECK-NOI8MM-NEXT: sunpkhi z4.s, z4.h
; CHECK-NOI8MM-NEXT: uunpklo z7.s, z1.h
; CHECK-NOI8MM-NEXT: uunpkhi z1.s, z1.h
; CHECK-NOI8MM-NEXT: sunpklo z24.s, z2.h
; CHECK-NOI8MM-NEXT: sunpkhi z2.s, z2.h
; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z5.s, z6.s
; CHECK-NOI8MM-NEXT: mul z3.s, z3.s, z4.s
; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z1.s, z2.s
; CHECK-NOI8MM-NEXT: movprfx z1, z3
; CHECK-NOI8MM-NEXT: mla z1.s, p0/m, z7.s, z24.s
; CHECK-NOI8MM-NEXT: add z0.s, z1.s, z0.s
; CHECK-NOI8MM-NEXT: ret
entry:
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
%b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
%mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
ret <vscale x 4 x i32> %partial.reduce
}
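
; sext * zext is handled by commuting the operands so USDOT can still be used
; under +i8mm; otherwise it is expanded as above.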
define <vscale x 4 x i32> @sudot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-I8MM-LABEL: sudot:
; CHECK-I8MM: // %bb.0: // %entry
; CHECK-I8MM-NEXT: usdot z0.s, z2.b, z1.b
; CHECK-I8MM-NEXT: ret
;
; CHECK-NOI8MM-LABEL: sudot:
; CHECK-NOI8MM: // %bb.0: // %entry
; CHECK-NOI8MM-NEXT: sunpklo z3.h, z1.b
; CHECK-NOI8MM-NEXT: uunpklo z4.h, z2.b
; CHECK-NOI8MM-NEXT: sunpkhi z1.h, z1.b
; CHECK-NOI8MM-NEXT: uunpkhi z2.h, z2.b
; CHECK-NOI8MM-NEXT: ptrue p0.s
; CHECK-NOI8MM-NEXT: sunpklo z5.s, z3.h
; CHECK-NOI8MM-NEXT: sunpkhi z3.s, z3.h
; CHECK-NOI8MM-NEXT: uunpklo z6.s, z4.h
; CHECK-NOI8MM-NEXT: uunpkhi z4.s, z4.h
; CHECK-NOI8MM-NEXT: sunpklo z7.s, z1.h
; CHECK-NOI8MM-NEXT: sunpkhi z1.s, z1.h
; CHECK-NOI8MM-NEXT: uunpklo z24.s, z2.h
; CHECK-NOI8MM-NEXT: uunpkhi z2.s, z2.h
; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z5.s, z6.s
; CHECK-NOI8MM-NEXT: mul z3.s, z3.s, z4.s
; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z1.s, z2.s
; CHECK-NOI8MM-NEXT: movprfx z1, z3
; CHECK-NOI8MM-NEXT: mla z1.s, p0/m, z7.s, z24.s
; CHECK-NOI8MM-NEXT: add z0.s, z1.s, z0.s
; CHECK-NOI8MM-NEXT: ret
entry:
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
%b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
%mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
ret <vscale x 4 x i32> %partial.reduce
}
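
; i8 -> i64 partial reduction: UDOT into a zeroed .s accumulator, then the
; result is unpacked to .d and added to the two accumulator registers.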
define <vscale x 4 x i64> @udot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: udot_8to64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov z4.s, #0 // =0x0
; CHECK-NEXT: udot z4.s, z2.b, z3.b
; CHECK-NEXT: sunpklo z2.d, z4.s
; CHECK-NEXT: sunpkhi z3.d, z4.s
; CHECK-NEXT: add z0.d, z0.d, z2.d
; CHECK-NEXT: add z1.d, z1.d, z3.d
; CHECK-NEXT: ret
entry:
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
%b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i64>
%mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(
<vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult)
ret <vscale x 4 x i64> %partial.reduce
}
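
; As above with sign-extends, using SDOT.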
define <vscale x 4 x i64> @sdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: sdot_8to64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov z4.s, #0 // =0x0
; CHECK-NEXT: sdot z4.s, z2.b, z3.b
; CHECK-NEXT: sunpklo z2.d, z4.s
; CHECK-NEXT: sunpkhi z3.d, z4.s
; CHECK-NEXT: add z0.d, z0.d, z2.d
; CHECK-NEXT: add z1.d, z1.d, z3.d
; CHECK-NEXT: ret
entry:
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
%b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i64>
%mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(
<vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult)
ret <vscale x 4 x i64> %partial.reduce
}
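
; Mixed-sign i8 -> i64: USDOT with +i8mm; without it the expansion is large
; enough to need z8/z9 spills.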
define <vscale x 4 x i64> @usdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-I8MM-LABEL: usdot_8to64:
; CHECK-I8MM: // %bb.0: // %entry
; CHECK-I8MM-NEXT: mov z4.s, #0 // =0x0
; CHECK-I8MM-NEXT: usdot z4.s, z2.b, z3.b
; CHECK-I8MM-NEXT: sunpklo z2.d, z4.s
; CHECK-I8MM-NEXT: sunpkhi z3.d, z4.s
; CHECK-I8MM-NEXT: add z0.d, z0.d, z2.d
; CHECK-I8MM-NEXT: add z1.d, z1.d, z3.d
; CHECK-I8MM-NEXT: ret
;
; CHECK-NOI8MM-LABEL: usdot_8to64:
; CHECK-NOI8MM: // %bb.0: // %entry
; CHECK-NOI8MM-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NOI8MM-NEXT: addvl sp, sp, #-2
; CHECK-NOI8MM-NEXT: str z9, [sp] // 16-byte Folded Spill
; CHECK-NOI8MM-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NOI8MM-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
; CHECK-NOI8MM-NEXT: .cfi_offset w29, -16
; CHECK-NOI8MM-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
; CHECK-NOI8MM-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
; CHECK-NOI8MM-NEXT: uunpklo z4.h, z2.b
; CHECK-NOI8MM-NEXT: sunpklo z5.h, z3.b
; CHECK-NOI8MM-NEXT: uunpkhi z2.h, z2.b
; CHECK-NOI8MM-NEXT: sunpkhi z3.h, z3.b
; CHECK-NOI8MM-NEXT: ptrue p0.d
; CHECK-NOI8MM-NEXT: uunpklo z6.s, z4.h
; CHECK-NOI8MM-NEXT: uunpkhi z4.s, z4.h
; CHECK-NOI8MM-NEXT: sunpklo z7.s, z5.h
; CHECK-NOI8MM-NEXT: sunpkhi z5.s, z5.h
; CHECK-NOI8MM-NEXT: uunpklo z24.s, z2.h
; CHECK-NOI8MM-NEXT: uunpkhi z2.s, z2.h
; CHECK-NOI8MM-NEXT: sunpklo z25.s, z3.h
; CHECK-NOI8MM-NEXT: sunpkhi z3.s, z3.h
; CHECK-NOI8MM-NEXT: uunpkhi z26.d, z6.s
; CHECK-NOI8MM-NEXT: uunpklo z6.d, z6.s
; CHECK-NOI8MM-NEXT: uunpklo z27.d, z4.s
; CHECK-NOI8MM-NEXT: sunpklo z28.d, z7.s
; CHECK-NOI8MM-NEXT: sunpklo z29.d, z5.s
; CHECK-NOI8MM-NEXT: uunpkhi z4.d, z4.s
; CHECK-NOI8MM-NEXT: sunpkhi z7.d, z7.s
; CHECK-NOI8MM-NEXT: sunpkhi z5.d, z5.s
; CHECK-NOI8MM-NEXT: uunpkhi z30.d, z24.s
; CHECK-NOI8MM-NEXT: uunpkhi z31.d, z2.s
; CHECK-NOI8MM-NEXT: uunpklo z24.d, z24.s
; CHECK-NOI8MM-NEXT: uunpklo z2.d, z2.s
; CHECK-NOI8MM-NEXT: sunpkhi z8.d, z25.s
; CHECK-NOI8MM-NEXT: sunpklo z25.d, z25.s
; CHECK-NOI8MM-NEXT: sunpklo z9.d, z3.s
; CHECK-NOI8MM-NEXT: mul z27.d, z27.d, z29.d
; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z6.d, z28.d
; CHECK-NOI8MM-NEXT: sunpkhi z3.d, z3.s
; CHECK-NOI8MM-NEXT: mul z4.d, z4.d, z5.d
; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z26.d, z7.d
; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z2.d, z9.d
; CHECK-NOI8MM-NEXT: movprfx z2, z27
; CHECK-NOI8MM-NEXT: mla z2.d, p0/m, z24.d, z25.d
; CHECK-NOI8MM-NEXT: ldr z9, [sp] // 16-byte Folded Reload
; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z31.d, z3.d
; CHECK-NOI8MM-NEXT: movprfx z3, z4
; CHECK-NOI8MM-NEXT: mla z3.d, p0/m, z30.d, z8.d
; CHECK-NOI8MM-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NOI8MM-NEXT: add z0.d, z2.d, z0.d
; CHECK-NOI8MM-NEXT: add z1.d, z3.d, z1.d
; CHECK-NOI8MM-NEXT: addvl sp, sp, #2
; CHECK-NOI8MM-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NOI8MM-NEXT: ret
entry:
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
%b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i64>
%mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(
<vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult)
ret <vscale x 4 x i64> %partial.reduce
}
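
; sext * zext i8 -> i64: USDOT with commuted operands under +i8mm, otherwise a
; full expansion.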
define <vscale x 4 x i64> @sudot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-I8MM-LABEL: sudot_8to64:
; CHECK-I8MM: // %bb.0: // %entry
; CHECK-I8MM-NEXT: mov z4.s, #0 // =0x0
; CHECK-I8MM-NEXT: usdot z4.s, z3.b, z2.b
; CHECK-I8MM-NEXT: sunpklo z2.d, z4.s
; CHECK-I8MM-NEXT: sunpkhi z3.d, z4.s
; CHECK-I8MM-NEXT: add z0.d, z0.d, z2.d
; CHECK-I8MM-NEXT: add z1.d, z1.d, z3.d
; CHECK-I8MM-NEXT: ret
;
; CHECK-NOI8MM-LABEL: sudot_8to64:
; CHECK-NOI8MM: // %bb.0: // %entry
; CHECK-NOI8MM-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NOI8MM-NEXT: addvl sp, sp, #-2
; CHECK-NOI8MM-NEXT: str z9, [sp] // 16-byte Folded Spill
; CHECK-NOI8MM-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NOI8MM-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
; CHECK-NOI8MM-NEXT: .cfi_offset w29, -16
; CHECK-NOI8MM-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
; CHECK-NOI8MM-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
; CHECK-NOI8MM-NEXT: sunpklo z4.h, z2.b
; CHECK-NOI8MM-NEXT: uunpklo z5.h, z3.b
; CHECK-NOI8MM-NEXT: sunpkhi z2.h, z2.b
; CHECK-NOI8MM-NEXT: uunpkhi z3.h, z3.b
; CHECK-NOI8MM-NEXT: ptrue p0.d
; CHECK-NOI8MM-NEXT: sunpklo z6.s, z4.h
; CHECK-NOI8MM-NEXT: sunpkhi z4.s, z4.h
; CHECK-NOI8MM-NEXT: uunpklo z7.s, z5.h
; CHECK-NOI8MM-NEXT: uunpkhi z5.s, z5.h
; CHECK-NOI8MM-NEXT: sunpklo z24.s, z2.h
; CHECK-NOI8MM-NEXT: sunpkhi z2.s, z2.h
; CHECK-NOI8MM-NEXT: uunpklo z25.s, z3.h
; CHECK-NOI8MM-NEXT: uunpkhi z3.s, z3.h
; CHECK-NOI8MM-NEXT: sunpkhi z26.d, z6.s
; CHECK-NOI8MM-NEXT: sunpklo z6.d, z6.s
; CHECK-NOI8MM-NEXT: sunpklo z27.d, z4.s
; CHECK-NOI8MM-NEXT: uunpklo z28.d, z7.s
; CHECK-NOI8MM-NEXT: uunpklo z29.d, z5.s
; CHECK-NOI8MM-NEXT: sunpkhi z4.d, z4.s
; CHECK-NOI8MM-NEXT: uunpkhi z7.d, z7.s
; CHECK-NOI8MM-NEXT: uunpkhi z5.d, z5.s
; CHECK-NOI8MM-NEXT: sunpkhi z30.d, z24.s
; CHECK-NOI8MM-NEXT: sunpkhi z31.d, z2.s
; CHECK-NOI8MM-NEXT: sunpklo z24.d, z24.s
; CHECK-NOI8MM-NEXT: sunpklo z2.d, z2.s
; CHECK-NOI8MM-NEXT: uunpkhi z8.d, z25.s
; CHECK-NOI8MM-NEXT: uunpklo z25.d, z25.s
; CHECK-NOI8MM-NEXT: uunpklo z9.d, z3.s
; CHECK-NOI8MM-NEXT: mul z27.d, z27.d, z29.d
; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z6.d, z28.d
; CHECK-NOI8MM-NEXT: uunpkhi z3.d, z3.s
; CHECK-NOI8MM-NEXT: mul z4.d, z4.d, z5.d
; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z26.d, z7.d
; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z2.d, z9.d
; CHECK-NOI8MM-NEXT: movprfx z2, z27
; CHECK-NOI8MM-NEXT: mla z2.d, p0/m, z24.d, z25.d
; CHECK-NOI8MM-NEXT: ldr z9, [sp] // 16-byte Folded Reload
; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z31.d, z3.d
; CHECK-NOI8MM-NEXT: movprfx z3, z4
; CHECK-NOI8MM-NEXT: mla z3.d, p0/m, z30.d, z8.d
; CHECK-NOI8MM-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NOI8MM-NEXT: add z0.d, z2.d, z0.d
; CHECK-NOI8MM-NEXT: add z1.d, z3.d, z1.d
; CHECK-NOI8MM-NEXT: addvl sp, sp, #2
; CHECK-NOI8MM-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NOI8MM-NEXT: ret
entry:
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
%b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i64>
%mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(
<vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult)
ret <vscale x 4 x i64> %partial.reduce
}
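
; Negative test: the <vscale x 8 x i8> inputs only give a 2:1 reduction into
; the .s accumulator, so no UDOT is emitted.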
define <vscale x 4 x i32> @not_udot(<vscale x 4 x i32> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
; CHECK-LABEL: not_udot:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: and z1.h, z1.h, #0xff
; CHECK-NEXT: and z2.h, z2.h, #0xff
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: uunpklo z3.s, z1.h
; CHECK-NEXT: uunpklo z4.s, z2.h
; CHECK-NEXT: uunpkhi z1.s, z1.h
; CHECK-NEXT: uunpkhi z2.s, z2.h
; CHECK-NEXT: mla z0.s, p0/m, z3.s, z4.s
; CHECK-NEXT: mla z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT: ret
entry:
%a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i32>
%b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i32>
%mult = mul nuw nsw <vscale x 8 x i32> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %mult)
ret <vscale x 4 x i32> %partial.reduce
}
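
; Likewise for <vscale x 4 x i16> inputs reducing 2:1 into .d lanes.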
define <vscale x 2 x i64> @not_udot_wide(<vscale x 2 x i64> %acc, <vscale x 4 x i16> %a, <vscale x 4 x i16> %b) {
; CHECK-LABEL: not_udot_wide:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: and z1.s, z1.s, #0xffff
; CHECK-NEXT: and z2.s, z2.s, #0xffff
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uunpklo z3.d, z1.s
; CHECK-NEXT: uunpklo z4.d, z2.s
; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: uunpkhi z2.d, z2.s
; CHECK-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT: ret
entry:
%a.wide = zext <vscale x 4 x i16> %a to <vscale x 4 x i64>
%b.wide = zext <vscale x 4 x i16> %b to <vscale x 4 x i64>
%mult = mul nuw nsw <vscale x 4 x i64> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64(<vscale x 2 x i64> %acc, <vscale x 4 x i64> %mult)
ret <vscale x 2 x i64> %partial.reduce
}
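
; There is no mixed-sign dot product taking i16 inputs, so zext * sext from
; i16 is always expanded.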
define <vscale x 2 x i64> @not_usdot(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: not_usdot:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uunpklo z3.s, z1.h
; CHECK-NEXT: sunpklo z4.s, z2.h
; CHECK-NEXT: uunpkhi z1.s, z1.h
; CHECK-NEXT: sunpkhi z2.s, z2.h
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uunpklo z5.d, z3.s
; CHECK-NEXT: uunpkhi z3.d, z3.s
; CHECK-NEXT: sunpklo z6.d, z4.s
; CHECK-NEXT: sunpkhi z4.d, z4.s
; CHECK-NEXT: uunpklo z7.d, z1.s
; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: sunpklo z24.d, z2.s
; CHECK-NEXT: sunpkhi z2.d, z2.s
; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-NEXT: mul z3.d, z3.d, z4.d
; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT: movprfx z1, z3
; CHECK-NEXT: mla z1.d, p0/m, z7.d, z24.d
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: ret
entry:
%a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
%b.wide = sext <vscale x 8 x i16> %b to <vscale x 8 x i64>
%mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
ret <vscale x 2 x i64> %partial.reduce
}
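
; Same for the sext * zext i16 case.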
define <vscale x 2 x i64> @not_sudot(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: not_sudot:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sunpklo z3.s, z1.h
; CHECK-NEXT: uunpklo z4.s, z2.h
; CHECK-NEXT: sunpkhi z1.s, z1.h
; CHECK-NEXT: uunpkhi z2.s, z2.h
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sunpklo z5.d, z3.s
; CHECK-NEXT: sunpkhi z3.d, z3.s
; CHECK-NEXT: uunpklo z6.d, z4.s
; CHECK-NEXT: uunpkhi z4.d, z4.s
; CHECK-NEXT: sunpklo z7.d, z1.s
; CHECK-NEXT: sunpkhi z1.d, z1.s
; CHECK-NEXT: uunpklo z24.d, z2.s
; CHECK-NEXT: uunpkhi z2.d, z2.s
; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-NEXT: mul z3.d, z3.d, z4.d
; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT: movprfx z1, z3
; CHECK-NEXT: mla z1.d, p0/m, z7.d, z24.d
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: ret
entry:
%a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
%b.wide = zext <vscale x 8 x i16> %b to <vscale x 8 x i64>
%mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
ret <vscale x 2 x i64> %partial.reduce
}