; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s
define <vscale x 2 x i64> @add_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: add_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: add z0.d, z0.d, z1.d
; CHECK-NEXT: ret
%res = add <vscale x 2 x i64> %a, %b
ret <vscale x 2 x i64> %res
}
define <vscale x 4 x i32> @add_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: add_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: add z0.s, z0.s, z1.s
; CHECK-NEXT: ret
%res = add <vscale x 4 x i32> %a, %b
ret <vscale x 4 x i32> %res
}
define <vscale x 8 x i16> @add_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: add_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: ret
%res = add <vscale x 8 x i16> %a, %b
ret <vscale x 8 x i16> %res
}
define <vscale x 16 x i8> @add_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: add_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: add z0.b, z0.b, z1.b
; CHECK-NEXT: ret
%res = add <vscale x 16 x i8> %a, %b
ret <vscale x 16 x i8> %res
}
define <vscale x 16 x i8> @add_i8_zero(<vscale x 16 x i8> %a) {
; CHECK-LABEL: add_i8_zero:
; CHECK: // %bb.0:
; CHECK-NEXT: ret
%res = add <vscale x 16 x i8> %a, zeroinitializer
ret <vscale x 16 x i8> %res
}
define <vscale x 1 x i32> @add_nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b) {
; CHECK-LABEL: add_nxv1i32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: add z0.s, z0.s, z1.s
; CHECK-NEXT: ret
entry:
%c = add <vscale x 1 x i32> %a, %b
ret <vscale x 1 x i32> %c
}
define <vscale x 2 x i64> @sub_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: sub_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: sub z0.d, z0.d, z1.d
; CHECK-NEXT: ret
%res = sub <vscale x 2 x i64> %a, %b
ret <vscale x 2 x i64> %res
}
define <vscale x 4 x i32> @sub_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: sub_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: sub z0.s, z0.s, z1.s
; CHECK-NEXT: ret
%res = sub <vscale x 4 x i32> %a, %b
ret <vscale x 4 x i32> %res
}
define <vscale x 8 x i16> @sub_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: sub_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: sub z0.h, z0.h, z1.h
; CHECK-NEXT: ret
%res = sub <vscale x 8 x i16> %a, %b
ret <vscale x 8 x i16> %res
}
define <vscale x 16 x i8> @sub_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: sub_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: sub z0.b, z0.b, z1.b
; CHECK-NEXT: ret
%res = sub <vscale x 16 x i8> %a, %b
ret <vscale x 16 x i8> %res
}
define <vscale x 16 x i8> @sub_i8_zero(<vscale x 16 x i8> %a) {
; CHECK-LABEL: sub_i8_zero:
; CHECK: // %bb.0:
; CHECK-NEXT: ret
%res = sub <vscale x 16 x i8> %a, zeroinitializer
ret <vscale x 16 x i8> %res
}
define <vscale x 16 x i8> @abs_nxv16i8(<vscale x 16 x i8> %a) {
; CHECK-LABEL: abs_nxv16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: abs z0.b, p0/m, z0.b
; CHECK-NEXT: ret
%res = call <vscale x 16 x i8> @llvm.abs.nxv16i8(<vscale x 16 x i8> %a, i1 false)
ret <vscale x 16 x i8> %res
}
define <vscale x 8 x i16> @abs_nxv8i16(<vscale x 8 x i16> %a) {
; CHECK-LABEL: abs_nxv8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: abs z0.h, p0/m, z0.h
; CHECK-NEXT: ret
%res = call <vscale x 8 x i16> @llvm.abs.nxv8i16(<vscale x 8 x i16> %a, i1 false)
ret <vscale x 8 x i16> %res
}
define <vscale x 4 x i32> @abs_nxv4i32(<vscale x 4 x i32> %a) {
; CHECK-LABEL: abs_nxv4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: abs z0.s, p0/m, z0.s
; CHECK-NEXT: ret
%res = call <vscale x 4 x i32> @llvm.abs.nxv4i32(<vscale x 4 x i32> %a, i1 false)
ret <vscale x 4 x i32> %res
}
define <vscale x 2 x i64> @abs_nxv2i64(<vscale x 2 x i64> %a) {
; CHECK-LABEL: abs_nxv2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: abs z0.d, p0/m, z0.d
; CHECK-NEXT: ret
%res = call <vscale x 2 x i64> @llvm.abs.nxv2i64(<vscale x 2 x i64> %a, i1 false)
ret <vscale x 2 x i64> %res
}
define <vscale x 4 x i16> @abs_nxv4i16(<vscale x 4 x i16> %a) {
; CHECK-LABEL: abs_nxv4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: sxth z0.s, p0/m, z0.s
; CHECK-NEXT: abs z0.s, p0/m, z0.s
; CHECK-NEXT: ret
%res = call <vscale x 4 x i16> @llvm.abs.nxv4i16(<vscale x 4 x i16> %a, i1 false)
ret <vscale x 4 x i16> %res
}
define <vscale x 32 x i8> @abs_nxv32i8(<vscale x 32 x i8> %a) {
; CHECK-LABEL: abs_nxv32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: abs z0.b, p0/m, z0.b
; CHECK-NEXT: abs z1.b, p0/m, z1.b
; CHECK-NEXT: ret
%res = call <vscale x 32 x i8> @llvm.abs.nxv32i8(<vscale x 32 x i8> %a, i1 false)
ret <vscale x 32 x i8> %res
}
define <vscale x 8 x i64> @abs_nxv8i64(<vscale x 8 x i64> %a) {
; CHECK-LABEL: abs_nxv8i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: abs z0.d, p0/m, z0.d
; CHECK-NEXT: abs z1.d, p0/m, z1.d
; CHECK-NEXT: abs z2.d, p0/m, z2.d
; CHECK-NEXT: abs z3.d, p0/m, z3.d
; CHECK-NEXT: ret
%res = call <vscale x 8 x i64> @llvm.abs.nxv8i64(<vscale x 8 x i64> %a, i1 false)
ret <vscale x 8 x i64> %res
}
define <vscale x 2 x i64> @sqadd_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: sqadd_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: sqadd z0.d, z0.d, z1.d
; CHECK-NEXT: ret
%res = call <vscale x 2 x i64> @llvm.sadd.sat.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
ret <vscale x 2 x i64> %res
}
define <vscale x 4 x i32> @sqadd_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: sqadd_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: sqadd z0.s, z0.s, z1.s
; CHECK-NEXT: ret
%res = call <vscale x 4 x i32> @llvm.sadd.sat.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
ret <vscale x 4 x i32> %res
}
define <vscale x 4 x i32> @sqadd_i32_zero(<vscale x 4 x i32> %a) {
; CHECK-LABEL: sqadd_i32_zero:
; CHECK: // %bb.0:
; CHECK-NEXT: ret
%res = call <vscale x 4 x i32> @llvm.sadd.sat.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> zeroinitializer)
ret <vscale x 4 x i32> %res
}
define <vscale x 8 x i16> @sqadd_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: sqadd_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: sqadd z0.h, z0.h, z1.h
; CHECK-NEXT: ret
%res = call <vscale x 8 x i16> @llvm.sadd.sat.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
ret <vscale x 8 x i16> %res
}
define <vscale x 16 x i8> @sqadd_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: sqadd_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: sqadd z0.b, z0.b, z1.b
; CHECK-NEXT: ret
%res = call <vscale x 16 x i8> @llvm.sadd.sat.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
ret <vscale x 16 x i8> %res
}
define <vscale x 2 x i64> @sqsub_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: sqsub_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: sqsub z0.d, z0.d, z1.d
; CHECK-NEXT: ret
%res = call <vscale x 2 x i64> @llvm.ssub.sat.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
ret <vscale x 2 x i64> %res
}
define <vscale x 2 x i64> @sqsub_i64_zero(<vscale x 2 x i64> %a) {
; CHECK-LABEL: sqsub_i64_zero:
; CHECK: // %bb.0:
; CHECK-NEXT: ret
%res = call <vscale x 2 x i64> @llvm.ssub.sat.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> zeroinitializer)
ret <vscale x 2 x i64> %res
}
define <vscale x 4 x i32> @sqsub_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: sqsub_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: sqsub z0.s, z0.s, z1.s
; CHECK-NEXT: ret
%res = call <vscale x 4 x i32> @llvm.ssub.sat.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
ret <vscale x 4 x i32> %res
}
define <vscale x 8 x i16> @sqsub_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: sqsub_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: sqsub z0.h, z0.h, z1.h
; CHECK-NEXT: ret
%res = call <vscale x 8 x i16> @llvm.ssub.sat.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
ret <vscale x 8 x i16> %res
}
define <vscale x 16 x i8> @sqsub_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: sqsub_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: sqsub z0.b, z0.b, z1.b
; CHECK-NEXT: ret
%res = call <vscale x 16 x i8> @llvm.ssub.sat.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
ret <vscale x 16 x i8> %res
}
define <vscale x 2 x i64> @uqadd_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: uqadd_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: uqadd z0.d, z0.d, z1.d
; CHECK-NEXT: ret
%res = call <vscale x 2 x i64> @llvm.uadd.sat.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
ret <vscale x 2 x i64> %res
}
define <vscale x 4 x i32> @uqadd_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: uqadd_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: uqadd z0.s, z0.s, z1.s
; CHECK-NEXT: ret
%res = call <vscale x 4 x i32> @llvm.uadd.sat.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
ret <vscale x 4 x i32> %res
}
define <vscale x 8 x i16> @uqadd_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: uqadd_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: uqadd z0.h, z0.h, z1.h
; CHECK-NEXT: ret
%res = call <vscale x 8 x i16> @llvm.uadd.sat.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
ret <vscale x 8 x i16> %res
}
define <vscale x 16 x i8> @uqadd_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: uqadd_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: uqadd z0.b, z0.b, z1.b
; CHECK-NEXT: ret
%res = call <vscale x 16 x i8> @llvm.uadd.sat.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
ret <vscale x 16 x i8> %res
}
define <vscale x 2 x i64> @uqsub_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: uqsub_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: uqsub z0.d, z0.d, z1.d
; CHECK-NEXT: ret
%res = call <vscale x 2 x i64> @llvm.usub.sat.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
ret <vscale x 2 x i64> %res
}
define <vscale x 4 x i32> @uqsub_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: uqsub_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: uqsub z0.s, z0.s, z1.s
; CHECK-NEXT: ret
%res = call <vscale x 4 x i32> @llvm.usub.sat.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
ret <vscale x 4 x i32> %res
}
define <vscale x 8 x i16> @uqsub_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: uqsub_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: uqsub z0.h, z0.h, z1.h
; CHECK-NEXT: ret
%res = call <vscale x 8 x i16> @llvm.usub.sat.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
ret <vscale x 8 x i16> %res
}
define <vscale x 16 x i8> @uqsub_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: uqsub_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: uqsub z0.b, z0.b, z1.b
; CHECK-NEXT: ret
%res = call <vscale x 16 x i8> @llvm.usub.sat.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
ret <vscale x 16 x i8> %res
}
define <vscale x 16 x i8> @mad_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c) {
; CHECK-LABEL: mad_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mad z0.b, p0/m, z1.b, z2.b
; CHECK-NEXT: ret
%prod = mul <vscale x 16 x i8> %a, %b
%res = add <vscale x 16 x i8> %c, %prod
ret <vscale x 16 x i8> %res
}
define <vscale x 8 x i16> @mad_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c) {
; CHECK-LABEL: mad_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mad z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT: ret
%prod = mul <vscale x 8 x i16> %a, %b
%res = add <vscale x 8 x i16> %c, %prod
ret <vscale x 8 x i16> %res
}
define <vscale x 4 x i32> @mad_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
; CHECK-LABEL: mad_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mad z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT: ret
%prod = mul <vscale x 4 x i32> %a, %b
%res = add <vscale x 4 x i32> %c, %prod
ret <vscale x 4 x i32> %res
}
define <vscale x 2 x i64> @mad_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c) {
; CHECK-LABEL: mad_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mad z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT: ret
%prod = mul <vscale x 2 x i64> %a, %b
%res = add <vscale x 2 x i64> %c, %prod
ret <vscale x 2 x i64> %res
}
define <vscale x 16 x i8> @mla_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c) {
; CHECK-LABEL: mla_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mla z0.b, p0/m, z1.b, z2.b
; CHECK-NEXT: ret
%prod = mul <vscale x 16 x i8> %b, %c
%res = add <vscale x 16 x i8> %a, %prod
ret <vscale x 16 x i8> %res
}
define <vscale x 8 x i16> @mla_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c) {
; CHECK-LABEL: mla_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mla z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT: ret
%prod = mul <vscale x 8 x i16> %b, %c
%res = add <vscale x 8 x i16> %a, %prod
ret <vscale x 8 x i16> %res
}
define <vscale x 4 x i32> @mla_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
; CHECK-LABEL: mla_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mla z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT: ret
%prod = mul <vscale x 4 x i32> %b, %c
%res = add <vscale x 4 x i32> %a, %prod
ret <vscale x 4 x i32> %res
}
define <vscale x 2 x i64> @mla_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c) {
; CHECK-LABEL: mla_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT: ret
%prod = mul <vscale x 2 x i64> %b, %c
%res = add <vscale x 2 x i64> %a, %prod
ret <vscale x 2 x i64> %res
}
define <vscale x 16 x i8> @mla_i8_multiuse(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, ptr %p) {
; CHECK-LABEL: mla_i8_multiuse:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mul z1.b, p0/m, z1.b, z0.b
; CHECK-NEXT: add z0.b, z2.b, z1.b
; CHECK-NEXT: st1b { z1.b }, p0, [x0]
; CHECK-NEXT: ret
%prod = mul <vscale x 16 x i8> %a, %b
store <vscale x 16 x i8> %prod, ptr %p
%res = add <vscale x 16 x i8> %c, %prod
ret <vscale x 16 x i8> %res
}
define <vscale x 16 x i8> @msb_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c) {
; CHECK-LABEL: msb_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: msb z0.b, p0/m, z1.b, z2.b
; CHECK-NEXT: ret
%prod = mul <vscale x 16 x i8> %a, %b
%res = sub <vscale x 16 x i8> %c, %prod
ret <vscale x 16 x i8> %res
}
define <vscale x 8 x i16> @msb_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c) {
; CHECK-LABEL: msb_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: msb z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT: ret
%prod = mul <vscale x 8 x i16> %a, %b
%res = sub <vscale x 8 x i16> %c, %prod
ret <vscale x 8 x i16> %res
}
define <vscale x 4 x i32> @msb_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
; CHECK-LABEL: msb_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: msb z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT: ret
%prod = mul <vscale x 4 x i32> %a, %b
%res = sub <vscale x 4 x i32> %c, %prod
ret <vscale x 4 x i32> %res
}
define <vscale x 2 x i64> @msb_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c) {
; CHECK-LABEL: msb_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: msb z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT: ret
%prod = mul <vscale x 2 x i64> %a, %b
%res = sub <vscale x 2 x i64> %c, %prod
ret <vscale x 2 x i64> %res
}
define <vscale x 16 x i8> @mls_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c) {
; CHECK-LABEL: mls_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mls z0.b, p0/m, z1.b, z2.b
; CHECK-NEXT: ret
%prod = mul <vscale x 16 x i8> %b, %c
%res = sub <vscale x 16 x i8> %a, %prod
ret <vscale x 16 x i8> %res
}
define <vscale x 8 x i16> @mls_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c) {
; CHECK-LABEL: mls_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mls z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT: ret
%prod = mul <vscale x 8 x i16> %b, %c
%res = sub <vscale x 8 x i16> %a, %prod
ret <vscale x 8 x i16> %res
}
define <vscale x 4 x i32> @mls_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
; CHECK-LABEL: mls_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mls z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT: ret
%prod = mul <vscale x 4 x i32> %b, %c
%res = sub <vscale x 4 x i32> %a, %prod
ret <vscale x 4 x i32> %res
}
define <vscale x 2 x i64> @mls_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c) {
; CHECK-LABEL: mls_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mls z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT: ret
%prod = mul <vscale x 2 x i64> %b, %c
%res = sub <vscale x 2 x i64> %a, %prod
ret <vscale x 2 x i64> %res
}
; The test cases below have a constant splat as one of the add/sub operands.
define <vscale x 2 x i64> @muladd_i64_positiveAddend(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
; CHECK-LABEL: muladd_i64_positiveAddend:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, #0xffffffff
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mad z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT: ret
{
%1 = mul <vscale x 2 x i64> %a, %b
%2 = add <vscale x 2 x i64> %1, splat (i64 4294967295)
ret <vscale x 2 x i64> %2
}
define <vscale x 2 x i64> @muladd_i64_negativeAddend(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
; CHECK-LABEL: muladd_i64_negativeAddend:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, #0xffffffff00000001
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mad z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT: ret
{
%1 = mul <vscale x 2 x i64> %a, %b
%2 = add <vscale x 2 x i64> %1, splat (i64 -4294967295)
ret <vscale x 2 x i64> %2
}
define <vscale x 4 x i32> @muladd_i32_positiveAddend(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
; CHECK-LABEL: muladd_i32_positiveAddend:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.s, #0x10000
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mad z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT: ret
{
%1 = mul <vscale x 4 x i32> %a, %b
%2 = add <vscale x 4 x i32> %1, splat (i32 65536)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @muladd_i32_negativeAddend(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
; CHECK-LABEL: muladd_i32_negativeAddend:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.s, #0xffff0000
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mad z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT: ret
{
%1 = mul <vscale x 4 x i32> %a, %b
%2 = add <vscale x 4 x i32> %1, splat (i32 -65536)
ret <vscale x 4 x i32> %2
}
define <vscale x 8 x i16> @muladd_i16_positiveAddend(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
; CHECK-LABEL: muladd_i16_positiveAddend:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.h, #255 // =0xff
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mad z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT: ret
{
%1 = mul <vscale x 8 x i16> %a, %b
%2 = add <vscale x 8 x i16> %1, splat (i16 255)
ret <vscale x 8 x i16> %2
}
define <vscale x 8 x i16> @muladd_i16_negativeAddend(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
; CHECK-LABEL: muladd_i16_negativeAddend:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.h, #-255 // =0xffffffffffffff01
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mad z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT: ret
{
%1 = mul <vscale x 8 x i16> %a, %b
%2 = add <vscale x 8 x i16> %1, splat (i16 -255)
ret <vscale x 8 x i16> %2
}
define <vscale x 16 x i8> @muladd_i8_positiveAddend(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
; CHECK-LABEL: muladd_i8_positiveAddend:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.b, #15 // =0xf
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mad z0.b, p0/m, z1.b, z2.b
; CHECK-NEXT: ret
{
%1 = mul <vscale x 16 x i8> %a, %b
%2 = add <vscale x 16 x i8> %1, splat (i8 15)
ret <vscale x 16 x i8> %2
}
define <vscale x 16 x i8> @muladd_i8_negativeAddend(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
; CHECK-LABEL: muladd_i8_negativeAddend:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.b, #-15 // =0xfffffffffffffff1
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mad z0.b, p0/m, z1.b, z2.b
; CHECK-NEXT: ret
{
%1 = mul <vscale x 16 x i8> %a, %b
%2 = add <vscale x 16 x i8> %1, splat (i8 -15)
ret <vscale x 16 x i8> %2
}
define <vscale x 2 x i64> @mulsub_i64_positiveAddend(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
; CHECK-LABEL: mulsub_i64_positiveAddend:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: mov z1.d, #0xffffffff
; CHECK-NEXT: sub z0.d, z0.d, z1.d
; CHECK-NEXT: ret
{
%1 = mul <vscale x 2 x i64> %a, %b
%2 = sub <vscale x 2 x i64> %1, splat (i64 4294967295)
ret <vscale x 2 x i64> %2
}
define <vscale x 2 x i64> @mulsub_i64_negativeAddend(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
; CHECK-LABEL: mulsub_i64_negativeAddend:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: mov z1.d, #0xffffffff00000001
; CHECK-NEXT: sub z0.d, z0.d, z1.d
; CHECK-NEXT: ret
{
%1 = mul <vscale x 2 x i64> %a, %b
%2 = sub <vscale x 2 x i64> %1, splat (i64 -4294967295)
ret <vscale x 2 x i64> %2
}
define <vscale x 4 x i32> @mulsub_i32_positiveAddend(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
; CHECK-LABEL: mulsub_i32_positiveAddend:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: mov z1.s, #0x10000
; CHECK-NEXT: sub z0.s, z0.s, z1.s
; CHECK-NEXT: ret
{
%1 = mul <vscale x 4 x i32> %a, %b
%2 = sub <vscale x 4 x i32> %1, splat (i32 65536)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @mulsub_i32_negativeAddend(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
; CHECK-LABEL: mulsub_i32_negativeAddend:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: mov z1.s, #0xffff0000
; CHECK-NEXT: sub z0.s, z0.s, z1.s
; CHECK-NEXT: ret
{
%1 = mul <vscale x 4 x i32> %a, %b
%2 = sub <vscale x 4 x i32> %1, splat (i32 -65536)
ret <vscale x 4 x i32> %2
}
define <vscale x 8 x i16> @mulsub_i16_positiveAddend(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
; CHECK-LABEL: mulsub_i16_positiveAddend:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: sub z0.h, z0.h, #255 // =0xff
; CHECK-NEXT: ret
{
%1 = mul <vscale x 8 x i16> %a, %b
%2 = sub <vscale x 8 x i16> %1, splat (i16 255)
ret <vscale x 8 x i16> %2
}
define <vscale x 8 x i16> @mulsub_i16_negativeAddend(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
; CHECK-LABEL: mulsub_i16_negativeAddend:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: mov z1.h, #-255 // =0xffffffffffffff01
; CHECK-NEXT: sub z0.h, z0.h, z1.h
; CHECK-NEXT: ret
{
%1 = mul <vscale x 8 x i16> %a, %b
%2 = sub <vscale x 8 x i16> %1, splat (i16 -255)
ret <vscale x 8 x i16> %2
}
define <vscale x 16 x i8> @mulsub_i8_positiveAddend(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
; CHECK-LABEL: mulsub_i8_positiveAddend:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mul z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: sub z0.b, z0.b, #15 // =0xf
; CHECK-NEXT: ret
{
%1 = mul <vscale x 16 x i8> %a, %b
%2 = sub <vscale x 16 x i8> %1, splat (i8 15)
ret <vscale x 16 x i8> %2
}
define <vscale x 16 x i8> @mulsub_i8_negativeAddend(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
; CHECK-LABEL: mulsub_i8_negativeAddend:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mul z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: sub z0.b, z0.b, #241 // =0xf1
; CHECK-NEXT: ret
{
%1 = mul <vscale x 16 x i8> %a, %b
%2 = sub <vscale x 16 x i8> %1, splat (i8 -15)
ret <vscale x 16 x i8> %2
}
; FIXME: Should generate msb for the mul+sub in this case; swapping the operands of the sub (addend first, product second) yields the required msb instruction. See the commented sketch after this function.
define <vscale x 8 x i16> @multiple_fused_ops(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
; CHECK-LABEL: multiple_fused_ops:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #200 // =0xc8
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov z2.h, w8
; CHECK-NEXT: mla z2.h, p0/m, z0.h, z1.h
; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
; CHECK-NEXT: sub z0.h, z0.h, z1.h
; CHECK-NEXT: ret
{
%1 = mul <vscale x 8 x i16> %a, %b
%2 = add <vscale x 8 x i16> %1, splat (i16 200)
%3 = mul <vscale x 8 x i16> %2, %a
%4 = sub <vscale x 8 x i16> %3, %b
ret <vscale x 8 x i16> %4
}
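; A minimal sketch of the commuted form hinted at by the FIXME above, kept as a
; comment only since its CHECK lines have not been autogenerated here: with the
; sub operands swapped, the final sub matches the addend-minus-product pattern
; that the msb_* tests fuse into msb. The @multiple_fused_ops_commuted name is
; purely illustrative and not part of the checked tests.
;   define <vscale x 8 x i16> @multiple_fused_ops_commuted(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
;     %1 = mul <vscale x 8 x i16> %a, %b
;     %2 = add <vscale x 8 x i16> %1, splat (i16 200)
;     %3 = mul <vscale x 8 x i16> %2, %a
;     %4 = sub <vscale x 8 x i16> %b, %3   ; addend first, product second
;     ret <vscale x 8 x i16> %4
;   }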
define void @mad_in_loop(ptr %dst, ptr %src1, ptr %src2, i32 %n) {
; CHECK-LABEL: mad_in_loop:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cmp w3, #1
; CHECK-NEXT: b.lt .LBB70_3
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: mov w9, w3
; CHECK-NEXT: mov z0.s, #1 // =0x1
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: whilelo p1.s, xzr, x9
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: cntw x10
; CHECK-NEXT: .LBB70_2: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
; CHECK-NEXT: ld1w { z2.s }, p1/z, [x2, x8, lsl #2]
; CHECK-NEXT: mad z1.s, p0/m, z2.s, z0.s
; CHECK-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2]
; CHECK-NEXT: add x8, x8, x10
; CHECK-NEXT: whilelo p1.s, x8, x9
; CHECK-NEXT: b.mi .LBB70_2
; CHECK-NEXT: .LBB70_3: // %for.cond.cleanup
; CHECK-NEXT: ret
entry:
%cmp9 = icmp sgt i32 %n, 0
br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %n to i64
%active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count)
%0 = tail call i64 @llvm.vscale.i64()
%1 = shl nuw nsw i64 %0, 2
br label %vector.body
vector.body: ; preds = %vector.body, %for.body.preheader
%index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
%active.lane.mask = phi <vscale x 4 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
%2 = getelementptr inbounds i32, ptr %src1, i64 %index
%wide.masked.load = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %2, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
%3 = getelementptr inbounds i32, ptr %src2, i64 %index
%wide.masked.load12 = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %3, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
%4 = mul nsw <vscale x 4 x i32> %wide.masked.load12, %wide.masked.load
%5 = add nsw <vscale x 4 x i32> %4, splat (i32 1)
%6 = getelementptr inbounds i32, ptr %dst, i64 %index
tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %5, ptr %6, i32 4, <vscale x 4 x i1> %active.lane.mask)
%index.next = add i64 %index, %1
%active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count)
%7 = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
br i1 %7, label %vector.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
declare i64 @llvm.vscale.i64()
declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64)
declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr nocapture, i32 immarg, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr nocapture, i32 immarg, <vscale x 4 x i1>)
declare <vscale x 16 x i8> @llvm.sadd.sat.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 8 x i16> @llvm.sadd.sat.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 4 x i32> @llvm.sadd.sat.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 2 x i64> @llvm.sadd.sat.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
declare <vscale x 16 x i8> @llvm.ssub.sat.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 8 x i16> @llvm.ssub.sat.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 4 x i32> @llvm.ssub.sat.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 2 x i64> @llvm.ssub.sat.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
declare <vscale x 16 x i8> @llvm.uadd.sat.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 8 x i16> @llvm.uadd.sat.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 4 x i32> @llvm.uadd.sat.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 2 x i64> @llvm.uadd.sat.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
declare <vscale x 16 x i8> @llvm.usub.sat.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 8 x i16> @llvm.usub.sat.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 4 x i32> @llvm.usub.sat.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 2 x i64> @llvm.usub.sat.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
declare <vscale x 32 x i8> @llvm.abs.nxv32i8(<vscale x 32 x i8>, i1)
declare <vscale x 16 x i8> @llvm.abs.nxv16i8(<vscale x 16 x i8>, i1)
declare <vscale x 4 x i16> @llvm.abs.nxv4i16(<vscale x 4 x i16>, i1)
declare <vscale x 8 x i16> @llvm.abs.nxv8i16(<vscale x 8 x i16>, i1)
declare <vscale x 4 x i32> @llvm.abs.nxv4i32(<vscale x 4 x i32>, i1)
declare <vscale x 8 x i64> @llvm.abs.nxv8i64(<vscale x 8 x i64>, i1)
declare <vscale x 2 x i64> @llvm.abs.nxv2i64(<vscale x 2 x i64>, i1)