; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+v,+m,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
; RUN: llc -mtriple=riscv64 -mattr=+v,+m,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
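; These tests check that a chain of extractelement + scalar binop over a
; leading prefix of a fixed-length vector is recognized and lowered to a
; single RVV reduction (vredsum.vs and friends) seeded with an appropriate
; start value, instead of a sequence of scalar extracts and ops.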
define i32 @reduce_sum_2xi32(<2 x i32> %v) {
; CHECK-LABEL: reduce_sum_2xi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vmv.s.x v9, zero
; CHECK-NEXT: vredsum.vs v8, v8, v9
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%e0 = extractelement <2 x i32> %v, i32 0
%e1 = extractelement <2 x i32> %v, i32 1
%add0 = add i32 %e0, %e1
ret i32 %add0
}
define i32 @reduce_sum_4xi32(<4 x i32> %v) {
; CHECK-LABEL: reduce_sum_4xi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmv.s.x v9, zero
; CHECK-NEXT: vredsum.vs v8, v8, v9
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%e0 = extractelement <4 x i32> %v, i32 0
%e1 = extractelement <4 x i32> %v, i32 1
%e2 = extractelement <4 x i32> %v, i32 2
%e3 = extractelement <4 x i32> %v, i32 3
%add0 = add i32 %e0, %e1
%add1 = add i32 %add0, %e2
%add2 = add i32 %add1, %e3
ret i32 %add2
}
define i32 @reduce_sum_8xi32(<8 x i32> %v) {
; CHECK-LABEL: reduce_sum_8xi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vmv.s.x v10, zero
; CHECK-NEXT: vredsum.vs v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%e0 = extractelement <8 x i32> %v, i32 0
%e1 = extractelement <8 x i32> %v, i32 1
%e2 = extractelement <8 x i32> %v, i32 2
%e3 = extractelement <8 x i32> %v, i32 3
%e4 = extractelement <8 x i32> %v, i32 4
%e5 = extractelement <8 x i32> %v, i32 5
%e6 = extractelement <8 x i32> %v, i32 6
%e7 = extractelement <8 x i32> %v, i32 7
%add0 = add i32 %e0, %e1
%add1 = add i32 %add0, %e2
%add2 = add i32 %add1, %e3
%add3 = add i32 %add2, %e4
%add4 = add i32 %add3, %e5
%add5 = add i32 %add4, %e6
%add6 = add i32 %add5, %e7
ret i32 %add6
}
define i32 @reduce_sum_16xi32(<16 x i32> %v) {
; CHECK-LABEL: reduce_sum_16xi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT: vmv.s.x v12, zero
; CHECK-NEXT: vredsum.vs v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%e2 = extractelement <16 x i32> %v, i32 2
%e3 = extractelement <16 x i32> %v, i32 3
%e4 = extractelement <16 x i32> %v, i32 4
%e5 = extractelement <16 x i32> %v, i32 5
%e6 = extractelement <16 x i32> %v, i32 6
%e7 = extractelement <16 x i32> %v, i32 7
%e8 = extractelement <16 x i32> %v, i32 8
%e9 = extractelement <16 x i32> %v, i32 9
%e10 = extractelement <16 x i32> %v, i32 10
%e11 = extractelement <16 x i32> %v, i32 11
%e12 = extractelement <16 x i32> %v, i32 12
%e13 = extractelement <16 x i32> %v, i32 13
%e14 = extractelement <16 x i32> %v, i32 14
%e15 = extractelement <16 x i32> %v, i32 15
%add0 = add i32 %e0, %e1
%add1 = add i32 %add0, %e2
%add2 = add i32 %add1, %e3
%add3 = add i32 %add2, %e4
%add4 = add i32 %add3, %e5
%add5 = add i32 %add4, %e6
%add6 = add i32 %add5, %e7
%add7 = add i32 %add6, %e8
%add8 = add i32 %add7, %e9
%add9 = add i32 %add8, %e10
%add10 = add i32 %add9, %e11
%add11 = add i32 %add10, %e12
%add12 = add i32 %add11, %e13
%add13 = add i32 %add12, %e14
%add14 = add i32 %add13, %e15
ret i32 %add14
}
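; The following tests reduce only a leading prefix of a 16-element load.
; The expected lowering shrinks VL to the prefix length and picks the
; matching LMUL: mf2 for 2 elements, m1 for up to 4, m2 for up to 8, and
; m4 for up to 16.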
define i32 @reduce_sum_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v9, zero
; CHECK-NEXT: vredsum.vs v8, v8, v9
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%add0 = add i32 %e0, %e1
ret i32 %add0
}
define i32 @reduce_sum_16xi32_prefix3(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix3:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v9, zero
; CHECK-NEXT: vredsum.vs v8, v8, v9
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%e2 = extractelement <16 x i32> %v, i32 2
%add0 = add i32 %e0, %e1
%add1 = add i32 %add0, %e2
ret i32 %add1
}
define i32 @reduce_sum_16xi32_prefix4(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix4:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v9, zero
; CHECK-NEXT: vredsum.vs v8, v8, v9
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%e2 = extractelement <16 x i32> %v, i32 2
%e3 = extractelement <16 x i32> %v, i32 3
%add0 = add i32 %e0, %e1
%add1 = add i32 %add0, %e2
%add2 = add i32 %add1, %e3
ret i32 %add2
}
define i32 @reduce_sum_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix5:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v10, zero
; CHECK-NEXT: vredsum.vs v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%e2 = extractelement <16 x i32> %v, i32 2
%e3 = extractelement <16 x i32> %v, i32 3
%e4 = extractelement <16 x i32> %v, i32 4
%add0 = add i32 %e0, %e1
%add1 = add i32 %add0, %e2
%add2 = add i32 %add1, %e3
%add3 = add i32 %add2, %e4
ret i32 %add3
}
define i32 @reduce_sum_16xi32_prefix6(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix6:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v10, zero
; CHECK-NEXT: vredsum.vs v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%e2 = extractelement <16 x i32> %v, i32 2
%e3 = extractelement <16 x i32> %v, i32 3
%e4 = extractelement <16 x i32> %v, i32 4
%e5 = extractelement <16 x i32> %v, i32 5
%add0 = add i32 %e0, %e1
%add1 = add i32 %add0, %e2
%add2 = add i32 %add1, %e3
%add3 = add i32 %add2, %e4
%add4 = add i32 %add3, %e5
ret i32 %add4
}
define i32 @reduce_sum_16xi32_prefix7(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix7:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v10, zero
; CHECK-NEXT: vredsum.vs v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%e2 = extractelement <16 x i32> %v, i32 2
%e3 = extractelement <16 x i32> %v, i32 3
%e4 = extractelement <16 x i32> %v, i32 4
%e5 = extractelement <16 x i32> %v, i32 5
%e6 = extractelement <16 x i32> %v, i32 6
%add0 = add i32 %e0, %e1
%add1 = add i32 %add0, %e2
%add2 = add i32 %add1, %e3
%add3 = add i32 %add2, %e4
%add4 = add i32 %add3, %e5
%add5 = add i32 %add4, %e6
ret i32 %add5
}
define i32 @reduce_sum_16xi32_prefix8(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v10, zero
; CHECK-NEXT: vredsum.vs v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%e2 = extractelement <16 x i32> %v, i32 2
%e3 = extractelement <16 x i32> %v, i32 3
%e4 = extractelement <16 x i32> %v, i32 4
%e5 = extractelement <16 x i32> %v, i32 5
%e6 = extractelement <16 x i32> %v, i32 6
%e7 = extractelement <16 x i32> %v, i32 7
%add0 = add i32 %e0, %e1
%add1 = add i32 %add0, %e2
%add2 = add i32 %add1, %e3
%add3 = add i32 %add2, %e4
%add4 = add i32 %add3, %e5
%add5 = add i32 %add4, %e6
%add6 = add i32 %add5, %e7
ret i32 %add6
}
define i32 @reduce_sum_16xi32_prefix9(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix9:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 9, e32, m4, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v12, zero
; CHECK-NEXT: vredsum.vs v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%e2 = extractelement <16 x i32> %v, i32 2
%e3 = extractelement <16 x i32> %v, i32 3
%e4 = extractelement <16 x i32> %v, i32 4
%e5 = extractelement <16 x i32> %v, i32 5
%e6 = extractelement <16 x i32> %v, i32 6
%e7 = extractelement <16 x i32> %v, i32 7
%e8 = extractelement <16 x i32> %v, i32 8
%add0 = add i32 %e0, %e1
%add1 = add i32 %add0, %e2
%add2 = add i32 %add1, %e3
%add3 = add i32 %add2, %e4
%add4 = add i32 %add3, %e5
%add5 = add i32 %add4, %e6
%add6 = add i32 %add5, %e7
%add7 = add i32 %add6, %e8
ret i32 %add7
}
define i32 @reduce_sum_16xi32_prefix13(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix13:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 13, e32, m4, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v12, zero
; CHECK-NEXT: vredsum.vs v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%e2 = extractelement <16 x i32> %v, i32 2
%e3 = extractelement <16 x i32> %v, i32 3
%e4 = extractelement <16 x i32> %v, i32 4
%e5 = extractelement <16 x i32> %v, i32 5
%e6 = extractelement <16 x i32> %v, i32 6
%e7 = extractelement <16 x i32> %v, i32 7
%e8 = extractelement <16 x i32> %v, i32 8
%e9 = extractelement <16 x i32> %v, i32 9
%e10 = extractelement <16 x i32> %v, i32 10
%e11 = extractelement <16 x i32> %v, i32 11
%e12 = extractelement <16 x i32> %v, i32 12
%add0 = add i32 %e0, %e1
%add1 = add i32 %add0, %e2
%add2 = add i32 %add1, %e3
%add3 = add i32 %add2, %e4
%add4 = add i32 %add3, %e5
%add5 = add i32 %add4, %e6
%add6 = add i32 %add5, %e7
%add7 = add i32 %add6, %e8
%add8 = add i32 %add7, %e9
%add9 = add i32 %add8, %e10
%add10 = add i32 %add9, %e11
%add11 = add i32 %add10, %e12
ret i32 %add11
}
define i32 @reduce_sum_16xi32_prefix14(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix14:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 14, e32, m4, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v12, zero
; CHECK-NEXT: vredsum.vs v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%e2 = extractelement <16 x i32> %v, i32 2
%e3 = extractelement <16 x i32> %v, i32 3
%e4 = extractelement <16 x i32> %v, i32 4
%e5 = extractelement <16 x i32> %v, i32 5
%e6 = extractelement <16 x i32> %v, i32 6
%e7 = extractelement <16 x i32> %v, i32 7
%e8 = extractelement <16 x i32> %v, i32 8
%e9 = extractelement <16 x i32> %v, i32 9
%e10 = extractelement <16 x i32> %v, i32 10
%e11 = extractelement <16 x i32> %v, i32 11
%e12 = extractelement <16 x i32> %v, i32 12
%e13 = extractelement <16 x i32> %v, i32 13
%add0 = add i32 %e0, %e1
%add1 = add i32 %add0, %e2
%add2 = add i32 %add1, %e3
%add3 = add i32 %add2, %e4
%add4 = add i32 %add3, %e5
%add5 = add i32 %add4, %e6
%add6 = add i32 %add5, %e7
%add7 = add i32 %add6, %e8
%add8 = add i32 %add7, %e9
%add9 = add i32 %add8, %e10
%add10 = add i32 %add9, %e11
%add11 = add i32 %add10, %e12
%add12 = add i32 %add11, %e13
ret i32 %add12
}
define i32 @reduce_sum_16xi32_prefix15(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix15:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 15, e32, m4, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v12, zero
; CHECK-NEXT: vredsum.vs v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%e2 = extractelement <16 x i32> %v, i32 2
%e3 = extractelement <16 x i32> %v, i32 3
%e4 = extractelement <16 x i32> %v, i32 4
%e5 = extractelement <16 x i32> %v, i32 5
%e6 = extractelement <16 x i32> %v, i32 6
%e7 = extractelement <16 x i32> %v, i32 7
%e8 = extractelement <16 x i32> %v, i32 8
%e9 = extractelement <16 x i32> %v, i32 9
%e10 = extractelement <16 x i32> %v, i32 10
%e11 = extractelement <16 x i32> %v, i32 11
%e12 = extractelement <16 x i32> %v, i32 12
%e13 = extractelement <16 x i32> %v, i32 13
%e14 = extractelement <16 x i32> %v, i32 14
%add0 = add i32 %e0, %e1
%add1 = add i32 %add0, %e2
%add2 = add i32 %add1, %e3
%add3 = add i32 %add2, %e4
%add4 = add i32 %add3, %e5
%add5 = add i32 %add4, %e6
%add6 = add i32 %add5, %e7
%add7 = add i32 %add6, %e8
%add8 = add i32 %add7, %e9
%add9 = add i32 %add8, %e10
%add10 = add i32 %add9, %e11
%add11 = add i32 %add10, %e12
%add12 = add i32 %add11, %e13
%add13 = add i32 %add12, %e14
ret i32 %add13
}
; Check that we can match with the operand order reversed, but the
; reduction order unchanged.
define i32 @reduce_sum_4xi32_op_order(<4 x i32> %v) {
; CHECK-LABEL: reduce_sum_4xi32_op_order:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmv.s.x v9, zero
; CHECK-NEXT: vredsum.vs v8, v8, v9
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%e0 = extractelement <4 x i32> %v, i32 0
%e1 = extractelement <4 x i32> %v, i32 1
%e2 = extractelement <4 x i32> %v, i32 2
%e3 = extractelement <4 x i32> %v, i32 3
%add0 = add i32 %e1, %e0
%add1 = add i32 %e2, %add0
%add2 = add i32 %add1, %e3
ret i32 %add2
}
; Negative test - the reduction order isn't compatible with the current
; incremental matching scheme.
define i32 @reduce_sum_4xi32_reduce_order(<4 x i32> %v) {
; RV32-LABEL: reduce_sum_4xi32_reduce_order:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: vslidedown.vi v9, v8, 1
; RV32-NEXT: vmv.x.s a1, v9
; RV32-NEXT: vslidedown.vi v9, v8, 2
; RV32-NEXT: vmv.x.s a2, v9
; RV32-NEXT: vslidedown.vi v8, v8, 3
; RV32-NEXT: vmv.x.s a3, v8
; RV32-NEXT: add a1, a1, a2
; RV32-NEXT: add a0, a0, a3
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: ret
;
; RV64-LABEL: reduce_sum_4xi32_reduce_order:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: vslidedown.vi v9, v8, 1
; RV64-NEXT: vmv.x.s a1, v9
; RV64-NEXT: vslidedown.vi v9, v8, 2
; RV64-NEXT: vmv.x.s a2, v9
; RV64-NEXT: vslidedown.vi v8, v8, 3
; RV64-NEXT: vmv.x.s a3, v8
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: add a0, a0, a3
; RV64-NEXT: addw a0, a0, a1
; RV64-NEXT: ret
%e0 = extractelement <4 x i32> %v, i32 0
%e1 = extractelement <4 x i32> %v, i32 1
%e2 = extractelement <4 x i32> %v, i32 2
%e3 = extractelement <4 x i32> %v, i32 3
%add0 = add i32 %e1, %e2
%add1 = add i32 %e0, %add0
%add2 = add i32 %add1, %e3
ret i32 %add2
}
;; Most of the corner cases are exercised above; the following tests just
;; make sure that other opcodes work as expected.
define i32 @reduce_xor_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_xor_16xi32_prefix2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v9, zero
; CHECK-NEXT: vredxor.vs v8, v8, v9
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%xor0 = xor i32 %e0, %e1
ret i32 %xor0
}
define i32 @reduce_xor_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_xor_16xi32_prefix5:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v10, zero
; CHECK-NEXT: vredxor.vs v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%e2 = extractelement <16 x i32> %v, i32 2
%e3 = extractelement <16 x i32> %v, i32 3
%e4 = extractelement <16 x i32> %v, i32 4
%xor0 = xor i32 %e0, %e1
%xor1 = xor i32 %xor0, %e2
%xor2 = xor i32 %xor1, %e3
%xor3 = xor i32 %xor2, %e4
ret i32 %xor3
}
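; For the idempotent operations (and/or/smax/smin/umax/umin) the 2-element
; form can reuse the source vector as the start operand (e.g.
; vredand.vs v8, v8, v8), so no identity value has to be materialized;
; add and xor are not idempotent and start from zero instead.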
define i32 @reduce_and_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_and_16xi32_prefix2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vredand.vs v8, v8, v8
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%and0 = and i32 %e0, %e1
ret i32 %and0
}
define i32 @reduce_and_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_and_16xi32_prefix5:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vsetivli zero, 5, e32, m1, ta, ma
; CHECK-NEXT: vmv.v.i v10, -1
; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT: vredand.vs v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%e2 = extractelement <16 x i32> %v, i32 2
%e3 = extractelement <16 x i32> %v, i32 3
%e4 = extractelement <16 x i32> %v, i32 4
%and0 = and i32 %e0, %e1
%and1 = and i32 %and0, %e2
%and2 = and i32 %and1, %e3
%and3 = and i32 %and2, %e4
ret i32 %and3
}
define i32 @reduce_or_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_or_16xi32_prefix2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vredor.vs v8, v8, v8
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%or0 = or i32 %e0, %e1
ret i32 %or0
}
define i32 @reduce_or_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_or_16xi32_prefix5:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v10, zero
; CHECK-NEXT: vredor.vs v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%e2 = extractelement <16 x i32> %v, i32 2
%e3 = extractelement <16 x i32> %v, i32 3
%e4 = extractelement <16 x i32> %v, i32 4
%or0 = or i32 %e0, %e1
%or1 = or i32 %or0, %e2
%or2 = or i32 %or1, %e3
%or3 = or i32 %or2, %e4
ret i32 %or3
}
declare i32 @llvm.smax.i32(i32 %a, i32 %b)
declare i32 @llvm.smin.i32(i32 %a, i32 %b)
declare i32 @llvm.umax.i32(i32 %a, i32 %b)
declare i32 @llvm.umin.i32(i32 %a, i32 %b)
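; For the 5-element min/max forms an identity start value has to be
; materialized: INT_MIN (lui a0, 524288) for smax, INT_MAX (lui plus
; addi a0, a0, -1) for smin, zero for umax, and all-ones for umin.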
define i32 @reduce_smax_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_smax_16xi32_prefix2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vredmax.vs v8, v8, v8
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%smax0 = call i32 @llvm.smax.i32(i32 %e0, i32 %e1)
ret i32 %smax0
}
define i32 @reduce_smax_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_smax_16xi32_prefix5:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: lui a0, 524288
; CHECK-NEXT: vmv.s.x v10, a0
; CHECK-NEXT: vredmax.vs v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%e2 = extractelement <16 x i32> %v, i32 2
%e3 = extractelement <16 x i32> %v, i32 3
%e4 = extractelement <16 x i32> %v, i32 4
%smax0 = call i32 @llvm.smax.i32(i32 %e0, i32 %e1)
%smax1 = call i32 @llvm.smax.i32(i32 %smax0, i32 %e2)
%smax2 = call i32 @llvm.smax.i32(i32 %smax1, i32 %e3)
%smax3 = call i32 @llvm.smax.i32(i32 %smax2, i32 %e4)
ret i32 %smax3
}
define i32 @reduce_smin_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_smin_16xi32_prefix2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vredmin.vs v8, v8, v8
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%smin0 = call i32 @llvm.smin.i32(i32 %e0, i32 %e1)
ret i32 %smin0
}
define i32 @reduce_smin_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_smin_16xi32_prefix5:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: lui a0, 524288
; CHECK-NEXT: addi a0, a0, -1
; CHECK-NEXT: vmv.s.x v10, a0
; CHECK-NEXT: vredmin.vs v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%e2 = extractelement <16 x i32> %v, i32 2
%e3 = extractelement <16 x i32> %v, i32 3
%e4 = extractelement <16 x i32> %v, i32 4
%smin0 = call i32 @llvm.smin.i32(i32 %e0, i32 %e1)
%smin1 = call i32 @llvm.smin.i32(i32 %smin0, i32 %e2)
%smin2 = call i32 @llvm.smin.i32(i32 %smin1, i32 %e3)
%smin3 = call i32 @llvm.smin.i32(i32 %smin2, i32 %e4)
ret i32 %smin3
}
define i32 @reduce_umax_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_umax_16xi32_prefix2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vredmaxu.vs v8, v8, v8
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%umax0 = call i32 @llvm.umax.i32(i32 %e0, i32 %e1)
ret i32 %umax0
}
define i32 @reduce_umax_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_umax_16xi32_prefix5:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v10, zero
; CHECK-NEXT: vredmaxu.vs v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%e2 = extractelement <16 x i32> %v, i32 2
%e3 = extractelement <16 x i32> %v, i32 3
%e4 = extractelement <16 x i32> %v, i32 4
%umax0 = call i32 @llvm.umax.i32(i32 %e0, i32 %e1)
%umax1 = call i32 @llvm.umax.i32(i32 %umax0, i32 %e2)
%umax2 = call i32 @llvm.umax.i32(i32 %umax1, i32 %e3)
%umax3 = call i32 @llvm.umax.i32(i32 %umax2, i32 %e4)
ret i32 %umax3
}
define i32 @reduce_umin_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_umin_16xi32_prefix2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vredminu.vs v8, v8, v8
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%umin0 = call i32 @llvm.umin.i32(i32 %e0, i32 %e1)
ret i32 %umin0
}
define i32 @reduce_umin_16xi32_prefix5(ptr %p) {
; RV32-LABEL: reduce_umin_16xi32_prefix5:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; RV32-NEXT: vle32.v v8, (a0)
; RV32-NEXT: vsetivli zero, 5, e32, m1, ta, ma
; RV32-NEXT: vmv.v.i v10, -1
; RV32-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; RV32-NEXT: vredminu.vs v8, v8, v10
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: ret
;
; RV64-LABEL: reduce_umin_16xi32_prefix5:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; RV64-NEXT: vle32.v v8, (a0)
; RV64-NEXT: li a0, -1
; RV64-NEXT: vmv.s.x v10, a0
; RV64-NEXT: vredminu.vs v8, v8, v10
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%e2 = extractelement <16 x i32> %v, i32 2
%e3 = extractelement <16 x i32> %v, i32 3
%e4 = extractelement <16 x i32> %v, i32 4
%umin0 = call i32 @llvm.umin.i32(i32 %e0, i32 %e1)
%umin1 = call i32 @llvm.umin.i32(i32 %umin0, i32 %e2)
%umin2 = call i32 @llvm.umin.i32(i32 %umin1, i32 %e3)
%umin3 = call i32 @llvm.umin.i32(i32 %umin2, i32 %e4)
ret i32 %umin3
}
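; Note that RV32 and RV64 materialize the all-ones umin start value
; differently: RV32 splats -1 with vmv.v.i (toggling vsetivli to do so),
; while RV64 goes through a scalar register with li a0, -1 and vmv.s.x.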
define float @reduce_fadd_16xf32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_fadd_16xf32_prefix2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v9, zero
; CHECK-NEXT: vfredusum.vs v8, v8, v9
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
%v = load <16 x float>, ptr %p, align 256
%e0 = extractelement <16 x float> %v, i32 0
%e1 = extractelement <16 x float> %v, i32 1
%fadd0 = fadd fast float %e0, %e1
ret float %fadd0
}
define float @reduce_fadd_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_fadd_16xi32_prefix5:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: lui a0, 524288
; CHECK-NEXT: vmv.s.x v10, a0
; CHECK-NEXT: vfredusum.vs v8, v8, v10
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
%v = load <16 x float>, ptr %p, align 256
%e0 = extractelement <16 x float> %v, i32 0
%e1 = extractelement <16 x float> %v, i32 1
%e2 = extractelement <16 x float> %v, i32 2
%e3 = extractelement <16 x float> %v, i32 3
%e4 = extractelement <16 x float> %v, i32 4
%fadd0 = fadd fast float %e0, %e1
%fadd1 = fadd fast float %fadd0, %e2
%fadd2 = fadd fast float %fadd1, %e3
%fadd3 = fadd fast float %fadd2, %e4
ret float %fadd3
}
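; The fadd reductions are seeded with -0.0 (lui a0, 524288 produces the
; 0x80000000 bit pattern), the identity element for floating-point addition.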
;; Corner case tests for fadd associativity
; Negative test - not associative. Would need the strict (ordered) opcode;
; see the sketch after this function.
define float @reduce_fadd_2xf32_non_associative(ptr %p) {
; CHECK-LABEL: reduce_fadd_2xf32_non_associative:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vfmv.f.s fa5, v8
; CHECK-NEXT: vslidedown.vi v8, v8, 1
; CHECK-NEXT: vfmv.f.s fa4, v8
; CHECK-NEXT: fadd.s fa0, fa5, fa4
; CHECK-NEXT: ret
%v = load <2 x float>, ptr %p, align 256
%e0 = extractelement <2 x float> %v, i32 0
%e1 = extractelement <2 x float> %v, i32 1
%fadd0 = fadd float %e0, %e1
ret float %fadd0
}
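; A hand-written sketch (no autogenerated assertions; the function name is
; illustrative) of the strict form referenced above: the ordered reduction
; intrinsic expresses the sequential fadd chain directly and lowers to the
; ordered vfredosum.vs opcode instead of vfredusum.vs.
declare float @llvm.vector.reduce.fadd.v2f32(float, <2 x float>)

define float @sketch_reduce_fadd_2xf32_ordered(ptr %p) {
  %v = load <2 x float>, ptr %p, align 256
  ; Sequential reduction seeded with -0.0; no fast-math flags required.
  %red = call float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> %v)
  ret float %red
}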
; Positive test - the minimal fast-math flag set; reassoc alone suffices.
define float @reduce_fadd_2xf32_reassoc_only(ptr %p) {
; CHECK-LABEL: reduce_fadd_2xf32_reassoc_only:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: lui a0, 524288
; CHECK-NEXT: vmv.s.x v9, a0
; CHECK-NEXT: vfredusum.vs v8, v8, v9
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
%v = load <2 x float>, ptr %p, align 256
%e0 = extractelement <2 x float> %v, i32 0
%e1 = extractelement <2 x float> %v, i32 1
%fadd0 = fadd reassoc float %e0, %e1
ret float %fadd0
}
; Negative test - wrong fast-math flag; ninf does not imply reassociability.
define float @reduce_fadd_2xf32_ninf_only(ptr %p) {
; CHECK-LABEL: reduce_fadd_2xf32_ninf_only:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vfmv.f.s fa5, v8
; CHECK-NEXT: vslidedown.vi v8, v8, 1
; CHECK-NEXT: vfmv.f.s fa4, v8
; CHECK-NEXT: fadd.s fa0, fa5, fa4
; CHECK-NEXT: ret
%v = load <2 x float>, ptr %p, align 256
%e0 = extractelement <2 x float> %v, i32 0
%e1 = extractelement <2 x float> %v, i32 1
%fadd0 = fadd ninf float %e0, %e1
ret float %fadd0
}
; Negative test - last fadd is not associative
define float @reduce_fadd_4xf32_non_associative(ptr %p) {
; CHECK-LABEL: reduce_fadd_4xf32_non_associative:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vslidedown.vi v9, v8, 3
; CHECK-NEXT: vfmv.f.s fa5, v9
; CHECK-NEXT: lui a0, 524288
; CHECK-NEXT: vmv.s.x v9, a0
; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma
; CHECK-NEXT: vfredusum.vs v8, v8, v9
; CHECK-NEXT: vfmv.f.s fa4, v8
; CHECK-NEXT: fadd.s fa0, fa4, fa5
; CHECK-NEXT: ret
%v = load <4 x float>, ptr %p, align 256
%e0 = extractelement <4 x float> %v, i32 0
%e1 = extractelement <4 x float> %v, i32 1
%e2 = extractelement <4 x float> %v, i32 2
%e3 = extractelement <4 x float> %v, i32 3
%fadd0 = fadd fast float %e0, %e1
%fadd1 = fadd fast float %fadd0, %e2
%fadd2 = fadd float %fadd1, %e3
ret float %fadd2
}
; Negative test - first fadd is not associative
; We could form a reduce for elements 2 and 3.
define float @reduce_fadd_4xf32_non_associative2(ptr %p) {
; CHECK-LABEL: reduce_fadd_4xf32_non_associative2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vfmv.f.s fa5, v8
; CHECK-NEXT: vslidedown.vi v9, v8, 1
; CHECK-NEXT: vfmv.f.s fa4, v9
; CHECK-NEXT: vslidedown.vi v9, v8, 2
; CHECK-NEXT: vfmv.f.s fa3, v9
; CHECK-NEXT: vslidedown.vi v8, v8, 3
; CHECK-NEXT: vfmv.f.s fa2, v8
; CHECK-NEXT: fadd.s fa5, fa5, fa4
; CHECK-NEXT: fadd.s fa4, fa3, fa2
; CHECK-NEXT: fadd.s fa0, fa5, fa4
; CHECK-NEXT: ret
%v = load <4 x float>, ptr %p, align 256
%e0 = extractelement <4 x float> %v, i32 0
%e1 = extractelement <4 x float> %v, i32 1
%e2 = extractelement <4 x float> %v, i32 2
%e3 = extractelement <4 x float> %v, i32 3
%fadd0 = fadd float %e0, %e1
%fadd1 = fadd fast float %fadd0, %e2
%fadd2 = fadd fast float %fadd1, %e3
ret float %fadd2
}