llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+v,+m,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
; RUN: llc -mtriple=riscv64 -mattr=+v,+m,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64

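; These tests check that chains of scalar binops over extractelements of a
; fixed-length vector are recognized and lowered to a single vector
; reduction (e.g. vredsum.vs seeded with zero, matching what the
; llvm.vector.reduce.add intrinsic lowers to).
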
define i32 @reduce_sum_2xi32(<2 x i32> %v) {
; CHECK-LABEL: reduce_sum_2xi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %e0 = extractelement <2 x i32> %v, i32 0
  %e1 = extractelement <2 x i32> %v, i32 1
  %add0 = add i32 %e0, %e1
  ret i32 %add0
}

define i32 @reduce_sum_4xi32(<4 x i32> %v) {
; CHECK-LABEL: reduce_sum_4xi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %e0 = extractelement <4 x i32> %v, i32 0
  %e1 = extractelement <4 x i32> %v, i32 1
  %e2 = extractelement <4 x i32> %v, i32 2
  %e3 = extractelement <4 x i32> %v, i32 3
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  ret i32 %add2
}

define i32 @reduce_sum_8xi32(<8 x i32> %v) {
; CHECK-LABEL: reduce_sum_8xi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %e0 = extractelement <8 x i32> %v, i32 0
  %e1 = extractelement <8 x i32> %v, i32 1
  %e2 = extractelement <8 x i32> %v, i32 2
  %e3 = extractelement <8 x i32> %v, i32 3
  %e4 = extractelement <8 x i32> %v, i32 4
  %e5 = extractelement <8 x i32> %v, i32 5
  %e6 = extractelement <8 x i32> %v, i32 6
  %e7 = extractelement <8 x i32> %v, i32 7
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  ret i32 %add6
}

define i32 @reduce_sum_16xi32(<16 x i32> %v) {
; CHECK-LABEL: reduce_sum_16xi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %e8 = extractelement <16 x i32> %v, i32 8
  %e9 = extractelement <16 x i32> %v, i32 9
  %e10 = extractelement <16 x i32> %v, i32 10
  %e11 = extractelement <16 x i32> %v, i32 11
  %e12 = extractelement <16 x i32> %v, i32 12
  %e13 = extractelement <16 x i32> %v, i32 13
  %e14 = extractelement <16 x i32> %v, i32 14
  %e15 = extractelement <16 x i32> %v, i32 15
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  %add7 = add i32 %add6, %e8
  %add8 = add i32 %add7, %e9
  %add9 = add i32 %add8, %e10
  %add10 = add i32 %add9, %e11
  %add11 = add i32 %add10, %e12
  %add12 = add i32 %add11, %e13
  %add13 = add i32 %add12, %e14
  %add14 = add i32 %add13, %e15
  ret i32 %add14
}

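; The _prefixN tests reduce only the first N elements of a loaded <16 x i32>.
; The expected lowering sets VL to N in the vsetivli, so only the demanded
; prefix of the vector is read by the reduction.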
define i32 @reduce_sum_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %add0 = add i32 %e0, %e1
  ret i32 %add0
}

define i32 @reduce_sum_16xi32_prefix3(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 3, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  ret i32 %add1
}

define i32 @reduce_sum_16xi32_prefix4(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  ret i32 %add2
}

define i32 @reduce_sum_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  ret i32 %add3
}

define i32 @reduce_sum_16xi32_prefix6(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  ret i32 %add4
}

define i32 @reduce_sum_16xi32_prefix7(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 7, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  ret i32 %add5
}

define i32 @reduce_sum_16xi32_prefix8(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  ret i32 %add6
}

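; From prefix9 onward more than 8 x i32 elements are live, so (assuming the
; default VLEN=128 lowering of fixed-length vectors) the register group bumps
; from m2 to m4.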
define i32 @reduce_sum_16xi32_prefix9(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix9:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 9, e32, m4, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %e8 = extractelement <16 x i32> %v, i32 8
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  %add7 = add i32 %add6, %e8
  ret i32 %add7
}

define i32 @reduce_sum_16xi32_prefix13(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix13:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 13, e32, m4, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %e8 = extractelement <16 x i32> %v, i32 8
  %e9 = extractelement <16 x i32> %v, i32 9
  %e10 = extractelement <16 x i32> %v, i32 10
  %e11 = extractelement <16 x i32> %v, i32 11
  %e12 = extractelement <16 x i32> %v, i32 12
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  %add7 = add i32 %add6, %e8
  %add8 = add i32 %add7, %e9
  %add9 = add i32 %add8, %e10
  %add10 = add i32 %add9, %e11
  %add11 = add i32 %add10, %e12
  ret i32 %add11
}


define i32 @reduce_sum_16xi32_prefix14(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix14:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 14, e32, m4, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %e8 = extractelement <16 x i32> %v, i32 8
  %e9 = extractelement <16 x i32> %v, i32 9
  %e10 = extractelement <16 x i32> %v, i32 10
  %e11 = extractelement <16 x i32> %v, i32 11
  %e12 = extractelement <16 x i32> %v, i32 12
  %e13 = extractelement <16 x i32> %v, i32 13
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  %add7 = add i32 %add6, %e8
  %add8 = add i32 %add7, %e9
  %add9 = add i32 %add8, %e10
  %add10 = add i32 %add9, %e11
  %add11 = add i32 %add10, %e12
  %add12 = add i32 %add11, %e13
  ret i32 %add12
}

define i32 @reduce_sum_16xi32_prefix15(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix15:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 15, e32, m4, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %e8 = extractelement <16 x i32> %v, i32 8
  %e9 = extractelement <16 x i32> %v, i32 9
  %e10 = extractelement <16 x i32> %v, i32 10
  %e11 = extractelement <16 x i32> %v, i32 11
  %e12 = extractelement <16 x i32> %v, i32 12
  %e13 = extractelement <16 x i32> %v, i32 13
  %e14 = extractelement <16 x i32> %v, i32 14
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  %add7 = add i32 %add6, %e8
  %add8 = add i32 %add7, %e9
  %add9 = add i32 %add8, %e10
  %add10 = add i32 %add9, %e11
  %add11 = add i32 %add10, %e12
  %add12 = add i32 %add11, %e13
  %add13 = add i32 %add12, %e14
  ret i32 %add13
}

; Check that we can match with the operand order reversed, but the
; reduction order unchanged.
define i32 @reduce_sum_4xi32_op_order(<4 x i32> %v) {
; CHECK-LABEL: reduce_sum_4xi32_op_order:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %e0 = extractelement <4 x i32> %v, i32 0
  %e1 = extractelement <4 x i32> %v, i32 1
  %e2 = extractelement <4 x i32> %v, i32 2
  %e3 = extractelement <4 x i32> %v, i32 3
  %add0 = add i32 %e1, %e0
  %add1 = add i32 %e2, %add0
  %add2 = add i32 %add1, %e3
  ret i32 %add2
}

; Negative test - Reduction order isn't compatible with the current
; incremental matching scheme.
define i32 @reduce_sum_4xi32_reduce_order(<4 x i32> %v) {
; RV32-LABEL: reduce_sum_4xi32_reduce_order:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vslidedown.vi v9, v8, 1
; RV32-NEXT:    vmv.x.s a1, v9
; RV32-NEXT:    vslidedown.vi v9, v8, 2
; RV32-NEXT:    vmv.x.s a2, v9
; RV32-NEXT:    vslidedown.vi v8, v8, 3
; RV32-NEXT:    vmv.x.s a3, v8
; RV32-NEXT:    add a1, a1, a2
; RV32-NEXT:    add a0, a0, a3
; RV32-NEXT:    add a0, a0, a1
; RV32-NEXT:    ret
;
; RV64-LABEL: reduce_sum_4xi32_reduce_order:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    vslidedown.vi v9, v8, 1
; RV64-NEXT:    vmv.x.s a1, v9
; RV64-NEXT:    vslidedown.vi v9, v8, 2
; RV64-NEXT:    vmv.x.s a2, v9
; RV64-NEXT:    vslidedown.vi v8, v8, 3
; RV64-NEXT:    vmv.x.s a3, v8
; RV64-NEXT:    add a1, a1, a2
; RV64-NEXT:    add a0, a0, a3
; RV64-NEXT:    addw a0, a0, a1
; RV64-NEXT:    ret
  %e0 = extractelement <4 x i32> %v, i32 0
  %e1 = extractelement <4 x i32> %v, i32 1
  %e2 = extractelement <4 x i32> %v, i32 2
  %e3 = extractelement <4 x i32> %v, i32 3
  %add0 = add i32 %e1, %e2
  %add1 = add i32 %e0, %add0
  %add2 = add i32 %add1, %e3
  ret i32 %add2
}

;; Most of the corner cases are exercised above; the following just
;; makes sure that other opcodes work as expected.

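; Note that for the idempotent opcodes (and/or/min/max) the scalar start
; value of the reduction may simply reuse the source vector, as in the
; two-element cases below; add/xor instead need an explicit identity seed.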
define i32 @reduce_xor_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_xor_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredxor.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %xor0 = xor i32 %e0, %e1
  ret i32 %xor0
}

define i32 @reduce_xor_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_xor_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredxor.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %xor0 = xor i32 %e0, %e1
  %xor1 = xor i32 %xor0, %e2
  %xor2 = xor i32 %xor1, %e3
  %xor3 = xor i32 %xor2, %e4
  ret i32 %xor3
}

define i32 @reduce_and_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_and_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredand.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %and0 = and i32 %e0, %e1
  ret i32 %and0
}

define i32 @reduce_and_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_and_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vsetivli zero, 5, e32, m1, ta, ma
; CHECK-NEXT:    vmv.v.i v10, -1
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vredand.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %and0 = and i32 %e0, %e1
  %and1 = and i32 %and0, %e2
  %and2 = and i32 %and1, %e3
  %and3 = and i32 %and2, %e4
  ret i32 %and3
}

define i32 @reduce_or_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_or_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredor.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %or0 = or i32 %e0, %e1
  ret i32 %or0
}

define i32 @reduce_or_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_or_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredor.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %or0 = or i32 %e0, %e1
  %or1 = or i32 %or0, %e2
  %or2 = or i32 %or1, %e3
  %or3 = or i32 %or2, %e4
  ret i32 %or3
}

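; The min/max reductions below are formed from chains of the scalar
; llvm.smax/llvm.smin/llvm.umax/llvm.umin intrinsics declared here.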
declare i32 @llvm.smax.i32(i32 %a, i32 %b)
declare i32 @llvm.smin.i32(i32 %a, i32 %b)
declare i32 @llvm.umax.i32(i32 %a, i32 %b)
declare i32 @llvm.umin.i32(i32 %a, i32 %b)

define i32 @reduce_smax_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_smax_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredmax.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %smax0 = call i32 @llvm.smax.i32(i32 %e0, i32 %e1)
  ret i32 %smax0
}

define i32 @reduce_smax_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_smax_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a0, 524288
; CHECK-NEXT:    vmv.s.x v10, a0
; CHECK-NEXT:    vredmax.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %smax0 = call i32 @llvm.smax.i32(i32 %e0, i32 %e1)
  %smax1 = call i32 @llvm.smax.i32(i32 %smax0, i32 %e2)
  %smax2 = call i32 @llvm.smax.i32(i32 %smax1, i32 %e3)
  %smax3 = call i32 @llvm.smax.i32(i32 %smax2, i32 %e4)
  ret i32 %smax3
}

define i32 @reduce_smin_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_smin_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredmin.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %smin0 = call i32 @llvm.smin.i32(i32 %e0, i32 %e1)
  ret i32 %smin0
}

define i32 @reduce_smin_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_smin_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a0, 524288
; CHECK-NEXT:    addi a0, a0, -1
; CHECK-NEXT:    vmv.s.x v10, a0
; CHECK-NEXT:    vredmin.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %smin0 = call i32 @llvm.smin.i32(i32 %e0, i32 %e1)
  %smin1 = call i32 @llvm.smin.i32(i32 %smin0, i32 %e2)
  %smin2 = call i32 @llvm.smin.i32(i32 %smin1, i32 %e3)
  %smin3 = call i32 @llvm.smin.i32(i32 %smin2, i32 %e4)
  ret i32 %smin3
}

define i32 @reduce_umax_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_umax_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %umax0 = call i32 @llvm.umax.i32(i32 %e0, i32 %e1)
  ret i32 %umax0
}

define i32 @reduce_umax_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_umax_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredmaxu.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %umax0 = call i32 @llvm.umax.i32(i32 %e0, i32 %e1)
  %umax1 = call i32 @llvm.umax.i32(i32 %umax0, i32 %e2)
  %umax2 = call i32 @llvm.umax.i32(i32 %umax1, i32 %e3)
  %umax3 = call i32 @llvm.umax.i32(i32 %umax2, i32 %e4)
  ret i32 %umax3
}

define i32 @reduce_umin_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_umin_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredminu.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %umin0 = call i32 @llvm.umin.i32(i32 %e0, i32 %e1)
  ret i32 %umin0
}

define i32 @reduce_umin_16xi32_prefix5(ptr %p) {
; RV32-LABEL: reduce_umin_16xi32_prefix5:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; RV32-NEXT:    vle32.v v8, (a0)
; RV32-NEXT:    vsetivli zero, 5, e32, m1, ta, ma
; RV32-NEXT:    vmv.v.i v10, -1
; RV32-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; RV32-NEXT:    vredminu.vs v8, v8, v10
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: reduce_umin_16xi32_prefix5:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; RV64-NEXT:    vle32.v v8, (a0)
; RV64-NEXT:    li a0, -1
; RV64-NEXT:    vmv.s.x v10, a0
; RV64-NEXT:    vredminu.vs v8, v8, v10
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %umin0 = call i32 @llvm.umin.i32(i32 %e0, i32 %e1)
  %umin1 = call i32 @llvm.umin.i32(i32 %umin0, i32 %e2)
  %umin2 = call i32 @llvm.umin.i32(i32 %umin1, i32 %e3)
  %umin3 = call i32 @llvm.umin.i32(i32 %umin2, i32 %e4)
  ret i32 %umin3
}

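; Floating-point reductions are only formed when the fadds carry
; reassociation fast-math flags; the corner case tests further below
; exercise the flag handling.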
define float @reduce_fadd_16xf32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_fadd_16xf32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vfredusum.vs v8, v8, v9
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:    ret
  %v = load <16 x float>, ptr %p, align 256
  %e0 = extractelement <16 x float> %v, i32 0
  %e1 = extractelement <16 x float> %v, i32 1
  %fadd0 = fadd fast float %e0, %e1
  ret float %fadd0
}

define float @reduce_fadd_16xf32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_fadd_16xf32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a0, 524288
; CHECK-NEXT:    vmv.s.x v10, a0
; CHECK-NEXT:    vfredusum.vs v8, v8, v10
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:    ret
  %v = load <16 x float>, ptr %p, align 256
  %e0 = extractelement <16 x float> %v, i32 0
  %e1 = extractelement <16 x float> %v, i32 1
  %e2 = extractelement <16 x float> %v, i32 2
  %e3 = extractelement <16 x float> %v, i32 3
  %e4 = extractelement <16 x float> %v, i32 4
  %fadd0 = fadd fast float %e0, %e1
  %fadd1 = fadd fast float %fadd0, %e2
  %fadd2 = fadd fast float %fadd1, %e3
  %fadd3 = fadd fast float %fadd2, %e4
  ret float %fadd3
}

;; Corner case tests for fadd associativity

; Negative test, not associative.  Would need the strict (ordered)
; vfredosum opcode.
define float @reduce_fadd_2xf32_non_associative(ptr %p) {
; CHECK-LABEL: reduce_fadd_2xf32_non_associative:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    vslidedown.vi v8, v8, 1
; CHECK-NEXT:    vfmv.f.s fa4, v8
; CHECK-NEXT:    fadd.s fa0, fa5, fa4
; CHECK-NEXT:    ret
  %v = load <2 x float>, ptr %p, align 256
  %e0 = extractelement <2 x float> %v, i32 0
  %e1 = extractelement <2 x float> %v, i32 1
  %fadd0 = fadd float %e0, %e1
  ret float %fadd0
}

; Positive test - minimal set of fast math flags
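; Note the start value: lui a0, 524288 materializes 0x80000000, i.e. -0.0f,
; which is the fadd identity when nsz is not known.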
define float @reduce_fadd_2xf32_reassoc_only(ptr %p) {
; CHECK-LABEL: reduce_fadd_2xf32_reassoc_only:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a0, 524288
; CHECK-NEXT:    vmv.s.x v9, a0
; CHECK-NEXT:    vfredusum.vs v8, v8, v9
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:    ret
  %v = load <2 x float>, ptr %p, align 256
  %e0 = extractelement <2 x float> %v, i32 0
  %e1 = extractelement <2 x float> %v, i32 1
  %fadd0 = fadd reassoc float %e0, %e1
  ret float %fadd0
}

; Negative test - wrong fast math flag.
define float @reduce_fadd_2xf32_ninf_only(ptr %p) {
; CHECK-LABEL: reduce_fadd_2xf32_ninf_only:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    vslidedown.vi v8, v8, 1
; CHECK-NEXT:    vfmv.f.s fa4, v8
; CHECK-NEXT:    fadd.s fa0, fa5, fa4
; CHECK-NEXT:    ret
  %v = load <2 x float>, ptr %p, align 256
  %e0 = extractelement <2 x float> %v, i32 0
  %e1 = extractelement <2 x float> %v, i32 1
  %fadd0 = fadd ninf float %e0, %e1
  ret float %fadd0
}


; Negative test - last fadd is not associative
define float @reduce_fadd_4xf32_non_associative(ptr %p) {
; CHECK-LABEL: reduce_fadd_4xf32_non_associative:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vslidedown.vi v9, v8, 3
; CHECK-NEXT:    vfmv.f.s fa5, v9
; CHECK-NEXT:    lui a0, 524288
; CHECK-NEXT:    vmv.s.x v9, a0
; CHECK-NEXT:    vsetivli zero, 3, e32, m1, ta, ma
; CHECK-NEXT:    vfredusum.vs v8, v8, v9
; CHECK-NEXT:    vfmv.f.s fa4, v8
; CHECK-NEXT:    fadd.s fa0, fa4, fa5
; CHECK-NEXT:    ret
  %v = load <4 x float>, ptr %p, align 256
  %e0 = extractelement <4 x float> %v, i32 0
  %e1 = extractelement <4 x float> %v, i32 1
  %e2 = extractelement <4 x float> %v, i32 2
  %e3 = extractelement <4 x float> %v, i32 3
  %fadd0 = fadd fast float %e0, %e1
  %fadd1 = fadd fast float %fadd0, %e2
  %fadd2 = fadd float %fadd1, %e3
  ret float %fadd2
}

; Negative test - first fadd is not associative
; We could form a reduction over elements 2 and 3.
define float @reduce_fadd_4xf32_non_associative2(ptr %p) {
; CHECK-LABEL: reduce_fadd_4xf32_non_associative2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    vslidedown.vi v9, v8, 1
; CHECK-NEXT:    vfmv.f.s fa4, v9
; CHECK-NEXT:    vslidedown.vi v9, v8, 2
; CHECK-NEXT:    vfmv.f.s fa3, v9
; CHECK-NEXT:    vslidedown.vi v8, v8, 3
; CHECK-NEXT:    vfmv.f.s fa2, v8
; CHECK-NEXT:    fadd.s fa5, fa5, fa4
; CHECK-NEXT:    fadd.s fa4, fa3, fa2
; CHECK-NEXT:    fadd.s fa0, fa5, fa4
; CHECK-NEXT:    ret
  %v = load <4 x float>, ptr %p, align 256
  %e0 = extractelement <4 x float> %v, i32 0
  %e1 = extractelement <4 x float> %v, i32 1
  %e2 = extractelement <4 x float> %v, i32 2
  %e3 = extractelement <4 x float> %v, i32 3
  %fadd0 = fadd float %e0, %e1
  %fadd1 = fadd fast float %fadd0, %e2
  %fadd2 = fadd fast float %fadd1, %e3
  ret float %fadd2
}