llvm/test/CodeGen/RISCV/rvv/vxrm-insert.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2

; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
; RUN:   -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV32
; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
; RUN:   -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV64
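
; The tests below check where writes to the vxrm CSR (csrwi vxrm, imm) are
; placed for fixed-point intrinsics such as vaadd/vasub, whose second-to-last
; iXLen operand selects the static rounding mode and whose last operand is the
; vector length.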

declare <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
  <vscale x 1 x i8>,
  <vscale x 1 x i8>,
  <vscale x 1 x i8>,
  iXLen, iXLen);
declare <vscale x 1 x i8> @llvm.riscv.vasub.nxv1i8.nxv1i8(
  <vscale x 1 x i8>,
  <vscale x 1 x i8>,
  <vscale x 1 x i8>,
  iXLen, iXLen);

; Test same rounding mode in one block.
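; Both vaadds request rounding mode 0, so a single csrwi vxrm, 0 should cover them.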
define <vscale x 1 x i8> @test1(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: test1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    csrwi vxrm, 0
; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
; CHECK-NEXT:    vaadd.vv v8, v8, v9
; CHECK-NEXT:    vaadd.vv v8, v8, v10
; CHECK-NEXT:    ret
entry:
  %a = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
    <vscale x 1 x i8> undef,
    <vscale x 1 x i8> %0,
    <vscale x 1 x i8> %1,
    iXLen 0, iXLen %3)
  %b = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
    <vscale x 1 x i8> undef,
    <vscale x 1 x i8> %a,
    <vscale x 1 x i8> %2,
    iXLen 0, iXLen %3)

  ret <vscale x 1 x i8> %b
}

; Test different rounding mode.
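; The two vaadds request modes 2 and 0, so a second csrwi is expected before the
; second vaadd.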
define <vscale x 1 x i8> @test2(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: test2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    csrwi vxrm, 2
; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
; CHECK-NEXT:    vaadd.vv v8, v8, v9
; CHECK-NEXT:    csrwi vxrm, 0
; CHECK-NEXT:    vaadd.vv v8, v8, v10
; CHECK-NEXT:    ret
entry:
  %a = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
    <vscale x 1 x i8> undef,
    <vscale x 1 x i8> %0,
    <vscale x 1 x i8> %1,
    iXLen 2, iXLen %3)
  %b = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
    <vscale x 1 x i8> undef,
    <vscale x 1 x i8> %a,
    <vscale x 1 x i8> %2,
    iXLen 0, iXLen %3)

  ret <vscale x 1 x i8> %b
}

declare <vscale x 1 x i8> @foo(<vscale x 1 x i8>)

; Test same vxrm with call in between which may invalidate vxrm.
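; The csrwi vxrm, 0 is expected to be repeated after the call to @foo.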
define <vscale x 1 x i8> @test3(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
; RV32-LABEL: test3:
; RV32:       # %bb.0: # %entry
; RV32-NEXT:    addi sp, sp, -32
; RV32-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
; RV32-NEXT:    csrr a1, vlenb
; RV32-NEXT:    slli a1, a1, 1
; RV32-NEXT:    sub sp, sp, a1
; RV32-NEXT:    mv s0, a0
; RV32-NEXT:    addi a1, sp, 16
; RV32-NEXT:    vs1r.v v10, (a1) # Unknown-size Folded Spill
; RV32-NEXT:    csrwi vxrm, 0
; RV32-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
; RV32-NEXT:    vaadd.vv v8, v8, v9
; RV32-NEXT:    call foo
; RV32-NEXT:    csrwi vxrm, 0
; RV32-NEXT:    addi a0, sp, 16
; RV32-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vsetvli zero, s0, e8, mf8, ta, ma
; RV32-NEXT:    vaadd.vv v8, v8, v9
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 1
; RV32-NEXT:    add sp, sp, a0
; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
; RV32-NEXT:    addi sp, sp, 32
; RV32-NEXT:    ret
;
; RV64-LABEL: test3:
; RV64:       # %bb.0: # %entry
; RV64-NEXT:    addi sp, sp, -32
; RV64-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
; RV64-NEXT:    csrr a1, vlenb
; RV64-NEXT:    slli a1, a1, 1
; RV64-NEXT:    sub sp, sp, a1
; RV64-NEXT:    mv s0, a0
; RV64-NEXT:    addi a1, sp, 16
; RV64-NEXT:    vs1r.v v10, (a1) # Unknown-size Folded Spill
; RV64-NEXT:    csrwi vxrm, 0
; RV64-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
; RV64-NEXT:    vaadd.vv v8, v8, v9
; RV64-NEXT:    call foo
; RV64-NEXT:    csrwi vxrm, 0
; RV64-NEXT:    addi a0, sp, 16
; RV64-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
; RV64-NEXT:    vsetvli zero, s0, e8, mf8, ta, ma
; RV64-NEXT:    vaadd.vv v8, v8, v9
; RV64-NEXT:    csrr a0, vlenb
; RV64-NEXT:    slli a0, a0, 1
; RV64-NEXT:    add sp, sp, a0
; RV64-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
; RV64-NEXT:    addi sp, sp, 32
; RV64-NEXT:    ret
entry:
  %a = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
    <vscale x 1 x i8> undef,
    <vscale x 1 x i8> %0,
    <vscale x 1 x i8> %1,
    iXLen 0, iXLen %3)
  %b = call <vscale x 1 x i8> @foo(<vscale x 1 x i8> %a)
  %c = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
    <vscale x 1 x i8> undef,
    <vscale x 1 x i8> %b,
    <vscale x 1 x i8> %2,
    iXLen 0, iXLen %3)

  ret <vscale x 1 x i8> %c
}

; Test same vxrm with asm in between which may invalidate vxrm.
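; The csrwi vxrm, 0 is expected to be repeated after the inline asm block.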
define <vscale x 1 x i8> @test4(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: test4:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    csrwi vxrm, 0
; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
; CHECK-NEXT:    vaadd.vv v8, v8, v9
; CHECK-NEXT:    #APP
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    csrwi vxrm, 0
; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
; CHECK-NEXT:    vaadd.vv v8, v8, v10
; CHECK-NEXT:    ret
entry:
  %a = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
    <vscale x 1 x i8> undef,
    <vscale x 1 x i8> %0,
    <vscale x 1 x i8> %1,
    iXLen 0, iXLen %3)
  %b = call <vscale x 1 x i8> asm "", "=^vr,0"(<vscale x 1 x i8> %a)
  %c = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
    <vscale x 1 x i8> undef,
    <vscale x 1 x i8> %b,
    <vscale x 1 x i8> %2,
    iXLen 0, iXLen %3)

  ret <vscale x 1 x i8> %c
}

; Test same rounding mode in triangle.
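; The write in the entry block dominates condblock, so no csrwi is expected there.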
define <vscale x 1 x i8> @test5(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3, i1 %cond) nounwind {
; CHECK-LABEL: test5:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    andi a1, a1, 1
; CHECK-NEXT:    csrwi vxrm, 0
; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
; CHECK-NEXT:    vaadd.vv v8, v8, v9
; CHECK-NEXT:    beqz a1, .LBB4_2
; CHECK-NEXT:  # %bb.1: # %condblock
; CHECK-NEXT:    vaadd.vv v8, v8, v10
; CHECK-NEXT:  .LBB4_2: # %mergeblock
; CHECK-NEXT:    ret
entry:
  %a = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
    <vscale x 1 x i8> undef,
    <vscale x 1 x i8> %0,
    <vscale x 1 x i8> %1,
    iXLen 0, iXLen %3)
  br i1 %cond, label %condblock, label %mergeblock

condblock:
  %b = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
    <vscale x 1 x i8> undef,
    <vscale x 1 x i8> %a,
    <vscale x 1 x i8> %2,
    iXLen 0, iXLen %3)
  br label %mergeblock

mergeblock:
  %c = phi <vscale x 1 x i8> [%a, %entry], [%b, %condblock]

  ret <vscale x 1 x i8> %c
}

; Test same rounding mode in diamond with no dominating vxrm.
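; The common csrwi vxrm, 0 is expected to be placed in the entry block so that
; neither arm needs its own write.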
define <vscale x 1 x i8> @test6(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3, i1 %cond) nounwind {
; CHECK-LABEL: test6:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    andi a1, a1, 1
; CHECK-NEXT:    csrwi vxrm, 0
; CHECK-NEXT:    beqz a1, .LBB5_2
; CHECK-NEXT:  # %bb.1: # %trueblock
; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
; CHECK-NEXT:    vaadd.vv v8, v8, v9
; CHECK-NEXT:    ret
; CHECK-NEXT:  .LBB5_2: # %falseblock
; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
; CHECK-NEXT:    vaadd.vv v8, v8, v10
; CHECK-NEXT:    ret
entry:
  br i1 %cond, label %trueblock, label %falseblock

trueblock:
  %a = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
    <vscale x 1 x i8> undef,
    <vscale x 1 x i8> %0,
    <vscale x 1 x i8> %1,
    iXLen 0, iXLen %3)
  br label %mergeblock

falseblock:
  %b = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
    <vscale x 1 x i8> undef,
    <vscale x 1 x i8> %0,
    <vscale x 1 x i8> %2,
    iXLen 0, iXLen %3)
  br label %mergeblock

mergeblock:
  %c = phi <vscale x 1 x i8> [%a, %trueblock], [%b, %falseblock]

  ret <vscale x 1 x i8> %c
}

; Test same rounding mode in diamond with same dominating vxrm.
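; The csrwi vxrm, 0 in entry should suffice for both the vaadd and the vasub in
; the arms.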
define <vscale x 1 x i8> @test7(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3, i1 %cond) nounwind {
; CHECK-LABEL: test7:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    andi a1, a1, 1
; CHECK-NEXT:    csrwi vxrm, 0
; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
; CHECK-NEXT:    vaadd.vv v8, v8, v9
; CHECK-NEXT:    beqz a1, .LBB6_2
; CHECK-NEXT:  # %bb.1: # %trueblock
; CHECK-NEXT:    vaadd.vv v8, v8, v10
; CHECK-NEXT:    ret
; CHECK-NEXT:  .LBB6_2: # %falseblock
; CHECK-NEXT:    vasub.vv v8, v8, v10
; CHECK-NEXT:    ret
entry:
  %a = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
    <vscale x 1 x i8> undef,
    <vscale x 1 x i8> %0,
    <vscale x 1 x i8> %1,
    iXLen 0, iXLen %3)
  br i1 %cond, label %trueblock, label %falseblock

trueblock:
  %b = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
    <vscale x 1 x i8> undef,
    <vscale x 1 x i8> %a,
    <vscale x 1 x i8> %2,
    iXLen 0, iXLen %3)
  br label %mergeblock

falseblock:
  %c = call <vscale x 1 x i8> @llvm.riscv.vasub.nxv1i8.nxv1i8(
    <vscale x 1 x i8> undef,
    <vscale x 1 x i8> %a,
    <vscale x 1 x i8> %2,
    iXLen 0, iXLen %3)
  br label %mergeblock

mergeblock:
  %d = phi <vscale x 1 x i8> [%b, %trueblock], [%c, %falseblock]

  ret <vscale x 1 x i8> %d
}

; Test same rounding mode in diamond with same vxrm at merge.
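; A single csrwi vxrm, 0 in entry should also cover the vaadd in mergeblock.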
define <vscale x 1 x i8> @test8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3, i1 %cond) nounwind {
; CHECK-LABEL: test8:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    andi a1, a1, 1
; CHECK-NEXT:    csrwi vxrm, 0
; CHECK-NEXT:    beqz a1, .LBB7_2
; CHECK-NEXT:  # %bb.1: # %trueblock
; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
; CHECK-NEXT:    vaadd.vv v8, v8, v9
; CHECK-NEXT:    vaadd.vv v8, v8, v10
; CHECK-NEXT:    ret
; CHECK-NEXT:  .LBB7_2: # %falseblock
; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
; CHECK-NEXT:    vasub.vv v8, v8, v9
; CHECK-NEXT:    vaadd.vv v8, v8, v10
; CHECK-NEXT:    ret
entry:
  br i1 %cond, label %trueblock, label %falseblock

trueblock:
  %a = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
    <vscale x 1 x i8> undef,
    <vscale x 1 x i8> %0,
    <vscale x 1 x i8> %1,
    iXLen 0, iXLen %3)
  br label %mergeblock

falseblock:
  %b = call <vscale x 1 x i8> @llvm.riscv.vasub.nxv1i8.nxv1i8(
    <vscale x 1 x i8> undef,
    <vscale x 1 x i8> %0,
    <vscale x 1 x i8> %1,
    iXLen 0, iXLen %3)
  br label %mergeblock

mergeblock:
  %c = phi <vscale x 1 x i8> [%a, %trueblock], [%b, %falseblock]
  %d = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
    <vscale x 1 x i8> undef,
    <vscale x 1 x i8> %c,
    <vscale x 1 x i8> %2,
    iXLen 0, iXLen %3)

  ret <vscale x 1 x i8> %d
}

; Test same rounding mode in diamond with different vxrm at merge.
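; The arms share mode 0, but the merge point requests mode 2, so a csrwi vxrm, 2
; is expected in mergeblock.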
define <vscale x 1 x i8> @test9(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3, i1 %cond) nounwind {
; CHECK-LABEL: test9:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    andi a1, a1, 1
; CHECK-NEXT:    csrwi vxrm, 0
; CHECK-NEXT:    beqz a1, .LBB8_2
; CHECK-NEXT:  # %bb.1: # %trueblock
; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
; CHECK-NEXT:    vaadd.vv v8, v8, v9
; CHECK-NEXT:    j .LBB8_3
; CHECK-NEXT:  .LBB8_2: # %falseblock
; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
; CHECK-NEXT:    vasub.vv v8, v8, v9
; CHECK-NEXT:  .LBB8_3: # %mergeblock
; CHECK-NEXT:    csrwi vxrm, 2
; CHECK-NEXT:    vaadd.vv v8, v8, v10
; CHECK-NEXT:    ret
entry:
  br i1 %cond, label %trueblock, label %falseblock

trueblock:
  %a = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
    <vscale x 1 x i8> undef,
    <vscale x 1 x i8> %0,
    <vscale x 1 x i8> %1,
    iXLen 0, iXLen %3)
  br label %mergeblock

falseblock:
  %b = call <vscale x 1 x i8> @llvm.riscv.vasub.nxv1i8.nxv1i8(
    <vscale x 1 x i8> undef,
    <vscale x 1 x i8> %0,
    <vscale x 1 x i8> %1,
    iXLen 0, iXLen %3)
  br label %mergeblock

mergeblock:
  %c = phi <vscale x 1 x i8> [%a, %trueblock], [%b, %falseblock]
  %d = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
    <vscale x 1 x i8> undef,
    <vscale x 1 x i8> %c,
    <vscale x 1 x i8> %2,
    iXLen 2, iXLen %3)

  ret <vscale x 1 x i8> %d
}

; Test loop with no dominating vxrm write.
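; The csrwi vxrm, 2 is expected in the loop preheader rather than inside the
; loop body.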
define void @test10(ptr nocapture %ptr_dest, ptr nocapture readonly %ptr_op1, ptr nocapture readonly %ptr_op2, iXLen %n) {
; CHECK-LABEL: test10:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    beqz a3, .LBB9_3
; CHECK-NEXT:  # %bb.1: # %for.body.preheader
; CHECK-NEXT:    csrwi vxrm, 2
; CHECK-NEXT:  .LBB9_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vsetvli a4, a3, e8, mf8, ta, ma
; CHECK-NEXT:    vle8.v v8, (a1)
; CHECK-NEXT:    vle8.v v9, (a2)
; CHECK-NEXT:    vaadd.vv v8, v8, v9
; CHECK-NEXT:    sub a3, a3, a4
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    bnez a3, .LBB9_2
; CHECK-NEXT:  .LBB9_3: # %for.end
; CHECK-NEXT:    ret
entry:
  %tobool.not9 = icmp eq iXLen %n, 0
  br i1 %tobool.not9, label %for.end, label %for.body

for.body:
  %n.addr.011 = phi iXLen [ %n, %entry ], [ %sub, %for.body ]
  %vl = tail call iXLen @llvm.riscv.vsetvli.iXLen(iXLen %n.addr.011, iXLen 0, iXLen 5)
  %load1 = tail call <vscale x 1 x i8> @llvm.riscv.vle.nxv1i8.iXLen(<vscale x 1 x i8> undef, ptr %ptr_op1, iXLen %vl)
  %load2 = tail call <vscale x 1 x i8> @llvm.riscv.vle.nxv1i8.iXLen(<vscale x 1 x i8> undef, ptr %ptr_op2, iXLen %vl)
  %vadd = tail call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i8> %load1, <vscale x 1 x i8> %load2, iXLen 2, iXLen %vl)
  tail call void @llvm.riscv.vse.nxv1i8.iXLen(<vscale x 1 x i8> %vadd, ptr %ptr_dest, iXLen %vl)
  %sub = sub iXLen %n.addr.011, %vl
  %tobool.not = icmp eq iXLen %sub, 0
  br i1 %tobool.not, label %for.end, label %for.body

for.end:
  ret void
}

declare iXLen @llvm.riscv.vsetvli.iXLen(iXLen, iXLen immarg, iXLen immarg)
declare <vscale x 1 x i8> @llvm.riscv.vle.nxv1i8.iXLen(<vscale x 1 x i8>, ptr nocapture, iXLen)
declare void @llvm.riscv.vse.nxv1i8.iXLen(<vscale x 1 x i8>, ptr nocapture, iXLen)

; Test loop with dominating vxrm write. Make sure there is no write in the loop.
define void @test11(ptr nocapture %ptr_dest, ptr nocapture readonly %ptr_op1, ptr nocapture readonly %ptr_op2, iXLen %n) {
; CHECK-LABEL: test11:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetvli a4, a3, e8, mf8, ta, ma
; CHECK-NEXT:    vle8.v v8, (a1)
; CHECK-NEXT:    vle8.v v9, (a2)
; CHECK-NEXT:    csrwi vxrm, 2
; CHECK-NEXT:  .LBB10_1: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vaadd.vv v8, v8, v9
; CHECK-NEXT:    sub a3, a3, a4
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    beqz a3, .LBB10_3
; CHECK-NEXT:  # %bb.2: # %for.body
; CHECK-NEXT:    # in Loop: Header=BB10_1 Depth=1
; CHECK-NEXT:    vsetvli a4, a3, e8, mf8, ta, ma
; CHECK-NEXT:    vle8.v v8, (a1)
; CHECK-NEXT:    vle8.v v9, (a2)
; CHECK-NEXT:    j .LBB10_1
; CHECK-NEXT:  .LBB10_3: # %for.end
; CHECK-NEXT:    ret
entry:
  %vl = tail call iXLen @llvm.riscv.vsetvli.iXLen(iXLen %n, iXLen 0, iXLen 5)
  %load1a = tail call <vscale x 1 x i8> @llvm.riscv.vle.nxv1i8.iXLen(<vscale x 1 x i8> undef, ptr %ptr_op1, iXLen %vl)
  %load2a = tail call <vscale x 1 x i8> @llvm.riscv.vle.nxv1i8.iXLen(<vscale x 1 x i8> undef, ptr %ptr_op2, iXLen %vl)
  %vadda = tail call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i8> %load1a, <vscale x 1 x i8> %load2a, iXLen 2, iXLen %vl)
  tail call void @llvm.riscv.vse.nxv1i8.iXLen(<vscale x 1 x i8> %vadda, ptr %ptr_dest, iXLen %vl)
  %suba = sub iXLen %n, %vl
  %tobool.not9 = icmp eq iXLen %suba, 0
  br i1 %tobool.not9, label %for.end, label %for.body

for.body:
  %n.addr.011 = phi iXLen [ %suba, %entry ], [ %sub, %for.body ]
  %vl2 = tail call iXLen @llvm.riscv.vsetvli.iXLen(iXLen %n.addr.011, iXLen 0, iXLen 5)
  %load1 = tail call <vscale x 1 x i8> @llvm.riscv.vle.nxv1i8.iXLen(<vscale x 1 x i8> undef, ptr %ptr_op1, iXLen %vl2)
  %load2 = tail call <vscale x 1 x i8> @llvm.riscv.vle.nxv1i8.iXLen(<vscale x 1 x i8> undef, ptr %ptr_op2, iXLen %vl2)
  %vadd = tail call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i8> %load1, <vscale x 1 x i8> %load2, iXLen 2, iXLen %vl2)
  tail call void @llvm.riscv.vse.nxv1i8.iXLen(<vscale x 1 x i8> %vadd, ptr %ptr_dest, iXLen %vl2)
  %sub = sub iXLen %n.addr.011, %vl2
  %tobool.not = icmp eq iXLen %sub, 0
  br i1 %tobool.not, label %for.end, label %for.body

for.end:
  ret void
}

; The edge from entry to block2 is a critical edge. The vxrm write in block2
; is redundant when coming from block1, but is needed when coming from entry.
; FIXME: We could remove the write from the end of block1 without splitting the
; critical edge.
define <vscale x 1 x i8> @test12(i1 %c1, <vscale x 1 x i8> %0, <vscale x 1 x i8> %1, iXLen %vl) {
; CHECK-LABEL: test12:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    andi a0, a0, 1
; CHECK-NEXT:    csrwi vxrm, 0
; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
; CHECK-NEXT:    vaadd.vv v9, v8, v9
; CHECK-NEXT:    beqz a0, .LBB11_2
; CHECK-NEXT:  # %bb.1: # %block1
; CHECK-NEXT:    csrwi vxrm, 1
; CHECK-NEXT:    vaadd.vv v9, v8, v9
; CHECK-NEXT:    csrwi vxrm, 2
; CHECK-NEXT:  .LBB11_2: # %block2
; CHECK-NEXT:    csrwi vxrm, 2
; CHECK-NEXT:    vaadd.vv v8, v8, v9
; CHECK-NEXT:    ret
entry:
  %a = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i8> %0, <vscale x 1 x i8> %1, iXLen 0, iXLen %vl)
  br i1 %c1, label %block1, label %block2

block1:
  %b = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i8> %0, <vscale x 1 x i8> %a, iXLen 1, iXLen %vl)
  br label %block2

block2:
  %c = phi <vscale x 1 x i8> [ %a, %entry ], [ %b, %block1]
  %d = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i8> %0, <vscale x 1 x i8> %c, iXLen 2, iXLen %vl)
  ret <vscale x 1 x i8> %d
}

; Similar to test12, but introduces a second critical edge from block1 to
; block3. Now the write to vxrm at the end of block1 can't be removed because
; it is needed by block3.
define <vscale x 1 x i8> @test13(i1 %c1, i1 %c2, i1 %c3, <vscale x 1 x i8> %0, <vscale x 1 x i8> %1, iXLen %vl) {
; CHECK-LABEL: test13:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    andi a0, a0, 1
; CHECK-NEXT:    csrwi vxrm, 0
; CHECK-NEXT:    vsetvli zero, a3, e8, mf8, ta, ma
; CHECK-NEXT:    vaadd.vv v10, v8, v9
; CHECK-NEXT:    beqz a0, .LBB12_2
; CHECK-NEXT:  # %bb.1: # %block1
; CHECK-NEXT:    csrwi vxrm, 1
; CHECK-NEXT:    vaadd.vv v10, v8, v10
; CHECK-NEXT:    andi a1, a1, 1
; CHECK-NEXT:    csrwi vxrm, 2
; CHECK-NEXT:    beqz a1, .LBB12_3
; CHECK-NEXT:  .LBB12_2: # %block2
; CHECK-NEXT:    csrwi vxrm, 2
; CHECK-NEXT:    vaadd.vv v8, v8, v10
; CHECK-NEXT:    ret
; CHECK-NEXT:  .LBB12_3: # %block3
; CHECK-NEXT:    vaadd.vv v8, v9, v10
; CHECK-NEXT:    ret
entry:
  %a = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i8> %0, <vscale x 1 x i8> %1, iXLen 0, iXLen %vl)
  br i1 %c1, label %block1, label %block2

block1:
  %b = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i8> %0, <vscale x 1 x i8> %a, iXLen 1, iXLen %vl)
  br i1 %c2, label %block2, label %block3

block2:
  %c = phi <vscale x 1 x i8> [ %a, %entry ], [ %b, %block1]
  %d = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i8> %0, <vscale x 1 x i8> %c, iXLen 2, iXLen %vl)
  ret <vscale x 1 x i8> %d

block3:
  %e = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i8> %1, <vscale x 1 x i8> %b, iXLen 2, iXLen %vl)
  ret <vscale x 1 x i8> %e
}