llvm/test/CodeGen/PowerPC/mma-intrinsics.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names \
; RUN:   -ppc-vsr-nums-as-vr < %s | FileCheck %s
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names \
; RUN:   -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE
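
; MMA (Matrix-Multiply Assist) accumulators are modeled as <512 x i1> values.
; Each accumulator acc0-acc7 overlays four 128-bit VSX registers, so loading or
; storing an accumulator expands into four lxv/stxv instructions (stored in the
; opposite order on big-endian, as the CHECK-BE lines show).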

; assemble_acc
declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
define void @ass_acc(ptr %ptr, <16 x i8> %vc) {
; CHECK-LABEL: ass_acc:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xxlor vs3, v2, v2
; CHECK-NEXT:    xxlor vs2, v2, v2
; CHECK-NEXT:    xxlor vs0, vs2, vs2
; CHECK-NEXT:    xxlor vs1, vs3, vs3
; CHECK-NEXT:    stxv vs0, 48(r3)
; CHECK-NEXT:    stxv vs1, 32(r3)
; CHECK-NEXT:    stxv vs2, 16(r3)
; CHECK-NEXT:    stxv vs3, 0(r3)
; CHECK-NEXT:    blr
;
; CHECK-BE-LABEL: ass_acc:
; CHECK-BE:       # %bb.0: # %entry
; CHECK-BE-NEXT:    xxlor vs3, v2, v2
; CHECK-BE-NEXT:    xxlor vs2, v2, v2
; CHECK-BE-NEXT:    xxlor vs0, vs2, vs2
; CHECK-BE-NEXT:    xxlor vs1, vs3, vs3
; CHECK-BE-NEXT:    stxv vs1, 16(r3)
; CHECK-BE-NEXT:    stxv vs0, 0(r3)
; CHECK-BE-NEXT:    stxv vs3, 48(r3)
; CHECK-BE-NEXT:    stxv vs2, 32(r3)
; CHECK-BE-NEXT:    blr
entry:
  %0 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc)
  store <512 x i1> %0, ptr %ptr, align 64
  ret void
}

; xxmtacc
declare <512 x i1> @llvm.ppc.mma.xxmtacc(<512 x i1>)
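; xxmtacc "primes" an accumulator: it moves the contents of the four
; overlapping VSRs into the accumulator so GER instructions can use it.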
define void @int_xxmtacc(ptr %ptr, <16 x i8> %vc) {
; CHECK-LABEL: int_xxmtacc:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xxlor vs3, v2, v2
; CHECK-NEXT:    xxlor vs2, v2, v2
; CHECK-NEXT:    xxlor vs0, vs2, vs2
; CHECK-NEXT:    xxlor vs1, vs3, vs3
; CHECK-NEXT:    xxmtacc acc0
; CHECK-NEXT:    stxv vs0, 48(r3)
; CHECK-NEXT:    stxv vs1, 32(r3)
; CHECK-NEXT:    stxv vs2, 16(r3)
; CHECK-NEXT:    stxv vs3, 0(r3)
; CHECK-NEXT:    blr
;
; CHECK-BE-LABEL: int_xxmtacc:
; CHECK-BE:       # %bb.0: # %entry
; CHECK-BE-NEXT:    xxlor vs3, v2, v2
; CHECK-BE-NEXT:    xxlor vs2, v2, v2
; CHECK-BE-NEXT:    xxlor vs0, vs2, vs2
; CHECK-BE-NEXT:    xxlor vs1, vs3, vs3
; CHECK-BE-NEXT:    xxmtacc acc0
; CHECK-BE-NEXT:    stxv vs1, 16(r3)
; CHECK-BE-NEXT:    stxv vs0, 0(r3)
; CHECK-BE-NEXT:    stxv vs3, 48(r3)
; CHECK-BE-NEXT:    stxv vs2, 32(r3)
; CHECK-BE-NEXT:    blr
entry:
; One xxmtacc is generated from the call to assemble.acc, then one xxmtacc is
; generated from the call to xxmtacc, and one xxmfacc is generated for the store.
  %0 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc)
  %1 = tail call <512 x i1> @llvm.ppc.mma.xxmtacc(<512 x i1> %0)
  store <512 x i1> %1, ptr %ptr, align 64
  ret void
}

; xxmfacc
declare <512 x i1> @llvm.ppc.mma.xxmfacc(<512 x i1>)
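; xxmfacc "unprimes" an accumulator, copying its contents back into the four
; overlapping VSRs so they can be stored or used as ordinary vectors.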
define void @int_xxmfacc(ptr %ptr, <16 x i8> %vc) {
; CHECK-LABEL: int_xxmfacc:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xxlor vs3, v2, v2
; CHECK-NEXT:    xxlor vs2, v2, v2
; CHECK-NEXT:    xxlor vs0, vs2, vs2
; CHECK-NEXT:    xxlor vs1, vs3, vs3
; CHECK-NEXT:    stxv vs0, 48(r3)
; CHECK-NEXT:    stxv vs1, 32(r3)
; CHECK-NEXT:    stxv vs2, 16(r3)
; CHECK-NEXT:    stxv vs3, 0(r3)
; CHECK-NEXT:    blr
;
; CHECK-BE-LABEL: int_xxmfacc:
; CHECK-BE:       # %bb.0: # %entry
; CHECK-BE-NEXT:    xxlor vs3, v2, v2
; CHECK-BE-NEXT:    xxlor vs2, v2, v2
; CHECK-BE-NEXT:    xxlor vs0, vs2, vs2
; CHECK-BE-NEXT:    xxlor vs1, vs3, vs3
; CHECK-BE-NEXT:    stxv vs1, 16(r3)
; CHECK-BE-NEXT:    stxv vs0, 0(r3)
; CHECK-BE-NEXT:    stxv vs3, 48(r3)
; CHECK-BE-NEXT:    stxv vs2, 32(r3)
; CHECK-BE-NEXT:    blr
entry:
; One xxmtacc is generated from the call to assemble.acc, then one xxmfacc is
; generated from the call to xxmfacc, and one xxmfacc is generated for the store.
  %0 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc)
  %1 = tail call <512 x i1> @llvm.ppc.mma.xxmfacc(<512 x i1> %0)
  store <512 x i1> %1, ptr %ptr, align 64
  ret void
}

; xxsetaccz
declare <512 x i1> @llvm.ppc.mma.xxsetaccz()
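; xxsetaccz zeroes an accumulator directly; no priming xxmtacc is needed, only
; an xxmfacc before the result is stored.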
define void @int_xxsetaccz(ptr %ptr) {
; CHECK-LABEL: int_xxsetaccz:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xxsetaccz acc0
; CHECK-NEXT:    xxmfacc acc0
; CHECK-NEXT:    stxv vs0, 48(r3)
; CHECK-NEXT:    stxv vs1, 32(r3)
; CHECK-NEXT:    stxv vs2, 16(r3)
; CHECK-NEXT:    stxv vs3, 0(r3)
; CHECK-NEXT:    blr
;
; CHECK-BE-LABEL: int_xxsetaccz:
; CHECK-BE:       # %bb.0: # %entry
; CHECK-BE-NEXT:    xxsetaccz acc0
; CHECK-BE-NEXT:    xxmfacc acc0
; CHECK-BE-NEXT:    stxv vs1, 16(r3)
; CHECK-BE-NEXT:    stxv vs0, 0(r3)
; CHECK-BE-NEXT:    stxv vs3, 48(r3)
; CHECK-BE-NEXT:    stxv vs2, 32(r3)
; CHECK-BE-NEXT:    blr
entry:
  %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
  store <512 x i1> %0, ptr %ptr, align 64
  ret void
}

; disassemble_acc
declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1>)
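; disassemble.acc is the inverse of assemble.acc: it splits a <512 x i1>
; accumulator into four <16 x i8> vectors (after unpriming with xxmfacc).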
define void @disass_acc(ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr %ptr4) {
; CHECK-LABEL: disass_acc:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xxsetaccz acc0
; CHECK-NEXT:    xxmfacc acc0
; CHECK-NEXT:    stxv vs3, 0(r3)
; CHECK-NEXT:    stxv vs2, 0(r4)
; CHECK-NEXT:    stxv vs1, 0(r5)
; CHECK-NEXT:    stxv vs0, 0(r6)
; CHECK-NEXT:    blr
;
; CHECK-BE-LABEL: disass_acc:
; CHECK-BE:       # %bb.0: # %entry
; CHECK-BE-NEXT:    xxsetaccz acc0
; CHECK-BE-NEXT:    xxmfacc acc0
; CHECK-BE-NEXT:    stxv vs0, 0(r3)
; CHECK-BE-NEXT:    stxv vs1, 0(r4)
; CHECK-BE-NEXT:    stxv vs2, 0(r5)
; CHECK-BE-NEXT:    stxv vs3, 0(r6)
; CHECK-BE-NEXT:    blr
entry:
  %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
  %1 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %0)
  %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 0
  %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 1
  %4 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 2
  %5 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 3
  store <16 x i8> %2, ptr %ptr1, align 16
  store <16 x i8> %3, ptr %ptr2, align 16
  store <16 x i8> %4, ptr %ptr3, align 16
  store <16 x i8> %5, ptr %ptr4, align 16
  ret void
}

declare <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1>, <16 x i8>, <16 x i8>)
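; xvi4ger8pp is an integer GER (rank-8 update) on 4-bit elements; the "pp"
; suffix means positive multiply, positive accumulate into the accumulator.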
define void @testBranch(ptr %ptr, <16 x i8> %vc, i32 %val) {
; CHECK-LABEL: testBranch:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    cmplwi r7, 0
; CHECK-NEXT:    beq cr0, .LBB5_2
; CHECK-NEXT:  # %bb.1: # %if.then
; CHECK-NEXT:    xxsetaccz acc0
; CHECK-NEXT:    b .LBB5_3
; CHECK-NEXT:  .LBB5_2: # %if.else
; CHECK-NEXT:    lxv vs1, 32(r3)
; CHECK-NEXT:    lxv vs0, 48(r3)
; CHECK-NEXT:    lxv vs3, 0(r3)
; CHECK-NEXT:    lxv vs2, 16(r3)
; CHECK-NEXT:    xxmtacc acc0
; CHECK-NEXT:    xvi4ger8pp acc0, v2, v2
; CHECK-NEXT:  .LBB5_3: # %if.end
; CHECK-NEXT:    xxmfacc acc0
; CHECK-NEXT:    stxv vs0, 48(r3)
; CHECK-NEXT:    stxv vs1, 32(r3)
; CHECK-NEXT:    stxv vs2, 16(r3)
; CHECK-NEXT:    stxv vs3, 0(r3)
; CHECK-NEXT:    blr
;
; CHECK-BE-LABEL: testBranch:
; CHECK-BE:       # %bb.0: # %entry
; CHECK-BE-NEXT:    cmplwi r7, 0
; CHECK-BE-NEXT:    beq cr0, .LBB5_2
; CHECK-BE-NEXT:  # %bb.1: # %if.then
; CHECK-BE-NEXT:    xxsetaccz acc0
; CHECK-BE-NEXT:    b .LBB5_3
; CHECK-BE-NEXT:  .LBB5_2: # %if.else
; CHECK-BE-NEXT:    lxv vs1, 16(r3)
; CHECK-BE-NEXT:    lxv vs0, 0(r3)
; CHECK-BE-NEXT:    lxv vs3, 48(r3)
; CHECK-BE-NEXT:    lxv vs2, 32(r3)
; CHECK-BE-NEXT:    xxmtacc acc0
; CHECK-BE-NEXT:    xvi4ger8pp acc0, v2, v2
; CHECK-BE-NEXT:  .LBB5_3: # %if.end
; CHECK-BE-NEXT:    xxmfacc acc0
; CHECK-BE-NEXT:    stxv vs1, 16(r3)
; CHECK-BE-NEXT:    stxv vs0, 0(r3)
; CHECK-BE-NEXT:    stxv vs3, 48(r3)
; CHECK-BE-NEXT:    stxv vs2, 32(r3)
; CHECK-BE-NEXT:    blr
entry:
  %tobool = icmp eq i32 %val, 0
  br i1 %tobool, label %if.else, label %if.then

if.then:
  %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
  br label %if.end

if.else:
  %1 = load <512 x i1>, ptr %ptr, align 64
  %2 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc)
  br label %if.end

if.end:
  %vq1.0 = phi <512 x i1> [ %0, %if.then ], [ %2, %if.else ]
  store <512 x i1> %vq1.0, ptr %ptr, align 64
  ret void
}

; The following test cases check that the xxsetaccz instruction is correctly rematerialized
declare <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1>, <16 x i8>, <16 x i8>)
declare <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1>, <16 x i8>, <16 x i8>)
declare <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1>, <16 x i8>, <16 x i8>)
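; When one zeroed accumulator feeds several GER chains (as in testcse3 and
; testcse4), the expected output re-emits xxsetaccz for each chain rather than
; copying an accumulator, since a copy would have to go through the VSRs.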

define void @testcse(ptr %res, <16 x i8> %vc) {
; CHECK-LABEL: testcse:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xxsetaccz acc0
; CHECK-NEXT:    xvf32gerpp acc0, v2, v2
; CHECK-NEXT:    xxmfacc acc0
; CHECK-NEXT:    stxv vs0, 48(r3)
; CHECK-NEXT:    stxv vs1, 32(r3)
; CHECK-NEXT:    stxv vs2, 16(r3)
; CHECK-NEXT:    stxv vs3, 0(r3)
; CHECK-NEXT:    stxv vs0, 112(r3)
; CHECK-NEXT:    stxv vs1, 96(r3)
; CHECK-NEXT:    stxv vs2, 80(r3)
; CHECK-NEXT:    stxv vs3, 64(r3)
; CHECK-NEXT:    blr
;
; CHECK-BE-LABEL: testcse:
; CHECK-BE:       # %bb.0: # %entry
; CHECK-BE-NEXT:    xxsetaccz acc0
; CHECK-BE-NEXT:    xvf32gerpp acc0, v2, v2
; CHECK-BE-NEXT:    xxmfacc acc0
; CHECK-BE-NEXT:    stxv vs1, 16(r3)
; CHECK-BE-NEXT:    stxv vs0, 0(r3)
; CHECK-BE-NEXT:    stxv vs3, 48(r3)
; CHECK-BE-NEXT:    stxv vs2, 32(r3)
; CHECK-BE-NEXT:    stxv vs1, 80(r3)
; CHECK-BE-NEXT:    stxv vs0, 64(r3)
; CHECK-BE-NEXT:    stxv vs3, 112(r3)
; CHECK-BE-NEXT:    stxv vs2, 96(r3)
; CHECK-BE-NEXT:    blr
entry:
  %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
  %1 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
  %2 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
  %3 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc)
  %4 = getelementptr inbounds <512 x i1>, ptr %res, i64 1
  store <512 x i1> %2, ptr %res, align 64
  store <512 x i1> %3, ptr %4, align 64
  ret void
}

define void @testcse2(ptr %res, <16 x i8> %vc) {
; CHECK-LABEL: testcse2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xxsetaccz acc0
; CHECK-NEXT:    xxsetaccz acc1
; CHECK-NEXT:    xvf32gerpp acc1, v2, v2
; CHECK-NEXT:    xvf32gerpn acc0, v2, v2
; CHECK-NEXT:    xxmfacc acc1
; CHECK-NEXT:    xxmfacc acc0
; CHECK-NEXT:    stxv vs4, 48(r3)
; CHECK-NEXT:    stxv vs5, 32(r3)
; CHECK-NEXT:    stxv vs6, 16(r3)
; CHECK-NEXT:    stxv vs7, 0(r3)
; CHECK-NEXT:    stxv vs0, 112(r3)
; CHECK-NEXT:    stxv vs1, 96(r3)
; CHECK-NEXT:    stxv vs2, 80(r3)
; CHECK-NEXT:    stxv vs3, 64(r3)
; CHECK-NEXT:    blr
;
; CHECK-BE-LABEL: testcse2:
; CHECK-BE:       # %bb.0: # %entry
; CHECK-BE-NEXT:    xxsetaccz acc0
; CHECK-BE-NEXT:    xxsetaccz acc1
; CHECK-BE-NEXT:    xvf32gerpp acc1, v2, v2
; CHECK-BE-NEXT:    xvf32gerpn acc0, v2, v2
; CHECK-BE-NEXT:    xxmfacc acc1
; CHECK-BE-NEXT:    xxmfacc acc0
; CHECK-BE-NEXT:    stxv vs5, 16(r3)
; CHECK-BE-NEXT:    stxv vs4, 0(r3)
; CHECK-BE-NEXT:    stxv vs7, 48(r3)
; CHECK-BE-NEXT:    stxv vs6, 32(r3)
; CHECK-BE-NEXT:    stxv vs1, 80(r3)
; CHECK-BE-NEXT:    stxv vs0, 64(r3)
; CHECK-BE-NEXT:    stxv vs3, 112(r3)
; CHECK-BE-NEXT:    stxv vs2, 96(r3)
; CHECK-BE-NEXT:    blr
entry:
  %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
  %1 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
  %2 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
  %3 = call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc)
  %4 = getelementptr inbounds <512 x i1>, ptr %res, i64 1
  store <512 x i1> %2, ptr %res, align 64
  store <512 x i1> %3, ptr %4, align 64
  ret void
}

define void @testcse3(ptr %res, <16 x i8> %vc) {
; CHECK-LABEL: testcse3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xxsetaccz acc0
; CHECK-NEXT:    xxsetaccz acc1
; CHECK-NEXT:    xvf32gerpp acc1, v2, v2
; CHECK-NEXT:    xvf32gerpn acc0, v2, v2
; CHECK-NEXT:    xxmfacc acc1
; CHECK-NEXT:    xxmfacc acc0
; CHECK-NEXT:    stxv vs4, 48(r3)
; CHECK-NEXT:    stxv vs5, 32(r3)
; CHECK-NEXT:    stxv vs6, 16(r3)
; CHECK-NEXT:    stxv vs7, 0(r3)
; CHECK-NEXT:    stxv vs0, 112(r3)
; CHECK-NEXT:    stxv vs1, 96(r3)
; CHECK-NEXT:    stxv vs2, 80(r3)
; CHECK-NEXT:    stxv vs3, 64(r3)
; CHECK-NEXT:    blr
;
; CHECK-BE-LABEL: testcse3:
; CHECK-BE:       # %bb.0: # %entry
; CHECK-BE-NEXT:    xxsetaccz acc0
; CHECK-BE-NEXT:    xxsetaccz acc1
; CHECK-BE-NEXT:    xvf32gerpp acc1, v2, v2
; CHECK-BE-NEXT:    xvf32gerpn acc0, v2, v2
; CHECK-BE-NEXT:    xxmfacc acc1
; CHECK-BE-NEXT:    xxmfacc acc0
; CHECK-BE-NEXT:    stxv vs5, 16(r3)
; CHECK-BE-NEXT:    stxv vs4, 0(r3)
; CHECK-BE-NEXT:    stxv vs7, 48(r3)
; CHECK-BE-NEXT:    stxv vs6, 32(r3)
; CHECK-BE-NEXT:    stxv vs1, 80(r3)
; CHECK-BE-NEXT:    stxv vs0, 64(r3)
; CHECK-BE-NEXT:    stxv vs3, 112(r3)
; CHECK-BE-NEXT:    stxv vs2, 96(r3)
; CHECK-BE-NEXT:    blr
entry:
  %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
  %1 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
  %2 = call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
  %3 = getelementptr inbounds <512 x i1>, ptr %res, i64 1
  store <512 x i1> %1, ptr %res, align 64
  store <512 x i1> %2, ptr %3, align 64
  ret void
}

define void @testcse4(ptr %res, i32 %lim, ptr %vc) {
; CHECK-LABEL: testcse4:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    cmpwi r4, 1
; CHECK-NEXT:    bltlr cr0
; CHECK-NEXT:  # %bb.1: # %for.body.preheader
; CHECK-NEXT:    clrldi r4, r4, 32
; CHECK-NEXT:    li r6, 0
; CHECK-NEXT:    mtctr r4
; CHECK-NEXT:    li r4, 0
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB9_2: # %for.body
; CHECK-NEXT:    #
; CHECK-NEXT:    rldic r7, r6, 4, 28
; CHECK-NEXT:    xxsetaccz acc2
; CHECK-NEXT:    xxsetaccz acc1
; CHECK-NEXT:    addi r6, r6, 6
; CHECK-NEXT:    lxvx vs0, r5, r7
; CHECK-NEXT:    add r7, r5, r7
; CHECK-NEXT:    lxv vs1, 16(r7)
; CHECK-NEXT:    xvf32gerpp acc2, vs0, vs1
; CHECK-NEXT:    lxv vs0, 32(r7)
; CHECK-NEXT:    lxv vs1, 48(r7)
; CHECK-NEXT:    xvf32gerpn acc1, vs0, vs1
; CHECK-NEXT:    lxv vs12, 64(r7)
; CHECK-NEXT:    lxv vs13, 80(r7)
; CHECK-NEXT:    xxsetaccz acc0
; CHECK-NEXT:    rldic r7, r4, 6, 26
; CHECK-NEXT:    addi r4, r4, 3
; CHECK-NEXT:    add r8, r3, r7
; CHECK-NEXT:    xxmfacc acc2
; CHECK-NEXT:    xvf32gernp acc0, vs12, vs13
; CHECK-NEXT:    stxvx vs11, r3, r7
; CHECK-NEXT:    stxv vs8, 48(r8)
; CHECK-NEXT:    xxmfacc acc1
; CHECK-NEXT:    stxv vs9, 32(r8)
; CHECK-NEXT:    stxv vs10, 16(r8)
; CHECK-NEXT:    stxv vs4, 112(r8)
; CHECK-NEXT:    stxv vs5, 96(r8)
; CHECK-NEXT:    xxmfacc acc0
; CHECK-NEXT:    stxv vs6, 80(r8)
; CHECK-NEXT:    stxv vs7, 64(r8)
; CHECK-NEXT:    stxv vs0, 176(r8)
; CHECK-NEXT:    stxv vs1, 160(r8)
; CHECK-NEXT:    stxv vs2, 144(r8)
; CHECK-NEXT:    stxv vs3, 128(r8)
; CHECK-NEXT:    bdnz .LBB9_2
; CHECK-NEXT:  # %bb.3: # %for.cond.cleanup
; CHECK-NEXT:    blr
;
; CHECK-BE-LABEL: testcse4:
; CHECK-BE:       # %bb.0: # %entry
; CHECK-BE-NEXT:    cmpwi r4, 1
; CHECK-BE-NEXT:    bltlr cr0
; CHECK-BE-NEXT:  # %bb.1: # %for.body.preheader
; CHECK-BE-NEXT:    clrldi r4, r4, 32
; CHECK-BE-NEXT:    li r6, 0
; CHECK-BE-NEXT:    mtctr r4
; CHECK-BE-NEXT:    li r4, 0
; CHECK-BE-NEXT:    .p2align 4
; CHECK-BE-NEXT:  .LBB9_2: # %for.body
; CHECK-BE-NEXT:    #
; CHECK-BE-NEXT:    rldic r7, r6, 4, 28
; CHECK-BE-NEXT:    xxsetaccz acc2
; CHECK-BE-NEXT:    xxsetaccz acc1
; CHECK-BE-NEXT:    addi r6, r6, 6
; CHECK-BE-NEXT:    lxvx vs0, r5, r7
; CHECK-BE-NEXT:    add r7, r5, r7
; CHECK-BE-NEXT:    lxv vs1, 16(r7)
; CHECK-BE-NEXT:    xvf32gerpp acc2, vs0, vs1
; CHECK-BE-NEXT:    lxv vs0, 32(r7)
; CHECK-BE-NEXT:    lxv vs1, 48(r7)
; CHECK-BE-NEXT:    xvf32gerpn acc1, vs0, vs1
; CHECK-BE-NEXT:    lxv vs12, 64(r7)
; CHECK-BE-NEXT:    lxv vs13, 80(r7)
; CHECK-BE-NEXT:    xxsetaccz acc0
; CHECK-BE-NEXT:    rldic r7, r4, 6, 26
; CHECK-BE-NEXT:    addi r4, r4, 3
; CHECK-BE-NEXT:    add r8, r3, r7
; CHECK-BE-NEXT:    xxmfacc acc2
; CHECK-BE-NEXT:    xvf32gernp acc0, vs12, vs13
; CHECK-BE-NEXT:    stxvx vs8, r3, r7
; CHECK-BE-NEXT:    stxv vs9, 16(r8)
; CHECK-BE-NEXT:    xxmfacc acc1
; CHECK-BE-NEXT:    stxv vs11, 48(r8)
; CHECK-BE-NEXT:    stxv vs10, 32(r8)
; CHECK-BE-NEXT:    stxv vs5, 80(r8)
; CHECK-BE-NEXT:    stxv vs4, 64(r8)
; CHECK-BE-NEXT:    xxmfacc acc0
; CHECK-BE-NEXT:    stxv vs7, 112(r8)
; CHECK-BE-NEXT:    stxv vs6, 96(r8)
; CHECK-BE-NEXT:    stxv vs1, 144(r8)
; CHECK-BE-NEXT:    stxv vs0, 128(r8)
; CHECK-BE-NEXT:    stxv vs3, 176(r8)
; CHECK-BE-NEXT:    stxv vs2, 160(r8)
; CHECK-BE-NEXT:    bdnz .LBB9_2
; CHECK-BE-NEXT:  # %bb.3: # %for.cond.cleanup
; CHECK-BE-NEXT:    blr
entry:
  %cmp55 = icmp sgt i32 %lim, 0
  br i1 %cmp55, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %wide.trip.count = zext i32 %lim to i64
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %for.body, %for.body.preheader
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
  %1 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
  %2 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
  %3 = trunc i64 %indvars.iv to i32
  %mul = mul nsw i32 %3, 6
  %idxprom = zext i32 %mul to i64
  %arrayidx = getelementptr inbounds <16 x i8>, ptr %vc, i64 %idxprom
  %4 = load <16 x i8>, ptr %arrayidx, align 16
  %add2 = or disjoint i32 %mul, 1
  %idxprom3 = zext i32 %add2 to i64
  %arrayidx4 = getelementptr inbounds <16 x i8>, ptr %vc, i64 %idxprom3
  %5 = load <16 x i8>, ptr %arrayidx4, align 16
  %6 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %4, <16 x i8> %5)
  %add6 = add nuw nsw i32 %mul, 2
  %idxprom7 = zext i32 %add6 to i64
  %arrayidx8 = getelementptr inbounds <16 x i8>, ptr %vc, i64 %idxprom7
  %7 = load <16 x i8>, ptr %arrayidx8, align 16
  %add10 = add nuw nsw i32 %mul, 3
  %idxprom11 = zext i32 %add10 to i64
  %arrayidx12 = getelementptr inbounds <16 x i8>, ptr %vc, i64 %idxprom11
  %8 = load <16 x i8>, ptr %arrayidx12, align 16
  %9 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> %1, <16 x i8> %7, <16 x i8> %8)
  %add14 = add nuw nsw i32 %mul, 4
  %idxprom15 = zext i32 %add14 to i64
  %arrayidx16 = getelementptr inbounds <16 x i8>, ptr %vc, i64 %idxprom15
  %10 = load <16 x i8>, ptr %arrayidx16, align 16
  %add18 = add nuw nsw i32 %mul, 5
  %idxprom19 = zext i32 %add18 to i64
  %arrayidx20 = getelementptr inbounds <16 x i8>, ptr %vc, i64 %idxprom19
  %11 = load <16 x i8>, ptr %arrayidx20, align 16
  %12 = tail call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> %2, <16 x i8> %10, <16 x i8> %11)
  %mul21 = mul i64 %indvars.iv, 3
  %idx.ext = and i64 %mul21, 4294967295
  %add.ptr = getelementptr inbounds <512 x i1>, ptr %res, i64 %idx.ext
  store <512 x i1> %6, ptr %add.ptr, align 64
  %add.ptr26 = getelementptr inbounds <512 x i1>, ptr %add.ptr, i64 1
  store <512 x i1> %9, ptr %add.ptr26, align 64
  %add.ptr30 = getelementptr inbounds <512 x i1>, ptr %add.ptr, i64 2
  store <512 x i1> %12, ptr %add.ptr30, align 64
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

declare i32 @testRedundantPrimeUnprimeF()
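; The zeroed accumulator is both stored (which requires an xxmfacc) and then
; used as a GER accumulator (which would require re-priming it with xxmtacc);
; the expected output avoids the redundant prime/unprime pair by
; rematerializing xxsetaccz into a second accumulator instead.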
define void @testRedundantPrimeUnprime(ptr %dst, <16 x i8> %vc) nounwind {
; CHECK-LABEL: testRedundantPrimeUnprime:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    mflr r0
; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
; CHECK-NEXT:    std r0, 16(r1)
; CHECK-NEXT:    stdu r1, -112(r1)
; CHECK-NEXT:    xxsetaccz acc0
; CHECK-NEXT:    xxsetaccz acc1
; CHECK-NEXT:    mr r30, r3
; CHECK-NEXT:    xxmfacc acc0
; CHECK-NEXT:    stxv vs0, 48(r3)
; CHECK-NEXT:    stxv vs1, 32(r3)
; CHECK-NEXT:    stxv vs2, 16(r3)
; CHECK-NEXT:    stxv vs3, 0(r3)
; CHECK-NEXT:    xvf32gerpp acc1, v2, v2
; CHECK-NEXT:    xxmfacc acc1
; CHECK-NEXT:    stxv vs4, 80(r1)
; CHECK-NEXT:    stxv vs5, 64(r1)
; CHECK-NEXT:    stxv vs6, 48(r1)
; CHECK-NEXT:    stxv vs7, 32(r1)
; CHECK-NEXT:    bl testRedundantPrimeUnprimeF@notoc
; CHECK-NEXT:    lxvp vsp0, 64(r1)
; CHECK-NEXT:    lxvp vsp2, 32(r1)
; CHECK-NEXT:    stxv vs0, 112(r30)
; CHECK-NEXT:    stxv vs1, 96(r30)
; CHECK-NEXT:    stxv vs2, 80(r30)
; CHECK-NEXT:    stxv vs3, 64(r30)
; CHECK-NEXT:    addi r1, r1, 112
; CHECK-NEXT:    ld r0, 16(r1)
; CHECK-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
; CHECK-NEXT:    mtlr r0
; CHECK-NEXT:    blr
;
; CHECK-BE-LABEL: testRedundantPrimeUnprime:
; CHECK-BE:       # %bb.0: # %entry
; CHECK-BE-NEXT:    mflr r0
; CHECK-BE-NEXT:    std r0, 16(r1)
; CHECK-BE-NEXT:    stdu r1, -192(r1)
; CHECK-BE-NEXT:    xxsetaccz acc0
; CHECK-BE-NEXT:    xxsetaccz acc1
; CHECK-BE-NEXT:    std r30, 176(r1) # 8-byte Folded Spill
; CHECK-BE-NEXT:    mr r30, r3
; CHECK-BE-NEXT:    xxmfacc acc0
; CHECK-BE-NEXT:    stxv vs1, 16(r3)
; CHECK-BE-NEXT:    stxv vs0, 0(r3)
; CHECK-BE-NEXT:    stxv vs3, 48(r3)
; CHECK-BE-NEXT:    stxv vs2, 32(r3)
; CHECK-BE-NEXT:    xvf32gerpp acc1, v2, v2
; CHECK-BE-NEXT:    xxmfacc acc1
; CHECK-BE-NEXT:    stxv vs4, 112(r1)
; CHECK-BE-NEXT:    stxv vs5, 128(r1)
; CHECK-BE-NEXT:    stxv vs6, 144(r1)
; CHECK-BE-NEXT:    stxv vs7, 160(r1)
; CHECK-BE-NEXT:    bl testRedundantPrimeUnprimeF
; CHECK-BE-NEXT:    nop
; CHECK-BE-NEXT:    lxvp vsp0, 112(r1)
; CHECK-BE-NEXT:    lxvp vsp2, 144(r1)
; CHECK-BE-NEXT:    stxv vs3, 112(r30)
; CHECK-BE-NEXT:    stxv vs2, 96(r30)
; CHECK-BE-NEXT:    stxv vs1, 80(r30)
; CHECK-BE-NEXT:    stxv vs0, 64(r30)
; CHECK-BE-NEXT:    ld r30, 176(r1) # 8-byte Folded Reload
; CHECK-BE-NEXT:    addi r1, r1, 192
; CHECK-BE-NEXT:    ld r0, 16(r1)
; CHECK-BE-NEXT:    mtlr r0
; CHECK-BE-NEXT:    blr
entry:
  %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
  store <512 x i1> %0, ptr %dst, align 64
  %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
  %call = tail call signext i32 @testRedundantPrimeUnprimeF()
  %add.ptr1 = getelementptr inbounds <512 x i1>, ptr %dst, i64 1
  store <512 x i1> %1, ptr %add.ptr1, align 64
  ret void
}

declare <256 x i1> @llvm.ppc.vsx.lxvp(ptr)
declare void @llvm.ppc.vsx.stxvp(<256 x i1>, ptr)
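; <256 x i1> models a VSX register pair loaded/stored with lxvp/stxvp. In
; test_ldst_1 the pair is loaded at offset 8, which is not a valid DQ-form
; displacement for lxvp, so the prefixed plxvp form is expected instead.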

; Function Attrs: nofree nounwind
define void @test_ldst_1(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, ptr nocapture %resp)  {
; CHECK-LABEL: test_ldst_1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    lxv vs1, 32(r3)
; CHECK-NEXT:    lxv vs0, 48(r3)
; CHECK-NEXT:    lxv vs3, 0(r3)
; CHECK-NEXT:    lxv vs2, 16(r3)
; CHECK-NEXT:    plxvp vsp36, 8(r4), 0
; CHECK-NEXT:    xxmtacc acc0
; CHECK-NEXT:    pmxvf64gernn acc0, vsp36, v2, 0, 0
; CHECK-NEXT:    xxmfacc acc0
; CHECK-NEXT:    stxv vs0, 48(r7)
; CHECK-NEXT:    stxv vs1, 32(r7)
; CHECK-NEXT:    stxv vs2, 16(r7)
; CHECK-NEXT:    stxv vs3, 0(r7)
; CHECK-NEXT:    blr
;
; CHECK-BE-LABEL: test_ldst_1:
; CHECK-BE:       # %bb.0: # %entry
; CHECK-BE-NEXT:    lxv vs1, 16(r3)
; CHECK-BE-NEXT:    lxv vs0, 0(r3)
; CHECK-BE-NEXT:    lxv vs3, 48(r3)
; CHECK-BE-NEXT:    lxv vs2, 32(r3)
; CHECK-BE-NEXT:    plxvp vsp36, 8(r4), 0
; CHECK-BE-NEXT:    xxmtacc acc0
; CHECK-BE-NEXT:    pmxvf64gernn acc0, vsp36, v2, 0, 0
; CHECK-BE-NEXT:    xxmfacc acc0
; CHECK-BE-NEXT:    stxv vs1, 16(r7)
; CHECK-BE-NEXT:    stxv vs0, 0(r7)
; CHECK-BE-NEXT:    stxv vs3, 48(r7)
; CHECK-BE-NEXT:    stxv vs2, 32(r7)
; CHECK-BE-NEXT:    blr
entry:
  %0 = load <512 x i1>, ptr %vqp, align 64
  %1 = getelementptr i8, ptr %vpp, i64 8
  %2 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr %1)
  %3 = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> %0, <256 x i1> %2, <16 x i8> %vc, i32 0, i32 0)
  store <512 x i1> %3, ptr %resp, align 64
  ret void
}

; Function Attrs: nofree nounwind
define void @test_ldst_2(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, ptr nocapture %resp)  {
; CHECK-LABEL: test_ldst_2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    lxv vs1, 32(r3)
; CHECK-NEXT:    lxv vs0, 48(r3)
; CHECK-NEXT:    lxv vs3, 0(r3)
; CHECK-NEXT:    lxv vs2, 16(r3)
; CHECK-NEXT:    xxmtacc acc0
; CHECK-NEXT:    lxvp vsp36, 0(r4)
; CHECK-NEXT:    xvf64gernp acc0, vsp36, v2
; CHECK-NEXT:    xxmfacc acc0
; CHECK-NEXT:    stxv vs0, 48(r7)
; CHECK-NEXT:    stxv vs1, 32(r7)
; CHECK-NEXT:    stxv vs2, 16(r7)
; CHECK-NEXT:    stxv vs3, 0(r7)
; CHECK-NEXT:    blr
;
; CHECK-BE-LABEL: test_ldst_2:
; CHECK-BE:       # %bb.0: # %entry
; CHECK-BE-NEXT:    lxv vs1, 16(r3)
; CHECK-BE-NEXT:    lxv vs0, 0(r3)
; CHECK-BE-NEXT:    lxv vs3, 48(r3)
; CHECK-BE-NEXT:    lxv vs2, 32(r3)
; CHECK-BE-NEXT:    xxmtacc acc0
; CHECK-BE-NEXT:    lxvp vsp36, 0(r4)
; CHECK-BE-NEXT:    xvf64gernp acc0, vsp36, v2
; CHECK-BE-NEXT:    xxmfacc acc0
; CHECK-BE-NEXT:    stxv vs1, 16(r7)
; CHECK-BE-NEXT:    stxv vs0, 0(r7)
; CHECK-BE-NEXT:    stxv vs3, 48(r7)
; CHECK-BE-NEXT:    stxv vs2, 32(r7)
; CHECK-BE-NEXT:    blr
entry:
  %0 = load <512 x i1>, ptr %vqp, align 64
  %1 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr %vpp)
  %2 = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> %0, <256 x i1> %1, <16 x i8> %vc)
  store <512 x i1> %2, ptr %resp, align 64
  ret void
}

; Function Attrs: nofree nounwind
define void @test_ldst_3(ptr nocapture readonly %vqp, i64 %offs, ptr %vpp, <16 x i8> %vc, ptr nocapture %resp)  {
; CHECK-LABEL: test_ldst_3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    lxv vs1, 32(r3)
; CHECK-NEXT:    lxv vs0, 48(r3)
; CHECK-NEXT:    lxv vs3, 0(r3)
; CHECK-NEXT:    lxv vs2, 16(r3)
; CHECK-NEXT:    xxmtacc acc0
; CHECK-NEXT:    lxvp vsp36, 0(r5)
; CHECK-NEXT:    xvf64gernp acc0, vsp36, v2
; CHECK-NEXT:    xxmfacc acc0
; CHECK-NEXT:    stxv vs0, 48(r9)
; CHECK-NEXT:    stxv vs1, 32(r9)
; CHECK-NEXT:    stxv vs2, 16(r9)
; CHECK-NEXT:    stxv vs3, 0(r9)
; CHECK-NEXT:    blr
;
; CHECK-BE-LABEL: test_ldst_3:
; CHECK-BE:       # %bb.0: # %entry
; CHECK-BE-NEXT:    lxv vs1, 16(r3)
; CHECK-BE-NEXT:    lxv vs0, 0(r3)
; CHECK-BE-NEXT:    lxv vs3, 48(r3)
; CHECK-BE-NEXT:    lxv vs2, 32(r3)
; CHECK-BE-NEXT:    xxmtacc acc0
; CHECK-BE-NEXT:    lxvp vsp36, 0(r5)
; CHECK-BE-NEXT:    xvf64gernp acc0, vsp36, v2
; CHECK-BE-NEXT:    xxmfacc acc0
; CHECK-BE-NEXT:    stxv vs1, 16(r9)
; CHECK-BE-NEXT:    stxv vs0, 0(r9)
; CHECK-BE-NEXT:    stxv vs3, 48(r9)
; CHECK-BE-NEXT:    stxv vs2, 32(r9)
; CHECK-BE-NEXT:    blr
entry:
  %0 = load <512 x i1>, ptr %vqp, align 64
  %1 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr %vpp)
  %2 = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> %0, <256 x i1> %1, <16 x i8> %vc)
  store <512 x i1> %2, ptr %resp, align 64
  ret void
}

declare <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1>, <256 x i1>, <16 x i8>, i32, i32)
declare <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1>, <256 x i1>, <16 x i8>)
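; xvf64gernp and pmxvf64gernn are double-precision GERs that take a <256 x i1>
; VSX register pair as the first multiplicand; the "pm" prefix denotes the
; prefixed, masked form whose two trailing i32 operands are the mask values.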