; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 < %s | FileCheck %s
; Module-level data layout and triple: x86-64 with unknown vendor and OS.
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
; Stack reload folding tests.
;
; By including a nop call with sideeffects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.
; Unmasked 512-bit fadd on <32 x half>.
; The inline-asm nop clobbers xmm2-xmm31 and flags; the expected output spills
; zmm1 before the asm and folds the 64-byte reload into vaddph as its memory
; operand.
define <32 x half> @stack_fold_addph_zmm(<32 x half> %a0, <32 x half> %a1) {
; CHECK-LABEL: stack_fold_addph_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vaddph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fadd <32 x half> %a0, %a1
ret <32 x half> %2
}
; Merge-masked 512-bit fadd: the fadd result is selected against a vector
; loaded from %passthru, using %mask bitcast to <32 x i1>.
; The expected output folds the 64-byte reload of the spilled zmm1 into a
; vaddph that writes the passthru register under {%k1}.
define <32 x half> @stack_fold_addph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_addph_zmm_k:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vmovaps (%rsi), %zmm2
; CHECK-NEXT: vaddph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fadd <32 x half> %a0, %a1
%3 = bitcast i32 %mask to <32 x i1>
%4 = load <32 x half>, ptr %passthru
%5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4
ret <32 x half> %5
}
; Merge-masked 512-bit fadd with the fadd operands written as (%a1, %a0).
; The expected output still folds the 64-byte reload of the spilled zmm1 into
; the masked vaddph, i.e. the swapped operand order does not prevent folding.
define <32 x half> @stack_fold_addph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_addph_zmm_k_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vmovaps (%rsi), %zmm2
; CHECK-NEXT: vaddph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fadd <32 x half> %a1, %a0
%3 = bitcast i32 %mask to <32 x i1>
%4 = load <32 x half>, ptr %passthru
%5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4
ret <32 x half> %5
}
; Zero-masked 512-bit fadd: lanes not selected by %mask become zero.
; The expected output folds the 64-byte reload into vaddph with {%k1} {z}.
; NOTE(review): the fadd operands are written (%a1, %a0) although the function
; name carries no _commuted suffix; the expected output folds the reload
; regardless. Confirm the operand order is intentional.
define <32 x half> @stack_fold_addph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) {
; CHECK-LABEL: stack_fold_addph_zmm_kz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vaddph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fadd <32 x half> %a1, %a0
%3 = bitcast i32 %mask to <32 x i1>
%4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer
ret <32 x half> %4
}
; Scalar half fadd.
; The expected output spills xmm1 with vmovsh (annotated as a 4-byte spill)
; and folds the reload into vaddsh.
define half @stack_fold_addsh(half %a0, half %a1) {
; CHECK-LABEL: stack_fold_addsh:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vaddsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fadd half %a0, %a1
ret half %2
}
; Vector-typed scalar add: element 0 of %a0 and %a1 are extracted, added, and
; the sum is inserted back into element 0 of %a0.
; The expected output spills the whole xmm1 (16 bytes) and folds the reload
; into a single vaddsh.
define <8 x half> @stack_fold_addsh_int(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_addsh_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vaddsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = extractelement <8 x half> %a0, i32 0
%3 = extractelement <8 x half> %a1, i32 0
%4 = fadd half %2, %3
%5 = insertelement <8 x half> %a0, half %4, i32 0
ret <8 x half> %5
}
; Unmasked 512-bit half compare through the mask.cmp.ph.512 intrinsic with
; predicate operand 0 and an all-true input mask; the <32 x i1> result is
; returned as an i32.
; The expected output folds the 64-byte reload of zmm1 into vcmpeqph and moves
; the resulting k-register to eax.
define i32 @stack_fold_cmpph(<32 x half> %a0, <32 x half> %a1) {
; CHECK-LABEL: stack_fold_cmpph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcmpeqph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%res = call <32 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.512(<32 x half> %a0, <32 x half> %a1, i32 0, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
%2 = bitcast <32 x i1> %res to i32
ret i32 %2
}
; Compare intrinsic shared by the stack_fold_cmpph* functions.
declare <32 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.512(<32 x half>, <32 x half>, i32, <32 x i1>, i32)
; Compare whose result is ANDed with %mask and then used to select between
; %b0 and %b1. The compare operands are (%a1 + load(%a2), %a0).
; The asm here clobbers xmm1-xmm31, and the expected output spills zmm0-zmm3
; before it. After the asm, one slot is reloaded into zmm0 and added to
; (%rdi); the compare then takes a second slot as a folded memory operand.
; The final blend is a plain reload followed by a masked vmovdqu16 from the
; stack.
define <32 x half> @stack_fold_cmpph_mask(<32 x half> %a0, <32 x half> %a1, ptr %a2, i32 %mask, <32 x half> %b0, <32 x half> %b1) {
; CHECK-LABEL: stack_fold_cmpph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: subq $136, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 144
; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT: vaddph (%rdi), %zmm0, %zmm0
; CHECK-NEXT: vcmpeqph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: kandd %k0, %k1, %k1
; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT: vmovdqu16 (%rsp), %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: addq $136, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load
%2 = load <32 x half>, ptr %a2
%3 = fadd <32 x half> %a1, %2
%4 = bitcast i32 %mask to <32 x i1>
%5 = call <32 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.512(<32 x half> %3, <32 x half> %a0, i32 0, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
%6 = and <32 x i1> %4, %5
%7 = select <32 x i1> %6, <32 x half> %b0, <32 x half> %b1
ret <32 x half> %7
}
; Masked compare with the compare operands written as
; (%a0, %a1 + load(%a2)), i.e. the spilled %a0 is the first operand.
; The expected output matches the shape of the non-commuted test: the
; vcmpeqph still takes a stack slot as a folded memory operand, so the compare
; is commuted to allow the fold.
define <32 x half> @stack_fold_cmpph_mask_commuted(<32 x half> %a0, <32 x half> %a1, ptr %a2, i32 %mask, <32 x half> %b0, <32 x half> %b1) {
; CHECK-LABEL: stack_fold_cmpph_mask_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: subq $136, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 144
; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT: vaddph (%rdi), %zmm0, %zmm0
; CHECK-NEXT: vcmpeqph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: kandd %k0, %k1, %k1
; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT: vmovdqu16 (%rsp), %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: addq $136, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load
%2 = load <32 x half>, ptr %a2
%3 = fadd <32 x half> %a1, %2
%4 = bitcast i32 %mask to <32 x i1>
%5 = call <32 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.512(<32 x half> %a0, <32 x half> %3, i32 0, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
%6 = and <32 x i1> %4, %5
%7 = select <32 x i1> %6, <32 x half> %b0, <32 x half> %b1
ret <32 x half> %7
}
; Scalar half fdiv.
; The expected output spills xmm1 with vmovsh (annotated as a 4-byte spill)
; and folds the reload into vdivsh as the divisor operand.
define half @stack_fold_divsh(half %a0, half %a1) {
; CHECK-LABEL: stack_fold_divsh:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vdivsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fdiv half %a0, %a1
ret half %2
}
; Vector-typed scalar divide: element 0 of %a0 is divided by element 0 of %a1
; and the quotient is inserted back into element 0 of %a0.
; The expected output spills the whole xmm1 (16 bytes) and folds the reload
; into a single vdivsh.
define <8 x half> @stack_fold_divsh_int(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_divsh_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vdivsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = extractelement <8 x half> %a0, i32 0
%3 = extractelement <8 x half> %a1, i32 0
%4 = fdiv half %2, %3
%5 = insertelement <8 x half> %a0, half %4, i32 0
ret <8 x half> %5
}
; Unmasked 512-bit fpclass with immediate 4; the <32 x i1> result is returned
; as an i32.
; The asm clobbers xmm1-xmm31, and the expected output spills the single
; vector argument (zmm0) and folds its 64-byte reload into vfpclassphz $4.
define i32 @stack_fold_fpclassph(<32 x half> %a0) {
; CHECK-LABEL: stack_fold_fpclassph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfpclassphz $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 64-byte Folded Reload
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i1> @llvm.x86.avx512fp16.fpclass.ph.512(<32 x half> %a0, i32 4)
%3 = bitcast <32 x i1> %2 to i32
ret i32 %3
}
; Packed fpclass intrinsic shared by the stack_fold_fpclassph* functions.
declare <32 x i1> @llvm.x86.avx512fp16.fpclass.ph.512(<32 x half>, i32)
; Masked 512-bit fpclass: the intrinsic result is ANDed with a <32 x i1> mask
; loaded from %p.
; The expected output loads the mask straight into k1 with kmovd and emits a
; masked vfpclassphz $4 that folds the 64-byte reload of zmm0.
define i32 @stack_fold_fpclassph_mask(<32 x half> %a0, ptr %p) {
; CHECK-LABEL: stack_fold_fpclassph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd (%rdi), %k1
; CHECK-NEXT: vfpclassphz $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i1> @llvm.x86.avx512fp16.fpclass.ph.512(<32 x half> %a0, i32 4)
%mask = load <32 x i1>, ptr %p
%3 = and <32 x i1> %2, %mask
%4 = bitcast <32 x i1> %3 to i32
ret i32 %4
}
; Scalar fpclass with immediate 4 and an all-ones (i8 -1) mask operand.
; The expected output spills xmm0 (16 bytes) and folds the reload into
; vfpclasssh $4; the i8 result is returned in al.
define i8 @stack_fold_fpclasssh(<8 x half> %a0) {
; CHECK-LABEL: stack_fold_fpclasssh:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfpclasssh $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 16-byte Folded Reload
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half> %a0, i32 4, i8 -1)
ret i8 %2
}
; Scalar fpclass intrinsic shared by the stack_fold_fpclasssh* functions.
declare i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half>, i32, i8)
; Scalar fpclass with immediate 4 and an i8 mask loaded from %p.
; The expected output loads the mask into k1 with kmovb and emits a masked
; vfpclasssh $4 that folds the 16-byte reload of xmm0.
define i8 @stack_fold_fpclasssh_mask(<8 x half> %a0, ptr %p) {
; CHECK-LABEL: stack_fold_fpclasssh_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfpclasssh $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%mask = load i8, ptr %p
%2 = call i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half> %a0, i32 4, i8 %mask)
ret i8 %2
}
; Unmasked 512-bit getexp: the intrinsic is called with an undef passthru and
; an all-ones (i32 -1) mask.
; The expected output spills zmm0 and folds the 64-byte reload into vgetexpph.
define <32 x half> @stack_fold_getexpph(<32 x half> %a0) {
; CHECK-LABEL: stack_fold_getexpph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> %a0, <32 x half> undef, i32 -1, i32 4)
ret <32 x half> %2
}
; Packed getexp intrinsic shared by the stack_fold_getexpph* functions.
declare <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half>, <32 x half>, i32, i32)
; Merge-masked 512-bit getexp: the passthru is loaded from %passthru and the
; mask comes in as an i32 argument.
; In the expected output the mask is moved to k1 before the asm block; after
; it, the passthru is loaded and the masked vgetexpph folds the 64-byte reload
; of zmm0.
define <32 x half> @stack_fold_getexpph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) {
; CHECK-LABEL: stack_fold_getexpph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %zmm1
; CHECK-NEXT: vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <32 x half>, ptr %passthru
%3 = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> %a0, <32 x half> %2, i32 %mask, i32 4)
ret <32 x half> %3
}
; Zero-masked 512-bit getexp: zeroinitializer passthru, i32 mask loaded from
; %mask.
; The expected output loads the mask into k1 with kmovd and folds the 64-byte
; reload of zmm0 into vgetexpph with {%k1} {z}.
define <32 x half> @stack_fold_getexpph_maskz(<32 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_getexpph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd (%rdi), %k1
; CHECK-NEXT: vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i32, ptr %mask
%3 = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> %a0, <32 x half> zeroinitializer, i32 %2, i32 4)
ret <32 x half> %3
}
; Unmasked scalar getexp on two <8 x half> operands, with undef passthru and
; an all-ones (i8 -1) mask.
; The expected output spills xmm1 (16 bytes) and folds the reload into
; vgetexpsh.
define <8 x half> @stack_fold_getexpsh(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_getexpsh:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vgetexpsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 4)
ret <8 x half> %2
}
; Scalar getexp intrinsic shared by the stack_fold_getexpsh* functions.
declare <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half>, <8 x half>, <8 x half>, i8, i32)
; Merge-masked scalar getexp: the passthru is loaded from %passthru and the
; mask comes in as an i8 argument.
; In the expected output the mask is moved to k1 before the asm block; after
; it, the passthru is loaded into xmm2 and the masked vgetexpsh folds the
; 16-byte reload of xmm1.
define <8 x half> @stack_fold_getexpsh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_getexpsh_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %xmm2
; CHECK-NEXT: vgetexpsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <8 x half>, ptr %passthru
%3 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 4)
ret <8 x half> %3
}
; Zero-masked scalar getexp: zeroinitializer passthru, i8 mask loaded from
; %mask.
; The expected output loads the mask into k1 with kmovb and folds the 16-byte
; reload of xmm1 into vgetexpsh with {%k1} {z}.
define <8 x half> @stack_fold_getexpsh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) {
; CHECK-LABEL: stack_fold_getexpsh_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vgetexpsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i8, ptr %mask
%3 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2, i32 4)
ret <8 x half> %3
}
; Unmasked 512-bit getmant with immediate 8, undef passthru and an all-ones
; (i32 -1) mask.
; The expected output spills zmm0 and folds the 64-byte reload into
; vgetmantph $8.
define <32 x half> @stack_fold_getmantph(<32 x half> %a0) {
; CHECK-LABEL: stack_fold_getmantph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> %a0, i32 8, <32 x half> undef, i32 -1, i32 4)
ret <32 x half> %2
}
; Packed getmant intrinsic shared by the stack_fold_getmantph* functions.
declare <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half>, i32, <32 x half>, i32, i32)
; Merge-masked 512-bit getmant with immediate 8: the passthru is loaded from
; %passthru and the mask comes in as an i32 argument.
; In the expected output the mask is moved to k1 before the asm block; after
; it, the passthru is loaded and the masked vgetmantph $8 folds the 64-byte
; reload of zmm0.
define <32 x half> @stack_fold_getmantph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) {
; CHECK-LABEL: stack_fold_getmantph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %zmm1
; CHECK-NEXT: vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <32 x half>, ptr %passthru
%3 = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> %a0, i32 8, <32 x half> %2, i32 %mask, i32 4)
ret <32 x half> %3
}
; Zero-masked 512-bit getmant with immediate 8: zeroinitializer passthru, i32
; mask loaded from %mask.
; The expected output loads the mask into k1 with kmovd and folds the 64-byte
; reload of zmm0 into vgetmantph $8 with {%k1} {z}.
define <32 x half> @stack_fold_getmantph_maskz(<32 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_getmantph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd (%rdi), %k1
; CHECK-NEXT: vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i32, ptr %mask
%3 = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> %a0, i32 8, <32 x half> zeroinitializer, i32 %2, i32 4)
ret <32 x half> %3
}
; Unmasked scalar getmant with immediate 8 on two <8 x half> operands, with
; undef passthru and an all-ones (i8 -1) mask.
; The expected output spills xmm1 (16 bytes) and folds the reload into
; vgetmantsh $8.
define <8 x half> @stack_fold_getmantsh(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_getmantsh:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vgetmantsh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %a0, <8 x half> %a1, i32 8, <8 x half> undef, i8 -1, i32 4)
ret <8 x half> %2
}
; Scalar getmant intrinsic shared by the stack_fold_getmantsh* functions.
declare <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half>, <8 x half>, i32, <8 x half>, i8, i32)
; Merge-masked scalar getmant with immediate 8: the passthru is loaded from
; %passthru and the mask comes in as an i8 argument.
; In the expected output the mask is moved to k1 before the asm block; after
; it, the passthru is loaded into xmm2 and the masked vgetmantsh $8 folds the
; 16-byte reload of xmm1.
define <8 x half> @stack_fold_getmantsh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_getmantsh_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %xmm2
; CHECK-NEXT: vgetmantsh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <8 x half>, ptr %passthru
%3 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %a0, <8 x half> %a1, i32 8, <8 x half> %2, i8 %mask, i32 4)
ret <8 x half> %3
}
; Zero-masked scalar getmant with immediate 8: zeroinitializer passthru, i8
; mask loaded from %mask.
; The expected output loads the mask into k1 with kmovb and folds the 16-byte
; reload of xmm1 into vgetmantsh $8 with {%k1} {z}.
define <8 x half> @stack_fold_getmantsh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) {
; CHECK-LABEL: stack_fold_getmantsh_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vgetmantsh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i8, ptr %mask
%3 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %a0, <8 x half> %a1, i32 8, <8 x half> zeroinitializer, i8 %2, i32 4)
ret <8 x half> %3
}
; Unmasked 512-bit max via the max.ph.512 intrinsic, operands (%a0, %a1).
; The expected output spills zmm1 and folds the 64-byte reload into vmaxph.
; Function attribute group #0 is defined outside this view.
define <32 x half> @stack_fold_maxph_zmm(<32 x half> %a0, <32 x half> %a1) #0 {
; CHECK-LABEL: stack_fold_maxph_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
ret <32 x half> %2
}
; Packed max intrinsic shared by the stack_fold_maxph* functions.
declare <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half>, <32 x half>, i32) nounwind readnone
; Unmasked 512-bit max with the intrinsic operands swapped to (%a1, %a0).
; Here the expected output does NOT fold: zmm1 is reloaded with a separate
; vmovups and vmaxph is emitted register-to-register.
; NOTE(review): presumably attribute group #0 (defined outside this view) does
; not permit commuting vmaxph, which is why the spilled first operand cannot
; become the memory operand -- confirm against the attribute definition.
define <32 x half> @stack_fold_maxph_zmm_commuted(<32 x half> %a0, <32 x half> %a1) #0 {
; CHECK-LABEL: stack_fold_maxph_zmm_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; CHECK-NEXT: vmaxph %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
ret <32 x half> %2
}
; Merge-masked 512-bit max, operands (%a0, %a1): the intrinsic result is
; selected against a vector loaded from %passthru under %mask.
; The expected output folds the 64-byte reload of zmm1 into a vmaxph that
; writes the passthru register under {%k1}.
define <32 x half> @stack_fold_maxph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 {
; CHECK-LABEL: stack_fold_maxph_zmm_k:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vmovaps (%rsi), %zmm2
; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
%3 = bitcast i32 %mask to <32 x i1>
%4 = load <32 x half>, ptr %passthru
%5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4
ret <32 x half> %5
}
; Merge-masked 512-bit max with the intrinsic operands swapped to (%a1, %a0).
; The expected output does NOT fold: zmm1 is reloaded with a separate vmovups
; and the masked vmaxph is emitted register-to-register.
; NOTE(review): presumably because attribute group #0 (defined outside this
; view) does not permit commuting vmaxph -- confirm.
define <32 x half> @stack_fold_maxph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 {
; CHECK-LABEL: stack_fold_maxph_zmm_k_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vmovaps (%rsi), %zmm2
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; CHECK-NEXT: vmaxph %zmm0, %zmm1, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
%3 = bitcast i32 %mask to <32 x i1>
%4 = load <32 x half>, ptr %passthru
%5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4
ret <32 x half> %5
}
; Zero-masked 512-bit max, operands (%a0, %a1): lanes not selected by %mask
; become zero.
; The expected output folds the 64-byte reload of zmm1 into vmaxph with
; {%k1} {z}.
define <32 x half> @stack_fold_maxph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 {
; CHECK-LABEL: stack_fold_maxph_zmm_kz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
%3 = bitcast i32 %mask to <32 x i1>
%4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer
ret <32 x half> %4
}
; Zero-masked 512-bit max with the intrinsic operands swapped (%a1, %a0).
; zmm1 is expected to be spilled across the clobbering inline asm, but here the
; expected output reloads it with a standalone vmovups and then issues the
; register-register vmaxph {%k1} {z}; no folded reload is expected.
; Attribute group #0 lives outside this excerpt; presumably it keeps the max
; non-commutable -- TODO confirm.
define <32 x half> @stack_fold_maxph_zmm_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 {
; CHECK-LABEL: stack_fold_maxph_zmm_kz_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; CHECK-NEXT: vmaxph %zmm0, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
%3 = bitcast i32 %mask to <32 x i1>
%4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer
ret <32 x half> %4
}
; Unmasked 512-bit max in natural operand order (%a0, %a1), compiled under
; attribute group #1. The expected output spills zmm1 around the clobbering
; inline asm and folds the 64-byte reload into vmaxph.
; Group #1 is defined outside this excerpt; the function name suggests it makes
; the max commutable -- TODO confirm.
define <32 x half> @stack_fold_maxph_zmm_commutable(<32 x half> %a0, <32 x half> %a1) #1 {
; CHECK-LABEL: stack_fold_maxph_zmm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
ret <32 x half> %2
}
; Unmasked 512-bit max with the intrinsic operands swapped (%a1, %a0), under
; attribute group #1. Despite the swap, the expected output is the same
; instruction sequence as for @stack_fold_maxph_zmm_commutable: zmm1 is spilled
; and its reload is folded into vmaxph.
; Group #1 is defined outside this excerpt; presumably it is what permits the
; operands to be commuted for folding -- TODO confirm.
define <32 x half> @stack_fold_maxph_zmm_commutable_commuted(<32 x half> %a0, <32 x half> %a1) #1 {
; CHECK-LABEL: stack_fold_maxph_zmm_commutable_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
ret <32 x half> %2
}
; Merge-masked 512-bit max in natural operand order (%a0, %a1), under attribute
; group #1 (defined outside this excerpt). The expected output loads the
; passthru into zmm2, folds the reload of spilled zmm1 into
; vmaxph ... %zmm2 {%k1}, and copies zmm2 to the return register.
define <32 x half> @stack_fold_maxph_zmm_commutable_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #1 {
; CHECK-LABEL: stack_fold_maxph_zmm_commutable_k:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vmovaps (%rsi), %zmm2
; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
%3 = bitcast i32 %mask to <32 x i1>
%4 = load <32 x half>, ptr %passthru
; Lanes with a clear mask bit take the loaded passthru value.
%5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4
ret <32 x half> %5
}
; Merge-masked 512-bit max with the intrinsic operands swapped (%a1, %a0),
; under attribute group #1. The expected output still folds the reload of
; spilled zmm1 into vmaxph ... %zmm2 {%k1}, matching the instruction sequence
; expected for @stack_fold_maxph_zmm_commutable_k.
; Group #1 is defined outside this excerpt; presumably it allows the commute
; that makes the fold possible -- TODO confirm.
define <32 x half> @stack_fold_maxph_zmm_commutable_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #1 {
; CHECK-LABEL: stack_fold_maxph_zmm_commutable_k_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vmovaps (%rsi), %zmm2
; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
%3 = bitcast i32 %mask to <32 x i1>
%4 = load <32 x half>, ptr %passthru
%5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4
ret <32 x half> %5
}
; Zero-masked 512-bit max in natural operand order (%a0, %a1), under attribute
; group #1 (defined outside this excerpt). zmm1 is expected to be spilled
; across the clobbering inline asm and its reload folded into
; vmaxph ... {%k1} {z}.
define <32 x half> @stack_fold_maxph_zmm_commutable_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #1 {
; CHECK-LABEL: stack_fold_maxph_zmm_commutable_kz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
%3 = bitcast i32 %mask to <32 x i1>
; Masked-off lanes are zeroed rather than merged.
%4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer
ret <32 x half> %4
}
; Zero-masked 512-bit max with the intrinsic operands swapped (%a1, %a0), under
; attribute group #1. The expected output is the same sequence as for
; @stack_fold_maxph_zmm_commutable_kz: the reload of spilled zmm1 is folded
; into vmaxph ... {%k1} {z} even though the operands are swapped in the IR.
; Group #1 is defined outside this excerpt; presumably it permits the commute
; -- TODO confirm.
define <32 x half> @stack_fold_maxph_zmm_commutable_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #1 {
; CHECK-LABEL: stack_fold_maxph_zmm_commutable_kz_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
%3 = bitcast i32 %mask to <32 x i1>
%4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer
ret <32 x half> %4
}
; Scalar half max written as "fcmp ogt" + select on (%a0, %a1).
; The inline asm clobbers xmm2-xmm31 and flags; the expected output spills
; xmm1 as a 4-byte slot and folds its reload into vmaxsh.
define half @stack_fold_maxsh(half %a0, half %a1) #0 {
; CHECK-LABEL: stack_fold_maxsh:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmaxsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; Select %a0 when it compares ordered-greater-than %a1, otherwise %a1.
%2 = fcmp ogt half %a0, %a1
%3 = select i1 %2, half %a0, half %a1
ret half %3
}
; Scalar half max with the roles of %a0 and %a1 swapped in both the compare and
; the select. xmm1 is expected to be spilled across the clobbering inline asm,
; then reloaded with a standalone vmovsh before a register-register vmaxsh;
; no folded reload is expected in this variant.
; Attribute group #0 is defined outside this excerpt; presumably it keeps the
; max non-commutable -- TODO confirm.
define half @stack_fold_maxsh_commuted(half %a0, half %a1) #0 {
; CHECK-LABEL: stack_fold_maxsh_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vmaxsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fcmp ogt half %a1, %a0
%3 = select i1 %2, half %a1, half %a0
ret half %3
}
; Scalar half max ("fcmp ogt" + select on %a0, %a1) under attribute group #1
; (defined outside this excerpt). The expected output spills xmm1 across the
; clobbering inline asm and folds the 4-byte reload into vmaxsh.
define half @stack_fold_maxsh_commutable(half %a0, half %a1) #1 {
; CHECK-LABEL: stack_fold_maxsh_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmaxsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fcmp ogt half %a0, %a1
%3 = select i1 %2, half %a0, half %a1
ret half %3
}
; Scalar half max with %a0 and %a1 swapped in the compare and select, under
; attribute group #1. Unlike @stack_fold_maxsh_commuted, the expected output
; here folds the 4-byte reload of spilled xmm1 directly into vmaxsh.
; Group #1 is defined outside this excerpt; presumably it is what allows the
; operands to be commuted -- TODO confirm.
define half @stack_fold_maxsh_commutable_commuted(half %a0, half %a1) #1 {
; CHECK-LABEL: stack_fold_maxsh_commutable_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmaxsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fcmp ogt half %a1, %a0
%3 = select i1 %2, half %a1, half %a0
ret half %3
}
; Scalar max through the mask.max.sh.round intrinsic on full <8 x half>
; vectors, called with an undef passthru and an all-ones i8 mask (-1).
; The expected output spills xmm1 as a 16-byte slot across the clobbering
; inline asm and folds the reload into an unmasked vmaxsh.
define <8 x half> @stack_fold_maxsh_int(<8 x half> %a0, <8 x half> %a1) #0 {
; CHECK-LABEL: stack_fold_maxsh_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmaxsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 4)
ret <8 x half> %2
}
; Masked scalar-half max intrinsic. In this file it is called with two source
; vectors, a passthru vector, an i8 mask, and a trailing i32 that is always 4.
declare <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half>, <8 x half>, <8 x half>, i8, i32)
; Merge-masked scalar max via the mask.max.sh.round intrinsic, with the
; passthru vector loaded from %passthru and the mask taken from %mask.
; The expected output moves the mask into k1 before the inline asm, loads the
; passthru into xmm2 after it, folds the 16-byte reload of spilled xmm1 into
; vmaxsh ... %xmm2 {%k1}, and copies xmm2 to the return register.
define <8 x half> @stack_fold_maxsh_mask(<8 x half> %a0, <8 x half> %a1, i8 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_maxsh_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rsi), %xmm2
; CHECK-NEXT: vmaxsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; The passthru is loaded after the inline asm in the IR.
%2 = load <8 x half>, ptr %passthru
%3 = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 4)
ret <8 x half> %3
}
; Zero-masked scalar max via the mask.max.sh.round intrinsic: the passthru
; argument is zeroinitializer and the mask comes from %mask.
; The expected output moves the mask into k1 before the inline asm and folds
; the 16-byte reload of spilled xmm1 into vmaxsh ... {%k1} {z}.
define <8 x half> @stack_fold_maxsh_maskz(<8 x half> %a0, <8 x half> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_maxsh_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmaxsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %mask, i32 4)
ret <8 x half> %2
}
; Unmasked 512-bit min in natural operand order (%a0, %a1).
; The inline asm clobbers xmm2-xmm31 and flags; the expected output spills
; zmm1 around it and folds the 64-byte reload into vminph.
define <32 x half> @stack_fold_minph_zmm(<32 x half> %a0, <32 x half> %a1) #0 {
; CHECK-LABEL: stack_fold_minph_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
ret <32 x half> %2
}
; 512-bit packed-half min intrinsic. In this file it is called with two source
; vectors and a trailing i32 that is always 4.
declare <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half>, <32 x half>, i32) nounwind readnone
; Unmasked 512-bit min with the intrinsic operands swapped (%a1, %a0).
; zmm1 is expected to be spilled across the clobbering inline asm and then
; reloaded by a standalone vmovups, followed by the register-register vminph;
; no folded reload is expected.
; Attribute group #0 is defined outside this excerpt; presumably it keeps the
; min non-commutable -- TODO confirm.
define <32 x half> @stack_fold_minph_zmm_commuted(<32 x half> %a0, <32 x half> %a1) #0 {
; CHECK-LABEL: stack_fold_minph_zmm_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; CHECK-NEXT: vminph %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
ret <32 x half> %2
}
; Merge-masked 512-bit min in natural operand order (%a0, %a1).
; The expected output spills zmm1 across the clobbering inline asm, loads the
; passthru into zmm2, folds the reload into vminph ... %zmm2 {%k1}, and copies
; zmm2 to the return register.
define <32 x half> @stack_fold_minph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 {
; CHECK-LABEL: stack_fold_minph_zmm_k:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vmovaps (%rsi), %zmm2
; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
; One mask bit per half lane; clear bits select the loaded passthru.
%3 = bitcast i32 %mask to <32 x i1>
%4 = load <32 x half>, ptr %passthru
%5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4
ret <32 x half> %5
}
; Merge-masked 512-bit min with the intrinsic operands swapped (%a1, %a0).
; zmm1 is expected to be spilled across the clobbering inline asm; the expected
; output then reloads it with a standalone vmovups and uses the
; register-register vminph ... %zmm2 {%k1} instead of a folded reload.
; Attribute group #0 is defined outside this excerpt; presumably it keeps the
; min non-commutable -- TODO confirm.
define <32 x half> @stack_fold_minph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 {
; CHECK-LABEL: stack_fold_minph_zmm_k_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vmovaps (%rsi), %zmm2
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; CHECK-NEXT: vminph %zmm0, %zmm1, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
%3 = bitcast i32 %mask to <32 x i1>
%4 = load <32 x half>, ptr %passthru
%5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4
ret <32 x half> %5
}
; Zero-masked 512-bit min in natural operand order (%a0, %a1).
; The expected output spills zmm1 across the clobbering inline asm and folds
; the 64-byte reload into vminph ... {%k1} {z}.
define <32 x half> @stack_fold_minph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 {
; CHECK-LABEL: stack_fold_minph_zmm_kz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
%3 = bitcast i32 %mask to <32 x i1>
; Lanes with a clear mask bit become zero.
%4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer
ret <32 x half> %4
}
; Zero-masked 512-bit min with the intrinsic operands swapped (%a1, %a0).
; The expected output spills zmm1 across the clobbering inline asm, reloads it
; with a standalone vmovups, and then uses the register-register
; vminph {%k1} {z}; no folded reload is expected.
; Attribute group #0 is defined outside this excerpt; presumably it keeps the
; min non-commutable -- TODO confirm.
define <32 x half> @stack_fold_minph_zmm_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 {
; CHECK-LABEL: stack_fold_minph_zmm_kz_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; CHECK-NEXT: vminph %zmm0, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
%3 = bitcast i32 %mask to <32 x i1>
%4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer
ret <32 x half> %4
}
; Unmasked 512-bit min in natural operand order (%a0, %a1), compiled under
; attribute group #1. The expected output spills zmm1 around the clobbering
; inline asm and folds the 64-byte reload into vminph.
; Group #1 is defined outside this excerpt; the function name suggests it makes
; the min commutable -- TODO confirm.
define <32 x half> @stack_fold_minph_zmm_commutable(<32 x half> %a0, <32 x half> %a1) #1 {
; CHECK-LABEL: stack_fold_minph_zmm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
ret <32 x half> %2
}
; Unmasked 512-bit min with the intrinsic operands swapped (%a1, %a0), under
; attribute group #1. The expected output matches the sequence expected for
; @stack_fold_minph_zmm_commutable: zmm1 is spilled and its reload is folded
; into vminph despite the swapped IR operands.
; Group #1 is defined outside this excerpt; presumably it permits the commute
; -- TODO confirm.
define <32 x half> @stack_fold_minph_zmm_commutable_commuted(<32 x half> %a0, <32 x half> %a1) #1 {
; CHECK-LABEL: stack_fold_minph_zmm_commutable_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
ret <32 x half> %2
}
; Merge-masked 512-bit min in natural operand order (%a0, %a1), under attribute
; group #1 (defined outside this excerpt). The expected output loads the
; passthru into zmm2, folds the reload of spilled zmm1 into
; vminph ... %zmm2 {%k1}, and copies zmm2 to the return register.
define <32 x half> @stack_fold_minph_zmm_commutable_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #1 {
; CHECK-LABEL: stack_fold_minph_zmm_commutable_k:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vmovaps (%rsi), %zmm2
; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
%3 = bitcast i32 %mask to <32 x i1>
%4 = load <32 x half>, ptr %passthru
; Lanes with a clear mask bit take the loaded passthru value.
%5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4
ret <32 x half> %5
}
; Merge-masked 512-bit min with the intrinsic operands swapped (%a1, %a0),
; under attribute group #1. The expected output still folds the reload of
; spilled zmm1 into vminph ... %zmm2 {%k1}, the same sequence as expected for
; @stack_fold_minph_zmm_commutable_k.
; Group #1 is defined outside this excerpt; presumably it allows the commute
; that makes the fold possible -- TODO confirm.
define <32 x half> @stack_fold_minph_zmm_commutable_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #1 {
; CHECK-LABEL: stack_fold_minph_zmm_commutable_k_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vmovaps (%rsi), %zmm2
; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
%3 = bitcast i32 %mask to <32 x i1>
%4 = load <32 x half>, ptr %passthru
%5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4
ret <32 x half> %5
}
; Zero-masked 512-bit min in natural operand order (%a0, %a1), under attribute
; group #1 (defined outside this excerpt). zmm1 is expected to be spilled
; across the clobbering inline asm and its reload folded into
; vminph ... {%k1} {z}.
define <32 x half> @stack_fold_minph_zmm_commutable_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #1 {
; CHECK-LABEL: stack_fold_minph_zmm_commutable_kz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
%3 = bitcast i32 %mask to <32 x i1>
; Masked-off lanes are zeroed rather than merged.
%4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer
ret <32 x half> %4
}
; Zero-masked 512-bit min with the intrinsic operands swapped (%a1, %a0), under
; attribute group #1. The expected output is the same sequence as for
; @stack_fold_minph_zmm_commutable_kz: the reload of spilled zmm1 is folded
; into vminph ... {%k1} {z} even though the IR operands are swapped.
; Group #1 is defined outside this excerpt; presumably it permits the commute
; -- TODO confirm.
define <32 x half> @stack_fold_minph_zmm_commutable_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #1 {
; CHECK-LABEL: stack_fold_minph_zmm_commutable_kz_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
%3 = bitcast i32 %mask to <32 x i1>
%4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer
ret <32 x half> %4
}
; Scalar half min written as "fcmp olt" + select on (%a0, %a1).
; The inline asm clobbers xmm2-xmm31 and flags; the expected output spills
; xmm1 as a 4-byte slot and folds its reload into vminsh.
define half @stack_fold_minsh(half %a0, half %a1) #0 {
; CHECK-LABEL: stack_fold_minsh:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vminsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; Select %a0 when it compares ordered-less-than %a1, otherwise %a1.
%2 = fcmp olt half %a0, %a1
%3 = select i1 %2, half %a0, half %a1
ret half %3
}
; Scalar half min with the roles of %a0 and %a1 swapped in both the compare and
; the select. xmm1 is expected to be spilled across the clobbering inline asm,
; then reloaded with a standalone vmovsh before a register-register vminsh;
; no folded reload is expected in this variant.
; Attribute group #0 is defined outside this excerpt; presumably it keeps the
; min non-commutable -- TODO confirm.
define half @stack_fold_minsh_commuted(half %a0, half %a1) #0 {
; CHECK-LABEL: stack_fold_minsh_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vminsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fcmp olt half %a1, %a0
%3 = select i1 %2, half %a1, half %a0
ret half %3
}
; Scalar half min ("fcmp olt" + select on %a0, %a1) under attribute group #1
; (defined outside this excerpt). The expected output spills xmm1 across the
; clobbering inline asm and folds the 4-byte reload into vminsh.
define half @stack_fold_minsh_commutable(half %a0, half %a1) #1 {
; CHECK-LABEL: stack_fold_minsh_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vminsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fcmp olt half %a0, %a1
%3 = select i1 %2, half %a0, half %a1
ret half %3
}
; Scalar half min with %a0 and %a1 swapped in the compare and select, under
; attribute group #1. Unlike @stack_fold_minsh_commuted, the expected output
; here folds the 4-byte reload of spilled xmm1 directly into vminsh.
; Group #1 is defined outside this excerpt; presumably it is what allows the
; operands to be commuted -- TODO confirm.
define half @stack_fold_minsh_commutable_commuted(half %a0, half %a1) #1 {
; CHECK-LABEL: stack_fold_minsh_commutable_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vminsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fcmp olt half %a1, %a0
%3 = select i1 %2, half %a1, half %a0
ret half %3
}
; Scalar min through the mask.min.sh.round intrinsic on full <8 x half>
; vectors, called with an undef passthru and an all-ones i8 mask (-1).
; The expected output spills xmm1 as a 16-byte slot across the clobbering
; inline asm and folds the reload into an unmasked vminsh.
define <8 x half> @stack_fold_minsh_int(<8 x half> %a0, <8 x half> %a1) #0 {
; CHECK-LABEL: stack_fold_minsh_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vminsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 4)
ret <8 x half> %2
}
; Masked scalar-half min intrinsic. In this file it is called with two source
; vectors, a passthru vector, an i8 mask, and a trailing i32 that is always 4.
declare <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half>, <8 x half>, <8 x half>, i8, i32)
; Merge-masked scalar min via the mask.min.sh.round intrinsic, with the
; passthru vector loaded from %passthru and the mask taken from %mask.
; The expected output moves the mask into k1 before the inline asm, loads the
; passthru into xmm2 after it, folds the 16-byte reload of spilled xmm1 into
; vminsh ... %xmm2 {%k1}, and copies xmm2 to the return register.
define <8 x half> @stack_fold_minsh_mask(<8 x half> %a0, <8 x half> %a1, i8 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_minsh_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rsi), %xmm2
; CHECK-NEXT: vminsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; The passthru is loaded after the inline asm in the IR.
%2 = load <8 x half>, ptr %passthru
%3 = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 4)
ret <8 x half> %3
}
; Zero-masked scalar min via the mask.min.sh.round intrinsic: the passthru
; argument is zeroinitializer and the mask comes from %mask.
; The expected output moves the mask into k1 before the inline asm and folds
; the 16-byte reload of spilled xmm1 into vminsh ... {%k1} {z}.
define <8 x half> @stack_fold_minsh_maskz(<8 x half> %a0, <8 x half> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_minsh_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vminsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %mask, i32 4)
ret <8 x half> %2
}
; Unmasked 512-bit packed-half multiply expressed as a plain IR fmul.
; The inline asm clobbers xmm2-xmm31 and flags; the expected output spills
; zmm1 around it and folds the 64-byte reload into vmulph.
define <32 x half> @stack_fold_mulph_zmm(<32 x half> %a0, <32 x half> %a1) {
; CHECK-LABEL: stack_fold_mulph_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmulph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fmul <32 x half> %a0, %a1
ret <32 x half> %2
}
; Merge-masked 512-bit fmul: the select against the loaded passthru is expected
; to become a {%k1}-masked vmulph into zmm2 with %a1 folded from the stack,
; followed by a copy of zmm2 into zmm0.
define <32 x half> @stack_fold_mulph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_mulph_zmm_k:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vmovaps (%rsi), %zmm2
; CHECK-NEXT: vmulph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
; Clobbering xmm2-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fmul <32 x half> %a0, %a1
; The i32 mask is reinterpreted as one bit per half element for the select.
%3 = bitcast i32 %mask to <32 x i1>
%4 = load <32 x half>, ptr %passthru
%5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4
ret <32 x half> %5
}
; Merge-masked 512-bit fmul with the IR operands written as (%a1, %a0).
; The expected assembly still folds the spilled %a1 as the memory operand of
; vmulph, with %zmm0 as the register source.
define <32 x half> @stack_fold_mulph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_mulph_zmm_k_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vmovaps (%rsi), %zmm2
; CHECK-NEXT: vmulph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
; Clobbering xmm2-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; Operands deliberately swapped relative to the argument order.
%2 = fmul <32 x half> %a1, %a0
%3 = bitcast i32 %mask to <32 x i1>
%4 = load <32 x half>, ptr %passthru
%5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4
ret <32 x half> %5
}
; Zero-masked 512-bit fmul: the select against zeroinitializer is expected to
; become a {%k1} {z} vmulph with the spilled %a1 folded from the stack.
define <32 x half> @stack_fold_mulph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) {
; CHECK-LABEL: stack_fold_mulph_zmm_kz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vmulph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
; Clobbering xmm2-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; Note the operands are written as (%a1, %a0) in this test.
%2 = fmul <32 x half> %a1, %a0
%3 = bitcast i32 %mask to <32 x i1>
%4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer
ret <32 x half> %4
}
; Scalar half fmul: %a1 is spilled with vmovsh and the expected vmulsh takes it
; as a folded stack operand (reported as a 4-byte spill/reload).
define half @stack_fold_mulsh(half %a0, half %a1) {
; CHECK-LABEL: stack_fold_mulsh:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmulsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT: retq
; Clobbering xmm2-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fmul half %a0, %a1
ret half %2
}
; Scalar half fmul expressed on vector operands: element 0 of %a0 and %a1 are
; multiplied and the product is inserted back into element 0 of %a0. The whole
; sequence is expected to collapse into one vmulsh with a 16-byte folded reload.
define <8 x half> @stack_fold_mulsh_int(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_mulsh_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmulsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
; Clobbering xmm2-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = extractelement <8 x half> %a0, i32 0
%3 = extractelement <8 x half> %a1, i32 0
%4 = fmul half %2, %3
; Upper elements of the result come from %a0.
%5 = insertelement <8 x half> %a0, half %4, i32 0
ret <8 x half> %5
}
; Unmasked vrcpph: the single vector argument %a0 is spilled around the nop and
; the expected vrcpph reads it from the stack slot.
define <32 x half> @stack_fold_rcpph(<32 x half> %a0) {
; CHECK-LABEL: stack_fold_rcpph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
; Only one vector argument is live, so the clobber list starts at xmm1 (xmm1-xmm31).
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; All-ones mask (i32 -1) with an undef passthru: the expected instruction has no mask operand.
%2 = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> %a0, <32 x half> undef, i32 -1)
ret <32 x half> %2
}
; Masked rcp.ph.512 intrinsic (source, passthru, i32 mask) used by the stack_fold_rcpph* tests.
declare <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half>, <32 x half>, i32)
; Merge-masked vrcpph: the passthru is loaded into zmm1, the spilled %a0 is the
; folded stack operand of the {%k1}-masked vrcpph, and zmm1 is copied to zmm0.
define <32 x half> @stack_fold_rcpph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) {
; CHECK-LABEL: stack_fold_rcpph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %zmm1
; CHECK-NEXT: vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
; Clobbering xmm1-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <32 x half>, ptr %passthru
%3 = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> %a0, <32 x half> %2, i32 %mask)
ret <32 x half> %3
}
; Zero-masked vrcpph: the mask is loaded from memory (kmovd (%rdi) after the nop)
; and the spilled %a0 is the folded stack operand of the {%k1} {z} vrcpph.
define <32 x half> @stack_fold_rcpph_maskz(<32 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_rcpph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd (%rdi), %k1
; CHECK-NEXT: vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
; Clobbering xmm1-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i32, ptr %mask
%3 = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> %a0, <32 x half> zeroinitializer, i32 %2)
ret <32 x half> %3
}
; Unmasked vrcpsh: %a1 is spilled around the nop and the expected vrcpsh takes
; it as a folded 16-byte stack operand.
define <8 x half> @stack_fold_rcpsh(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_rcpsh:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vrcpsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
; Clobbering xmm2-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; All-ones mask (i8 -1) with an undef passthru: the expected instruction has no mask operand.
%2 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1)
ret <8 x half> %2
}
; Masked rcp.sh intrinsic (two sources, passthru, i8 mask) used by the stack_fold_rcpsh* tests.
declare <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half>, <8 x half>, <8 x half>, i8)
; Merge-masked vrcpsh: the passthru is loaded into xmm2, the spilled %a1 is the
; folded stack operand of the {%k1}-masked vrcpsh, and xmm2 is copied to xmm0.
define <8 x half> @stack_fold_rcpsh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_rcpsh_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %xmm2
; CHECK-NEXT: vrcpsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
; Clobbering xmm2-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <8 x half>, ptr %passthru
%3 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask)
ret <8 x half> %3
}
; Zero-masked vrcpsh: the i8 mask is loaded from memory (kmovb (%rdi) after the
; nop) and the spilled %a1 is the folded stack operand of the {%k1} {z} vrcpsh.
define <8 x half> @stack_fold_rcpsh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) {
; CHECK-LABEL: stack_fold_rcpsh_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vrcpsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
; Clobbering xmm2-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i8, ptr %mask
%3 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2)
ret <8 x half> %3
}
; Unmasked vreduceph: %a0 is spilled around the nop and the expected vreduceph
; reads it from the stack slot; the i32 8 operand appears as the $8 immediate.
define <32 x half> @stack_fold_reduceph(<32 x half> %a0) {
; CHECK-LABEL: stack_fold_reduceph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
; Clobbering xmm1-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; All-ones mask (i32 -1) with an undef passthru: the expected instruction has no mask operand.
; NOTE(review): the trailing i32 4 is presumably the rounding/SAE operand - confirm against the intrinsic definition.
%2 = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> %a0, i32 8, <32 x half> undef, i32 -1, i32 4)
ret <32 x half> %2
}
; Masked reduce.ph.512 intrinsic used by the stack_fold_reduceph* tests.
declare <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half>, i32, <32 x half>, i32, i32)
; Merge-masked vreduceph: the passthru is loaded into zmm1, the spilled %a0 is
; the folded stack operand of the {%k1}-masked vreduceph, and zmm1 is copied to zmm0.
define <32 x half> @stack_fold_reduceph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) {
; CHECK-LABEL: stack_fold_reduceph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %zmm1
; CHECK-NEXT: vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
; Clobbering xmm1-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <32 x half>, ptr %passthru
%3 = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> %a0, i32 8, <32 x half> %2, i32 %mask, i32 4)
ret <32 x half> %3
}
; Zero-masked vreduceph: the mask is loaded from memory (kmovd (%rdi) after the
; nop) and the spilled %a0 is the folded stack operand of the {%k1} {z} vreduceph.
define <32 x half> @stack_fold_reduceph_maskz(<32 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_reduceph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd (%rdi), %k1
; CHECK-NEXT: vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
; Clobbering xmm1-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i32, ptr %mask
%3 = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> %a0, i32 8, <32 x half> zeroinitializer, i32 %2, i32 4)
ret <32 x half> %3
}
; Unmasked vreducesh: %a1 is spilled around the nop and the expected vreducesh
; takes it as a folded 16-byte stack operand; the i32 8 operand appears as $8.
define <8 x half> @stack_fold_reducesh(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_reducesh:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vreducesh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
; Clobbering xmm2-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; All-ones mask (i8 -1) with an undef passthru: the expected instruction has no mask operand.
; NOTE(review): the trailing i32 4 is presumably the rounding/SAE operand - confirm against the intrinsic definition.
%2 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 8, i32 4)
ret <8 x half> %2
}
; Masked reduce.sh intrinsic used by the stack_fold_reducesh* tests.
declare <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half>, <8 x half>, <8 x half>, i8, i32, i32)
; Merge-masked vreducesh: the passthru is loaded into xmm2, the spilled %a1 is
; the folded stack operand of the {%k1}-masked vreducesh, and xmm2 is copied to xmm0.
define <8 x half> @stack_fold_reducesh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_reducesh_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %xmm2
; CHECK-NEXT: vreducesh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
; Clobbering xmm2-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <8 x half>, ptr %passthru
%3 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 8, i32 4)
ret <8 x half> %3
}
; Zero-masked vreducesh: the i8 mask is loaded from memory (kmovb (%rdi) after
; the nop) and the spilled %a1 is the folded stack operand of the {%k1} {z} vreducesh.
define <8 x half> @stack_fold_reducesh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) {
; CHECK-LABEL: stack_fold_reducesh_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vreducesh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
; Clobbering xmm2-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i8, ptr %mask
%3 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2, i32 8, i32 4)
ret <8 x half> %3
}
; Unmasked vrndscaleph: %a0 is spilled around the nop and the expected
; vrndscaleph reads it from the stack slot; the i32 8 operand appears as $8.
define <32 x half> @stack_fold_rndscaleph(<32 x half> %a0) {
; CHECK-LABEL: stack_fold_rndscaleph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
; Clobbering xmm1-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; All-ones mask (i32 -1) with an undef passthru: the expected instruction has no mask operand.
; NOTE(review): the trailing i32 4 is presumably the rounding/SAE operand - confirm against the intrinsic definition.
%2 = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> %a0, i32 8, <32 x half> undef, i32 -1, i32 4)
ret <32 x half> %2
}
; Masked rndscale.ph.512 intrinsic used by the stack_fold_rndscaleph* tests.
declare <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half>, i32, <32 x half>, i32, i32)
; Merge-masked vrndscaleph: the passthru is loaded into zmm1, the spilled %a0 is
; the folded stack operand of the {%k1}-masked vrndscaleph, and zmm1 is copied to zmm0.
define <32 x half> @stack_fold_rndscaleph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) {
; CHECK-LABEL: stack_fold_rndscaleph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %zmm1
; CHECK-NEXT: vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
; Clobbering xmm1-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <32 x half>, ptr %passthru
%3 = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> %a0, i32 8, <32 x half> %2, i32 %mask, i32 4)
ret <32 x half> %3
}
; Zero-masked vrndscaleph: the mask is loaded from memory (kmovd (%rdi) after
; the nop) and the spilled %a0 is the folded stack operand of the {%k1} {z} vrndscaleph.
define <32 x half> @stack_fold_rndscaleph_maskz(<32 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_rndscaleph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd (%rdi), %k1
; CHECK-NEXT: vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
; Clobbering xmm1-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i32, ptr %mask
%3 = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> %a0, i32 8, <32 x half> zeroinitializer, i32 %2, i32 4)
ret <32 x half> %3
}
; Unmasked vrndscalesh: %a1 is spilled around the nop and the expected
; vrndscalesh takes it as a folded 16-byte stack operand; i32 8 appears as $8.
define <8 x half> @stack_fold_rndscalesh(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_rndscalesh:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vrndscalesh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
; Clobbering xmm2-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; All-ones mask (i8 -1) with an undef passthru: the expected instruction has no mask operand.
; NOTE(review): the trailing i32 4 is presumably the rounding/SAE operand - confirm against the intrinsic definition.
%2 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 8, i32 4)
ret <8 x half> %2
}
; Masked rndscale.sh intrinsic used by the stack_fold_rndscalesh* tests.
declare <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half>, <8 x half>, <8 x half>, i8, i32, i32)
; Merge-masked vrndscalesh: the passthru is loaded into xmm2, the spilled %a1 is
; the folded stack operand of the {%k1}-masked vrndscalesh, and xmm2 is copied to xmm0.
define <8 x half> @stack_fold_rndscalesh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_rndscalesh_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %xmm2
; CHECK-NEXT: vrndscalesh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
; Clobbering xmm2-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <8 x half>, ptr %passthru
%3 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 8, i32 4)
ret <8 x half> %3
}
; Zero-masked vrndscalesh: the i8 mask is loaded from memory (kmovb (%rdi) after
; the nop) and the spilled %a1 is the folded stack operand of the {%k1} {z} vrndscalesh.
define <8 x half> @stack_fold_rndscalesh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) {
; CHECK-LABEL: stack_fold_rndscalesh_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vrndscalesh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
; Clobbering xmm2-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i8, ptr %mask
%3 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2, i32 8, i32 4)
ret <8 x half> %3
}
; Unmasked vrsqrtph: %a0 is spilled around the nop and the expected vrsqrtph
; reads it from the stack slot.
define <32 x half> @stack_fold_rsqrtph(<32 x half> %a0) {
; CHECK-LABEL: stack_fold_rsqrtph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
; Clobbering xmm1-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; All-ones mask (i32 -1) with an undef passthru: the expected instruction has no mask operand.
%2 = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> %a0, <32 x half> undef, i32 -1)
ret <32 x half> %2
}
; Masked rsqrt.ph.512 intrinsic (source, passthru, i32 mask) used by the stack_fold_rsqrtph* tests.
declare <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half>, <32 x half>, i32)
; Merge-masked vrsqrtph: the passthru is loaded into zmm1, the spilled %a0 is
; the folded stack operand of the {%k1}-masked vrsqrtph, and zmm1 is copied to zmm0.
define <32 x half> @stack_fold_rsqrtph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) {
; CHECK-LABEL: stack_fold_rsqrtph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %zmm1
; CHECK-NEXT: vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
; Clobbering xmm1-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <32 x half>, ptr %passthru
%3 = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> %a0, <32 x half> %2, i32 %mask)
ret <32 x half> %3
}
; Zero-masked vrsqrtph: the mask is loaded from memory (kmovd (%rdi) after the
; nop) and the spilled %a0 is the folded stack operand of the {%k1} {z} vrsqrtph.
define <32 x half> @stack_fold_rsqrtph_maskz(<32 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_rsqrtph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd (%rdi), %k1
; CHECK-NEXT: vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
; Clobbering xmm1-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i32, ptr %mask
%3 = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> %a0, <32 x half> zeroinitializer, i32 %2)
ret <32 x half> %3
}
; Unmasked vrsqrtsh: %a1 is spilled around the nop and the expected vrsqrtsh
; takes it as a folded 16-byte stack operand.
define <8 x half> @stack_fold_rsqrtsh(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_rsqrtsh:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vrsqrtsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
; Clobbering xmm2-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; All-ones mask (i8 -1) with an undef passthru: the expected instruction has no mask operand.
%2 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1)
ret <8 x half> %2
}
; Masked rsqrt.sh intrinsic (two sources, passthru, i8 mask) used by the stack_fold_rsqrtsh* tests.
declare <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half>, <8 x half>, <8 x half>, i8)
; Merge-masked vrsqrtsh: the passthru is loaded into xmm2, the spilled %a1 is
; the folded stack operand of the {%k1}-masked vrsqrtsh, and xmm2 is copied to xmm0.
define <8 x half> @stack_fold_rsqrtsh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_rsqrtsh_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %xmm2
; CHECK-NEXT: vrsqrtsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
; Clobbering xmm2-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <8 x half>, ptr %passthru
%3 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask)
ret <8 x half> %3
}
; Zero-masked vrsqrtsh: the i8 mask is loaded from memory (kmovb (%rdi) after
; the nop) and the spilled %a1 is the folded stack operand of the {%k1} {z} vrsqrtsh.
define <8 x half> @stack_fold_rsqrtsh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) {
; CHECK-LABEL: stack_fold_rsqrtsh_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vrsqrtsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
; Clobbering xmm2-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i8, ptr %mask
%3 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2)
ret <8 x half> %3
}
; Unmasked vsqrtph via the generic llvm.sqrt intrinsic: %a0 is spilled around
; the nop and the expected vsqrtph reads it from the stack slot.
define <32 x half> @stack_fold_sqrtph(<32 x half> %a0) {
; CHECK-LABEL: stack_fold_sqrtph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
; Clobbering xmm1-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0)
ret <32 x half> %2
}
; Generic vector sqrt intrinsic used by the stack_fold_sqrtph* tests.
declare <32 x half> @llvm.sqrt.v32f16(<32 x half>)
; Merge-masked vsqrtph built from llvm.sqrt plus a select against the loaded
; passthru. Unlike the intrinsic-based mask tests, both the passthru load and
; the kmovd are expected after the nop.
define <32 x half> @stack_fold_sqrtph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) {
; CHECK-LABEL: stack_fold_sqrtph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %zmm1
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
; Clobbering xmm1-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <32 x half>, ptr %passthru
%3 = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0)
; The i32 mask is reinterpreted as one bit per half element for the select.
%4 = bitcast i32 %mask to <32 x i1>
%5 = select <32 x i1> %4, <32 x half> %3, <32 x half> %2
ret <32 x half> %5
}
; Zero-masked vsqrtph built from llvm.sqrt plus a select against
; zeroinitializer; the mask is loaded from memory (kmovd (%rdi) after the nop).
define <32 x half> @stack_fold_sqrtph_maskz(<32 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_sqrtph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd (%rdi), %k1
; CHECK-NEXT: vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
; Clobbering xmm1-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i32, ptr %mask
%3 = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0)
; The loaded i32 mask is reinterpreted as one bit per half element for the select.
%4 = bitcast i32 %2 to <32 x i1>
%5 = select <32 x i1> %4, <32 x half> %3, <32 x half> zeroinitializer
ret <32 x half> %5
}
; Unmasked vsqrtsh: %a1 is spilled around the nop and the expected vsqrtsh takes
; it as a folded 16-byte stack operand.
define <8 x half> @stack_fold_sqrtsh(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_sqrtsh:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsqrtsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
; Clobbering xmm2-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; All-ones mask (i8 -1) with an undef passthru: the expected instruction has no mask operand.
; NOTE(review): the trailing i32 4 is presumably the rounding operand - confirm against the intrinsic definition.
%2 = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 4)
ret <8 x half> %2
}
; Masked sqrt.sh intrinsic used by the stack_fold_sqrtsh* tests.
declare <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half>, <8 x half>, <8 x half>, i8, i32)
; Merge-masked vsqrtsh: the passthru is loaded into xmm2, the spilled %a1 is the
; folded stack operand of the {%k1}-masked vsqrtsh, and xmm2 is copied to xmm0.
define <8 x half> @stack_fold_sqrtsh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_sqrtsh_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %xmm2
; CHECK-NEXT: vsqrtsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
; Clobbering xmm2-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <8 x half>, ptr %passthru
%3 = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 4)
ret <8 x half> %3
}
; Zero-masked vsqrtsh: the i8 mask is loaded from memory (kmovb (%rdi) after the
; nop) and the spilled %a1 is the folded stack operand of the {%k1} {z} vsqrtsh.
define <8 x half> @stack_fold_sqrtsh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) {
; CHECK-LABEL: stack_fold_sqrtsh_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vsqrtsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
; Clobbering xmm2-xmm31 around the nop is what forces the spill/reload expected above.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i8, ptr %mask
%3 = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2, i32 4)
ret <8 x half> %3
}
; Plain IR fsub on <32 x half>. The expected output spills zmm1 (64 bytes)
; ahead of the nop asm and folds its reload into vsubph.
define <32 x half> @stack_fold_subph_zmm(<32 x half> %a0, <32 x half> %a1) {
; CHECK-LABEL: stack_fold_subph_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsubph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fsub <32 x half> %a0, %a1
ret <32 x half> %2
}
; Scalar IR fsub on half. The expected output spills the second argument with
; vmovsh (recorded as a 4-byte spill) and folds the reload into vsubsh.
define half @stack_fold_subsh(half %a0, half %a1) {
; CHECK-LABEL: stack_fold_subsh:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsubsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fsub half %a0, %a1
ret half %2
}
; Vector-typed form of the scalar subtract: element 0 of %a0 and %a1 is
; extracted, subtracted, and inserted back into %a0. The expected output spills
; the whole xmm1 (16 bytes) and folds the reload into a single vsubsh.
define <8 x half> @stack_fold_subsh_int(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_subsh_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsubsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = extractelement <8 x half> %a0, i32 0
%3 = extractelement <8 x half> %a1, i32 0
%4 = fsub half %2, %3
%5 = insertelement <8 x half> %a0, half %4, i32 0
ret <8 x half> %5
}
; Unmasked packed complex multiply (mask.vfmul.cph.512 with an all-ones i16
; mask and undef passthru). The expected output folds the spilled zmm1 into
; vfmulcph, writes the result to zmm2 and then copies it to zmm0.
; Presumably the destination may not overlap the sources - TODO confirm.
define <16 x float> @stack_fold_fmulcph(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_fmulcph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
ret <16 x float> %2
}
; Same multiply as stack_fold_fmulcph but with %a1 and %a0 swapped in the
; intrinsic call. The expected output is unchanged: the spilled zmm1 is still
; folded into vfmulcph as the memory operand.
define <16 x float> @stack_fold_fmulcph_commute(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_fmulcph_commute:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a1, <16 x float> %a0, <16 x float> undef, i16 -1, i32 4)
ret <16 x float> %2
}
declare <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
; Merge-masked packed complex multiply. The passthru is loaded from %passthru
; after the asm and the i16 mask arrives in esi. The expected output moves the
; mask into k1, loads the passthru into zmm2, folds the spilled zmm1 into
; vfmulcph {%k1} and copies zmm2 back to zmm0.
define <16 x float> @stack_fold_fmulcph_mask(<16 x float> %a0, <16 x float> %a1, ptr %passthru, i16 %mask) {
; CHECK-LABEL: stack_fold_fmulcph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %zmm2
; CHECK-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <16 x float>, ptr %passthru
%3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %2, i16 %mask, i32 4)
ret <16 x float> %3
}
; Zero-masked packed complex multiply: the passthru is zeroinitializer and the
; i16 mask is loaded from %mask. The expected output loads the mask into k1
; with kmovw, folds the spilled zmm1 into vfmulcph {%k1} {z} writing zmm2, and
; copies zmm2 to zmm0.
define <16 x float> @stack_fold_fmulcph_maskz(<16 x float> %a0, <16 x float> %a1, ptr %mask) {
; CHECK-LABEL: stack_fold_fmulcph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw (%rdi), %k1
; CHECK-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i16, ptr %mask
%3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> zeroinitializer, i16 %2, i32 4)
ret <16 x float> %3
}
; Unmasked packed vfcmul (mask.vfcmul.cph.512 with an all-ones i16 mask and
; undef passthru). The expected output folds the spilled zmm1 into vfcmulcph,
; writes the result to zmm2 and copies it to zmm0.
define <16 x float> @stack_fold_fcmulcph(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_fcmulcph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
ret <16 x float> %2
}
; vfcmul with %a1 and %a0 swapped in the intrinsic call. Here the expected
; output does NOT fold: the spill slot is reloaded into zmm1 with a separate
; vmovups and vfcmulcph takes register operands only.
; Presumably this operation cannot be commuted to move the spilled value into
; the memory-operand position - TODO confirm.
define <16 x float> @stack_fold_fcmulcph_commute(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_fcmulcph_commute:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; CHECK-NEXT: vfcmulcph %zmm0, %zmm1, %zmm2
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a1, <16 x float> %a0, <16 x float> undef, i16 -1, i32 4)
ret <16 x float> %2
}
declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
; Merge-masked packed vfcmul. The passthru is loaded from %passthru after the
; asm and the i16 mask arrives in esi. The expected output moves the mask into
; k1, loads the passthru into zmm2, folds the spilled zmm1 into
; vfcmulcph {%k1} and copies zmm2 back to zmm0.
define <16 x float> @stack_fold_fcmulcph_mask(<16 x float> %a0, <16 x float> %a1, ptr %passthru, i16 %mask) {
; CHECK-LABEL: stack_fold_fcmulcph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %zmm2
; CHECK-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <16 x float>, ptr %passthru
%3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %2, i16 %mask, i32 4)
ret <16 x float> %3
}
; Zero-masked packed vfcmul: the passthru is zeroinitializer and the i16 mask
; is loaded from %mask. The expected output loads the mask into k1 with kmovw,
; folds the spilled zmm1 into vfcmulcph {%k1} {z} writing zmm2, and copies
; zmm2 to zmm0.
define <16 x float> @stack_fold_fcmulcph_maskz(<16 x float> %a0, <16 x float> %a1, ptr %mask) {
; CHECK-LABEL: stack_fold_fcmulcph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw (%rdi), %k1
; CHECK-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i16, ptr %mask
%3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> zeroinitializer, i16 %2, i32 4)
ret <16 x float> %3
}
; Unmasked packed vfmadd (all-ones i16 mask). The intrinsic is called with
; (%a1, %a2, %a0), so %a0 is the third operand. Three vector arguments are
; live, and the asm clobber list starts at xmm3. The expected output spills
; zmm2 and folds its reload into vfmaddcph, which updates zmm0 in place.
define <16 x float> @stack_fold_fmaddcph(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: stack_fold_fmaddcph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> %a0, i16 -1, i32 4)
ret <16 x float> %2
}
; Same as stack_fold_fmaddcph but with the two multiplicands swapped in the
; intrinsic call (%a2, %a1, %a0). The expected output is unchanged: the spilled
; zmm2 is still folded into vfmaddcph as the memory operand.
define <16 x float> @stack_fold_fmaddcph_commute(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: stack_fold_fmaddcph_commute:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a2, <16 x float> %a1, <16 x float> %a0, i16 -1, i32 4)
ret <16 x float> %2
}
declare <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
; Merge-masked packed vfmadd. The third intrinsic operand (%a0) is loaded from
; %p after the asm, so only %a1 and %a2 arrive in vector registers. The expected
; output spills zmm1, moves the mask from esi into k1, loads %a0 into zmm2,
; folds the reload into vfmaddcph {%k1} and copies zmm2 to zmm0.
define <16 x float> @stack_fold_fmaddcph_mask(ptr %p, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_fmaddcph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %zmm2
; CHECK-NEXT: vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = load <16 x float>, ptr %p
%2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> %a0, i16 %mask, i32 4)
ret <16 x float> %2
}
; Zero-masked packed vfmadd. The third intrinsic operand is zeroinitializer
; and the i16 mask is loaded from %mask; %a0 is unused. The expected output
; spills zmm2, loads the mask into k1 with kmovw, zeroes xmm0 with vxorps, and
; folds the reload into vfmaddcph {%k1} {z}.
; The call targets the maskz intrinsic that this file declares, so the
; zero-masking entry point is the one being exercised by this test.
define <16 x float> @stack_fold_fmaddcph_maskz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddcph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw (%rdi), %k1
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i16, ptr %mask
%3 = call <16 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> zeroinitializer, i16 %2, i32 4)
ret <16 x float> %3
}
declare <16 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
; Unmasked packed vfcmadd (all-ones i16 mask), called with (%a1, %a2, %a0).
; Three vector arguments are live, and the asm clobber list starts at xmm3.
; The expected output spills zmm2 and folds its reload into vfcmaddcph, which
; updates zmm0 in place.
define <16 x float> @stack_fold_fcmaddcph(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: stack_fold_fcmaddcph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> %a0, i16 -1, i32 4)
ret <16 x float> %2
}
; vfcmadd with the two multiplicands swapped in the intrinsic call
; (%a2, %a1, %a0). Here the expected output does NOT fold: the spill slot is
; reloaded into zmm2 with a separate vmovups and vfcmaddcph takes register
; operands only.
; Presumably the multiplicands of this operation cannot be commuted - TODO confirm.
define <16 x float> @stack_fold_fcmaddcph_commute(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: stack_fold_fcmaddcph_commute:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; CHECK-NEXT: vfcmaddcph %zmm1, %zmm2, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a2, <16 x float> %a1, <16 x float> %a0, i16 -1, i32 4)
ret <16 x float> %2
}
declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
; Merge-masked packed vfcmadd. The third intrinsic operand (%a0) is loaded
; from %p after the asm, so only %a1 and %a2 arrive in vector registers. The
; expected output spills zmm1, moves the mask from esi into k1, loads %a0 into
; zmm2, folds the reload into vfcmaddcph {%k1} and copies zmm2 to zmm0.
define <16 x float> @stack_fold_fcmaddcph_mask(ptr %p, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_fcmaddcph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %zmm2
; CHECK-NEXT: vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = load <16 x float>, ptr %p
%2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> %a0, i16 %mask, i32 4)
ret <16 x float> %2
}
; Zero-masked packed vfcmadd. The third intrinsic operand is zeroinitializer
; and the i16 mask is loaded from %mask; %a0 is unused. The expected output
; spills zmm2, loads the mask into k1 with kmovw, zeroes xmm0 with vxorps, and
; folds the reload into vfcmaddcph {%k1} {z}.
; The call targets the maskz intrinsic that this file declares, so the
; zero-masking entry point is the one being exercised by this test.
define <16 x float> @stack_fold_fcmaddcph_maskz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fcmaddcph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw (%rdi), %k1
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i16, ptr %mask
%3 = call <16 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> zeroinitializer, i16 %2, i32 4)
ret <16 x float> %3
}
declare <16 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
; Unmasked scalar vfmul (mask.vfmul.csh with an all-ones i8 mask and undef
; passthru). The expected output folds the spilled xmm1 into vfmulcsh, writes
; the result to xmm2 and copies it to xmm0.
define <4 x float> @stack_fold_fmulcsh(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_fmulcsh:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4)
ret <4 x float> %2
}
; Same multiply as stack_fold_fmulcsh but with %a1 and %a0 swapped in the
; intrinsic call. The expected output is unchanged: the spilled xmm1 is still
; folded into vfmulcsh as the memory operand.
define <4 x float> @stack_fold_fmulcsh_commute(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_fmulcsh_commute:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a1, <4 x float> %a0, <4 x float> undef, i8 -1, i32 4)
ret <4 x float> %2
}
declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
; Merge-masked scalar vfmul. The passthru is loaded from %passthru after the
; asm and the i8 mask arrives in esi. The expected output moves the mask into
; k1, loads the passthru into xmm2, folds the spilled xmm1 into
; vfmulcsh {%k1} and copies xmm2 back to xmm0.
define <4 x float> @stack_fold_fmulcsh_mask(<4 x float> %a0, <4 x float> %a1, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_fmulcsh_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %xmm2
; CHECK-NEXT: vfmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <4 x float>, ptr %passthru
%3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> %2, i8 %mask, i32 4)
ret <4 x float> %3
}
; Zero-masked scalar vfmul: the passthru is zeroinitializer and the i8 mask is
; loaded from %mask. The expected output loads the mask into k1 with kmovb,
; folds the spilled xmm1 into vfmulcsh {%k1} {z} writing xmm2, and copies xmm2
; to xmm0.
define <4 x float> @stack_fold_fmulcsh_maskz(<4 x float> %a0, <4 x float> %a1, ptr %mask) {
; CHECK-LABEL: stack_fold_fmulcsh_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i8, ptr %mask
%3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2, i32 4)
ret <4 x float> %3
}
; Unmasked scalar vfcmul (mask.vfcmul.csh with an all-ones i8 mask and undef
; passthru). The expected output folds the spilled xmm1 into vfcmulcsh, writes
; the result to xmm2 and copies it to xmm0.
define <4 x float> @stack_fold_fcmulcsh(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_fcmulcsh:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfcmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4)
ret <4 x float> %2
}
; Scalar vfcmul with %a1 and %a0 swapped in the intrinsic call. Here the
; expected output does NOT fold: the spill slot is reloaded into xmm1 with a
; separate vmovaps and vfcmulcsh takes register operands only.
; Presumably this operation cannot be commuted to move the spilled value into
; the memory-operand position - TODO confirm.
define <4 x float> @stack_fold_fcmulcsh_commute(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_fcmulcsh_commute:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT: vfcmulcsh %xmm0, %xmm1, %xmm2
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a1, <4 x float> %a0, <4 x float> undef, i8 -1, i32 4)
ret <4 x float> %2
}
declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
; Merge-masked scalar vfcmul. The passthru is loaded from %passthru after the
; asm and the i8 mask arrives in esi. The expected output moves the mask into
; k1, loads the passthru into xmm2, folds the spilled xmm1 into
; vfcmulcsh {%k1} and copies xmm2 back to xmm0.
define <4 x float> @stack_fold_fcmulcsh_mask(<4 x float> %a0, <4 x float> %a1, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_fcmulcsh_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %xmm2
; CHECK-NEXT: vfcmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <4 x float>, ptr %passthru
%3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> %2, i8 %mask, i32 4)
ret <4 x float> %3
}
; Zero-masked scalar vfcmul: the passthru is zeroinitializer and the i8 mask
; is loaded from %mask. The expected output loads the mask into k1 with kmovb,
; folds the spilled xmm1 into vfcmulcsh {%k1} {z} writing xmm2, and copies xmm2
; to xmm0.
define <4 x float> @stack_fold_fcmulcsh_maskz(<4 x float> %a0, <4 x float> %a1, ptr %mask) {
; CHECK-LABEL: stack_fold_fcmulcsh_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfcmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i8, ptr %mask
%3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2, i32 4)
ret <4 x float> %3
}
; Unmasked scalar vfmadd (all-ones i8 mask), called with (%a1, %a2, %a0).
; Three vector arguments are live, and the asm clobber list starts at xmm3.
; The expected output spills xmm2 and folds its reload into vfmaddcsh, which
; updates xmm0 in place.
define <4 x float> @stack_fold_fmaddcsh(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; CHECK-LABEL: stack_fold_fmaddcsh:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 -1, i32 4)
ret <4 x float> %2
}
; Same as stack_fold_fmaddcsh but with the two multiplicands swapped in the
; intrinsic call (%a2, %a1, %a0). The expected output is unchanged: the spilled
; xmm2 is still folded into vfmaddcsh as the memory operand.
define <4 x float> @stack_fold_fmaddcsh_commute(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; CHECK-LABEL: stack_fold_fmaddcsh_commute:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %a2, <4 x float> %a1, <4 x float> %a0, i8 -1, i32 4)
ret <4 x float> %2
}
declare <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
; Merge-masked scalar vfmadd. The third intrinsic operand (%a0) is loaded from
; %p after the asm, so only %a1 and %a2 arrive in vector registers. The expected
; output spills xmm1, moves the mask from esi into k1, loads %a0 into xmm2,
; folds the reload into vfmaddcsh {%k1} and copies xmm2 to xmm0.
define <4 x float> @stack_fold_fmaddcsh_mask(ptr %p, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fmaddcsh_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %xmm2
; CHECK-NEXT: vfmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = load <4 x float>, ptr %p
%2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 %mask, i32 4)
ret <4 x float> %2
}
; Zero-masked scalar vfmadd. The third intrinsic operand is zeroinitializer
; and the i8 mask is loaded from %mask; %a0 is unused. The expected output
; spills xmm2, loads the mask into k1 with kmovb, zeroes xmm0 with vxorps, and
; folds the reload into vfmaddcsh {%k1} {z}.
; The call targets the maskz intrinsic that this file declares, so the
; zero-masking entry point is the one being exercised by this test.
define <4 x float> @stack_fold_fmaddcsh_maskz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddcsh_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vfmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i8, ptr %mask
%3 = call <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> zeroinitializer, i8 %2, i32 4)
ret <4 x float> %3
}
declare <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
; Unmasked scalar vfcmadd (all-ones i8 mask), called with (%a1, %a2, %a0).
; Three vector arguments are live, and the asm clobber list starts at xmm3.
; The expected output spills xmm2 and folds its reload into vfcmaddcsh, which
; updates xmm0 in place.
define <4 x float> @stack_fold_fcmaddcsh(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; CHECK-LABEL: stack_fold_fcmaddcsh:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfcmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 -1, i32 4)
ret <4 x float> %2
}
; Scalar vfcmadd with the two multiplicands swapped in the intrinsic call
; (%a2, %a1, %a0). Here the expected output does NOT fold: the spill slot is
; reloaded into xmm2 with a separate vmovaps and vfcmaddcsh takes register
; operands only.
; Presumably the multiplicands of this operation cannot be commuted - TODO confirm.
define <4 x float> @stack_fold_fcmaddcsh_commute(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; CHECK-LABEL: stack_fold_fcmaddcsh_commute:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; CHECK-NEXT: vfcmaddcsh %xmm1, %xmm2, %xmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %a2, <4 x float> %a1, <4 x float> %a0, i8 -1, i32 4)
ret <4 x float> %2
}
; Declaration of the mask.vfcmadd.csh intrinsic: three <4 x float> operands,
; an i8 mask and a trailing i32 operand.
; NOTE(review): the i32 is presumably the rounding-control immediate - confirm
; against the intrinsic definition.
declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
; Masked vfcmaddcsh whose third operand is loaded from memory after the asm.
; The expected output below spills %xmm1 before the asm, loads (%rdi) into
; %xmm2, folds the reload into vfcmaddcsh under {%k1}, and copies %xmm2 into
; %xmm0 for the return.
define <4 x float> @stack_fold_fcmaddcsh_mask(ptr %p, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fcmaddcsh_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %xmm2
; CHECK-NEXT: vfcmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
; Side-effecting nop that clobbers xmm2-xmm31 and defines a vector result.
%pressure = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; The third intrinsic operand comes from memory and is loaded after the asm.
%acc = load <4 x float>, ptr %p
; %mask is passed through unchanged as the i8 mask operand.
%res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> %acc, i8 %mask, i32 4)
ret <4 x float> %res
}
; Zero-masking vfcmaddcsh: the mask byte is loaded from memory and the third
; operand is an all-zero vector. The expected output below spills %xmm2 before
; the asm, materialises the mask with kmovb, zeroes %xmm0 and folds the reload
; into vfcmaddcsh under {%k1} {z}.
; This test calls the maskz.vfcmadd.csh intrinsic (declared later in this file)
; so that the zero-masking form itself is covered, not the merge-masking
; mask.vfcmadd.csh form that the sibling _mask test exercises.
define <4 x float> @stack_fold_fcmaddcsh_maskz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fcmaddcsh_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vfcmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
; Side-effecting nop that clobbers xmm2-xmm31 and defines a vector result.
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; The mask byte is read from memory after the asm.
%2 = load i8, ptr %mask
; %a0 is unused here; the third operand is zeroinitializer.
%3 = call <4 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> zeroinitializer, i8 %2, i32 4)
ret <4 x float> %3
}
; Declaration of the maskz.vfcmadd.csh intrinsic: three <4 x float> operands,
; an i8 mask and a trailing i32 operand.
; NOTE(review): the i32 is presumably the rounding-control immediate - confirm
; against the intrinsic definition.
declare <4 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
; Function attribute groups: #0 sets "unsafe-fp-math" to "false"; #1 enables
; "unsafe-fp-math", "no-nans-fp-math" and "no-signed-zeros-fp-math".
; NOTE(review): the fcmaddcsh test functions directly above do not reference
; either group - confirm which functions elsewhere in the file use them.
attributes #0 = { "unsafe-fp-math"="false" }
attributes #1 = { "unsafe-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }