; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx        | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown                  | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx      | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2     | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512
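
; The vector-predicated (VP) cast intrinsics below have no native X86 lowering,
; so they are expanded to ordinary cast instructions; the %m mask and %evl
; operands do not affect the generated code.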

declare <4 x ptr> @llvm.vp.inttoptr.v4p0.v4i32(<4 x i32>, <4 x i1>, i32)
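
; inttoptr <4 x i32> -> <4 x ptr>: a no-op with 32-bit pointers on i686;
; zero-extends each element to a 64-bit pointer on x86-64.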
define <4 x ptr> @inttoptr_v4p0_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: inttoptr_v4p0_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    retl
;
; SSE-LABEL: inttoptr_v4p0_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: inttoptr_v4p0_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: inttoptr_v4p0_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT:    retq
;
; AVX512-LABEL: inttoptr_v4p0_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512-NEXT:    retq
  %v = call <4 x ptr> @llvm.vp.inttoptr.v4p0.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl)
  ret <4 x ptr> %v
}

declare <4 x ptr> @llvm.vp.inttoptr.v4p0.v4i64(<4 x i64>, <4 x i1>, i32)

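; inttoptr <4 x i64> -> <4 x ptr>: a no-op on x86-64; truncates to 32-bit
; pointer elements on i686.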
define <4 x ptr> @inttoptr_v4p0_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: inttoptr_v4p0_v4i64:
; X86:       # %bb.0:
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; SSE-LABEL: inttoptr_v4p0_v4i64:
; SSE:       # %bb.0:
; SSE-NEXT:    retq
;
; AVX-LABEL: inttoptr_v4p0_v4i64:
; AVX:       # %bb.0:
; AVX-NEXT:    retq
  %v = call <4 x ptr> @llvm.vp.inttoptr.v4p0.v4i64(<4 x i64> %va, <4 x i1> %m, i32 %evl)
  ret <4 x ptr> %v
}

declare <4 x i32> @llvm.vp.ptrtoint.v4i32.v4p0(<4 x ptr>, <4 x i1>, i32)

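; ptrtoint <4 x ptr> -> <4 x i32>: a no-op on i686; truncates 64-bit pointers
; on x86-64 (vpmovqd with AVX512VL, shuffles otherwise).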
define <4 x i32> @ptrtoint_v4i32_v4p0(<4 x ptr> %va, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: ptrtoint_v4i32_v4p0:
; X86:       # %bb.0:
; X86-NEXT:    retl
;
; SSE-LABEL: ptrtoint_v4i32_v4p0:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    retq
;
; AVX1-LABEL: ptrtoint_v4i32_v4p0:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ptrtoint_v4i32_v4p0:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: ptrtoint_v4i32_v4p0:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %v = call <4 x i32> @llvm.vp.ptrtoint.v4i32.v4p0(<4 x ptr> %va, <4 x i1> %m, i32 %evl)
  ret <4 x i32> %v
}

declare <4 x i64> @llvm.vp.ptrtoint.v4i64.v4p0(<4 x ptr>, <4 x i1>, i32)

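; ptrtoint <4 x ptr> -> <4 x i64>: a no-op on x86-64; zero-extends 32-bit
; pointers on i686.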
define <4 x i64> @ptrtoint_v4i64_v4p0(<4 x ptr> %va, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: ptrtoint_v4i64_v4p0:
; X86:       # %bb.0:
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT:    retl
;
; SSE-LABEL: ptrtoint_v4i64_v4p0:
; SSE:       # %bb.0:
; SSE-NEXT:    retq
;
; AVX-LABEL: ptrtoint_v4i64_v4p0:
; AVX:       # %bb.0:
; AVX-NEXT:    retq
  %v = call <4 x i64> @llvm.vp.ptrtoint.v4i64.v4p0(<4 x ptr> %va, <4 x i1> %m, i32 %evl)
  ret <4 x i64> %v
}

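; sext <4 x i1> -> <4 x i32>: the mask bit is shifted into the sign position
; and arithmetic-shifted back (pslld $31 / psrad $31).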
define <4 x i32> @vsext_v4i32_v4i1(<4 x i1> %va, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vsext_v4i32_v4i1:
; X86:       # %bb.0:
; X86-NEXT:    vpslld $31, %xmm0, %xmm0
; X86-NEXT:    vpsrad $31, %xmm0, %xmm0
; X86-NEXT:    retl
;
; SSE-LABEL: vsext_v4i32_v4i1:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $31, %xmm0
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: vsext_v4i32_v4i1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX-NEXT:    retq
  %v = call <4 x i32> @llvm.vp.sext.v4i32.v4i1(<4 x i1> %va, <4 x i1> %m, i32 %evl)
  ret <4 x i32> %v
}

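; sext <4 x i1> -> <4 x i64>: as above, followed by sign extension of the i32
; lanes to i64.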
define <4 x i64> @vsext_v4i64_v4i1(<4 x i1> %va, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vsext_v4i64_v4i1:
; X86:       # %bb.0:
; X86-NEXT:    vpslld $31, %xmm0, %xmm0
; X86-NEXT:    vpsrad $31, %xmm0, %xmm0
; X86-NEXT:    vpmovsxdq %xmm0, %xmm1
; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; SSE-LABEL: vsext_v4i64_v4i1:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $31, %xmm0
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
; SSE-NEXT:    movdqa %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: vsext_v4i64_v4i1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vsext_v4i64_v4i1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vsext_v4i64_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
; AVX512-NEXT:    retq
  %v = call <4 x i64> @llvm.vp.sext.v4i64.v4i1(<4 x i1> %va, <4 x i1> %m, i32 %evl)
  ret <4 x i64> %v
}

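; zext <4 x i1> -> <4 x i32>: an AND with the splat constant 1.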
define <4 x i32> @vzext_v4i32_v4i1(<4 x i1> %va, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vzext_v4i32_v4i1:
; X86:       # %bb.0:
; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT:    retl
;
; SSE-LABEL: vzext_v4i32_v4i1:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: vzext_v4i32_v4i1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vzext_v4i32_v4i1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1]
; AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vzext_v4i32_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %v = call <4 x i32> @llvm.vp.zext.v4i32.v4i1(<4 x i1> %va, <4 x i1> %m, i32 %evl)
  ret <4 x i32> %v
}

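; zext <4 x i1> -> <4 x i64>: AND with 1, then zero-extend the i32 lanes to i64.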
define <4 x i64> @vzext_v4i64_v4i1(<4 x i1> %va, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vzext_v4i64_v4i1:
; X86:       # %bb.0:
; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT:    retl
;
; SSE-LABEL: vzext_v4i64_v4i1:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: vzext_v4i64_v4i1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vzext_v4i64_v4i1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vzext_v4i64_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512-NEXT:    retq
  %v = call <4 x i64> @llvm.vp.zext.v4i64.v4i1(<4 x i1> %va, <4 x i1> %m, i32 %evl)
  ret <4 x i64> %v
}

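; trunc <4 x i32> -> <4 x i1>: no instructions are needed; only a return is
; emitted.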
define <4 x i1> @vtrunc_v4i1_v4i32(<4 x i32> %a, <4 x i1> %m, i32 zeroext %vl) {
; X86-LABEL: vtrunc_v4i1_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    retl
;
; SSE-LABEL: vtrunc_v4i1_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    retq
;
; AVX-LABEL: vtrunc_v4i1_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    retq
  %v = call <4 x i1> @llvm.vp.trunc.v4i1.v4i32(<4 x i32> %a, <4 x i1> %m, i32 %vl)
  ret <4 x i1> %v
}

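; trunc <4 x i64> -> <4 x i1>: the 64-bit lanes are first narrowed to 32 bits
; (vpmovqd with AVX512VL, shuffles otherwise).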
define <4 x i1> @vtrunc_v4i1_v4i64(<4 x i64> %a, <4 x i1> %m, i32 zeroext %vl) {
; X86-LABEL: vtrunc_v4i1_v4i64:
; X86:       # %bb.0:
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; SSE-LABEL: vtrunc_v4i1_v4i64:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    retq
;
; AVX1-LABEL: vtrunc_v4i1_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vtrunc_v4i1_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vtrunc_v4i1_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %v = call <4 x i1> @llvm.vp.trunc.v4i1.v4i64(<4 x i64> %a, <4 x i1> %m, i32 %vl)
  ret <4 x i1> %v
}

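; fptoui <4 x float> -> <4 x i32>: without AVX512VL the unsigned conversion is
; emulated with two signed cvttps2dq conversions selected on the sign bit;
; AVX512VL emits vcvttps2udq directly.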
define <4 x i32> @vfptoui_v4i32_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vfptoui_v4i32_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    vcvttps2dq %xmm0, %xmm1
; X86-NEXT:    vpsrad $31, %xmm1, %xmm2
; X86-NEXT:    vsubps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT:    vcvttps2dq %xmm0, %xmm0
; X86-NEXT:    vpand %xmm2, %xmm0, %xmm0
; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X86-NEXT:    retl
;
; SSE-LABEL: vfptoui_v4i32_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    cvttps2dq %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psrad $31, %xmm2
; SSE-NEXT:    subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: vfptoui_v4i32_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT:    vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vfptoui_v4i32_v4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
; AVX2-NEXT:    vsubps %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vcvttps2dq %xmm1, %xmm1
; AVX2-NEXT:    vcvttps2dq %xmm0, %xmm0
; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm2
; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vfptoui_v4i32_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvttps2udq %xmm0, %xmm0
; AVX512-NEXT:    retq
  %v = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f32(<4 x float> %va, <4 x i1> %m, i32 %evl)
  ret <4 x i32> %v
}

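; fptosi <4 x float> -> <4 x i32>: a single cvttps2dq.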
define <4 x i32> @vfptosi_v4i32_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vfptosi_v4i32_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    vcvttps2dq %xmm0, %xmm0
; X86-NEXT:    retl
;
; SSE-LABEL: vfptosi_v4i32_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: vfptosi_v4i32_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
; AVX-NEXT:    retq
  %v = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f32(<4 x float> %va, <4 x i1> %m, i32 %evl)
  ret <4 x i32> %v
}

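; uitofp <4 x i32> -> <4 x float>: without AVX512VL the conversion is emulated
; by splitting each lane into 16-bit halves blended with magic exponent
; constants and recombining them with a subtract and an add; AVX512VL emits
; vcvtudq2ps directly.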
define <4 x float> @vuitofp_v4f32_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vuitofp_v4f32_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; X86-NEXT:    vpsrld $16, %xmm0, %xmm0
; X86-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; X86-NEXT:    vsubps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; X86-NEXT:    retl
;
; SSE-LABEL: vuitofp_v4f32_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    psrld $16, %xmm0
; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    addps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: vuitofp_v4f32_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; AVX1-NEXT:    vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vuitofp_v4f32_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200]
; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928]
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
; AVX2-NEXT:    vsubps %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vuitofp_v4f32_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtudq2ps %xmm0, %xmm0
; AVX512-NEXT:    retq
  %v = call <4 x float> @llvm.vp.uitofp.v4f32.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl)
  ret <4 x float> %v
}

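; sitofp <4 x i32> -> <4 x float>: a single cvtdq2ps.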
define <4 x float> @vsitofp_v4f32_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vsitofp_v4f32_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    vcvtdq2ps %xmm0, %xmm0
; X86-NEXT:    retl
;
; SSE-LABEL: vsitofp_v4f32_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: vsitofp_v4f32_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT:    retq
  %v = call <4 x float> @llvm.vp.sitofp.v4f32.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl)
  ret <4 x float> %v
}

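; fptrunc <2 x double> -> <2 x half>: lowered to per-element __truncdfhf2
; libcalls in all configurations.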
define <2 x half> @vfptrunc_v2f16_v2f64(<2 x double> %a, <2 x i1> %m, i32 zeroext %vl) {
; X86-LABEL: vfptrunc_v2f16_v2f64:
; X86:       # %bb.0:
; X86-NEXT:    subl $40, %esp
; X86-NEXT:    .cfi_def_cfa_offset 44
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovlps %xmm0, (%esp)
; X86-NEXT:    calll __truncdfhf2
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vmovhps %xmm0, (%esp)
; X86-NEXT:    calll __truncdfhf2
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-NEXT:    addl $40, %esp
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    retl
;
; SSE-LABEL: vfptrunc_v2f16_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    subq $40, %rsp
; SSE-NEXT:    .cfi_def_cfa_offset 48
; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    callq __truncdfhf2@PLT
; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    callq __truncdfhf2@PLT
; SSE-NEXT:    punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; SSE-NEXT:    addq $40, %rsp
; SSE-NEXT:    .cfi_def_cfa_offset 8
; SSE-NEXT:    retq
;
; AVX1-LABEL: vfptrunc_v2f16_v2f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    subq $40, %rsp
; AVX1-NEXT:    .cfi_def_cfa_offset 48
; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    addq $40, %rsp
; AVX1-NEXT:    .cfi_def_cfa_offset 8
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vfptrunc_v2f16_v2f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    subq $40, %rsp
; AVX2-NEXT:    .cfi_def_cfa_offset 48
; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    addq $40, %rsp
; AVX2-NEXT:    .cfi_def_cfa_offset 8
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vfptrunc_v2f16_v2f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    subq $40, %rsp
; AVX512-NEXT:    .cfi_def_cfa_offset 48
; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm1
; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = [4,0,0,0]
; AVX512-NEXT:    vpermi2ps (%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    addq $40, %rsp
; AVX512-NEXT:    .cfi_def_cfa_offset 8
; AVX512-NEXT:    retq
  %v = call <2 x half> @llvm.vp.fptrunc.v2f16.v2f64(<2 x double> %a, <2 x i1> %m, i32 %vl)
  ret <2 x half> %v
}

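; fpext <2 x float> -> <2 x double>: a single cvtps2pd.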
define <2 x double> @vfpext_v2f32_v2f64(<2 x float> %a, <2 x i1> %m, i32 zeroext %vl) {
; X86-LABEL: vfpext_v2f32_v2f64:
; X86:       # %bb.0:
; X86-NEXT:    vcvtps2pd %xmm0, %xmm0
; X86-NEXT:    retl
;
; SSE-LABEL: vfpext_v2f32_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    cvtps2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: vfpext_v2f32_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vcvtps2pd %xmm0, %xmm0
; AVX-NEXT:    retq
  %v = call <2 x double> @llvm.vp.fpext.v2f64.v2f32(<2 x float> %a, <2 x i1> %m, i32 %vl)
  ret <2 x double> %v
}

declare <4 x i32> @llvm.vp.sext.v4i32.v4i1(<4 x i1>, <4 x i1>, i32)
declare <4 x i64> @llvm.vp.sext.v4i64.v4i1(<4 x i1>, <4 x i1>, i32)
declare <4 x i32> @llvm.vp.zext.v4i32.v4i1(<4 x i1>, <4 x i1>, i32)
declare <4 x i64> @llvm.vp.zext.v4i64.v4i1(<4 x i1>, <4 x i1>, i32)
declare <4 x i1> @llvm.vp.trunc.v4i1.v4i32(<4 x i32>, <4 x i1>, i32)
declare <4 x i1> @llvm.vp.trunc.v4i1.v4i64(<4 x i64>, <4 x i1>, i32)
declare <4 x i32> @llvm.vp.fptoui.v4i32.v4f32(<4 x float>, <4 x i1>, i32)
declare <4 x i32> @llvm.vp.fptosi.v4i32.v4f32(<4 x float>, <4 x i1>, i32)
declare <4 x float> @llvm.vp.uitofp.v4f32.v4i32(<4 x i32>, <4 x i1>, i32)
declare <4 x float> @llvm.vp.sitofp.v4f32.v4i32(<4 x i32>, <4 x i1>, i32)
declare <2 x half> @llvm.vp.fptrunc.v2f16.v2f64(<2 x double>, <2 x i1>, i32)
declare <2 x double> @llvm.vp.fpext.v2f64.v2f32(<2 x float>, <2 x i1>, i32)