llvm/test/CodeGen/X86/avx512fp16-mov.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK,X64
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK,X86

define <8 x half> @broadcastph128(ptr %x) {
; X64-LABEL: broadcastph128:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastw (%rdi), %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: broadcastph128:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpbroadcastw (%eax), %xmm0
; X86-NEXT:    retl
  %l1 = load half, ptr %x, align 2
  %vec = insertelement <8 x half> undef, half %l1, i32 0
  %res = shufflevector <8 x half> %vec, <8 x half> undef, <8 x i32> zeroinitializer
  ret <8 x half> %res
}

define <16 x half> @broadcastph256(ptr %x) {
; X64-LABEL: broadcastph256:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastw (%rdi), %ymm0
; X64-NEXT:    retq
;
; X86-LABEL: broadcastph256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpbroadcastw (%eax), %ymm0
; X86-NEXT:    retl
  %l1 = load half, ptr %x, align 2
  %vec = insertelement <16 x half> undef, half %l1, i32 0
  %res = shufflevector <16 x half> %vec, <16 x half> undef, <16 x i32> zeroinitializer
  ret <16 x half> %res
}

define <32 x half> @broadcastph512(ptr %x) {
; X64-LABEL: broadcastph512:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastw (%rdi), %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: broadcastph512:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpbroadcastw (%eax), %zmm0
; X86-NEXT:    retl
  %l1 = load half, ptr %x, align 2
  %vec = insertelement <32 x half> undef, half %l1, i32 0
  %res = shufflevector <32 x half> %vec, <32 x half> undef, <32 x i32> zeroinitializer
  ret <32 x half> %res
}

define <8 x half> @broadcastph128_scalar(half %x) {
; X64-LABEL: broadcastph128_scalar:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastw %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: broadcastph128_scalar:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT:    retl
  %vec = insertelement <8 x half> undef, half %x, i32 0
  %res = shufflevector <8 x half> %vec, <8 x half> undef, <8 x i32> zeroinitializer
  ret <8 x half> %res
}

define <16 x half> @broadcastph256_scalar(half %x) {
; X64-LABEL: broadcastph256_scalar:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastw %xmm0, %ymm0
; X64-NEXT:    retq
;
; X86-LABEL: broadcastph256_scalar:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %ymm0
; X86-NEXT:    retl
  %vec = insertelement <16 x half> undef, half %x, i32 0
  %res = shufflevector <16 x half> %vec, <16 x half> undef, <16 x i32> zeroinitializer
  ret <16 x half> %res
}

define <32 x half> @broadcastph512_scalar(half %x) {
; X64-LABEL: broadcastph512_scalar:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastw %xmm0, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: broadcastph512_scalar:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %zmm0
; X86-NEXT:    retl
  %vec = insertelement <32 x half> undef, half %x, i32 0
  %res = shufflevector <32 x half> %vec, <32 x half> undef, <32 x i32> zeroinitializer
  ret <32 x half> %res
}

define <8 x half> @broadcastph128_reg(<8 x half> %x) {
; CHECK-LABEL: broadcastph128_reg:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x half> %x, <8 x half> undef, <8 x i32> zeroinitializer
  ret <8 x half> %res
}

define <16 x half> @broadcastph256_reg(<16 x half> %x) {
; CHECK-LABEL: broadcastph256_reg:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <16 x half> %x, <16 x half> undef, <16 x i32> zeroinitializer
  ret <16 x half> %res
}

define <32 x half> @broadcastph512_reg(<32 x half> %x) {
; CHECK-LABEL: broadcastph512_reg:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <32 x half> %x, <32 x half> undef, <32 x i32> zeroinitializer
  ret <32 x half> %res
}

define i16 @test1(half %x) {
; X64-LABEL: test1:
; X64:       # %bb.0:
; X64-NEXT:    vmovw %xmm0, %eax
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    retq
;
; X86-LABEL: test1:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    retl
   %res = bitcast half %x to i16
   ret i16 %res
}

define <8 x i16> @test2(i16 %x) {
; X64-LABEL: test2:
; X64:       # %bb.0:
; X64-NEXT:    vmovw %edi, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test2:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT:    retl
   %res = insertelement <8 x i16>undef, i16 %x, i32 0
   ret <8 x i16>%res
}

define <8 x i16> @test4(ptr %x) {
; X64-LABEL: test4:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastw (%rdi), %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test4:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpbroadcastw (%eax), %xmm0
; X86-NEXT:    retl
   %y = load i16, ptr %x
   %res = insertelement <8 x i16>undef, i16 %y, i32 0
   ret <8 x i16>%res
}

define void @test5(half %x, ptr %y) {
; X64-LABEL: test5:
; X64:       # %bb.0:
; X64-NEXT:    vmovsh %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: test5:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovsh %xmm0, (%eax)
; X86-NEXT:    retl
   store half %x, ptr %y, align 2
   ret void
}

define half @test7(ptr %x) {
; X64-LABEL: test7:
; X64:       # %bb.0:
; X64-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    retq
;
; X86-LABEL: test7:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    retl
   %y = load i16, ptr %x
   %res = bitcast i16 %y to half
   ret half %res
}

define <8 x i16> @test10(ptr %x) {
; X64-LABEL: test10:
; X64:       # %bb.0:
; X64-NEXT:    vmovw (%rdi), %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test10:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovw (%eax), %xmm0
; X86-NEXT:    retl
   %y = load i16, ptr %x, align 2
   %res = insertelement <8 x i16>zeroinitializer, i16 %y, i32 0
   ret <8 x i16>%res
}

define <16 x i16> @test10b(ptr %x) {
; X64-LABEL: test10b:
; X64:       # %bb.0:
; X64-NEXT:    vmovw (%rdi), %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test10b:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovw (%eax), %xmm0
; X86-NEXT:    retl
   %y = load i16, ptr %x, align 2
   %res = insertelement <16 x i16>zeroinitializer, i16 %y, i32 0
   ret <16 x i16>%res
}

define <32 x i16> @test10c(ptr %x) {
; X64-LABEL: test10c:
; X64:       # %bb.0:
; X64-NEXT:    vmovw (%rdi), %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test10c:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovw (%eax), %xmm0
; X86-NEXT:    retl
   %y = load i16, ptr %x, align 2
   %res = insertelement <32 x i16>zeroinitializer, i16 %y, i32 0
   ret <32 x i16>%res
}

define <8 x half> @test11(ptr %x) {
; X64-LABEL: test11:
; X64:       # %bb.0:
; X64-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    retq
;
; X86-LABEL: test11:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    retl
   %y = load half, ptr %x, align 2
   %res = insertelement <8 x half>zeroinitializer, half %y, i32 0
   ret <8 x half>%res
}

define <16 x half> @test11b(ptr %x) {
; X64-LABEL: test11b:
; X64:       # %bb.0:
; X64-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    retq
;
; X86-LABEL: test11b:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    retl
   %y = load half, ptr %x, align 2
   %res = insertelement <16 x half>zeroinitializer, half %y, i32 0
   ret <16 x half>%res
}

define <32 x half> @test11c(ptr %x) {
; X64-LABEL: test11c:
; X64:       # %bb.0:
; X64-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    retq
;
; X86-LABEL: test11c:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    retl
   %y = load half, ptr %x, align 2
   %res = insertelement <32 x half>zeroinitializer, half %y, i32 0
   ret <32 x half>%res
}

define <8 x half> @test14(half %x) {
; X64-LABEL: test14:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test14:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    retl
   %res = insertelement <8 x half>zeroinitializer, half %x, i32 0
   ret <8 x half>%res
}

define <16 x half> @test14b(half %x) {
; X64-LABEL: test14b:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test14b:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    retl
   %res = insertelement <16 x half>zeroinitializer, half %x, i32 0
   ret <16 x half>%res
}

define <32 x half> @test14c(half %x) {
; X64-LABEL: test14c:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test14c:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    retl
   %res = insertelement <32 x half>zeroinitializer, half %x, i32 0
   ret <32 x half>%res
}

define <8 x i16> @test15(i16 %x) {
; X64-LABEL: test15:
; X64:       # %bb.0:
; X64-NEXT:    vmovw %edi, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test15:
; X86:       # %bb.0:
; X86-NEXT:    vmovw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT:    retl
   %res = insertelement <8 x i16>zeroinitializer, i16 %x, i32 0
   ret <8 x i16>%res
}

define <16 x i16> @test16(i16 %x) {
; X64-LABEL: test16:
; X64:       # %bb.0:
; X64-NEXT:    vmovw %edi, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test16:
; X86:       # %bb.0:
; X86-NEXT:    vmovw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT:    retl
   %res = insertelement <16 x i16>zeroinitializer, i16 %x, i32 0
   ret <16 x i16>%res
}

define <32 x i16> @test17(i16 %x) {
; X64-LABEL: test17:
; X64:       # %bb.0:
; X64-NEXT:    vmovw %edi, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test17:
; X86:       # %bb.0:
; X86-NEXT:    vmovw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT:    retl
   %res = insertelement <32 x i16>zeroinitializer, i16 %x, i32 0
   ret <32 x i16>%res
}

define <8 x i16> @test18(i16 %x) {
; X64-LABEL: test18:
; X64:       # %bb.0:
; X64-NEXT:    vmovw %edi, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test18:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT:    retl
   %res = insertelement <8 x i16> undef, i16 %x, i32 0
   ret <8 x i16>%res
}

define <16 x i16> @test19(i16 %x) {
; X64-LABEL: test19:
; X64:       # %bb.0:
; X64-NEXT:    vmovw %edi, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test19:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %ymm0
; X86-NEXT:    retl
   %res = insertelement <16 x i16> undef, i16 %x, i32 0
   ret <16 x i16>%res
}

define <32 x i16> @test20(i16 %x) {
; X64-LABEL: test20:
; X64:       # %bb.0:
; X64-NEXT:    vmovw %edi, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test20:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %zmm0
; X86-NEXT:    retl
   %res = insertelement <32 x i16> undef, i16 %x, i32 0
   ret <32 x i16>%res
}

@g8f16 = external global <8 x half>
@g8f16u = external global <8 x half>, align 8
@g16f16 = external global <16 x half>
@g16f16u = external global <16 x half>, align 8
@g32f16 = external global <32 x half>
@g32f16u = external global <32 x half>, align 8

define <32 x half> @load32f16(ptr %a) {
; X64-LABEL: load32f16:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: load32f16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps (%eax), %zmm0
; X86-NEXT:    retl
  %res = load <32 x half>, ptr %a
  ret <32 x half> %res
}

define <32 x half> @load32f16mask(ptr %a, <32 x half> %b, i32 %c) {
; X64-LABEL: load32f16mask:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %esi, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1}
; X64-NEXT:    retq
;
; X86-LABEL: load32f16mask:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1}
; X86-NEXT:    retl
  %msk = bitcast i32 %c to <32 x i1>
  %res0 = load <32 x half>, ptr %a
  %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %b
  ret <32 x half> %res
}

define <32 x half> @load32f16maskz(ptr %a, i32 %c) {
; X64-LABEL: load32f16maskz:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %esi, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: load32f16maskz:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %msk = bitcast i32 %c to <32 x i1>
  %res0 = load <32 x half>, ptr %a
  %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> zeroinitializer
  ret <32 x half> %res
}

define <32 x half> @loadu32f16(ptr %a) {
; X64-LABEL: loadu32f16:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rdi), %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: loadu32f16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups (%eax), %zmm0
; X86-NEXT:    retl
  %res = load <32 x half>, ptr %a, align 8
  ret <32 x half> %res
}

define <32 x half> @loadu32f16mask(ptr %a, <32 x half> %b, i32 %c) {
; X64-LABEL: loadu32f16mask:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %esi, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1}
; X64-NEXT:    retq
;
; X86-LABEL: loadu32f16mask:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1}
; X86-NEXT:    retl
  %msk = bitcast i32 %c to <32 x i1>
  %res0 = load <32 x half>, ptr %a, align 8
  %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %b
  ret <32 x half> %res
}

define <32 x half> @loadu32f16maskz(ptr %a, i32 %c) {
; X64-LABEL: loadu32f16maskz:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %esi, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: loadu32f16maskz:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %msk = bitcast i32 %c to <32 x i1>
  %res0 = load <32 x half>, ptr %a, align 8
  %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> zeroinitializer
  ret <32 x half> %res
}

define void @store32f16(<32 x half> %a) {
; X64-LABEL: store32f16:
; X64:       # %bb.0:
; X64-NEXT:    movq g32f16@GOTPCREL(%rip), %rax
; X64-NEXT:    vmovaps %zmm0, (%rax)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: store32f16:
; X86:       # %bb.0:
; X86-NEXT:    vmovaps %zmm0, g32f16
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  store <32 x half> %a, ptr @g32f16
  ret void
}

define void @storeu32f16(<32 x half> %a) {
; X64-LABEL: storeu32f16:
; X64:       # %bb.0:
; X64-NEXT:    movq g32f16u@GOTPCREL(%rip), %rax
; X64-NEXT:    vmovups %zmm0, (%rax)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: storeu32f16:
; X86:       # %bb.0:
; X86-NEXT:    vmovups %zmm0, g32f16u
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  store <32 x half> %a, ptr @g32f16u, align 8
  ret void
}

declare void @llvm.masked.store.v32f16.p0(<32 x half>, ptr, i32, <32 x i1>)
declare <32 x half> @llvm.masked.load.v32f16.p0(ptr, i32,  <32 x i1>, <32 x half>)

define void @storeu32f16mask(<32 x i1> %mask, ptr %addr, <32 x half> %val) {
; X64-LABEL: storeu32f16mask:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $7, %ymm0, %ymm0
; X64-NEXT:    vpmovb2m %ymm0, %k1
; X64-NEXT:    vmovdqu16 %zmm1, (%rdi) {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: storeu32f16mask:
; X86:       # %bb.0:
; X86-NEXT:    vpsllw $7, %ymm0, %ymm0
; X86-NEXT:    vpmovb2m %ymm0, %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovdqu16 %zmm1, (%eax) {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  call void @llvm.masked.store.v32f16.p0(<32 x half> %val, ptr %addr, i32 4, <32 x i1>%mask)
  ret void
}

define <32 x half> @maskloadu32f16(ptr %addr, <32 x half> %val, <32 x i1> %mask) {
; X64-LABEL: maskloadu32f16:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $7, %ymm1, %ymm1
; X64-NEXT:    vpmovb2m %ymm1, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1}
; X64-NEXT:    retq
;
; X86-LABEL: maskloadu32f16:
; X86:       # %bb.0:
; X86-NEXT:    vpsllw $7, %ymm1, %ymm1
; X86-NEXT:    vpmovb2m %ymm1, %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1}
; X86-NEXT:    retl
  %res = call <32 x half> @llvm.masked.load.v32f16.p0(ptr %addr, i32 4, <32 x i1> %mask, <32 x half> %val)
  ret <32 x half> %res
}

define <32 x half> @maskuloadu32f16(ptr %addr, <32 x i1> %mask) {
; X64-LABEL: maskuloadu32f16:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $7, %ymm0, %ymm0
; X64-NEXT:    vpmovb2m %ymm0, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: maskuloadu32f16:
; X86:       # %bb.0:
; X86-NEXT:    vpsllw $7, %ymm0, %ymm0
; X86-NEXT:    vpmovb2m %ymm0, %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %res = call <32 x half> @llvm.masked.load.v32f16.p0(ptr %addr, i32 4, <32 x i1> %mask, <32 x half> undef)
  ret <32 x half> %res
}

define <32 x half> @maskzloadu32f16(ptr %addr, <32 x i1> %mask) {
; X64-LABEL: maskzloadu32f16:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $7, %ymm0, %ymm0
; X64-NEXT:    vpmovb2m %ymm0, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: maskzloadu32f16:
; X86:       # %bb.0:
; X86-NEXT:    vpsllw $7, %ymm0, %ymm0
; X86-NEXT:    vpmovb2m %ymm0, %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %res = call <32 x half> @llvm.masked.load.v32f16.p0(ptr %addr, i32 4, <32 x i1> %mask, <32 x half> zeroinitializer)
  ret <32 x half> %res
}

define <32 x half> @movrr32f16(<32 x half> %a, <32 x half> %b) {
; CHECK-LABEL: movrr32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  ret <32 x half> %b
}

define <32 x half> @movrrk32f16(<32 x half> %a, <32 x half> %b, i32 %msk) {
; X64-LABEL: movrrk32f16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
;
; X86-LABEL: movrrk32f16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
  %mask = bitcast i32 %msk to <32 x i1>
  %res = select <32 x i1> %mask, <32 x half> %a, <32 x half> %b
  ret <32 x half> %res
}

define <32 x half> @movrrkz32f16(<32 x half> %a, i32 %msk) {
; X64-LABEL: movrrkz32f16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: movrrkz32f16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %mask = bitcast i32 %msk to <32 x i1>
  %res = select <32 x i1> %mask, <32 x half> %a, <32 x half> zeroinitializer
  ret <32 x half> %res
}

define <16 x half> @load16f16(ptr %a) {
; X64-LABEL: load16f16:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %ymm0
; X64-NEXT:    retq
;
; X86-LABEL: load16f16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps (%eax), %ymm0
; X86-NEXT:    retl
  %res = load <16 x half>, ptr %a
  ret <16 x half> %res
}

define <16 x half> @load16f16mask(ptr %a, <16 x half> %b, i16 %c) {
; X64-LABEL: load16f16mask:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %esi, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1}
; X64-NEXT:    retq
;
; X86-LABEL: load16f16mask:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovdqu16 (%eax), %ymm0 {%k1}
; X86-NEXT:    retl
  %msk = bitcast i16 %c to <16 x i1>
  %res0 = load <16 x half>, ptr %a
  %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %b
  ret <16 x half> %res
}

define <16 x half> @load16f16maskz(ptr %a, i16 %c) {
; X64-LABEL: load16f16maskz:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %esi, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: load16f16maskz:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovdqu16 (%eax), %ymm0 {%k1} {z}
; X86-NEXT:    retl
  %msk = bitcast i16 %c to <16 x i1>
  %res0 = load <16 x half>, ptr %a
  %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer
  ret <16 x half> %res
}

define <16 x half> @loadu16f16(ptr %a) {
; X64-LABEL: loadu16f16:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rdi), %ymm0
; X64-NEXT:    retq
;
; X86-LABEL: loadu16f16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups (%eax), %ymm0
; X86-NEXT:    retl
  %res = load <16 x half>, ptr %a, align 8
  ret <16 x half> %res
}

define <16 x half> @loadu16f16mask(ptr %a, <16 x half> %b, i16 %c) {
; X64-LABEL: loadu16f16mask:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %esi, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1}
; X64-NEXT:    retq
;
; X86-LABEL: loadu16f16mask:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovdqu16 (%eax), %ymm0 {%k1}
; X86-NEXT:    retl
  %msk = bitcast i16 %c to <16 x i1>
  %res0 = load <16 x half>, ptr %a, align 8
  %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %b
  ret <16 x half> %res
}

define <16 x half> @loadu16f16maskz(ptr %a, i16 %c) {
; X64-LABEL: loadu16f16maskz:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %esi, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: loadu16f16maskz:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovdqu16 (%eax), %ymm0 {%k1} {z}
; X86-NEXT:    retl
  %msk = bitcast i16 %c to <16 x i1>
  %res0 = load <16 x half>, ptr %a, align 8
  %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer
  ret <16 x half> %res
}

define void @store16f16(<16 x half> %a) {
; X64-LABEL: store16f16:
; X64:       # %bb.0:
; X64-NEXT:    movq g16f16@GOTPCREL(%rip), %rax
; X64-NEXT:    vmovaps %ymm0, (%rax)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: store16f16:
; X86:       # %bb.0:
; X86-NEXT:    vmovaps %ymm0, g16f16
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  store <16 x half> %a, ptr @g16f16
  ret void
}

define void @storeu16f16(<16 x half> %a) {
; X64-LABEL: storeu16f16:
; X64:       # %bb.0:
; X64-NEXT:    movq g16f16u@GOTPCREL(%rip), %rax
; X64-NEXT:    vmovups %ymm0, (%rax)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: storeu16f16:
; X86:       # %bb.0:
; X86-NEXT:    vmovups %ymm0, g16f16u
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  store <16 x half> %a, ptr @g16f16u, align 8
  ret void
}

declare void @llvm.masked.store.v16f16.p0(<16 x half>, ptr, i32, <16 x i1>)
declare <16 x half> @llvm.masked.load.v16f16.p0(ptr, i32,  <16 x i1>, <16 x half>)

define void @storeu16f16mask(<16 x i1> %mask, ptr %addr, <16 x half> %val) {
; X64-LABEL: storeu16f16mask:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $7, %xmm0, %xmm0
; X64-NEXT:    vpmovb2m %xmm0, %k1
; X64-NEXT:    vmovdqu16 %ymm1, (%rdi) {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: storeu16f16mask:
; X86:       # %bb.0:
; X86-NEXT:    vpsllw $7, %xmm0, %xmm0
; X86-NEXT:    vpmovb2m %xmm0, %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovdqu16 %ymm1, (%eax) {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  call void @llvm.masked.store.v16f16.p0(<16 x half> %val, ptr %addr, i32 4, <16 x i1>%mask)
  ret void
}

define <16 x half> @maskloadu16f16(ptr %addr, <16 x half> %val, <16 x i1> %mask) {
; X64-LABEL: maskloadu16f16:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $7, %xmm1, %xmm1
; X64-NEXT:    vpmovb2m %xmm1, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1}
; X64-NEXT:    retq
;
; X86-LABEL: maskloadu16f16:
; X86:       # %bb.0:
; X86-NEXT:    vpsllw $7, %xmm1, %xmm1
; X86-NEXT:    vpmovb2m %xmm1, %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovdqu16 (%eax), %ymm0 {%k1}
; X86-NEXT:    retl
  %res = call <16 x half> @llvm.masked.load.v16f16.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x half> %val)
  ret <16 x half> %res
}

define <16 x half> @maskuloadu16f16(ptr %addr, <16 x i1> %mask) {
; X64-LABEL: maskuloadu16f16:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $7, %xmm0, %xmm0
; X64-NEXT:    vpmovb2m %xmm0, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: maskuloadu16f16:
; X86:       # %bb.0:
; X86-NEXT:    vpsllw $7, %xmm0, %xmm0
; X86-NEXT:    vpmovb2m %xmm0, %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovdqu16 (%eax), %ymm0 {%k1} {z}
; X86-NEXT:    retl
  %res = call <16 x half> @llvm.masked.load.v16f16.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x half> undef)
  ret <16 x half> %res
}

define <16 x half> @maskzloadu16f16(ptr %addr, <16 x i1> %mask) {
; X64-LABEL: maskzloadu16f16:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $7, %xmm0, %xmm0
; X64-NEXT:    vpmovb2m %xmm0, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: maskzloadu16f16:
; X86:       # %bb.0:
; X86-NEXT:    vpsllw $7, %xmm0, %xmm0
; X86-NEXT:    vpmovb2m %xmm0, %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovdqu16 (%eax), %ymm0 {%k1} {z}
; X86-NEXT:    retl
  %res = call <16 x half> @llvm.masked.load.v16f16.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x half> zeroinitializer)
  ret <16 x half> %res
}

define <16 x half> @movrr16f16(<16 x half> %a, <16 x half> %b) {
; CHECK-LABEL: movrr16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  ret <16 x half> %b
}

define <16 x half> @movrrk16f16(<16 x half> %a, <16 x half> %b, i16 %msk) {
; X64-LABEL: movrrk16f16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
;
; X86-LABEL: movrrk16f16:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
  %mask = bitcast i16 %msk to <16 x i1>
  %res = select <16 x i1> %mask, <16 x half> %a, <16 x half> %b
  ret <16 x half> %res
}

define <16 x half> @movrrkz16f16(<16 x half> %a, i16 %msk) {
; X64-LABEL: movrrkz16f16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: movrrkz16f16:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
  %mask = bitcast i16 %msk to <16 x i1>
  %res = select <16 x i1> %mask, <16 x half> %a, <16 x half> zeroinitializer
  ret <16 x half> %res
}

define <8 x half> @load8f16(ptr %a) {
; X64-LABEL: load8f16:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: load8f16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps (%eax), %xmm0
; X86-NEXT:    retl
  %res = load <8 x half>, ptr %a
  ret <8 x half> %res
}

define <8 x half> @load8f16mask(ptr %a, <8 x half> %b, i8 %c) {
; X64-LABEL: load8f16mask:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %esi, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1}
; X64-NEXT:    retq
;
; X86-LABEL: load8f16mask:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovdqu16 (%eax), %xmm0 {%k1}
; X86-NEXT:    retl
  %msk = bitcast i8 %c to <8 x i1>
  %res0 = load <8 x half>, ptr %a
  %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %b
  ret <8 x half> %res
}

define <8 x half> @load8f16maskz(ptr %a, i8 %c) {
; X64-LABEL: load8f16maskz:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %esi, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: load8f16maskz:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovdqu16 (%eax), %xmm0 {%k1} {z}
; X86-NEXT:    retl
  %msk = bitcast i8 %c to <8 x i1>
  %res0 = load <8 x half>, ptr %a
  %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer
  ret <8 x half> %res
}

define <8 x half> @loadu8f16(ptr %a) {
; X64-LABEL: loadu8f16:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rdi), %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: loadu8f16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups (%eax), %xmm0
; X86-NEXT:    retl
  %res = load <8 x half>, ptr %a, align 8
  ret <8 x half> %res
}

define <8 x half> @loadu8f16mask(ptr %a, <8 x half> %b, i8 %c) {
; X64-LABEL: loadu8f16mask:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %esi, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1}
; X64-NEXT:    retq
;
; X86-LABEL: loadu8f16mask:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovdqu16 (%eax), %xmm0 {%k1}
; X86-NEXT:    retl
  %msk = bitcast i8 %c to <8 x i1>
  %res0 = load <8 x half>, ptr %a, align 8
  %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %b
  ret <8 x half> %res
}

define <8 x half> @loadu8f16maskz(ptr %a, i8 %c) {
; X64-LABEL: loadu8f16maskz:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %esi, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: loadu8f16maskz:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovdqu16 (%eax), %xmm0 {%k1} {z}
; X86-NEXT:    retl
  %msk = bitcast i8 %c to <8 x i1>
  %res0 = load <8 x half>, ptr %a, align 8
  %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer
  ret <8 x half> %res
}

define void @store8f16(<8 x half> %a) {
; X64-LABEL: store8f16:
; X64:       # %bb.0:
; X64-NEXT:    movq g8f16@GOTPCREL(%rip), %rax
; X64-NEXT:    vmovaps %xmm0, (%rax)
; X64-NEXT:    retq
;
; X86-LABEL: store8f16:
; X86:       # %bb.0:
; X86-NEXT:    vmovaps %xmm0, g8f16
; X86-NEXT:    retl
  store <8 x half> %a, ptr @g8f16
  ret void
}

define void @storeu8f16(<8 x half> %a) {
; X64-LABEL: storeu8f16:
; X64:       # %bb.0:
; X64-NEXT:    movq g8f16u@GOTPCREL(%rip), %rax
; X64-NEXT:    vmovups %xmm0, (%rax)
; X64-NEXT:    retq
;
; X86-LABEL: storeu8f16:
; X86:       # %bb.0:
; X86-NEXT:    vmovups %xmm0, g8f16u
; X86-NEXT:    retl
  store <8 x half> %a, ptr @g8f16u, align 8
  ret void
}

declare void @llvm.masked.store.v8f16.p0(<8 x half>, ptr, i32, <8 x i1>)
declare <8 x half> @llvm.masked.load.v8f16.p0(ptr, i32,  <8 x i1>, <8 x half>)

define void @storeu8f16mask(<8 x i1> %mask, ptr %addr, <8 x half> %val) {
; X64-LABEL: storeu8f16mask:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $15, %xmm0, %xmm0
; X64-NEXT:    vpmovw2m %xmm0, %k1
; X64-NEXT:    vmovdqu16 %xmm1, (%rdi) {%k1}
; X64-NEXT:    retq
;
; X86-LABEL: storeu8f16mask:
; X86:       # %bb.0:
; X86-NEXT:    vpsllw $15, %xmm0, %xmm0
; X86-NEXT:    vpmovw2m %xmm0, %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovdqu16 %xmm1, (%eax) {%k1}
; X86-NEXT:    retl
  call void @llvm.masked.store.v8f16.p0(<8 x half> %val, ptr %addr, i32 4, <8 x i1>%mask)
  ret void
}

define <8 x half> @maskloadu8f16(ptr %addr, <8 x half> %val, <8 x i1> %mask) {
; X64-LABEL: maskloadu8f16:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $15, %xmm1, %xmm1
; X64-NEXT:    vpmovw2m %xmm1, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1}
; X64-NEXT:    retq
;
; X86-LABEL: maskloadu8f16:
; X86:       # %bb.0:
; X86-NEXT:    vpsllw $15, %xmm1, %xmm1
; X86-NEXT:    vpmovw2m %xmm1, %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovdqu16 (%eax), %xmm0 {%k1}
; X86-NEXT:    retl
  %res = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x half> %val)
  ret <8 x half> %res
}

define <8 x half> @maskuloadu8f16(ptr %addr, <8 x i1> %mask) {
; X64-LABEL: maskuloadu8f16:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $15, %xmm0, %xmm0
; X64-NEXT:    vpmovw2m %xmm0, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: maskuloadu8f16:
; X86:       # %bb.0:
; X86-NEXT:    vpsllw $15, %xmm0, %xmm0
; X86-NEXT:    vpmovw2m %xmm0, %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovdqu16 (%eax), %xmm0 {%k1} {z}
; X86-NEXT:    retl
  %res = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x half> undef)
  ret <8 x half> %res
}

define <8 x half> @maskzloadu8f16(ptr %addr, <8 x i1> %mask) {
; X64-LABEL: maskzloadu8f16:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $15, %xmm0, %xmm0
; X64-NEXT:    vpmovw2m %xmm0, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: maskzloadu8f16:
; X86:       # %bb.0:
; X86-NEXT:    vpsllw $15, %xmm0, %xmm0
; X86-NEXT:    vpmovw2m %xmm0, %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovdqu16 (%eax), %xmm0 {%k1} {z}
; X86-NEXT:    retl
  %res = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x half> zeroinitializer)
  ret <8 x half> %res
}

define <8 x half> @movrr8f16(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: movrr8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  ret <8 x half> %b
}

define <8 x half> @movrrk8f16(<8 x half> %a, <8 x half> %b, i8 %msk) {
; X64-LABEL: movrrk8f16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
;
; X86-LABEL: movrrk8f16:
; X86:       # %bb.0:
; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
  %mask = bitcast i8 %msk to <8 x i1>
  %res = select <8 x i1> %mask, <8 x half> %a, <8 x half> %b
  ret <8 x half> %res
}

define <8 x half> @movrrkz8f16(<8 x half> %a, i8 %msk) {
; X64-LABEL: movrrkz8f16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: movrrkz8f16:
; X86:       # %bb.0:
; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
  %mask = bitcast i8 %msk to <8 x i1>
  %res = select <8 x i1> %mask, <8 x half> %a, <8 x half> zeroinitializer
  ret <8 x half> %res
}

define <8 x half> @movsh(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: movsh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
; CHECK-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    vaddph %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res1 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 7, i32 0, i32 1, i32 2, i32 3, i32 7, i32 5>
  %res2 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %res = fadd <8 x half> %res1, %res2
  ret <8 x half> %res
}

define i16 @test_movw(half %x) {
; X64-LABEL: test_movw:
; X64:       # %bb.0:
; X64-NEXT:    vmovw %xmm0, %eax
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    retq
;
; X86-LABEL: test_movw:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    retl
  %res = bitcast half %x to i16
  ret i16 %res
}

define half @test_movw2(i16 %x) {
; X64-LABEL: test_movw2:
; X64:       # %bb.0:
; X64-NEXT:    vmovw %edi, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_movw2:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    retl
  %res = bitcast i16 %x to half
  ret half %res
}

; sext avoids having a truncate in front of the bitcast input due to calling
; convention or i16 op promotion.
define half @test_movw3(i8 %x) {
; X64-LABEL: test_movw3:
; X64:       # %bb.0:
; X64-NEXT:    movsbl %dil, %eax
; X64-NEXT:    vmovw %eax, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_movw3:
; X86:       # %bb.0:
; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovw %eax, %xmm0
; X86-NEXT:    retl
  %z = sext i8 %x to i16
  %a = bitcast i16 %z to half
  ret half %a
}
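
; A hypothetical companion sketch (illustrative name, no autogenerated CHECK
; lines): a zext'ed i8 source likewise keeps a truncate off the bitcast input,
; and would be expected to lower through the same vmovw path.
define half @test_movw4_sketch(i8 %x) {
  %z = zext i8 %x to i16
  %a = bitcast i16 %z to half
  ret half %a
}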

define half @extract_f16_0(<8 x half> %x) {
; CHECK-LABEL: extract_f16_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
   %res = extractelement <8 x half> %x, i32 0
   ret half %res
}

define half @extract_f16_1(<8 x half> %x) {
; CHECK-LABEL: extract_f16_1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrld $16, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
   %res = extractelement <8 x half> %x, i32 1
   ret half %res
}

define half @extract_f16_2(<8 x half> %x) {
; CHECK-LABEL: extract_f16_2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-NEXT:    ret{{[l|q]}}
   %res = extractelement <8 x half> %x, i32 2
   ret half %res
}

define half @extract_f16_3(<8 x half> %x) {
; CHECK-LABEL: extract_f16_3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq $48, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
   %res = extractelement <8 x half> %x, i32 3
   ret half %res
}

define half @extract_f16_4(<8 x half> %x) {
; CHECK-LABEL: extract_f16_4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT:    ret{{[l|q]}}
   %res = extractelement <8 x half> %x, i32 4
   ret half %res
}

define half @extract_f16_5(<8 x half> %x) {
; CHECK-LABEL: extract_f16_5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
   %res = extractelement <8 x half> %x, i32 5
   ret half %res
}

define half @extract_f16_6(<8 x half> %x) {
; CHECK-LABEL: extract_f16_6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; CHECK-NEXT:    ret{{[l|q]}}
   %res = extractelement <8 x half> %x, i32 6
   ret half %res
}

define half @extract_f16_7(<8 x half> %x) {
; CHECK-LABEL: extract_f16_7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
   %res = extractelement <8 x half> %x, i32 7
   ret half %res
}

define half @extract_f16_8(<32 x half> %x, i64 %idx) nounwind {
; X64-LABEL: extract_f16_8:
; X64:       # %bb.0:
; X64-NEXT:    pushq %rbp
; X64-NEXT:    movq %rsp, %rbp
; X64-NEXT:    andq $-64, %rsp
; X64-NEXT:    subq $128, %rsp
; X64-NEXT:    andl $31, %edi
; X64-NEXT:    vmovaps %zmm0, (%rsp)
; X64-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    movq %rbp, %rsp
; X64-NEXT:    popq %rbp
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: extract_f16_8:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-64, %esp
; X86-NEXT:    subl $128, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    andl $31, %eax
; X86-NEXT:    vmovaps %zmm0, (%esp)
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
   %res = extractelement <32 x half> %x, i64 %idx
   ret half %res
}

define half @extract_f16_9(<64 x half> %x, i64 %idx) nounwind {
; X64-LABEL: extract_f16_9:
; X64:       # %bb.0:
; X64-NEXT:    pushq %rbp
; X64-NEXT:    movq %rsp, %rbp
; X64-NEXT:    andq $-64, %rsp
; X64-NEXT:    subq $192, %rsp
; X64-NEXT:    andl $63, %edi
; X64-NEXT:    vmovaps %zmm1, {{[0-9]+}}(%rsp)
; X64-NEXT:    vmovaps %zmm0, (%rsp)
; X64-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    movq %rbp, %rsp
; X64-NEXT:    popq %rbp
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: extract_f16_9:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-64, %esp
; X86-NEXT:    subl $192, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    andl $63, %eax
; X86-NEXT:    vmovaps %zmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovaps %zmm0, (%esp)
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
   %res = extractelement <64 x half> %x, i64 %idx
   ret half %res
}

define i16 @extract_i16_0(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovw %xmm0, %eax
; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    ret{{[l|q]}}
   %res = extractelement <8 x i16> %x, i32 0
   ret i16 %res
}

define i16 @extract_i16_1(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpextrw $1, %xmm0, %eax
; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    ret{{[l|q]}}
   %res = extractelement <8 x i16> %x, i32 1
   ret i16 %res
}

define i16 @extract_i16_2(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpextrw $2, %xmm0, %eax
; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    ret{{[l|q]}}
   %res = extractelement <8 x i16> %x, i32 2
   ret i16 %res
}

define i16 @extract_i16_3(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpextrw $3, %xmm0, %eax
; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    ret{{[l|q]}}
   %res = extractelement <8 x i16> %x, i32 3
   ret i16 %res
}

define i16 @extract_i16_4(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpextrw $4, %xmm0, %eax
; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    ret{{[l|q]}}
   %res = extractelement <8 x i16> %x, i32 4
   ret i16 %res
}

define i16 @extract_i16_5(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpextrw $5, %xmm0, %eax
; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    ret{{[l|q]}}
   %res = extractelement <8 x i16> %x, i32 5
   ret i16 %res
}

define i16 @extract_i16_6(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpextrw $6, %xmm0, %eax
; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    ret{{[l|q]}}
   %res = extractelement <8 x i16> %x, i32 6
   ret i16 %res
}

define i16 @extract_i16_7(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpextrw $7, %xmm0, %eax
; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    ret{{[l|q]}}
   %res = extractelement <8 x i16> %x, i32 7
   ret i16 %res
}

define void @extract_store_f16_0(<8 x half> %x, ptr %y) {
; X64-LABEL: extract_store_f16_0:
; X64:       # %bb.0:
; X64-NEXT:    vmovsh %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: extract_store_f16_0:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovsh %xmm0, (%eax)
; X86-NEXT:    retl
   %res = extractelement <8 x half> %x, i32 0
   store half %res, ptr %y
   ret void
}

define void @extract_store_f16_1(<8 x half> %x, ptr %y) {
; X64-LABEL: extract_store_f16_1:
; X64:       # %bb.0:
; X64-NEXT:    vpsrld $16, %xmm0, %xmm0
; X64-NEXT:    vmovsh %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: extract_store_f16_1:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpsrld $16, %xmm0, %xmm0
; X86-NEXT:    vmovsh %xmm0, (%eax)
; X86-NEXT:    retl
   %res = extractelement <8 x half> %x, i32 1
   store half %res, ptr %y
   ret void
}

define void @extract_store_f16_2(<8 x half> %x, ptr %y) {
; X64-LABEL: extract_store_f16_2:
; X64:       # %bb.0:
; X64-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X64-NEXT:    vmovsh %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: extract_store_f16_2:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X86-NEXT:    vmovsh %xmm0, (%eax)
; X86-NEXT:    retl
   %res = extractelement <8 x half> %x, i32 2
   store half %res, ptr %y
   ret void
}

define void @extract_store_f16_3(<8 x half> %x, ptr %y) {
; X64-LABEL: extract_store_f16_3:
; X64:       # %bb.0:
; X64-NEXT:    vpsrlq $48, %xmm0, %xmm0
; X64-NEXT:    vmovsh %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: extract_store_f16_3:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpsrlq $48, %xmm0, %xmm0
; X86-NEXT:    vmovsh %xmm0, (%eax)
; X86-NEXT:    retl
   %res = extractelement <8 x half> %x, i32 3
   store half %res, ptr %y
   ret void
}

define void @extract_store_f16_4(<8 x half> %x, ptr %y) {
; X64-LABEL: extract_store_f16_4:
; X64:       # %bb.0:
; X64-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; X64-NEXT:    vmovsh %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: extract_store_f16_4:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; X86-NEXT:    vmovsh %xmm0, (%eax)
; X86-NEXT:    retl
   %res = extractelement <8 x half> %x, i32 4
   store half %res, ptr %y
   ret void
}

define void @extract_store_f16_5(<8 x half> %x, ptr %y) {
; X64-LABEL: extract_store_f16_5:
; X64:       # %bb.0:
; X64-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    vmovsh %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: extract_store_f16_5:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh %xmm0, (%eax)
; X86-NEXT:    retl
   %res = extractelement <8 x half> %x, i32 5
   store half %res, ptr %y
   ret void
}

define void @extract_store_f16_6(<8 x half> %x, ptr %y) {
; X64-LABEL: extract_store_f16_6:
; X64:       # %bb.0:
; X64-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; X64-NEXT:    vmovsh %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: extract_store_f16_6:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; X86-NEXT:    vmovsh %xmm0, (%eax)
; X86-NEXT:    retl
   %res = extractelement <8 x half> %x, i32 6
   store half %res, ptr %y
   ret void
}

define void @extract_store_f16_7(<8 x half> %x, ptr %y) {
; X64-LABEL: extract_store_f16_7:
; X64:       # %bb.0:
; X64-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    vmovsh %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: extract_store_f16_7:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh %xmm0, (%eax)
; X86-NEXT:    retl
   %res = extractelement <8 x half> %x, i32 7
   store half %res, ptr %y
   ret void
}

define void @extract_store_i16_0(<8 x i16> %x, ptr %y) {
; X64-LABEL: extract_store_i16_0:
; X64:       # %bb.0:
; X64-NEXT:    vpextrw $0, %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: extract_store_i16_0:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpextrw $0, %xmm0, (%eax)
; X86-NEXT:    retl
   %res = extractelement <8 x i16> %x, i32 0
   store i16 %res, ptr %y
   ret void
}

define void @extract_store_i16_1(<8 x i16> %x, ptr %y) {
; X64-LABEL: extract_store_i16_1:
; X64:       # %bb.0:
; X64-NEXT:    vpextrw $1, %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: extract_store_i16_1:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpextrw $1, %xmm0, (%eax)
; X86-NEXT:    retl
   %res = extractelement <8 x i16> %x, i32 1
   store i16 %res, ptr %y
   ret void
}

define void @extract_store_i16_2(<8 x i16> %x, ptr %y) {
; X64-LABEL: extract_store_i16_2:
; X64:       # %bb.0:
; X64-NEXT:    vpextrw $2, %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: extract_store_i16_2:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpextrw $2, %xmm0, (%eax)
; X86-NEXT:    retl
   %res = extractelement <8 x i16> %x, i32 2
   store i16 %res, ptr %y
   ret void
}

define void @extract_store_i16_3(<8 x i16> %x, ptr %y) {
; X64-LABEL: extract_store_i16_3:
; X64:       # %bb.0:
; X64-NEXT:    vpextrw $3, %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: extract_store_i16_3:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpextrw $3, %xmm0, (%eax)
; X86-NEXT:    retl
   %res = extractelement <8 x i16> %x, i32 3
   store i16 %res, ptr %y
   ret void
}

define void @extract_store_i16_4(<8 x i16> %x, ptr %y) {
; X64-LABEL: extract_store_i16_4:
; X64:       # %bb.0:
; X64-NEXT:    vpextrw $4, %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: extract_store_i16_4:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpextrw $4, %xmm0, (%eax)
; X86-NEXT:    retl
   %res = extractelement <8 x i16> %x, i32 4
   store i16 %res, ptr %y
   ret void
}

define void @extract_store_i16_5(<8 x i16> %x, ptr %y) {
; X64-LABEL: extract_store_i16_5:
; X64:       # %bb.0:
; X64-NEXT:    vpextrw $5, %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: extract_store_i16_5:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpextrw $5, %xmm0, (%eax)
; X86-NEXT:    retl
   %res = extractelement <8 x i16> %x, i32 5
   store i16 %res, ptr %y
   ret void
}

define void @extract_store_i16_6(<8 x i16> %x, ptr %y) {
; X64-LABEL: extract_store_i16_6:
; X64:       # %bb.0:
; X64-NEXT:    vpextrw $6, %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: extract_store_i16_6:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpextrw $6, %xmm0, (%eax)
; X86-NEXT:    retl
   %res = extractelement <8 x i16> %x, i32 6
   store i16 %res, ptr %y
   ret void
}

define void @extract_store_i16_7(<8 x i16> %x, ptr %y) {
; X64-LABEL: extract_store_i16_7:
; X64:       # %bb.0:
; X64-NEXT:    vpextrw $7, %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: extract_store_i16_7:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpextrw $7, %xmm0, (%eax)
; X86-NEXT:    retl
   %res = extractelement <8 x i16> %x, i32 7
   store i16 %res, ptr %y
   ret void
}

define i32 @extract_zext_i16_0(<8 x i16> %x) {
; CHECK-LABEL: extract_zext_i16_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpextrw $0, %xmm0, %eax
; CHECK-NEXT:    ret{{[l|q]}}
   %res = extractelement <8 x i16> %x, i32 0
   %res2 = zext i16 %res to i32
   ret i32 %res2
}

define i32 @extract_zext_i16_1(<8 x i16> %x) {
; CHECK-LABEL: extract_zext_i16_1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpextrw $1, %xmm0, %eax
; CHECK-NEXT:    ret{{[l|q]}}
   %res = extractelement <8 x i16> %x, i32 1
   %res2 = zext i16 %res to i32
   ret i32 %res2
}

define <8 x half> @build_vector_xxxxuuuu(half %a0, half %a1, half %a2, half %a3) {
; X64-LABEL: build_vector_xxxxuuuu:
; X64:       # %bb.0:
; X64-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
; X64-NEXT:    retq
;
; X86-LABEL: build_vector_xxxxuuuu:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; X86-NEXT:    retl
  %a = insertelement <8 x half> undef, half %a0, i32 0
  %b = insertelement <8 x half> %a, half %a1, i32 1
  %c = insertelement <8 x half> %b, half %a2, i32 2
  %d = insertelement <8 x half> %c, half %a3, i32 3
  ret <8 x half> %d
}

define <8 x half> @build_vector_uuuuxxxx(half %a0, half %a1, half %a2, half %a3) {
; X64-LABEL: build_vector_uuuuxxxx:
; X64:       # %bb.0:
; X64-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT:    vpbroadcastq %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: build_vector_uuuuxxxx:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT:    vpbroadcastq %xmm0, %xmm0
; X86-NEXT:    retl
  %a = insertelement <8 x half> undef, half %a0, i32 4
  %b = insertelement <8 x half> %a, half %a1, i32 5
  %c = insertelement <8 x half> %b, half %a2, i32 6
  %d = insertelement <8 x half> %c, half %a3, i32 7
  ret <8 x half> %d
}

define <8 x half> @build_vector_xxxxxxxx(half %a0, half %a1, half %a2, half %a3, half %a4, half %a5, half %a6, half %a7) {
; X64-LABEL: build_vector_xxxxxxxx:
; X64:       # %bb.0:
; X64-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; X64-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; X64-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
; X64-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; X64-NEXT:    retq
;
; X86-LABEL: build_vector_xxxxxxxx:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT:    vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm3 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X86-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X86-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X86-NEXT:    retl
  %a = insertelement <8 x half> undef, half %a0, i32 0
  %b = insertelement <8 x half> %a, half %a1, i32 1
  %c = insertelement <8 x half> %b, half %a2, i32 2
  %d = insertelement <8 x half> %c, half %a3, i32 3
  %e = insertelement <8 x half> %d, half %a4, i32 4
  %f = insertelement <8 x half> %e, half %a5, i32 5
  %g = insertelement <8 x half> %f, half %a6, i32 6
  %h = insertelement <8 x half> %g, half %a7, i32 7
  ret <8 x half> %h
}

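; Build a v16f16 where only elements 0-3 and 12-15 are defined; the middle
; eight elements are left undef.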
define <16 x half> @build_vector_xxxxuuuuuuuuxxxx(half %a0, half %a1, half %a2, half %a3, half %a4, half %a5, half %a6, half %a7) {
; X64-LABEL: build_vector_xxxxuuuuuuuuxxxx:
; X64:       # %bb.0:
; X64-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
; X64-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; X64-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; X64-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT:    vpbroadcastq %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT:    retq
;
; X86-LABEL: build_vector_xxxxuuuuuuuuxxxx:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT:    vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm3 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
; X86-NEXT:    vpbroadcastq %xmm0, %xmm0
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
  %a = insertelement <16 x half> undef, half %a0, i32 0
  %b = insertelement <16 x half> %a, half %a1, i32 1
  %c = insertelement <16 x half> %b, half %a2, i32 2
  %d = insertelement <16 x half> %c, half %a3, i32 3
  %e = insertelement <16 x half> %d, half %a4, i32 12
  %f = insertelement <16 x half> %e, half %a5, i32 13
  %g = insertelement <16 x half> %f, half %a6, i32 14
  %h = insertelement <16 x half> %g, half %a7, i32 15
  ret <16 x half> %h
}

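; Single-source v8f16 shuffle that lowers to one vpshufb.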
define <8 x half> @regression1(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: regression1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 7, i32 0, i32 1, i32 2, i32 3, i32 7, i32 5>
  ret <8 x half> %res
}

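; Two i8 loads are zero-extended, converted to float, blended with a constant
; vector, and scaled by a broadcast constant.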
define <4 x float> @regression2(ptr addrspace(1) %0, <4 x i32> %1, <4 x i32> %2, <4 x float> %3, ptr %4) {
; X64-LABEL: regression2:
; X64:       # %bb.0:
; X64-NEXT:    vmovw (%rsi), %xmm0
; X64-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X64-NEXT:    vcvtdq2ps %xmm0, %xmm0
; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
; X64-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: regression2:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovw (%eax), %xmm0
; X86-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X86-NEXT:    vcvtdq2ps %xmm0, %xmm0
; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
; X86-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0
; X86-NEXT:    retl
  %6 = load i8, ptr %4, align 1
  %7 = getelementptr i8, ptr %4, i64 1
  %8 = addrspacecast ptr %7 to ptr addrspace(4)
  %9 = load i8, ptr addrspace(4) %8, align 1
  %10 = insertelement <2 x i8> poison, i8 %6, i32 0
  %11 = insertelement <2 x i8> %10, i8 %9, i32 1
  %12 = uitofp <2 x i8> %11 to <2 x float>
  %13 = shufflevector <2 x float> %12, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %14 = shufflevector <4 x float> %13, <4 x float> <float poison, float poison, float 0.000000e+00, float 2.550000e+02>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  %15 = fmul contract <4 x float> %14, <float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000>
  ret <4 x float> %15
}

; Make sure loads/stores of v4f16 are handled well on 32-bit targets where
; default widening legalization can't use i64.
define void @load_store_v4f16(ptr %x, ptr %y, ptr %z) {
; X64-LABEL: load_store_v4f16:
; X64:       # %bb.0:
; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    vaddph %xmm1, %xmm0, %xmm0
; X64-NEXT:    vmovlps %xmm0, (%rdx)
; X64-NEXT:    retq
;
; X86-LABEL: load_store_v4f16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT:    vaddph %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovlps %xmm0, (%eax)
; X86-NEXT:    retl
  %a = load <4 x half>, ptr %x
  %b = load <4 x half>, ptr %y
  %c = fadd <4 x half> %a, %b
  store <4 x half> %c, ptr %z
  ret void
}

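; Insert three scalars into elements 0-2 of a vector whose remaining defined
; elements are zero.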
define <8 x half> @test21(half %a, half %b, half %c) nounwind {
; X64-LABEL: test21:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; X64-NEXT:    vmovsh %xmm2, %xmm3, %xmm2
; X64-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpbroadcastw %xmm1, %xmm1
; X64-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; X64-NEXT:    retq
;
; X86-LABEL: test21:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpbroadcastw %xmm1, %xmm1
; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; X86-NEXT:    retl
  %1 = insertelement <8 x half> <half poison, half poison, half poison, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000>, half %a, i32 0
  %2 = insertelement <8 x half> %1, half %b, i32 1
  %3 = insertelement <8 x half> %2, half %c, i32 2
  ret <8 x half> %3
}

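; The AND of two i16 loads is inserted into element 0 of an otherwise-zero
; v16i16; a single vmovw materializes the result.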
define <16 x i16> @test22(ptr %mem) nounwind {
; X64-LABEL: test22:
; X64:       # %bb.0:
; X64-NEXT:    movzwl 0, %eax
; X64-NEXT:    andw (%rdi), %ax
; X64-NEXT:    vmovw %eax, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test22:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzwl 0, %ecx
; X86-NEXT:    andw (%eax), %cx
; X86-NEXT:    vmovw %ecx, %xmm0
; X86-NEXT:    retl
  %1 = load i16, ptr null, align 2
  %2 = load i16, ptr %mem, align 2
  %3 = and i16 %1, %2
  %4 = insertelement <16 x i16> <i16 undef, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, i16 %3, i32 0
  ret <16 x i16> %4
}

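; Regression test for PR52560: a v2i16 compare-and-select whose scalar result
; controls a branch.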
define void @pr52560(i8 %0, <2 x i16> %1, ptr %c) nounwind {
; X64-LABEL: pr52560:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movsbl %dil, %eax
; X64-NEXT:    vmovw %eax, %xmm1
; X64-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpcmpgtw %xmm2, %xmm1, %k1
; X64-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    vmovw %xmm0, %eax
; X64-NEXT:    testw %ax, %ax
; X64-NEXT:    je .LBB123_2
; X64-NEXT:  # %bb.1: # %for.body.preheader
; X64-NEXT:    movb $0, (%rsi)
; X64-NEXT:  .LBB123_2: # %for.end
; X64-NEXT:    retq
;
; X86-LABEL: pr52560:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovw %eax, %xmm1
; X86-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpcmpgtw %xmm2, %xmm1, %k1
; X86-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    vmovw %xmm0, %eax
; X86-NEXT:    testw %ax, %ax
; X86-NEXT:    je .LBB123_2
; X86-NEXT:  # %bb.1: # %for.body.preheader
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb $0, (%eax)
; X86-NEXT:  .LBB123_2: # %for.end
; X86-NEXT:    retl
entry:
  %conv = sext i8 %0 to i16
  %2 = insertelement <2 x i16> <i16 poison, i16 0>, i16 %conv, i32 0
  %3 = icmp sgt <2 x i16> %2, zeroinitializer
  %4 = select <2 x i1> %3, <2 x i16> %1, <2 x i16> <i16 0, i16 poison>
  %5 = extractelement <2 x i16> %4, i32 0
  %tobool.not14 = icmp eq i16 %5, 0
  br i1 %tobool.not14, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  store i8 0, ptr %c, align 1
  br label %for.end

for.end:                                          ; preds = %for.body.preheader, %entry
  ret void
}

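; Regression test for PR52561: 512-bit integer ops split into 256-bit halves
; under prefer-vector-width=256.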
define <16 x i32> @pr52561(<16 x i32> %a, <16 x i32> %b) "min-legal-vector-width"="256" "prefer-vector-width"="256" nounwind {
; X64-LABEL: pr52561:
; X64:       # %bb.0:
; X64-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
; X64-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112]
; X64-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
; X64-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; X64-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT:    vmovsh %xmm0, %xmm2, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: pr52561:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-32, %esp
; X86-NEXT:    subl $32, %esp
; X86-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; X86-NEXT:    vpaddd 8(%ebp), %ymm1, %ymm1
; X86-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112]
; X86-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; X86-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1
; X86-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X86-NEXT:    vmovsh %xmm0, %xmm2, %xmm0
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
  %1 = add <16 x i32> %a, <i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112>
  %2 = add <16 x i32> %1, %b
  %3 = and <16 x i32> %2, <i32 65535, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 65535>
  ret <16 x i32> %3
}

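; Regression test for PR59628: inserting zero at a variable index into a v8i16.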
define <8 x i16> @pr59628_xmm(i16 %arg) {
; X64-LABEL: pr59628_xmm:
; X64:       # %bb.0:
; X64-NEXT:    vmovw %edi, %xmm0
; X64-NEXT:    vpbroadcastw %edi, %xmm1
; X64-NEXT:    vpcmpneqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
; X64-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: pr59628_xmm:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X86-NEXT:    vpbroadcastw %eax, %xmm1
; X86-NEXT:    vmovsh %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpcmpneqw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %k1
; X86-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
  %I1 = insertelement <8 x i16> zeroinitializer, i16 %arg, i16 0
  %I2 = insertelement <8 x i16> %I1, i16 0, i16 %arg
  ret <8 x i16> %I2
}