llvm/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw %s -o - | FileCheck %s

define <16 x i16> @test_16xi16_perm_mask0(<16 x i16> %vec) {
; CHECK-LABEL: test_16xi16_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
  ret <16 x i16> %res
}
define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_z_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_z_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_z_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
define <16 x i16> @test_16xi16_perm_mask3(<16 x i16> %vec) {
; CHECK-LABEL: test_16xi16_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
  ret <16 x i16> %res
}
define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_z_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
define <16 x i16> @test_16xi16_perm_mem_mask0(ptr %vp) {
; CHECK-LABEL: test_16xi16_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, ptr %vp
  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
  ret <16 x i16> %res
}
define <16 x i16> @test_masked_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, ptr %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_z_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, ptr %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, ptr %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_z_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, ptr %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_16xi16_perm_mem_mask2(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, ptr %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_z_16xi16_perm_mem_mask2(ptr %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, ptr %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}

define <16 x i16> @test_16xi16_perm_mem_mask3(ptr %vp) {
; CHECK-LABEL: test_16xi16_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, ptr %vp
  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
  ret <16 x i16> %res
}
define <16 x i16> @test_masked_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, ptr %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, ptr %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}

define <32 x i16> @test_32xi16_perm_mask0(<32 x i16> %vec) {
; CHECK-LABEL: test_32xi16_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10]
; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
  ret <32 x i16> %res
}
define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10]
; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}

define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10]
; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}
define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16]
; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}

define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16]
; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}
define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27]
; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}

define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27]
; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}
define <32 x i16> @test_32xi16_perm_mask3(<32 x i16> %vec) {
; CHECK-LABEL: test_32xi16_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4]
; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
  ret <32 x i16> %res
}
define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4]
; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}

define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4]
; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}
define <32 x i16> @test_32xi16_perm_mem_mask0(ptr %vp) {
; CHECK-LABEL: test_32xi16_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12]
; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, ptr %vp
  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
  ret <32 x i16> %res
}
define <32 x i16> @test_masked_32xi16_perm_mem_mask0(ptr %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12]
; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, ptr %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}

define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(ptr %vp, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12]
; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, ptr %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}

define <32 x i16> @test_masked_32xi16_perm_mem_mask1(ptr %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6]
; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, ptr %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 31, i32 20, i32 2, i32 2, i32 23, i32 1, i32 0, i32 12, i32 16, i32 14, i32 15, i32 18, i32 21, i32 13, i32 11, i32 31, i32 8, i32 24, i32 13, i32 11, i32 2, i32 27, i32 22, i32 28, i32 14, i32 21, i32 3, i32 12, i32 6, i32 1, i32 30, i32 6>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}

define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(ptr %vp, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6]
; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, ptr %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 31, i32 20, i32 2, i32 2, i32 23, i32 1, i32 0, i32 12, i32 16, i32 14, i32 15, i32 18, i32 21, i32 13, i32 11, i32 31, i32 8, i32 24, i32 13, i32 11, i32 2, i32 27, i32 22, i32 28, i32 14, i32 21, i32 3, i32 12, i32 6, i32 1, i32 30, i32 6>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}

define <32 x i16> @test_masked_32xi16_perm_mem_mask2(ptr %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25]
; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, ptr %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 4, i32 6, i32 12, i32 17, i32 4, i32 31, i32 31, i32 4, i32 12, i32 21, i32 28, i32 15, i32 29, i32 10, i32 15, i32 15, i32 21, i32 6, i32 19, i32 7, i32 10, i32 30, i32 28, i32 26, i32 1, i32 4, i32 8, i32 25, i32 26, i32 18, i32 22, i32 25>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}

define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(ptr %vp, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25]
; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, ptr %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 4, i32 6, i32 12, i32 17, i32 4, i32 31, i32 31, i32 4, i32 12, i32 21, i32 28, i32 15, i32 29, i32 10, i32 15, i32 15, i32 21, i32 6, i32 19, i32 7, i32 10, i32 30, i32 28, i32 26, i32 1, i32 4, i32 8, i32 25, i32 26, i32 18, i32 22, i32 25>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}

define <32 x i16> @test_32xi16_perm_mem_mask3(ptr %vp) {
; CHECK-LABEL: test_32xi16_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27]
; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, ptr %vp
  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
  ret <32 x i16> %res
}
define <32 x i16> @test_masked_32xi16_perm_mem_mask3(ptr %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27]
; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, ptr %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}

define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(ptr %vp, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27]
; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, ptr %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}

define <8 x i32> @test_8xi32_perm_mask0(<8 x i32> %vec) {
; CHECK-LABEL: test_8xi32_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6]
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
  ret <8 x i32> %res
}
define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6]
; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
  ret <8 x i32> %res
}

define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6]
; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
  ret <8 x i32> %res
}
define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3]
; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
  ret <8 x i32> %res
}

define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3]
; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
  ret <8 x i32> %res
}
define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4]
; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
  ret <8 x i32> %res
}

define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4]
; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
  ret <8 x i32> %res
}
define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) {
; CHECK-LABEL: test_8xi32_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0]
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
  ret <8 x i32> %res
}
define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0]
; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
  ret <8 x i32> %res
}

define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0]
; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
  ret <8 x i32> %res
}
define <8 x i32> @test_8xi32_perm_mem_mask0(ptr %vp) {
; CHECK-LABEL: test_8xi32_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5]
; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %vec = load <8 x i32>, ptr %vp
  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
  ret <8 x i32> %res
}
define <8 x i32> @test_masked_8xi32_perm_mem_mask0(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5]
; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <8 x i32>, ptr %vp
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
  ret <8 x i32> %res
}

define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(ptr %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5]
; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <8 x i32>, ptr %vp
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
  ret <8 x i32> %res
}

define <8 x i32> @test_masked_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5]
; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <8 x i32>, ptr %vp
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 6, i32 1, i32 7, i32 6, i32 7, i32 6, i32 5>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
  ret <8 x i32> %res
}

define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5]
; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <8 x i32>, ptr %vp
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 6, i32 1, i32 7, i32 6, i32 7, i32 6, i32 5>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
  ret <8 x i32> %res
}

define <8 x i32> @test_masked_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3]
; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <8 x i32>, ptr %vp
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 4, i32 6, i32 1, i32 6, i32 3, i32 6, i32 3>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
  ret <8 x i32> %res
}

define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3]
; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <8 x i32>, ptr %vp
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 4, i32 6, i32 1, i32 6, i32 3, i32 6, i32 3>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
  ret <8 x i32> %res
}

define <8 x i32> @test_8xi32_perm_mem_mask3(ptr %vp) {
; CHECK-LABEL: test_8xi32_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5]
; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %vec = load <8 x i32>, ptr %vp
  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
  ret <8 x i32> %res
}
define <8 x i32> @test_masked_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5]
; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <8 x i32>, ptr %vp
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
  ret <8 x i32> %res
}

define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5]
; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <8 x i32>, ptr %vp
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
  ret <8 x i32> %res
}

define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec) {
; CHECK-LABEL: test_16xi32_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7]
; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
  ret <16 x i32> %res
}
define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7]
; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
  ret <16 x i32> %res
}

define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7]
; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
  ret <16 x i32> %res
}
define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3]
; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
  ret <16 x i32> %res
}

define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3]
; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
  ret <16 x i32> %res
}
define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5]
; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
  ret <16 x i32> %res
}

define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5]
; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
  ret <16 x i32> %res
}
define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) {
; CHECK-LABEL: test_16xi32_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12]
; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
  ret <16 x i32> %res
}
define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12]
; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
  ret <16 x i32> %res
}

define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12]
; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
  ret <16 x i32> %res
}
define <16 x i32> @test_16xi32_perm_mem_mask0(ptr %vp) {
; CHECK-LABEL: test_16xi32_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6]
; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %vec = load <16 x i32>, ptr %vp
  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
  ret <16 x i32> %res
}
define <16 x i32> @test_masked_16xi32_perm_mem_mask0(ptr %vp, <16 x i32> %vec2, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6]
; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <16 x i32>, ptr %vp
  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
  ret <16 x i32> %res
}

define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(ptr %vp, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6]
; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <16 x i32>, ptr %vp
  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
  ret <16 x i32> %res
}

define <16 x i32> @test_masked_16xi32_perm_mem_mask1(ptr %vp, <16 x i32> %vec2, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3]
; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <16 x i32>, ptr %vp
  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 5, i32 3, i32 4, i32 7, i32 15, i32 12, i32 4, i32 8, i32 11, i32 12, i32 7, i32 6, i32 12, i32 6, i32 3>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
  ret <16 x i32> %res
}

define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(ptr %vp, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3]
; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <16 x i32>, ptr %vp
  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 5, i32 3, i32 4, i32 7, i32 15, i32 12, i32 4, i32 8, i32 11, i32 12, i32 7, i32 6, i32 12, i32 6, i32 3>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
  ret <16 x i32> %res
}

define <16 x i32> @test_masked_16xi32_perm_mem_mask2(ptr %vp, <16 x i32> %vec2, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2]
; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <16 x i32>, ptr %vp
  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 14, i32 2, i32 7, i32 10, i32 7, i32 3, i32 0, i32 11, i32 9, i32 0, i32 4, i32 12, i32 10, i32 8, i32 2>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
  ret <16 x i32> %res
}

define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(ptr %vp, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2]
; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <16 x i32>, ptr %vp
  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 14, i32 2, i32 7, i32 10, i32 7, i32 3, i32 0, i32 11, i32 9, i32 0, i32 4, i32 12, i32 10, i32 8, i32 2>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
  ret <16 x i32> %res
}

define <16 x i32> @test_16xi32_perm_mem_mask3(ptr %vp) {
; CHECK-LABEL: test_16xi32_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1]
; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %vec = load <16 x i32>, ptr %vp
  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
  ret <16 x i32> %res
}
define <16 x i32> @test_masked_16xi32_perm_mem_mask3(ptr %vp, <16 x i32> %vec2, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1]
; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <16 x i32>, ptr %vp
  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
  ret <16 x i32> %res
}

define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(ptr %vp, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1]
; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <16 x i32>, ptr %vp
  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
  ret <16 x i32> %res
}

define <4 x i64> @test_4xi64_perm_mask0(<4 x i64> %vec) {
; CHECK-LABEL: test_4xi64_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1]
; CHECK-NEXT:    retq
  %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
  ret <4 x i64> %res
}
define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_4xi64_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1]
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
  ret <4 x i64> %res
}

define <4 x i64> @test_masked_z_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_4xi64_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
  ret <4 x i64> %res
}
define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_4xi64_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3]
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
  ret <4 x i64> %res
}

define <4 x i64> @test_masked_z_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_4xi64_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
  ret <4 x i64> %res
}
define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_4xi64_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1]
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
  ret <4 x i64> %res
}

define <4 x i64> @test_masked_z_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_4xi64_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
  ret <4 x i64> %res
}
define <4 x i64> @test_4xi64_perm_mask3(<4 x i64> %vec) {
; CHECK-LABEL: test_4xi64_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
; CHECK-NEXT:    retq
  %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
  ret <4 x i64> %res
}
define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_4xi64_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3]
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
  ret <4 x i64> %res
}

define <4 x i64> @test_masked_z_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_4xi64_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
  ret <4 x i64> %res
}
define <4 x i64> @test_4xi64_perm_mem_mask0(ptr %vp) {
; CHECK-LABEL: test_4xi64_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,1,2,0]
; CHECK-NEXT:    retq
  %vec = load <4 x i64>, ptr %vp
  %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
  ret <4 x i64> %res
}
define <4 x i64> @test_masked_4xi64_perm_mem_mask0(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_4xi64_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0]
; CHECK-NEXT:    retq
  %vec = load <4 x i64>, ptr %vp
  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
  ret <4 x i64> %res
}

define <4 x i64> @test_masked_z_4xi64_perm_mem_mask0(ptr %vp, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0]
; CHECK-NEXT:    retq
  %vec = load <4 x i64>, ptr %vp
  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
  ret <4 x i64> %res
}

define <4 x i64> @test_masked_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_4xi64_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1]
; CHECK-NEXT:    retq
  %vec = load <4 x i64>, ptr %vp
  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
  ret <4 x i64> %res
}

define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1]
; CHECK-NEXT:    retq
  %vec = load <4 x i64>, ptr %vp
  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
  ret <4 x i64> %res
}

define <4 x i64> @test_masked_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_4xi64_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0]
; CHECK-NEXT:    retq
  %vec = load <4 x i64>, ptr %vp
  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
  ret <4 x i64> %res
}

define <4 x i64> @test_masked_z_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0]
; CHECK-NEXT:    retq
  %vec = load <4 x i64>, ptr %vp
  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
  ret <4 x i64> %res
}

define <4 x i64> @test_4xi64_perm_mem_mask3(ptr %vp) {
; CHECK-LABEL: test_4xi64_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,0,1,3]
; CHECK-NEXT:    retq
  %vec = load <4 x i64>, ptr %vp
  %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
  ret <4 x i64> %res
}
define <4 x i64> @test_masked_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_4xi64_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3]
; CHECK-NEXT:    retq
  %vec = load <4 x i64>, ptr %vp
  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
  ret <4 x i64> %res
}

define <4 x i64> @test_masked_z_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3]
; CHECK-NEXT:    retq
  %vec = load <4 x i64>, ptr %vp
  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
  ret <4 x i64> %res
}

define <8 x i64> @test_8xi64_perm_mask0(<8 x i64> %vec) {
; CHECK-LABEL: test_8xi64_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6]
; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
  ret <8 x i64> %res
}
define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6]
; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
  ret <8 x i64> %res
}

define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6]
; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
  ret <8 x i64> %res
}
define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_perm_imm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5]
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
  ret <8 x i64> %res
}

define <8 x i64> @test_masked_z_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
  ret <8 x i64> %res
}
define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1]
; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
  ret <8 x i64> %res
}

define <8 x i64> @test_masked_z_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1]
; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
  ret <8 x i64> %res
}
define <8 x i64> @test_8xi64_perm_imm_mask3(<8 x i64> %vec) {
; CHECK-LABEL: test_8xi64_perm_imm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,1,3,1,7,5,7,5]
; CHECK-NEXT:    retq
  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
  ret <8 x i64> %res
}
define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_perm_imm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5]
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
  ret <8 x i64> %res
}

define <8 x i64> @test_masked_z_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
  ret <8 x i64> %res
}
define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_perm_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3]
; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
  ret <8 x i64> %res
}

define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_perm_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3]
; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
  ret <8 x i64> %res
}
define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_perm_imm_mask5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4]
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
  ret <8 x i64> %res
}

define <8 x i64> @test_masked_z_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
  ret <8 x i64> %res
}
define <8 x i64> @test_8xi64_perm_mask6(<8 x i64> %vec) {
; CHECK-LABEL: test_8xi64_perm_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7]
; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
  ret <8 x i64> %res
}
define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_perm_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7]
; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
  ret <8 x i64> %res
}

define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_perm_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7]
; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
  ret <8 x i64> %res
}
define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_perm_imm_mask7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7]
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
  ret <8 x i64> %res
}

define <8 x i64> @test_masked_z_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
  ret <8 x i64> %res
}
define <8 x i64> @test_8xi64_perm_mem_mask0(ptr %vp) {
; CHECK-LABEL: test_8xi64_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3]
; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %vec = load <8 x i64>, ptr %vp
  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
  ret <8 x i64> %res
}
define <8 x i64> @test_masked_8xi64_perm_mem_mask0(ptr %vp, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3]
; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <8 x i64>, ptr %vp
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
  ret <8 x i64> %res
}

define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(ptr %vp, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3]
; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <8 x i64>, ptr %vp
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
  ret <8 x i64> %res
}

define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(ptr %vp, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4]
; CHECK-NEXT:    retq
  %vec = load <8 x i64>, ptr %vp
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 0, i32 5, i32 5, i32 5, i32 4>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
  ret <8 x i64> %res
}

define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask1(ptr %vp, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4]
; CHECK-NEXT:    retq
  %vec = load <8 x i64>, ptr %vp
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 0, i32 5, i32 5, i32 5, i32 4>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
  ret <8 x i64> %res
}

define <8 x i64> @test_masked_8xi64_perm_mem_mask2(ptr %vp, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5]
; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <8 x i64>, ptr %vp
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 1, i32 4, i32 1, i32 1, i32 5, i32 5>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
  ret <8 x i64> %res
}

define <8 x i64> @test_masked_z_8xi64_perm_mem_mask2(ptr %vp, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5]
; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <8 x i64>, ptr %vp
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 1, i32 4, i32 1, i32 1, i32 5, i32 5>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
  ret <8 x i64> %res
}

define <8 x i64> @test_8xi64_perm_imm_mem_mask3(ptr %vp) {
; CHECK-LABEL: test_8xi64_perm_imm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5]
; CHECK-NEXT:    retq
  %vec = load <8 x i64>, ptr %vp
  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
  ret <8 x i64> %res
}
define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(ptr %vp, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5]
; CHECK-NEXT:    retq
  %vec = load <8 x i64>, ptr %vp
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
  ret <8 x i64> %res
}

define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask3(ptr %vp, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5]
; CHECK-NEXT:    retq
  %vec = load <8 x i64>, ptr %vp
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
  ret <8 x i64> %res
}

define <8 x i64> @test_masked_8xi64_perm_mem_mask4(ptr %vp, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_perm_mem_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6]
; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <8 x i64>, ptr %vp
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 0, i32 7, i32 0, i32 3, i32 5, i32 0, i32 6>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
  ret <8 x i64> %res
}

define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(ptr %vp, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6]
; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <8 x i64>, ptr %vp
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 0, i32 7, i32 0, i32 3, i32 5, i32 0, i32 6>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
  ret <8 x i64> %res
}

define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(ptr %vp, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4]
; CHECK-NEXT:    retq
  %vec = load <8 x i64>, ptr %vp
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 0, i32 0, i32 7, i32 5, i32 4, i32 4>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
  ret <8 x i64> %res
}

define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask5(ptr %vp, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4]
; CHECK-NEXT:    retq
  %vec = load <8 x i64>, ptr %vp
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 0, i32 0, i32 7, i32 5, i32 4, i32 4>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
  ret <8 x i64> %res
}

define <8 x i64> @test_8xi64_perm_mem_mask6(ptr %vp) {
; CHECK-LABEL: test_8xi64_perm_mem_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6]
; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %vec = load <8 x i64>, ptr %vp
  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
  ret <8 x i64> %res
}
define <8 x i64> @test_masked_8xi64_perm_mem_mask6(ptr %vp, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_perm_mem_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6]
; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <8 x i64>, ptr %vp
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
  ret <8 x i64> %res
}

define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(ptr %vp, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6]
; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <8 x i64>, ptr %vp
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
  ret <8 x i64> %res
}

define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(ptr %vp, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5]
; CHECK-NEXT:    retq
  %vec = load <8 x i64>, ptr %vp
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 1, i32 7, i32 4, i32 4, i32 5>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
  ret <8 x i64> %res
}

define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask7(ptr %vp, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5]
; CHECK-NEXT:    retq
  %vec = load <8 x i64>, ptr %vp
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 1, i32 7, i32 4, i32 4, i32 5>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
  ret <8 x i64> %res
}

define <8 x float> @test_8xfloat_perm_mask0(<8 x float> %vec) {
; CHECK-LABEL: test_8xfloat_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4]
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
  ret <8 x float> %res
}
define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_masked_8xfloat_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4]
; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqps %ymm4, %ymm2, %k1
; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1}
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
  ret <8 x float> %res
}

define <8 x float> @test_masked_z_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4]
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
  ret <8 x float> %res
}
define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_masked_8xfloat_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1]
; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqps %ymm4, %ymm2, %k1
; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1}
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
  ret <8 x float> %res
}

define <8 x float> @test_masked_z_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1]
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
  ret <8 x float> %res
}
define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_masked_8xfloat_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5]
; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqps %ymm4, %ymm2, %k1
; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1}
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
  ret <8 x float> %res
}

define <8 x float> @test_masked_z_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5]
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
  ret <8 x float> %res
}
define <8 x float> @test_8xfloat_perm_mask3(<8 x float> %vec) {
; CHECK-LABEL: test_8xfloat_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6]
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
  ret <8 x float> %res
}
define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_masked_8xfloat_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6]
; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqps %ymm4, %ymm2, %k1
; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1}
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
  ret <8 x float> %res
}

define <8 x float> @test_masked_z_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6]
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
  ret <8 x float> %res
}
define <8 x float> @test_8xfloat_perm_mem_mask0(ptr %vp) {
; CHECK-LABEL: test_8xfloat_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0]
; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %vec = load <8 x float>, ptr %vp
  %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
  ret <8 x float> %res
}
define <8 x float> @test_masked_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0]
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
; CHECK-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <8 x float>, ptr %vp
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
  ret <8 x float> %res
}

define <8 x float> @test_masked_z_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0]
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %ymm2, %ymm0, %k1
; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <8 x float>, ptr %vp
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
  ret <8 x float> %res
}

define <8 x float> @test_masked_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6]
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
; CHECK-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <8 x float>, ptr %vp
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 4, i32 0, i32 6, i32 6, i32 6>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
  ret <8 x float> %res
}

define <8 x float> @test_masked_z_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6]
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %ymm2, %ymm0, %k1
; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <8 x float>, ptr %vp
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 4, i32 0, i32 6, i32 6, i32 6>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
  ret <8 x float> %res
}

define <8 x float> @test_masked_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4]
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
; CHECK-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <8 x float>, ptr %vp
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 1, i32 5, i32 6, i32 6, i32 2, i32 4>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
  ret <8 x float> %res
}

define <8 x float> @test_masked_z_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4]
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %ymm2, %ymm0, %k1
; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <8 x float>, ptr %vp
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 1, i32 5, i32 6, i32 6, i32 2, i32 4>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
  ret <8 x float> %res
}

define <8 x float> @test_8xfloat_perm_mem_mask3(ptr %vp) {
; CHECK-LABEL: test_8xfloat_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0]
; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %vec = load <8 x float>, ptr %vp
  %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
  ret <8 x float> %res
}
define <8 x float> @test_masked_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0]
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
; CHECK-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <8 x float>, ptr %vp
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
  ret <8 x float> %res
}

define <8 x float> @test_masked_z_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0]
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %ymm2, %ymm0, %k1
; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <8 x float>, ptr %vp
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
  ret <8 x float> %res
}

define <16 x float> @test_16xfloat_perm_mask0(<16 x float> %vec) {
; CHECK-LABEL: test_16xfloat_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7]
; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
  ret <16 x float> %res
}
define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7]
; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqps %zmm4, %zmm2, %k1
; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
  ret <16 x float> %res
}

define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7]
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %zmm3, %zmm1, %k1
; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
  ret <16 x float> %res
}
define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1]
; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqps %zmm4, %zmm2, %k1
; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 4, i32 10, i32 4, i32 5, i32 8, i32 11, i32 2, i32 0, i32 10, i32 0, i32 0, i32 3, i32 10, i32 1>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
  ret <16 x float> %res
}

define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1]
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %zmm3, %zmm1, %k1
; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 4, i32 10, i32 4, i32 5, i32 8, i32 11, i32 2, i32 0, i32 10, i32 0, i32 0, i32 3, i32 10, i32 1>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
  ret <16 x float> %res
}
define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11]
; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqps %zmm4, %zmm2, %k1
; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 15, i32 6, i32 14, i32 3, i32 6, i32 5, i32 2, i32 5, i32 15, i32 11, i32 6, i32 6, i32 4, i32 8, i32 11>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
  ret <16 x float> %res
}

define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11]
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %zmm3, %zmm1, %k1
; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 15, i32 6, i32 14, i32 3, i32 6, i32 5, i32 2, i32 5, i32 15, i32 11, i32 6, i32 6, i32 4, i32 8, i32 11>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
  ret <16 x float> %res
}
define <16 x float> @test_16xfloat_perm_mask3(<16 x float> %vec) {
; CHECK-LABEL: test_16xfloat_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3]
; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
  ret <16 x float> %res
}
define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3]
; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqps %zmm4, %zmm2, %k1
; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
  ret <16 x float> %res
}

define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3]
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %zmm3, %zmm1, %k1
; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
  ret <16 x float> %res
}
define <16 x float> @test_16xfloat_perm_mem_mask0(ptr %vp) {
; CHECK-LABEL: test_16xfloat_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1]
; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %vec = load <16 x float>, ptr %vp
  %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
  ret <16 x float> %res
}
define <16 x float> @test_masked_16xfloat_perm_mem_mask0(ptr %vp, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1]
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %zmm3, %zmm1, %k1
; CHECK-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <16 x float>, ptr %vp
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
  ret <16 x float> %res
}

define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(ptr %vp, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1]
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %zmm2, %zmm0, %k1
; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <16 x float>, ptr %vp
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
  ret <16 x float> %res
}

define <16 x float> @test_masked_16xfloat_perm_mem_mask1(ptr %vp, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4]
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %zmm3, %zmm1, %k1
; CHECK-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <16 x float>, ptr %vp
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 4, i32 2, i32 3, i32 5, i32 11, i32 6, i32 4, i32 7, i32 6, i32 4, i32 14, i32 8, i32 15, i32 12, i32 9, i32 4>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
  ret <16 x float> %res
}

define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(ptr %vp, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4]
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %zmm2, %zmm0, %k1
; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <16 x float>, ptr %vp
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 4, i32 2, i32 3, i32 5, i32 11, i32 6, i32 4, i32 7, i32 6, i32 4, i32 14, i32 8, i32 15, i32 12, i32 9, i32 4>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
  ret <16 x float> %res
}

define <16 x float> @test_masked_16xfloat_perm_mem_mask2(ptr %vp, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5]
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %zmm3, %zmm1, %k1
; CHECK-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <16 x float>, ptr %vp
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 11, i32 6, i32 7, i32 0, i32 11, i32 0, i32 10, i32 9, i32 12, i32 4, i32 10, i32 3, i32 8, i32 5>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
  ret <16 x float> %res
}

define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(ptr %vp, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5]
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %zmm2, %zmm0, %k1
; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <16 x float>, ptr %vp
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 11, i32 6, i32 7, i32 0, i32 11, i32 0, i32 10, i32 9, i32 12, i32 4, i32 10, i32 3, i32 8, i32 5>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
  ret <16 x float> %res
}

define <16 x float> @test_16xfloat_perm_mem_mask3(ptr %vp) {
; CHECK-LABEL: test_16xfloat_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0]
; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %vec = load <16 x float>, ptr %vp
  %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
  ret <16 x float> %res
}
define <16 x float> @test_masked_16xfloat_perm_mem_mask3(ptr %vp, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0]
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %zmm3, %zmm1, %k1
; CHECK-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <16 x float>, ptr %vp
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
  ret <16 x float> %res
}

define <16 x float> @test_masked_z_16xfloat_perm_mem_mask3(ptr %vp, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0]
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %zmm2, %zmm0, %k1
; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <16 x float>, ptr %vp
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
  ret <16 x float> %res
}

define <4 x double> @test_4xdouble_perm_mask0(<4 x double> %vec) {
; CHECK-LABEL: test_4xdouble_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,2]
; CHECK-NEXT:    retq
  %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
  ret <4 x double> %res
}
define <4 x double> @test_masked_4xdouble_perm_mask0(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_4xdouble_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2]
; CHECK-NEXT:    vmovapd %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

define <4 x double> @test_masked_z_4xdouble_perm_mask0(<4 x double> %vec, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_4xdouble_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}
define <4 x double> @test_masked_4xdouble_perm_mask1(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_4xdouble_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0]
; CHECK-NEXT:    vmovapd %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

define <4 x double> @test_masked_z_4xdouble_perm_mask1(<4 x double> %vec, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_4xdouble_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}
define <4 x double> @test_masked_4xdouble_perm_mask2(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_4xdouble_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1]
; CHECK-NEXT:    vmovapd %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

define <4 x double> @test_masked_z_4xdouble_perm_mask2(<4 x double> %vec, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_4xdouble_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}
define <4 x double> @test_4xdouble_perm_mask3(<4 x double> %vec) {
; CHECK-LABEL: test_4xdouble_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,2]
; CHECK-NEXT:    retq
  %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
  ret <4 x double> %res
}
define <4 x double> @test_masked_4xdouble_perm_mask3(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_4xdouble_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2]
; CHECK-NEXT:    vmovapd %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

define <4 x double> @test_masked_z_4xdouble_perm_mask3(<4 x double> %vec, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_4xdouble_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}
define <4 x double> @test_4xdouble_perm_mem_mask0(ptr %vp) {
; CHECK-LABEL: test_4xdouble_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = mem[0,0,2,0]
; CHECK-NEXT:    retq
  %vec = load <4 x double>, ptr %vp
  %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
  ret <4 x double> %res
}
define <4 x double> @test_masked_4xdouble_perm_mem_mask0(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0]
; CHECK-NEXT:    retq
  %vec = load <4 x double>, ptr %vp
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

define <4 x double> @test_masked_z_4xdouble_perm_mem_mask0(ptr %vp, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0]
; CHECK-NEXT:    retq
  %vec = load <4 x double>, ptr %vp
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}

define <4 x double> @test_masked_4xdouble_perm_mem_mask1(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2]
; CHECK-NEXT:    retq
  %vec = load <4 x double>, ptr %vp
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

define <4 x double> @test_masked_z_4xdouble_perm_mem_mask1(ptr %vp, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2]
; CHECK-NEXT:    retq
  %vec = load <4 x double>, ptr %vp
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}

define <4 x double> @test_masked_4xdouble_perm_mem_mask2(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1]
; CHECK-NEXT:    retq
  %vec = load <4 x double>, ptr %vp
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

define <4 x double> @test_masked_z_4xdouble_perm_mem_mask2(ptr %vp, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1]
; CHECK-NEXT:    retq
  %vec = load <4 x double>, ptr %vp
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}

define <4 x double> @test_4xdouble_perm_mem_mask3(ptr %vp) {
; CHECK-LABEL: test_4xdouble_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = mem[3,2,3,2]
; CHECK-NEXT:    retq
  %vec = load <4 x double>, ptr %vp
  %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
  ret <4 x double> %res
}
define <4 x double> @test_masked_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2]
; CHECK-NEXT:    retq
  %vec = load <4 x double>, ptr %vp
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

define <4 x double> @test_masked_z_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2]
; CHECK-NEXT:    retq
  %vec = load <4 x double>, ptr %vp
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}

define <8 x double> @test_8xdouble_perm_mask0(<8 x double> %vec) {
; CHECK-LABEL: test_8xdouble_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4]
; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
  ret <8 x double> %res
}
define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4]
; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm2, %k1
; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT:    vmovapd %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
  ret <8 x double> %res
}

define <8 x double> @test_masked_z_8xdouble_perm_mask0(<8 x double> %vec, <8 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4]
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm1, %k1
; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}
define <8 x double> @test_masked_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6]
; CHECK-NEXT:    vmovapd %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
  ret <8 x double> %res
}

define <8 x double> @test_masked_z_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}
define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7]
; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm2, %k1
; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT:    vmovapd %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
  ret <8 x double> %res
}

define <8 x double> @test_masked_z_8xdouble_perm_mask2(<8 x double> %vec, <8 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7]
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm1, %k1
; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}
define <8 x double> @test_8xdouble_perm_imm_mask3(<8 x double> %vec) {
; CHECK-LABEL: test_8xdouble_perm_imm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[1,3,3,0,5,7,7,4]
; CHECK-NEXT:    retq
  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
  ret <8 x double> %res
}
define <8 x double> @test_masked_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4]
; CHECK-NEXT:    vmovapd %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
  ret <8 x double> %res
}

define <8 x double> @test_masked_z_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}
define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_perm_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1]
; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm2, %k1
; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT:    vmovapd %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
  ret <8 x double> %res
}

define <8 x double> @test_masked_z_8xdouble_perm_mask4(<8 x double> %vec, <8 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_perm_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1]
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm1, %k1
; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}
define <8 x double> @test_masked_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7]
; CHECK-NEXT:    vmovapd %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
  ret <8 x double> %res
}

define <8 x double> @test_masked_z_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}
define <8 x double> @test_8xdouble_perm_mask6(<8 x double> %vec) {
; CHECK-LABEL: test_8xdouble_perm_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2]
; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
  ret <8 x double> %res
}
define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_perm_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2]
; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm2, %k1
; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT:    vmovapd %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
  ret <8 x double> %res
}

define <8 x double> @test_masked_z_8xdouble_perm_mask6(<8 x double> %vec, <8 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_perm_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2]
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm1, %k1
; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}
define <8 x double> @test_masked_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6]
; CHECK-NEXT:    vmovapd %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
  ret <8 x double> %res
}

define <8 x double> @test_masked_z_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}
define <8 x double> @test_8xdouble_perm_mem_mask0(ptr %vp) {
; CHECK-LABEL: test_8xdouble_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1]
; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %vec = load <8 x double>, ptr %vp
  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
  ret <8 x double> %res
}
define <8 x double> @test_masked_8xdouble_perm_mem_mask0(ptr %vp, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1]
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm1, %k1
; CHECK-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <8 x double>, ptr %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
  ret <8 x double> %res
}

define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(ptr %vp, <8 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1]
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm0, %k1
; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <8 x double>, ptr %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}

define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask1(ptr %vp, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, ptr %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 2, i32 0, i32 3, i32 4, i32 6, i32 4, i32 7>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
  ret <8 x double> %res
}

define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask1(ptr %vp, <8 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, ptr %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 2, i32 0, i32 3, i32 4, i32 6, i32 4, i32 7>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}

define <8 x double> @test_masked_8xdouble_perm_mem_mask2(ptr %vp, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5]
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm1, %k1
; CHECK-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <8 x double>, ptr %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 6, i32 7, i32 2, i32 7, i32 7, i32 6, i32 2, i32 5>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
  ret <8 x double> %res
}

define <8 x double> @test_masked_z_8xdouble_perm_mem_mask2(ptr %vp, <8 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5]
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm0, %k1
; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <8 x double>, ptr %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 6, i32 7, i32 2, i32 7, i32 7, i32 6, i32 2, i32 5>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}

define <8 x double> @test_8xdouble_perm_imm_mem_mask3(ptr %vp) {
; CHECK-LABEL: test_8xdouble_perm_imm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, ptr %vp
  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
  ret <8 x double> %res
}
define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask3(ptr %vp, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, ptr %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
  ret <8 x double> %res
}

define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask3(ptr %vp, <8 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, ptr %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}

define <8 x double> @test_masked_8xdouble_perm_mem_mask4(ptr %vp, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0]
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm1, %k1
; CHECK-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <8 x double>, ptr %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 5, i32 6, i32 0, i32 6, i32 0>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
  ret <8 x double> %res
}

define <8 x double> @test_masked_z_8xdouble_perm_mem_mask4(ptr %vp, <8 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0]
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm0, %k1
; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <8 x double>, ptr %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 5, i32 6, i32 0, i32 6, i32 0>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}

define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask5(ptr %vp, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, ptr %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 3, i32 6, i32 6, i32 6, i32 7>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
  ret <8 x double> %res
}

define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask5(ptr %vp, <8 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, ptr %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 3, i32 6, i32 6, i32 6, i32 7>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}

define <8 x double> @test_8xdouble_perm_mem_mask6(ptr %vp) {
; CHECK-LABEL: test_8xdouble_perm_mem_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5]
; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %vec = load <8 x double>, ptr %vp
  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
  ret <8 x double> %res
}
define <8 x double> @test_masked_8xdouble_perm_mem_mask6(ptr %vp, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5]
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm1, %k1
; CHECK-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <8 x double>, ptr %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
  ret <8 x double> %res
}

define <8 x double> @test_masked_z_8xdouble_perm_mem_mask6(ptr %vp, <8 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5]
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm0, %k1
; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <8 x double>, ptr %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}

define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask7(ptr %vp, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, ptr %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
  ret <8 x double> %res
}

define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask7(ptr %vp, <8 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, ptr %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}