llvm/test/CodeGen/X86/vector-replication-i1-mask.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512F-ONLY,AVX512F-SLOW,FALLBACK0
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512F-ONLY,AVX512F-FAST,FALLBACK1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ,AVX512DQ-SLOW,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512DQ,AVX512DQ-FAST,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512BW-ONLY,AVX512BW-SLOW,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512BW-ONLY,AVX512BW-FAST,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512VBMI-ONLY,AVX512VBMI-SLOW,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512vbmi,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512VBMI-ONLY,AVX512VBMI-FAST,FALLBACK7
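
; The tests below replicate each lane of a narrow i1 mask vector by a fixed
; factor (mask_replication_factor<K>_vf<N>: an N-lane mask widened K times),
; then feed the widened mask to a 64-byte-aligned masked load whose result is
; stored to %out.vec. The RUN lines cover the different AVX512 mask lowerings
; (AVX512F only, +DQ, +BW, +VBMI), each with and without the fast
; variable-shuffle tuning flags.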

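; Factor 2, vf2: <2 x i1> -> <4 x i1> <m0,m0,m1,m1>, masking a v4i32 load.
; AVX512F/BW materialize the mask as dwords and duplicate each lane via
; vpmovsxdq before retesting; AVX512DQ moves the mask straight into and out
; of a vector with vpmovm2d/vpmovd2m.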
define void @mask_replication_factor2_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor2_vf2:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512F-ONLY-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxdq %xmm0, %xmm0
; AVX512F-ONLY-NEXT:    vptestmd %xmm0, %xmm0, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %xmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor2_vf2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %xmm0
; AVX512DQ-NEXT:    vpmovsxdq %xmm0, %xmm0
; AVX512DQ-NEXT:    vpmovd2m %xmm0, %k1
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %xmm0 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor2_vf2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovq (%rdi), %k1
; AVX512BW-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxdq %xmm0, %xmm0
; AVX512BW-NEXT:    vptestmd %xmm0, %xmm0, %k1
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %xmm0 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
  %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
  %data = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %in.vec, i32 64, <4 x i1> %tgt.mask, <4 x i32> poison)
  %data.padded = shufflevector <4 x i32> %data, <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  store <4 x i32> %data, ptr %out.vec, align 64
  ret void
}

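; Factor 2, vf4: <4 x i1> -> <8 x i1>, masking a v8i32 load; the same
; pattern as vf2 but in ymm registers.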
define void @mask_replication_factor2_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor2_vf4:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-ONLY-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxdq %xmm0, %ymm0
; AVX512F-ONLY-NEXT:    vptestmd %ymm0, %ymm0, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor2_vf4:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovb (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %ymm0
; AVX512DQ-NEXT:    vpmovsxdq %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmovd2m %ymm0, %k1
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor2_vf4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovw (%rdi), %k1
; AVX512BW-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxdq %xmm0, %ymm0
; AVX512BW-NEXT:    vptestmd %ymm0, %ymm0, %k1
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
  %data = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %in.vec, i32 64, <8 x i1> %tgt.mask, <8 x i32> poison)
  %data.padded = shufflevector <8 x i32> %data, <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  store <8 x i32> %data, ptr %out.vec, align 64
  ret void
}

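; Factor 2, vf8: <8 x i1> -> <16 x i1>; a single vpermd with indices
; [0,0,1,1,...,7,7] replicates the dword mask within one zmm.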
define void @mask_replication_factor2_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor2_vf8:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor2_vf8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovb (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor2_vf8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovw (%rdi), %k1
; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
  %data = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr %in.vec, i32 64, <16 x i1> %tgt.mask, <16 x i32> poison)
  store <16 x i32> %data, ptr %out.vec, align 64
  ret void
}

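; Factor 2, vf16: <16 x i1> -> <32 x i1>, two zmm loads/stores. AVX512F/DQ
; permute twice in the dword domain; AVX512BW replicates once in the word
; domain (vpermw) and splits the 32-bit result mask with kshiftrd.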
define void @mask_replication_factor2_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor2_vf16:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor2_vf16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k2
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor2_vf16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovw (%rdi), %k0
; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpmovw2m %zmm0, %k1
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
  %data = call <32 x i32> @llvm.masked.load.v32i32.p0(ptr %in.vec, i32 64, <32 x i1> %tgt.mask, <32 x i32> poison)
  store <32 x i32> %data, ptr %out.vec, align 64
  ret void
}

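; Factor 2, vf32: <32 x i1> -> <64 x i1>, four zmm loads/stores. AVX512BW
; replicates in the byte domain with vpshufb; AVX512VBMI needs only a single
; cross-lane vpermb.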
define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor2_vf32:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k2
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm2
; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k2
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k3
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k4
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor2_vf32:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    kmovw 2(%rdi), %k1
; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k2
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k3
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k4
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k3} {z}
; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-ONLY-LABEL: mask_replication_factor2_vf32:
; AVX512BW-ONLY:       # %bb.0:
; AVX512BW-ONLY-NEXT:    kmovq (%rdi), %k0
; AVX512BW-ONLY-NEXT:    vpmovm2b %k0, %zmm0
; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3]
; AVX512BW-ONLY-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k1
; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-ONLY-NEXT:    vzeroupper
; AVX512BW-ONLY-NEXT:    retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor2_vf32:
; AVX512VBMI-ONLY:       # %bb.0:
; AVX512VBMI-ONLY-NEXT:    kmovq (%rdi), %k0
; AVX512VBMI-ONLY-NEXT:    vpmovm2b %k0, %zmm0
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k1
; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512VBMI-ONLY-NEXT:    vzeroupper
; AVX512VBMI-ONLY-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
  %data = call <64 x i32> @llvm.masked.load.v64i32.p0(ptr %in.vec, i32 64, <64 x i1> %tgt.mask, <64 x i32> poison)
  store <64 x i32> %data, ptr %out.vec, align 64
  ret void
}

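; Factor 2, vf64: <64 x i1> -> <128 x i1>, eight zmm loads/stores. The
; AVX512F/DQ lowerings spill a mask register to the stack (see the 2-byte
; Spill/Reload); AVX512BW/VBMI instead carve up the 64-bit result mask with
; kshiftrq/kshiftrd.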
define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor2_vf64:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k3
; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k5
; AVX512F-ONLY-NEXT:    kmovw 4(%rdi), %k4
; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm2
; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm3
; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k4
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k6
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k5} {z}
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm3
; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k5
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k7
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z}
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k3
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k3} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k7} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k6} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k4} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512F-ONLY-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor2_vf64:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    kmovw 2(%rdi), %k5
; AVX512DQ-NEXT:    kmovw 4(%rdi), %k3
; AVX512DQ-NEXT:    kmovw 6(%rdi), %k1
; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k2
; AVX512DQ-NEXT:    vpmovm2d %k3, %zmm0
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm3
; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k3
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k4
; AVX512DQ-NEXT:    vpmovm2d %k5, %zmm0
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm3
; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k5
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k6
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k7
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k7} {z}
; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k6} {z}
; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
; AVX512DQ-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-ONLY-LABEL: mask_replication_factor2_vf64:
; AVX512BW-ONLY:       # %bb.0:
; AVX512BW-ONLY-NEXT:    kmovq (%rdi), %k0
; AVX512BW-ONLY-NEXT:    vpmovm2b %k0, %zmm0
; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,6,7,6,7]
; AVX512BW-ONLY-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512BW-ONLY-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm1, %zmm1
; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3]
; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm0, %zmm0
; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k2
; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k2, %k3
; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k2, %k2
; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k2, %k3
; AVX512BW-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-ONLY-NEXT:    vzeroupper
; AVX512BW-ONLY-NEXT:    retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor2_vf64:
; AVX512VBMI-ONLY:       # %bb.0:
; AVX512VBMI-ONLY-NEXT:    kmovq (%rdi), %k0
; AVX512VBMI-ONLY-NEXT:    vpmovm2b %k0, %zmm0
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm1
; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm1, %k1
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k2
; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k2, %k3
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k2, %k2
; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k2, %k3
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512VBMI-ONLY-NEXT:    vzeroupper
; AVX512VBMI-ONLY-NEXT:    retq
  %src.mask = load <64 x i1>, ptr %in.maskvec, align 64
  %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
  %data = call <128 x i32> @llvm.masked.load.v128i32.p0(ptr %in.vec, i32 64, <128 x i1> %tgt.mask, <128 x i32> poison)
  store <128 x i32> %data, ptr %out.vec, align 64
  ret void
}

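; Factor 3, vf2: <2 x i1> -> <6 x i1>. The non-power-of-two mask is computed
; in a ymm and clamped to the low six lanes with a movb $63 bound before the
; masked v6i32 load.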
define void @mask_replication_factor3_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor3_vf2:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-ONLY-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,1,0,0]
; AVX512F-ONLY-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512F-ONLY-NEXT:    vpslld $31, %ymm0, %ymm0
; AVX512F-ONLY-NEXT:    movb $63, %al
; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
; AVX512F-ONLY-NEXT:    vptestmd %ymm0, %ymm0, %k1 {%k1}
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-ONLY-NEXT:    vmovq %xmm1, 16(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor3_vf2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovb (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,1,0,0]
; AVX512DQ-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT:    movb $63, %al
; AVX512DQ-NEXT:    kmovw %eax, %k1
; AVX512DQ-NEXT:    vpcmpgtd %ymm0, %ymm1, %k1 {%k1}
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT:    vmovq %xmm1, 16(%rdx)
; AVX512DQ-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor3_vf2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovw (%rdi), %k1
; AVX512BW-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,1,0,0]
; AVX512BW-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT:    vpslld $31, %ymm0, %ymm0
; AVX512BW-NEXT:    movb $63, %al
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vptestmd %ymm0, %ymm0, %k1 {%k1}
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vmovq %xmm1, 16(%rdx)
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
  %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
  %data = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr %in.vec, i32 64, <6 x i1> %tgt.mask, <6 x i32> poison)
  %data.padded = shufflevector <6 x i32> %data, <6 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  store <6 x i32> %data, ptr %out.vec, align 64
  ret void
}

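; Factor 3, vf4: <4 x i1> -> <12 x i1>, clamped to the low twelve lanes with
; movw $4095 (0xFFF).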
define void @mask_replication_factor3_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor3_vf4:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,0,0,0,0]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512F-ONLY-NEXT:    movw $4095, %ax # imm = 0xFFF
; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vextracti32x4 $2, %zmm0, 32(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor3_vf4:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,0,0,0,0]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT:    movw $4095, %ax # imm = 0xFFF
; AVX512DQ-NEXT:    kmovw %eax, %k1
; AVX512DQ-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1 {%k1}
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm0, 32(%rdx)
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor3_vf4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovw (%rdi), %k1
; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,0,0,0,0]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512BW-NEXT:    movw $4095, %ax # imm = 0xFFF
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, 32(%rdx)
; AVX512BW-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
  %data = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %in.vec, i32 64, <12 x i1> %tgt.mask, <12 x i32> poison)
  %data.padded = shufflevector <12 x i32> %data, <12 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
  store <12 x i32> %data, ptr %out.vec, align 64
  ret void
}

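; Factor 3, vf8: <8 x i1> -> <24 x i1>, split across a zmm and a ymm store.
; AVX512BW does the whole replication in the word domain and bounds it to 24
; lanes with 0xFFFFFF.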
define void @mask_replication_factor3_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor3_vf8:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; AVX512F-ONLY-NEXT:    movw $1, %ax
; AVX512F-ONLY-NEXT:    kmovw %eax, %k2
; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k2}
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-ONLY-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [5,5,6,6,6,7,7,7]
; AVX512F-ONLY-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512F-ONLY-NEXT:    vptestmd %ymm0, %ymm0, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa %ymm1, 64(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor3_vf8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovb (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm1
; AVX512DQ-NEXT:    movw $1, %ax
; AVX512DQ-NEXT:    kmovw %eax, %k1
; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT:    vpmovm2d %k0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [5,5,6,6,6,7,7,7]
; AVX512DQ-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT:    vpmovd2m %ymm0, %k2
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k2} {z}
; AVX512DQ-NEXT:    vmovdqa %ymm1, 64(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor3_vf8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovw (%rdi), %k0
; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    movl $16777215, %eax # imm = 0xFFFFFF
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vpcmpgtw %zmm0, %zmm1, %k1 {%k1}
; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512BW-NEXT:    vmovdqa %ymm0, 64(%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
  %data = call <24 x i32> @llvm.masked.load.v24i32.p0(ptr %in.vec, i32 64, <24 x i1> %tgt.mask, <24 x i32> poison)
  %data.padded = shufflevector <24 x i32> %data, <24 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  store <24 x i32> %data, ptr %out.vec, align 64
  ret void
}

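; Factor 3, vf16: <16 x i1> -> <48 x i1>; three vpermd index patterns cover
; lanes 0-15, 16-31 and 32-47 of the replicated mask.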
define void @mask_replication_factor3_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor3_vf16:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT:    movw $1, %ax
; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k3
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm1 {%k3} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm2 {%k2} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 128(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor3_vf16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT:    movw $1, %ax
; AVX512DQ-NEXT:    kmovw %eax, %k1
; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k2
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k3
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm1 {%k3} {z}
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm2 {%k2} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 128(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor3_vf16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovw (%rdi), %k1
; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm2, %zmm2
; AVX512BW-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
  %data = call <48 x i32> @llvm.masked.load.v48i32.p0(ptr %in.vec, i32 64, <48 x i1> %tgt.mask, <48 x i32> poison)
  store <48 x i32> %data, ptr %out.vec, align 64
  ret void
}

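; Factor 3, vf32: <32 x i1> -> <96 x i1>, six zmm loads/stores. The AVX512BW
; lowering assembles each 16-bit mask chunk with long kshift/kand/kor chains
; on k-registers, spilling the constant bit masks to the stack along the way.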
define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor3_vf32:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k2
; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm2
; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k2
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; AVX512F-ONLY-NEXT:    movw $1, %ax
; AVX512F-ONLY-NEXT:    kmovw %eax, %k2
; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k2}
; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k3
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm3
; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k2
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm3, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k4
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k5
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm3, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k6
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k3} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm1 {%k6} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm2 {%k5} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm4 {%k4} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm5 {%k2} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 64(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 128(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 256(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 320(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor3_vf32:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k1
; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm2
; AVX512DQ-NEXT:    movw $1, %ax
; AVX512DQ-NEXT:    kmovw %eax, %k1
; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k2
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm3
; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm3, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k3
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k4
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k5
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm3, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k6
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm1 {%k6} {z}
; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm2 {%k5} {z}
; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm3 {%k4} {z}
; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm4 {%k3} {z}
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 64(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 128(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 256(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 320(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor3_vf32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovd (%rdi), %k0
; AVX512BW-NEXT:    kshiftrd $1, %k0, %k1
; AVX512BW-NEXT:    movw $-3, %ax
; AVX512BW-NEXT:    kmovd %eax, %k4
; AVX512BW-NEXT:    kmovw (%rdi), %k2
; AVX512BW-NEXT:    kandw %k4, %k2, %k3
; AVX512BW-NEXT:    kmovq %k4, %k7
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $14, %k2, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    movw $-5, %ax
; AVX512BW-NEXT:    kmovd %eax, %k4
; AVX512BW-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k4, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $13, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k3, %k2
; AVX512BW-NEXT:    movw $-9, %ax
; AVX512BW-NEXT:    kmovd %eax, %k3
; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    movw $-17, %ax
; AVX512BW-NEXT:    kmovd %eax, %k5
; AVX512BW-NEXT:    kandw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    movw $-33, %ax
; AVX512BW-NEXT:    kmovd %eax, %k3
; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k2, %k1
; AVX512BW-NEXT:    movw $-65, %ax
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $2, %k0, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $9, %k2, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    movw $-129, %ax
; AVX512BW-NEXT:    kmovd %eax, %k3
; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k2, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    movw $-257, %ax # imm = 0xFEFF
; AVX512BW-NEXT:    kmovd %eax, %k3
; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-513, %ax # imm = 0xFDFF
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $3, %k0, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $6, %k2, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    movw $-1025, %ax # imm = 0xFBFF
; AVX512BW-NEXT:    kmovd %eax, %k3
; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k2, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    movw $-2049, %ax # imm = 0xF7FF
; AVX512BW-NEXT:    kmovd %eax, %k3
; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-4097, %ax # imm = 0xEFFF
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $4, %k0, %k4
; AVX512BW-NEXT:    kshiftlw $15, %k4, %k2
; AVX512BW-NEXT:    kshiftrw $3, %k2, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    movw $-8193, %ax # imm = 0xDFFF
; AVX512BW-NEXT:    kmovd %eax, %k6
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $2, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-16385, %ax # imm = 0xBFFF
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k4, %k4
; AVX512BW-NEXT:    korw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $5, %k0, %k2
; AVX512BW-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT:    kshiftrd $27, %k0, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k4
; AVX512BW-NEXT:    kshiftrd $26, %k0, %k1
; AVX512BW-NEXT:    kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kmovq %k7, %k2
; AVX512BW-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k4, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k4, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k4, %k4
; AVX512BW-NEXT:    korw %k4, %k1, %k1
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $28, %k0, %k4
; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $11, %k4, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k4, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k4, %k4
; AVX512BW-NEXT:    korw %k4, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $29, %k0, %k4
; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $8, %k4, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k4, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k4, %k4
; AVX512BW-NEXT:    korw %k4, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $30, %k0, %k4
; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $5, %k4, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k4, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k4, %k4
; AVX512BW-NEXT:    korw %k4, %k1, %k1
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $31, %k0, %k4
; AVX512BW-NEXT:    kshiftlw $15, %k4, %k7
; AVX512BW-NEXT:    kshiftrw $2, %k7, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k4, %k4
; AVX512BW-NEXT:    korw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT:    kshiftrd $21, %k0, %k1
; AVX512BW-NEXT:    kandw %k2, %k1, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $14, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k6, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $22, %k0, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $23, %k0, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $10, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $24, %k0, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $25, %k0, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT:    kshiftlw $14, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm2 {%k1} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k0, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k2, %k1
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $17, %k0, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $12, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $18, %k0, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $9, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $19, %k0, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $6, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $20, %k0, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm3 {%k1} {z}
; AVX512BW-NEXT:    kshiftrd $11, %k0, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k2
; AVX512BW-NEXT:    kshiftrd $10, %k0, %k4
; AVX512BW-NEXT:    kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $14, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $13, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $12, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k4, %k2
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftrd $12, %k0, %k4
; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $11, %k4, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $10, %k4, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $9, %k4, %k4
; AVX512BW-NEXT:    korw %k4, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrd $13, %k0, %k4
; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $8, %k4, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kandw %k5, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $7, %k4, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $6, %k4, %k4
; AVX512BW-NEXT:    korw %k4, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrd $14, %k0, %k4
; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $5, %k4, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $4, %k4, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $3, %k4, %k4
; AVX512BW-NEXT:    korw %k4, %k2, %k2
; AVX512BW-NEXT:    kandw %k7, %k2, %k2
; AVX512BW-NEXT:    kshiftrd $15, %k0, %k4
; AVX512BW-NEXT:    kshiftlw $15, %k4, %k6
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k2, %k2
; AVX512BW-NEXT:    kshiftlw $14, %k4, %k4
; AVX512BW-NEXT:    korw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftlw $1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $1, %k2, %k2
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm4 {%k2} {z}
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kshiftrw $14, %k4, %k4
; AVX512BW-NEXT:    korw %k4, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrd $6, %k0, %k4
; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $13, %k4, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $12, %k4, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $11, %k4, %k4
; AVX512BW-NEXT:    korw %k4, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrd $7, %k0, %k4
; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $10, %k4, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $9, %k4, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $8, %k4, %k4
; AVX512BW-NEXT:    korw %k4, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrd $8, %k0, %k4
; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $7, %k4, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $6, %k4, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $5, %k4, %k4
; AVX512BW-NEXT:    korw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrd $9, %k0, %k0
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k4
; AVX512BW-NEXT:    korw %k4, %k2, %k2
; AVX512BW-NEXT:    kandw %k5, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k4
; AVX512BW-NEXT:    korw %k4, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k2, %k0
; AVX512BW-NEXT:    kandw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k2
; AVX512BW-NEXT:    korw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k1
; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm5 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa64 %zmm5, 64(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm4, 128(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm2, 256(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm1, 320(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
  %data = call <96 x i32> @llvm.masked.load.v96i32.p0(ptr %in.vec, i32 64, <96 x i1> %tgt.mask, <96 x i32> poison)
  store <96 x i32> %data, ptr %out.vec, align 64
  ret void
}

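; Note: for factor-3/vf64 the three codegen strategies match the vf32 case
; above: AVX512F-ONLY round-trips the mask through a vector (vpternlogd to
; materialize it, vpermd to replicate lanes, vptestmd to re-form the mask),
; AVX512DQ uses the direct vpmovm2d/vpmovd2m mask<->vector moves, and
; AVX512BW assembles each 16-bit mask chunk bit-by-bit with
; kshift/kand/korw, spilling the 2-byte mask constants to the stack.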
define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor3_vf64:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm2
; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT:    movw $1, %ax
; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT:    kmovw 4(%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm6
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm7, %zmm0
; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm1, %zmm8
; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm2, %zmm9
; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm7, %zmm3
; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm1, %zmm10
; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm2, %zmm11
; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm7, %zmm4
; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm2, %zmm2
; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm7, %zmm5
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 704(%rsi), %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 640(%rsi), %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 576(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 512(%rsi), %zmm4 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm11, %zmm11, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm11 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm10, %zmm10, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm10 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm9, %zmm9, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm9 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm8, %zmm8, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 64(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 128(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm9, 256(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 320(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm10, 384(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm11, 448(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 512(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 576(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 640(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 704(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm7, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor3_vf64:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm2
; AVX512DQ-NEXT:    movw $1, %ax
; AVX512DQ-NEXT:    kmovw %eax, %k1
; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm3
; AVX512DQ-NEXT:    kmovw 4(%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm4
; AVX512DQ-NEXT:    kmovw 6(%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm5
; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm6
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm7, %zmm0
; AVX512DQ-NEXT:    vpermd %zmm3, %zmm1, %zmm8
; AVX512DQ-NEXT:    vpermd %zmm3, %zmm2, %zmm9
; AVX512DQ-NEXT:    vpermd %zmm3, %zmm7, %zmm3
; AVX512DQ-NEXT:    vpermd %zmm4, %zmm1, %zmm10
; AVX512DQ-NEXT:    vpermd %zmm4, %zmm2, %zmm11
; AVX512DQ-NEXT:    vpermd %zmm4, %zmm7, %zmm4
; AVX512DQ-NEXT:    vpermd %zmm5, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpermd %zmm5, %zmm2, %zmm2
; AVX512DQ-NEXT:    vpermd %zmm5, %zmm7, %zmm5
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm5, %k1
; AVX512DQ-NEXT:    vmovdqa32 704(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT:    vmovdqa32 640(%rsi), %zmm2 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT:    vmovdqa32 576(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k1
; AVX512DQ-NEXT:    vmovdqa32 512(%rsi), %zmm4 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm11, %k1
; AVX512DQ-NEXT:    vmovdqa32 448(%rsi), %zmm11 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm10, %k1
; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm10 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k1
; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm9, %k1
; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm9 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm8, %k1
; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm6, %k1
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm6 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 64(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 128(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 256(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 320(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 384(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm11, 448(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 512(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 576(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 640(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 704(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm7, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor3_vf64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovq (%rdi), %k0
; AVX512BW-NEXT:    kshiftrq $1, %k0, %k1
; AVX512BW-NEXT:    movw $-3, %ax
; AVX512BW-NEXT:    kmovd %eax, %k4
; AVX512BW-NEXT:    kmovw (%rdi), %k2
; AVX512BW-NEXT:    kandw %k4, %k2, %k3
; AVX512BW-NEXT:    kmovq %k4, %k7
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $14, %k2, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    movw $-5, %ax
; AVX512BW-NEXT:    kmovd %eax, %k4
; AVX512BW-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k4, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $13, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k3, %k2
; AVX512BW-NEXT:    movw $-9, %ax
; AVX512BW-NEXT:    kmovd %eax, %k3
; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    movw $-17, %ax
; AVX512BW-NEXT:    kmovd %eax, %k3
; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    movw $-33, %ax
; AVX512BW-NEXT:    kmovd %eax, %k3
; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k2, %k1
; AVX512BW-NEXT:    movw $-65, %ax
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrq $2, %k0, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $9, %k2, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    movw $-129, %ax
; AVX512BW-NEXT:    kmovd %eax, %k3
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kmovq %k3, %k5
; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $8, %k2, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    movw $-257, %ax # imm = 0xFEFF
; AVX512BW-NEXT:    kmovd %eax, %k3
; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-513, %ax # imm = 0xFDFF
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrq $3, %k0, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $6, %k2, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    movw $-1025, %ax # imm = 0xFBFF
; AVX512BW-NEXT:    kmovd %eax, %k3
; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k2, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    movw $-2049, %ax # imm = 0xF7FF
; AVX512BW-NEXT:    kmovd %eax, %k3
; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-4097, %ax # imm = 0xEFFF
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrq $4, %k0, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k3
; AVX512BW-NEXT:    kshiftrw $3, %k3, %k4
; AVX512BW-NEXT:    korw %k4, %k1, %k1
; AVX512BW-NEXT:    movw $-8193, %ax # imm = 0xDFFF
; AVX512BW-NEXT:    kmovd %eax, %k6
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $2, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    movw $-16385, %ax # imm = 0xBFFF
; AVX512BW-NEXT:    kmovd %eax, %k3
; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrq $5, %k0, %k2
; AVX512BW-NEXT:    kmovq %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT:    kshiftrq $59, %k0, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k2
; AVX512BW-NEXT:    kshiftrq $58, %k0, %k1
; AVX512BW-NEXT:    kmovq %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512BW-NEXT:    kmovq %k7, %k3
; AVX512BW-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k2, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k2, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrq $60, %k0, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $11, %k2, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k2, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrq $61, %k0, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $8, %k2, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k2, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrq $62, %k0, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $5, %k2, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k2, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrq $63, %k0, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k7
; AVX512BW-NEXT:    kshiftrw $2, %k7, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 704(%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT:    kshiftrq $53, %k0, %k1
; AVX512BW-NEXT:    kandw %k3, %k1, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $14, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k6, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrq $54, %k0, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrq $55, %k0, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $10, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrq $56, %k0, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrq $57, %k0, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 8-byte Reload
; AVX512BW-NEXT:    kshiftlw $14, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 640(%rsi), %zmm2 {%k1} {z}
; AVX512BW-NEXT:    kshiftrq $48, %k0, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k3, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrq $49, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $12, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrq $50, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $9, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrq $51, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $6, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrq $52, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 576(%rsi), %zmm3 {%k1} {z}
; AVX512BW-NEXT:    kshiftrq $43, %k0, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k2
; AVX512BW-NEXT:    kshiftrq $42, %k0, %k1
; AVX512BW-NEXT:    kmovq %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k3
; AVX512BW-NEXT:    kshiftrw $14, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $13, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $12, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k3, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrq $44, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $11, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $10, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $9, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftrq $45, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $8, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $7, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $6, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kandw %k5, %k2, %k2
; AVX512BW-NEXT:    kshiftrq $46, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $5, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $4, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $3, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kandw %k7, %k2, %k2
; AVX512BW-NEXT:    kshiftrq $47, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k6
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k2, %k2
; AVX512BW-NEXT:    kshiftlw $14, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftlw $1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $1, %k2, %k2
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    vmovdqa32 512(%rsi), %zmm4 {%k2} {z}
; AVX512BW-NEXT:    kshiftrq $37, %k0, %k2
; AVX512BW-NEXT:    kandw %k4, %k2, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $14, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrq $38, %k0, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $12, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $11, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrq $39, %k0, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $10, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $9, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $8, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrq $40, %k0, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $5, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kandw %k5, %k3, %k3
; AVX512BW-NEXT:    kshiftrq $41, %k0, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 8-byte Reload
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kshiftlw $1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $1, %k3, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k3, %k1
; AVX512BW-NEXT:    vmovdqa32 448(%rsi), %zmm5 {%k1} {z}
; AVX512BW-NEXT:    kshiftrq $32, %k0, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k3, %k1
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrq $33, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $12, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrq $34, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $9, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrq $35, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $6, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrq $36, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
; AVX512BW-NEXT:    kshiftrq $27, %k0, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k2
; AVX512BW-NEXT:    kshiftrq $26, %k0, %k3
; AVX512BW-NEXT:    kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $14, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $13, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $12, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k3, %k2
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrq $28, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $11, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $10, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $9, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrq $29, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $8, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $7, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $6, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrq $30, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $5, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $4, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $3, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kandw %k7, %k2, %k2
; AVX512BW-NEXT:    kshiftrq $31, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k6
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k2, %k2
; AVX512BW-NEXT:    kandw %k5, %k2, %k2
; AVX512BW-NEXT:    kshiftlw $14, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftlw $1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $1, %k2, %k2
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm7 {%k2} {z}
; AVX512BW-NEXT:    kshiftrq $21, %k0, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k2, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $14, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrq $22, %k0, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $12, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $11, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kandw %k4, %k3, %k3
; AVX512BW-NEXT:    kshiftrq $23, %k0, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $10, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $9, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $8, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrq $24, %k0, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $5, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrq $25, %k0, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 8-byte Reload
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kshiftlw $1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $1, %k3, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k3, %k1
; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm8 {%k1} {z}
; AVX512BW-NEXT:    kshiftrq $16, %k0, %k1
; AVX512BW-NEXT:    kandw %k5, %k1, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k3, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrq $17, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $12, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrq $18, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $9, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrq $19, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $6, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrq $20, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm9 {%k1} {z}
; AVX512BW-NEXT:    kshiftrq $11, %k0, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k2
; AVX512BW-NEXT:    kshiftrq $10, %k0, %k3
; AVX512BW-NEXT:    kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $14, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $13, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $12, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k3, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrq $12, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $11, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $10, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $9, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrq $13, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $8, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $7, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kandw %k5, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $6, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k2, %k2
; AVX512BW-NEXT:    kshiftrq $14, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $5, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $4, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $3, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kandw %k7, %k2, %k2
; AVX512BW-NEXT:    kshiftrq $15, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k6
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k2, %k2
; AVX512BW-NEXT:    kshiftlw $14, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftlw $1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $1, %k2, %k2
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm10 {%k2} {z}
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kshiftrw $14, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftrq $6, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $13, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $12, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $11, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftrq $7, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $10, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $9, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $8, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftrq $8, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $7, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $6, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kandw %k5, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $5, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftrq $9, %k0, %k0
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k2, %k0
; AVX512BW-NEXT:    kandw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 8-byte Reload
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k2
; AVX512BW-NEXT:    korw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k1
; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm11 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa64 %zmm11, 64(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm10, 128(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm9, 192(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm8, 256(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm7, 320(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm6, 384(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm5, 448(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm4, 512(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm3, 576(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm2, 640(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm1, 704(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask = load <64 x i1>, ptr %in.maskvec, align 64
  %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
  %data = call <192 x i32> @llvm.masked.load.v192i32.p0(ptr %in.vec, i32 64, <192 x i1> %tgt.mask, <192 x i32> poison)
  store <192 x i32> %data, ptr %out.vec, align 64
  ret void
}

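; Factor-4 tests: each source mask bit is replicated into 4 consecutive lanes
; before driving a masked load of i32 elements.
; vf2: 2 bits -> <8 x i1>. The -SLOW variants widen via vpmovsxdq+vpermq; the
; -FAST variants use a single cross-lane vpermd with indices [0,0,0,0,1,1,1,1].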
define void @mask_replication_factor4_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-SLOW-LABEL: mask_replication_factor4_vf2:
; AVX512F-SLOW:       # %bb.0:
; AVX512F-SLOW-NEXT:    kmovw (%rdi), %k1
; AVX512F-SLOW-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-SLOW-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512F-SLOW-NEXT:    vpmovsxdq %xmm0, %xmm0
; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX512F-SLOW-NEXT:    vptestmd %ymm0, %ymm0, %k1
; AVX512F-SLOW-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512F-SLOW-NEXT:    vzeroupper
; AVX512F-SLOW-NEXT:    retq
;
; AVX512F-FAST-LABEL: mask_replication_factor4_vf2:
; AVX512F-FAST:       # %bb.0:
; AVX512F-FAST-NEXT:    kmovw (%rdi), %k1
; AVX512F-FAST-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-FAST-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512F-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
; AVX512F-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512F-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k1
; AVX512F-FAST-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512F-FAST-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512F-FAST-NEXT:    vzeroupper
; AVX512F-FAST-NEXT:    retq
;
; AVX512DQ-SLOW-LABEL: mask_replication_factor4_vf2:
; AVX512DQ-SLOW:       # %bb.0:
; AVX512DQ-SLOW-NEXT:    kmovb (%rdi), %k0
; AVX512DQ-SLOW-NEXT:    vpmovm2d %k0, %ymm0
; AVX512DQ-SLOW-NEXT:    vpmovsxdq %xmm0, %xmm0
; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX512DQ-SLOW-NEXT:    vpmovd2m %ymm0, %k1
; AVX512DQ-SLOW-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512DQ-SLOW-NEXT:    vzeroupper
; AVX512DQ-SLOW-NEXT:    retq
;
; AVX512DQ-FAST-LABEL: mask_replication_factor4_vf2:
; AVX512DQ-FAST:       # %bb.0:
; AVX512DQ-FAST-NEXT:    kmovb (%rdi), %k0
; AVX512DQ-FAST-NEXT:    vpmovm2d %k0, %ymm0
; AVX512DQ-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
; AVX512DQ-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512DQ-FAST-NEXT:    vpmovd2m %ymm0, %k1
; AVX512DQ-FAST-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512DQ-FAST-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512DQ-FAST-NEXT:    vzeroupper
; AVX512DQ-FAST-NEXT:    retq
;
; AVX512BW-SLOW-LABEL: mask_replication_factor4_vf2:
; AVX512BW-SLOW:       # %bb.0:
; AVX512BW-SLOW-NEXT:    kmovw (%rdi), %k1
; AVX512BW-SLOW-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512BW-SLOW-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512BW-SLOW-NEXT:    vpmovsxdq %xmm0, %xmm0
; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX512BW-SLOW-NEXT:    vptestmd %ymm0, %ymm0, %k1
; AVX512BW-SLOW-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512BW-SLOW-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512BW-SLOW-NEXT:    vzeroupper
; AVX512BW-SLOW-NEXT:    retq
;
; AVX512BW-FAST-LABEL: mask_replication_factor4_vf2:
; AVX512BW-FAST:       # %bb.0:
; AVX512BW-FAST-NEXT:    kmovw (%rdi), %k1
; AVX512BW-FAST-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512BW-FAST-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512BW-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
; AVX512BW-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512BW-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k1
; AVX512BW-FAST-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512BW-FAST-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512BW-FAST-NEXT:    vzeroupper
; AVX512BW-FAST-NEXT:    retq
;
; AVX512VBMI-SLOW-LABEL: mask_replication_factor4_vf2:
; AVX512VBMI-SLOW:       # %bb.0:
; AVX512VBMI-SLOW-NEXT:    kmovw (%rdi), %k1
; AVX512VBMI-SLOW-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VBMI-SLOW-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VBMI-SLOW-NEXT:    vpmovsxdq %xmm0, %xmm0
; AVX512VBMI-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX512VBMI-SLOW-NEXT:    vptestmd %ymm0, %ymm0, %k1
; AVX512VBMI-SLOW-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512VBMI-SLOW-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512VBMI-SLOW-NEXT:    vzeroupper
; AVX512VBMI-SLOW-NEXT:    retq
;
; AVX512VBMI-FAST-LABEL: mask_replication_factor4_vf2:
; AVX512VBMI-FAST:       # %bb.0:
; AVX512VBMI-FAST-NEXT:    kmovw (%rdi), %k1
; AVX512VBMI-FAST-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VBMI-FAST-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VBMI-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
; AVX512VBMI-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512VBMI-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k1
; AVX512VBMI-FAST-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512VBMI-FAST-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512VBMI-FAST-NEXT:    vzeroupper
; AVX512VBMI-FAST-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
  %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
  %data = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %in.vec, i32 64, <8 x i1> %tgt.mask, <8 x i32> poison)
  %data.padded = shufflevector <8 x i32> %data, <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  store <8 x i32> %data, ptr %out.vec, align 64
  ret void
}

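; vf4: 4 bits -> <16 x i1>, exactly one zmm write-mask, so every target lowers
; this to a single vpermd with indices [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3].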
define void @mask_replication_factor4_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor4_vf4:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor4_vf4:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor4_vf4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovw (%rdi), %k1
; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
  %data = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr %in.vec, i32 64, <16 x i1> %tgt.mask, <16 x i32> poison)
  store <16 x i32> %data, ptr %out.vec, align 64
  ret void
}

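; vf8: 8 bits -> <32 x i1> (two zmm loads). AVX512F/DQ need two vpermd+test
; pairs; AVX512BW replicates all 32 lanes at word granularity with one vpermw
; and splits the 32-bit k-register with kshiftrd $16.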
define void @mask_replication_factor4_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor4_vf8:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor4_vf8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovb (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k2
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor4_vf8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovw (%rdi), %k0
; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpmovw2m %zmm0, %k1
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
  %data = call <32 x i32> @llvm.masked.load.v32i32.p0(ptr %in.vec, i32 64, <32 x i1> %tgt.mask, <32 x i32> poison)
  store <32 x i32> %data, ptr %out.vec, align 64
  ret void
}

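; vf16: 16 bits -> <64 x i1> (four zmm loads). AVX512F/DQ use four vpermd
; patterns; AVX512BW-ONLY replicates at byte granularity via vshufi64x2+vpshufb,
; and AVX512VBMI-ONLY folds that into a single vpermb. The 64-bit k-register is
; carved up with kshiftrq $32 / kshiftrd $16.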
define void @mask_replication_factor4_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor4_vf16:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k3
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k4
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k3} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor4_vf16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k2
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k3
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k4
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k3} {z}
; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf16:
; AVX512BW-ONLY:       # %bb.0:
; AVX512BW-ONLY-NEXT:    kmovq (%rdi), %k0
; AVX512BW-ONLY-NEXT:    vpmovm2b %k0, %zmm0
; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-ONLY-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63]
; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k1
; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-ONLY-NEXT:    vzeroupper
; AVX512BW-ONLY-NEXT:    retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf16:
; AVX512VBMI-ONLY:       # %bb.0:
; AVX512VBMI-ONLY-NEXT:    kmovq (%rdi), %k0
; AVX512VBMI-ONLY-NEXT:    vpmovm2b %k0, %zmm0
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k1
; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512VBMI-ONLY-NEXT:    vzeroupper
; AVX512VBMI-ONLY-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
  %data = call <64 x i32> @llvm.masked.load.v64i32.p0(ptr %in.vec, i32 64, <64 x i1> %tgt.mask, <64 x i32> poison)
  store <64 x i32> %data, ptr %out.vec, align 64
  ret void
}

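; vf32: 32 bits -> <128 x i1> (eight zmm loads). AVX512F/DQ read the mask as
; two kmovw halves and keep eight write-masks live at once; since only k1-k7
; can be used as load predicates, one mask is spilled to the stack and
; reloaded. AVX512BW reads all 32 bits with one kmovd and builds two 64-bit
; k-registers.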
define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor4_vf32:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k4
; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm2
; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm3
; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k2
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm3, %zmm4
; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k3
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm4, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k5
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k4
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k6
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm3, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k7
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm4, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k7} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k6} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k4} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k5} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512F-ONLY-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor4_vf32:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    kmovw 2(%rdi), %k1
; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm3
; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k2
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm3, %zmm4
; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k3
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm4, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k4
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k5
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k6
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm3, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k7
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm4, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k7} {z}
; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k6} {z}
; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
; AVX512DQ-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf32:
; AVX512BW-ONLY:       # %bb.0:
; AVX512BW-ONLY-NEXT:    kmovd (%rdi), %k0
; AVX512BW-ONLY-NEXT:    vpmovm2b %k0, %zmm0
; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
; AVX512BW-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm1, %zmm1
; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm0, %zmm0
; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k2
; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k2, %k3
; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k2, %k2
; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k2, %k3
; AVX512BW-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-ONLY-NEXT:    vzeroupper
; AVX512BW-ONLY-NEXT:    retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf32:
; AVX512VBMI-ONLY:       # %bb.0:
; AVX512VBMI-ONLY-NEXT:    kmovd (%rdi), %k0
; AVX512VBMI-ONLY-NEXT:    vpmovm2b %k0, %zmm0
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,17,17,17,17,18,18,18,18,19,19,19,19,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,24,24,24,24,25,25,25,25,26,26,26,26,27,27,27,27,28,28,28,28,29,29,29,29,30,30,30,30,31,31,31,31]
; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm1
; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm1, %k1
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k2
; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k2, %k3
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k2, %k2
; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k2, %k3
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512VBMI-ONLY-NEXT:    vzeroupper
; AVX512VBMI-ONLY-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
  %data = call <128 x i32> @llvm.masked.load.v128i32.p0(ptr %in.vec, i32 64, <128 x i1> %tgt.mask, <128 x i32> poison)
  store <128 x i32> %data, ptr %out.vec, align 64
  ret void
}

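; vf64: the full 64-bit mask -> <256 x i1>, sixteen zmm loads (1 KiB of data).
; AVX512F/DQ keep all sixteen replicated masks in zmm registers and convert
; each to a k-register (vptestmd / vpmovd2m) immediately before its load, so
; no k-register spills are needed.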
define void @mask_replication_factor4_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor4_vf64:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    kmovw 4(%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm3, %zmm4
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm5, %zmm6
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm7, %zmm8
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm9, %zmm0
; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm3, %zmm10
; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm5, %zmm11
; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm7, %zmm12
; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm9, %zmm1
; AVX512F-ONLY-NEXT:    vpermd %zmm2, %zmm3, %zmm13
; AVX512F-ONLY-NEXT:    vpermd %zmm2, %zmm5, %zmm14
; AVX512F-ONLY-NEXT:    vpermd %zmm2, %zmm7, %zmm15
; AVX512F-ONLY-NEXT:    vpermd %zmm2, %zmm9, %zmm2
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm16, %zmm16, %zmm16 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm3, %zmm3
; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm5, %zmm5
; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm7, %zmm7
; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm9, %zmm9
; AVX512F-ONLY-NEXT:    vptestmd %zmm9, %zmm9, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm9 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm7, %zmm7, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm15, %zmm15, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm15 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm14, %zmm14, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm14 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm13, %zmm13, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm13 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 512(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm12, %zmm12, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 576(%rsi), %zmm12 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm11, %zmm11, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 640(%rsi), %zmm11 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm10, %zmm10, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 704(%rsi), %zmm10 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 768(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm8, %zmm8, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 832(%rsi), %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 896(%rsi), %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 960(%rsi), %zmm4 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 960(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 896(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm8, 832(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 768(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm10, 704(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm11, 640(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm12, 576(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 512(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm13, 448(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm14, 384(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm15, 320(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 256(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 128(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm7, 64(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm9, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor4_vf64:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw 6(%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    kmovw 4(%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm2
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm3, %zmm4
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm5, %zmm6
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm7, %zmm8
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm9, %zmm0
; AVX512DQ-NEXT:    vpermd %zmm1, %zmm3, %zmm10
; AVX512DQ-NEXT:    vpermd %zmm1, %zmm5, %zmm11
; AVX512DQ-NEXT:    vpermd %zmm1, %zmm7, %zmm12
; AVX512DQ-NEXT:    vpermd %zmm1, %zmm9, %zmm1
; AVX512DQ-NEXT:    vpermd %zmm2, %zmm3, %zmm13
; AVX512DQ-NEXT:    vpermd %zmm2, %zmm5, %zmm14
; AVX512DQ-NEXT:    vpermd %zmm2, %zmm7, %zmm15
; AVX512DQ-NEXT:    vpermd %zmm2, %zmm9, %zmm2
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm16
; AVX512DQ-NEXT:    vpermd %zmm16, %zmm3, %zmm3
; AVX512DQ-NEXT:    vpermd %zmm16, %zmm5, %zmm5
; AVX512DQ-NEXT:    vpermd %zmm16, %zmm7, %zmm7
; AVX512DQ-NEXT:    vpermd %zmm16, %zmm9, %zmm9
; AVX512DQ-NEXT:    vpmovd2m %zmm9, %k1
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm9 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm7, %k1
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm5, %k1
; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k1
; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm2 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm15, %k1
; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm15 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm14, %k1
; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm14 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm13, %k1
; AVX512DQ-NEXT:    vmovdqa32 448(%rsi), %zmm13 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT:    vmovdqa32 512(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm12, %k1
; AVX512DQ-NEXT:    vmovdqa32 576(%rsi), %zmm12 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm11, %k1
; AVX512DQ-NEXT:    vmovdqa32 640(%rsi), %zmm11 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm10, %k1
; AVX512DQ-NEXT:    vmovdqa32 704(%rsi), %zmm10 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT:    vmovdqa32 768(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm8, %k1
; AVX512DQ-NEXT:    vmovdqa32 832(%rsi), %zmm8 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm6, %k1
; AVX512DQ-NEXT:    vmovdqa32 896(%rsi), %zmm6 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k1
; AVX512DQ-NEXT:    vmovdqa32 960(%rsi), %zmm4 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 960(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 896(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 832(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 768(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 704(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm11, 640(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm12, 576(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 512(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm13, 448(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 384(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm15, 320(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 256(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 128(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 64(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm9, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf64:
; AVX512BW-ONLY:       # %bb.0:
; AVX512BW-ONLY-NEXT:    kmovq (%rdi), %k0
; AVX512BW-ONLY-NEXT:    vpmovm2b %k0, %zmm0
; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[6,7,6,7,6,7,6,7]
; AVX512BW-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm1, %zmm1
; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5]
; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm1, %zmm1
; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm1, %k2
; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm1, %zmm1
; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm1, %k3
; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm0, %zmm0
; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k4
; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k4, %k5
; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k5} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k4} {z}
; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k4, %k4
; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k4, %k5
; AVX512BW-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k5} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k4} {z}
; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k3, %k4
; AVX512BW-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k3, %k3
; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k3, %k4
; AVX512BW-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k4} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k3} {z}
; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k2, %k3
; AVX512BW-ONLY-NEXT:    vmovdqa32 576(%rsi), %zmm8 {%k3} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa32 512(%rsi), %zmm9 {%k2} {z}
; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k2, %k2
; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k2, %k3
; AVX512BW-ONLY-NEXT:    vmovdqa32 704(%rsi), %zmm10 {%k3} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT:    vmovdqa32 832(%rsi), %zmm12 {%k2} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa32 768(%rsi), %zmm13 {%k1} {z}
; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT:    vmovdqa32 960(%rsi), %zmm14 {%k2} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa32 896(%rsi), %zmm15 {%k1} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm15, 896(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm14, 960(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm13, 768(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm12, 832(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm11, 640(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm10, 704(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm9, 512(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm8, 576(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-ONLY-NEXT:    vzeroupper
; AVX512BW-ONLY-NEXT:    retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf64:
; AVX512VBMI-ONLY:       # %bb.0:
; AVX512VBMI-ONLY-NEXT:    kmovq (%rdi), %k0
; AVX512VBMI-ONLY-NEXT:    vpmovm2b %k0, %zmm0
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [48,48,48,48,49,49,49,49,50,50,50,50,51,51,51,51,52,52,52,52,53,53,53,53,54,54,54,54,55,55,55,55,56,56,56,56,57,57,57,57,58,58,58,58,59,59,59,59,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63]
; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm1
; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm1, %k1
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [32,32,32,32,33,33,33,33,34,34,34,34,35,35,35,35,36,36,36,36,37,37,37,37,38,38,38,38,39,39,39,39,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,44,44,44,44,45,45,45,45,46,46,46,46,47,47,47,47]
; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm1
; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm1, %k2
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,17,17,17,17,18,18,18,18,19,19,19,19,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,24,24,24,24,25,25,25,25,26,26,26,26,27,27,27,27,28,28,28,28,29,29,29,29,30,30,30,30,31,31,31,31]
; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm1
; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm1, %k3
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k4
; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k4, %k5
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k5} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k4} {z}
; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k4, %k4
; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k4, %k5
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k5} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k4} {z}
; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k3, %k4
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k3, %k3
; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k3, %k4
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k4} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k3} {z}
; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k2, %k3
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 576(%rsi), %zmm8 {%k3} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 512(%rsi), %zmm9 {%k2} {z}
; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k2, %k2
; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k2, %k3
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 704(%rsi), %zmm10 {%k3} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 832(%rsi), %zmm12 {%k2} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 768(%rsi), %zmm13 {%k1} {z}
; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 960(%rsi), %zmm14 {%k2} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 896(%rsi), %zmm15 {%k1} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm15, 896(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm14, 960(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm13, 768(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm12, 832(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm11, 640(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm10, 704(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm9, 512(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm8, 576(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512VBMI-ONLY-NEXT:    vzeroupper
; AVX512VBMI-ONLY-NEXT:    retq
  %src.mask = load <64 x i1>, ptr %in.maskvec, align 64
  %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
  %data = call <256 x i32> @llvm.masked.load.v256i32.p0(ptr %in.vec, i32 64, <256 x i1> %tgt.mask, <256 x i32> poison)
  store <256 x i32> %data, ptr %out.vec, align 64
  ret void
}

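; Factor-5 tests: replication by 5 yields non-power-of-two lane counts, so the
; lowerings clamp the compare with an immediate lane mask.
; vf2: 2 bits -> <10 x i1>; movw $1023 (0x3FF) keeps 10 lanes, and the result
; is stored as a 32-byte ymm plus an 8-byte vmovq of lanes 8-9.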
define void @mask_replication_factor5_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor5_vf2:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512F-ONLY-NEXT:    movw $1023, %ax # imm = 0x3FF
; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
; AVX512F-ONLY-NEXT:    vmovq %xmm1, 32(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor5_vf2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT:    movw $1023, %ax # imm = 0x3FF
; AVX512DQ-NEXT:    kmovw %eax, %k1
; AVX512DQ-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1 {%k1}
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
; AVX512DQ-NEXT:    vmovq %xmm1, 32(%rdx)
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor5_vf2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovw (%rdi), %k1
; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512BW-NEXT:    movw $1023, %ax # imm = 0x3FF
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
; AVX512BW-NEXT:    vmovq %xmm1, 32(%rdx)
; AVX512BW-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
  %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
  %data = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr %in.vec, i32 64, <10 x i1> %tgt.mask, <10 x i32> poison)
  %data.padded = shufflevector <10 x i32> %data, <10 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  store <10 x i32> %data, ptr %out.vec, align 64
  ret void
}

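; vf4: 4 bits -> <20 x i1> = one full zmm mask plus 4 tail lanes. AVX512F/DQ
; build the tail by broadcasting mask bit 3 with vpshufd under a movw $15 lane
; mask; AVX512BW does all 20 lanes in one vpermw under a 0xFFFFF mask.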
define void @mask_replication_factor5_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor5_vf4:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX512F-ONLY-NEXT:    vpslld $31, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    movw $15, %ax
; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1 {%k1}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa %xmm0, 64(%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor5_vf4:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    movw $15, %ax
; AVX512DQ-NEXT:    kmovw %eax, %k1
; AVX512DQ-NEXT:    vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k2
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-NEXT:    vmovdqa %xmm0, 64(%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor5_vf4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovd (%rdi), %k0
; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    movl $1048575, %eax # imm = 0xFFFFF
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vpcmpgtw %zmm0, %zmm1, %k1 {%k1}
; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512BW-NEXT:    vmovdqa %xmm0, 64(%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
  %data = call <20 x i32> @llvm.masked.load.v20i32.p0(ptr %in.vec, i32 64, <20 x i1> %tgt.mask, <20 x i32> poison)
  %data.padded = shufflevector <20 x i32> %data, <20 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  store <20 x i32> %data, ptr %out.vec, align 64
  ret void
}

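; Factor 5, vf8: the 8 source mask bits expand to a <40 x i1> mask; the 40
; loaded i32 elements are written back as two zmm stores plus one ymm store
; (64 + 64 + 32 bytes).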
define void @mask_replication_factor5_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor5_vf8:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; AVX512F-ONLY-NEXT:    movw $1, %ax
; AVX512F-ONLY-NEXT:    kmovw %eax, %k2
; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k2}
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k3
; AVX512F-ONLY-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-ONLY-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [6,6,6,7,7,7,7,7]
; AVX512F-ONLY-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512F-ONLY-NEXT:    vptestmd %ymm0, %ymm0, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm2 {%k3} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa %ymm1, 128(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor5_vf8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovb (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm1
; AVX512DQ-NEXT:    movw $1, %ax
; AVX512DQ-NEXT:    kmovw %eax, %k1
; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k2
; AVX512DQ-NEXT:    vpmovm2d %k0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [6,6,6,7,7,7,7,7]
; AVX512DQ-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT:    vpmovd2m %ymm0, %k3
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm1 {%k3} {z}
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm2 {%k2} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
; AVX512DQ-NEXT:    vmovdqa %ymm1, 128(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-ONLY-LABEL: mask_replication_factor5_vf8:
; AVX512BW-ONLY:       # %bb.0:
; AVX512BW-ONLY-NEXT:    kmovw (%rdi), %k0
; AVX512BW-ONLY-NEXT:    vpmovm2b %k0, %zmm0
; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-ONLY-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,19,19,19,19,20,20,20,20,20,21,21,21,21,21,22,22,38,38,38,39,39,39,39,39,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-ONLY-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-ONLY-NEXT:    movabsq $1099511627775, %rax # imm = 0xFFFFFFFFFF
; AVX512BW-ONLY-NEXT:    kmovq %rax, %k1
; AVX512BW-ONLY-NEXT:    vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k2
; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm1 {%k2} {z}
; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k1
; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm2 {%k1} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa %ymm1, 128(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512BW-ONLY-NEXT:    vzeroupper
; AVX512BW-ONLY-NEXT:    retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor5_vf8:
; AVX512VBMI-ONLY:       # %bb.0:
; AVX512VBMI-ONLY-NEXT:    kmovw (%rdi), %k0
; AVX512VBMI-ONLY-NEXT:    vpmovm2b %k0, %zmm0
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6,6,6,6,7,7,7,7,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VBMI-ONLY-NEXT:    movabsq $1099511627775, %rax # imm = 0xFFFFFFFFFF
; AVX512VBMI-ONLY-NEXT:    kmovq %rax, %k1
; AVX512VBMI-ONLY-NEXT:    vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k2
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm1 {%k2} {z}
; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k1
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm2 {%k1} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa %ymm1, 128(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512VBMI-ONLY-NEXT:    vzeroupper
; AVX512VBMI-ONLY-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
  %data = call <40 x i32> @llvm.masked.load.v40i32.p0(ptr %in.vec, i32 64, <40 x i1> %tgt.mask, <40 x i32> poison)
  %data.padded = shufflevector <40 x i32> %data, <40 x i32> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  store <40 x i32> %data, ptr %out.vec, align 64
  ret void
}

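; Factor 5, vf16: 16 source bits expand to an <80 x i1> mask; five vpermd
; shuffles splat the bits into 16-lane chunks (with a movw $1 fixup for
; lane 0 on the AVX512F/DQ paths) and the result is stored as five full
; zmm vectors.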
define void @mask_replication_factor5_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor5_vf16:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT:    movw $1, %ax
; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k3
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k4
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k5
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm1 {%k5} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k4} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k3} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm4 {%k2} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 64(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 256(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor5_vf16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT:    movw $1, %ax
; AVX512DQ-NEXT:    kmovw %eax, %k1
; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k2
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k3
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k4
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k5
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm1 {%k5} {z}
; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k4} {z}
; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k3} {z}
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm4 {%k2} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 64(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 256(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor5_vf16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovw (%rdi), %k1
; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm2, %zmm2
; AVX512BW-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm2 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm3, %zmm3
; AVX512BW-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm3 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm4, %zmm4
; AVX512BW-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm4 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm5, %zmm0
; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm4, 128(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm2, 256(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
  %data = call <80 x i32> @llvm.masked.load.v80i32.p0(ptr %in.vec, i32 64, <80 x i1> %tgt.mask, <80 x i32> poison)
  store <80 x i32> %data, ptr %out.vec, align 64
  ret void
}

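; Factor 5, vf32: 32 source bits expand to a <160 x i1> mask (ten zmm
; stores). On the AVX512BW path the ten 16-bit mask chunks are assembled
; bit by bit with kshift/kand/kor sequences, spilling mask constants to
; the stack.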
define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor5_vf32:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm2
; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT:    movw $1, %ax
; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm4
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm5, %zmm6
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm7, %zmm8
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm9, %zmm0
; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm2, %zmm2
; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm5, %zmm5
; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm7, %zmm7
; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm9, %zmm3
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm9 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 576(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm7, %zmm7, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 512(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm8, %zmm8, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 64(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 128(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 256(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 320(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 448(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm7, 512(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 576(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm9, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor5_vf32:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm2
; AVX512DQ-NEXT:    movw $1, %ax
; AVX512DQ-NEXT:    kmovw %eax, %k1
; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm3
; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm4
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm5, %zmm6
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm7, %zmm8
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm9, %zmm0
; AVX512DQ-NEXT:    vpermd %zmm3, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpermd %zmm3, %zmm2, %zmm2
; AVX512DQ-NEXT:    vpermd %zmm3, %zmm5, %zmm5
; AVX512DQ-NEXT:    vpermd %zmm3, %zmm7, %zmm7
; AVX512DQ-NEXT:    vpermd %zmm3, %zmm9, %zmm3
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm9 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k1
; AVX512DQ-NEXT:    vmovdqa32 576(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm7, %k1
; AVX512DQ-NEXT:    vmovdqa32 512(%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm5, %k1
; AVX512DQ-NEXT:    vmovdqa32 448(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm2 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm8, %k1
; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm6, %k1
; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k1
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 64(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 128(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 256(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 320(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 448(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 512(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 576(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm9, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor5_vf32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovd (%rdi), %k5
; AVX512BW-NEXT:    kshiftrd $1, %k5, %k1
; AVX512BW-NEXT:    movw $-3, %ax
; AVX512BW-NEXT:    kmovd %eax, %k6
; AVX512BW-NEXT:    kmovw (%rdi), %k2
; AVX512BW-NEXT:    kandw %k6, %k2, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $14, %k2, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    movw $-5, %ax
; AVX512BW-NEXT:    kmovd %eax, %k4
; AVX512BW-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k4, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $13, %k2, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    movw $-9, %ax
; AVX512BW-NEXT:    kmovd %eax, %k4
; AVX512BW-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k4, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $12, %k2, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    movw $-17, %ax
; AVX512BW-NEXT:    kmovd %eax, %k4
; AVX512BW-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k4, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $11, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k3, %k2
; AVX512BW-NEXT:    movw $-33, %ax
; AVX512BW-NEXT:    kmovd %eax, %k3
; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    movw $-65, %ax
; AVX512BW-NEXT:    kmovd %eax, %k3
; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    movw $-129, %ax
; AVX512BW-NEXT:    kmovd %eax, %k3
; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    movw $-257, %ax # imm = 0xFEFF
; AVX512BW-NEXT:    kmovd %eax, %k3
; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    movw $-513, %ax # imm = 0xFDFF
; AVX512BW-NEXT:    kmovd %eax, %k7
; AVX512BW-NEXT:    kandw %k7, %k2, %k2
; AVX512BW-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k2, %k1
; AVX512BW-NEXT:    movw $-1025, %ax # imm = 0xFBFF
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k3
; AVX512BW-NEXT:    kshiftrd $2, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k2
; AVX512BW-NEXT:    kshiftrw $5, %k2, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    movw $-2049, %ax # imm = 0xF7FF
; AVX512BW-NEXT:    kmovd %eax, %k4
; AVX512BW-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k4, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $4, %k2, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    movw $-4097, %ax # imm = 0xEFFF
; AVX512BW-NEXT:    kmovd %eax, %k4
; AVX512BW-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k4, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $3, %k2, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    movw $-8193, %ax # imm = 0xDFFF
; AVX512BW-NEXT:    kmovd %eax, %k4
; AVX512BW-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k4, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $2, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k3, %k2
; AVX512BW-NEXT:    movw $-16385, %ax # imm = 0xBFFF
; AVX512BW-NEXT:    kmovd %eax, %k3
; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k2, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $3, %k5, %k2
; AVX512BW-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT:    kshiftrd $29, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k2
; AVX512BW-NEXT:    kshiftrd $28, %k5, %k1
; AVX512BW-NEXT:    kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kandw %k6, %k1, %k3
; AVX512BW-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $14, %k2, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $13, %k2, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $12, %k2, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $11, %k2, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $10, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k3, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrd $30, %k5, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $9, %k3, %k4
; AVX512BW-NEXT:    korw %k4, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $8, %k3, %k4
; AVX512BW-NEXT:    korw %k4, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $7, %k3, %k4
; AVX512BW-NEXT:    korw %k4, %k2, %k2
; AVX512BW-NEXT:    kandw %k7, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $6, %k3, %k4
; AVX512BW-NEXT:    korw %k4, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $5, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrd $31, %k5, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k4
; AVX512BW-NEXT:    kshiftrw $4, %k4, %k7
; AVX512BW-NEXT:    korw %k7, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $3, %k4, %k7
; AVX512BW-NEXT:    korw %k7, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $2, %k4, %k7
; AVX512BW-NEXT:    korw %k7, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k2, %k2
; AVX512BW-NEXT:    kshiftlw $14, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftlw $1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $1, %k2, %k2
; AVX512BW-NEXT:    korw %k4, %k2, %k2
; AVX512BW-NEXT:    vmovdqa32 576(%rsi), %zmm1 {%k2} {z}
; AVX512BW-NEXT:    kshiftrd $25, %k5, %k2
; AVX512BW-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kandw %k6, %k2, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $14, %k2, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kandw %k0, %k3, %k3
; AVX512BW-NEXT:    kshiftrd $26, %k5, %k7
; AVX512BW-NEXT:    kshiftlw $15, %k7, %k7
; AVX512BW-NEXT:    kshiftrw $13, %k7, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $12, %k7, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $11, %k7, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $10, %k7, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $9, %k7, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrd $27, %k5, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $8, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $5, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $4, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k3, %k3
; AVX512BW-NEXT:    kshiftlw $14, %k0, %k1
; AVX512BW-NEXT:    korw %k1, %k3, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 512(%rsi), %zmm2 {%k1} {z}
; AVX512BW-NEXT:    kshiftrd $22, %k5, %k0
; AVX512BW-NEXT:    kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k6, %k6
; AVX512BW-NEXT:    kshiftrd $23, %k5, %k7
; AVX512BW-NEXT:    kmovq %k5, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k7, %k7
; AVX512BW-NEXT:    kshiftrw $12, %k7, %k5
; AVX512BW-NEXT:    korw %k5, %k6, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $11, %k7, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $10, %k7, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $9, %k7, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $8, %k7, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kandw %k2, %k5, %k5
; AVX512BW-NEXT:    kshiftrd $24, %k0, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $5, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kshiftrw $2, %k7, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kandw %k4, %k5, %k5
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT:    kshiftlw $14, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k5, %k2
; AVX512BW-NEXT:    kshiftlw $1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $1, %k2, %k2
; AVX512BW-NEXT:    korw %k7, %k2, %k2
; AVX512BW-NEXT:    vmovdqa32 448(%rsi), %zmm3 {%k2} {z}
; AVX512BW-NEXT:    kshiftrd $19, %k0, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k2, %k4
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k6
; AVX512BW-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k5
; AVX512BW-NEXT:    korw %k5, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k5
; AVX512BW-NEXT:    korw %k5, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $12, %k6, %k5
; AVX512BW-NEXT:    korw %k5, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k4, %k4
; AVX512BW-NEXT:    kshiftrd $20, %k0, %k5
; AVX512BW-NEXT:    kshiftlw $15, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $11, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $10, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $9, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $8, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $7, %k5, %k5
; AVX512BW-NEXT:    korw %k5, %k4, %k4
; AVX512BW-NEXT:    kandw %k1, %k4, %k4
; AVX512BW-NEXT:    kshiftrd $21, %k0, %k5
; AVX512BW-NEXT:    kshiftlw $15, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $6, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kandw %k3, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $5, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $4, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $3, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $2, %k5, %k5
; AVX512BW-NEXT:    korw %k5, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k4, %k4
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k4, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 384(%rsi), %zmm4 {%k1} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k0, %k1
; AVX512BW-NEXT:    kandw %k7, %k1, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k1, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k3, %k1
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $17, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $10, %k3, %k4
; AVX512BW-NEXT:    korw %k4, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k3, %k4
; AVX512BW-NEXT:    korw %k4, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k3, %k4
; AVX512BW-NEXT:    korw %k4, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k3, %k4
; AVX512BW-NEXT:    korw %k4, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $18, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k4
; AVX512BW-NEXT:    kshiftrw $5, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k4, %k4
; AVX512BW-NEXT:    korw %k4, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm5 {%k1} {z}
; AVX512BW-NEXT:    kshiftrd $13, %k0, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $12, %k0, %k3
; AVX512BW-NEXT:    kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k3, %k2
; AVX512BW-NEXT:    kshiftrw $14, %k1, %k4
; AVX512BW-NEXT:    korw %k4, %k2, %k2
; AVX512BW-NEXT:    kandw %k6, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k4
; AVX512BW-NEXT:    korw %k4, %k2, %k2
; AVX512BW-NEXT:    kandw %k7, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k4
; AVX512BW-NEXT:    korw %k4, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k4
; AVX512BW-NEXT:    korw %k4, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k2, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $14, %k0, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $9, %k2, %k4
; AVX512BW-NEXT:    korw %k4, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k2, %k4
; AVX512BW-NEXT:    korw %k4, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k2, %k4
; AVX512BW-NEXT:    korw %k4, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k2, %k4
; AVX512BW-NEXT:    korw %k4, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $15, %k0, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k4
; AVX512BW-NEXT:    kshiftrw $4, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k4, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm6 {%k1} {z}
; AVX512BW-NEXT:    kshiftrd $9, %k0, %k2
; AVX512BW-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k4
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $14, %k1, %k5
; AVX512BW-NEXT:    korw %k5, %k4, %k4
; AVX512BW-NEXT:    kandw %k6, %k4, %k4
; AVX512BW-NEXT:    kshiftrd $10, %k0, %k5
; AVX512BW-NEXT:    kshiftlw $15, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $13, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $12, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kandw %k7, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $11, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $10, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $9, %k5, %k5
; AVX512BW-NEXT:    korw %k5, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k4, %k4
; AVX512BW-NEXT:    kshiftrd $11, %k0, %k5
; AVX512BW-NEXT:    kshiftlw $15, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $8, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $7, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $6, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $5, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $4, %k5, %k5
; AVX512BW-NEXT:    korw %k5, %k4, %k4
; AVX512BW-NEXT:    kandw %k3, %k4, %k4
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload
; AVX512BW-NEXT:    kshiftlw $15, %k7, %k5
; AVX512BW-NEXT:    kshiftrw $3, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $2, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k4, %k4
; AVX512BW-NEXT:    kshiftlw $14, %k7, %k3
; AVX512BW-NEXT:    korw %k3, %k4, %k3
; AVX512BW-NEXT:    kshiftlw $1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $1, %k3, %k3
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm7 {%k3} {z}
; AVX512BW-NEXT:    kshiftrd $6, %k0, %k4
; AVX512BW-NEXT:    kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k4, %k5
; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
; AVX512BW-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $14, %k4, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $13, %k4, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k5, %k5
; AVX512BW-NEXT:    kshiftrd $7, %k0, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $12, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $11, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $10, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $9, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $8, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k5, %k5
; AVX512BW-NEXT:    kshiftrd $8, %k0, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $5, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kandw %k2, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kshiftrw $2, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT:    kshiftlw $14, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k5, %k2
; AVX512BW-NEXT:    kshiftlw $1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $1, %k2, %k2
; AVX512BW-NEXT:    korw %k1, %k2, %k1
; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm8 {%k1} {z}
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kshiftrw $14, %k5, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k5, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k5, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $4, %k0, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $11, %k2, %k5
; AVX512BW-NEXT:    korw %k5, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k2, %k5
; AVX512BW-NEXT:    korw %k5, %k1, %k1
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k2, %k5
; AVX512BW-NEXT:    korw %k5, %k1, %k1
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k2, %k5
; AVX512BW-NEXT:    korw %k5, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $5, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kandw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    korw %k1, %k0, %k1
; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm9 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa64 %zmm9, 64(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm8, 128(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm7, 192(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm6, 256(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm5, 320(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm4, 384(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm3, 448(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm2, 512(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm1, 576(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
  %data = call <160 x i32> @llvm.masked.load.v160i32.p0(ptr %in.vec, i32 64, <160 x i1> %tgt.mask, <160 x i32> poison)
  store <160 x i32> %data, ptr %out.vec, align 64
  ret void
}

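; Factor 5, vf64: the full 64-bit source mask expands to a <320 x i1> mask;
; the AVX512F lowering reads the mask in 16-bit chunks (at offsets 0, 2, 4
; and 6 from %rdi) and appears to reuse the same five vpermd replication
; patterns for each chunk.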
define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor5_vf64:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm3, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT:    movw $1, %ax
; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
; AVX512F-ONLY-NEXT:    kmovw 4(%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm7, %zmm1
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm8, %zmm2
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm9, %zmm10
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm11, %zmm12
; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm3, %zmm4
; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm7, %zmm13
; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm8, %zmm14
; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm9, %zmm15
; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm11, %zmm16
; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm3, %zmm5
; AVX512F-ONLY-NEXT:    vpermd %zmm6, %zmm7, %zmm17
; AVX512F-ONLY-NEXT:    vpermd %zmm6, %zmm8, %zmm18
; AVX512F-ONLY-NEXT:    vpermd %zmm6, %zmm9, %zmm19
; AVX512F-ONLY-NEXT:    vpermd %zmm6, %zmm3, %zmm3
; AVX512F-ONLY-NEXT:    vpermd %zmm6, %zmm11, %zmm6
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm7, %zmm7
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm8, %zmm8
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm9, %zmm9
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm11, %zmm0
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm11 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm9, %zmm9, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm9 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm8, %zmm8, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm7, %zmm7, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm19, %zmm19, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm19 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm18, %zmm18, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 512(%rsi), %zmm18 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm17, %zmm17, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 576(%rsi), %zmm17 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 640(%rsi), %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm16, %zmm16, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 704(%rsi), %zmm16 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm15, %zmm15, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 768(%rsi), %zmm15 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm14, %zmm14, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 832(%rsi), %zmm14 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm13, %zmm13, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 896(%rsi), %zmm13 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 960(%rsi), %zmm4 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm12, %zmm12, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1024(%rsi), %zmm12 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm10, %zmm10, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1088(%rsi), %zmm10 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1152(%rsi), %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1216(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 1216(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 1152(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm10, 1088(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm12, 1024(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 960(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm13, 896(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm14, 832(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm15, 768(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm16, 704(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 640(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm17, 576(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm18, 512(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm19, 448(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 384(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 320(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm7, 256(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm9, 128(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm11, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor5_vf64:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm3, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT:    movw $1, %ax
; AVX512DQ-NEXT:    kmovw %eax, %k1
; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-NEXT:    kmovw 6(%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm4
; AVX512DQ-NEXT:    kmovw 4(%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm5
; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm6
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
; AVX512DQ-NEXT:    vpermd %zmm4, %zmm7, %zmm1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
; AVX512DQ-NEXT:    vpermd %zmm4, %zmm8, %zmm2
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
; AVX512DQ-NEXT:    vpermd %zmm4, %zmm9, %zmm10
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
; AVX512DQ-NEXT:    vpermd %zmm4, %zmm11, %zmm12
; AVX512DQ-NEXT:    vpermd %zmm4, %zmm3, %zmm4
; AVX512DQ-NEXT:    vpermd %zmm5, %zmm7, %zmm13
; AVX512DQ-NEXT:    vpermd %zmm5, %zmm8, %zmm14
; AVX512DQ-NEXT:    vpermd %zmm5, %zmm9, %zmm15
; AVX512DQ-NEXT:    vpermd %zmm5, %zmm11, %zmm16
; AVX512DQ-NEXT:    vpermd %zmm5, %zmm3, %zmm5
; AVX512DQ-NEXT:    vpermd %zmm6, %zmm7, %zmm17
; AVX512DQ-NEXT:    vpermd %zmm6, %zmm8, %zmm18
; AVX512DQ-NEXT:    vpermd %zmm6, %zmm9, %zmm19
; AVX512DQ-NEXT:    vpermd %zmm6, %zmm3, %zmm3
; AVX512DQ-NEXT:    vpermd %zmm6, %zmm11, %zmm6
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm7, %zmm7
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm8, %zmm8
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm9, %zmm9
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm11, %zmm0
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm11 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm9, %k1
; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm9 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm8, %k1
; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm7, %k1
; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k1
; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm6, %k1
; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm19, %k1
; AVX512DQ-NEXT:    vmovdqa32 448(%rsi), %zmm19 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm18, %k1
; AVX512DQ-NEXT:    vmovdqa32 512(%rsi), %zmm18 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm17, %k1
; AVX512DQ-NEXT:    vmovdqa32 576(%rsi), %zmm17 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm5, %k1
; AVX512DQ-NEXT:    vmovdqa32 640(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm16, %k1
; AVX512DQ-NEXT:    vmovdqa32 704(%rsi), %zmm16 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm15, %k1
; AVX512DQ-NEXT:    vmovdqa32 768(%rsi), %zmm15 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm14, %k1
; AVX512DQ-NEXT:    vmovdqa32 832(%rsi), %zmm14 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm13, %k1
; AVX512DQ-NEXT:    vmovdqa32 896(%rsi), %zmm13 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k1
; AVX512DQ-NEXT:    vmovdqa32 960(%rsi), %zmm4 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm12, %k1
; AVX512DQ-NEXT:    vmovdqa32 1024(%rsi), %zmm12 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm10, %k1
; AVX512DQ-NEXT:    vmovdqa32 1088(%rsi), %zmm10 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT:    vmovdqa32 1152(%rsi), %zmm2 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT:    vmovdqa32 1216(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 1216(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 1152(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 1088(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm12, 1024(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 960(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm13, 896(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 832(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm15, 768(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 704(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 640(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm17, 576(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm18, 512(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm19, 448(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 384(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 320(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 256(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 128(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm11, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor5_vf64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovq (%rdi), %k5
; AVX512BW-NEXT:    kshiftrq $1, %k5, %k0
; AVX512BW-NEXT:    movw $-3, %ax
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kmovw (%rdi), %k2
; AVX512BW-NEXT:    kandw %k1, %k2, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $14, %k2, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    movw $-5, %ax
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $13, %k2, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    movw $-9, %ax
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $12, %k2, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    movw $-17, %ax
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $11, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k3, %k2
; AVX512BW-NEXT:    movw $-33, %ax
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    movw $-65, %ax
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    movw $-129, %ax
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    movw $-257, %ax # imm = 0xFEFF
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    movw $-513, %ax # imm = 0xFDFF
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k2, %k0
; AVX512BW-NEXT:    movw $-1025, %ax # imm = 0xFBFF
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k1, %k0, %k3
; AVX512BW-NEXT:    kshiftrq $2, %k5, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k2
; AVX512BW-NEXT:    kshiftrw $5, %k2, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    movw $-2049, %ax # imm = 0xF7FF
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $4, %k2, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    movw $-4097, %ax # imm = 0xEFFF
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $3, %k2, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k7
; AVX512BW-NEXT:    movw $-8193, %ax # imm = 0xDFFF
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k1, %k7, %k7
; AVX512BW-NEXT:    kshiftrw $2, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k7, %k7
; AVX512BW-NEXT:    movw $-16385, %ax # imm = 0xBFFF
; AVX512BW-NEXT:    kmovd %eax, %k6
; AVX512BW-NEXT:    kandw %k6, %k7, %k7
; AVX512BW-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftlw $14, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k7, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $3, %k5, %k7
; AVX512BW-NEXT:    kshiftlw $15, %k7, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k7, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $4, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $5, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $6, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k6
; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k6} {z}
; AVX512BW-NEXT:    kandw %k2, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k7, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k7, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $7, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $8, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $9, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $10, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $11, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $12, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    korw %k6, %k0, %k6
; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm3 {%k6} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftrq $13, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $14, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $15, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    korw %k6, %k0, %k1
; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm4 {%k1} {z}
; AVX512BW-NEXT:    kshiftrq $16, %k5, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $17, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $18, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    kshiftrw $5, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $19, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm5 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $20, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $21, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $22, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 384(%rsi), %zmm6 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $23, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $24, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $25, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 448(%rsi), %zmm7 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $26, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $27, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $28, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    korw %k6, %k0, %k6
; AVX512BW-NEXT:    vmovdqa32 512(%rsi), %zmm8 {%k6} {z}
; AVX512BW-NEXT:    kandw %k2, %k1, %k0
; AVX512BW-NEXT:    kshiftrq $29, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $30, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $31, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    korw %k6, %k0, %k1
; AVX512BW-NEXT:    vmovdqa32 576(%rsi), %zmm9 {%k1} {z}
; AVX512BW-NEXT:    kshiftrq $32, %k5, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $33, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $34, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    kshiftrw $5, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $35, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 640(%rsi), %zmm10 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $36, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $37, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $38, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 704(%rsi), %zmm11 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $39, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $40, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $41, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 768(%rsi), %zmm12 {%k7} {z}
; AVX512BW-NEXT:    kandw %k3, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $42, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $43, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $44, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    korw %k6, %k0, %k6
; AVX512BW-NEXT:    vmovdqa32 832(%rsi), %zmm13 {%k6} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftrq $45, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $46, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $47, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    korw %k6, %k0, %k1
; AVX512BW-NEXT:    vmovdqa32 896(%rsi), %zmm14 {%k1} {z}
; AVX512BW-NEXT:    kshiftrq $48, %k5, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $49, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $50, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    kshiftrw $5, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $51, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 960(%rsi), %zmm15 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $52, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $53, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $54, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 1024(%rsi), %zmm16 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $55, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $56, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $57, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 1088(%rsi), %zmm17 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $58, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $59, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $60, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    korw %k6, %k0, %k6
; AVX512BW-NEXT:    vmovdqa32 1152(%rsi), %zmm18 {%k6} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftrq $61, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $62, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $63, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k5, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k4
; AVX512BW-NEXT:    korw %k4, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $14, %k5, %k2
; AVX512BW-NEXT:    korw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    korw %k1, %k0, %k1
; AVX512BW-NEXT:    vmovdqa32 1216(%rsi), %zmm19 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa64 %zmm19, 1216(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm18, 1152(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm17, 1088(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm16, 1024(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm15, 960(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm14, 896(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm13, 832(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm12, 768(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm11, 704(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm10, 640(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm9, 576(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm8, 512(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm7, 448(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm6, 384(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm5, 320(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm4, 256(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask = load <64 x i1>, ptr %in.maskvec, align 64
  %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
  %data = call <320 x i32> @llvm.masked.load.v320i32.p0(ptr %in.vec, i32 64, <320 x i1> %tgt.mask, <320 x i32> poison)
  store <320 x i32> %data, ptr %out.vec, align 64
  ret void
}

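; Replication factor 6, vf 2: each of the 2 source mask bits is repeated 6
; times to form the <12 x i1> mask for the <12 x i32> masked load below.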
define void @mask_replication_factor6_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor6_vf2:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512F-ONLY-NEXT:    movw $4095, %ax # imm = 0xFFF
; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vextracti32x4 $2, %zmm0, 32(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor6_vf2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT:    movw $4095, %ax # imm = 0xFFF
; AVX512DQ-NEXT:    kmovw %eax, %k1
; AVX512DQ-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1 {%k1}
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm0, 32(%rdx)
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor6_vf2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovw (%rdi), %k1
; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512BW-NEXT:    movw $4095, %ax # imm = 0xFFF
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, 32(%rdx)
; AVX512BW-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
  %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %data = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %in.vec, i32 64, <12 x i1> %tgt.mask, <12 x i32> poison)
  %data.padded = shufflevector <12 x i32> %data, <12 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
  store <12 x i32> %data, ptr %out.vec, align 64
  ret void
}

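; Replication factor 6, vf 4: the 4 source mask bits expand to a <24 x i1>
; mask for a <24 x i32> masked load. Note the AVX512BW path performs the whole
; expansion with a single vpermw on word elements in one zmm register.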
define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-SLOW-LABEL: mask_replication_factor6_vf4:
; AVX512F-SLOW:       # %bb.0:
; AVX512F-SLOW-NEXT:    kmovw (%rdi), %k1
; AVX512F-SLOW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1]
; AVX512F-SLOW-NEXT:    vpslld $31, %zmm1, %zmm1
; AVX512F-SLOW-NEXT:    movw $255, %ax
; AVX512F-SLOW-NEXT:    kmovw %eax, %k1
; AVX512F-SLOW-NEXT:    vptestmd %zmm1, %zmm1, %k1 {%k1}
; AVX512F-SLOW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512F-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-SLOW-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-SLOW-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512F-SLOW-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512F-SLOW-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, 64(%rdx)
; AVX512F-SLOW-NEXT:    vzeroupper
; AVX512F-SLOW-NEXT:    retq
;
; AVX512F-FAST-LABEL: mask_replication_factor6_vf4:
; AVX512F-FAST:       # %bb.0:
; AVX512F-FAST-NEXT:    kmovw (%rdi), %k1
; AVX512F-FAST-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3]
; AVX512F-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm1
; AVX512F-FAST-NEXT:    vpslld $31, %zmm1, %zmm1
; AVX512F-FAST-NEXT:    movw $255, %ax
; AVX512F-FAST-NEXT:    kmovw %eax, %k1
; AVX512F-FAST-NEXT:    vptestmd %zmm1, %zmm1, %k1 {%k1}
; AVX512F-FAST-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512F-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-FAST-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-FAST-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512F-FAST-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512F-FAST-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512F-FAST-NEXT:    vmovdqa %ymm0, 64(%rdx)
; AVX512F-FAST-NEXT:    vzeroupper
; AVX512F-FAST-NEXT:    retq
;
; AVX512DQ-SLOW-LABEL: mask_replication_factor6_vf4:
; AVX512DQ-SLOW:       # %bb.0:
; AVX512DQ-SLOW-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-SLOW-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1]
; AVX512DQ-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-SLOW-NEXT:    movw $255, %ax
; AVX512DQ-SLOW-NEXT:    kmovw %eax, %k1
; AVX512DQ-SLOW-NEXT:    vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
; AVX512DQ-SLOW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512DQ-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-SLOW-NEXT:    vpmovd2m %zmm0, %k2
; AVX512DQ-SLOW-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-SLOW-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm0, 64(%rdx)
; AVX512DQ-SLOW-NEXT:    vzeroupper
; AVX512DQ-SLOW-NEXT:    retq
;
; AVX512DQ-FAST-LABEL: mask_replication_factor6_vf4:
; AVX512DQ-FAST:       # %bb.0:
; AVX512DQ-FAST-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-FAST-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3]
; AVX512DQ-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm1
; AVX512DQ-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-FAST-NEXT:    movw $255, %ax
; AVX512DQ-FAST-NEXT:    kmovw %eax, %k1
; AVX512DQ-FAST-NEXT:    vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
; AVX512DQ-FAST-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512DQ-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-FAST-NEXT:    vpmovd2m %zmm0, %k2
; AVX512DQ-FAST-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-FAST-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-FAST-NEXT:    vmovdqa %ymm0, 64(%rdx)
; AVX512DQ-FAST-NEXT:    vzeroupper
; AVX512DQ-FAST-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor6_vf4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovd (%rdi), %k0
; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    movl $16777215, %eax # imm = 0xFFFFFF
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vpcmpgtw %zmm0, %zmm1, %k1 {%k1}
; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512BW-NEXT:    vmovdqa %ymm0, 64(%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %data = call <24 x i32> @llvm.masked.load.v24i32.p0(ptr %in.vec, i32 64, <24 x i1> %tgt.mask, <24 x i32> poison)
  %data.padded = shufflevector <24 x i32> %data, <24 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  store <24 x i32> %data, ptr %out.vec, align 64
  ret void
}

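; Replication factor 6, vf 8: 8 source mask bits expand to <48 x i1>, covering
; three zmm masked loads (offsets 0, 64 and 128 from %rsi).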
define void @mask_replication_factor6_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor6_vf8:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT:    movw $1, %ax
; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k3
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm1 {%k3} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm2 {%k2} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 128(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor6_vf8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovb (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT:    movw $1, %ax
; AVX512DQ-NEXT:    kmovw %eax, %k1
; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k2
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k3
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm1 {%k3} {z}
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm2 {%k2} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 128(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor6_vf8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovw (%rdi), %k1
; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm2, %zmm2
; AVX512BW-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %data = call <48 x i32> @llvm.masked.load.v48i32.p0(ptr %in.vec, i32 64, <48 x i1> %tgt.mask, <48 x i32> poison)
  store <48 x i32> %data, ptr %out.vec, align 64
  ret void
}

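; Replication factor 6, vf 16: 16 source mask bits expand to <96 x i1>; the
; lowering builds six 16-bit masks via six distinct vpermd index patterns, one
; per 64-byte chunk of the <96 x i32> load.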
define void @mask_replication_factor6_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor6_vf16:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT:    movw $1, %ax
; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k3
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k4
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k5
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k6
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm1 {%k6} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm2 {%k5} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm3 {%k4} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm4 {%k3} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 64(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 128(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 256(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 320(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor6_vf16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT:    movw $1, %ax
; AVX512DQ-NEXT:    kmovw %eax, %k1
; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k2
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k3
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k4
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k5
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k6
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm1 {%k6} {z}
; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm2 {%k5} {z}
; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm3 {%k4} {z}
; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm4 {%k3} {z}
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 64(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 128(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 256(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 320(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor6_vf16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovw (%rdi), %k1
; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm2, %zmm2
; AVX512BW-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm2 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm3, %zmm3
; AVX512BW-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm3 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm4, %zmm4
; AVX512BW-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm4 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm5, %zmm5
; AVX512BW-NEXT:    vptestmd %zmm5, %zmm5, %k1
; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm5 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm6, %zmm0
; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm5, 128(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm4, 192(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm3, 256(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm2, 320(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %data = call <96 x i32> @llvm.masked.load.v96i32.p0(ptr %in.vec, i32 64, <96 x i1> %tgt.mask, <96 x i32> poison)
  store <96 x i32> %data, ptr %out.vec, align 64
  ret void
}

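; Replication factor 6, vf 32: 32 source mask bits expand to cover twelve zmm
; masked loads (offsets 0 through 704 from %rsi). The AVX512BW lowering below
; assembles each 16-bit mask bit-by-bit with k-register shift/or sequences,
; spilling 2-byte mask constants to the stack along the way.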
define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor6_vf32:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm2
; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT:    movw $1, %ax
; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm4
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm5, %zmm6
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm7, %zmm8
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm9, %zmm10
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm11, %zmm0
; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm2, %zmm2
; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm5, %zmm5
; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm7, %zmm7
; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm9, %zmm9
; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm11, %zmm3
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm11 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 704(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm9, %zmm9, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 640(%rsi), %zmm9 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm7, %zmm7, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 576(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 512(%rsi), %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm10, %zmm10, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm10 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm8, %zmm8, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 64(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 128(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 320(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 384(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 448(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 512(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm7, 576(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm9, 640(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 704(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm11, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor6_vf32:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm2
; AVX512DQ-NEXT:    movw $1, %ax
; AVX512DQ-NEXT:    kmovw %eax, %k1
; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm3
; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm4
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm5, %zmm6
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm7, %zmm8
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm9, %zmm10
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm11, %zmm0
; AVX512DQ-NEXT:    vpermd %zmm3, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpermd %zmm3, %zmm2, %zmm2
; AVX512DQ-NEXT:    vpermd %zmm3, %zmm5, %zmm5
; AVX512DQ-NEXT:    vpermd %zmm3, %zmm7, %zmm7
; AVX512DQ-NEXT:    vpermd %zmm3, %zmm9, %zmm9
; AVX512DQ-NEXT:    vpermd %zmm3, %zmm11, %zmm3
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm11 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k1
; AVX512DQ-NEXT:    vmovdqa32 704(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm9, %k1
; AVX512DQ-NEXT:    vmovdqa32 640(%rsi), %zmm9 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm7, %k1
; AVX512DQ-NEXT:    vmovdqa32 576(%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm5, %k1
; AVX512DQ-NEXT:    vmovdqa32 512(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT:    vmovdqa32 448(%rsi), %zmm2 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm10, %k1
; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm10 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm8, %k1
; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm6, %k1
; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k1
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 64(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 128(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 320(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 384(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 448(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 512(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 576(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 640(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 704(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm11, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor6_vf32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovd (%rdi), %k5
; AVX512BW-NEXT:    movw $-3, %ax
; AVX512BW-NEXT:    kmovd %eax, %k0
; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kmovw (%rdi), %k1
; AVX512BW-NEXT:    kandw %k0, %k1, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    movw $-5, %ax
; AVX512BW-NEXT:    kmovd %eax, %k0
; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k0, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    movw $-9, %ax
; AVX512BW-NEXT:    kmovd %eax, %k0
; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k0, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    movw $-17, %ax
; AVX512BW-NEXT:    kmovd %eax, %k7
; AVX512BW-NEXT:    kandw %k7, %k2, %k2
; AVX512BW-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    movw $-33, %ax
; AVX512BW-NEXT:    kmovd %eax, %k0
; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k0, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k2, %k1
; AVX512BW-NEXT:    movw $-65, %ax
; AVX512BW-NEXT:    kmovd %eax, %k0
; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k0, %k1, %k2
; AVX512BW-NEXT:    kshiftrd $1, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    movw $-129, %ax
; AVX512BW-NEXT:    kmovd %eax, %k0
; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k0, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    movw $-257, %ax # imm = 0xFEFF
; AVX512BW-NEXT:    kmovd %eax, %k0
; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k0, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    movw $-513, %ax # imm = 0xFDFF
; AVX512BW-NEXT:    kmovd %eax, %k0
; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k0, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    movw $-1025, %ax # imm = 0xFBFF
; AVX512BW-NEXT:    kmovd %eax, %k6
; AVX512BW-NEXT:    kandw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    movw $-2049, %ax # imm = 0xF7FF
; AVX512BW-NEXT:    kmovd %eax, %k0
; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k0, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k2, %k1
; AVX512BW-NEXT:    movw $-4097, %ax # imm = 0xEFFF
; AVX512BW-NEXT:    kmovd %eax, %k0
; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k0, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $2, %k5, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k3
; AVX512BW-NEXT:    kmovq %k2, %k4
; AVX512BW-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kshiftrw $3, %k3, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-8193, %ax # imm = 0xDFFF
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k3, %k2
; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-16385, %ax # imm = 0xBFFF
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k4, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT:    kshiftrd $29, %k5, %k0
; AVX512BW-NEXT:    kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k4
; AVX512BW-NEXT:    korw %k4, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k4
; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    korw %k4, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k4
; AVX512BW-NEXT:    korw %k4, %k1, %k1
; AVX512BW-NEXT:    kandw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $30, %k5, %k4
; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $11, %k4, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k4, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k4, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k4, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k4, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k4, %k4
; AVX512BW-NEXT:    korw %k4, %k1, %k1
; AVX512BW-NEXT:    kandw %k6, %k1, %k4
; AVX512BW-NEXT:    kshiftrd $31, %k5, %k7
; AVX512BW-NEXT:    kshiftlw $15, %k7, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $2, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k4, %k4
; AVX512BW-NEXT:    kshiftlw $14, %k7, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kshiftlw $1, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $1, %k4, %k4
; AVX512BW-NEXT:    korw %k1, %k4, %k1
; AVX512BW-NEXT:    vmovdqa32 704(%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT:    kshiftrd $26, %k5, %k4
; AVX512BW-NEXT:    kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k4, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k4, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $14, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kandw %k2, %k6, %k6
; AVX512BW-NEXT:    kshiftrd $27, %k5, %k7
; AVX512BW-NEXT:    kmovq %k5, %k2
; AVX512BW-NEXT:    kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kshiftlw $15, %k7, %k7
; AVX512BW-NEXT:    kshiftrw $13, %k7, %k5
; AVX512BW-NEXT:    korw %k5, %k6, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $12, %k7, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $11, %k7, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $10, %k7, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $9, %k7, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $8, %k7, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrd $28, %k2, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kandw %k3, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $5, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kandw %k0, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT:    kshiftlw $14, %k2, %k3
; AVX512BW-NEXT:    korw %k3, %k5, %k3
; AVX512BW-NEXT:    kshiftlw $1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $1, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    korw %k2, %k3, %k2
; AVX512BW-NEXT:    vmovdqa32 640(%rsi), %zmm2 {%k2} {z}
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
; AVX512BW-NEXT:    kshiftrd $24, %k0, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k2, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $14, %k2, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $13, %k2, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kandw %k4, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $12, %k2, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $11, %k2, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $10, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k3, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftrd $25, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $9, %k3, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $8, %k3, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $7, %k3, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $6, %k3, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $5, %k3, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $4, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kshiftrw $3, %k4, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $2, %k4, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kandw %k7, %k2, %k2
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
; AVX512BW-NEXT:    kshiftlw $14, %k0, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftlw $1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $1, %k2, %k2
; AVX512BW-NEXT:    korw %k4, %k2, %k1
; AVX512BW-NEXT:    vmovdqa32 576(%rsi), %zmm3 {%k1} {z}
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT:    kshiftrd $21, %k1, %k2
; AVX512BW-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k2, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $14, %k2, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $13, %k2, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $12, %k2, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    kandw %k6, %k3, %k3
; AVX512BW-NEXT:    kshiftrd $22, %k1, %k4
; AVX512BW-NEXT:    kmovq %k1, %k7
; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $11, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $10, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $9, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $8, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $7, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $6, %k4, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k3, %k4
; AVX512BW-NEXT:    kshiftrd $23, %k7, %k5
; AVX512BW-NEXT:    kshiftlw $15, %k5, %k3
; AVX512BW-NEXT:    kshiftrw $5, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $4, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $3, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $2, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k4, %k4
; AVX512BW-NEXT:    kshiftlw $14, %k5, %k5
; AVX512BW-NEXT:    korw %k5, %k4, %k4
; AVX512BW-NEXT:    kshiftlw $1, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $1, %k4, %k4
; AVX512BW-NEXT:    korw %k3, %k4, %k3
; AVX512BW-NEXT:    vmovdqa32 512(%rsi), %zmm4 {%k3} {z}
; AVX512BW-NEXT:    kmovq %k7, %k4
; AVX512BW-NEXT:    kshiftrd $18, %k7, %k6
; AVX512BW-NEXT:    kmovd %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k6, %k5
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k3
; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $14, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k5, %k5
; AVX512BW-NEXT:    kshiftrd $19, %k7, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $12, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $11, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $10, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kandw %k0, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $9, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $8, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kandw %k2, %k5, %k5
; AVX512BW-NEXT:    kshiftrd $20, %k4, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $5, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k2
; AVX512BW-NEXT:    korw %k2, %k5, %k2
; AVX512BW-NEXT:    kshiftlw $1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $1, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    korw %k0, %k2, %k1
; AVX512BW-NEXT:    vmovdqa32 448(%rsi), %zmm5 {%k1} {z}
; AVX512BW-NEXT:    kmovq %k4, %k0
; AVX512BW-NEXT:    kshiftrd $16, %k4, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k1, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k2, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $17, %k0, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $9, %k2, %k5
; AVX512BW-NEXT:    korw %k5, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k2, %k5
; AVX512BW-NEXT:    korw %k5, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k2, %k5
; AVX512BW-NEXT:    korw %k5, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k2, %k5
; AVX512BW-NEXT:    korw %k5, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k2, %k5
; AVX512BW-NEXT:    korw %k5, %k1, %k1
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kshiftrw $3, %k3, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k3, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k1
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
; AVX512BW-NEXT:    kshiftlw $14, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
; AVX512BW-NEXT:    kshiftrd $13, %k0, %k3
; AVX512BW-NEXT:    kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k5
; AVX512BW-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $14, %k5, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kandw %k6, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $13, %k5, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kandw %k7, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $12, %k5, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftrd $14, %k0, %k3
; AVX512BW-NEXT:    kmovq %k0, %k7
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $11, %k3, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $10, %k3, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $9, %k3, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $8, %k3, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $7, %k3, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $6, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k2, %k3
; AVX512BW-NEXT:    kshiftrd $15, %k7, %k5
; AVX512BW-NEXT:    kshiftlw $15, %k5, %k2
; AVX512BW-NEXT:    kshiftrw $5, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $4, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $3, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $2, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k3, %k3
; AVX512BW-NEXT:    kshiftlw $14, %k5, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kshiftlw $1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $1, %k3, %k3
; AVX512BW-NEXT:    korw %k2, %k3, %k2
; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm7 {%k2} {z}
; AVX512BW-NEXT:    kmovq %k7, %k2
; AVX512BW-NEXT:    kshiftrd $10, %k7, %k0
; AVX512BW-NEXT:    kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kandw %k1, %k0, %k5
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k5, %k5
; AVX512BW-NEXT:    kshiftrd $11, %k7, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $12, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $11, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $10, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $9, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $8, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kandw %k4, %k5, %k5
; AVX512BW-NEXT:    kshiftrd $12, %k2, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $5, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k5, %k5
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k4
; AVX512BW-NEXT:    korw %k4, %k5, %k4
; AVX512BW-NEXT:    kshiftlw $1, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $1, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    korw %k1, %k4, %k1
; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm8 {%k1} {z}
; AVX512BW-NEXT:    kshiftrd $8, %k2, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k4
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k1, %k5
; AVX512BW-NEXT:    korw %k5, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k5
; AVX512BW-NEXT:    korw %k5, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k5
; AVX512BW-NEXT:    korw %k5, %k4, %k4
; AVX512BW-NEXT:    kandw %k0, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k5
; AVX512BW-NEXT:    korw %k5, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k4, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $9, %k2, %k4
; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $9, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k4, %k4
; AVX512BW-NEXT:    korw %k4, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kshiftrw $3, %k2, %k4
; AVX512BW-NEXT:    korw %k4, %k1, %k1
; AVX512BW-NEXT:    kandw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k2, %k4
; AVX512BW-NEXT:    kmovq %k2, %k5
; AVX512BW-NEXT:    korw %k4, %k1, %k1
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT:    kshiftlw $14, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k5, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm9 {%k1} {z}
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT:    kshiftrd $5, %k1, %k2
; AVX512BW-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kandw %k6, %k2, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k7
; AVX512BW-NEXT:    kshiftrw $14, %k7, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $13, %k7, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $12, %k7, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrd $6, %k1, %k4
; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $11, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $10, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $9, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $8, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kandw %k0, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $7, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $6, %k4, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k3, %k4
; AVX512BW-NEXT:    kshiftrd $7, %k1, %k5
; AVX512BW-NEXT:    kshiftlw $15, %k5, %k3
; AVX512BW-NEXT:    kshiftrw $5, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $4, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $3, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $2, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k4, %k4
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k4, %k4
; AVX512BW-NEXT:    kshiftlw $14, %k5, %k5
; AVX512BW-NEXT:    korw %k5, %k4, %k4
; AVX512BW-NEXT:    kshiftlw $1, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $1, %k4, %k4
; AVX512BW-NEXT:    korw %k3, %k4, %k3
; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm10 {%k3} {z}
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kshiftrw $14, %k4, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k3, %k3
; AVX512BW-NEXT:    kshiftrd $3, %k1, %k4
; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
; AVX512BW-NEXT:    kshiftrw $13, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $12, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $11, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $10, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $9, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $8, %k4, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    kandw %k0, %k3, %k3
; AVX512BW-NEXT:    kshiftrd $4, %k1, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k3, %k0
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k2
; AVX512BW-NEXT:    korw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    korw %k7, %k0, %k1
; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm11 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa64 %zmm11, 64(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm10, 128(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm9, 192(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm8, 256(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm7, 320(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm6, 384(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm5, 448(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm4, 512(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm3, 576(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm2, 640(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm1, 704(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  %data = call <192 x i32> @llvm.masked.load.v192i32.p0(ptr %in.vec, i32 64, <192 x i1> %tgt.mask, <192 x i32> poison)
  store <192 x i32> %data, ptr %out.vec, align 64
  ret void
}

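; Factor-6 replication of a 64 x i1 mask: each source bit is broadcast into
; six consecutive mask bits (64 x 6 = 384 lanes), driving 24 zmm-sized masked
; loads from %rsi and stores to %rdx (offsets 0 through 1472).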
define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor6_vf64:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm4, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT:    movw $1, %ax
; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm7, %zmm7, %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT:    kmovw 4(%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm8, %zmm8, %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm9, %zmm9, %zmm9 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
; AVX512F-ONLY-NEXT:    vpermd %zmm7, %zmm10, %zmm1
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
; AVX512F-ONLY-NEXT:    vpermd %zmm7, %zmm11, %zmm2
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm12 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
; AVX512F-ONLY-NEXT:    vpermd %zmm7, %zmm12, %zmm3
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm13 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
; AVX512F-ONLY-NEXT:    vpermd %zmm7, %zmm13, %zmm5
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
; AVX512F-ONLY-NEXT:    vpermd %zmm7, %zmm14, %zmm6
; AVX512F-ONLY-NEXT:    vpermd %zmm7, %zmm4, %zmm7
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm10, %zmm15
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm11, %zmm16
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm12, %zmm17
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm13, %zmm18
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm14, %zmm19
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm4, %zmm8
; AVX512F-ONLY-NEXT:    vpermd %zmm9, %zmm10, %zmm20
; AVX512F-ONLY-NEXT:    vpermd %zmm9, %zmm11, %zmm21
; AVX512F-ONLY-NEXT:    vpermd %zmm9, %zmm12, %zmm22
; AVX512F-ONLY-NEXT:    vpermd %zmm9, %zmm13, %zmm23
; AVX512F-ONLY-NEXT:    vpermd %zmm9, %zmm4, %zmm24
; AVX512F-ONLY-NEXT:    vpermd %zmm9, %zmm14, %zmm9
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm10, %zmm10
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm11, %zmm11
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm12, %zmm12
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm13, %zmm13
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm14, %zmm4
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm13, %zmm13, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm13 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm12, %zmm12, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm12 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm11, %zmm11, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm11 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm10, %zmm10, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm10 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm24, %zmm24, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm14 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm9, %zmm9, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm9 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm23, %zmm23, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 512(%rsi), %zmm23 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm22, %zmm22, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 576(%rsi), %zmm22 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm21, %zmm21, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 640(%rsi), %zmm21 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm20, %zmm20, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 704(%rsi), %zmm20 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm8, %zmm8, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 768(%rsi), %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm19, %zmm19, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 832(%rsi), %zmm19 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm18, %zmm18, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 896(%rsi), %zmm18 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm17, %zmm17, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 960(%rsi), %zmm17 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm16, %zmm16, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1024(%rsi), %zmm16 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm15, %zmm15, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1088(%rsi), %zmm15 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm7, %zmm7, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1152(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1216(%rsi), %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1280(%rsi), %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1344(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1408(%rsi), %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1472(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 1472(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 1408(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 1344(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 1280(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 1216(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm7, 1152(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm15, 1088(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm16, 1024(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm17, 960(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm18, 896(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm19, 832(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm8, 768(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm20, 704(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm21, 640(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm22, 576(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm23, 512(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm9, 448(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm14, 384(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm10, 320(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm11, 256(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm12, 192(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm13, 128(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 64(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor6_vf64:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm4, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT:    movw $1, %ax
; AVX512DQ-NEXT:    kmovw %eax, %k1
; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-NEXT:    kmovw 6(%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm7
; AVX512DQ-NEXT:    kmovw 4(%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm8
; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm9
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
; AVX512DQ-NEXT:    vpermd %zmm7, %zmm10, %zmm1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
; AVX512DQ-NEXT:    vpermd %zmm7, %zmm11, %zmm2
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm12 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
; AVX512DQ-NEXT:    vpermd %zmm7, %zmm12, %zmm3
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm13 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
; AVX512DQ-NEXT:    vpermd %zmm7, %zmm13, %zmm5
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
; AVX512DQ-NEXT:    vpermd %zmm7, %zmm14, %zmm6
; AVX512DQ-NEXT:    vpermd %zmm7, %zmm4, %zmm7
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm10, %zmm15
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm11, %zmm16
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm12, %zmm17
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm13, %zmm18
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm14, %zmm19
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm4, %zmm8
; AVX512DQ-NEXT:    vpermd %zmm9, %zmm10, %zmm20
; AVX512DQ-NEXT:    vpermd %zmm9, %zmm11, %zmm21
; AVX512DQ-NEXT:    vpermd %zmm9, %zmm12, %zmm22
; AVX512DQ-NEXT:    vpermd %zmm9, %zmm13, %zmm23
; AVX512DQ-NEXT:    vpermd %zmm9, %zmm4, %zmm24
; AVX512DQ-NEXT:    vpermd %zmm9, %zmm14, %zmm9
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm10, %zmm10
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm11, %zmm11
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm12, %zmm12
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm13, %zmm13
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm14, %zmm4
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k1
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm13, %k1
; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm13 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm12, %k1
; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm12 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm11, %k1
; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm11 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm10, %k1
; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm10 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm24, %k1
; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm14 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm9, %k1
; AVX512DQ-NEXT:    vmovdqa32 448(%rsi), %zmm9 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm23, %k1
; AVX512DQ-NEXT:    vmovdqa32 512(%rsi), %zmm23 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm22, %k1
; AVX512DQ-NEXT:    vmovdqa32 576(%rsi), %zmm22 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm21, %k1
; AVX512DQ-NEXT:    vmovdqa32 640(%rsi), %zmm21 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm20, %k1
; AVX512DQ-NEXT:    vmovdqa32 704(%rsi), %zmm20 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm8, %k1
; AVX512DQ-NEXT:    vmovdqa32 768(%rsi), %zmm8 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm19, %k1
; AVX512DQ-NEXT:    vmovdqa32 832(%rsi), %zmm19 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm18, %k1
; AVX512DQ-NEXT:    vmovdqa32 896(%rsi), %zmm18 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm17, %k1
; AVX512DQ-NEXT:    vmovdqa32 960(%rsi), %zmm17 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm16, %k1
; AVX512DQ-NEXT:    vmovdqa32 1024(%rsi), %zmm16 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm15, %k1
; AVX512DQ-NEXT:    vmovdqa32 1088(%rsi), %zmm15 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm7, %k1
; AVX512DQ-NEXT:    vmovdqa32 1152(%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm6, %k1
; AVX512DQ-NEXT:    vmovdqa32 1216(%rsi), %zmm6 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm5, %k1
; AVX512DQ-NEXT:    vmovdqa32 1280(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k1
; AVX512DQ-NEXT:    vmovdqa32 1344(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT:    vmovdqa32 1408(%rsi), %zmm2 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT:    vmovdqa32 1472(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 1472(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 1408(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 1344(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 1280(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 1216(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 1152(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm15, 1088(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 1024(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm17, 960(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm18, 896(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm19, 832(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 768(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm20, 704(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm21, 640(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm22, 576(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm23, 512(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 448(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 384(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 320(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm11, 256(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm12, 192(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm13, 128(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 64(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor6_vf64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovq (%rdi), %k5
; AVX512BW-NEXT:    movw $-3, %ax
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kmovw (%rdi), %k0
; AVX512BW-NEXT:    kandw %k1, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k1, %k0
; AVX512BW-NEXT:    korw %k0, %k3, %k0
; AVX512BW-NEXT:    movw $-5, %ax
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k0, %k0
; AVX512BW-NEXT:    movw $-9, %ax
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k0, %k0
; AVX512BW-NEXT:    movw $-17, %ax
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k0, %k0
; AVX512BW-NEXT:    movw $-33, %ax
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    movw $-65, %ax
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $1, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k0, %k0
; AVX512BW-NEXT:    movw $-129, %ax
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k0, %k0
; AVX512BW-NEXT:    movw $-257, %ax # imm = 0xFEFF
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k0, %k0
; AVX512BW-NEXT:    movw $-513, %ax # imm = 0xFDFF
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k0, %k0
; AVX512BW-NEXT:    movw $-1025, %ax # imm = 0xFBFF
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k0, %k0
; AVX512BW-NEXT:    movw $-2049, %ax # imm = 0xF7FF
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    movw $-4097, %ax # imm = 0xEFFF
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k1, %k0, %k3
; AVX512BW-NEXT:    kshiftrq $2, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k4
; AVX512BW-NEXT:    korw %k4, %k3, %k3
; AVX512BW-NEXT:    movw $-8193, %ax # imm = 0xDFFF
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k7
; AVX512BW-NEXT:    movw $-16385, %ax # imm = 0xBFFF
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k7, %k7
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k7, %k6
; AVX512BW-NEXT:    kshiftlw $1, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $1, %k6, %k6
; AVX512BW-NEXT:    korw %k0, %k6, %k6
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k6} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovq %k5, %k3
; AVX512BW-NEXT:    kshiftrq $3, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $4, %k3, %k1
; AVX512BW-NEXT:    kmovq %k3, %k7
; AVX512BW-NEXT:    kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $5, %k7, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k7} {z}
; AVX512BW-NEXT:    kandw %k4, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $6, %k7, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $7, %k7, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k0, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k1} {z}
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $8, %k7, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $9, %k7, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $10, %k7, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm3 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $11, %k7, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $12, %k7, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $13, %k7, %k1
; AVX512BW-NEXT:    kmovq %k7, %k2
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm4 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovq %k2, %k7
; AVX512BW-NEXT:    kshiftrq $14, %k2, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $15, %k7, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k0, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm5 {%k1} {z}
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $16, %k5, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $17, %k5, %k1
; AVX512BW-NEXT:    kmovq %k5, %k7
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kmovq %k7, %k4
; AVX512BW-NEXT:    kshiftrq $18, %k7, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 384(%rsi), %zmm6 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $19, %k4, %k1
; AVX512BW-NEXT:    kmovq %k4, %k7
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $20, %k7, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $21, %k7, %k1
; AVX512BW-NEXT:    kmovq %k7, %k3
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 448(%rsi), %zmm7 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kmovq %k3, %k5
; AVX512BW-NEXT:    kshiftrq $22, %k3, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $23, %k5, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k0, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 512(%rsi), %zmm8 {%k1} {z}
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $24, %k5, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $25, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $26, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 576(%rsi), %zmm9 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kmovq %k5, %k7
; AVX512BW-NEXT:    kshiftrq $27, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $28, %k7, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $29, %k7, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 640(%rsi), %zmm10 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $30, %k7, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $31, %k7, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k0, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 704(%rsi), %zmm11 {%k1} {z}
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $32, %k2, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $33, %k2, %k1
; AVX512BW-NEXT:    kmovq %k2, %k7
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kmovq %k7, %k5
; AVX512BW-NEXT:    kshiftrq $34, %k7, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 768(%rsi), %zmm12 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $35, %k5, %k1
; AVX512BW-NEXT:    kmovq %k5, %k7
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $36, %k7, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $37, %k7, %k1
; AVX512BW-NEXT:    kmovq %k7, %k3
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 832(%rsi), %zmm13 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovq %k3, %k7
; AVX512BW-NEXT:    kshiftrq $38, %k3, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $39, %k7, %k6
; AVX512BW-NEXT:    kmovq %k7, %k5
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k0, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 896(%rsi), %zmm14 {%k1} {z}
; AVX512BW-NEXT:    kshiftrq $40, %k5, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $41, %k5, %k1
; AVX512BW-NEXT:    kmovq %k5, %k4
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $42, %k4, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 960(%rsi), %zmm15 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kmovq %k4, %k7
; AVX512BW-NEXT:    kshiftrq $43, %k4, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $44, %k7, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $45, %k7, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 1024(%rsi), %zmm16 {%k7} {z}
; AVX512BW-NEXT:    kandw %k5, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $46, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $47, %k5, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k0, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 1088(%rsi), %zmm17 {%k1} {z}
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $48, %k5, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $49, %k5, %k1
; AVX512BW-NEXT:    kmovq %k5, %k7
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kmovq %k7, %k5
; AVX512BW-NEXT:    kshiftrq $50, %k7, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 1152(%rsi), %zmm18 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $51, %k5, %k1
; AVX512BW-NEXT:    kmovq %k5, %k7
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $52, %k7, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $53, %k7, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 1216(%rsi), %zmm19 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $54, %k7, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $55, %k7, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k0, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 1280(%rsi), %zmm20 {%k1} {z}
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $56, %k5, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $57, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $58, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 1344(%rsi), %zmm21 {%k7} {z}
; AVX512BW-NEXT:    kandw %k2, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $59, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $60, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $61, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 1408(%rsi), %zmm22 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $62, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k6
; AVX512BW-NEXT:    kshiftrq $63, %k5, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k6, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $2, %k1, %k4
; AVX512BW-NEXT:    korw %k4, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftlw $14, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k2, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    korw %k1, %k0, %k1
; AVX512BW-NEXT:    vmovdqa32 1472(%rsi), %zmm23 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa64 %zmm23, 1472(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm22, 1408(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm21, 1344(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm20, 1280(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm19, 1216(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm18, 1152(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm17, 1088(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm16, 1024(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm15, 960(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm14, 896(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm13, 832(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm12, 768(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm11, 704(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm10, 640(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm9, 576(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm8, 512(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm7, 448(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm6, 384(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm5, 320(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm4, 256(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask = load <64 x i1>, ptr %in.maskvec, align 64
  %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
  %data = call <384 x i32> @llvm.masked.load.v384i32.p0(ptr %in.vec, i32 64, <384 x i1> %tgt.mask, <384 x i32> poison)
  store <384 x i32> %data, ptr %out.vec, align 64
  ret void
}

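; Replication factor 7, vf2: each of the first 2 mask bits is repeated 7
; times (e.g. a source mask of <1,0> becomes seven ones followed by seven
; zeros), producing the <14 x i1> mask for the <14 x i32> masked load/store.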
define void @mask_replication_factor7_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor7_vf2:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512F-ONLY-NEXT:    movw $16383, %ax # imm = 0x3FFF
; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vextracti32x4 $2, %zmm0, 32(%rdx)
; AVX512F-ONLY-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
; AVX512F-ONLY-NEXT:    vmovq %xmm1, 48(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor7_vf2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT:    movw $16383, %ax # imm = 0x3FFF
; AVX512DQ-NEXT:    kmovw %eax, %k1
; AVX512DQ-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1 {%k1}
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm0, 32(%rdx)
; AVX512DQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
; AVX512DQ-NEXT:    vmovq %xmm1, 48(%rdx)
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor7_vf2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovw (%rdi), %k1
; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512BW-NEXT:    movw $16383, %ax # imm = 0x3FFF
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, 32(%rdx)
; AVX512BW-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, 48(%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
  %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %data = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr %in.vec, i32 64, <14 x i1> %tgt.mask, <14 x i32> poison)
  %data.padded = shufflevector <14 x i32> %data, <14 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 undef, i32 undef>
  store <14 x i32> %data, ptr %out.vec, align 64
  ret void
}

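; Replication factor 7, vf4: the first 4 mask bits are each repeated 7 times,
; producing the <28 x i1> mask for the <28 x i32> masked load/store.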
define void @mask_replication_factor7_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor7_vf4:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,0,0,0,0]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vpslld $31, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    movw $4095, %ax # imm = 0xFFF
; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1 {%k1}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512F-ONLY-NEXT:    vextracti32x4 $2, %zmm0, 96(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa %ymm0, 64(%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor7_vf4:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,0,0,0,0]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    movw $4095, %ax # imm = 0xFFF
; AVX512DQ-NEXT:    kmovw %eax, %k1
; AVX512DQ-NEXT:    vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k2
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm0, 96(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-NEXT:    vmovdqa %ymm0, 64(%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor7_vf4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovd (%rdi), %k0
; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,0,0,0,0]
; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    movl $268435455, %eax # imm = 0xFFFFFFF
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vpcmpgtw %zmm0, %zmm1, %k1 {%k1}
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT:    vextracti32x4 $2, %zmm1, 96(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT:    vmovdqa %ymm1, 64(%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %data = call <28 x i32> @llvm.masked.load.v28i32.p0(ptr %in.vec, i32 64, <28 x i1> %tgt.mask, <28 x i32> poison)
  %data.padded = shufflevector <28 x i32> %data, <28 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 undef, i32 undef, i32 undef, i32 undef>
  store <28 x i32> %data, ptr %out.vec, align 64
  ret void
}

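; Replication factor 7, vf8: the first 8 mask bits are each repeated 7 times,
; producing the <56 x i1> mask; the result is written as three full 64-byte
; stores plus a 32-byte tail at offset 192. With AVX512VBMI the i1 vector is
; replicated with a single vpermb, while plain AVX512BW emulates it via
; vshufi64x2 + vpshufb.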
define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-SLOW-LABEL: mask_replication_factor7_vf8:
; AVX512F-SLOW:       # %bb.0:
; AVX512F-SLOW-NEXT:    kmovw (%rdi), %k1
; AVX512F-SLOW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-SLOW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512F-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-SLOW-NEXT:    vptestmd %zmm1, %zmm1, %k2
; AVX512F-SLOW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; AVX512F-SLOW-NEXT:    movw $1, %ax
; AVX512F-SLOW-NEXT:    kmovw %eax, %k2
; AVX512F-SLOW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k2}
; AVX512F-SLOW-NEXT:    vptestmd %zmm1, %zmm1, %k2
; AVX512F-SLOW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512F-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-SLOW-NEXT:    vptestmd %zmm1, %zmm1, %k3
; AVX512F-SLOW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512F-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-SLOW-NEXT:    vptestmd %zmm0, %zmm0, %k4
; AVX512F-SLOW-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-SLOW-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,3,3,6,7,7,7]
; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,3,3]
; AVX512F-SLOW-NEXT:    vptestmd %ymm0, %ymm0, %k1
; AVX512F-SLOW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-SLOW-NEXT:    vmovdqa32 192(%rsi), %zmm1 {%k1} {z}
; AVX512F-SLOW-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k4} {z}
; AVX512F-SLOW-NEXT:    vmovdqa32 64(%rsi), %zmm3 {%k3} {z}
; AVX512F-SLOW-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
; AVX512F-SLOW-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
; AVX512F-SLOW-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512F-SLOW-NEXT:    vmovdqa %ymm1, 192(%rdx)
; AVX512F-SLOW-NEXT:    vzeroupper
; AVX512F-SLOW-NEXT:    retq
;
; AVX512F-FAST-LABEL: mask_replication_factor7_vf8:
; AVX512F-FAST:       # %bb.0:
; AVX512F-FAST-NEXT:    kmovw (%rdi), %k1
; AVX512F-FAST-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-FAST-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512F-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-FAST-NEXT:    vptestmd %zmm1, %zmm1, %k2
; AVX512F-FAST-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; AVX512F-FAST-NEXT:    movw $1, %ax
; AVX512F-FAST-NEXT:    kmovw %eax, %k2
; AVX512F-FAST-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k2}
; AVX512F-FAST-NEXT:    vptestmd %zmm1, %zmm1, %k2
; AVX512F-FAST-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512F-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-FAST-NEXT:    vptestmd %zmm1, %zmm1, %k3
; AVX512F-FAST-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512F-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-FAST-NEXT:    vptestmd %zmm0, %zmm0, %k4
; AVX512F-FAST-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-FAST-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512F-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [6,7,7,7,7,7,7,7]
; AVX512F-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512F-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k1
; AVX512F-FAST-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-FAST-NEXT:    vmovdqa32 192(%rsi), %zmm1 {%k1} {z}
; AVX512F-FAST-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k4} {z}
; AVX512F-FAST-NEXT:    vmovdqa32 64(%rsi), %zmm3 {%k3} {z}
; AVX512F-FAST-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
; AVX512F-FAST-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
; AVX512F-FAST-NEXT:    vmovdqa %ymm1, 192(%rdx)
; AVX512F-FAST-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512F-FAST-NEXT:    vzeroupper
; AVX512F-FAST-NEXT:    retq
;
; AVX512DQ-SLOW-LABEL: mask_replication_factor7_vf8:
; AVX512DQ-SLOW:       # %bb.0:
; AVX512DQ-SLOW-NEXT:    kmovb (%rdi), %k0
; AVX512DQ-SLOW-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-SLOW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512DQ-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-SLOW-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-SLOW-NEXT:    vpmovm2d %k1, %zmm1
; AVX512DQ-SLOW-NEXT:    movw $1, %ax
; AVX512DQ-SLOW-NEXT:    kmovw %eax, %k1
; AVX512DQ-SLOW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-SLOW-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-SLOW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512DQ-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-SLOW-NEXT:    vpmovd2m %zmm1, %k2
; AVX512DQ-SLOW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512DQ-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-SLOW-NEXT:    vpmovd2m %zmm0, %k3
; AVX512DQ-SLOW-NEXT:    vpmovm2d %k0, %ymm0
; AVX512DQ-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,3,3,6,7,7,7]
; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,3,3]
; AVX512DQ-SLOW-NEXT:    vpmovd2m %ymm0, %k4
; AVX512DQ-SLOW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-SLOW-NEXT:    vmovdqa32 192(%rsi), %zmm1 {%k4} {z}
; AVX512DQ-SLOW-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k3} {z}
; AVX512DQ-SLOW-NEXT:    vmovdqa32 64(%rsi), %zmm3 {%k2} {z}
; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm1, 192(%rdx)
; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-SLOW-NEXT:    vzeroupper
; AVX512DQ-SLOW-NEXT:    retq
;
; AVX512DQ-FAST-LABEL: mask_replication_factor7_vf8:
; AVX512DQ-FAST:       # %bb.0:
; AVX512DQ-FAST-NEXT:    kmovb (%rdi), %k0
; AVX512DQ-FAST-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-FAST-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512DQ-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-FAST-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-FAST-NEXT:    vpmovm2d %k1, %zmm1
; AVX512DQ-FAST-NEXT:    movw $1, %ax
; AVX512DQ-FAST-NEXT:    kmovw %eax, %k1
; AVX512DQ-FAST-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-FAST-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-FAST-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512DQ-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-FAST-NEXT:    vpmovd2m %zmm1, %k2
; AVX512DQ-FAST-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512DQ-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-FAST-NEXT:    vpmovd2m %zmm0, %k3
; AVX512DQ-FAST-NEXT:    vpmovm2d %k0, %ymm0
; AVX512DQ-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [6,7,7,7,7,7,7,7]
; AVX512DQ-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512DQ-FAST-NEXT:    vpmovd2m %ymm0, %k4
; AVX512DQ-FAST-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-FAST-NEXT:    vmovdqa32 192(%rsi), %zmm1 {%k4} {z}
; AVX512DQ-FAST-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k3} {z}
; AVX512DQ-FAST-NEXT:    vmovdqa32 64(%rsi), %zmm3 {%k2} {z}
; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
; AVX512DQ-FAST-NEXT:    vmovdqa %ymm1, 192(%rdx)
; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-FAST-NEXT:    vzeroupper
; AVX512DQ-FAST-NEXT:    retq
;
; AVX512BW-ONLY-LABEL: mask_replication_factor7_vf8:
; AVX512BW-ONLY:       # %bb.0:
; AVX512BW-ONLY-NEXT:    kmovw (%rdi), %k0
; AVX512BW-ONLY-NEXT:    vpmovm2b %k0, %zmm0
; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-ONLY-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,18,18,18,18,18,19,19,19,19,19,19,19,20,20,20,20,36,36,36,37,37,37,37,37,37,37,38,38,38,38,38,38,54,55,55,55,55,55,55,55,u,u,u,u,u,u,u,u]
; AVX512BW-ONLY-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-ONLY-NEXT:    movabsq $72057594037927935, %rax # imm = 0xFFFFFFFFFFFFFF
; AVX512BW-ONLY-NEXT:    kmovq %rax, %k1
; AVX512BW-ONLY-NEXT:    vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k2
; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k2, %k3
; AVX512BW-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm0 {%k3} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k2} {z}
; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k1
; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm3 {%k1} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa %ymm0, 192(%rdx)
; AVX512BW-ONLY-NEXT:    vzeroupper
; AVX512BW-ONLY-NEXT:    retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor7_vf8:
; AVX512VBMI-ONLY:       # %bb.0:
; AVX512VBMI-ONLY-NEXT:    kmovw (%rdi), %k0
; AVX512VBMI-ONLY-NEXT:    vpmovm2b %k0, %zmm0
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4,4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6,6,7,7,7,7,7,7,7,u,u,u,u,u,u,u,u]
; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VBMI-ONLY-NEXT:    movabsq $72057594037927935, %rax # imm = 0xFFFFFFFFFFFFFF
; AVX512VBMI-ONLY-NEXT:    kmovq %rax, %k1
; AVX512VBMI-ONLY-NEXT:    vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k2
; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k2, %k3
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm0 {%k3} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k2} {z}
; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k1
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm3 {%k1} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa %ymm0, 192(%rdx)
; AVX512VBMI-ONLY-NEXT:    vzeroupper
; AVX512VBMI-ONLY-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %data = call <56 x i32> @llvm.masked.load.v56i32.p0(ptr %in.vec, i32 64, <56 x i1> %tgt.mask, <56 x i32> poison)
  %data.padded = shufflevector <56 x i32> %data, <56 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  store <56 x i32> %data, ptr %out.vec, align 64
  ret void
}

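; Replication factor 7, vf16: 16 mask bits are each repeated 7 times,
; producing the <112 x i1> mask; the 448-byte result is written as seven
; 64-byte stores.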
define void @mask_replication_factor7_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor7_vf16:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT:    movw $1, %ax
; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k3
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k4
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k5
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k6
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k7
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm1 {%k7} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm2 {%k6} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm3 {%k5} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm4 {%k4} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm5 {%k3} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 64(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 128(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 192(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 256(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 320(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 384(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor7_vf16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT:    movw $1, %ax
; AVX512DQ-NEXT:    kmovw %eax, %k1
; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k2
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k3
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k4
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k5
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k6
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k7
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm1 {%k7} {z}
; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm2 {%k6} {z}
; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm3 {%k5} {z}
; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm4 {%k4} {z}
; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm5 {%k3} {z}
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm6 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 64(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 128(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 192(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 256(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 320(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 384(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor7_vf16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovw (%rdi), %k1
; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm2, %zmm2
; AVX512BW-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT:    vmovdqa32 384(%rsi), %zmm2 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm3, %zmm3
; AVX512BW-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm3 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm4, %zmm4
; AVX512BW-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm4 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm5, %zmm5
; AVX512BW-NEXT:    vptestmd %zmm5, %zmm5, %k1
; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm5 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm6, %zmm6
; AVX512BW-NEXT:    vptestmd %zmm6, %zmm6, %k1
; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm7, %zmm0
; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm6, 128(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm5, 192(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm4, 256(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm3, 320(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %data = call <112 x i32> @llvm.masked.load.v112i32.p0(ptr %in.vec, i32 64, <112 x i1> %tgt.mask, <112 x i32> poison)
  store <112 x i32> %data, ptr %out.vec, align 64
  ret void
}

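; Replication factor 7, vf32: 32 mask bits are each repeated 7 times,
; producing a mask wide enough for fourteen 64-byte stores (896 bytes). Note
; that the AVX512BW path builds each 16-bit mask chunk with long
; kshiftw/kandw/korw sequences, spilling mask constants to the stack, rather
; than using vector permutes.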
define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor7_vf32:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm2
; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT:    movw $1, %ax
; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm4
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm5, %zmm6
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm7, %zmm8
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm9, %zmm10
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm11, %zmm12
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm13, %zmm0
; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm2, %zmm2
; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm5, %zmm5
; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm7, %zmm7
; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm9, %zmm9
; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm11, %zmm11
; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm13, %zmm3
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm13 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 832(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm11, %zmm11, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 768(%rsi), %zmm11 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm9, %zmm9, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 704(%rsi), %zmm9 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm7, %zmm7, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 640(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 576(%rsi), %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 512(%rsi), %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm12, %zmm12, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm12 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm10, %zmm10, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm10 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm8, %zmm8, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 64(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 128(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm12, 320(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 384(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 448(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 512(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 576(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm7, 640(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm9, 704(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm11, 768(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 832(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm13, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor7_vf32:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm2
; AVX512DQ-NEXT:    movw $1, %ax
; AVX512DQ-NEXT:    kmovw %eax, %k1
; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm3
; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm4
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm5, %zmm6
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm7, %zmm8
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm9, %zmm10
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm11, %zmm12
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm13, %zmm0
; AVX512DQ-NEXT:    vpermd %zmm3, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpermd %zmm3, %zmm2, %zmm2
; AVX512DQ-NEXT:    vpermd %zmm3, %zmm5, %zmm5
; AVX512DQ-NEXT:    vpermd %zmm3, %zmm7, %zmm7
; AVX512DQ-NEXT:    vpermd %zmm3, %zmm9, %zmm9
; AVX512DQ-NEXT:    vpermd %zmm3, %zmm11, %zmm11
; AVX512DQ-NEXT:    vpermd %zmm3, %zmm13, %zmm3
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm13 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k1
; AVX512DQ-NEXT:    vmovdqa32 832(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm11, %k1
; AVX512DQ-NEXT:    vmovdqa32 768(%rsi), %zmm11 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm9, %k1
; AVX512DQ-NEXT:    vmovdqa32 704(%rsi), %zmm9 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm7, %k1
; AVX512DQ-NEXT:    vmovdqa32 640(%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm5, %k1
; AVX512DQ-NEXT:    vmovdqa32 576(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT:    vmovdqa32 512(%rsi), %zmm2 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT:    vmovdqa32 448(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm12, %k1
; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm12 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm10, %k1
; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm10 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm8, %k1
; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm6, %k1
; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k1
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 64(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 128(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm12, 320(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 384(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 448(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 512(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 576(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 640(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 704(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm11, 768(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 832(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm13, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor7_vf32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    movw $-3, %ax
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw (%rdi), %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k1
; AVX512BW-NEXT:    kmovq %k2, %k3
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-5, %ax
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovq %k2, %k4
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-9, %ax
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-17, %ax
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-33, %ax
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-65, %ax
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    movw $-129, %ax
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kmovd (%rdi), %k6
; AVX512BW-NEXT:    kshiftrd $1, %k6, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-257, %ax # imm = 0xFEFF
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovq %k2, %k7
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-513, %ax # imm = 0xFDFF
; AVX512BW-NEXT:    kmovd %eax, %k5
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-1025, %ax # imm = 0xFBFF
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-2049, %ax # imm = 0xF7FF
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-4097, %ax # imm = 0xEFFF
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-8193, %ax # imm = 0xDFFF
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    movw $-16385, %ax # imm = 0xBFFF
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrd $2, %k6, %k2
; AVX512BW-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kshiftlw $14, %k2, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    korw %k1, %k0, %k1
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT:    kmovq %k6, %k2
; AVX512BW-NEXT:    kshiftrd $29, %k6, %k1
; AVX512BW-NEXT:    kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kandw %k3, %k1, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $14, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovq %k4, %k6
; AVX512BW-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrd $30, %k2, %k1
; AVX512BW-NEXT:    kmovq %k2, %k4
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k0, %k0
; AVX512BW-NEXT:    kandw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k5, %k0, %k3
; AVX512BW-NEXT:    kshiftrd $31, %k4, %k0
; AVX512BW-NEXT:    kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $2, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k3, %k3
; AVX512BW-NEXT:    kshiftlw $14, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k3, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    korw %k1, %k0, %k1
; AVX512BW-NEXT:    vmovdqa32 832(%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT:    kshiftrd $27, %k4, %k1
; AVX512BW-NEXT:    kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k3
; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $14, %k3, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k3, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k3, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k7
; AVX512BW-NEXT:    kshiftrd $28, %k4, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k7, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kandw %k5, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k6, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kshiftrw $4, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kmovq %k2, %k4
; AVX512BW-NEXT:    kshiftrw $3, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k5, %k6
; AVX512BW-NEXT:    kmovq %k5, %k7
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT:    kshiftlw $14, %k2, %k5
; AVX512BW-NEXT:    korw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    korw %k7, %k0, %k2
; AVX512BW-NEXT:    vmovdqa32 768(%rsi), %zmm2 {%k2} {z}
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload
; AVX512BW-NEXT:    kshiftrd $25, %k6, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k2, %k5
; AVX512BW-NEXT:    kshiftrd $26, %k6, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $9, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $8, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $7, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $6, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $5, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $4, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kandw %k4, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $3, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k5, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftlw $1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $1, %k2, %k2
; AVX512BW-NEXT:    korw %k6, %k2, %k1
; AVX512BW-NEXT:    vmovdqa32 704(%rsi), %zmm3 {%k1} {z}
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT:    kshiftrd $23, %k2, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k3
; AVX512BW-NEXT:    kshiftrd $22, %k2, %k5
; AVX512BW-NEXT:    kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kmovq %k2, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k2
; AVX512BW-NEXT:    kshiftrw $14, %k3, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $13, %k3, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $12, %k3, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $11, %k3, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kandw %k7, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $10, %k3, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $9, %k3, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $8, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftrd $24, %k6, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k5
; AVX512BW-NEXT:    kshiftrw $7, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $6, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $5, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $4, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $3, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $2, %k5, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftlw $14, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftlw $1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $1, %k2, %k2
; AVX512BW-NEXT:    korw %k0, %k2, %k2
; AVX512BW-NEXT:    vmovdqa32 640(%rsi), %zmm4 {%k2} {z}
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload
; AVX512BW-NEXT:    kshiftrd $20, %k3, %k5
; AVX512BW-NEXT:    kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k5, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k5, %k6
; AVX512BW-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kandw %k1, %k2, %k5
; AVX512BW-NEXT:    kshiftrd $21, %k3, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $12, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $11, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $10, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kandw %k7, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $9, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $8, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $7, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $6, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k5, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k2, %k5
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload
; AVX512BW-NEXT:    kshiftlw $15, %k7, %k2
; AVX512BW-NEXT:    kshiftrw $5, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $4, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $3, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $2, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kandw %k4, %k5, %k5
; AVX512BW-NEXT:    kshiftlw $14, %k7, %k1
; AVX512BW-NEXT:    korw %k1, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 576(%rsi), %zmm5 {%k1} {z}
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT:    kshiftrd $18, %k2, %k4
; AVX512BW-NEXT:    kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k4, %k5
; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
; AVX512BW-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $14, %k4, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $13, %k4, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $12, %k4, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $11, %k4, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k6
; AVX512BW-NEXT:    kshiftrd $19, %k2, %k5
; AVX512BW-NEXT:    kshiftlw $15, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $10, %k5, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $9, %k5, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $8, %k5, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $7, %k5, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $6, %k5, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $5, %k5, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kandw %k3, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $4, %k5, %k5
; AVX512BW-NEXT:    korw %k5, %k6, %k5
; AVX512BW-NEXT:    kandw %k0, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kshiftrw $3, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $2, %k3, %k6
; AVX512BW-NEXT:    kmovq %k3, %k7
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k5, %k5
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
; AVX512BW-NEXT:    kshiftlw $14, %k0, %k3
; AVX512BW-NEXT:    korw %k3, %k5, %k3
; AVX512BW-NEXT:    kshiftlw $1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $1, %k3, %k3
; AVX512BW-NEXT:    korw %k7, %k3, %k3
; AVX512BW-NEXT:    vmovdqa32 512(%rsi), %zmm6 {%k3} {z}
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT:    kshiftrd $16, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k3, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k3
; AVX512BW-NEXT:    kshiftrd $17, %k1, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k3, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k2
; AVX512BW-NEXT:    korw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    korw %k1, %k0, %k1
; AVX512BW-NEXT:    vmovdqa32 448(%rsi), %zmm7 {%k1} {z}
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
; AVX512BW-NEXT:    kshiftrd $13, %k0, %k2
; AVX512BW-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kandw %k6, %k2, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $14, %k2, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    kandw %k7, %k1, %k3
; AVX512BW-NEXT:    kshiftrd $14, %k0, %k1
; AVX512BW-NEXT:    kmovq %k0, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k1, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $12, %k1, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $11, %k1, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k5
; AVX512BW-NEXT:    korw %k5, %k3, %k3
; AVX512BW-NEXT:    kandw %k4, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k3, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k5
; AVX512BW-NEXT:    kshiftrd $15, %k6, %k3
; AVX512BW-NEXT:    kmovq %k6, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $2, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k5, %k5
; AVX512BW-NEXT:    kshiftlw $14, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k5, %k3
; AVX512BW-NEXT:    kshiftlw $1, %k3, %k3
; AVX512BW-NEXT:    kshiftrw $1, %k3, %k3
; AVX512BW-NEXT:    korw %k1, %k3, %k1
; AVX512BW-NEXT:    vmovdqa32 384(%rsi), %zmm8 {%k1} {z}
; AVX512BW-NEXT:    kmovq %k0, %k3
; AVX512BW-NEXT:    kshiftrd $11, %k0, %k0
; AVX512BW-NEXT:    kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k5
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kandw %k7, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k5, %k6
; AVX512BW-NEXT:    kshiftrd $12, %k3, %k5
; AVX512BW-NEXT:    kshiftlw $15, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $11, %k5, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $10, %k5, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $9, %k5, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $8, %k5, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $7, %k5, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kandw %k4, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $6, %k5, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $5, %k5, %k5
; AVX512BW-NEXT:    korw %k5, %k6, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kshiftrw $4, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $3, %k3, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kandw %k2, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $2, %k3, %k6
; AVX512BW-NEXT:    kmovq %k3, %k0
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k5, %k5
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT:    kshiftlw $14, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k5, %k2
; AVX512BW-NEXT:    kshiftlw $1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $1, %k2, %k2
; AVX512BW-NEXT:    korw %k0, %k2, %k2
; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm9 {%k2} {z}
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload
; AVX512BW-NEXT:    kshiftrd $9, %k6, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k5
; AVX512BW-NEXT:    kshiftrd $10, %k6, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $9, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $8, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $7, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $6, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kandw %k4, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $5, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $4, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $3, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k5, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kshiftrw $2, %k4, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k2, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k4, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm10 {%k1} {z}
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 4-byte Reload
; AVX512BW-NEXT:    kshiftrd $7, %k4, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k3
; AVX512BW-NEXT:    kshiftrd $6, %k4, %k2
; AVX512BW-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $14, %k3, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $13, %k3, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $12, %k3, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kandw %k7, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $11, %k3, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $10, %k3, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $9, %k3, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $8, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrd $8, %k4, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k5
; AVX512BW-NEXT:    kshiftrw $7, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $6, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $5, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $4, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $3, %k5, %k6
; AVX512BW-NEXT:    korw %k6, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $2, %k5, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k2, %k2
; AVX512BW-NEXT:    kshiftlw $14, %k3, %k3
; AVX512BW-NEXT:    korw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftlw $1, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $1, %k2, %k2
; AVX512BW-NEXT:    korw %k0, %k2, %k2
; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm11 {%k2} {z}
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload
; AVX512BW-NEXT:    kshiftrd $4, %k6, %k3
; AVX512BW-NEXT:    kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k3, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k3, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k5
; AVX512BW-NEXT:    korw %k5, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k2, %k5
; AVX512BW-NEXT:    kshiftrd $5, %k6, %k2
; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $12, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $11, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $10, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $9, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $8, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $7, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kandw %k1, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $6, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k5, %k2
; AVX512BW-NEXT:    kandw %k4, %k2, %k5
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k2
; AVX512BW-NEXT:    kshiftrw $5, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kandw %k7, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $4, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $3, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k5, %k5
; AVX512BW-NEXT:    kshiftrw $2, %k2, %k6
; AVX512BW-NEXT:    korw %k6, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k5, %k5
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm12 {%k1} {z}
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kshiftrw $14, %k4, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k4, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k4, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k4, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k2
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT:    kshiftrd $3, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k1, %k4
; AVX512BW-NEXT:    korw %k4, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $9, %k1, %k4
; AVX512BW-NEXT:    korw %k4, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $8, %k1, %k4
; AVX512BW-NEXT:    korw %k4, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $7, %k1, %k4
; AVX512BW-NEXT:    korw %k4, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $6, %k1, %k4
; AVX512BW-NEXT:    korw %k4, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k4
; AVX512BW-NEXT:    korw %k4, %k2, %k2
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k2, %k2
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k2, %k1
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kandw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT:    kshiftlw $14, %k2, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k0, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm13 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa64 %zmm13, 64(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm12, 128(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm11, 192(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm9, 320(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm8, 384(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm7, 448(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm6, 512(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm5, 576(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm4, 640(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm3, 704(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm2, 768(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm1, 832(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  %data = call <224 x i32> @llvm.masked.load.v224i32.p0(ptr %in.vec, i32 64, <224 x i1> %tgt.mask, <224 x i32> poison)
  store <224 x i32> %data, ptr %out.vec, align 64
  ret void
}

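; NOTE: Hand-written summary, not autogenerated: the vf64 variant below follows
; the same pattern as the preceding vf32 case, replicating a <64 x i1> source
; mask 7x (presumably into a <448 x i1> target mask, by analogy with the
; <32 x i1> -> <224 x i1> shuffle above) and masked-loading 28 zmm registers'
; worth of i32 data (28 x 64-byte stores at offsets 0..1728). AVX512F-ONLY and
; AVX512DQ lower this by materializing the mask as a sign-extended vector
; (vpternlogd/vpmovm2d), permuting it with vpermd through a set of replication
; index vectors, and converting back to k-registers with vptestmd/vpmovd2m.
; AVX512BW has no such shortcut here and instead splices each 16-bit mask chunk
; together bit-by-bit with kshiftlw/kshiftrw/korw/kandw against spilled
; single-bit-clear constants (0xFFFD, 0xFFFB, ...), which is why its check
; block is so much longer.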
define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor7_vf64:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm5, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    movw $1, %ax
; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm8, %zmm8, %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT:    kmovw 4(%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm9, %zmm9, %zmm9 {%k1} {z}
; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm11, %zmm11, %zmm11 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm13, %zmm0
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm15 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm15, %zmm2
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm16, %zmm3
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm17, %zmm4
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm18, %zmm6
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm19, %zmm7
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm5, %zmm8
; AVX512F-ONLY-NEXT:    vpermd %zmm9, %zmm13, %zmm10
; AVX512F-ONLY-NEXT:    vpermd %zmm9, %zmm15, %zmm12
; AVX512F-ONLY-NEXT:    vpermd %zmm9, %zmm16, %zmm14
; AVX512F-ONLY-NEXT:    vpermd %zmm9, %zmm17, %zmm20
; AVX512F-ONLY-NEXT:    vpermd %zmm9, %zmm18, %zmm21
; AVX512F-ONLY-NEXT:    vpermd %zmm9, %zmm19, %zmm22
; AVX512F-ONLY-NEXT:    vpermd %zmm9, %zmm5, %zmm23
; AVX512F-ONLY-NEXT:    vpermd %zmm11, %zmm13, %zmm24
; AVX512F-ONLY-NEXT:    vpermd %zmm11, %zmm15, %zmm25
; AVX512F-ONLY-NEXT:    vpermd %zmm11, %zmm16, %zmm26
; AVX512F-ONLY-NEXT:    vpermd %zmm11, %zmm17, %zmm27
; AVX512F-ONLY-NEXT:    vpermd %zmm11, %zmm18, %zmm28
; AVX512F-ONLY-NEXT:    vpermd %zmm11, %zmm5, %zmm29
; AVX512F-ONLY-NEXT:    vpermd %zmm11, %zmm19, %zmm30
; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm13, %zmm31
; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm15, %zmm15
; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm16, %zmm13
; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm17, %zmm11
; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm18, %zmm9
; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm19, %zmm5
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm9, %zmm9, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm9 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm11, %zmm11, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm11 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm13, %zmm13, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm13 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm15, %zmm15, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm15 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm31, %zmm31, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm16 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm29, %zmm29, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm17 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm30, %zmm30, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 512(%rsi), %zmm18 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm28, %zmm28, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 576(%rsi), %zmm19 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm27, %zmm27, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 640(%rsi), %zmm27 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm26, %zmm26, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 704(%rsi), %zmm26 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm25, %zmm25, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 768(%rsi), %zmm25 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm24, %zmm24, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 832(%rsi), %zmm24 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm23, %zmm23, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 896(%rsi), %zmm23 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm22, %zmm22, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 960(%rsi), %zmm22 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm21, %zmm21, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1024(%rsi), %zmm21 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm20, %zmm20, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1088(%rsi), %zmm20 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm14, %zmm14, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1152(%rsi), %zmm14 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm12, %zmm12, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1216(%rsi), %zmm12 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm10, %zmm10, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1280(%rsi), %zmm10 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm8, %zmm8, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1344(%rsi), %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm7, %zmm7, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1408(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1472(%rsi), %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1536(%rsi), %zmm4 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1600(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1664(%rsi), %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1728(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 1728(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 1664(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 1600(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 1536(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 1472(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm7, 1408(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm8, 1344(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm10, 1280(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm12, 1216(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm14, 1152(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm20, 1088(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm21, 1024(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm22, 960(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm23, 896(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm24, 832(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm25, 768(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm26, 704(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm27, 640(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm19, 576(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm18, 512(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm17, 448(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm16, 384(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm15, 320(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm13, 256(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm11, 192(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm9, 128(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 64(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor7_vf64:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512DQ-NEXT:    vpermd %zmm1, %zmm5, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    movw $1, %ax
; AVX512DQ-NEXT:    kmovw %eax, %k1
; AVX512DQ-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512DQ-NEXT:    kmovw 6(%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm8
; AVX512DQ-NEXT:    kmovw 4(%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm9
; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm11
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm13, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm15 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm15, %zmm2
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm16, %zmm3
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm17, %zmm4
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm18, %zmm6
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm19, %zmm7
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm5, %zmm8
; AVX512DQ-NEXT:    vpermd %zmm9, %zmm13, %zmm10
; AVX512DQ-NEXT:    vpermd %zmm9, %zmm15, %zmm12
; AVX512DQ-NEXT:    vpermd %zmm9, %zmm16, %zmm14
; AVX512DQ-NEXT:    vpermd %zmm9, %zmm17, %zmm20
; AVX512DQ-NEXT:    vpermd %zmm9, %zmm18, %zmm21
; AVX512DQ-NEXT:    vpermd %zmm9, %zmm19, %zmm22
; AVX512DQ-NEXT:    vpermd %zmm9, %zmm5, %zmm23
; AVX512DQ-NEXT:    vpermd %zmm11, %zmm13, %zmm24
; AVX512DQ-NEXT:    vpermd %zmm11, %zmm15, %zmm25
; AVX512DQ-NEXT:    vpermd %zmm11, %zmm16, %zmm26
; AVX512DQ-NEXT:    vpermd %zmm11, %zmm17, %zmm27
; AVX512DQ-NEXT:    vpermd %zmm11, %zmm18, %zmm28
; AVX512DQ-NEXT:    vpermd %zmm11, %zmm5, %zmm29
; AVX512DQ-NEXT:    vpermd %zmm11, %zmm19, %zmm30
; AVX512DQ-NEXT:    vpermd %zmm1, %zmm13, %zmm31
; AVX512DQ-NEXT:    vpermd %zmm1, %zmm15, %zmm15
; AVX512DQ-NEXT:    vpermd %zmm1, %zmm16, %zmm13
; AVX512DQ-NEXT:    vpermd %zmm1, %zmm17, %zmm11
; AVX512DQ-NEXT:    vpermd %zmm1, %zmm18, %zmm9
; AVX512DQ-NEXT:    vpermd %zmm1, %zmm19, %zmm5
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm5, %k1
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm9, %k1
; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm9 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm11, %k1
; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm11 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm13, %k1
; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm13 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm15, %k1
; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm15 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm31, %k1
; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm16 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm29, %k1
; AVX512DQ-NEXT:    vmovdqa32 448(%rsi), %zmm17 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm30, %k1
; AVX512DQ-NEXT:    vmovdqa32 512(%rsi), %zmm18 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm28, %k1
; AVX512DQ-NEXT:    vmovdqa32 576(%rsi), %zmm19 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm27, %k1
; AVX512DQ-NEXT:    vmovdqa32 640(%rsi), %zmm27 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm26, %k1
; AVX512DQ-NEXT:    vmovdqa32 704(%rsi), %zmm26 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm25, %k1
; AVX512DQ-NEXT:    vmovdqa32 768(%rsi), %zmm25 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm24, %k1
; AVX512DQ-NEXT:    vmovdqa32 832(%rsi), %zmm24 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm23, %k1
; AVX512DQ-NEXT:    vmovdqa32 896(%rsi), %zmm23 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm22, %k1
; AVX512DQ-NEXT:    vmovdqa32 960(%rsi), %zmm22 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm21, %k1
; AVX512DQ-NEXT:    vmovdqa32 1024(%rsi), %zmm21 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm20, %k1
; AVX512DQ-NEXT:    vmovdqa32 1088(%rsi), %zmm20 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm14, %k1
; AVX512DQ-NEXT:    vmovdqa32 1152(%rsi), %zmm14 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm12, %k1
; AVX512DQ-NEXT:    vmovdqa32 1216(%rsi), %zmm12 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm10, %k1
; AVX512DQ-NEXT:    vmovdqa32 1280(%rsi), %zmm10 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm8, %k1
; AVX512DQ-NEXT:    vmovdqa32 1344(%rsi), %zmm8 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm7, %k1
; AVX512DQ-NEXT:    vmovdqa32 1408(%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm6, %k1
; AVX512DQ-NEXT:    vmovdqa32 1472(%rsi), %zmm6 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k1
; AVX512DQ-NEXT:    vmovdqa32 1536(%rsi), %zmm4 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k1
; AVX512DQ-NEXT:    vmovdqa32 1600(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT:    vmovdqa32 1664(%rsi), %zmm2 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT:    vmovdqa32 1728(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 1728(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 1664(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 1600(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 1536(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 1472(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 1408(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 1344(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 1280(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm12, 1216(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 1152(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm20, 1088(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm21, 1024(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm22, 960(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm23, 896(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm24, 832(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm25, 768(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm26, 704(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm27, 640(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm19, 576(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm18, 512(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm17, 448(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 384(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm15, 320(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm13, 256(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm11, 192(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 128(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 64(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor7_vf64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    movw $-3, %ax
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kmovw (%rdi), %k0
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-5, %ax
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kmovq %k2, %k3
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-9, %ax
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-17, %ax
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-33, %ax
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-65, %ax
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    movw $-129, %ax
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kmovq (%rdi), %k4
; AVX512BW-NEXT:    kshiftrq $1, %k4, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-257, %ax # imm = 0xFEFF
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-513, %ax # imm = 0xFDFF
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-1025, %ax # imm = 0xFBFF
; AVX512BW-NEXT:    kmovd %eax, %k5
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-2049, %ax # imm = 0xF7FF
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-4097, %ax # imm = 0xEFFF
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    movw $-8193, %ax # imm = 0xDFFF
; AVX512BW-NEXT:    kmovd %eax, %k2
; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    movw $-16385, %ax # imm = 0xBFFF
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $2, %k4, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k0, %k6
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k6} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k7, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k7, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k7, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k7, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kmovq %k4, %k7
; AVX512BW-NEXT:    kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512BW-NEXT:    kshiftrq $3, %k4, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $4, %k7, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k0, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT:    kandw %k2, %k6, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k1
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $5, %k7, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k6
; AVX512BW-NEXT:    kshiftrq $6, %k7, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kandw %k5, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $2, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k6, %k6
; AVX512BW-NEXT:    kshiftlw $14, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kshiftlw $1, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $1, %k6, %k6
; AVX512BW-NEXT:    korw %k1, %k6, %k1
; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k1} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $7, %k4, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $8, %k4, %k0
; AVX512BW-NEXT:    kmovq %k4, %k5
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k6
; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $9, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm3 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $10, %k5, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $11, %k5, %k6
; AVX512BW-NEXT:    kmovq %k5, %k4
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k0, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm4 {%k1} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k6, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k1
; AVX512BW-NEXT:    kmovq %k4, %k7
; AVX512BW-NEXT:    kshiftrq $12, %k4, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k6
; AVX512BW-NEXT:    kshiftrq $13, %k7, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $2, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kandw %k5, %k6, %k6
; AVX512BW-NEXT:    kshiftlw $14, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kshiftlw $1, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $1, %k6, %k6
; AVX512BW-NEXT:    korw %k1, %k6, %k6
; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm5 {%k6} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $14, %k5, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k6
; AVX512BW-NEXT:    kshiftrq $15, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k6, %k6
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k6, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k0, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
; AVX512BW-NEXT:    kshiftrq $16, %k5, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $17, %k5, %k0
; AVX512BW-NEXT:    kmovq %k5, %k7
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $18, %k7, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 448(%rsi), %zmm7 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $19, %k7, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $20, %k7, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k0, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 512(%rsi), %zmm8 {%k1} {z}
; AVX512BW-NEXT:    kandw %k5, %k6, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $21, %k7, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k6
; AVX512BW-NEXT:    kshiftrq $22, %k7, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $2, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k6, %k6
; AVX512BW-NEXT:    kshiftlw $14, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kshiftlw $1, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $1, %k6, %k6
; AVX512BW-NEXT:    korw %k1, %k6, %k1
; AVX512BW-NEXT:    vmovdqa32 576(%rsi), %zmm9 {%k1} {z}
; AVX512BW-NEXT:    kandw %k5, %k0, %k1
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $23, %k7, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $24, %k7, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k6
; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $25, %k2, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 640(%rsi), %zmm10 {%k7} {z}
; AVX512BW-NEXT:    kandw %k5, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kmovq %k2, %k7
; AVX512BW-NEXT:    kshiftrq $26, %k2, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $27, %k7, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k0, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 704(%rsi), %zmm11 {%k1} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k6, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k1
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $28, %k7, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k6
; AVX512BW-NEXT:    kshiftrq $29, %k7, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kandw %k5, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $2, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k6, %k6
; AVX512BW-NEXT:    kshiftlw $14, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kshiftlw $1, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $1, %k6, %k6
; AVX512BW-NEXT:    korw %k1, %k6, %k6
; AVX512BW-NEXT:    vmovdqa32 768(%rsi), %zmm12 {%k6} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $30, %k5, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k6
; AVX512BW-NEXT:    kshiftrq $31, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kandw %k3, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k6, %k6
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k6, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k0, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 832(%rsi), %zmm13 {%k1} {z}
; AVX512BW-NEXT:    kshiftrq $32, %k5, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $33, %k5, %k0
; AVX512BW-NEXT:    kmovq %k5, %k7
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kmovq %k7, %k3
; AVX512BW-NEXT:    kshiftrq $34, %k7, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 896(%rsi), %zmm14 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $35, %k3, %k0
; AVX512BW-NEXT:    kmovq %k3, %k7
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $36, %k7, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k0, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 960(%rsi), %zmm15 {%k1} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k6, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k1
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $37, %k7, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kandw %k5, %k0, %k6
; AVX512BW-NEXT:    kshiftrq $38, %k7, %k0
; AVX512BW-NEXT:    kmovq %k7, %k5
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $2, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k6, %k6
; AVX512BW-NEXT:    kshiftlw $14, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kshiftlw $1, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $1, %k6, %k6
; AVX512BW-NEXT:    korw %k1, %k6, %k1
; AVX512BW-NEXT:    vmovdqa32 1024(%rsi), %zmm16 {%k1} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kmovq %k5, %k7
; AVX512BW-NEXT:    kshiftrq $39, %k5, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $40, %k7, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k6
; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $41, %k4, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 1088(%rsi), %zmm17 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $42, %k4, %k0
; AVX512BW-NEXT:    kmovq %k4, %k3
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $43, %k3, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k0, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 1152(%rsi), %zmm18 {%k1} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k6, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kmovq %k3, %k7
; AVX512BW-NEXT:    kshiftrq $44, %k3, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k6
; AVX512BW-NEXT:    kshiftrq $45, %k7, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $2, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k6, %k6
; AVX512BW-NEXT:    kshiftlw $14, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kshiftlw $1, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $1, %k6, %k6
; AVX512BW-NEXT:    korw %k1, %k6, %k6
; AVX512BW-NEXT:    vmovdqa32 1216(%rsi), %zmm19 {%k6} {z}
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $46, %k5, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k6
; AVX512BW-NEXT:    kshiftrq $47, %k5, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kandw %k4, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k6, %k6
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k6, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k0, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 1280(%rsi), %zmm20 {%k1} {z}
; AVX512BW-NEXT:    kshiftrq $48, %k5, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $49, %k5, %k0
; AVX512BW-NEXT:    kmovq %k5, %k7
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrq $50, %k7, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 1344(%rsi), %zmm21 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k4, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $51, %k7, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $52, %k7, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k0, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 1408(%rsi), %zmm22 {%k1} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k6, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kandw %k2, %k0, %k1
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $53, %k7, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k6
; AVX512BW-NEXT:    kshiftrq $54, %k7, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $2, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k6, %k6
; AVX512BW-NEXT:    kshiftlw $14, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kshiftlw $1, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $1, %k6, %k6
; AVX512BW-NEXT:    korw %k1, %k6, %k1
; AVX512BW-NEXT:    vmovdqa32 1472(%rsi), %zmm23 {%k1} {z}
; AVX512BW-NEXT:    kandw %k3, %k0, %k1
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $55, %k7, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $56, %k7, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k6
; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $57, %k4, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
; AVX512BW-NEXT:    korw %k6, %k0, %k7
; AVX512BW-NEXT:    vmovdqa32 1536(%rsi), %zmm24 {%k7} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $10, %k6, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k0, %k1
; AVX512BW-NEXT:    kmovq %k4, %k7
; AVX512BW-NEXT:    kshiftrq $58, %k4, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $59, %k7, %k6
; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k6, %k7
; AVX512BW-NEXT:    korw %k7, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k0, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 1600(%rsi), %zmm25 {%k1} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k6, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k5, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
; AVX512BW-NEXT:    kshiftrq $60, %k5, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k4, %k0, %k6
; AVX512BW-NEXT:    kshiftrq $61, %k5, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $3, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $2, %k1, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k7, %k6, %k6
; AVX512BW-NEXT:    kshiftlw $14, %k0, %k7
; AVX512BW-NEXT:    korw %k7, %k6, %k6
; AVX512BW-NEXT:    kshiftlw $1, %k6, %k6
; AVX512BW-NEXT:    kshiftrw $1, %k6, %k6
; AVX512BW-NEXT:    korw %k1, %k6, %k6
; AVX512BW-NEXT:    vmovdqa32 1664(%rsi), %zmm26 {%k6} {z}
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $14, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k0
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftrq $62, %k5, %k0
; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k6, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $7, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k0
; AVX512BW-NEXT:    kshiftrq $63, %k5, %k5
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k1, %k0, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k5, %k0
; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k3, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $5, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k4, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $4, %k0, %k6
; AVX512BW-NEXT:    korw %k6, %k1, %k1
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $3, %k0, %k4
; AVX512BW-NEXT:    korw %k4, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $2, %k0, %k3
; AVX512BW-NEXT:    korw %k3, %k1, %k1
; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT:    kandw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $14, %k5, %k2
; AVX512BW-NEXT:    korw %k2, %k1, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
; AVX512BW-NEXT:    korw %k0, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 1728(%rsi), %zmm27 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa64 %zmm27, 1728(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm26, 1664(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm25, 1600(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm24, 1536(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm23, 1472(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm22, 1408(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm21, 1344(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm20, 1280(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm19, 1216(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm18, 1152(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm17, 1088(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm16, 1024(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm15, 960(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm14, 896(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm13, 832(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm12, 768(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm11, 704(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm10, 640(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm9, 576(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm8, 512(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm7, 448(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm6, 384(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm5, 320(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm4, 256(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask = load <64 x i1>, ptr %in.maskvec, align 64
  %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
  %data = call <448 x i32> @llvm.masked.load.v448i32.p0(ptr %in.vec, i32 64, <448 x i1> %tgt.mask, <448 x i32> poison)
  store <448 x i32> %data, ptr %out.vec, align 64
  ret void
}

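; Replication factor 8, vf2: each of the first 2 mask bits is repeated 8x to
; give a <16 x i1> mask for a single <16 x i32> masked load.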
define void @mask_replication_factor8_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor8_vf2:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor8_vf2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor8_vf2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovw (%rdi), %k1
; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
  %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %data = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr %in.vec, i32 64, <16 x i1> %tgt.mask, <16 x i32> poison)
  store <16 x i32> %data, ptr %out.vec, align 64
  ret void
}

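; Replication factor 8, vf4: 4 mask bits become a <32 x i1> mask; the AVX512BW
; path builds the whole mask as a word mask and splits the two 16-bit halves
; with kshiftrd.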
define void @mask_replication_factor8_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor8_vf4:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor8_vf4:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k2
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor8_vf4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovd (%rdi), %k0
; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpmovw2m %zmm0, %k1
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %data = call <32 x i32> @llvm.masked.load.v32i32.p0(ptr %in.vec, i32 64, <32 x i1> %tgt.mask, <32 x i32> poison)
  store <32 x i32> %data, ptr %out.vec, align 64
  ret void
}

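; Replication factor 8, vf8: 8 mask bits become a <64 x i1> mask; AVX512BW uses
; an in-lane byte shuffle (vpshufb) after broadcasting the low lanes, while
; AVX512VBMI replaces that with a single cross-lane vpermb.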
define void @mask_replication_factor8_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor8_vf8:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k3
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k4
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k3} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor8_vf8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k2
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k3
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k4
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k3} {z}
; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-ONLY-LABEL: mask_replication_factor8_vf8:
; AVX512BW-ONLY:       # %bb.0:
; AVX512BW-ONLY-NEXT:    kmovq (%rdi), %k0
; AVX512BW-ONLY-NEXT:    vpmovm2b %k0, %zmm0
; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-ONLY-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55]
; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k1
; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-ONLY-NEXT:    vzeroupper
; AVX512BW-ONLY-NEXT:    retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor8_vf8:
; AVX512VBMI-ONLY:       # %bb.0:
; AVX512VBMI-ONLY-NEXT:    kmovq (%rdi), %k0
; AVX512VBMI-ONLY-NEXT:    vpmovm2b %k0, %zmm0
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k1
; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512VBMI-ONLY-NEXT:    vzeroupper
; AVX512VBMI-ONLY-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %data = call <64 x i32> @llvm.masked.load.v64i32.p0(ptr %in.vec, i32 64, <64 x i1> %tgt.mask, <64 x i32> poison)
  store <64 x i32> %data, ptr %out.vec, align 64
  ret void
}

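; Replication factor 8, vf16: 16 mask bits become a <128 x i1> mask feeding
; eight <16 x i32> loads; the AVX512F/DQ paths need eight mask registers and
; spill one to the stack.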
define void @mask_replication_factor8_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor8_vf16:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k3
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k4
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k5
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k6
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k7
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k7} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k6} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512F-ONLY-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor8_vf16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k2
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k3
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k4
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k5
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k6
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k7
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k7} {z}
; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k6} {z}
; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
; AVX512DQ-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor8_vf16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovw (%rdi), %k0
; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm0[8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,26,26,26,26,26,26,26,26,27,27,27,27,27,27,27,27,44,44,44,44,44,44,44,44,45,45,45,45,45,45,45,45,62,62,62,62,62,62,62,62,63,63,63,63,63,63,63,63]
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55]
; AVX512BW-NEXT:    vpmovb2m %zmm0, %k2
; AVX512BW-NEXT:    kshiftrd $16, %k2, %k3
; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512BW-NEXT:    kshiftrq $32, %k2, %k2
; AVX512BW-NEXT:    kshiftrd $16, %k2, %k3
; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
; AVX512BW-NEXT:    kshiftrq $32, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
; AVX512BW-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512BW-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %data = call <128 x i32> @llvm.masked.load.v128i32.p0(ptr %in.vec, i32 64, <128 x i1> %tgt.mask, <128 x i32> poison)
  store <128 x i32> %data, ptr %out.vec, align 64
  ret void
}

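; Replication factor 8, vf32: 32 mask bits become a <256 x i1> mask; the
; AVX512F/DQ paths materialize the eight permute-index vectors once and reuse
; them for both 16-bit halves of the source mask.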
define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor8_vf32:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm2, %zmm0
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm3, %zmm4
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm5, %zmm6
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm7, %zmm8
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm9, %zmm10
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm11, %zmm12
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm13 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm13, %zmm14
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm15, %zmm1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm16, %zmm16, %zmm16 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm2, %zmm2
; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm3, %zmm3
; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm5, %zmm5
; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm7, %zmm7
; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm9, %zmm9
; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm11, %zmm11
; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm13, %zmm13
; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm15, %zmm15
; AVX512F-ONLY-NEXT:    vptestmd %zmm15, %zmm15, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm15 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm13, %zmm13, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm13 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm11, %zmm11, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm11 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm9, %zmm9, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm9 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm7, %zmm7, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 512(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm14, %zmm14, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 576(%rsi), %zmm14 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm12, %zmm12, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 640(%rsi), %zmm12 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm10, %zmm10, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 704(%rsi), %zmm10 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm8, %zmm8, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 768(%rsi), %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 832(%rsi), %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 896(%rsi), %zmm4 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 960(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 960(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 896(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 832(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm8, 768(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm10, 704(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm12, 640(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm14, 576(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 512(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 448(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 384(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 320(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm7, 256(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm9, 192(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm13, 64(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm15, (%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor8_vf32:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT:    vpermd %zmm1, %zmm2, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
; AVX512DQ-NEXT:    vpermd %zmm1, %zmm3, %zmm4
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
; AVX512DQ-NEXT:    vpermd %zmm1, %zmm5, %zmm6
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
; AVX512DQ-NEXT:    vpermd %zmm1, %zmm7, %zmm8
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512DQ-NEXT:    vpermd %zmm1, %zmm9, %zmm10
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
; AVX512DQ-NEXT:    vpermd %zmm1, %zmm11, %zmm12
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm13 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512DQ-NEXT:    vpermd %zmm1, %zmm13, %zmm14
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512DQ-NEXT:    vpermd %zmm1, %zmm15, %zmm1
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm16
; AVX512DQ-NEXT:    vpermd %zmm16, %zmm2, %zmm2
; AVX512DQ-NEXT:    vpermd %zmm16, %zmm3, %zmm3
; AVX512DQ-NEXT:    vpermd %zmm16, %zmm5, %zmm5
; AVX512DQ-NEXT:    vpermd %zmm16, %zmm7, %zmm7
; AVX512DQ-NEXT:    vpermd %zmm16, %zmm9, %zmm9
; AVX512DQ-NEXT:    vpermd %zmm16, %zmm11, %zmm11
; AVX512DQ-NEXT:    vpermd %zmm16, %zmm13, %zmm13
; AVX512DQ-NEXT:    vpermd %zmm16, %zmm15, %zmm15
; AVX512DQ-NEXT:    vpmovd2m %zmm15, %k1
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm15 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm13, %k1
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm13 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm11, %k1
; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm11 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm9, %k1
; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm9 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm7, %k1
; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm5, %k1
; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k1
; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT:    vmovdqa32 448(%rsi), %zmm2 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT:    vmovdqa32 512(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm14, %k1
; AVX512DQ-NEXT:    vmovdqa32 576(%rsi), %zmm14 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm12, %k1
; AVX512DQ-NEXT:    vmovdqa32 640(%rsi), %zmm12 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm10, %k1
; AVX512DQ-NEXT:    vmovdqa32 704(%rsi), %zmm10 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm8, %k1
; AVX512DQ-NEXT:    vmovdqa32 768(%rsi), %zmm8 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm6, %k1
; AVX512DQ-NEXT:    vmovdqa32 832(%rsi), %zmm6 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k1
; AVX512DQ-NEXT:    vmovdqa32 896(%rsi), %zmm4 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT:    vmovdqa32 960(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 960(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 896(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 832(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 768(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 704(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm12, 640(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 576(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 512(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 448(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 384(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 320(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 256(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 192(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm13, 64(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm15, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor8_vf32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovd (%rdi), %k0
; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpshufb %zmm2, %zmm1, %zmm3
; AVX512BW-NEXT:    vpmovb2m %zmm3, %k1
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k2
; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-NEXT:    vpshufb %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k3
; AVX512BW-NEXT:    vpshufb %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovb2m %zmm0, %k4
; AVX512BW-NEXT:    kshiftrd $16, %k4, %k5
; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k5} {z}
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k4} {z}
; AVX512BW-NEXT:    kshiftrq $32, %k4, %k4
; AVX512BW-NEXT:    kshiftrd $16, %k4, %k5
; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k5} {z}
; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k4} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k3, %k4
; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
; AVX512BW-NEXT:    kshiftrq $32, %k3, %k3
; AVX512BW-NEXT:    kshiftrd $16, %k3, %k4
; AVX512BW-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k4} {z}
; AVX512BW-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k3} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k2, %k3
; AVX512BW-NEXT:    vmovdqa32 576(%rsi), %zmm8 {%k3} {z}
; AVX512BW-NEXT:    vmovdqa32 512(%rsi), %zmm9 {%k2} {z}
; AVX512BW-NEXT:    kshiftrq $32, %k2, %k2
; AVX512BW-NEXT:    kshiftrd $16, %k2, %k3
; AVX512BW-NEXT:    vmovdqa32 704(%rsi), %zmm10 {%k3} {z}
; AVX512BW-NEXT:    vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
; AVX512BW-NEXT:    vmovdqa32 832(%rsi), %zmm12 {%k2} {z}
; AVX512BW-NEXT:    vmovdqa32 768(%rsi), %zmm13 {%k1} {z}
; AVX512BW-NEXT:    kshiftrq $32, %k1, %k1
; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
; AVX512BW-NEXT:    vmovdqa32 960(%rsi), %zmm14 {%k2} {z}
; AVX512BW-NEXT:    vmovdqa32 896(%rsi), %zmm15 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa64 %zmm15, 896(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm14, 960(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm13, 768(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm12, 832(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm11, 640(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm10, 704(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm9, 512(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm8, 576(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  %data = call <256 x i32> @llvm.masked.load.v256i32.p0(ptr %in.vec, i32 64, <256 x i1> %tgt.mask, <256 x i32> poison)
  store <256 x i32> %data, ptr %out.vec, align 64
  ret void
}

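; Full-width case: every one of the 64 mask bits is replicated into 8
; consecutive lanes, producing a <512 x i1> mask for a 2048-byte
; (<512 x i32>) masked load whose result is stored back out unmasked.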
define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor8_vf64:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    subq $136, %rsp
; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT:    kmovw 4(%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm8, %zmm8, %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm10, %zmm10, %zmm10 {%k1} {z}
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm12 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
; AVX512F-ONLY-NEXT:    vpermd %zmm6, %zmm12, %zmm0
; AVX512F-ONLY-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
; AVX512F-ONLY-NEXT:    vpermd %zmm6, %zmm14, %zmm0
; AVX512F-ONLY-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
; AVX512F-ONLY-NEXT:    vpermd %zmm6, %zmm16, %zmm0
; AVX512F-ONLY-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
; AVX512F-ONLY-NEXT:    vpermd %zmm6, %zmm18, %zmm0
; AVX512F-ONLY-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512F-ONLY-NEXT:    vpermd %zmm6, %zmm20, %zmm4
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm22 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
; AVX512F-ONLY-NEXT:    vpermd %zmm6, %zmm22, %zmm5
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm24 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512F-ONLY-NEXT:    vpermd %zmm6, %zmm24, %zmm7
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512F-ONLY-NEXT:    vpermd %zmm6, %zmm26, %zmm9
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm12, %zmm11
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm14, %zmm13
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm16, %zmm15
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm18, %zmm17
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm20, %zmm19
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm22, %zmm21
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm24, %zmm23
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm26, %zmm25
; AVX512F-ONLY-NEXT:    vpermd %zmm10, %zmm12, %zmm27
; AVX512F-ONLY-NEXT:    vpermd %zmm10, %zmm14, %zmm28
; AVX512F-ONLY-NEXT:    vpermd %zmm10, %zmm16, %zmm29
; AVX512F-ONLY-NEXT:    vpermd %zmm10, %zmm18, %zmm30
; AVX512F-ONLY-NEXT:    vpermd %zmm10, %zmm20, %zmm31
; AVX512F-ONLY-NEXT:    vpermd %zmm10, %zmm22, %zmm3
; AVX512F-ONLY-NEXT:    vpermd %zmm10, %zmm24, %zmm6
; AVX512F-ONLY-NEXT:    vpermd %zmm10, %zmm26, %zmm2
; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm8, %zmm8, %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm12, %zmm1
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm14, %zmm0
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm16, %zmm16
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm18, %zmm14
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm20, %zmm12
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm22, %zmm10
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm24, %zmm18
; AVX512F-ONLY-NEXT:    vpermd %zmm8, %zmm26, %zmm8
; AVX512F-ONLY-NEXT:    vptestmd %zmm8, %zmm8, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm26 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm18, %zmm18, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm10, %zmm10, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm10 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm12, %zmm12, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm12 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm14, %zmm14, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm14 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm16, %zmm16, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm16 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm18 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm20 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 512(%rsi), %zmm22 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 576(%rsi), %zmm24 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 640(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm31, %zmm31, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 704(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm30, %zmm30, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 768(%rsi), %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm29, %zmm29, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 832(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm28, %zmm28, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 896(%rsi), %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm27, %zmm27, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 960(%rsi), %zmm27 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm25, %zmm25, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1024(%rsi), %zmm25 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm23, %zmm23, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1088(%rsi), %zmm23 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm21, %zmm21, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1152(%rsi), %zmm21 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm19, %zmm19, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1216(%rsi), %zmm19 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm17, %zmm17, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1280(%rsi), %zmm17 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm15, %zmm15, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1344(%rsi), %zmm15 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm13, %zmm13, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1408(%rsi), %zmm13 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm11, %zmm11, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1472(%rsi), %zmm11 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm9, %zmm9, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1536(%rsi), %zmm9 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm7, %zmm7, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1600(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1664(%rsi), %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1728(%rsi), %zmm4 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
; AVX512F-ONLY-NEXT:    vptestmd %zmm28, %zmm28, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1792(%rsi), %zmm28 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
; AVX512F-ONLY-NEXT:    vptestmd %zmm29, %zmm29, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1856(%rsi), %zmm29 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqu64 (%rsp), %zmm30 # 64-byte Reload
; AVX512F-ONLY-NEXT:    vptestmd %zmm30, %zmm30, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1920(%rsi), %zmm30 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
; AVX512F-ONLY-NEXT:    vptestmd %zmm31, %zmm31, %k1
; AVX512F-ONLY-NEXT:    vmovdqa32 1984(%rsi), %zmm31 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm31, 1984(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm30, 1920(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm29, 1856(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm28, 1792(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 1728(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 1664(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm7, 1600(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm9, 1536(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm11, 1472(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm13, 1408(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm15, 1344(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm17, 1280(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm19, 1216(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm21, 1152(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm23, 1088(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm25, 1024(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm27, 960(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 896(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 832(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 768(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 704(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 640(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm24, 576(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm22, 512(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm20, 448(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm18, 384(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm16, 320(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm14, 256(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm12, 192(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm10, 128(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm8, 64(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm26, (%rdx)
; AVX512F-ONLY-NEXT:    addq $136, %rsp
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor8_vf64:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    subq $136, %rsp
; AVX512DQ-NEXT:    kmovw 6(%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm6
; AVX512DQ-NEXT:    kmovw 4(%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm8
; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm10
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm12 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT:    vpermd %zmm6, %zmm12, %zmm0
; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
; AVX512DQ-NEXT:    vpermd %zmm6, %zmm14, %zmm0
; AVX512DQ-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
; AVX512DQ-NEXT:    vpermd %zmm6, %zmm16, %zmm0
; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
; AVX512DQ-NEXT:    vpermd %zmm6, %zmm18, %zmm0
; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512DQ-NEXT:    vpermd %zmm6, %zmm20, %zmm4
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm22 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
; AVX512DQ-NEXT:    vpermd %zmm6, %zmm22, %zmm5
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm24 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512DQ-NEXT:    vpermd %zmm6, %zmm24, %zmm7
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512DQ-NEXT:    vpermd %zmm6, %zmm26, %zmm9
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm12, %zmm11
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm14, %zmm13
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm16, %zmm15
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm18, %zmm17
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm20, %zmm19
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm22, %zmm21
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm24, %zmm23
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm26, %zmm25
; AVX512DQ-NEXT:    vpermd %zmm10, %zmm12, %zmm27
; AVX512DQ-NEXT:    vpermd %zmm10, %zmm14, %zmm28
; AVX512DQ-NEXT:    vpermd %zmm10, %zmm16, %zmm29
; AVX512DQ-NEXT:    vpermd %zmm10, %zmm18, %zmm30
; AVX512DQ-NEXT:    vpermd %zmm10, %zmm20, %zmm31
; AVX512DQ-NEXT:    vpermd %zmm10, %zmm22, %zmm3
; AVX512DQ-NEXT:    vpermd %zmm10, %zmm24, %zmm6
; AVX512DQ-NEXT:    vpermd %zmm10, %zmm26, %zmm2
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm8
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm12, %zmm1
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm14, %zmm0
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm16, %zmm16
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm18, %zmm14
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm20, %zmm12
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm22, %zmm10
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm24, %zmm18
; AVX512DQ-NEXT:    vpermd %zmm8, %zmm26, %zmm8
; AVX512DQ-NEXT:    vpmovd2m %zmm8, %k1
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm26 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm18, %k1
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm8 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm10, %k1
; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm10 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm12, %k1
; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm12 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm14, %k1
; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm14 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm16, %k1
; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm16 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm18 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT:    vmovdqa32 448(%rsi), %zmm20 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT:    vmovdqa32 512(%rsi), %zmm22 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm6, %k1
; AVX512DQ-NEXT:    vmovdqa32 576(%rsi), %zmm24 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k1
; AVX512DQ-NEXT:    vmovdqa32 640(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm31, %k1
; AVX512DQ-NEXT:    vmovdqa32 704(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm30, %k1
; AVX512DQ-NEXT:    vmovdqa32 768(%rsi), %zmm2 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm29, %k1
; AVX512DQ-NEXT:    vmovdqa32 832(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm28, %k1
; AVX512DQ-NEXT:    vmovdqa32 896(%rsi), %zmm6 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm27, %k1
; AVX512DQ-NEXT:    vmovdqa32 960(%rsi), %zmm27 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm25, %k1
; AVX512DQ-NEXT:    vmovdqa32 1024(%rsi), %zmm25 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm23, %k1
; AVX512DQ-NEXT:    vmovdqa32 1088(%rsi), %zmm23 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm21, %k1
; AVX512DQ-NEXT:    vmovdqa32 1152(%rsi), %zmm21 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm19, %k1
; AVX512DQ-NEXT:    vmovdqa32 1216(%rsi), %zmm19 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm17, %k1
; AVX512DQ-NEXT:    vmovdqa32 1280(%rsi), %zmm17 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm15, %k1
; AVX512DQ-NEXT:    vmovdqa32 1344(%rsi), %zmm15 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm13, %k1
; AVX512DQ-NEXT:    vmovdqa32 1408(%rsi), %zmm13 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm11, %k1
; AVX512DQ-NEXT:    vmovdqa32 1472(%rsi), %zmm11 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm9, %k1
; AVX512DQ-NEXT:    vmovdqa32 1536(%rsi), %zmm9 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm7, %k1
; AVX512DQ-NEXT:    vmovdqa32 1600(%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm5, %k1
; AVX512DQ-NEXT:    vmovdqa32 1664(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k1
; AVX512DQ-NEXT:    vmovdqa32 1728(%rsi), %zmm4 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
; AVX512DQ-NEXT:    vpmovd2m %zmm28, %k1
; AVX512DQ-NEXT:    vmovdqa32 1792(%rsi), %zmm28 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
; AVX512DQ-NEXT:    vpmovd2m %zmm29, %k1
; AVX512DQ-NEXT:    vmovdqa32 1856(%rsi), %zmm29 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqu64 (%rsp), %zmm30 # 64-byte Reload
; AVX512DQ-NEXT:    vpmovd2m %zmm30, %k1
; AVX512DQ-NEXT:    vmovdqa32 1920(%rsi), %zmm30 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
; AVX512DQ-NEXT:    vpmovd2m %zmm31, %k1
; AVX512DQ-NEXT:    vmovdqa32 1984(%rsi), %zmm31 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa64 %zmm31, 1984(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm30, 1920(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm29, 1856(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm28, 1792(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 1728(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 1664(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 1600(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 1536(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm11, 1472(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm13, 1408(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm15, 1344(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm17, 1280(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm19, 1216(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm21, 1152(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm23, 1088(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm25, 1024(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm27, 960(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 896(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 832(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 768(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 704(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 640(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm24, 576(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm22, 512(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm20, 448(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm18, 384(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 320(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 256(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm12, 192(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 128(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 64(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm26, (%rdx)
; AVX512DQ-NEXT:    addq $136, %rsp
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor8_vf64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovq (%rdi), %k0
; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[6,7,6,7,6,7,6,7]
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpshufb %zmm2, %zmm1, %zmm7
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm12
; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5]
; AVX512BW-NEXT:    vpshufb %zmm2, %zmm1, %zmm16
; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm15
; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
; AVX512BW-NEXT:    vpshufb %zmm2, %zmm1, %zmm10
; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm5
; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-NEXT:    vpshufb %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT:    vpshufb %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovb2m %zmm0, %k2
; AVX512BW-NEXT:    kshiftrd $16, %k2, %k1
; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512BW-NEXT:    kshiftrq $32, %k2, %k2
; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k2} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k2, %k2
; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm3 {%k2} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
; AVX512BW-NEXT:    vpmovb2m %zmm5, %k2
; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
; AVX512BW-NEXT:    kshiftrq $32, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 448(%rsi), %zmm8 {%k1} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k2, %k1
; AVX512BW-NEXT:    vmovdqa32 576(%rsi), %zmm9 {%k1} {z}
; AVX512BW-NEXT:    vpmovb2m %zmm10, %k1
; AVX512BW-NEXT:    vmovdqa32 512(%rsi), %zmm10 {%k2} {z}
; AVX512BW-NEXT:    kshiftrq $32, %k2, %k2
; AVX512BW-NEXT:    vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k2, %k2
; AVX512BW-NEXT:    vmovdqa32 704(%rsi), %zmm13 {%k2} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
; AVX512BW-NEXT:    vmovdqa32 832(%rsi), %zmm14 {%k2} {z}
; AVX512BW-NEXT:    vpmovb2m %zmm15, %k2
; AVX512BW-NEXT:    vmovdqa32 768(%rsi), %zmm15 {%k1} {z}
; AVX512BW-NEXT:    kshiftrq $32, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 896(%rsi), %zmm17 {%k1} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 960(%rsi), %zmm18 {%k1} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k2, %k1
; AVX512BW-NEXT:    vmovdqa32 1088(%rsi), %zmm19 {%k1} {z}
; AVX512BW-NEXT:    vpmovb2m %zmm16, %k1
; AVX512BW-NEXT:    vmovdqa32 1024(%rsi), %zmm16 {%k2} {z}
; AVX512BW-NEXT:    kshiftrq $32, %k2, %k2
; AVX512BW-NEXT:    vmovdqa32 1152(%rsi), %zmm20 {%k2} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k2, %k2
; AVX512BW-NEXT:    vmovdqa32 1216(%rsi), %zmm21 {%k2} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
; AVX512BW-NEXT:    vmovdqa32 1344(%rsi), %zmm22 {%k2} {z}
; AVX512BW-NEXT:    vpmovb2m %zmm12, %k2
; AVX512BW-NEXT:    vmovdqa32 1280(%rsi), %zmm12 {%k1} {z}
; AVX512BW-NEXT:    kshiftrq $32, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 1408(%rsi), %zmm23 {%k1} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 1472(%rsi), %zmm24 {%k1} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k2, %k1
; AVX512BW-NEXT:    vmovdqa32 1600(%rsi), %zmm25 {%k1} {z}
; AVX512BW-NEXT:    vpmovb2m %zmm7, %k1
; AVX512BW-NEXT:    vmovdqa32 1536(%rsi), %zmm7 {%k2} {z}
; AVX512BW-NEXT:    kshiftrq $32, %k2, %k2
; AVX512BW-NEXT:    vmovdqa32 1664(%rsi), %zmm26 {%k2} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k2, %k2
; AVX512BW-NEXT:    vmovdqa32 1728(%rsi), %zmm27 {%k2} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
; AVX512BW-NEXT:    vmovdqa32 1856(%rsi), %zmm28 {%k2} {z}
; AVX512BW-NEXT:    vmovdqa32 1792(%rsi), %zmm29 {%k1} {z}
; AVX512BW-NEXT:    kshiftrq $32, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 1920(%rsi), %zmm30 {%k1} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 1984(%rsi), %zmm31 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa64 %zmm31, 1984(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm30, 1920(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm28, 1856(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm29, 1792(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm27, 1728(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm26, 1664(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm25, 1600(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm7, 1536(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm24, 1472(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm23, 1408(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm22, 1344(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm12, 1280(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm21, 1216(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm20, 1152(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm19, 1088(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm16, 1024(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm18, 960(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm17, 896(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm14, 832(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm15, 768(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm13, 704(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm11, 640(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm9, 576(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm10, 512(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm8, 448(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm6, 384(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask = load <64 x i1>, ptr %in.maskvec, align 64
  %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
  %data = call <512 x i32> @llvm.masked.load.v512i32.p0(ptr %in.vec, i32 64, <512 x i1> %tgt.mask, <512 x i32> poison)
  store <512 x i32> %data, ptr %out.vec, align 64
  ret void
}

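; Masked-load intrinsic declarations for every vector width exercised by the
; tests above.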
declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)
declare <6 x i32> @llvm.masked.load.v6i32.p0(ptr, i32, <6 x i1>, <6 x i32>)
declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32, <8 x i1>, <8 x i32>)
declare <10 x i32> @llvm.masked.load.v10i32.p0(ptr, i32, <10 x i1>, <10 x i32>)
declare <12 x i32> @llvm.masked.load.v12i32.p0(ptr, i32, <12 x i1>, <12 x i32>)
declare <14 x i32> @llvm.masked.load.v14i32.p0(ptr, i32, <14 x i1>, <14 x i32>)
declare <16 x i32> @llvm.masked.load.v16i32.p0(ptr, i32, <16 x i1>, <16 x i32>)
declare <20 x i32> @llvm.masked.load.v20i32.p0(ptr, i32, <20 x i1>, <20 x i32>)
declare <24 x i32> @llvm.masked.load.v24i32.p0(ptr, i32, <24 x i1>, <24 x i32>)
declare <28 x i32> @llvm.masked.load.v28i32.p0(ptr, i32, <28 x i1>, <28 x i32>)
declare <32 x i32> @llvm.masked.load.v32i32.p0(ptr, i32, <32 x i1>, <32 x i32>)
declare <40 x i32> @llvm.masked.load.v40i32.p0(ptr, i32, <40 x i1>, <40 x i32>)
declare <48 x i32> @llvm.masked.load.v48i32.p0(ptr, i32, <48 x i1>, <48 x i32>)
declare <56 x i32> @llvm.masked.load.v56i32.p0(ptr, i32, <56 x i1>, <56 x i32>)
declare <64 x i32> @llvm.masked.load.v64i32.p0(ptr, i32, <64 x i1>, <64 x i32>)
declare <80 x i32> @llvm.masked.load.v80i32.p0(ptr, i32, <80 x i1>, <80 x i32>)
declare <96 x i32> @llvm.masked.load.v96i32.p0(ptr, i32, <96 x i1>, <96 x i32>)
declare <112 x i32> @llvm.masked.load.v112i32.p0(ptr, i32, <112 x i1>, <112 x i32>)
declare <128 x i32> @llvm.masked.load.v128i32.p0(ptr, i32, <128 x i1>, <128 x i32>)
declare <160 x i32> @llvm.masked.load.v160i32.p0(ptr, i32, <160 x i1>, <160 x i32>)
declare <192 x i32> @llvm.masked.load.v192i32.p0(ptr, i32, <192 x i1>, <192 x i32>)
declare <224 x i32> @llvm.masked.load.v224i32.p0(ptr, i32, <224 x i1>, <224 x i32>)
declare <256 x i32> @llvm.masked.load.v256i32.p0(ptr, i32, <256 x i1>, <256 x i32>)
declare <320 x i32> @llvm.masked.load.v320i32.p0(ptr, i32, <320 x i1>, <320 x i32>)
declare <384 x i32> @llvm.masked.load.v384i32.p0(ptr, i32, <384 x i1>, <384 x i32>)
declare <448 x i32> @llvm.masked.load.v448i32.p0(ptr, i32, <448 x i1>, <448 x i32>)
declare <512 x i32> @llvm.masked.load.v512i32.p0(ptr, i32, <512 x i1>, <512 x i32>)
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX512: {{.*}}
; FALLBACK0: {{.*}}
; FALLBACK1: {{.*}}
; FALLBACK2: {{.*}}
; FALLBACK3: {{.*}}
; FALLBACK4: {{.*}}
; FALLBACK5: {{.*}}
; FALLBACK6: {{.*}}
; FALLBACK7: {{.*}}