; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512F-ONLY,AVX512F-SLOW,FALLBACK0
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512F-ONLY,AVX512F-FAST,FALLBACK1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ,AVX512DQ-SLOW,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512DQ,AVX512DQ-FAST,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512BW-ONLY,AVX512BW-SLOW,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512BW-ONLY,AVX512BW-FAST,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512VBMI-ONLY,AVX512VBMI-SLOW,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512vbmi,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512VBMI-ONLY,AVX512VBMI-FAST,FALLBACK7
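
; These tests replicate each bit of an i1 mask vector by a constant factor and
; use the widened mask to predicate a masked load whose result is stored back
; out. As an illustration, replicating <2 x i1> <a, b> by factor 2 yields
; <4 x i1> <a, a, b, b>; the shufflevector index vectors below encode exactly
; that repetition. The %data.padded shufflevectors appear to be dead padding
; emitted by the test generator and are never stored.

; Factor 2, VF 2: <2 x i1> -> <4 x i1> guarding a <4 x i32> masked load.
; AVX512F/BW expand the k-register into a vector, double each bit at dword
; granularity with vpmovsxdq, and retest with vptestmd; AVX512DQ instead moves
; between mask and vector registers directly with vpmovm2d/vpmovd2m.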
define void @mask_replication_factor2_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor2_vf2:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512F-ONLY-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX512F-ONLY-NEXT: vptestmd %xmm0, %xmm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %xmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor2_vf2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %xmm0
; AVX512DQ-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX512DQ-NEXT: vpmovd2m %xmm0, %k1
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %xmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor2_vf2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovq (%rdi), %k1
; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX512BW-NEXT: vptestmd %xmm0, %xmm0, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %xmm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
%tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
%data = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %in.vec, i32 64, <4 x i1> %tgt.mask, <4 x i32> poison)
%data.padded = shufflevector <4 x i32> %data, <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
store <4 x i32> %data, ptr %out.vec, align 64
ret void
}
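
; Factor 2, VF 4: <4 x i1> -> <8 x i1> guarding an <8 x i32> masked load; the
; same shape as VF 2 but at ymm width, so the epilogue needs vzeroupper.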
define void @mask_replication_factor2_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor2_vf4:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-ONLY-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX512F-ONLY-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor2_vf4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovb (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0
; AVX512DQ-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX512DQ-NEXT: vpmovd2m %ymm0, %k1
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor2_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512BW-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX512BW-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
%data = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %in.vec, i32 64, <8 x i1> %tgt.mask, <8 x i32> poison)
%data.padded = shufflevector <8 x i32> %data, <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
store <8 x i32> %data, ptr %out.vec, align 64
ret void
}
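
; Factor 2, VF 8: <8 x i1> -> <16 x i1>, a full zmm of i32. At this width the
; replication is a single vpermd through the [0,0,1,1,...] index vector rather
; than a sign-extension.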
define void @mask_replication_factor2_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor2_vf8:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor2_vf8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovb (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor2_vf8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
%data = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr %in.vec, i32 64, <16 x i1> %tgt.mask, <16 x i32> poison)
store <16 x i32> %data, ptr %out.vec, align 64
ret void
}
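
; Factor 2, VF 16: <16 x i1> -> <32 x i1>, two zmm stores. AVX512F/DQ permute
; the low and high halves through separate index vectors; AVX512BW does the
; whole replication as one vpermw over 32 x i16 and then splits the 32-bit
; k-register with kshiftrd.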
define void @mask_replication_factor2_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor2_vf16:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor2_vf16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor2_vf16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpmovw2m %zmm0, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
%data = call <32 x i32> @llvm.masked.load.v32i32.p0(ptr %in.vec, i32 64, <32 x i1> %tgt.mask, <32 x i32> poison)
store <32 x i32> %data, ptr %out.vec, align 64
ret void
}
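
; Factor 2, VF 32: <32 x i1> -> <64 x i1>, four zmm stores. AVX512BW performs
; the byte-level replication with vshufi64x2+vpshufb and carves the resulting
; 64-bit mask up with kshiftrq/kshiftrd; with VBMI a single vpermb replaces
; the shuffle pair.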
define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor2_vf32:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k2
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k2
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor2_vf32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: kmovw 2(%rdi), %k1
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3
; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z}
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-ONLY-LABEL: mask_replication_factor2_vf32:
; AVX512BW-ONLY: # %bb.0:
; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0
; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3]
; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-ONLY-NEXT: vzeroupper
; AVX512BW-ONLY-NEXT: retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor2_vf32:
; AVX512VBMI-ONLY: # %bb.0:
; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0
; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512VBMI-ONLY-NEXT: vzeroupper
; AVX512VBMI-ONLY-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
%data = call <64 x i32> @llvm.masked.load.v64i32.p0(ptr %in.vec, i32 64, <64 x i1> %tgt.mask, <64 x i32> poison)
store <64 x i32> %data, ptr %out.vec, align 64
ret void
}
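
; Factor 2, VF 64: the full <64 x i1> input replicated to <128 x i1>, eight
; zmm stores. The AVX512F/DQ lowerings need all eight mask registers and spill
; one mask to the stack; AVX512BW/VBMI produce two 64-bit masks and split them
; with kshifts.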
define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor2_vf64:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k3
; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k5
; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k4
; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm3
; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k4
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k6
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k5} {z}
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm3
; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k5
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k7
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z}
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k7} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k6} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k4} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512F-ONLY-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor2_vf64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: kmovw 2(%rdi), %k5
; AVX512DQ-NEXT: kmovw 4(%rdi), %k3
; AVX512DQ-NEXT: kmovw 6(%rdi), %k1
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm3
; AVX512DQ-NEXT: vpmovd2m %zmm3, %k3
; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4
; AVX512DQ-NEXT: vpmovm2d %k5, %zmm0
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm3
; AVX512DQ-NEXT: vpmovd2m %zmm3, %k5
; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k6
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k7
; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z}
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z}
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-ONLY-LABEL: mask_replication_factor2_vf64:
; AVX512BW-ONLY: # %bb.0:
; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0
; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,6,7,6,7]
; AVX512BW-ONLY-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512BW-ONLY-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3]
; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-ONLY-NEXT: vzeroupper
; AVX512BW-ONLY-NEXT: retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor2_vf64:
; AVX512VBMI-ONLY: # %bb.0:
; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0
; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512VBMI-ONLY-NEXT: vzeroupper
; AVX512VBMI-ONLY-NEXT: retq
%src.mask = load <64 x i1>, ptr %in.maskvec, align 64
%tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
%data = call <128 x i32> @llvm.masked.load.v128i32.p0(ptr %in.vec, i32 64, <128 x i1> %tgt.mask, <128 x i32> poison)
store <128 x i32> %data, ptr %out.vec, align 64
ret void
}
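
; Factor 3, VF 2: <2 x i1> -> <6 x i1>. The 6-element mask is padded to 8
; lanes, so the vptestmd/vpcmpgtd result is clamped with a $63 (0b111111)
; constant mask, and the 24-byte store is split into xmm + vmovq halves.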
define void @mask_replication_factor3_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor3_vf2:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-ONLY-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,1,0,0]
; AVX512F-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512F-ONLY-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512F-ONLY-NEXT: movb $63, %al
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vptestmd %ymm0, %ymm0, %k1 {%k1}
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-ONLY-NEXT: vmovq %xmm1, 16(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor3_vf2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovb (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,1,0,0]
; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: movb $63, %al
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 {%k1}
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vmovq %xmm1, 16(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor3_vf2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,1,0,0]
; AVX512BW-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512BW-NEXT: movb $63, %al
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vptestmd %ymm0, %ymm0, %k1 {%k1}
; AVX512BW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovq %xmm1, 16(%rdx)
; AVX512BW-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
%tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
%data = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr %in.vec, i32 64, <6 x i1> %tgt.mask, <6 x i32> poison)
%data.padded = shufflevector <6 x i32> %data, <6 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
store <6 x i32> %data, ptr %out.vec, align 64
ret void
}
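
; Factor 3, VF 4: <4 x i1> -> <12 x i1>, padded to 16 lanes and clamped with a
; $4095 (12-bit) constant mask; the 48-byte store is split into ymm + xmm.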
define void @mask_replication_factor3_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor3_vf4:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,0,0,0,0]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-ONLY-NEXT: movw $4095, %ax # imm = 0xFFF
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor3_vf4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,0,0,0,0]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: movw $4095, %ax # imm = 0xFFF
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 {%k1}
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx)
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor3_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,0,0,0,0]
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512BW-NEXT: movw $4095, %ax # imm = 0xFFF
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx)
; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
%data = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %in.vec, i32 64, <12 x i1> %tgt.mask, <12 x i32> poison)
%data.padded = shufflevector <12 x i32> %data, <12 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
store <12 x i32> %data, ptr %out.vec, align 64
ret void
}
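
; Factor 3, VF 8: <8 x i1> -> <24 x i1> across one and a half zmm registers.
; In the AVX512F/DQ paths, lane 0 of the first permute is re-blended from the
; original mask vector via the movw $1 constant mask before the final retest;
; AVX512BW does one 32-lane vpermw and clamps to 24 bits with $16777215.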
define void @mask_replication_factor3_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor3_vf8:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; AVX512F-ONLY-NEXT: movw $1, %ax
; AVX512F-ONLY-NEXT: kmovw %eax, %k2
; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2}
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-ONLY-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,5,6,6,6,7,7,7]
; AVX512F-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512F-ONLY-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa %ymm1, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor3_vf8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovb (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1
; AVX512DQ-NEXT: movw $1, %ax
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,5,6,6,6,7,7,7]
; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vpmovd2m %ymm0, %k2
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa %ymm1, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor3_vf8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: movl $16777215, %eax # imm = 0xFFFFFF
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vpcmpgtw %zmm0, %zmm1, %k1 {%k1}
; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-NEXT: vmovdqa %ymm0, 64(%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
%data = call <24 x i32> @llvm.masked.load.v24i32.p0(ptr %in.vec, i32 64, <24 x i1> %tgt.mask, <24 x i32> poison)
%data.padded = shufflevector <24 x i32> %data, <24 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
store <24 x i32> %data, ptr %out.vec, align 64
ret void
}
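
; Factor 3, VF 16: <16 x i1> -> <48 x i1>, three full zmm stores, using three
; vpermd index vectors that each cover 16 of the 48 replicated lanes.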
define void @mask_replication_factor3_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor3_vf16:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: movw $1, %ax
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor3_vf16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: movw $1, %ax
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z}
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor3_vf16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2
; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
%data = call <48 x i32> @llvm.masked.load.v48i32.p0(ptr %in.vec, i32 64, <48 x i1> %tgt.mask, <48 x i32> poison)
store <48 x i32> %data, ptr %out.vec, align 64
ret void
}
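
; Factor 3, VF 32: <32 x i1> -> <96 x i1>, six zmm stores. AVX512F/DQ reuse
; the three factor-3 index vectors on each 16-bit half of the mask; AVX512BW
; has no 96-lane shuffle, so it assembles each 16-bit mask chunk bit by bit
; with long kshift/kand/kor chains, spilling constant masks to the stack along
; the way.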
define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor3_vf32:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k2
; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k2
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; AVX512F-ONLY-NEXT: movw $1, %ax
; AVX512F-ONLY-NEXT: kmovw %eax, %k2
; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k2}
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k3
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm3
; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k2
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k6
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k6} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k5} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k4} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 256(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 320(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor3_vf32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k1
; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2
; AVX512DQ-NEXT: movw $1, %ax
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k2
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm3
; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4
; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5
; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k6
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k6} {z}
; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k5} {z}
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z}
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k3} {z}
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor3_vf32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd (%rdi), %k0
; AVX512BW-NEXT: kshiftrd $1, %k0, %k1
; AVX512BW-NEXT: movw $-3, %ax
; AVX512BW-NEXT: kmovd %eax, %k4
; AVX512BW-NEXT: kmovw (%rdi), %k2
; AVX512BW-NEXT: kandw %k4, %k2, %k3
; AVX512BW-NEXT: kmovq %k4, %k7
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $14, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: movw $-5, %ax
; AVX512BW-NEXT: kmovd %eax, %k4
; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $13, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k3, %k2
; AVX512BW-NEXT: movw $-9, %ax
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-17, %ax
; AVX512BW-NEXT: kmovd %eax, %k5
; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $11, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-33, %ax
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: movw $-65, %ax
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $2, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k2, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: movw $-129, %ax
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k2, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $3, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $6, %k2, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k2, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $4, %k0, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k2
; AVX512BW-NEXT: kshiftrw $3, %k2, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF
; AVX512BW-NEXT: kmovd %eax, %k6
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $2, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrd $5, %k0, %k2
; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $27, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k4
; AVX512BW-NEXT: kshiftrd $26, %k0, %k1
; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovq %k7, %k2
; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $14, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrd $28, %k0, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $11, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrd $29, %k0, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $8, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrd $30, %k0, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $5, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrd $31, %k0, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k7
; AVX512BW-NEXT: kshiftrw $2, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $21, %k0, %k1
; AVX512BW-NEXT: kandw %k2, %k1, %k6
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k6, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $22, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $13, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrd $23, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $10, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $24, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $25, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $16, %k0, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k2
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $13, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrd $17, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $12, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $18, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $19, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $6, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $20, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $11, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k2
; AVX512BW-NEXT: kshiftrd $10, %k0, %k4
; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $14, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $13, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $12, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k4, %k2
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrd $12, %k0, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $11, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $10, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrd $13, %k0, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $8, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrw $7, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrw $6, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrd $14, %k0, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $5, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $4, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrw $3, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrd $15, %k0, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k6
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftlw $14, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k2} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kshiftrw $14, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrd $6, %k0, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $13, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $12, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $11, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrd $7, %k0, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $10, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $8, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrd $8, %k0, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $7, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrw $6, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrw $5, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrd $9, %k0, %k0
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k0, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrw $3, %k0, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k2, %k0
; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k1, %k2
; AVX512BW-NEXT: korw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm2, 256(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm1, 320(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
%data = call <96 x i32> @llvm.masked.load.v96i32.p0(ptr %in.vec, i32 64, <96 x i1> %tgt.mask, <96 x i32> poison)
store <96 x i32> %data, ptr %out.vec, align 64
ret void
}
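; Replication factor 3 at vf64: each of the 64 source mask bits is repeated
; three times into a <192 x i1> mask that gates a 192 x i32 masked load (see
; the shufflevector in the IR body below). The AVX512F and AVX512DQ lowerings
; expand the mask in vector registers with vpermd and the replicated index
; patterns [0,0,0,1,1,1,...], [5,5,6,6,6,...], [10,11,11,11,...], while the
; AVX512BW lowering stays in mask registers throughout.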
define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor3_vf64:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT: movw $1, %ax
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm6
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm0
; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm8
; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm9
; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm3
; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm1, %zmm10
; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm2, %zmm11
; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm4
; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm2, %zmm2
; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm5
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm11 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm10 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm9 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 256(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 320(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 384(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 448(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 512(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 576(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 640(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 704(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor3_vf64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2
; AVX512DQ-NEXT: movw $1, %ax
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3
; AVX512DQ-NEXT: kmovw 4(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4
; AVX512DQ-NEXT: kmovw 6(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm6
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm0
; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm8
; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm9
; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm3
; AVX512DQ-NEXT: vpermd %zmm4, %zmm1, %zmm10
; AVX512DQ-NEXT: vpermd %zmm4, %zmm2, %zmm11
; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm4
; AVX512DQ-NEXT: vpermd %zmm5, %zmm1, %zmm1
; AVX512DQ-NEXT: vpermd %zmm5, %zmm2, %zmm2
; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm5
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1
; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1
; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1
; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm11 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1
; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm10 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1
; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm9 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm6 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm9, 256(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 320(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm10, 384(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm11, 448(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 512(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 576(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 640(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 704(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
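; The AVX512BW sequence below assembles each 16-bit output mask one source bit
; at a time: kshiftrq extracts a bit of the 64-bit input mask, kshiftlw and
; kshiftrw place its three copies, kandw with a spilled single-bit-clear
; constant (-3, -5, -9, ...) zeroes the destination position, and korw merges
; the copy in before the accumulated mask feeds a zero-masked vmovdqa32 load.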
; AVX512BW-LABEL: mask_replication_factor3_vf64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovq (%rdi), %k0
; AVX512BW-NEXT: kshiftrq $1, %k0, %k1
; AVX512BW-NEXT: movw $-3, %ax
; AVX512BW-NEXT: kmovd %eax, %k4
; AVX512BW-NEXT: kmovw (%rdi), %k2
; AVX512BW-NEXT: kandw %k4, %k2, %k3
; AVX512BW-NEXT: kmovq %k4, %k7
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $14, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: movw $-5, %ax
; AVX512BW-NEXT: kmovd %eax, %k4
; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $13, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k3, %k2
; AVX512BW-NEXT: movw $-9, %ax
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-17, %ax
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrw $11, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-33, %ax
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: movw $-65, %ax
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrq $2, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k2, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: movw $-129, %ax
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kmovq %k3, %k5
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $8, %k2, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrq $3, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $6, %k2, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k2, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrq $4, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k3
; AVX512BW-NEXT: kshiftrw $3, %k3, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF
; AVX512BW-NEXT: kmovd %eax, %k6
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $2, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrq $5, %k0, %k2
; AVX512BW-NEXT: kmovq %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $59, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k2
; AVX512BW-NEXT: kshiftrq $58, %k0, %k1
; AVX512BW-NEXT: kmovq %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512BW-NEXT: kmovq %k7, %k3
; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $14, %k2, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k2, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrq $60, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $11, %k2, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k2, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrq $61, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $8, %k2, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k2, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrq $62, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $5, %k2, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k2, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrq $63, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k7
; AVX512BW-NEXT: kshiftrw $2, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $53, %k0, %k1
; AVX512BW-NEXT: kandw %k3, %k1, %k6
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k6, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrq $54, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $13, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrq $55, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $10, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrq $56, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrq $57, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 8-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $48, %k0, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k3
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k3, %k3
; AVX512BW-NEXT: kshiftrw $13, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k3, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrq $49, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrq $50, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $9, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrq $51, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $6, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrq $52, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $43, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k2
; AVX512BW-NEXT: kshiftrq $42, %k0, %k1
; AVX512BW-NEXT: kmovq %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k3
; AVX512BW-NEXT: kshiftrw $14, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $13, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k3, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrq $44, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $11, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $10, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrq $45, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $8, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $7, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $6, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrq $46, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $5, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrw $4, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $3, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrq $47, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k6
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k2} {z}
; AVX512BW-NEXT: kshiftrq $37, %k0, %k2
; AVX512BW-NEXT: kandw %k4, %k2, %k3
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrq $38, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $13, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $11, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrq $39, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $10, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $9, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $8, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrq $40, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $5, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kandw %k5, %k3, %k3
; AVX512BW-NEXT: kshiftrq $41, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k3, %k3
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 8-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kshiftlw $1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $1, %k3, %k3
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k3, %k1
; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $32, %k0, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k3
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k3, %k3
; AVX512BW-NEXT: kshiftrw $13, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k3, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrq $33, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrq $34, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $9, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrq $35, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $6, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrq $36, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $27, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k2
; AVX512BW-NEXT: kshiftrq $26, %k0, %k3
; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $14, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $13, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k3, %k2
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrq $28, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $11, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $10, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrq $29, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $8, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $7, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $6, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrq $30, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $5, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $4, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $3, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrq $31, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k6
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k2, %k2
; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k2} {z}
; AVX512BW-NEXT: kshiftrq $21, %k0, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k2, %k3
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrq $22, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $13, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $11, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrq $23, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $10, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $9, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $8, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrq $24, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k3, %k3
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $5, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrq $25, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 8-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kshiftlw $1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $1, %k3, %k3
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k3, %k1
; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm8 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $16, %k0, %k1
; AVX512BW-NEXT: kandw %k5, %k1, %k3
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k3, %k3
; AVX512BW-NEXT: kshiftrw $13, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k3, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrq $17, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrq $18, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $9, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrq $19, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $6, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrq $20, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $11, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k2
; AVX512BW-NEXT: kshiftrq $10, %k0, %k3
; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $14, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $13, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k3, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrq $12, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $11, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $10, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrq $13, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $8, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $7, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrw $6, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrq $14, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $5, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $4, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $3, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrq $15, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k6
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k2} {z}
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kshiftrw $14, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrq $6, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $13, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $12, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $11, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrq $7, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $10, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $8, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrq $8, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $7, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $6, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrw $5, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrq $9, %k0, %k0
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k0, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $3, %k0, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k2, %k0
; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 8-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k1, %k2
; AVX512BW-NEXT: korw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm11 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm7, 320(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm4, 512(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm3, 576(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm2, 640(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm1, 704(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask = load <64 x i1>, ptr %in.maskvec, align 64
%tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
%data = call <192 x i32> @llvm.masked.load.v192i32.p0(ptr %in.vec, i32 64, <192 x i1> %tgt.mask, <192 x i32> poison)
store <192 x i32> %data, ptr %out.vec, align 64
ret void
}
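; Replication factor 4 starts here: for vf2, the two source mask bits are each
; repeated four times into an <8 x i1> mask gating an 8 x i32 masked load, so
; a single in-lane widening (vpmovsxdq plus vpermq, or one vpermd) suffices.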
define void @mask_replication_factor4_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-SLOW-LABEL: mask_replication_factor4_vf2:
; AVX512F-SLOW: # %bb.0:
; AVX512F-SLOW-NEXT: kmovw (%rdi), %k1
; AVX512F-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512F-SLOW-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX512F-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512F-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512F-SLOW-NEXT: vzeroupper
; AVX512F-SLOW-NEXT: retq
;
; AVX512F-FAST-LABEL: mask_replication_factor4_vf2:
; AVX512F-FAST: # %bb.0:
; AVX512F-FAST-NEXT: kmovw (%rdi), %k1
; AVX512F-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512F-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512F-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512F-FAST-NEXT: vzeroupper
; AVX512F-FAST-NEXT: retq
;
; AVX512DQ-SLOW-LABEL: mask_replication_factor4_vf2:
; AVX512DQ-SLOW: # %bb.0:
; AVX512DQ-SLOW-NEXT: kmovb (%rdi), %k0
; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %ymm0
; AVX512DQ-SLOW-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX512DQ-SLOW-NEXT: vpmovd2m %ymm0, %k1
; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-SLOW-NEXT: vzeroupper
; AVX512DQ-SLOW-NEXT: retq
;
; AVX512DQ-FAST-LABEL: mask_replication_factor4_vf2:
; AVX512DQ-FAST: # %bb.0:
; AVX512DQ-FAST-NEXT: kmovb (%rdi), %k0
; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %ymm0
; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512DQ-FAST-NEXT: vpmovd2m %ymm0, %k1
; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-FAST-NEXT: vzeroupper
; AVX512DQ-FAST-NEXT: retq
;
; AVX512BW-SLOW-LABEL: mask_replication_factor4_vf2:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: kmovw (%rdi), %k1
; AVX512BW-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512BW-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512BW-SLOW-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX512BW-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512BW-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512BW-SLOW-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512BW-SLOW-NEXT: vzeroupper
; AVX512BW-SLOW-NEXT: retq
;
; AVX512BW-FAST-LABEL: mask_replication_factor4_vf2:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: kmovw (%rdi), %k1
; AVX512BW-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512BW-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
; AVX512BW-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512BW-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512BW-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512BW-FAST-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512BW-FAST-NEXT: vzeroupper
; AVX512BW-FAST-NEXT: retq
;
; AVX512VBMI-SLOW-LABEL: mask_replication_factor4_vf2:
; AVX512VBMI-SLOW: # %bb.0:
; AVX512VBMI-SLOW-NEXT: kmovw (%rdi), %k1
; AVX512VBMI-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VBMI-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VBMI-SLOW-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX512VBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX512VBMI-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512VBMI-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512VBMI-SLOW-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512VBMI-SLOW-NEXT: vzeroupper
; AVX512VBMI-SLOW-NEXT: retq
;
; AVX512VBMI-FAST-LABEL: mask_replication_factor4_vf2:
; AVX512VBMI-FAST: # %bb.0:
; AVX512VBMI-FAST-NEXT: kmovw (%rdi), %k1
; AVX512VBMI-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VBMI-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VBMI-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
; AVX512VBMI-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512VBMI-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512VBMI-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512VBMI-FAST-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512VBMI-FAST-NEXT: vzeroupper
; AVX512VBMI-FAST-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
%tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
%data = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %in.vec, i32 64, <8 x i1> %tgt.mask, <8 x i32> poison)
%data.padded = shufflevector <8 x i32> %data, <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
store <8 x i32> %data, ptr %out.vec, align 64
ret void
}
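; Factor 4, vf 4: 4 mask bits widen to <16 x i1> with one in-register vpermd ([0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]), feeding a single masked <16 x i32> load.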
define void @mask_replication_factor4_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor4_vf4:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor4_vf4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor4_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
%data = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr %in.vec, i32 64, <16 x i1> %tgt.mask, <16 x i32> poison)
store <16 x i32> %data, ptr %out.vec, align 64
ret void
}
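; Factor 4, vf 8: 8 mask bits -> <32 x i1>. AVX512F/DQ split the replication into two 16-wide vpermd shuffles; AVX512BW does it in one <32 x i16> vpermw and extracts the upper half of the k-register with kshiftrd.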
define void @mask_replication_factor4_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor4_vf8:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor4_vf8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovb (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor4_vf8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpmovw2m %zmm0, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
%data = call <32 x i32> @llvm.masked.load.v32i32.p0(ptr %in.vec, i32 64, <32 x i1> %tgt.mask, <32 x i32> poison)
store <32 x i32> %data, ptr %out.vec, align 64
ret void
}
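; Factor 4, vf 16: 16 mask bits -> <64 x i1>, i.e. four <16 x i32> masked loads. AVX512BW replicates at byte granularity with vpshufb on a broadcast of the mask vector; AVX512VBMI collapses that into a single vpermb.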
define void @mask_replication_factor4_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor4_vf16:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor4_vf16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z}
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf16:
; AVX512BW-ONLY: # %bb.0:
; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0
; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63]
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-ONLY-NEXT: vzeroupper
; AVX512BW-ONLY-NEXT: retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf16:
; AVX512VBMI-ONLY: # %bb.0:
; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0
; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512VBMI-ONLY-NEXT: vzeroupper
; AVX512VBMI-ONLY-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
%data = call <64 x i32> @llvm.masked.load.v64i32.p0(ptr %in.vec, i32 64, <64 x i1> %tgt.mask, <64 x i32> poison)
store <64 x i32> %data, ptr %out.vec, align 64
ret void
}
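; Factor 4, vf 32: 32 mask bits -> <128 x i1> (eight zmm stores). Note the AVX512F/DQ lowerings need more than the seven usable k-registers here and spill/reload one of them via the stack.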
define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor4_vf32:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k4
; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm3
; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k2
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4
; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k3
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm4, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k5
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k6
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k7
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm4, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k5} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512F-ONLY-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor4_vf32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: kmovw 2(%rdi), %k1
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm3
; AVX512DQ-NEXT: vpmovd2m %zmm3, %k2
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4
; AVX512DQ-NEXT: vpmovd2m %zmm4, %k3
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5
; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k6
; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k7
; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z}
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z}
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf32:
; AVX512BW-ONLY: # %bb.0:
; AVX512BW-ONLY-NEXT: kmovd (%rdi), %k0
; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-ONLY-NEXT: vzeroupper
; AVX512BW-ONLY-NEXT: retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf32:
; AVX512VBMI-ONLY: # %bb.0:
; AVX512VBMI-ONLY-NEXT: kmovd (%rdi), %k0
; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,17,17,17,17,18,18,18,18,19,19,19,19,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,24,24,24,24,25,25,25,25,26,26,26,26,27,27,27,27,28,28,28,28,29,29,29,29,30,30,30,30,31,31,31,31]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512VBMI-ONLY-NEXT: vzeroupper
; AVX512VBMI-ONLY-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
%data = call <128 x i32> @llvm.masked.load.v128i32.p0(ptr %in.vec, i32 64, <128 x i1> %tgt.mask, <128 x i32> poison)
store <128 x i32> %data, ptr %out.vec, align 64
ret void
}
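; Factor 4, vf 64: the full 64-bit mask -> <256 x i1> (sixteen zmm stores). AVX512F/DQ expand each 16-bit chunk of the mask to a zmm and apply the four dword replication patterns; AVX512BW/VBMI instead replicate 16 bytes at a time from a single <64 x i8> mask vector.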
define void @mask_replication_factor4_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor4_vf64:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm0
; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm3, %zmm10
; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm11
; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm7, %zmm12
; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm9, %zmm1
; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm3, %zmm13
; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm5, %zmm14
; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm7, %zmm15
; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm9, %zmm2
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm16, %zmm16, %zmm16 {%k1} {z}
; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm3, %zmm3
; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm5, %zmm5
; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm7, %zmm7
; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm9, %zmm9
; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm9 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm15 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm14 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm13 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm12 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm4 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 960(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 896(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 832(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 768(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 704(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 640(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 576(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 512(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 448(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 384(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 320(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 256(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor4_vf64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw 6(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: kmovw 4(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm0
; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm10
; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm11
; AVX512DQ-NEXT: vpermd %zmm1, %zmm7, %zmm12
; AVX512DQ-NEXT: vpermd %zmm1, %zmm9, %zmm1
; AVX512DQ-NEXT: vpermd %zmm2, %zmm3, %zmm13
; AVX512DQ-NEXT: vpermd %zmm2, %zmm5, %zmm14
; AVX512DQ-NEXT: vpermd %zmm2, %zmm7, %zmm15
; AVX512DQ-NEXT: vpermd %zmm2, %zmm9, %zmm2
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm16
; AVX512DQ-NEXT: vpermd %zmm16, %zmm3, %zmm3
; AVX512DQ-NEXT: vpermd %zmm16, %zmm5, %zmm5
; AVX512DQ-NEXT: vpermd %zmm16, %zmm7, %zmm7
; AVX512DQ-NEXT: vpermd %zmm16, %zmm9, %zmm9
; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm9 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1
; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm15 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1
; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm14 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1
; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm13 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1
; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm12 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1
; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1
; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1
; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm8 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1
; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm6 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1
; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm4 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 960(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm6, 896(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm8, 832(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 768(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm10, 704(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm11, 640(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm12, 576(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 512(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm13, 448(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm14, 384(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm15, 320(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf64:
; AVX512BW-ONLY: # %bb.0:
; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0
; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[6,7,6,7,6,7,6,7]
; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5]
; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k2
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k3
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k4
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k5
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $32, %k4, %k4
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k5
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z}
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4
; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $32, %k3, %k3
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4
; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z}
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
; AVX512BW-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
; AVX512BW-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k3} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm15, 896(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm14, 960(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm13, 768(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm12, 832(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm11, 640(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm10, 704(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm9, 512(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm8, 576(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-ONLY-NEXT: vzeroupper
; AVX512BW-ONLY-NEXT: retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf64:
; AVX512VBMI-ONLY: # %bb.0:
; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0
; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [48,48,48,48,49,49,49,49,50,50,50,50,51,51,51,51,52,52,52,52,53,53,53,53,54,54,54,54,55,55,55,55,56,56,56,56,57,57,57,57,58,58,58,58,59,59,59,59,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,32,32,32,33,33,33,33,34,34,34,34,35,35,35,35,36,36,36,36,37,37,37,37,38,38,38,38,39,39,39,39,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,44,44,44,44,45,45,45,45,46,46,46,46,47,47,47,47]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,17,17,17,17,18,18,18,18,19,19,19,19,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,24,24,24,24,25,25,25,25,26,26,26,26,27,27,27,27,28,28,28,28,29,29,29,29,30,30,30,30,31,31,31,31]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k3
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k4
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k5
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k4, %k4
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k5
; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4
; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k3, %k3
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4
; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
; AVX512VBMI-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
; AVX512VBMI-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k3} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm15, 896(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm14, 960(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm13, 768(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm12, 832(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm11, 640(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm10, 704(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm9, 512(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm8, 576(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512VBMI-ONLY-NEXT: vzeroupper
; AVX512VBMI-ONLY-NEXT: retq
%src.mask = load <64 x i1>, ptr %in.maskvec, align 64
%tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
%data = call <256 x i32> @llvm.masked.load.v256i32.p0(ptr %in.vec, i32 64, <256 x i1> %tgt.mask, <256 x i32> poison)
store <256 x i32> %data, ptr %out.vec, align 64
ret void
}
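; Factor 5, vf 2: the 2 mask bits are repeated 5x into a <10 x i1> mask. The 10 live lanes are selected out of a 16-wide test with a 0x3FF (1023) mask constant, and the odd-sized result is stored as a ymm plus a vmovq of the 2-element tail.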
define void @mask_replication_factor5_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor5_vf2:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-ONLY-NEXT: movw $1023, %ax # imm = 0x3FF
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; AVX512F-ONLY-NEXT: vmovq %xmm1, 32(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor5_vf2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: movw $1023, %ax # imm = 0x3FF
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 {%k1}
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; AVX512DQ-NEXT: vmovq %xmm1, 32(%rdx)
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor5_vf2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0]
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512BW-NEXT: movw $1023, %ax # imm = 0x3FF
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; AVX512BW-NEXT: vmovq %xmm1, 32(%rdx)
; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
%tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
%data = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr %in.vec, i32 64, <10 x i1> %tgt.mask, <10 x i32> poison)
%data.padded = shufflevector <10 x i32> %data, <10 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
store <10 x i32> %data, ptr %out.vec, align 64
ret void
}
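; Factor 5, vf 4: 4 mask bits -> <20 x i1>, lowered as one full <16 x i32> masked load plus a 4-lane remainder (movw $15 mask). AVX512BW performs the whole replication in <32 x i16> with a single vpermw under a 20-bit (0xFFFFF) mask.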
define void @mask_replication_factor5_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor5_vf4:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX512F-ONLY-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: movw $15, %ax
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512F-ONLY-NEXT: vmovdqa %xmm0, 64(%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor5_vf4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT: movw $15, %ax
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm0, 64(%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor5_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd (%rdi), %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: movl $1048575, %eax # imm = 0xFFFFF
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vpcmpgtw %zmm0, %zmm1, %k1 {%k1}
; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-NEXT: vmovdqa %xmm0, 64(%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
%data = call <20 x i32> @llvm.masked.load.v20i32.p0(ptr %in.vec, i32 64, <20 x i1> %tgt.mask, <20 x i32> poison)
%data.padded = shufflevector <20 x i32> %data, <20 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
store <20 x i32> %data, ptr %out.vec, align 64
ret void
}
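; Factor 5, vf 8: 8 mask bits -> <40 x i1>. The dword lowering takes three 16-wide permutes plus an extra single-lane (movw $1) merge fixup; AVX512BW/VBMI build all 40 byte lanes at once and compare under a 40-bit (0xFFFFFFFFFF) mask.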
define void @mask_replication_factor5_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor5_vf8:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; AVX512F-ONLY-NEXT: movw $1, %ax
; AVX512F-ONLY-NEXT: kmovw %eax, %k2
; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2}
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3
; AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-ONLY-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,6,6,7,7,7,7,7]
; AVX512F-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512F-ONLY-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k3} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa %ymm1, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor5_vf8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovb (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1
; AVX512DQ-NEXT: movw $1, %ax
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,6,6,7,7,7,7,7]
; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vpmovd2m %ymm0, %k3
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z}
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa %ymm1, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-ONLY-LABEL: mask_replication_factor5_vf8:
; AVX512BW-ONLY: # %bb.0:
; AVX512BW-ONLY-NEXT: kmovw (%rdi), %k0
; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,19,19,19,19,20,20,20,20,20,21,21,21,21,21,22,22,38,38,38,39,39,39,39,39,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-ONLY-NEXT: movabsq $1099511627775, %rax # imm = 0xFFFFFFFFFF
; AVX512BW-ONLY-NEXT: kmovq %rax, %k1
; AVX512BW-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k2} {z}
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa %ymm1, 128(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-ONLY-NEXT: vzeroupper
; AVX512BW-ONLY-NEXT: retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor5_vf8:
; AVX512VBMI-ONLY: # %bb.0:
; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k0
; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6,6,6,6,7,7,7,7,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VBMI-ONLY-NEXT: movabsq $1099511627775, %rax # imm = 0xFFFFFFFFFF
; AVX512VBMI-ONLY-NEXT: kmovq %rax, %k1
; AVX512VBMI-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa %ymm1, 128(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512VBMI-ONLY-NEXT: vzeroupper
; AVX512VBMI-ONLY-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
%data = call <40 x i32> @llvm.masked.load.v40i32.p0(ptr %in.vec, i32 64, <40 x i1> %tgt.mask, <40 x i32> poison)
%data.padded = shufflevector <40 x i32> %data, <40 x i32> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
store <40 x i32> %data, ptr %out.vec, align 64
ret void
}
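; Factor 5, vf 16: 16 mask bits -> <80 x i1>, i.e. five <16 x i32> masked loads driven by five 16-wide dword permutation patterns.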
define void @mask_replication_factor5_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor5_vf16:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: movw $1, %ax
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k5
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm1 {%k5} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k4} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k3} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 256(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor5_vf16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: movw $1, %ax
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k5
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm1 {%k5} {z}
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k4} {z}
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k3} {z}
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 256(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor5_vf16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2
; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm3
; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1
; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
; AVX512BW-NEXT: vpermd %zmm0, %zmm4, %zmm4
; AVX512BW-NEXT: vptestmd %zmm4, %zmm4, %k1
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
; AVX512BW-NEXT: vpermd %zmm0, %zmm5, %zmm0
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm2, 256(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
%data = call <80 x i32> @llvm.masked.load.v80i32.p0(ptr %in.vec, i32 64, <80 x i1> %tgt.mask, <80 x i32> poison)
store <80 x i32> %data, ptr %out.vec, align 64
ret void
}
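; Same replication-by-5 pattern for 32 source mask bits (32 -> 160 lanes, ten
; 64-byte output vectors). The AVX512F-ONLY/DQ lowerings reuse the five vpermd
; patterns on two 16-bit mask chunks; the AVX512BW lowering instead assembles
; each 16-bit mask chunk directly in mask registers via kshift/kor/kand with
; spilled bit-clear constants rather than going through vector permutes.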
define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor5_vf32:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT: movw $1, %ax
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm4
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm0
; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm2
; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm5, %zmm5
; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm7
; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm3
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm9 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 256(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 320(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 384(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 448(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 512(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 576(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor5_vf32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2
; AVX512DQ-NEXT: movw $1, %ax
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm4
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm0
; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm1
; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm2
; AVX512DQ-NEXT: vpermd %zmm3, %zmm5, %zmm5
; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm7
; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm3
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm9 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1
; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1
; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 256(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 384(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 448(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm7, 512(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 576(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor5_vf32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd (%rdi), %k5
; AVX512BW-NEXT: kshiftrd $1, %k5, %k1
; AVX512BW-NEXT: movw $-3, %ax
; AVX512BW-NEXT: kmovd %eax, %k6
; AVX512BW-NEXT: kmovw (%rdi), %k2
; AVX512BW-NEXT: kandw %k6, %k2, %k3
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $14, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: movw $-5, %ax
; AVX512BW-NEXT: kmovd %eax, %k4
; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $13, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: movw $-9, %ax
; AVX512BW-NEXT: kmovd %eax, %k4
; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: movw $-17, %ax
; AVX512BW-NEXT: kmovd %eax, %k4
; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $11, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k3, %k2
; AVX512BW-NEXT: movw $-33, %ax
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-65, %ax
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-129, %ax
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrw $8, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrw $7, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF
; AVX512BW-NEXT: kmovd %eax, %k7
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k3
; AVX512BW-NEXT: kshiftrd $2, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k2
; AVX512BW-NEXT: kshiftrw $5, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF
; AVX512BW-NEXT: kmovd %eax, %k4
; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $4, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF
; AVX512BW-NEXT: kmovd %eax, %k4
; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $3, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF
; AVX512BW-NEXT: kmovd %eax, %k4
; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $2, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k3, %k2
; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrd $3, %k5, %k2
; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $29, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k2
; AVX512BW-NEXT: kshiftrd $28, %k5, %k1
; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kandw %k6, %k1, %k3
; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k3, %k3
; AVX512BW-NEXT: kshiftrw $13, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $11, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $10, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k3, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrd $30, %k5, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $9, %k3, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $8, %k3, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $7, %k3, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrw $6, %k3, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $5, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrd $31, %k5, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k4
; AVX512BW-NEXT: kshiftrw $4, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrw $3, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrw $2, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm1 {%k2} {z}
; AVX512BW-NEXT: kshiftrd $25, %k5, %k2
; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kandw %k6, %k2, %k3
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k2, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kandw %k0, %k3, %k3
; AVX512BW-NEXT: kshiftrd $26, %k5, %k7
; AVX512BW-NEXT: kshiftlw $15, %k7, %k7
; AVX512BW-NEXT: kshiftrw $13, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $11, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $10, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k3, %k3
; AVX512BW-NEXT: kshiftrw $9, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrd $27, %k5, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $8, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $4, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $15, %k0, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftlw $14, %k0, %k1
; AVX512BW-NEXT: korw %k1, %k3, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm2 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $22, %k5, %k0
; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $13, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k6, %k6
; AVX512BW-NEXT: kshiftrd $23, %k5, %k7
; AVX512BW-NEXT: kmovq %k5, %k0
; AVX512BW-NEXT: kshiftlw $15, %k7, %k7
; AVX512BW-NEXT: kshiftrw $12, %k7, %k5
; AVX512BW-NEXT: korw %k5, %k6, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $11, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $10, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $9, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $8, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kandw %k2, %k5, %k5
; AVX512BW-NEXT: kshiftrd $24, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k5, %k5
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k5, %k5
; AVX512BW-NEXT: kshiftrw $3, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kshiftrw $2, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kandw %k4, %k5, %k5
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k5, %k2
; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
; AVX512BW-NEXT: korw %k7, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm3 {%k2} {z}
; AVX512BW-NEXT: kshiftrd $19, %k0, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k2, %k4
; AVX512BW-NEXT: kshiftlw $15, %k2, %k6
; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k6, %k5
; AVX512BW-NEXT: korw %k5, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k4, %k4
; AVX512BW-NEXT: kshiftrw $13, %k6, %k5
; AVX512BW-NEXT: korw %k5, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k4, %k4
; AVX512BW-NEXT: kshiftrw $12, %k6, %k5
; AVX512BW-NEXT: korw %k5, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k4, %k4
; AVX512BW-NEXT: kshiftrd $20, %k0, %k5
; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
; AVX512BW-NEXT: kshiftrw $11, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k4, %k4
; AVX512BW-NEXT: kshiftrw $10, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k4, %k4
; AVX512BW-NEXT: kshiftrw $9, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k4, %k4
; AVX512BW-NEXT: kshiftrw $8, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k4, %k4
; AVX512BW-NEXT: kshiftrw $7, %k5, %k5
; AVX512BW-NEXT: korw %k5, %k4, %k4
; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrd $21, %k0, %k5
; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
; AVX512BW-NEXT: kshiftrw $6, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kandw %k3, %k4, %k4
; AVX512BW-NEXT: kshiftrw $5, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $4, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $3, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $2, %k5, %k5
; AVX512BW-NEXT: korw %k5, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k4, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm4 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $16, %k0, %k1
; AVX512BW-NEXT: kandw %k7, %k1, %k3
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $14, %k1, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k3, %k3
; AVX512BW-NEXT: kshiftrw $13, %k1, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k1, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $11, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k3, %k1
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $17, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $10, %k3, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k3, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k3, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k3, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $18, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k4
; AVX512BW-NEXT: kshiftrw $5, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $13, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrd $12, %k0, %k3
; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k2
; AVX512BW-NEXT: kshiftrw $14, %k1, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $13, %k1, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrw $12, %k1, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrw $11, %k1, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $14, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $15, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k4
; AVX512BW-NEXT: kshiftrw $4, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm6 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $9, %k0, %k2
; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k4
; AVX512BW-NEXT: kshiftlw $15, %k2, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k1, %k5
; AVX512BW-NEXT: korw %k5, %k4, %k4
; AVX512BW-NEXT: kandw %k6, %k4, %k4
; AVX512BW-NEXT: kshiftrd $10, %k0, %k5
; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
; AVX512BW-NEXT: kshiftrw $13, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $12, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kandw %k7, %k4, %k4
; AVX512BW-NEXT: kshiftrw $11, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $10, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $9, %k5, %k5
; AVX512BW-NEXT: korw %k5, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrd $11, %k0, %k5
; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
; AVX512BW-NEXT: kshiftrw $8, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $7, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $6, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $5, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k4, %k4
; AVX512BW-NEXT: kshiftrw $4, %k5, %k5
; AVX512BW-NEXT: korw %k5, %k4, %k4
; AVX512BW-NEXT: kandw %k3, %k4, %k4
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $15, %k7, %k5
; AVX512BW-NEXT: kshiftrw $3, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $2, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k4, %k4
; AVX512BW-NEXT: kshiftlw $14, %k7, %k3
; AVX512BW-NEXT: korw %k3, %k4, %k3
; AVX512BW-NEXT: kshiftlw $1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $1, %k3, %k3
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k3} {z}
; AVX512BW-NEXT: kshiftrd $6, %k0, %k4
; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k4, %k5
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrw $13, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrd $7, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $12, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrw $11, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrw $10, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k5, %k5
; AVX512BW-NEXT: kshiftrw $9, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrw $8, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k5, %k5
; AVX512BW-NEXT: kshiftrd $8, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k5, %k5
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k5, %k5
; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kandw %k2, %k5, %k5
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k5, %k5
; AVX512BW-NEXT: kshiftrw $3, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kshiftrw $2, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k5, %k5
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k5, %k2
; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm8 {%k1} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kshiftrw $14, %k5, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k5, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k5, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $4, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $11, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $5, %k0, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: korw %k1, %k0, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm9 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm6, 256(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm4, 384(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm3, 448(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm2, 512(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm1, 576(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
%data = call <160 x i32> @llvm.masked.load.v160i32.p0(ptr %in.vec, i32 64, <160 x i1> %tgt.mask, <160 x i32> poison)
store <160 x i32> %data, ptr %out.vec, align 64
ret void
}
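; Full 64-bit source mask (64 -> 320 lanes, twenty 64-byte output vectors).
; The AVX512F-ONLY/DQ lowerings process the mask one kmovw (16-bit) chunk at a
; time from 0/2/4/6(%rdi); AVX512BW loads the whole mask with kmovq and expands
; it with mask-register arithmetic as in the vf32 case.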
define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor5_vf64:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: movw $1, %ax
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm8 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm8, %zmm2
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm9, %zmm10
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm11, %zmm12
; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm3, %zmm4
; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm13
; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm8, %zmm14
; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm9, %zmm15
; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm11, %zmm16
; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm3, %zmm5
; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm7, %zmm17
; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm8, %zmm18
; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm9, %zmm19
; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm3, %zmm3
; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm11, %zmm6
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm7
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm8, %zmm8
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm9
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm0
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm19 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm17, %zmm17, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm17 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm16 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm15 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm13 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm4 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm12 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm10 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 1216(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 1152(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1088(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1024(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 960(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 896(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 832(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 768(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 704(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 640(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 576(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 512(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 448(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 320(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 256(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor5_vf64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: movw $1, %ax
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-NEXT: kmovw 6(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4
; AVX512DQ-NEXT: kmovw 4(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5
; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm6
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
; AVX512DQ-NEXT: vpermd %zmm4, %zmm8, %zmm2
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
; AVX512DQ-NEXT: vpermd %zmm4, %zmm9, %zmm10
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
; AVX512DQ-NEXT: vpermd %zmm4, %zmm11, %zmm12
; AVX512DQ-NEXT: vpermd %zmm4, %zmm3, %zmm4
; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm13
; AVX512DQ-NEXT: vpermd %zmm5, %zmm8, %zmm14
; AVX512DQ-NEXT: vpermd %zmm5, %zmm9, %zmm15
; AVX512DQ-NEXT: vpermd %zmm5, %zmm11, %zmm16
; AVX512DQ-NEXT: vpermd %zmm5, %zmm3, %zmm5
; AVX512DQ-NEXT: vpermd %zmm6, %zmm7, %zmm17
; AVX512DQ-NEXT: vpermd %zmm6, %zmm8, %zmm18
; AVX512DQ-NEXT: vpermd %zmm6, %zmm9, %zmm19
; AVX512DQ-NEXT: vpermd %zmm6, %zmm3, %zmm3
; AVX512DQ-NEXT: vpermd %zmm6, %zmm11, %zmm6
; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm7
; AVX512DQ-NEXT: vpermd %zmm0, %zmm8, %zmm8
; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm9
; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm0
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1
; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1
; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1
; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm19 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1
; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm17, %k1
; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm17 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1
; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1
; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm16 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1
; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm15 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1
; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1
; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm13 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1
; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm4 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1
; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm12 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1
; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm10 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm2 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1216(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1152(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1088(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1024(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 960(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm13, 896(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm14, 832(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm15, 768(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm16, 704(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 640(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm17, 576(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm18, 512(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm19, 448(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm6, 384(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 320(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm7, 256(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm9, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor5_vf64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovq (%rdi), %k5
; AVX512BW-NEXT: kshiftrq $1, %k5, %k0
; AVX512BW-NEXT: movw $-3, %ax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kmovw (%rdi), %k2
; AVX512BW-NEXT: kandw %k1, %k2, %k3
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $14, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: movw $-5, %ax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $13, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: movw $-9, %ax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: movw $-17, %ax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $11, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k3, %k2
; AVX512BW-NEXT: movw $-33, %ax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k0, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-65, %ax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k0, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-129, %ax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $8, %k0, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $7, %k0, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $6, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k2, %k0
; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k1, %k0, %k3
; AVX512BW-NEXT: kshiftrq $2, %k5, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k2
; AVX512BW-NEXT: kshiftrw $5, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $4, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $3, %k2, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k7
; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k1, %k7, %k7
; AVX512BW-NEXT: kshiftrw $2, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k7, %k7
; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF
; AVX512BW-NEXT: kmovd %eax, %k6
; AVX512BW-NEXT: kandw %k6, %k7, %k7
; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k7, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k1
; AVX512BW-NEXT: kshiftrq $3, %k5, %k7
; AVX512BW-NEXT: kshiftlw $15, %k7, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k7, %k1
; AVX512BW-NEXT: kshiftrw $14, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $4, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $5, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrq $6, %k5, %k1
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $15, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k6
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k6} {z}
; AVX512BW-NEXT: kandw %k2, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k7, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k7, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $7, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $8, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $9, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrq $10, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $11, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrq $12, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k6, %k0, %k6
; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k6} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrq $13, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $14, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $15, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k6, %k0, %k1
; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $16, %k5, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $17, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrq $18, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $19, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $20, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $21, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $22, %k5, %k1
; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $23, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $24, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $25, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrq $26, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $27, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $28, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k6, %k0, %k6
; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k6} {z}
; AVX512BW-NEXT: kandw %k2, %k1, %k0
; AVX512BW-NEXT: kshiftrq $29, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrq $30, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $31, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k6, %k0, %k1
; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $32, %k5, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $33, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $34, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $35, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrq $36, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $37, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $38, %k5, %k1
; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $39, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $40, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $41, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k7} {z}
; AVX512BW-NEXT: kandw %k3, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $42, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrq $43, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $44, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k6, %k0, %k6
; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k6} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrq $45, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $46, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrq $47, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k6, %k0, %k1
; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $48, %k5, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $49, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $50, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $51, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $52, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $53, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $54, %k5, %k1
; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrq $55, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $56, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrq $57, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $58, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $59, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrq $60, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k6, %k0, %k6
; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k6} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrq $61, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $62, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrq $63, %k5, %k5
; AVX512BW-NEXT: kshiftrw $5, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $15, %k5, %k1
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k4
; AVX512BW-NEXT: korw %k4, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k5, %k2
; AVX512BW-NEXT: korw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k1, %k0, %k1
; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm19, 1216(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm18, 1152(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm17, 1088(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm14, 896(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm13, 832(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm12, 768(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm11, 704(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm10, 640(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
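; The IR below expresses the whole expansion as one 320-element
; shufflevector (lane j takes source bit j/5, integer division) feeding a
; single @llvm.masked.load; each masked vmovdqa32 above realizes one
; 16-lane slice of that load, and the twenty vmovdqa64 stores write the
; slices back out.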
%src.mask = load <64 x i1>, ptr %in.maskvec, align 64
%tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
%data = call <320 x i32> @llvm.masked.load.v320i32.p0(ptr %in.vec, i32 64, <320 x i1> %tgt.mask, <320 x i32> poison)
store <320 x i32> %data, ptr %out.vec, align 64
ret void
}
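; Replication factor 6, VF=2: each of the low 2 mask bits is repeated 6 times to
; form a <12 x i1> mask for the masked load of <12 x i32>. Only 12 of the 16
; zmm lanes are live, hence the 0xFFF k-mask limiting the vptestmd/vpcmpgtd result.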
define void @mask_replication_factor6_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor6_vf2:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-ONLY-NEXT: movw $4095, %ax # imm = 0xFFF
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor6_vf2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: movw $4095, %ax # imm = 0xFFF
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 {%k1}
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx)
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor6_vf2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0]
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512BW-NEXT: movw $4095, %ax # imm = 0xFFF
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx)
; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
%tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%data = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %in.vec, i32 64, <12 x i1> %tgt.mask, <12 x i32> poison)
%data.padded = shufflevector <12 x i32> %data, <12 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
store <12 x i32> %data, ptr %out.vec, align 64
ret void
}
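; Replication factor 6, VF=4: the low 4 mask bits are repeated 6 times each to
; form a <24 x i1> mask for the masked load of <24 x i32>, split across one full
; 16-lane k-mask and one 8-lane k-mask.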
define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-SLOW-LABEL: mask_replication_factor6_vf4:
; AVX512F-SLOW: # %bb.0:
; AVX512F-SLOW-NEXT: kmovw (%rdi), %k1
; AVX512F-SLOW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1]
; AVX512F-SLOW-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512F-SLOW-NEXT: movw $255, %ax
; AVX512F-SLOW-NEXT: kmovw %eax, %k1
; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
; AVX512F-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-SLOW-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512F-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512F-SLOW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 64(%rdx)
; AVX512F-SLOW-NEXT: vzeroupper
; AVX512F-SLOW-NEXT: retq
;
; AVX512F-FAST-LABEL: mask_replication_factor6_vf4:
; AVX512F-FAST: # %bb.0:
; AVX512F-FAST-NEXT: kmovw (%rdi), %k1
; AVX512F-FAST-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3]
; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1
; AVX512F-FAST-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512F-FAST-NEXT: movw $255, %ax
; AVX512F-FAST-NEXT: kmovw %eax, %k1
; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-FAST-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512F-FAST-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512F-FAST-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512F-FAST-NEXT: vmovdqa %ymm0, 64(%rdx)
; AVX512F-FAST-NEXT: vzeroupper
; AVX512F-FAST-NEXT: retq
;
; AVX512DQ-SLOW-LABEL: mask_replication_factor6_vf4:
; AVX512DQ-SLOW: # %bb.0:
; AVX512DQ-SLOW-NEXT: kmovw (%rdi), %k0
; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1]
; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-SLOW-NEXT: movw $255, %ax
; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
; AVX512DQ-SLOW-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
; AVX512DQ-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm0, %k2
; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 64(%rdx)
; AVX512DQ-SLOW-NEXT: vzeroupper
; AVX512DQ-SLOW-NEXT: retq
;
; AVX512DQ-FAST-LABEL: mask_replication_factor6_vf4:
; AVX512DQ-FAST: # %bb.0:
; AVX512DQ-FAST-NEXT: kmovw (%rdi), %k0
; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3]
; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1
; AVX512DQ-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-FAST-NEXT: movw $255, %ax
; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
; AVX512DQ-FAST-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-FAST-NEXT: vpmovd2m %zmm0, %k2
; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 64(%rdx)
; AVX512DQ-FAST-NEXT: vzeroupper
; AVX512DQ-FAST-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor6_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd (%rdi), %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: movl $16777215, %eax # imm = 0xFFFFFF
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vpcmpgtw %zmm0, %zmm1, %k1 {%k1}
; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-NEXT: vmovdqa %ymm0, 64(%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%data = call <24 x i32> @llvm.masked.load.v24i32.p0(ptr %in.vec, i32 64, <24 x i1> %tgt.mask, <24 x i32> poison)
%data.padded = shufflevector <24 x i32> %data, <24 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
store <24 x i32> %data, ptr %out.vec, align 64
ret void
}
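; Replication factor 6, VF=8: the low 8 mask bits expand to a <48 x i1> mask;
; three vpermd replication patterns produce the three 16-lane k-masks that drive
; the three 64-byte masked loads.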
define void @mask_replication_factor6_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor6_vf8:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: movw $1, %ax
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor6_vf8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovb (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: movw $1, %ax
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z}
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor6_vf8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2
; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
%data = call <48 x i32> @llvm.masked.load.v48i32.p0(ptr %in.vec, i32 64, <48 x i1> %tgt.mask, <48 x i32> poison)
store <48 x i32> %data, ptr %out.vec, align 64
ret void
}
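; Replication factor 6, VF=16: 16 mask bits expand to a <96 x i1> mask; six
; vpermd replication patterns split the expansion across six 16-lane k-masks
; and six 64-byte masked loads.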
define void @mask_replication_factor6_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor6_vf16:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: movw $1, %ax
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k6
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k6} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k5} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k3} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 256(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 320(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor6_vf16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: movw $1, %ax
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k6
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k6} {z}
; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k5} {z}
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z}
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k3} {z}
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor6_vf16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2
; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm3
; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1
; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
; AVX512BW-NEXT: vpermd %zmm0, %zmm4, %zmm4
; AVX512BW-NEXT: vptestmd %zmm4, %zmm4, %k1
; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm4 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
; AVX512BW-NEXT: vpermd %zmm0, %zmm5, %zmm5
; AVX512BW-NEXT: vptestmd %zmm5, %zmm5, %k1
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
; AVX512BW-NEXT: vpermd %zmm0, %zmm6, %zmm0
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm3, 256(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm2, 320(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
%data = call <96 x i32> @llvm.masked.load.v96i32.p0(ptr %in.vec, i32 64, <96 x i1> %tgt.mask, <96 x i32> poison)
store <96 x i32> %data, ptr %out.vec, align 64
ret void
}
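; Replication factor 6, VF=32: 32 mask bits expand to a <192 x i1> mask (twelve
; 16-lane chunks). The AVX512F/DQ paths reuse the six vpermd patterns on each
; 16-bit half of the source mask; the AVX512BW path instead assembles each
; 16-lane k-mask bit by bit with kshift/kand/kor, spilling k-register constants
; to the stack along the way.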
define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor6_vf32:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT: movw $1, %ax
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm4
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm11 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm0
; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm2
; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm5, %zmm5
; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm7
; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm9
; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm3
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm9 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 256(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 320(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 384(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 448(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 512(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 576(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 640(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 704(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor6_vf32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2
; AVX512DQ-NEXT: movw $1, %ax
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm4
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm0
; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm1
; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm2
; AVX512DQ-NEXT: vpermd %zmm3, %zmm5, %zmm5
; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm7
; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm9
; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm3
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1
; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm9 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1
; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1
; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm2 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1
; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm10, 256(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 320(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 384(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 448(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 512(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm7, 576(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm9, 640(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 704(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor6_vf32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd (%rdi), %k5
; AVX512BW-NEXT: movw $-3, %ax
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: kandw %k0, %k1, %k2
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $14, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-5, %ax
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k0, %k2, %k2
; AVX512BW-NEXT: kshiftrw $13, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-9, %ax
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k0, %k2, %k2
; AVX512BW-NEXT: kshiftrw $12, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-17, %ax
; AVX512BW-NEXT: kmovd %eax, %k7
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $11, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-33, %ax
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k0, %k2, %k2
; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: movw $-65, %ax
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k0, %k1, %k2
; AVX512BW-NEXT: kshiftrd $1, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-129, %ax
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k0, %k2, %k2
; AVX512BW-NEXT: kshiftrw $8, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k0, %k2, %k2
; AVX512BW-NEXT: kshiftrw $7, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k0, %k2, %k2
; AVX512BW-NEXT: kshiftrw $6, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF
; AVX512BW-NEXT: kmovd %eax, %k6
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $5, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k0, %k2, %k2
; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k0, %k1, %k1
; AVX512BW-NEXT: kshiftrd $2, %k5, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k3
; AVX512BW-NEXT: kmovq %k2, %k4
; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kshiftrw $3, %k3, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k3, %k2
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k4, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $29, %k5, %k0
; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k4
; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrd $30, %k5, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $11, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kandw %k6, %k1, %k4
; AVX512BW-NEXT: kshiftrd $31, %k5, %k7
; AVX512BW-NEXT: kshiftlw $15, %k7, %k1
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k4, %k4
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k4, %k4
; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k4, %k4
; AVX512BW-NEXT: kshiftrw $2, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k4, %k4
; AVX512BW-NEXT: kshiftlw $14, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kshiftlw $1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $1, %k4, %k4
; AVX512BW-NEXT: korw %k1, %k4, %k1
; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $26, %k5, %k4
; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k4, %k6
; AVX512BW-NEXT: kshiftlw $15, %k4, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kandw %k2, %k6, %k6
; AVX512BW-NEXT: kshiftrd $27, %k5, %k7
; AVX512BW-NEXT: kmovq %k5, %k2
; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kshiftlw $15, %k7, %k7
; AVX512BW-NEXT: kshiftrw $13, %k7, %k5
; AVX512BW-NEXT: korw %k5, %k6, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k5, %k5
; AVX512BW-NEXT: kshiftrw $12, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $11, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $10, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $9, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $8, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrd $28, %k2, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k5, %k5
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k2, %k3
; AVX512BW-NEXT: korw %k3, %k5, %k3
; AVX512BW-NEXT: kshiftlw $1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $1, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: korw %k2, %k3, %k2
; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k2} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
; AVX512BW-NEXT: kshiftrd $24, %k0, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k2, %k3
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $14, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k3, %k3
; AVX512BW-NEXT: kshiftrw $13, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k3, %k3
; AVX512BW-NEXT: kshiftrw $11, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $10, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k3, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrd $25, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $9, %k3, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $8, %k3, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $7, %k3, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $6, %k3, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $5, %k3, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $4, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kshiftrw $3, %k4, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $2, %k4, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k0, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
; AVX512BW-NEXT: korw %k4, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT: kshiftrd $21, %k1, %k2
; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k2, %k3
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k3, %k3
; AVX512BW-NEXT: kshiftrw $13, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: kandw %k6, %k3, %k3
; AVX512BW-NEXT: kshiftrd $22, %k1, %k4
; AVX512BW-NEXT: kmovq %k1, %k7
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $11, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k3, %k3
; AVX512BW-NEXT: kshiftrw $10, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k3, %k3
; AVX512BW-NEXT: kshiftrw $9, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $8, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $7, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k3, %k3
; AVX512BW-NEXT: kshiftrw $6, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k3, %k4
; AVX512BW-NEXT: kshiftrd $23, %k7, %k5
; AVX512BW-NEXT: kshiftlw $15, %k5, %k3
; AVX512BW-NEXT: kshiftrw $5, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k4, %k4
; AVX512BW-NEXT: kshiftrw $4, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k4, %k4
; AVX512BW-NEXT: kshiftrw $3, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k4, %k4
; AVX512BW-NEXT: kshiftrw $2, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k4, %k4
; AVX512BW-NEXT: kshiftlw $14, %k5, %k5
; AVX512BW-NEXT: korw %k5, %k4, %k4
; AVX512BW-NEXT: kshiftlw $1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $1, %k4, %k4
; AVX512BW-NEXT: korw %k3, %k4, %k3
; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k3} {z}
; AVX512BW-NEXT: kmovq %k7, %k4
; AVX512BW-NEXT: kshiftrd $18, %k7, %k6
; AVX512BW-NEXT: kmovd %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k6, %k5
; AVX512BW-NEXT: kshiftlw $15, %k6, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrd $19, %k7, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $13, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrw $12, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrw $11, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrw $10, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kshiftrw $9, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $8, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kandw %k2, %k5, %k5
; AVX512BW-NEXT: kshiftrd $20, %k4, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k1, %k2
; AVX512BW-NEXT: korw %k2, %k5, %k2
; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: korw %k0, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z}
; AVX512BW-NEXT: kmovq %k4, %k0
; AVX512BW-NEXT: kshiftrd $16, %k4, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k2
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $14, %k1, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $13, %k1, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrw $12, %k1, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrw $11, %k1, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrd $17, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kshiftrw $3, %k3, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k3, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k1
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
; AVX512BW-NEXT: kshiftrd $13, %k0, %k3
; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k2
; AVX512BW-NEXT: kshiftlw $15, %k3, %k5
; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k5, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $13, %k5, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrw $12, %k5, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrd $14, %k0, %k3
; AVX512BW-NEXT: kmovq %k0, %k7
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $11, %k3, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k2, %k2
; AVX512BW-NEXT: kshiftrw $10, %k3, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k3, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k2, %k2
; AVX512BW-NEXT: kshiftrw $8, %k3, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $7, %k3, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k2, %k2
; AVX512BW-NEXT: kshiftrw $6, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k2, %k3
; AVX512BW-NEXT: kshiftrd $15, %k7, %k5
; AVX512BW-NEXT: kshiftlw $15, %k5, %k2
; AVX512BW-NEXT: kshiftrw $5, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k3, %k3
; AVX512BW-NEXT: kshiftrw $4, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k3, %k3
; AVX512BW-NEXT: kshiftrw $3, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k3, %k3
; AVX512BW-NEXT: kshiftrw $2, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k3, %k3
; AVX512BW-NEXT: kshiftlw $14, %k5, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kshiftlw $1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $1, %k3, %k3
; AVX512BW-NEXT: korw %k2, %k3, %k2
; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k2} {z}
; AVX512BW-NEXT: kmovq %k7, %k2
; AVX512BW-NEXT: kshiftrd $10, %k7, %k0
; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kandw %k1, %k0, %k5
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kshiftrd $11, %k7, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $13, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kshiftrw $12, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kshiftrw $11, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $10, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $9, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $8, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kandw %k4, %k5, %k5
; AVX512BW-NEXT: kshiftrd $12, %k2, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k5, %k5
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k1, %k4
; AVX512BW-NEXT: korw %k4, %k5, %k4
; AVX512BW-NEXT: kshiftlw $1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $1, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: korw %k1, %k4, %k1
; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm8 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $8, %k2, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k4
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $14, %k1, %k5
; AVX512BW-NEXT: korw %k5, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k4, %k4
; AVX512BW-NEXT: kshiftrw $13, %k1, %k5
; AVX512BW-NEXT: korw %k5, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k4, %k4
; AVX512BW-NEXT: kshiftrw $12, %k1, %k5
; AVX512BW-NEXT: korw %k5, %k4, %k4
; AVX512BW-NEXT: kandw %k0, %k4, %k4
; AVX512BW-NEXT: kshiftrw $11, %k1, %k5
; AVX512BW-NEXT: korw %k5, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k4, %k4
; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k4, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k1
; AVX512BW-NEXT: kshiftrd $9, %k2, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $9, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kshiftrw $3, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k2, %k4
; AVX512BW-NEXT: kmovq %k2, %k5
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT: kshiftrd $5, %k1, %k2
; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kandw %k6, %k2, %k3
; AVX512BW-NEXT: kshiftlw $15, %k2, %k7
; AVX512BW-NEXT: kshiftrw $14, %k7, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $13, %k7, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k7, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrd $6, %k1, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $11, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $10, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $9, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $8, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kandw %k0, %k3, %k3
; AVX512BW-NEXT: kshiftrw $7, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k3, %k3
; AVX512BW-NEXT: kshiftrw $6, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k3, %k4
; AVX512BW-NEXT: kshiftrd $7, %k1, %k5
; AVX512BW-NEXT: kshiftlw $15, %k5, %k3
; AVX512BW-NEXT: kshiftrw $5, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k4, %k4
; AVX512BW-NEXT: kshiftrw $4, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k4, %k4
; AVX512BW-NEXT: kshiftrw $3, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k4, %k4
; AVX512BW-NEXT: kshiftrw $2, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k4, %k4
; AVX512BW-NEXT: kshiftlw $14, %k5, %k5
; AVX512BW-NEXT: korw %k5, %k4, %k4
; AVX512BW-NEXT: kshiftlw $1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $1, %k4, %k4
; AVX512BW-NEXT: korw %k3, %k4, %k3
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k3} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kshiftrw $14, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrd $3, %k1, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $13, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k3, %k3
; AVX512BW-NEXT: kshiftrw $11, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k3, %k3
; AVX512BW-NEXT: kshiftrw $10, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k3, %k3
; AVX512BW-NEXT: kshiftrw $9, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $8, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: kandw %k0, %k3, %k3
; AVX512BW-NEXT: kshiftrd $4, %k1, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k0, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $6, %k0, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $5, %k0, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $4, %k0, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $3, %k0, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k3, %k0
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k1, %k2
; AVX512BW-NEXT: korw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k7, %k0, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm11 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm7, 320(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm4, 512(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm3, 576(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm2, 640(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm1, 704(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
%data = call <192 x i32> @llvm.masked.load.v192i32.p0(ptr %in.vec, i32 64, <192 x i1> %tgt.mask, <192 x i32> poison)
store <192 x i32> %data, ptr %out.vec, align 64
ret void
}
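; Replication factor 6 applied to a full 64-bit mask. As the codegen below
; shows, the AVX512F-ONLY and AVX512DQ paths expand each 16-bit chunk of the
; mask with six vpermd index patterns and re-materialize the %k masks
; (vptestmd / vpmovd2m), while the AVX512BW path splices the replicated bits
; together one 16-bit mask word at a time with kshift/kand/kor, spilling and
; reloading mask registers through the stack.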
define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor6_vf64:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm4, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: movw $1, %ax
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm8, %zmm8, %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm9, %zmm9, %zmm9 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm10 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm10, %zmm1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm11 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm11, %zmm2
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm12 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm12, %zmm3
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm13 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm13, %zmm5
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm14 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm14, %zmm6
; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm4, %zmm7
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm10, %zmm15
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm11, %zmm16
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm12, %zmm17
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm13, %zmm18
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm14, %zmm19
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm4, %zmm8
; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm10, %zmm20
; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm11, %zmm21
; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm12, %zmm22
; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm13, %zmm23
; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm4, %zmm24
; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm14, %zmm9
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm10, %zmm10
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm11
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm12, %zmm12
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm13
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm14, %zmm4
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm12 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm11 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm10 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm24, %zmm24, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm14 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm9 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm23, %zmm23, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm23 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm22, %zmm22, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm22 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm21, %zmm21, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm21 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm20, %zmm20, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm20 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm19 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm18 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm17, %zmm17, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm17 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm15 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 1472(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 1408(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 1344(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 1280(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1216(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 1152(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 1088(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 1024(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 960(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 896(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 832(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 768(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 704(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 640(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 576(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 512(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 448(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 384(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 320(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 256(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor6_vf64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: movw $1, %ax
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-NEXT: kmovw 6(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm7
; AVX512DQ-NEXT: kmovw 4(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8
; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm9
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpermd %zmm7, %zmm10, %zmm1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
; AVX512DQ-NEXT: vpermd %zmm7, %zmm11, %zmm2
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
; AVX512DQ-NEXT: vpermd %zmm7, %zmm12, %zmm3
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
; AVX512DQ-NEXT: vpermd %zmm7, %zmm13, %zmm5
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
; AVX512DQ-NEXT: vpermd %zmm7, %zmm14, %zmm6
; AVX512DQ-NEXT: vpermd %zmm7, %zmm4, %zmm7
; AVX512DQ-NEXT: vpermd %zmm8, %zmm10, %zmm15
; AVX512DQ-NEXT: vpermd %zmm8, %zmm11, %zmm16
; AVX512DQ-NEXT: vpermd %zmm8, %zmm12, %zmm17
; AVX512DQ-NEXT: vpermd %zmm8, %zmm13, %zmm18
; AVX512DQ-NEXT: vpermd %zmm8, %zmm14, %zmm19
; AVX512DQ-NEXT: vpermd %zmm8, %zmm4, %zmm8
; AVX512DQ-NEXT: vpermd %zmm9, %zmm10, %zmm20
; AVX512DQ-NEXT: vpermd %zmm9, %zmm11, %zmm21
; AVX512DQ-NEXT: vpermd %zmm9, %zmm12, %zmm22
; AVX512DQ-NEXT: vpermd %zmm9, %zmm13, %zmm23
; AVX512DQ-NEXT: vpermd %zmm9, %zmm4, %zmm24
; AVX512DQ-NEXT: vpermd %zmm9, %zmm14, %zmm9
; AVX512DQ-NEXT: vpermd %zmm0, %zmm10, %zmm10
; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm11
; AVX512DQ-NEXT: vpermd %zmm0, %zmm12, %zmm12
; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm13
; AVX512DQ-NEXT: vpermd %zmm0, %zmm14, %zmm4
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm12 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1
; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm11 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1
; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm10 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm24, %k1
; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm14 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1
; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm9 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm23, %k1
; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm23 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm22, %k1
; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm22 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm21, %k1
; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm21 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm20, %k1
; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm20 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1
; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm8 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1
; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm19 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1
; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm18 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm17, %k1
; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm17 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1
; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1
; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm15 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1
; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1
; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm6 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1
; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm2 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1472(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1408(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1344(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 1280(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1216(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1152(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm15, 1088(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm16, 1024(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm17, 960(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm18, 896(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm19, 832(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm8, 768(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm20, 704(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm21, 640(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm22, 576(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm23, 512(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm9, 448(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm14, 384(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm10, 320(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm11, 256(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm12, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm13, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor6_vf64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovq (%rdi), %k5
; AVX512BW-NEXT: movw $-3, %ax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kmovw (%rdi), %k0
; AVX512BW-NEXT: kandw %k1, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
; AVX512BW-NEXT: kshiftrw $14, %k1, %k0
; AVX512BW-NEXT: korw %k0, %k3, %k0
; AVX512BW-NEXT: movw $-5, %ax
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: movw $-9, %ax
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: movw $-17, %ax
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: movw $-33, %ax
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: movw $-65, %ax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $1, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: movw $-129, %ax
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k1, %k0, %k3
; AVX512BW-NEXT: kshiftrq $2, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k0
; AVX512BW-NEXT: kshiftrw $3, %k0, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k7
; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k7, %k7
; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k7, %k6
; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $1, %k6, %k6
; AVX512BW-NEXT: korw %k0, %k6, %k6
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k6} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $14, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kmovq %k5, %k3
; AVX512BW-NEXT: kshiftrq $3, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $4, %k3, %k1
; AVX512BW-NEXT: kmovq %k3, %k7
; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrq $5, %k7, %k1
; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k7} {z}
; AVX512BW-NEXT: kandw %k4, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $6, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftrq $7, %k7, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z}
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $8, %k7, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrq $9, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $10, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $11, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrq $12, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $13, %k7, %k1
; AVX512BW-NEXT: kmovq %k7, %k2
; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kmovq %k2, %k7
; AVX512BW-NEXT: kshiftrq $14, %k2, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k1
; AVX512BW-NEXT: kshiftrq $15, %k7, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z}
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $16, %k5, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $17, %k5, %k1
; AVX512BW-NEXT: kmovq %k5, %k7
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kmovq %k7, %k4
; AVX512BW-NEXT: kshiftrq $18, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $19, %k4, %k1
; AVX512BW-NEXT: kmovq %k4, %k7
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $20, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrq $21, %k7, %k1
; AVX512BW-NEXT: kmovq %k7, %k3
; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kmovq %k3, %k5
; AVX512BW-NEXT: kshiftrq $22, %k3, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftrq $23, %k5, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z}
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $24, %k5, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $25, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $26, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kmovq %k5, %k7
; AVX512BW-NEXT: kshiftrq $27, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $28, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $29, %k7, %k1
; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $30, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k1
; AVX512BW-NEXT: kshiftrq $31, %k7, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z}
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $32, %k2, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $33, %k2, %k1
; AVX512BW-NEXT: kmovq %k2, %k7
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kmovq %k7, %k5
; AVX512BW-NEXT: kshiftrq $34, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $35, %k5, %k1
; AVX512BW-NEXT: kmovq %k5, %k7
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $36, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrq $37, %k7, %k1
; AVX512BW-NEXT: kmovq %k7, %k3
; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kmovq %k3, %k7
; AVX512BW-NEXT: kshiftrq $38, %k3, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k1
; AVX512BW-NEXT: kshiftrq $39, %k7, %k6
; AVX512BW-NEXT: kmovq %k7, %k5
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $40, %k5, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $41, %k5, %k1
; AVX512BW-NEXT: kmovq %k5, %k4
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $42, %k4, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kmovq %k4, %k7
; AVX512BW-NEXT: kshiftrq $43, %k4, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrq $44, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $45, %k7, %k1
; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k7} {z}
; AVX512BW-NEXT: kandw %k5, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $46, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftrq $47, %k5, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k1} {z}
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $48, %k5, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrq $49, %k5, %k1
; AVX512BW-NEXT: kmovq %k5, %k7
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kmovq %k7, %k5
; AVX512BW-NEXT: kshiftrq $50, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $51, %k5, %k1
; AVX512BW-NEXT: kmovq %k5, %k7
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrq $52, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrq $53, %k7, %k1
; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $54, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k1
; AVX512BW-NEXT: kshiftrq $55, %k7, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k1} {z}
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $56, %k5, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $57, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrq $58, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm21 {%k7} {z}
; AVX512BW-NEXT: kandw %k2, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $59, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $60, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $61, %k5, %k1
; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm22 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $62, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k6
; AVX512BW-NEXT: kshiftrq $63, %k5, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k6, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k2
; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $3, %k1, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrw $2, %k1, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k2, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k1, %k0, %k1
; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm23, 1472(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm22, 1408(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm21, 1344(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm20, 1280(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm19, 1216(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm18, 1152(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm17, 1088(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm14, 896(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm13, 832(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm12, 768(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm11, 704(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm10, 640(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask = load <64 x i1>, ptr %in.maskvec, align 64
%tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
%data = call <384 x i32> @llvm.masked.load.v384i32.p0(ptr %in.vec, i32 64, <384 x i1> %tgt.mask, <384 x i32> poison)
store <384 x i32> %data, ptr %out.vec, align 64
ret void
}
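; Note on the AVX512BW sequence above: for replication factors with no cheap
; byte/word shuffle at wide VFs, the mask is assembled with scalarized
; k-register arithmetic. kshiftrq isolates source bit i, kshiftlw $15 followed
; by kshiftrw $n deposits a copy at bit (15 - n) of a 16-bit accumulator,
; kandw with a constant mask (reloaded from a 2-byte stack spill) clears the
; destination slot, and korw merges the bit in. The closing kshiftlw $1 /
; kshiftrw $1 pair clears bit 15 before the last korw, and each completed
; 16-bit mask feeds one zeroing masked load (vmovdqa32 ... {%kN} {z}).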
define void @mask_replication_factor7_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor7_vf2:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-ONLY-NEXT: movw $16383, %ax # imm = 0x3FFF
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx)
; AVX512F-ONLY-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512F-ONLY-NEXT: vmovq %xmm1, 48(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor7_vf2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: movw $16383, %ax # imm = 0x3FFF
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 {%k1}
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx)
; AVX512DQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512DQ-NEXT: vmovq %xmm1, 48(%rdx)
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor7_vf2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0]
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512BW-NEXT: movw $16383, %ax # imm = 0x3FFF
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx)
; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, 48(%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
%tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%data = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr %in.vec, i32 64, <14 x i1> %tgt.mask, <14 x i32> poison)
%data.padded = shufflevector <14 x i32> %data, <14 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 undef, i32 undef>
store <14 x i32> %data, ptr %out.vec, align 64
ret void
}
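; factor7_vf2 above needs only 14 output lanes: one vpermd with seven copies
; of element 0 followed by seven of element 1 replicates the promoted mask,
; and the 14-bit immediate mask (0x3FFF) limits the compare to the live lanes
; before the masked load.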
define void @mask_replication_factor7_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor7_vf4:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,0,0,0,0]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: movw $4095, %ax # imm = 0xFFF
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512F-ONLY-NEXT: vextracti32x4 $2, %zmm0, 96(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512F-ONLY-NEXT: vmovdqa %ymm0, 64(%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor7_vf4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,0,0,0,0]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT: movw $4095, %ax # imm = 0xFFF
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, 96(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor7_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd (%rdi), %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,0,0,0,0]
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: movl $268435455, %eax # imm = 0xFFFFFFF
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vpcmpgtw %zmm0, %zmm1, %k1 {%k1}
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, 96(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vmovdqa %ymm1, 64(%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%data = call <28 x i32> @llvm.masked.load.v28i32.p0(ptr %in.vec, i32 64, <28 x i1> %tgt.mask, <28 x i32> poison)
%data.padded = shufflevector <28 x i32> %data, <28 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 undef, i32 undef, i32 undef, i32 undef>
store <28 x i32> %data, ptr %out.vec, align 64
ret void
}
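; factor7_vf4 produces 28 lanes. The AVX512F/DQ paths split them across two
; 16-lane vpermd/compare rounds, while the AVX512BW path widens the mask to
; 32 words, replicates it with a single vpermw, compares under a 28-bit kmask
; (0xFFFFFFF), and peels the upper 16 lanes off with kshiftrd $16.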
define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-SLOW-LABEL: mask_replication_factor7_vf8:
; AVX512F-SLOW: # %bb.0:
; AVX512F-SLOW-NEXT: kmovw (%rdi), %k1
; AVX512F-SLOW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-SLOW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; AVX512F-SLOW-NEXT: movw $1, %ax
; AVX512F-SLOW-NEXT: kmovw %eax, %k2
; AVX512F-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2}
; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k3
; AVX512F-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-SLOW-NEXT: vptestmd %zmm0, %zmm0, %k4
; AVX512F-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,3,3,6,7,7,7]
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,3,3]
; AVX512F-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512F-SLOW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-SLOW-NEXT: vmovdqa32 192(%rsi), %zmm1 {%k1} {z}
; AVX512F-SLOW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k4} {z}
; AVX512F-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k3} {z}
; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx)
; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rdx)
; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 192(%rdx)
; AVX512F-SLOW-NEXT: vzeroupper
; AVX512F-SLOW-NEXT: retq
;
; AVX512F-FAST-LABEL: mask_replication_factor7_vf8:
; AVX512F-FAST: # %bb.0:
; AVX512F-FAST-NEXT: kmovw (%rdi), %k1
; AVX512F-FAST-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-FAST-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; AVX512F-FAST-NEXT: movw $1, %ax
; AVX512F-FAST-NEXT: kmovw %eax, %k2
; AVX512F-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2}
; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k3
; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-FAST-NEXT: vptestmd %zmm0, %zmm0, %k4
; AVX512F-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,7,7,7,7,7,7,7]
; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512F-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512F-FAST-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-FAST-NEXT: vmovdqa32 192(%rsi), %zmm1 {%k1} {z}
; AVX512F-FAST-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k4} {z}
; AVX512F-FAST-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k3} {z}
; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx)
; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 128(%rdx)
; AVX512F-FAST-NEXT: vmovdqa %ymm1, 192(%rdx)
; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-FAST-NEXT: vzeroupper
; AVX512F-FAST-NEXT: retq
;
; AVX512DQ-SLOW-LABEL: mask_replication_factor7_vf8:
; AVX512DQ-SLOW: # %bb.0:
; AVX512DQ-SLOW-NEXT: kmovb (%rdi), %k0
; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-SLOW-NEXT: vpmovm2d %k1, %zmm1
; AVX512DQ-SLOW-NEXT: movw $1, %ax
; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
; AVX512DQ-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, %k2
; AVX512DQ-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm0, %k3
; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %ymm0
; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,3,3,6,7,7,7]
; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,3,3]
; AVX512DQ-SLOW-NEXT: vpmovd2m %ymm0, %k4
; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-SLOW-NEXT: vmovdqa32 192(%rsi), %zmm1 {%k4} {z}
; AVX512DQ-SLOW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z}
; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k2} {z}
; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx)
; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rdx)
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, 192(%rdx)
; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-SLOW-NEXT: vzeroupper
; AVX512DQ-SLOW-NEXT: retq
;
; AVX512DQ-FAST-LABEL: mask_replication_factor7_vf8:
; AVX512DQ-FAST: # %bb.0:
; AVX512DQ-FAST-NEXT: kmovb (%rdi), %k0
; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-FAST-NEXT: vpmovm2d %k1, %zmm1
; AVX512DQ-FAST-NEXT: movw $1, %ax
; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k2
; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-FAST-NEXT: vpmovd2m %zmm0, %k3
; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %ymm0
; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,7,7,7,7,7,7,7]
; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512DQ-FAST-NEXT: vpmovd2m %ymm0, %k4
; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-FAST-NEXT: vmovdqa32 192(%rsi), %zmm1 {%k4} {z}
; AVX512DQ-FAST-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z}
; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k2} {z}
; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx)
; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 128(%rdx)
; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, 192(%rdx)
; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-FAST-NEXT: vzeroupper
; AVX512DQ-FAST-NEXT: retq
;
; AVX512BW-ONLY-LABEL: mask_replication_factor7_vf8:
; AVX512BW-ONLY: # %bb.0:
; AVX512BW-ONLY-NEXT: kmovw (%rdi), %k0
; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,18,18,18,18,18,19,19,19,19,19,19,19,20,20,20,20,36,36,36,37,37,37,37,37,37,37,38,38,38,38,38,38,54,55,55,55,55,55,55,55,u,u,u,u,u,u,u,u]
; AVX512BW-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-ONLY-NEXT: movabsq $72057594037927935, %rax # imm = 0xFFFFFFFFFFFFFF
; AVX512BW-ONLY-NEXT: kmovq %rax, %k1
; AVX512BW-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k2
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k3} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z}
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 64(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa %ymm0, 192(%rdx)
; AVX512BW-ONLY-NEXT: vzeroupper
; AVX512BW-ONLY-NEXT: retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor7_vf8:
; AVX512VBMI-ONLY: # %bb.0:
; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k0
; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4,4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6,6,7,7,7,7,7,7,7,u,u,u,u,u,u,u,u]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VBMI-ONLY-NEXT: movabsq $72057594037927935, %rax # imm = 0xFFFFFFFFFFFFFF
; AVX512VBMI-ONLY-NEXT: kmovq %rax, %k1
; AVX512VBMI-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k2
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k3} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 64(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa %ymm0, 192(%rdx)
; AVX512VBMI-ONLY-NEXT: vzeroupper
; AVX512VBMI-ONLY-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
%data = call <56 x i32> @llvm.masked.load.v56i32.p0(ptr %in.vec, i32 64, <56 x i1> %tgt.mask, <56 x i32> poison)
%data.padded = shufflevector <56 x i32> %data, <56 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
store <56 x i32> %data, ptr %out.vec, align 64
ret void
}
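; factor7_vf8 covers 56 lanes (four stores). The AVX512F/DQ variants build
; four k-masks with successive vpermd index vectors; the movw $1 merge
; re-blends element 0 of the unpermuted mask into the first permute before
; testing it. The BW-ONLY path broadcasts the mask bytes with vshufi64x2 and
; replicates them via vpshufb, while VBMI does the whole replication with one
; vpermb; both then compare under a 56-bit kmask (0xFFFFFFFFFFFFFF) and carve
; out 16-lane chunks with kshiftrq/kshiftrd.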
define void @mask_replication_factor7_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor7_vf16:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: movw $1, %ax
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k6
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k7
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k7} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k6} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k5} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm4 {%k4} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k3} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 256(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 320(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 384(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor7_vf16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: movw $1, %ax
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k6
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k7
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k7} {z}
; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k6} {z}
; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k5} {z}
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm4 {%k4} {z}
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k3} {z}
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm6 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 256(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 320(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 384(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor7_vf16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2
; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm3
; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1
; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
; AVX512BW-NEXT: vpermd %zmm0, %zmm4, %zmm4
; AVX512BW-NEXT: vptestmd %zmm4, %zmm4, %k1
; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
; AVX512BW-NEXT: vpermd %zmm0, %zmm5, %zmm5
; AVX512BW-NEXT: vptestmd %zmm5, %zmm5, %k1
; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm5 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512BW-NEXT: vpermd %zmm0, %zmm6, %zmm6
; AVX512BW-NEXT: vptestmd %zmm6, %zmm6, %k1
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512BW-NEXT: vpermd %zmm0, %zmm7, %zmm0
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm3, 320(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm2, 384(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
%data = call <112 x i32> @llvm.masked.load.v112i32.p0(ptr %in.vec, i32 64, <112 x i1> %tgt.mask, <112 x i32> poison)
store <112 x i32> %data, ptr %out.vec, align 64
ret void
}
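; Note: for vf32 the AVX512F/DQ lowerings split the 32-bit mask into two
; 16-bit halves (kmovw (%rdi) and kmovw 2(%rdi)) and reuse the same seven
; permute index vectors on each half, giving 14 masked 64-byte loads/stores
; at offsets 0 through 832. The AVX512BW lowering instead loads the whole
; mask with kmovd and assembles every 16-bit store mask one bit at a time
; with kshiftlw/kshiftrw, kandw and korw, spilling the kandw constants to
; the stack, which is why that block is far longer.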
define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor7_vf32:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT: movw $1, %ax
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm4
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm11 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm12
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm0
; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm2
; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm5, %zmm5
; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm7
; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm9
; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm11
; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm13, %zmm3
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm13 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm11 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm9 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm12 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 256(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 320(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 384(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 448(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 512(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 576(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 640(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 704(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 768(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 832(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor7_vf32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2
; AVX512DQ-NEXT: movw $1, %ax
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm4
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm12
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm0
; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm1
; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm2
; AVX512DQ-NEXT: vpermd %zmm3, %zmm5, %zmm5
; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm7
; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm9
; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm11
; AVX512DQ-NEXT: vpermd %zmm3, %zmm13, %zmm3
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm13 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1
; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm11 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1
; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm9 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1
; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1
; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm2 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1
; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm12 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1
; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm10, 256(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm12, 320(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 384(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 448(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 512(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 576(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm7, 640(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm9, 704(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm11, 768(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 832(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor7_vf32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movw $-3, %ax
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw (%rdi), %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k1
; AVX512BW-NEXT: kmovq %k2, %k3
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-5, %ax
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kmovq %k2, %k4
; AVX512BW-NEXT: kshiftrw $13, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-9, %ax
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-17, %ax
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-33, %ax
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-65, %ax
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: movw $-129, %ax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kmovd (%rdi), %k6
; AVX512BW-NEXT: kshiftrd $1, %k6, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kmovq %k2, %k7
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $7, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF
; AVX512BW-NEXT: kmovd %eax, %k5
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $6, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrd $2, %k6, %k2
; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kshiftlw $14, %k2, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $15, %k2, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: korw %k1, %k0, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: kmovq %k6, %k2
; AVX512BW-NEXT: kshiftrd $29, %k6, %k1
; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kandw %k3, %k1, %k0
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovq %k4, %k6
; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrd $30, %k2, %k1
; AVX512BW-NEXT: kmovq %k2, %k4
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k5, %k0, %k3
; AVX512BW-NEXT: kshiftrd $31, %k4, %k0
; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
; AVX512BW-NEXT: kshiftrw $6, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k3, %k3
; AVX512BW-NEXT: kshiftrw $5, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k3, %k3
; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k3, %k3
; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k3, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k1, %k0, %k1
; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $27, %k4, %k1
; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftlw $15, %k1, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k3, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k3, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k3, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k7
; AVX512BW-NEXT: kshiftrd $28, %k4, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k7, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $10, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $9, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $8, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $7, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $6, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kandw %k5, %k6, %k6
; AVX512BW-NEXT: kshiftrw $5, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k6, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kshiftrw $4, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kmovq %k2, %k4
; AVX512BW-NEXT: kshiftrw $3, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k5, %k6
; AVX512BW-NEXT: kmovq %k5, %k7
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k7, %k0, %k2
; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm2 {%k2} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload
; AVX512BW-NEXT: kshiftrd $25, %k6, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrw $13, %k0, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrw $12, %k0, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrw $11, %k0, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrw $10, %k0, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k2, %k5
; AVX512BW-NEXT: kshiftrd $26, %k6, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrw $8, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrw $7, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $6, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $5, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $4, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kandw %k4, %k5, %k5
; AVX512BW-NEXT: kshiftrw $3, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k5, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kshiftrw $2, %k6, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
; AVX512BW-NEXT: korw %k6, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k1} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT: kshiftrd $23, %k2, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k3
; AVX512BW-NEXT: kshiftrd $22, %k2, %k5
; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovq %k2, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k2
; AVX512BW-NEXT: kshiftrw $14, %k3, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $13, %k3, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $12, %k3, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrw $11, %k3, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrw $10, %k3, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k3, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrw $8, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrd $24, %k6, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k5
; AVX512BW-NEXT: kshiftrw $7, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $6, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $5, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $4, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $3, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $2, %k5, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
; AVX512BW-NEXT: korw %k0, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k2} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload
; AVX512BW-NEXT: kshiftrd $20, %k3, %k5
; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k2
; AVX512BW-NEXT: kshiftlw $15, %k5, %k6
; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k6, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k2, %k2
; AVX512BW-NEXT: kshiftrw $13, %k6, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kandw %k1, %k2, %k5
; AVX512BW-NEXT: kshiftrd $21, %k3, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $12, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kshiftrw $11, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kshiftrw $10, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kandw %k7, %k5, %k5
; AVX512BW-NEXT: kshiftrw $9, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kshiftrw $8, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kshiftrw $7, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kshiftrw $6, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k5, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k2, %k5
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $15, %k7, %k2
; AVX512BW-NEXT: kshiftrw $5, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrw $4, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kshiftrw $3, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $2, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kandw %k4, %k5, %k5
; AVX512BW-NEXT: kshiftlw $14, %k7, %k1
; AVX512BW-NEXT: korw %k1, %k5, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm5 {%k1} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT: kshiftrd $18, %k2, %k4
; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k4, %k5
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $13, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $12, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $11, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k6
; AVX512BW-NEXT: kshiftrd $19, %k2, %k5
; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
; AVX512BW-NEXT: kshiftrw $10, %k5, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $9, %k5, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k6, %k6
; AVX512BW-NEXT: kshiftrw $8, %k5, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k6, %k6
; AVX512BW-NEXT: kshiftrw $7, %k5, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k6, %k6
; AVX512BW-NEXT: kshiftrw $6, %k5, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k6, %k6
; AVX512BW-NEXT: kshiftrw $5, %k5, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kandw %k3, %k6, %k6
; AVX512BW-NEXT: kshiftrw $4, %k5, %k5
; AVX512BW-NEXT: korw %k5, %k6, %k5
; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kshiftrw $3, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kshiftrw $2, %k3, %k6
; AVX512BW-NEXT: kmovq %k3, %k7
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k0, %k3
; AVX512BW-NEXT: korw %k3, %k5, %k3
; AVX512BW-NEXT: kshiftlw $1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $1, %k3, %k3
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm6 {%k3} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT: kshiftrd $16, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k3, %k3
; AVX512BW-NEXT: kshiftrw $13, %k0, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k0, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k3, %k3
; AVX512BW-NEXT: kshiftrw $11, %k0, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k3, %k3
; AVX512BW-NEXT: kshiftrw $10, %k0, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k3, %k3
; AVX512BW-NEXT: kshiftrw $9, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k3, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k3
; AVX512BW-NEXT: kshiftrd $17, %k1, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k0, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $7, %k0, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $6, %k0, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $5, %k0, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $4, %k0, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $3, %k0, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k3, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k1, %k2
; AVX512BW-NEXT: korw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: korw %k1, %k0, %k1
; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
; AVX512BW-NEXT: kshiftrd $13, %k0, %k2
; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kandw %k6, %k2, %k1
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k2, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: kandw %k7, %k1, %k3
; AVX512BW-NEXT: kshiftrd $14, %k0, %k1
; AVX512BW-NEXT: kmovq %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k1, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k1, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k3, %k3
; AVX512BW-NEXT: kshiftrw $11, %k1, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $10, %k1, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $9, %k1, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $8, %k1, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $7, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k3, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k5
; AVX512BW-NEXT: kshiftrd $15, %k6, %k3
; AVX512BW-NEXT: kmovq %k6, %k0
; AVX512BW-NEXT: kshiftlw $15, %k3, %k1
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k5, %k5
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k5, %k5
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k5, %k5
; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k5, %k5
; AVX512BW-NEXT: kshiftrw $2, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k5, %k5
; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k5, %k3
; AVX512BW-NEXT: kshiftlw $1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $1, %k3, %k3
; AVX512BW-NEXT: korw %k1, %k3, %k1
; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm8 {%k1} {z}
; AVX512BW-NEXT: kmovq %k0, %k3
; AVX512BW-NEXT: kshiftrd $11, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k5
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kandw %k7, %k5, %k5
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k6
; AVX512BW-NEXT: kshiftrd $12, %k3, %k5
; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
; AVX512BW-NEXT: kshiftrw $11, %k5, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $10, %k5, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k6, %k6
; AVX512BW-NEXT: kshiftrw $9, %k5, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k6, %k6
; AVX512BW-NEXT: kshiftrw $8, %k5, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k6, %k6
; AVX512BW-NEXT: kshiftrw $7, %k5, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kandw %k4, %k6, %k6
; AVX512BW-NEXT: kshiftrw $6, %k5, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k6, %k6
; AVX512BW-NEXT: kshiftrw $5, %k5, %k5
; AVX512BW-NEXT: korw %k5, %k6, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kshiftrw $4, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kshiftrw $3, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kandw %k2, %k5, %k5
; AVX512BW-NEXT: kshiftrw $2, %k3, %k6
; AVX512BW-NEXT: kmovq %k3, %k0
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k5, %k2
; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
; AVX512BW-NEXT: korw %k0, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm9 {%k2} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload
; AVX512BW-NEXT: kshiftrd $9, %k6, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrw $13, %k0, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrw $12, %k0, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrw $11, %k0, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $10, %k0, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k5
; AVX512BW-NEXT: kshiftrd $10, %k6, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $8, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $7, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $6, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kandw %k4, %k5, %k5
; AVX512BW-NEXT: kshiftrw $5, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $4, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $3, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k5, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kshiftrw $2, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 4-byte Reload
; AVX512BW-NEXT: kshiftrd $7, %k4, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k3
; AVX512BW-NEXT: kshiftrd $6, %k4, %k2
; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $14, %k3, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $13, %k3, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $12, %k3, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrw $11, %k3, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $10, %k3, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k3, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $8, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrd $8, %k4, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k5
; AVX512BW-NEXT: kshiftrw $7, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $6, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $5, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrw $4, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $3, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $2, %k5, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
; AVX512BW-NEXT: korw %k0, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k2} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload
; AVX512BW-NEXT: kshiftrd $4, %k6, %k3
; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k3, %k2
; AVX512BW-NEXT: kshiftlw $15, %k3, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrw $13, %k0, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k2, %k5
; AVX512BW-NEXT: kshiftrd $5, %k6, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $12, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrw $11, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrw $10, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrw $9, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrw $8, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrw $7, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $6, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k5, %k2
; AVX512BW-NEXT: kandw %k4, %k2, %k5
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $15, %k1, %k2
; AVX512BW-NEXT: kshiftrw $5, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kandw %k7, %k5, %k5
; AVX512BW-NEXT: kshiftrw $4, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrw $3, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k5, %k5
; AVX512BW-NEXT: kshiftrw $2, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k5, %k5
; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k5, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm12 {%k1} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kshiftrw $14, %k4, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k4, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k4, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k4, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k2
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT: kshiftrd $3, %k1, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k1, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k1, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $8, %k1, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $7, %k1, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $6, %k1, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $5, %k1, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm13 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm11, 192(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm10, 256(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm9, 320(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm8, 384(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm6, 512(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm5, 576(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm4, 640(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm3, 704(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm2, 768(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm1, 832(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
%data = call <224 x i32> @llvm.masked.load.v224i32.p0(ptr %in.vec, i32 64, <224 x i1> %tgt.mask, <224 x i32> poison)
store <224 x i32> %data, ptr %out.vec, align 64
ret void
}
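; mask_replication_factor7_vf64 widens each of the 64 input mask bits 7x
; (<64 x i1> -> <448 x i1>) and uses the result to drive a masked load of
; <448 x i32>, i.e. 28 zmm-sized chunks. On AVX512F/DQ, which lack 64-bit
; mask registers, the replication happens in vector registers: the mask is
; materialized as all-ones dwords (vpternlogd under {%k1}{z}, or vpmovm2d on
; DQ), vpermd applies the 7x-replication index patterns, and vptestmd /
; vpmovd2m converts each 16-element chunk back into a k-register.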
define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor7_vf64:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: movw $1, %ax
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm8, %zmm8, %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm9, %zmm9, %zmm9 {%k1} {z}
; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm11, %zmm11, %zmm11 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm13, %zmm0
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm15 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm15, %zmm2
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm16 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm16, %zmm3
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm17 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm17, %zmm4
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm18 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm18, %zmm6
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm19 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm19, %zmm7
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm5, %zmm8
; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm13, %zmm10
; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm15, %zmm12
; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm16, %zmm14
; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm17, %zmm20
; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm18, %zmm21
; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm19, %zmm22
; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm5, %zmm23
; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm13, %zmm24
; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm15, %zmm25
; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm16, %zmm26
; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm17, %zmm27
; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm18, %zmm28
; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm5, %zmm29
; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm19, %zmm30
; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm13, %zmm31
; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm15, %zmm15
; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm16, %zmm13
; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm17, %zmm11
; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm18, %zmm9
; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm19, %zmm5
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm13 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm15 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm31, %zmm31, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm16 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm29, %zmm29, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm17 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm30, %zmm30, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm28, %zmm28, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm19 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm27, %zmm27, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm27 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm26, %zmm26, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm26 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm25, %zmm25, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm25 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm24, %zmm24, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm24 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm23, %zmm23, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm23 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm22, %zmm22, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm22 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm21, %zmm21, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm21 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm20, %zmm20, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm20 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm14 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm12 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm10 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm4 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 1728(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 1664(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 1600(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 1536(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1472(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 1408(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1344(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1280(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1216(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 1152(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 1088(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 1024(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 960(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 896(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm24, 832(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm25, 768(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm26, 704(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm27, 640(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 576(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 512(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 448(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 384(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 320(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 256(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor7_vf64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: movw $1, %ax
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512DQ-NEXT: kmovw 6(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8
; AVX512DQ-NEXT: kmovw 4(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm9
; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm11
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpermd %zmm8, %zmm13, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
; AVX512DQ-NEXT: vpermd %zmm8, %zmm15, %zmm2
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
; AVX512DQ-NEXT: vpermd %zmm8, %zmm16, %zmm3
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
; AVX512DQ-NEXT: vpermd %zmm8, %zmm17, %zmm4
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512DQ-NEXT: vpermd %zmm8, %zmm18, %zmm6
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512DQ-NEXT: vpermd %zmm8, %zmm19, %zmm7
; AVX512DQ-NEXT: vpermd %zmm8, %zmm5, %zmm8
; AVX512DQ-NEXT: vpermd %zmm9, %zmm13, %zmm10
; AVX512DQ-NEXT: vpermd %zmm9, %zmm15, %zmm12
; AVX512DQ-NEXT: vpermd %zmm9, %zmm16, %zmm14
; AVX512DQ-NEXT: vpermd %zmm9, %zmm17, %zmm20
; AVX512DQ-NEXT: vpermd %zmm9, %zmm18, %zmm21
; AVX512DQ-NEXT: vpermd %zmm9, %zmm19, %zmm22
; AVX512DQ-NEXT: vpermd %zmm9, %zmm5, %zmm23
; AVX512DQ-NEXT: vpermd %zmm11, %zmm13, %zmm24
; AVX512DQ-NEXT: vpermd %zmm11, %zmm15, %zmm25
; AVX512DQ-NEXT: vpermd %zmm11, %zmm16, %zmm26
; AVX512DQ-NEXT: vpermd %zmm11, %zmm17, %zmm27
; AVX512DQ-NEXT: vpermd %zmm11, %zmm18, %zmm28
; AVX512DQ-NEXT: vpermd %zmm11, %zmm5, %zmm29
; AVX512DQ-NEXT: vpermd %zmm11, %zmm19, %zmm30
; AVX512DQ-NEXT: vpermd %zmm1, %zmm13, %zmm31
; AVX512DQ-NEXT: vpermd %zmm1, %zmm15, %zmm15
; AVX512DQ-NEXT: vpermd %zmm1, %zmm16, %zmm13
; AVX512DQ-NEXT: vpermd %zmm1, %zmm17, %zmm11
; AVX512DQ-NEXT: vpermd %zmm1, %zmm18, %zmm9
; AVX512DQ-NEXT: vpermd %zmm1, %zmm19, %zmm5
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1
; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm13 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1
; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm15 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm31, %k1
; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm16 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm29, %k1
; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm17 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm30, %k1
; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm28, %k1
; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm19 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm27, %k1
; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm27 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm26, %k1
; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm26 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm25, %k1
; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm25 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm24, %k1
; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm24 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm23, %k1
; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm23 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm22, %k1
; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm22 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm21, %k1
; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm21 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm20, %k1
; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm20 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1
; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm14 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1
; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm12 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1
; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm10 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1
; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm8 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1
; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1
; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm6 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1
; AVX512DQ-NEXT: vmovdqa32 1536(%rsi), %zmm4 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
; AVX512DQ-NEXT: vmovdqa32 1600(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT: vmovdqa32 1664(%rsi), %zmm2 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vmovdqa32 1728(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 1728(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1664(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1600(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1536(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1472(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1408(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1344(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1280(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1216(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm14, 1152(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm20, 1088(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm21, 1024(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm22, 960(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm23, 896(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm24, 832(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm25, 768(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm26, 704(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm27, 640(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm19, 576(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm18, 512(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm17, 448(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm16, 384(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm15, 320(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm13, 256(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm11, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm9, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
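; The AVX512BW lowering below instead loads the whole <64 x i1> mask with
; kmovq and assembles each 16-bit mask chunk directly in k-registers: bits
; are positioned with kshiftlw/kshiftrw, merged with korw, and isolated with
; kandw against single-bit-clear constants (0xFFFD, 0xFFFB, ...), which get
; spilled to and reloaded from the stack since only eight k-registers exist.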
; AVX512BW-LABEL: mask_replication_factor7_vf64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movw $-3, %ax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kmovw (%rdi), %k0
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-5, %ax
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kmovq %k2, %k3
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $13, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-9, %ax
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-17, %ax
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-33, %ax
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-65, %ax
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: movw $-129, %ax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kmovq (%rdi), %k4
; AVX512BW-NEXT: kshiftrq $1, %k4, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF
; AVX512BW-NEXT: kmovd %eax, %k5
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $5, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $2, %k4, %k1
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $15, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k6
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k6} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k7, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k7, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k7, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k7, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kmovq %k4, %k7
; AVX512BW-NEXT: kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512BW-NEXT: kshiftrq $3, %k4, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftrq $4, %k7, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: kandw %k2, %k6, %k1
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k1
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $5, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k6
; AVX512BW-NEXT: kshiftrq $6, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
; AVX512BW-NEXT: kshiftrw $5, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kandw %k5, %k6, %k6
; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k6, %k6
; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k6, %k6
; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k6, %k6
; AVX512BW-NEXT: kshiftlw $14, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $1, %k6, %k6
; AVX512BW-NEXT: korw %k1, %k6, %k1
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $7, %k4, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftrq $8, %k4, %k0
; AVX512BW-NEXT: kmovq %k4, %k5
; AVX512BW-NEXT: kshiftlw $15, %k0, %k6
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $9, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftrq $10, %k5, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k1
; AVX512BW-NEXT: kshiftrq $11, %k5, %k6
; AVX512BW-NEXT: kmovq %k5, %k4
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k6, %k1
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k1
; AVX512BW-NEXT: kmovq %k4, %k7
; AVX512BW-NEXT: kshiftrq $12, %k4, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k6
; AVX512BW-NEXT: kshiftrq $13, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kandw %k5, %k6, %k6
; AVX512BW-NEXT: kshiftlw $14, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $1, %k6, %k6
; AVX512BW-NEXT: korw %k1, %k6, %k6
; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k6} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $14, %k5, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k6
; AVX512BW-NEXT: kshiftrq $15, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k0
; AVX512BW-NEXT: kshiftrw $6, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k6, %k6
; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k6, %k6
; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k6, %k6
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k6, %k6
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k6, %k6
; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k6, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $16, %k5, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k1
; AVX512BW-NEXT: kshiftrq $17, %k5, %k0
; AVX512BW-NEXT: kmovq %k5, %k7
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrq $18, %k7, %k1
; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $19, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftrq $20, %k7, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z}
; AVX512BW-NEXT: kandw %k5, %k6, %k1
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $21, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k6
; AVX512BW-NEXT: kshiftrq $22, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
; AVX512BW-NEXT: kshiftrw $5, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k6, %k6
; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k6, %k6
; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k6, %k6
; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k6, %k6
; AVX512BW-NEXT: kshiftlw $14, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $1, %k6, %k6
; AVX512BW-NEXT: korw %k1, %k6, %k1
; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z}
; AVX512BW-NEXT: kandw %k5, %k0, %k1
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $23, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftrq $24, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k6
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $25, %k2, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k7} {z}
; AVX512BW-NEXT: kandw %k5, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kmovq %k2, %k7
; AVX512BW-NEXT: kshiftrq $26, %k2, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k1
; AVX512BW-NEXT: kshiftrq $27, %k7, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k6, %k1
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k1
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $28, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k6
; AVX512BW-NEXT: kshiftrq $29, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kandw %k5, %k6, %k6
; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k6, %k6
; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k6, %k6
; AVX512BW-NEXT: kshiftlw $14, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $1, %k6, %k6
; AVX512BW-NEXT: korw %k1, %k6, %k6
; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k6} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $30, %k5, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k6
; AVX512BW-NEXT: kshiftrq $31, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k0
; AVX512BW-NEXT: kshiftrw $6, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kandw %k3, %k6, %k6
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k6, %k6
; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k6, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $32, %k5, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftrq $33, %k5, %k0
; AVX512BW-NEXT: kmovq %k5, %k7
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kmovq %k7, %k3
; AVX512BW-NEXT: kshiftrq $34, %k7, %k1
; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftrq $35, %k3, %k0
; AVX512BW-NEXT: kmovq %k3, %k7
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftrq $36, %k7, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k1} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k6, %k1
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k1
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $37, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kandw %k5, %k0, %k6
; AVX512BW-NEXT: kshiftrq $38, %k7, %k0
; AVX512BW-NEXT: kmovq %k7, %k5
; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
; AVX512BW-NEXT: kshiftrw $5, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k6, %k6
; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k6, %k6
; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k6, %k6
; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k6, %k6
; AVX512BW-NEXT: kshiftlw $14, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $1, %k6, %k6
; AVX512BW-NEXT: korw %k1, %k6, %k1
; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kmovq %k5, %k7
; AVX512BW-NEXT: kshiftrq $39, %k5, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftrq $40, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k6
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $41, %k4, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftrq $42, %k4, %k0
; AVX512BW-NEXT: kmovq %k4, %k3
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k1
; AVX512BW-NEXT: kshiftrq $43, %k3, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k1} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k6, %k1
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kmovq %k3, %k7
; AVX512BW-NEXT: kshiftrq $44, %k3, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k6
; AVX512BW-NEXT: kshiftrq $45, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k6, %k6
; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftlw $14, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $1, %k6, %k6
; AVX512BW-NEXT: korw %k1, %k6, %k6
; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k6} {z}
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $46, %k5, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k6
; AVX512BW-NEXT: kshiftrq $47, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k0
; AVX512BW-NEXT: kshiftrw $6, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k6, %k6
; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k6, %k6
; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kandw %k4, %k6, %k6
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k6, %k6
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k6, %k6
; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k6, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $48, %k5, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftrq $49, %k5, %k0
; AVX512BW-NEXT: kmovq %k5, %k7
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrq $50, %k7, %k1
; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm21 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $51, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftrq $52, %k7, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm22 {%k1} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k6, %k1
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k1
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $53, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k6
; AVX512BW-NEXT: kshiftrq $54, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
; AVX512BW-NEXT: kshiftrw $5, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k6, %k6
; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k6, %k6
; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k6, %k6
; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftlw $14, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $1, %k6, %k6
; AVX512BW-NEXT: korw %k1, %k6, %k1
; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z}
; AVX512BW-NEXT: kandw %k3, %k0, %k1
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $55, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftrq $56, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k6
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $57, %k4, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 1536(%rsi), %zmm24 {%k7} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k1
; AVX512BW-NEXT: kmovq %k4, %k7
; AVX512BW-NEXT: kshiftrq $58, %k4, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftrq $59, %k7, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 1600(%rsi), %zmm25 {%k1} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k6, %k1
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $60, %k5, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k6
; AVX512BW-NEXT: kshiftrq $61, %k5, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k6, %k6
; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftlw $14, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $1, %k6, %k6
; AVX512BW-NEXT: korw %k1, %k6, %k6
; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k6} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftrq $62, %k5, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrq $63, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k5, %k0
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k5, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm27, 1728(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm26, 1664(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm25, 1600(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm24, 1536(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm23, 1472(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm22, 1408(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm21, 1344(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm20, 1280(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm19, 1216(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm18, 1152(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm17, 1088(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm14, 896(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm13, 832(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm12, 768(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm11, 704(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm10, 640(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
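; Note: lacking a native 7-way bit replication, the AVX512BW sequence above rebuilds each 16-bit mask chunk bit by bit with kshift/kor/kand, reloading spilled mask constants from the stack.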
%src.mask = load <64 x i1>, ptr %in.maskvec, align 64
  %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
%data = call <448 x i32> @llvm.masked.load.v448i32.p0(ptr %in.vec, i32 64, <448 x i1> %tgt.mask, <448 x i32> poison)
store <448 x i32> %data, ptr %out.vec, align 64
ret void
}
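; vf2: <2 x i1> replicated to <16 x i1>, a single 16-bit mask chunk driving one masked zmm load.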
define void @mask_replication_factor8_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor8_vf2:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor8_vf2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor8_vf2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
%tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%data = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr %in.vec, i32 64, <16 x i1> %tgt.mask, <16 x i32> poison)
store <16 x i32> %data, ptr %out.vec, align 64
ret void
}
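; vf4: <4 x i1> replicated to <32 x i1>; AVX512F/DQ build two 16-bit masks with vpermd, while AVX512BW forms one 32-bit mask with vpermw and splits it with kshiftrd.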
define void @mask_replication_factor8_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor8_vf4:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor8_vf4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor8_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd (%rdi), %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpmovw2m %zmm0, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%data = call <32 x i32> @llvm.masked.load.v32i32.p0(ptr %in.vec, i32 64, <32 x i1> %tgt.mask, <32 x i32> poison)
store <32 x i32> %data, ptr %out.vec, align 64
ret void
}
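; vf8: <8 x i1> replicated to <64 x i1>; AVX512BW/VBMI form a full 64-bit mask with a byte shuffle (vpshufb/vpermb) and carve out 16-bit chunks with kshiftrq/kshiftrd.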
define void @mask_replication_factor8_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor8_vf8:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor8_vf8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z}
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-ONLY-LABEL: mask_replication_factor8_vf8:
; AVX512BW-ONLY: # %bb.0:
; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0
; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55]
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-ONLY-NEXT: vzeroupper
; AVX512BW-ONLY-NEXT: retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor8_vf8:
; AVX512VBMI-ONLY: # %bb.0:
; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0
; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512VBMI-ONLY-NEXT: vzeroupper
; AVX512VBMI-ONLY-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
%data = call <64 x i32> @llvm.masked.load.v64i32.p0(ptr %in.vec, i32 64, <64 x i1> %tgt.mask, <64 x i32> poison)
store <64 x i32> %data, ptr %out.vec, align 64
ret void
}
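; vf16: <16 x i1> replicated to <128 x i1>, needing eight 16-bit masks; the AVX512F/DQ paths run out of mask registers and spill one to the stack.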
define void @mask_replication_factor8_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor8_vf16:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k6
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k7
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512F-ONLY-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor8_vf16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k6
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k7
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z}
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z}
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor8_vf16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm0[8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,26,26,26,26,26,26,26,26,27,27,27,27,27,27,27,27,44,44,44,44,44,44,44,44,45,45,45,45,45,45,45,45,62,62,62,62,62,62,62,62,63,63,63,63,63,63,63,63]
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55]
; AVX512BW-NEXT: vpmovb2m %zmm0, %k2
; AVX512BW-NEXT: kshiftrd $16, %k2, %k3
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
; AVX512BW-NEXT: kshiftrd $16, %k2, %k3
; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
%data = call <128 x i32> @llvm.masked.load.v128i32.p0(ptr %in.vec, i32 64, <128 x i1> %tgt.mask, <128 x i32> poison)
store <128 x i32> %data, ptr %out.vec, align 64
ret void
}
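; Replication factor 8, 32-element source mask: 32 bits expand to 256 lanes
; (sixteen zmm stores). AVX512F/DQ load the mask as two 16-bit halves
; (kmovw (%rdi) and kmovw 2(%rdi)); AVX512BW loads all 32 bits at once with
; kmovd and splits each 64-bit byte-mask with kshiftrq/kshiftrd.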
define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor8_vf32:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm2, %zmm0
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm3, %zmm4
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm6
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm7, %zmm8
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm9, %zmm10
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm11 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm11, %zmm12
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm13 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm13, %zmm14
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm15, %zmm1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm16, %zmm16, %zmm16 {%k1} {z}
; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm2, %zmm2
; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm3, %zmm3
; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm5, %zmm5
; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm7, %zmm7
; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm9, %zmm9
; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm11, %zmm11
; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm13, %zmm13
; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm15, %zmm15
; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm15 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm13 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm11 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm14 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm12 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm4 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 960(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 896(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 832(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 768(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 704(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 640(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 576(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 512(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 448(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 384(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 256(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor8_vf32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpermd %zmm1, %zmm2, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm4
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm6
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
; AVX512DQ-NEXT: vpermd %zmm1, %zmm7, %zmm8
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512DQ-NEXT: vpermd %zmm1, %zmm9, %zmm10
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
; AVX512DQ-NEXT: vpermd %zmm1, %zmm11, %zmm12
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512DQ-NEXT: vpermd %zmm1, %zmm13, %zmm14
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512DQ-NEXT: vpermd %zmm1, %zmm15, %zmm1
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm16
; AVX512DQ-NEXT: vpermd %zmm16, %zmm2, %zmm2
; AVX512DQ-NEXT: vpermd %zmm16, %zmm3, %zmm3
; AVX512DQ-NEXT: vpermd %zmm16, %zmm5, %zmm5
; AVX512DQ-NEXT: vpermd %zmm16, %zmm7, %zmm7
; AVX512DQ-NEXT: vpermd %zmm16, %zmm9, %zmm9
; AVX512DQ-NEXT: vpermd %zmm16, %zmm11, %zmm11
; AVX512DQ-NEXT: vpermd %zmm16, %zmm13, %zmm13
; AVX512DQ-NEXT: vpermd %zmm16, %zmm15, %zmm15
; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm15 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm13 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm11 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1
; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1
; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm2 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1
; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm14 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1
; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm12 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1
; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1
; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm8 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1
; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm6 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1
; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm4 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 960(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 896(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm6, 832(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm8, 768(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm10, 704(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm12, 640(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm14, 576(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 512(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 448(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 384(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm7, 256(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm9, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm11, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm15, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor8_vf32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd (%rdi), %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm3
; AVX512BW-NEXT: vpmovb2m %zmm3, %k1
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k2
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k3
; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k4
; AVX512BW-NEXT: kshiftrd $16, %k4, %k5
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z}
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z}
; AVX512BW-NEXT: kshiftrq $32, %k4, %k4
; AVX512BW-NEXT: kshiftrd $16, %k4, %k5
; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z}
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z}
; AVX512BW-NEXT: kshiftrd $16, %k3, %k4
; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
; AVX512BW-NEXT: kshiftrq $32, %k3, %k3
; AVX512BW-NEXT: kshiftrd $16, %k3, %k4
; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z}
; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z}
; AVX512BW-NEXT: kshiftrd $16, %k2, %k3
; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z}
; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z}
; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
; AVX512BW-NEXT: kshiftrd $16, %k2, %k3
; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k3} {z}
; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z}
; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z}
; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm15, 896(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm14, 960(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm13, 768(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm12, 832(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm10, 704(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm9, 512(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm8, 576(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
%data = call <256 x i32> @llvm.masked.load.v256i32.p0(ptr %in.vec, i32 64, <256 x i1> %tgt.mask, <256 x i32> poison)
store <256 x i32> %data, ptr %out.vec, align 64
ret void
}
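; Replication factor 8, 64-element source mask: 64 bits expand to 512 lanes
; (thirty-two zmm stores). AVX512F/DQ process the mask as four 16-bit chunks
; and run out of vector registers, so several permuted mask vectors are
; spilled to the stack and reloaded around the masked loads; AVX512BW loads
; the whole 64-bit mask with a single kmovq.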
define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor8_vf64:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: subq $136, %rsp
; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm8, %zmm8, %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm10, %zmm10, %zmm10 {%k1} {z}
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm12 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm12, %zmm0
; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm14 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm14, %zmm0
; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm16 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm16, %zmm0
; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm18 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm18, %zmm0
; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm20 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm20, %zmm4
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm22 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm22, %zmm5
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm24 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm24, %zmm7
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm26, %zmm9
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm12, %zmm11
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm14, %zmm13
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm16, %zmm15
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm18, %zmm17
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm20, %zmm19
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm22, %zmm21
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm24, %zmm23
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm26, %zmm25
; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm12, %zmm27
; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm14, %zmm28
; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm16, %zmm29
; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm18, %zmm30
; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm20, %zmm31
; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm22, %zmm3
; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm24, %zmm6
; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm26, %zmm2
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm8, %zmm8, %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm12, %zmm1
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm14, %zmm0
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm16, %zmm16
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm18, %zmm14
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm20, %zmm12
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm22, %zmm10
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm24, %zmm18
; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm26, %zmm8
; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm26 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm12 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm14 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm16 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm18 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm20 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm22 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm24 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm31, %zmm31, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm30, %zmm30, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm29, %zmm29, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm28, %zmm28, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm27, %zmm27, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm27 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm25, %zmm25, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm25 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm23, %zmm23, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm23 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm21, %zmm21, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm21 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm17, %zmm17, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm17 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm15 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm13 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm11 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm9 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm4 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
; AVX512F-ONLY-NEXT: vptestmd %zmm28, %zmm28, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1792(%rsi), %zmm28 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
; AVX512F-ONLY-NEXT: vptestmd %zmm29, %zmm29, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1856(%rsi), %zmm29 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqu64 (%rsp), %zmm30 # 64-byte Reload
; AVX512F-ONLY-NEXT: vptestmd %zmm30, %zmm30, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1920(%rsi), %zmm30 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
; AVX512F-ONLY-NEXT: vptestmd %zmm31, %zmm31, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 1984(%rsi), %zmm31 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm31, 1984(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm30, 1920(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm29, 1856(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm28, 1792(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 1728(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 1664(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 1600(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 1536(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 1472(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 1408(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 1344(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 1280(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 1216(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 1152(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 1088(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm25, 1024(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm27, 960(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 896(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 832(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 768(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 704(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 640(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm24, 576(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 512(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 448(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 384(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 320(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 256(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm26, (%rdx)
; AVX512F-ONLY-NEXT: addq $136, %rsp
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor8_vf64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: subq $136, %rsp
; AVX512DQ-NEXT: kmovw 6(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm6
; AVX512DQ-NEXT: kmovw 4(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8
; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm10
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpermd %zmm6, %zmm12, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
; AVX512DQ-NEXT: vpermd %zmm6, %zmm14, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
; AVX512DQ-NEXT: vpermd %zmm6, %zmm16, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
; AVX512DQ-NEXT: vpermd %zmm6, %zmm18, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512DQ-NEXT: vpermd %zmm6, %zmm20, %zmm4
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm22 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
; AVX512DQ-NEXT: vpermd %zmm6, %zmm22, %zmm5
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm24 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512DQ-NEXT: vpermd %zmm6, %zmm24, %zmm7
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512DQ-NEXT: vpermd %zmm6, %zmm26, %zmm9
; AVX512DQ-NEXT: vpermd %zmm8, %zmm12, %zmm11
; AVX512DQ-NEXT: vpermd %zmm8, %zmm14, %zmm13
; AVX512DQ-NEXT: vpermd %zmm8, %zmm16, %zmm15
; AVX512DQ-NEXT: vpermd %zmm8, %zmm18, %zmm17
; AVX512DQ-NEXT: vpermd %zmm8, %zmm20, %zmm19
; AVX512DQ-NEXT: vpermd %zmm8, %zmm22, %zmm21
; AVX512DQ-NEXT: vpermd %zmm8, %zmm24, %zmm23
; AVX512DQ-NEXT: vpermd %zmm8, %zmm26, %zmm25
; AVX512DQ-NEXT: vpermd %zmm10, %zmm12, %zmm27
; AVX512DQ-NEXT: vpermd %zmm10, %zmm14, %zmm28
; AVX512DQ-NEXT: vpermd %zmm10, %zmm16, %zmm29
; AVX512DQ-NEXT: vpermd %zmm10, %zmm18, %zmm30
; AVX512DQ-NEXT: vpermd %zmm10, %zmm20, %zmm31
; AVX512DQ-NEXT: vpermd %zmm10, %zmm22, %zmm3
; AVX512DQ-NEXT: vpermd %zmm10, %zmm24, %zmm6
; AVX512DQ-NEXT: vpermd %zmm10, %zmm26, %zmm2
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8
; AVX512DQ-NEXT: vpermd %zmm8, %zmm12, %zmm1
; AVX512DQ-NEXT: vpermd %zmm8, %zmm14, %zmm0
; AVX512DQ-NEXT: vpermd %zmm8, %zmm16, %zmm16
; AVX512DQ-NEXT: vpermd %zmm8, %zmm18, %zmm14
; AVX512DQ-NEXT: vpermd %zmm8, %zmm20, %zmm12
; AVX512DQ-NEXT: vpermd %zmm8, %zmm22, %zmm10
; AVX512DQ-NEXT: vpermd %zmm8, %zmm24, %zmm18
; AVX512DQ-NEXT: vpermd %zmm8, %zmm26, %zmm8
; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm26 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm8 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm12 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1
; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm14 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1
; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm16 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm18 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm20 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm22 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1
; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm24 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm31, %k1
; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm30, %k1
; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm2 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm29, %k1
; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm28, %k1
; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm6 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm27, %k1
; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm27 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm25, %k1
; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm25 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm23, %k1
; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm23 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm21, %k1
; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm21 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1
; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm17, %k1
; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm17 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1
; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm15 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1
; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm13 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1
; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm11 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1
; AVX512DQ-NEXT: vmovdqa32 1536(%rsi), %zmm9 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1
; AVX512DQ-NEXT: vmovdqa32 1600(%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1
; AVX512DQ-NEXT: vmovdqa32 1664(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1
; AVX512DQ-NEXT: vmovdqa32 1728(%rsi), %zmm4 {%k1} {z}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
; AVX512DQ-NEXT: vpmovd2m %zmm28, %k1
; AVX512DQ-NEXT: vmovdqa32 1792(%rsi), %zmm28 {%k1} {z}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
; AVX512DQ-NEXT: vpmovd2m %zmm29, %k1
; AVX512DQ-NEXT: vmovdqa32 1856(%rsi), %zmm29 {%k1} {z}
; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm30 # 64-byte Reload
; AVX512DQ-NEXT: vpmovd2m %zmm30, %k1
; AVX512DQ-NEXT: vmovdqa32 1920(%rsi), %zmm30 {%k1} {z}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
; AVX512DQ-NEXT: vpmovd2m %zmm31, %k1
; AVX512DQ-NEXT: vmovdqa32 1984(%rsi), %zmm31 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm31, 1984(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm30, 1920(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm29, 1856(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm28, 1792(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1728(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 1664(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1600(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm9, 1536(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm11, 1472(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm13, 1408(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm15, 1344(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm17, 1280(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm19, 1216(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm21, 1152(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm23, 1088(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm25, 1024(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm27, 960(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm6, 896(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 832(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 768(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 704(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 640(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm24, 576(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm22, 512(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm20, 448(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm18, 384(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm16, 320(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm14, 256(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm12, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm10, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm8, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm26, (%rdx)
; AVX512DQ-NEXT: addq $136, %rsp
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor8_vf64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovq (%rdi), %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[6,7,6,7,6,7,6,7]
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm7
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm12
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm16
; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm15
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm10
; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm5
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k2
; AVX512BW-NEXT: kshiftrd $16, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z}
; AVX512BW-NEXT: kshiftrd $16, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z}
; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
; AVX512BW-NEXT: vpmovb2m %zmm5, %k2
; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm8 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $16, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z}
; AVX512BW-NEXT: vpmovb2m %zmm10, %k1
; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm10 {%k2} {z}
; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
; AVX512BW-NEXT: kshiftrd $16, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm13 {%k2} {z}
; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k2} {z}
; AVX512BW-NEXT: vpmovb2m %zmm15, %k2
; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm15 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm17 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm18 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $16, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm19 {%k1} {z}
; AVX512BW-NEXT: vpmovb2m %zmm16, %k1
; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k2} {z}
; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm20 {%k2} {z}
; AVX512BW-NEXT: kshiftrd $16, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm21 {%k2} {z}
; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm22 {%k2} {z}
; AVX512BW-NEXT: vpmovb2m %zmm12, %k2
; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm12 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm23 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm24 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $16, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 1600(%rsi), %zmm25 {%k1} {z}
; AVX512BW-NEXT: vpmovb2m %zmm7, %k1
; AVX512BW-NEXT: vmovdqa32 1536(%rsi), %zmm7 {%k2} {z}
; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k2} {z}
; AVX512BW-NEXT: kshiftrd $16, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k2} {z}
; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 1856(%rsi), %zmm28 {%k2} {z}
; AVX512BW-NEXT: vmovdqa32 1792(%rsi), %zmm29 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 1920(%rsi), %zmm30 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 1984(%rsi), %zmm31 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm31, 1984(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm30, 1920(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm28, 1856(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm29, 1792(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm27, 1728(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm26, 1664(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm25, 1600(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm7, 1536(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm24, 1472(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm23, 1408(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm22, 1344(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm12, 1280(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm21, 1216(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm20, 1152(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm19, 1088(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm18, 960(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm17, 896(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm14, 832(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm15, 768(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm13, 704(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm10, 512(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask = load <64 x i1>, ptr %in.maskvec, align 64
%tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
%data = call <512 x i32> @llvm.masked.load.v512i32.p0(ptr %in.vec, i32 64, <512 x i1> %tgt.mask, <512 x i32> poison)
store <512 x i32> %data, ptr %out.vec, align 64
ret void
}
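; Masked-load intrinsic declarations for every vector width exercised by the
; tests above.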
declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)
declare <6 x i32> @llvm.masked.load.v6i32.p0(ptr, i32, <6 x i1>, <6 x i32>)
declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32, <8 x i1>, <8 x i32>)
declare <10 x i32> @llvm.masked.load.v10i32.p0(ptr, i32, <10 x i1>, <10 x i32>)
declare <12 x i32> @llvm.masked.load.v12i32.p0(ptr, i32, <12 x i1>, <12 x i32>)
declare <14 x i32> @llvm.masked.load.v14i32.p0(ptr, i32, <14 x i1>, <14 x i32>)
declare <16 x i32> @llvm.masked.load.v16i32.p0(ptr, i32, <16 x i1>, <16 x i32>)
declare <20 x i32> @llvm.masked.load.v20i32.p0(ptr, i32, <20 x i1>, <20 x i32>)
declare <24 x i32> @llvm.masked.load.v24i32.p0(ptr, i32, <24 x i1>, <24 x i32>)
declare <28 x i32> @llvm.masked.load.v28i32.p0(ptr, i32, <28 x i1>, <28 x i32>)
declare <32 x i32> @llvm.masked.load.v32i32.p0(ptr, i32, <32 x i1>, <32 x i32>)
declare <40 x i32> @llvm.masked.load.v40i32.p0(ptr, i32, <40 x i1>, <40 x i32>)
declare <48 x i32> @llvm.masked.load.v48i32.p0(ptr, i32, <48 x i1>, <48 x i32>)
declare <56 x i32> @llvm.masked.load.v56i32.p0(ptr, i32, <56 x i1>, <56 x i32>)
declare <64 x i32> @llvm.masked.load.v64i32.p0(ptr, i32, <64 x i1>, <64 x i32>)
declare <80 x i32> @llvm.masked.load.v80i32.p0(ptr, i32, <80 x i1>, <80 x i32>)
declare <96 x i32> @llvm.masked.load.v96i32.p0(ptr, i32, <96 x i1>, <96 x i32>)
declare <112 x i32> @llvm.masked.load.v112i32.p0(ptr, i32, <112 x i1>, <112 x i32>)
declare <128 x i32> @llvm.masked.load.v128i32.p0(ptr, i32, <128 x i1>, <128 x i32>)
declare <160 x i32> @llvm.masked.load.v160i32.p0(ptr, i32, <160 x i1>, <160 x i32>)
declare <192 x i32> @llvm.masked.load.v192i32.p0(ptr, i32, <192 x i1>, <192 x i32>)
declare <224 x i32> @llvm.masked.load.v224i32.p0(ptr, i32, <224 x i1>, <224 x i32>)
declare <256 x i32> @llvm.masked.load.v256i32.p0(ptr, i32, <256 x i1>, <256 x i32>)
declare <320 x i32> @llvm.masked.load.v320i32.p0(ptr, i32, <320 x i1>, <320 x i32>)
declare <384 x i32> @llvm.masked.load.v384i32.p0(ptr, i32, <384 x i1>, <384 x i32>)
declare <448 x i32> @llvm.masked.load.v448i32.p0(ptr, i32, <448 x i1>, <448 x i32>)
declare <512 x i32> @llvm.masked.load.v512i32.p0(ptr, i32, <512 x i1>, <512 x i32>)
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX512: {{.*}}
; FALLBACK0: {{.*}}
; FALLBACK1: {{.*}}
; FALLBACK2: {{.*}}
; FALLBACK3: {{.*}}
; FALLBACK4: {{.*}}
; FALLBACK5: {{.*}}
; FALLBACK6: {{.*}}
; FALLBACK7: {{.*}}