; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512,AVX512-V4
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512-VBMI
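;
; Bit-reversing the scalar mask of a vector compare should, where profitable,
; fold into a reversal of the vector lanes before the mask is extracted;
; targets without a cheap lane reverse fall back to scalar bit swaps.
;
; v4i1: SSE4.2/AVX2 reverse the dword lanes with pshufd before movmskps, and
; AVX512 reverses both compare operands before producing the k-mask; plain
; SSE2 expands the i4 bitreverse with scalar shifts and masks instead.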
define i4 @reverse_cmp_v4i1(<4 x i32> %a0, <4 x i32> %a1) {
; SSE2-LABEL: reverse_cmp_v4i1:
; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; SSE2-NEXT: movmskps %xmm0, %eax
; SSE2-NEXT: leal (%rax,%rax), %ecx
; SSE2-NEXT: andb $4, %cl
; SSE2-NEXT: leal (,%rax,8), %edx
; SSE2-NEXT: andb $8, %dl
; SSE2-NEXT: orb %cl, %dl
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: shrb %cl
; SSE2-NEXT: andb $2, %cl
; SSE2-NEXT: orb %dl, %cl
; SSE2-NEXT: shrb $3, %al
; SSE2-NEXT: orb %cl, %al
; SSE2-NEXT: # kill: def $al killed $al killed $rax
; SSE2-NEXT: retq
;
; SSE42-LABEL: reverse_cmp_v4i1:
; SSE42: # %bb.0:
; SSE42-NEXT: pcmpeqd %xmm1, %xmm0
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; SSE42-NEXT: movmskps %xmm0, %eax
; SSE42-NEXT: # kill: def $al killed $al killed $eax
; SSE42-NEXT: retq
;
; AVX2-LABEL: reverse_cmp_v4i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX2-NEXT: vmovmskps %xmm0, %eax
; AVX2-NEXT: # kill: def $al killed $al killed $eax
; AVX2-NEXT: retq
;
; AVX512-LABEL: reverse_cmp_v4i1:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
%cmp = icmp eq <4 x i32> %a0, %a1
%mask = bitcast <4 x i1> %cmp to i4
%rev = tail call i4 @llvm.bitreverse.i4(i4 %mask)
ret i4 %rev
}
declare i4 @llvm.bitreverse.i4(i4)
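; v8i1: SSE4.2/AVX2 reverse the word lanes with pshufb before forming the byte
; mask; SSE2 extracts the mask first and bit-reverses it with scalar rotates
; and shifts; AVX512 widens the k-mask to dword lanes and reverses with vpermd.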
define i8 @reverse_cmp_v8i1(<8 x i16> %a0, <8 x i16> %a1) {
; SSE2-LABEL: reverse_cmp_v8i1:
; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqw %xmm1, %xmm0
; SSE2-NEXT: packsswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
; SSE2-NEXT: rolb $4, %al
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: andb $51, %cl
; SSE2-NEXT: shlb $2, %cl
; SSE2-NEXT: shrb $2, %al
; SSE2-NEXT: andb $51, %al
; SSE2-NEXT: orb %cl, %al
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: andb $85, %cl
; SSE2-NEXT: addb %cl, %cl
; SSE2-NEXT: shrb %al
; SSE2-NEXT: andb $85, %al
; SSE2-NEXT: orb %cl, %al
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE42-LABEL: reverse_cmp_v8i1:
; SSE42: # %bb.0:
; SSE42-NEXT: pcmpeqw %xmm1, %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,15,u,13,u,11,u,9,u,7,u,5,u,3,u,1]
; SSE42-NEXT: packsswb %xmm0, %xmm0
; SSE42-NEXT: pmovmskb %xmm0, %eax
; SSE42-NEXT: # kill: def $al killed $al killed $eax
; SSE42-NEXT: retq
;
; AVX2-LABEL: reverse_cmp_v8i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,12,10,8,6,4,2,0,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpmovmskb %xmm0, %eax
; AVX2-NEXT: # kill: def $al killed $al killed $eax
; AVX2-NEXT: retq
;
; AVX512-LABEL: reverse_cmp_v8i1:
; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm0
; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpmovd2m %ymm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cmp = icmp eq <8 x i16> %a0, %a1
%mask = bitcast <8 x i1> %cmp to i8
%rev = tail call i8 @llvm.bitreverse.i8(i8 %mask)
ret i8 %rev
}
declare i8 @llvm.bitreverse.i8(i8)
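; v16i1: a full byte reverse via pshufb handles SSE4.2/AVX2; SSE2 bit-reverses
; the extracted i16 mask in scalar code; AVX512 widens the k-mask to word lanes
; and reverses with vpermw.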
define i16 @reverse_cmp_v16i1(<16 x i8> %a0, <16 x i8> %a1) {
; SSE2-LABEL: reverse_cmp_v16i1:
; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
; SSE2-NEXT: rolw $8, %ax
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: andl $3855, %ecx # imm = 0xF0F
; SSE2-NEXT: shll $4, %ecx
; SSE2-NEXT: shrl $4, %eax
; SSE2-NEXT: andl $3855, %eax # imm = 0xF0F
; SSE2-NEXT: orl %ecx, %eax
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: andl $13107, %ecx # imm = 0x3333
; SSE2-NEXT: shrl $2, %eax
; SSE2-NEXT: andl $13107, %eax # imm = 0x3333
; SSE2-NEXT: leal (%rax,%rcx,4), %eax
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: andl $21845, %ecx # imm = 0x5555
; SSE2-NEXT: shrl %eax
; SSE2-NEXT: andl $21845, %eax # imm = 0x5555
; SSE2-NEXT: leal (%rax,%rcx,2), %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE42-LABEL: reverse_cmp_v16i1:
; SSE42: # %bb.0:
; SSE42-NEXT: pcmpeqb %xmm1, %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; SSE42-NEXT: pmovmskb %xmm0, %eax
; SSE42-NEXT: # kill: def $ax killed $ax killed $eax
; SSE42-NEXT: retq
;
; AVX2-LABEL: reverse_cmp_v16i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX2-NEXT: vpmovmskb %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: retq
;
; AVX512-LABEL: reverse_cmp_v16i1:
; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
; AVX512-NEXT: vpmovm2w %k0, %ymm0
; AVX512-NEXT: vpmovsxbw {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX512-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpmovw2m %ymm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cmp = icmp eq <16 x i8> %a0, %a1
%mask = bitcast <16 x i1> %cmp to i16
%rev = tail call i16 @llvm.bitreverse.i16(i16 %mask)
ret i16 %rev
}
declare i16 @llvm.bitreverse.i16(i16)
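; v32i1: SSE2 combines two movmsk results and bit-reverses the i32 in scalar
; code (bswap plus mask/shift swizzles); SSE4.2 reverses each 16-byte half with
; pshufb and swaps the halves; AVX2 and AVX512-V4 need a vpshufb+vpermq pair,
; while AVX512VBMI can use a single cross-lane vpermb.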
define i32 @reverse_cmp_v32i1(<32 x i8> %a0, <32 x i8> %a1) {
; SSE2-LABEL: reverse_cmp_v32i1:
; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqb %xmm2, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
; SSE2-NEXT: pcmpeqb %xmm3, %xmm1
; SSE2-NEXT: pmovmskb %xmm1, %ecx
; SSE2-NEXT: shll $16, %ecx
; SSE2-NEXT: orl %eax, %ecx
; SSE2-NEXT: bswapl %ecx
; SSE2-NEXT: movl %ecx, %eax
; SSE2-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; SSE2-NEXT: shll $4, %eax
; SSE2-NEXT: shrl $4, %ecx
; SSE2-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
; SSE2-NEXT: orl %eax, %ecx
; SSE2-NEXT: movl %ecx, %eax
; SSE2-NEXT: andl $858993459, %eax # imm = 0x33333333
; SSE2-NEXT: shrl $2, %ecx
; SSE2-NEXT: andl $858993459, %ecx # imm = 0x33333333
; SSE2-NEXT: leal (%rcx,%rax,4), %eax
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: andl $1431655765, %ecx # imm = 0x55555555
; SSE2-NEXT: shrl %eax
; SSE2-NEXT: andl $1431655765, %eax # imm = 0x55555555
; SSE2-NEXT: leal (%rax,%rcx,2), %eax
; SSE2-NEXT: retq
;
; SSE42-LABEL: reverse_cmp_v32i1:
; SSE42: # %bb.0:
; SSE42-NEXT: pcmpeqb %xmm2, %xmm0
; SSE42-NEXT: pcmpeqb %xmm3, %xmm1
; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; SSE42-NEXT: pshufb %xmm2, %xmm1
; SSE42-NEXT: pmovmskb %xmm1, %ecx
; SSE42-NEXT: pshufb %xmm2, %xmm0
; SSE42-NEXT: pmovmskb %xmm0, %eax
; SSE42-NEXT: shll $16, %eax
; SSE42-NEXT: orl %ecx, %eax
; SSE42-NEXT: retq
;
; AVX2-LABEL: reverse_cmp_v32i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpmovmskb %ymm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-V4-LABEL: reverse_cmp_v32i1:
; AVX512-V4: # %bb.0:
; AVX512-V4-NEXT: vpcmpeqb %ymm1, %ymm0, %k0
; AVX512-V4-NEXT: vpmovm2b %k0, %ymm0
; AVX512-V4-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
; AVX512-V4-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512-V4-NEXT: vpmovb2m %ymm0, %k0
; AVX512-V4-NEXT: kmovd %k0, %eax
; AVX512-V4-NEXT: vzeroupper
; AVX512-V4-NEXT: retq
;
; AVX512-VBMI-LABEL: reverse_cmp_v32i1:
; AVX512-VBMI: # %bb.0:
; AVX512-VBMI-NEXT: vpcmpeqb %ymm1, %ymm0, %k0
; AVX512-VBMI-NEXT: vpmovm2b %k0, %ymm0
; AVX512-VBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX512-VBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512-VBMI-NEXT: vpmovb2m %ymm0, %k0
; AVX512-VBMI-NEXT: kmovd %k0, %eax
; AVX512-VBMI-NEXT: vzeroupper
; AVX512-VBMI-NEXT: retq
%cmp = icmp eq <32 x i8> %a0, %a1
%mask = bitcast <32 x i1> %cmp to i32
%rev = tail call i32 @llvm.bitreverse.i32(i32 %mask)
ret i32 %rev
}
declare i32 @llvm.bitreverse.i32(i32)
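; v64i1: the SSE variants assemble the i64 mask from four pmovmskb results
; (SSE2 bit-reverses it in scalar code, SSE4.2 reverses each half with pshufb
; before combining); AVX2 handles two 32-byte halves; AVX512-V4 uses
; vpshufb+vshufi64x2 on a zmm, and AVX512VBMI a single vpermb.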
define i64 @reverse_cmp_v64i1(<64 x i8> %a0, <64 x i8> %a1) {
; SSE2-LABEL: reverse_cmp_v64i1:
; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqb %xmm4, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
; SSE2-NEXT: pcmpeqb %xmm5, %xmm1
; SSE2-NEXT: pmovmskb %xmm1, %ecx
; SSE2-NEXT: shll $16, %ecx
; SSE2-NEXT: orl %eax, %ecx
; SSE2-NEXT: pcmpeqb %xmm6, %xmm2
; SSE2-NEXT: pmovmskb %xmm2, %eax
; SSE2-NEXT: pcmpeqb %xmm7, %xmm3
; SSE2-NEXT: pmovmskb %xmm3, %edx
; SSE2-NEXT: shll $16, %edx
; SSE2-NEXT: orl %eax, %edx
; SSE2-NEXT: shlq $32, %rdx
; SSE2-NEXT: orq %rcx, %rdx
; SSE2-NEXT: bswapq %rdx
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $4, %rax
; SSE2-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; SSE2-NEXT: andq %rcx, %rax
; SSE2-NEXT: andq %rcx, %rdx
; SSE2-NEXT: shlq $4, %rdx
; SSE2-NEXT: orq %rax, %rdx
; SSE2-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; SSE2-NEXT: movq %rdx, %rcx
; SSE2-NEXT: andq %rax, %rcx
; SSE2-NEXT: shrq $2, %rdx
; SSE2-NEXT: andq %rax, %rdx
; SSE2-NEXT: leaq (%rdx,%rcx,4), %rax
; SSE2-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; SSE2-NEXT: movq %rax, %rdx
; SSE2-NEXT: andq %rcx, %rdx
; SSE2-NEXT: shrq %rax
; SSE2-NEXT: andq %rcx, %rax
; SSE2-NEXT: leaq (%rax,%rdx,2), %rax
; SSE2-NEXT: retq
;
; SSE42-LABEL: reverse_cmp_v64i1:
; SSE42: # %bb.0:
; SSE42-NEXT: pcmpeqb %xmm4, %xmm0
; SSE42-NEXT: pcmpeqb %xmm5, %xmm1
; SSE42-NEXT: pcmpeqb %xmm6, %xmm2
; SSE42-NEXT: pcmpeqb %xmm7, %xmm3
; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; SSE42-NEXT: pshufb %xmm4, %xmm3
; SSE42-NEXT: pmovmskb %xmm3, %eax
; SSE42-NEXT: pshufb %xmm4, %xmm2
; SSE42-NEXT: pmovmskb %xmm2, %ecx
; SSE42-NEXT: shll $16, %ecx
; SSE42-NEXT: orl %eax, %ecx
; SSE42-NEXT: pshufb %xmm4, %xmm1
; SSE42-NEXT: pmovmskb %xmm1, %edx
; SSE42-NEXT: pshufb %xmm4, %xmm0
; SSE42-NEXT: pmovmskb %xmm0, %eax
; SSE42-NEXT: shll $16, %eax
; SSE42-NEXT: orl %edx, %eax
; SSE42-NEXT: shlq $32, %rax
; SSE42-NEXT: orq %rcx, %rax
; SSE42-NEXT: retq
;
; AVX2-LABEL: reverse_cmp_v64i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT: vpmovmskb %ymm1, %ecx
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpmovmskb %ymm0, %eax
; AVX2-NEXT: shlq $32, %rax
; AVX2-NEXT: orq %rcx, %rax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-V4-LABEL: reverse_cmp_v64i1:
; AVX512-V4: # %bb.0:
; AVX512-V4-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
; AVX512-V4-NEXT: vpmovm2b %k0, %zmm0
; AVX512-V4-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48]
; AVX512-V4-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1]
; AVX512-V4-NEXT: vpmovb2m %zmm0, %k0
; AVX512-V4-NEXT: kmovq %k0, %rax
; AVX512-V4-NEXT: vzeroupper
; AVX512-V4-NEXT: retq
;
; AVX512-VBMI-LABEL: reverse_cmp_v64i1:
; AVX512-VBMI: # %bb.0:
; AVX512-VBMI-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
; AVX512-VBMI-NEXT: vpmovm2b %k0, %zmm0
; AVX512-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX512-VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512-VBMI-NEXT: vpmovb2m %zmm0, %k0
; AVX512-VBMI-NEXT: kmovq %k0, %rax
; AVX512-VBMI-NEXT: vzeroupper
; AVX512-VBMI-NEXT: retq
%cmp = icmp eq <64 x i8> %a0, %a1
%mask = bitcast <64 x i1> %cmp to i64
%rev = tail call i64 @llvm.bitreverse.i64(i64 %mask)
ret i64 %rev
}
declare i64 @llvm.bitreverse.i64(i64)
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; SSE: {{.*}}