; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --default-march x86_64-unknown-linux-gnu --version 5
; RUN: llc -mattr=+sse2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=SSE
; RUN: llc -mattr=+avx -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX,AVX1
; RUN: llc -mattr=+avx2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX,AVX2
; RUN: llc -mattr=+avx512f -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX512,AVX512F
; RUN: llc -mattr=+avx512bw -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX512,AVX512BW
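
; Lowering of the llvm.canonicalize intrinsic for half (f16) values on x86.
; Without native FP16 arithmetic, canonicalization is expanded to a multiply
; by 1.0: the half value is extended to float (__extendhfsf2 libcall, or
; vcvtph2ps with AVX512), multiplied by the 1.0 loaded from the constant
; pool, and truncated back to half (__truncsfhf2 or vcvtps2ph).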
define void @v_test_canonicalize__half(half addrspace(1)* %out) nounwind {
; SSE-LABEL: v_test_canonicalize__half:
; SSE: # %bb.0: # %entry
; SSE-NEXT: pushq %rbx
; SSE-NEXT: subq $16, %rsp
; SSE-NEXT: movq %rdi, %rbx
; SSE-NEXT: pinsrw $0, (%rdi), %xmm0
; SSE-NEXT: callq __extendhfsf2@PLT
; SSE-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE-NEXT: pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: callq __extendhfsf2@PLT
; SSE-NEXT: mulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE-NEXT: callq __truncsfhf2@PLT
; SSE-NEXT: pextrw $0, %xmm0, %eax
; SSE-NEXT: movw %ax, (%rbx)
; SSE-NEXT: addq $16, %rsp
; SSE-NEXT: popq %rbx
; SSE-NEXT: retq
;
; AVX-LABEL: v_test_canonicalize__half:
; AVX: # %bb.0: # %entry
; AVX-NEXT: pushq %rbx
; AVX-NEXT: subq $16, %rsp
; AVX-NEXT: movq %rdi, %rbx
; AVX-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; AVX-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vpextrw $0, %xmm0, (%rbx)
; AVX-NEXT: addq $16, %rsp
; AVX-NEXT: popq %rbx
; AVX-NEXT: retq
;
; AVX512-LABEL: v_test_canonicalize__half:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: movzwl (%rdi), %eax
; AVX512-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx
; AVX512-NEXT: vmovd %ecx, %xmm0
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vmovd %eax, %xmm1
; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: movw %ax, (%rdi)
; AVX512-NEXT: retq
entry:
%val = load half, half addrspace(1)* %out
%canonicalized = call half @llvm.canonicalize.f16(half %val)
store half %canonicalized, half addrspace(1)* %out
ret void
}
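
; The canonicalize sits inside a chain of fsub/fadd on half values; only
; %mul2 is canonicalized (multiplied by the constant-pool 1.0) before the
; final fsub.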
define half @complex_canonicalize_fmul_half(half %a, half %b) nounwind {
; SSE-LABEL: complex_canonicalize_fmul_half:
; SSE: # %bb.0: # %entry
; SSE-NEXT: pushq %rax
; SSE-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE-NEXT: callq __extendhfsf2@PLT
; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
; SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: callq __extendhfsf2@PLT
; SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
; SSE-NEXT: # xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: subss %xmm0, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: callq __truncsfhf2@PLT
; SSE-NEXT: callq __extendhfsf2@PLT
; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
; SSE-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE-NEXT: callq __truncsfhf2@PLT
; SSE-NEXT: callq __extendhfsf2@PLT
; SSE-NEXT: subss (%rsp), %xmm0 # 4-byte Folded Reload
; SSE-NEXT: callq __truncsfhf2@PLT
; SSE-NEXT: callq __extendhfsf2@PLT
; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
; SSE-NEXT: pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: callq __extendhfsf2@PLT
; SSE-NEXT: mulss (%rsp), %xmm0 # 4-byte Folded Reload
; SSE-NEXT: callq __truncsfhf2@PLT
; SSE-NEXT: callq __extendhfsf2@PLT
; SSE-NEXT: subss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE-NEXT: callq __truncsfhf2@PLT
; SSE-NEXT: popq %rax
; SSE-NEXT: retq
;
; AVX-LABEL: complex_canonicalize_fmul_half:
; AVX: # %bb.0: # %entry
; AVX-NEXT: pushq %rax
; AVX-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill
; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; AVX-NEXT: # xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX-NEXT: vmovss (%rsp), %xmm1 # 4-byte Reload
; AVX-NEXT: # xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill
; AVX-NEXT: vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill
; AVX-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmulss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: popq %rax
; AVX-NEXT: retq
;
; AVX512-LABEL: complex_canonicalize_fmul_half:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpextrw $0, %xmm1, %eax
; AVX512-NEXT: vpextrw $0, %xmm0, %ecx
; AVX512-NEXT: vmovd %ecx, %xmm0
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vmovd %eax, %xmm1
; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT: vsubss %xmm0, %xmm2, %xmm0
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
; AVX512-NEXT: vmovd %eax, %xmm2
; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; AVX512-NEXT: retq
entry:
%mul1 = fsub half %a, %b
%add = fadd half %mul1, %b
%mul2 = fsub half %add, %mul1
%canonicalized = call half @llvm.canonicalize.f16(half %mul2)
%result = fsub half %canonicalized, %b
ret half %result
}
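
; Same expansion applied lane by lane to a <2 x half> value loaded from and
; stored back through the pointer argument.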
define void @v_test_canonicalize_v2half(<2 x half> addrspace(1)* %out) nounwind {
; SSE-LABEL: v_test_canonicalize_v2half:
; SSE: # %bb.0: # %entry
; SSE-NEXT: pushq %rbx
; SSE-NEXT: subq $48, %rsp
; SSE-NEXT: movq %rdi, %rbx
; SSE-NEXT: pinsrw $0, 2(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pinsrw $0, (%rdi), %xmm0
; SSE-NEXT: callq __extendhfsf2@PLT
; SSE-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE-NEXT: pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: callq __extendhfsf2@PLT
; SSE-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; SSE-NEXT: # xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: mulss %xmm0, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: callq __truncsfhf2@PLT
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: callq __extendhfsf2@PLT
; SSE-NEXT: mulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE-NEXT: callq __truncsfhf2@PLT
; SSE-NEXT: pextrw $0, %xmm0, %eax
; SSE-NEXT: movw %ax, 2(%rbx)
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: pextrw $0, %xmm0, %eax
; SSE-NEXT: movw %ax, (%rbx)
; SSE-NEXT: addq $48, %rsp
; SSE-NEXT: popq %rbx
; SSE-NEXT: retq
;
; AVX-LABEL: v_test_canonicalize_v2half:
; AVX: # %bb.0: # %entry
; AVX-NEXT: pushq %rbx
; AVX-NEXT: subq $48, %rsp
; AVX-NEXT: movq %rdi, %rbx
; AVX-NEXT: vpinsrw $0, 2(%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; AVX-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; AVX-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vpextrw $0, %xmm0, 2(%rbx)
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpextrw $0, %xmm0, (%rbx)
; AVX-NEXT: addq $48, %rsp
; AVX-NEXT: popq %rbx
; AVX-NEXT: retq
;
; AVX512-LABEL: v_test_canonicalize_v2half:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
; AVX512-NEXT: vmovd %eax, %xmm1
; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT: vmulss %xmm1, %xmm2, %xmm2
; AVX512-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX512-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3]
; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
; AVX512-NEXT: vmovd %xmm2, %eax
; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512-NEXT: vmovd %xmm0, (%rdi)
; AVX512-NEXT: retq
entry:
%val = load <2 x half>, <2 x half> addrspace(1)* %out
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
ret void
}

declare half @llvm.canonicalize.f16(half)
declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>)
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX1: {{.*}}
; AVX2: {{.*}}
; AVX512BW: {{.*}}
; AVX512F: {{.*}}