; llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
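
; Checks that the SLP vectorizer combines pairs of adjacent scalar f16
; operations into <2 x half> operations, and how the outcome differs between
; subtargets without packed f16 math (gfx8) and with it (gfx9 and later).
;
; As a sketch of the intended transform, the two scalar adds in
; @fadd_combine_v2f16 below would become something like:
;   %v = load <2 x half>, ptr addrspace(1) %tmp2, align 2
;   %a = fadd <2 x half> %v, <half 1.000000e+00, half 1.000000e+00>
;   store <2 x half> %a, ptr addrspace(1) %tmp2, align 2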

; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s

; FIXME: Should not vectorize on gfx8
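; (gfx8 has scalar f16 arithmetic but no packed v_pk_* instructions, which
; arrived with gfx9, so <2 x half> operations are scalarized there and
; vectorizing them is not expected to be profitable.)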

; GCN-LABEL: @fadd_combine_v2f16
; GCN: fadd <2 x half>
define void @fadd_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = fadd half %tmp3, 1.000000e+00
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = fadd half %tmp7, 1.000000e+00
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; FIXME: Should not vectorize on gfx8
; GCN-LABEL: @fsub_combine_v2f16
; GCN: fsub <2 x half>
define void @fsub_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = fsub half %tmp3, 1.000000e+00
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = fsub half %tmp7, 1.000000e+00
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; FIXME: Should not vectorize on gfx8
; GCN-LABEL: @fmul_combine_v2f16
; GCN: fmul <2 x half>
define void @fmul_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = fmul half %tmp3, 1.000000e+00
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = fmul half %tmp7, 1.000000e+00
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; GCN-LABEL: @fdiv_combine_v2f16
; GCN: fdiv <2 x half>
define void @fdiv_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = fdiv half %tmp3, 1.000000e+00
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = fdiv half %tmp7, 1.000000e+00
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; GCN-LABEL: @frem_combine_v2f16
; GCN: frem <2 x half>
define void @frem_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = frem half %tmp3, 1.000000e+00
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = frem half %tmp7, 1.000000e+00
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; FIXME: Should not vectorize on gfx8
; GCN-LABEL: @fma_combine_v2f16
; GCN: call <2 x half> @llvm.fma.v2f16
define amdgpu_kernel void @fma_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = tail call half @llvm.fma.f16(half %tmp3, half 1.000000e+00, half 1.000000e+00)
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = tail call half @llvm.fma.f16(half %tmp7, half 1.000000e+00, half 1.000000e+00)
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; FIXME: Should not vectorize on gfx8
; GCN-LABEL: @fmuladd_combine_v2f16
; GCN: call <2 x half> @llvm.fmuladd.v2f16
define amdgpu_kernel void @fmuladd_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = tail call half @llvm.fmuladd.f16(half %tmp3, half 1.000000e+00, half 1.000000e+00)
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = tail call half @llvm.fmuladd.f16(half %tmp7, half 1.000000e+00, half 1.000000e+00)
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; GCN-LABEL: @minnum_combine_v2f16
; GFX8: call half @llvm.minnum.f16(
; GFX8: call half @llvm.minnum.f16(

; GFX9: call <2 x half> @llvm.minnum.v2f16
define void @minnum_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = call half @llvm.minnum.f16(half %tmp3, half 1.000000e+00)
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = call half @llvm.minnum.f16(half %tmp7, half 1.000000e+00)
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; GCN-LABEL: @maxnum_combine_v2f16
; GFX8: call half @llvm.maxnum.f16(
; GFX8: call half @llvm.maxnum.f16(

; GFX9: call <2 x half> @llvm.maxnum.v2f16
define void @maxnum_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = call half @llvm.maxnum.f16(half %tmp3, half 1.000000e+00)
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = call half @llvm.maxnum.f16(half %tmp7, half 1.000000e+00)
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; FIXME: Should vectorize
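; (llvm.minimum/llvm.maximum use NaN-propagating semantics and are currently
; left scalar by the cost model on all of these targets.)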
; GCN-LABEL: @minimum_combine_v2f16
; GCN: call half @llvm.minimum.f16(
; GCN: call half @llvm.minimum.f16(
define void @minimum_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = call half @llvm.minimum.f16(half %tmp3, half 1.000000e+00)
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = call half @llvm.minimum.f16(half %tmp7, half 1.000000e+00)
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; FIXME: Should vectorize
; GCN-LABEL: @maximum_combine_v2f16
; GCN: call half @llvm.maximum.f16(
; GCN: call half @llvm.maximum.f16(
define void @maximum_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = call half @llvm.maximum.f16(half %tmp3, half 1.000000e+00)
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = call half @llvm.maximum.f16(half %tmp7, half 1.000000e+00)
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; GCN-LABEL: @canonicalize_combine_v2f16
; GCN: call <2 x half> @llvm.canonicalize.v2f16(
define void @canonicalize_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = call half @llvm.canonicalize.f16(half %tmp3)
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = call half @llvm.canonicalize.f16(half %tmp7)
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; GCN-LABEL: @fabs_combine_v2f16
; GCN: call <2 x half> @llvm.fabs.v2f16(
define void @fabs_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = call half @llvm.fabs.f16(half %tmp3)
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = call half @llvm.fabs.f16(half %tmp7)
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; GCN-LABEL: @fneg_combine_v2f16
; GCN: fneg <2 x half>
define void @fneg_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = fneg half %tmp3
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = fneg half %tmp7
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; GCN-LABEL: @copysign_combine_v2f16
; GCN: call <2 x half> @llvm.copysign.v2f16(
define void @copysign_combine_v2f16(ptr addrspace(1) %arg, half %sign) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = call half @llvm.copysign.f16(half %tmp3, half %sign)
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = call half @llvm.copysign.f16(half %tmp7, half %sign)
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; FIXME: Should always vectorize
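; The leading pair is expected to vectorize on all targets; the trailing
; pair currently stays scalar on gfx8.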
; GCN-LABEL: @copysign_combine_v4f16
; GCN: call <2 x half> @llvm.copysign.v2f16(

; GFX8: call half @llvm.copysign.f16(
; GFX8: call half @llvm.copysign.f16(

; GFX9: call <2 x half> @llvm.copysign.v2f16(
define void @copysign_combine_v4f16(ptr addrspace(1) %arg, half %sign) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64

  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = call half @llvm.copysign.f16(half %tmp3, half %sign)
  store half %tmp4, ptr addrspace(1) %tmp2, align 2

  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = call half @llvm.copysign.f16(half %tmp7, half %sign)
  store half %tmp8, ptr addrspace(1) %tmp6, align 2

  %tmp9 = add nuw nsw i64 %tmp1, 2
  %tmp10 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp9
  %tmp11 = load half, ptr addrspace(1) %tmp10, align 2
  %tmp12 = call half @llvm.copysign.f16(half %tmp11, half %sign)
  store half %tmp12, ptr addrspace(1) %tmp10, align 2

  %tmp13 = add nuw nsw i64 %tmp1, 3
  %tmp14 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp13
  %tmp15 = load half, ptr addrspace(1) %tmp14, align 2
  %tmp16 = call half @llvm.copysign.f16(half %tmp15, half %sign)
  store half %tmp16, ptr addrspace(1) %tmp14, align 2
  ret void
}

; GCN-LABEL: @canonicalize_combine_v4f16
; GCN: call <2 x half> @llvm.canonicalize.v2f16(
; GCN: call <2 x half> @llvm.canonicalize.v2f16(
define void @canonicalize_combine_v4f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64

  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = call half @llvm.canonicalize.f16(half %tmp3)
  store half %tmp4, ptr addrspace(1) %tmp2, align 2

  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = call half @llvm.canonicalize.f16(half %tmp7)
  store half %tmp8, ptr addrspace(1) %tmp6, align 2

  %tmp9 = add nuw nsw i64 %tmp1, 2
  %tmp10 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp9
  %tmp11 = load half, ptr addrspace(1) %tmp10, align 2
  %tmp12 = call half @llvm.canonicalize.f16(half %tmp11)
  store half %tmp12, ptr addrspace(1) %tmp10, align 2

  %tmp13 = add nuw nsw i64 %tmp1, 3
  %tmp14 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp13
  %tmp15 = load half, ptr addrspace(1) %tmp14, align 2
  %tmp16 = call half @llvm.canonicalize.f16(half %tmp15)
  store half %tmp16, ptr addrspace(1) %tmp14, align 2
  ret void
}
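
; Declarations for the intrinsics exercised above. Newer opt releases can
; infer these from the call sites; they are spelled out so the file also
; parses with older releases.
declare i32 @llvm.amdgcn.workitem.id.x()
declare half @llvm.fma.f16(half, half, half)
declare half @llvm.fmuladd.f16(half, half, half)
declare half @llvm.minnum.f16(half, half)
declare half @llvm.maxnum.f16(half, half)
declare half @llvm.minimum.f16(half, half)
declare half @llvm.maximum.f16(half, half)
declare half @llvm.canonicalize.f16(half)
declare half @llvm.fabs.f16(half)
declare half @llvm.copysign.f16(half, half)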