; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s
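
; Checks that the SLP vectorizer pairs adjacent scalar f16 operations into
; <2 x half>. gfx9 and later subtargets (including gfx10) provide packed
; 16-bit instructions such as v_pk_add_f16, so pairing is profitable there;
; gfx8 supports f16 arithmetic only in scalar form, hence the FIXMEs below.
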
; FIXME: Should not vectorize on gfx8
; GCN-LABEL: @fadd_combine_v2f16
; GCN: fadd <2 x half>
define void @fadd_combine_v2f16(ptr addrspace(1) %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
%tmp3 = load half, ptr addrspace(1) %tmp2, align 2
%tmp4 = fadd half %tmp3, 1.000000e+00
store half %tmp4, ptr addrspace(1) %tmp2, align 2
%tmp5 = add nuw nsw i64 %tmp1, 1
%tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
%tmp7 = load half, ptr addrspace(1) %tmp6, align 2
%tmp8 = fadd half %tmp7, 1.000000e+00
store half %tmp8, ptr addrspace(1) %tmp6, align 2
ret void
}

; FIXME: Should not vectorize on gfx8
; GCN-LABEL: @fsub_combine_v2f16
; GCN: fsub <2 x half>
define void @fsub_combine_v2f16(ptr addrspace(1) %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
%tmp3 = load half, ptr addrspace(1) %tmp2, align 2
%tmp4 = fsub half %tmp3, 1.000000e+00
store half %tmp4, ptr addrspace(1) %tmp2, align 2
%tmp5 = add nuw nsw i64 %tmp1, 1
%tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
%tmp7 = load half, ptr addrspace(1) %tmp6, align 2
%tmp8 = fsub half %tmp7, 1.000000e+00
store half %tmp8, ptr addrspace(1) %tmp6, align 2
ret void
}

; FIXME: Should not vectorize on gfx8
; GCN-LABEL: @fmul_combine_v2f16
; GCN: fmul <2 x half>
define void @fmul_combine_v2f16(ptr addrspace(1) %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
%tmp3 = load half, ptr addrspace(1) %tmp2, align 2
%tmp4 = fmul half %tmp3, 1.000000e+00
store half %tmp4, ptr addrspace(1) %tmp2, align 2
%tmp5 = add nuw nsw i64 %tmp1, 1
%tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
%tmp7 = load half, ptr addrspace(1) %tmp6, align 2
%tmp8 = fmul half %tmp7, 1.000000e+00
store half %tmp8, ptr addrspace(1) %tmp6, align 2
ret void
}

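; There is no packed f16 divide or remainder instruction; the vector fdiv and
; frem below are scalarized by the backend, so forming them is presumably
; cost-neutral and happens on all subtargets.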
; GCN-LABEL: @fdiv_combine_v2f16
; GCN: fdiv <2 x half>
define void @fdiv_combine_v2f16(ptr addrspace(1) %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
%tmp3 = load half, ptr addrspace(1) %tmp2, align 2
%tmp4 = fdiv half %tmp3, 1.000000e+00
store half %tmp4, ptr addrspace(1) %tmp2, align 2
%tmp5 = add nuw nsw i64 %tmp1, 1
%tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
%tmp7 = load half, ptr addrspace(1) %tmp6, align 2
%tmp8 = fdiv half %tmp7, 1.000000e+00
store half %tmp8, ptr addrspace(1) %tmp6, align 2
ret void
}

; GCN-LABEL: @frem_combine_v2f16
; GCN: frem <2 x half>
define void @frem_combine_v2f16(ptr addrspace(1) %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
%tmp3 = load half, ptr addrspace(1) %tmp2, align 2
%tmp4 = frem half %tmp3, 1.000000e+00
store half %tmp4, ptr addrspace(1) %tmp2, align 2
%tmp5 = add nuw nsw i64 %tmp1, 1
%tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
%tmp7 = load half, ptr addrspace(1) %tmp6, align 2
%tmp8 = frem half %tmp7, 1.000000e+00
store half %tmp8, ptr addrspace(1) %tmp6, align 2
ret void
}

; FIXME: Should not vectorize on gfx8
; GCN-LABEL: @fma_combine_v2f16
; GCN: call <2 x half> @llvm.fma.v2f16
define amdgpu_kernel void @fma_combine_v2f16(ptr addrspace(1) %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
%tmp3 = load half, ptr addrspace(1) %tmp2, align 2
%tmp4 = tail call half @llvm.fma.f16(half %tmp3, half 1.000000e+00, half 1.000000e+00)
store half %tmp4, ptr addrspace(1) %tmp2, align 2
%tmp5 = add nuw nsw i64 %tmp1, 1
%tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
%tmp7 = load half, ptr addrspace(1) %tmp6, align 2
%tmp8 = tail call half @llvm.fma.f16(half %tmp7, half 1.000000e+00, half 1.000000e+00)
store half %tmp8, ptr addrspace(1) %tmp6, align 2
ret void
}

; FIXME: Should not vectorize on gfx8
; GCN-LABEL: @fmuladd_combine_v2f16
; GCN: call <2 x half> @llvm.fmuladd.v2f16
define amdgpu_kernel void @fmuladd_combine_v2f16(ptr addrspace(1) %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
%tmp3 = load half, ptr addrspace(1) %tmp2, align 2
%tmp4 = tail call half @llvm.fmuladd.f16(half %tmp3, half 1.000000e+00, half 1.000000e+00)
store half %tmp4, ptr addrspace(1) %tmp2, align 2
%tmp5 = add nuw nsw i64 %tmp1, 1
%tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
%tmp7 = load half, ptr addrspace(1) %tmp6, align 2
%tmp8 = tail call half @llvm.fmuladd.f16(half %tmp7, half 1.000000e+00, half 1.000000e+00)
store half %tmp8, ptr addrspace(1) %tmp6, align 2
ret void
}

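; Packed min/max (v_pk_min_f16, v_pk_max_f16) are only available on gfx9 and
; later, so minnum/maxnum stay scalar on gfx8 but are paired on gfx9 and
; gfx10.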
; GCN-LABEL: @minnum_combine_v2f16
; GFX8: call half @llvm.minnum.f16(
; GFX8: call half @llvm.minnum.f16(
; GFX9: call <2 x half> @llvm.minnum.v2f16
define void @minnum_combine_v2f16(ptr addrspace(1) %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
%tmp3 = load half, ptr addrspace(1) %tmp2, align 2
%tmp4 = call half @llvm.minnum.f16(half %tmp3, half 1.000000e+00)
store half %tmp4, ptr addrspace(1) %tmp2, align 2
%tmp5 = add nuw nsw i64 %tmp1, 1
%tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
%tmp7 = load half, ptr addrspace(1) %tmp6, align 2
%tmp8 = call half @llvm.minnum.f16(half %tmp7, half 1.000000e+00)
store half %tmp8, ptr addrspace(1) %tmp6, align 2
ret void
}

; GCN-LABEL: @maxnum_combine_v2f16
; GFX8: call half @llvm.maxnum.f16(
; GFX8: call half @llvm.maxnum.f16(
; GFX9: call <2 x half> @llvm.maxnum.v2f16
define void @maxnum_combine_v2f16(ptr addrspace(1) %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
%tmp3 = load half, ptr addrspace(1) %tmp2, align 2
%tmp4 = call half @llvm.maxnum.f16(half %tmp3, half 1.000000e+00)
store half %tmp4, ptr addrspace(1) %tmp2, align 2
%tmp5 = add nuw nsw i64 %tmp1, 1
%tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
%tmp7 = load half, ptr addrspace(1) %tmp6, align 2
%tmp8 = call half @llvm.maxnum.f16(half %tmp7, half 1.000000e+00)
store half %tmp8, ptr addrspace(1) %tmp6, align 2
ret void
}

; FIXME: Should vectorize
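; llvm.minimum/llvm.maximum propagate NaN (IEEE 754-2019 semantics) and are
; presumably costed conservatively, so they are not paired on any subtarget.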
; GCN-LABEL: @minimum_combine_v2f16
; GCN: call half @llvm.minimum.f16(
; GCN: call half @llvm.minimum.f16(
define void @minimum_combine_v2f16(ptr addrspace(1) %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
%tmp3 = load half, ptr addrspace(1) %tmp2, align 2
%tmp4 = call half @llvm.minimum.f16(half %tmp3, half 1.000000e+00)
store half %tmp4, ptr addrspace(1) %tmp2, align 2
%tmp5 = add nuw nsw i64 %tmp1, 1
%tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
%tmp7 = load half, ptr addrspace(1) %tmp6, align 2
%tmp8 = call half @llvm.minimum.f16(half %tmp7, half 1.000000e+00)
store half %tmp8, ptr addrspace(1) %tmp6, align 2
ret void
}

; GCN-LABEL: @maximum_combine_v2f16
; GCN: call half @llvm.maximum.f16(
; GCN: call half @llvm.maximum.f16(
define void @maximum_combine_v2f16(ptr addrspace(1) %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
%tmp3 = load half, ptr addrspace(1) %tmp2, align 2
%tmp4 = call half @llvm.maximum.f16(half %tmp3, half 1.000000e+00)
store half %tmp4, ptr addrspace(1) %tmp2, align 2
%tmp5 = add nuw nsw i64 %tmp1, 1
%tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
%tmp7 = load half, ptr addrspace(1) %tmp6, align 2
%tmp8 = call half @llvm.maximum.f16(half %tmp7, half 1.000000e+00)
store half %tmp8, ptr addrspace(1) %tmp6, align 2
ret void
}

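; canonicalize, fabs, fneg, and copysign pair on every subtarget: fneg and
; fabs map to free source modifiers on AMDGPU, and the others lower to cheap
; bit or ALU operations, so the vector forms are presumably rated as cheap.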
; GCN-LABEL: @canonicalize_combine_v2f16
; GCN: call <2 x half> @llvm.canonicalize.v2f16(
define void @canonicalize_combine_v2f16(ptr addrspace(1) %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
%tmp3 = load half, ptr addrspace(1) %tmp2, align 2
%tmp4 = call half @llvm.canonicalize.f16(half %tmp3)
store half %tmp4, ptr addrspace(1) %tmp2, align 2
%tmp5 = add nuw nsw i64 %tmp1, 1
%tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
%tmp7 = load half, ptr addrspace(1) %tmp6, align 2
%tmp8 = call half @llvm.canonicalize.f16(half %tmp7)
store half %tmp8, ptr addrspace(1) %tmp6, align 2
ret void
}

; GCN-LABEL: @fabs_combine_v2f16
; GCN: call <2 x half> @llvm.fabs.v2f16(
define void @fabs_combine_v2f16(ptr addrspace(1) %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
%tmp3 = load half, ptr addrspace(1) %tmp2, align 2
%tmp4 = call half @llvm.fabs.f16(half %tmp3)
store half %tmp4, ptr addrspace(1) %tmp2, align 2
%tmp5 = add nuw nsw i64 %tmp1, 1
%tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
%tmp7 = load half, ptr addrspace(1) %tmp6, align 2
%tmp8 = call half @llvm.fabs.f16(half %tmp7)
store half %tmp8, ptr addrspace(1) %tmp6, align 2
ret void
}

; GCN-LABEL: @fneg_combine_v2f16
; GCN: fneg <2 x half>
define void @fneg_combine_v2f16(ptr addrspace(1) %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
%tmp3 = load half, ptr addrspace(1) %tmp2, align 2
%tmp4 = fneg half %tmp3
store half %tmp4, ptr addrspace(1) %tmp2, align 2
%tmp5 = add nuw nsw i64 %tmp1, 1
%tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
%tmp7 = load half, ptr addrspace(1) %tmp6, align 2
%tmp8 = fneg half %tmp7
store half %tmp8, ptr addrspace(1) %tmp6, align 2
ret void
}

; GCN-LABEL: @copysign_combine_v2f16
; GCN: call <2 x half> @llvm.copysign.v2f16(
define void @copysign_combine_v2f16(ptr addrspace(1) %arg, half %sign) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
%tmp3 = load half, ptr addrspace(1) %tmp2, align 2
%tmp4 = call half @llvm.copysign.f16(half %tmp3, half %sign)
store half %tmp4, ptr addrspace(1) %tmp2, align 2
%tmp5 = add nuw nsw i64 %tmp1, 1
%tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
%tmp7 = load half, ptr addrspace(1) %tmp6, align 2
%tmp8 = call half @llvm.copysign.f16(half %tmp7, half %sign)
store half %tmp8, ptr addrspace(1) %tmp6, align 2
ret void
}

; FIXME: Should always vectorize
; GCN-LABEL: @copysign_combine_v4f16
; GCN: call <2 x half> @llvm.copysign.v2f16(
; GFX8: call half @llvm.copysign.f16(
; GFX8: call half @llvm.copysign.f16(
; GFX9: call <2 x half> @llvm.copysign.v2f16(
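; Note that the third load reuses %tmp6 rather than %tmp10, so the second
; pair of loads is not contiguous; that likely explains why gfx8 leaves the
; second pair scalar.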
define void @copysign_combine_v4f16(ptr addrspace(1) %arg, half %sign) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
%tmp3 = load half, ptr addrspace(1) %tmp2, align 2
%tmp4 = call half @llvm.copysign.f16(half %tmp3, half %sign)
store half %tmp4, ptr addrspace(1) %tmp2, align 2
%tmp5 = add nuw nsw i64 %tmp1, 1
%tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
%tmp7 = load half, ptr addrspace(1) %tmp6, align 2
%tmp8 = call half @llvm.copysign.f16(half %tmp7, half %sign)
store half %tmp8, ptr addrspace(1) %tmp6, align 2
%tmp9 = add nuw nsw i64 %tmp1, 2
%tmp10 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp9
%tmp11 = load half, ptr addrspace(1) %tmp6, align 2
%tmp12 = call half @llvm.copysign.f16(half %tmp11, half %sign)
store half %tmp12, ptr addrspace(1) %tmp10, align 2
%tmp13 = add nuw nsw i64 %tmp1, 3
%tmp14 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp13
%tmp15 = load half, ptr addrspace(1) %tmp14, align 2
%tmp16 = call half @llvm.copysign.f16(half %tmp15, half %sign)
store half %tmp16, ptr addrspace(1) %tmp14, align 2
ret void
}

; GCN-LABEL: @canonicalize_combine_v4f16
; GCN: call <2 x half> @llvm.canonicalize.v2f16(
; GCN: call <2 x half> @llvm.canonicalize.v2f16(
define void @canonicalize_combine_v4f16(ptr addrspace(1) %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
%tmp3 = load half, ptr addrspace(1) %tmp2, align 2
%tmp4 = call half @llvm.canonicalize.f16(half %tmp3)
store half %tmp4, ptr addrspace(1) %tmp2, align 2
%tmp5 = add nuw nsw i64 %tmp1, 1
%tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
%tmp7 = load half, ptr addrspace(1) %tmp6, align 2
%tmp8 = call half @llvm.canonicalize.f16(half %tmp7)
store half %tmp8, ptr addrspace(1) %tmp6, align 2
%tmp9 = add nuw nsw i64 %tmp1, 2
%tmp10 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp9
%tmp11 = load half, ptr addrspace(1) %tmp6, align 2
%tmp12 = call half @llvm.canonicalize.f16(half %tmp11)
store half %tmp12, ptr addrspace(1) %tmp10, align 2
%tmp13 = add nuw nsw i64 %tmp1, 3
%tmp14 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp13
%tmp15 = load half, ptr addrspace(1) %tmp14, align 2
%tmp16 = call half @llvm.canonicalize.f16(half %tmp15)
store half %tmp16, ptr addrspace(1) %tmp14, align 2
ret void
}
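
; Declarations for the intrinsics exercised above.
declare i32 @llvm.amdgcn.workitem.id.x()
declare half @llvm.fma.f16(half, half, half)
declare half @llvm.fmuladd.f16(half, half, half)
declare half @llvm.minnum.f16(half, half)
declare half @llvm.maxnum.f16(half, half)
declare half @llvm.minimum.f16(half, half)
declare half @llvm.maximum.f16(half, half)
declare half @llvm.canonicalize.f16(half)
declare half @llvm.fabs.f16(half)
declare half @llvm.copysign.f16(half, half)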