; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -amdgpu-codegenprepare-break-large-phis=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; XUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; Note: breaking up large PHIs is disabled to prevent some testcases from becoming
; branchless.
; FIXME: This leaves behind a now-unnecessary AND with exec.
; Triangle CFG on a vcc condition: %entry branches either straight to %endif
; or through %if, which doubles the loaded float. The checks below expect the
; phi to be lowered to a v_cndmask_b32 select on vcc, fed by an inverted (neq)
; compare, with the add emitted unconditionally.
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[VAL]], vcc
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @test_vccnz_ifcvt_triangle(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%v = load float, ptr addrspace(1) %in
%cc = fcmp oeq float %v, 1.000000e+00
br i1 %cc, label %if, label %endif
if:
; Single cheap instruction on the conditional side of the triangle.
%u = fadd float %v, %v
br label %endif
endif:
; %v when the compare fails, %u when it succeeds.
%r = phi float [ %v, %entry ], [ %u, %if ]
store float %r, ptr addrspace(1) %out
ret void
}
; Diamond CFG on a vcc condition: %if computes v + v, %else computes v * v,
; and both arms meet in a phi at %endif. The checks expect both arms to be
; emitted (in either order, hence the DAG checks) and the phi to become a
; v_cndmask_b32 select whose result is what gets stored.
;
; The compare is emitted inverted (neq), so vcc is set when the %else arm
; would have run: the select takes the mul when vcc is set and the add
; otherwise. The store must be checked against the select result; checking it
; against one arm's register would not verify that the merge happened.
; GCN-LABEL: {{^}}test_vccnz_ifcvt_diamond:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VAL]], [[VAL]]
; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[MUL]], vcc
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @test_vccnz_ifcvt_diamond(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%v = load float, ptr addrspace(1) %in
%cc = fcmp oeq float %v, 1.000000e+00
br i1 %cc, label %if, label %else
if:
; Taken when %v == 1.0.
%u0 = fadd float %v, %v
br label %endif
else:
; Taken when the ordered-equal compare fails.
%u1 = fmul float %v, %v
br label %endif
endif:
%r = phi float [ %u0, %if ], [ %u1, %else ]
store float %r, ptr addrspace(1) %out
ret void
}
; Triangle whose conditional block contains inline asm that clobbers vcc, and
; whose condition comes from the kernel argument %k rather than the loaded
; value. The checks expect the compare result to be produced into an SGPR pair
; (e64 encoding), the add to use vcc for its own carry-out, and the saved
; compare to be copied back into vcc with s_mov_b64 before the v_cndmask_b32.
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_vcc_clobber:
; GCN: ; clobber vcc
; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc
; GCN: s_mov_b64 vcc, [[CMP]]
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_vcc_clobber(ptr addrspace(1) %out, ptr addrspace(1) %in, float %k) #0 {
entry:
%v = load i32, ptr addrspace(1) %in
%cc = fcmp oeq float %k, 1.000000e+00
br i1 %cc, label %if, label %endif
if:
; The asm has no operands; its only constraint is the vcc clobber.
call void asm "; clobber $0", "~{vcc}"() #0
%u = add i32 %v, %v
br label %endif
endif:
%r = phi i32 [ %v, %entry ], [ %u, %if ]
store i32 %r, ptr addrspace(1) %out
ret void
}
; Longest chain of cheap instructions to convert
; The conditional block holds a dependent chain of nine fmuls. The checks
; expect all nine v_mul_f32 to be emitted followed by a v_cndmask_b32, i.e.
; the triangle is still turned into a select at this length. Compare with
; test_vccnz_ifcvt_triangle_min_expensive, which has one more fmul.
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_max_cheap:
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_max_cheap(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%v = load float, ptr addrspace(1) %in
%cc = fcmp oeq float %v, 1.000000e+00
br i1 %cc, label %if, label %endif
if:
; Each fmul consumes the previous result, so the chain is fully serial.
%u.0 = fmul float %v, %v
%u.1 = fmul float %v, %u.0
%u.2 = fmul float %v, %u.1
%u.3 = fmul float %v, %u.2
%u.4 = fmul float %v, %u.3
%u.5 = fmul float %v, %u.4
%u.6 = fmul float %v, %u.5
%u.7 = fmul float %v, %u.6
%u.8 = fmul float %v, %u.7
br label %endif
endif:
%r = phi float [ %v, %entry ], [ %u.8, %if ]
store float %r, ptr addrspace(1) %out
ret void
}
; Shortest chain of cheap instructions that is not converted
; The conditional block holds a dependent chain of ten fmuls, one more than
; test_vccnz_ifcvt_triangle_max_cheap. The checks expect the branch to be
; kept: an s_cbranch_vccnz to the join label, the ten v_mul_f32 in the
; skipped region, then the join label and the store.
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_min_expensive:
; GCN: s_cbranch_vccnz [[ENDIF:.LBB[0-9]+_[0-9]+]]
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_min_expensive(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%v = load float, ptr addrspace(1) %in
%cc = fcmp oeq float %v, 1.000000e+00
br i1 %cc, label %if, label %endif
if:
; Each fmul consumes the previous result, so the chain is fully serial.
%u.0 = fmul float %v, %v
%u.1 = fmul float %v, %u.0
%u.2 = fmul float %v, %u.1
%u.3 = fmul float %v, %u.2
%u.4 = fmul float %v, %u.3
%u.5 = fmul float %v, %u.4
%u.6 = fmul float %v, %u.5
%u.7 = fmul float %v, %u.6
%u.8 = fmul float %v, %u.7
%u.9 = fmul float %v, %u.8
br label %endif
endif:
%r = phi float [ %v, %entry ], [ %u.9, %if ]
store float %r, ptr addrspace(1) %out
ret void
}
; Should still branch over fdiv expansion
; The conditional block contains a single fdiv. The checks expect the branch
; to remain (s_cbranch_vccnz to the join label) with the division expansion,
; identified by v_div_scale_f32, inside the skipped region.
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_expensive:
; GCN: v_cmp_neq_f32_e32
; GCN: s_cbranch_vccnz [[ENDIF:.LBB[0-9]+_[0-9]+]]
; GCN: v_div_scale_f32
; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_expensive(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%v = load float, ptr addrspace(1) %in
%cc = fcmp oeq float %v, 1.000000e+00
br i1 %cc, label %if, label %endif
if:
; One IR instruction, but the checks show it expands to a longer sequence.
%u = fdiv float %v, %v
br label %endif
endif:
%r = phi float [ %v, %entry ], [ %u, %if ]
store float %r, ptr addrspace(1) %out
ret void
}
; vcc branch with SGPR inputs
; The condition is a float compare on the kernel argument %cnd, while the
; value being selected is an i32 loaded from addrspace(4). The checks expect
; the branch to be kept: s_cbranch_vccnz over a scalar s_add_i32, then the
; join label and the store.
; GCN-LABEL: {{^}}test_vccnz_sgpr_ifcvt_triangle:
; GCN: v_cmp_neq_f32_e64
; GCN: s_cbranch_vccnz [[ENDIF:.LBB[0-9]+_[0-9]+]]
; GCN: s_add_i32
; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(ptr addrspace(1) %out, ptr addrspace(4) %in, float %cnd) #0 {
entry:
%v = load i32, ptr addrspace(4) %in
%cc = fcmp oeq float %cnd, 1.000000e+00
br i1 %cc, label %if, label %endif
if:
%u = add i32 %v, %v
br label %endif
endif:
%r = phi i32 [ %v, %entry ], [ %u, %if ]
store i32 %r, ptr addrspace(1) %out
ret void
}
; Same shape as test_vccnz_ifcvt_triangle, except the float is loaded from
; addrspace(4) instead of addrspace(1). The only expectation is that a
; v_cndmask_b32 select is emitted.
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_constant_load:
; GCN: v_cndmask_b32
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_constant_load(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
%v = load float, ptr addrspace(4) %in
%cc = fcmp oeq float %v, 1.000000e+00
br i1 %cc, label %if, label %endif
if:
%u = fadd float %v, %v
br label %endif
endif:
%r = phi float [ %v, %entry ], [ %u, %if ]
store float %r, ptr addrspace(1) %out
ret void
}
; Same triangle as test_vccnz_ifcvt_triangle_constant_load, but the float
; comes directly from a kernel argument instead of an explicit load.
; NOTE(review): this comment used to say that, due to a broken cost heuristic,
; this case is not if-converted even though it should be. The check below
; expects a v_cndmask_b32, which is what the converted form produces, so that
; statement looks stale -- confirm against current llc output.
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_argload:
; GCN: v_cndmask_b32
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_argload(ptr addrspace(1) %out, float %v) #0 {
entry:
%cc = fcmp oeq float %v, 1.000000e+00
br i1 %cc, label %if, label %endif
if:
%u = fadd float %v, %v
br label %endif
endif:
%r = phi float [ %v, %entry ], [ %u, %if ]
store float %r, ptr addrspace(1) %out
ret void
}
; Scalar branch and scalar inputs
; The condition is an integer compare on the kernel argument %cond and the
; value is an i32 loaded from addrspace(4). The result is consumed by inline
; asm through an "s" constraint rather than stored. The checks expect the add
; to be emitted unconditionally and the scalar compare to be followed
; immediately by an s_cselect_b32 choosing between the loaded value and the
; sum.
; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle:
; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]]
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[VAL]], [[ADD]]
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(ptr addrspace(4) %in, i32 %cond) #0 {
entry:
%v = load i32, ptr addrspace(4) %in
%cc = icmp eq i32 %cond, 1
br i1 %cc, label %if, label %endif
if:
%u = add i32 %v, %v
br label %endif
endif:
%r = phi i32 [ %v, %entry ], [ %u, %if ]
; Side-effecting asm use keeps %r alive without a memory store.
call void asm sideeffect "; reg use $0", "s"(i32 %r) #0
ret void
}
; FIXME: Should be able to use VALU compare and select
; Scalar branch but VGPR select operands
; The condition is an integer compare on the kernel argument %cond, while the
; selected values are floats derived from an addrspace(1) load. The checks
; record the current behaviour: the branch is kept (s_cbranch_scc1 to the join
; label) with the v_add_f32 inside the skipped region.
; GCN-LABEL: {{^}}test_scc1_vgpr_ifcvt_triangle:
; GCN: s_cmp_lg_u32
; GCN: s_cbranch_scc1 [[ENDIF:.LBB[0-9]+_[0-9]+]]
; GCN: v_add_f32_e32
; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_scc1_vgpr_ifcvt_triangle(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %cond) #0 {
entry:
%v = load float, ptr addrspace(1) %in
%cc = icmp eq i32 %cond, 1
br i1 %cc, label %if, label %endif
if:
%u = fadd float %v, %v
br label %endif
endif:
%r = phi float [ %v, %entry ], [ %u, %if ]
store float %r, ptr addrspace(1) %out
ret void
}
; 64-bit variant of test_scc1_sgpr_ifcvt_triangle: the selected value is an
; i64 loaded from addrspace(4). The checks expect the add as an
; s_add_u32/s_addc_u32 pair and the scalar compare to be followed immediately
; by a single s_cselect_b64.
; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle64:
; GCN: s_add_u32
; GCN: s_addc_u32
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(ptr addrspace(4) %in, i32 %cond) #0 {
entry:
%v = load i64, ptr addrspace(4) %in
%cc = icmp eq i32 %cond, 1
br i1 %cc, label %if, label %endif
if:
%u = add i64 %v, %v
br label %endif
endif:
%r = phi i64 [ %v, %entry ], [ %u, %if ]
; Side-effecting asm use keeps %r alive without a memory store.
call void asm sideeffect "; reg use $0", "s"(i64 %r) #0
ret void
}
; TODO: Can do s_cselect_b64; s_cselect_b32
; 96-bit variant of test_scc1_sgpr_ifcvt_triangle: the selected value is a
; <3 x i32> loaded from addrspace(4). The checks record the current
; behaviour: three s_add_i32, then the scalar compare followed immediately by
; three consecutive 32-bit s_cselect_b32.
; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle96:
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b32 s
; GCN-NEXT: s_cselect_b32 s
; GCN-NEXT: s_cselect_b32 s
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(ptr addrspace(4) %in, i32 %cond) #0 {
entry:
%v = load <3 x i32>, ptr addrspace(4) %in
%cc = icmp eq i32 %cond, 1
br i1 %cc, label %if, label %endif
if:
%u = add <3 x i32> %v, %v
br label %endif
endif:
%r = phi <3 x i32> [ %v, %entry ], [ %u, %if ]
; Widen to <4 x i32> for the asm operand; mask index 3 picks a lane from the
; undef second operand, so the fourth lane carries no defined value.
%r.ext = shufflevector <3 x i32> %r, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r.ext) #0
ret void
}
; 128-bit variant of test_scc1_sgpr_ifcvt_triangle: the selected value is a
; <4 x i32> loaded from addrspace(4). The checks expect four s_add_i32, then
; the scalar compare followed immediately by two consecutive s_cselect_b64.
; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle128:
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(ptr addrspace(4) %in, i32 %cond) #0 {
entry:
%v = load <4 x i32>, ptr addrspace(4) %in
%cc = icmp eq i32 %cond, 1
br i1 %cc, label %if, label %endif
if:
%u = add <4 x i32> %v, %v
br label %endif
endif:
%r = phi <4 x i32> [ %v, %entry ], [ %u, %if ]
; Side-effecting asm use keeps %r alive without a memory store.
call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r) #0
ret void
}
; Diamond with two empty arms that only feed constants into a phi, and with
; the branch targets swapped: the true edge of the compare goes to %else. The
; checks expect a scalar compare against 0 and an s_cselect_b32 between the
; immediates 0 and 1, with nothing following the 1 on that line.
; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc_constant_select:
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
; GCN: s_cselect_b32 s{{[0-9]+}}, 0, 1{{$}}
define amdgpu_kernel void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, ptr addrspace(1) %out) {
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %else, label %if
if:
br label %done
else:
br label %done
done:
; 0 when %cond != 0 (via %if), 1 when %cond == 0 (via %else).
%value = phi i32 [0, %if], [1, %else]
store i32 %value, ptr addrspace(1) %out
ret void
}
; Same constant-select diamond as
; uniform_if_swap_br_targets_scc_constant_select, but the branch condition is
; undef. The checks expect the first block to consist of the argument load
; followed directly by the s_cselect_b32, with no compare in between.
; GCN-LABEL: {{^}}ifcvt_undef_scc:
; GCN: {{^}}; %bb.0:
; GCN-NEXT: s_load_dwordx2
; GCN-NEXT: s_cselect_b32 s{{[0-9]+}}, 0, 1{{$}}
define amdgpu_kernel void @ifcvt_undef_scc(i32 %cond, ptr addrspace(1) %out) {
entry:
; %cond is deliberately unused; the condition is undef.
br i1 undef, label %else, label %if
if:
br label %done
else:
br label %done
done:
%value = phi i32 [0, %if], [1, %else]
store i32 %value, ptr addrspace(1) %out
ret void
}
; Triangle on a vcc condition (float compare on the kernel argument %cnd)
; where the selected value is a 256-bit <8 x i32> vector. The checks expect
; the branch to be kept: s_cbranch_vccnz to the join label, with v_add_i32
; instructions inside the skipped region. Only the first two adds are
; matched.
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle256:
; GCN: v_cmp_neq_f32
; GCN: s_cbranch_vccnz [[ENDIF:.LBB[0-9]+_[0-9]+]]
; GCN: v_add_i32
; GCN: v_add_i32
; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_ifcvt_triangle256(ptr addrspace(1) %out, ptr addrspace(1) %in, float %cnd) #0 {
entry:
%v = load <8 x i32>, ptr addrspace(1) %in
%cc = fcmp oeq float %cnd, 1.000000e+00
br i1 %cc, label %if, label %endif
if:
%u = add <8 x i32> %v, %v
br label %endif
endif:
%r = phi <8 x i32> [ %v, %entry ], [ %u, %if ]
store <8 x i32> %r, ptr addrspace(1) %out
ret void
}
; Triangle on a vcc condition (float compare on the kernel argument %cnd)
; where the selected value is a 512-bit <16 x i32> vector. The checks expect
; the branch to be kept: s_cbranch_vccnz to the join label, with v_add_i32
; instructions inside the skipped region. Only the first two adds are
; matched.
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle512:
; GCN: v_cmp_neq_f32
; GCN: s_cbranch_vccnz [[ENDIF:.LBB[0-9]+_[0-9]+]]
; GCN: v_add_i32
; GCN: v_add_i32
; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_ifcvt_triangle512(ptr addrspace(1) %out, ptr addrspace(1) %in, float %cnd) #0 {
entry:
%v = load <16 x i32>, ptr addrspace(1) %in
%cc = fcmp oeq float %cnd, 1.000000e+00
br i1 %cc, label %if, label %endif
if:
%u = add <16 x i32> %v, %v
br label %endif
endif:
%r = phi <16 x i32> [ %v, %entry ], [ %u, %if ]
store <16 x i32> %r, ptr addrspace(1) %out
ret void
}
attributes #0 = { nounwind }