llvm/test/CodeGen/AMDGPU/amdpal-callable.ll

; RUN: llc -mtriple=amdgcn--amdpal -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG,GFX8 -enable-var-scope %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG,GFX9 -enable-var-scope %s
; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mattr=-xnack -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL,GFX9 -enable-var-scope %s
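; Check that every amdgpu_gfx callable function gets a .shader_functions entry in
; the amdpal.pipelines PAL metadata, reporting its backend stack size, stack frame
; size, LDS usage, and SGPR/VGPR counts (see the FileCheck lines at the end of the
; file).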

declare amdgpu_gfx float @extern_func(float) #0
declare amdgpu_gfx float @extern_func_many_args(<64 x float>) #0

@funcptr = external hidden unnamed_addr addrspace(4) constant ptr, align 4

define amdgpu_gfx float @no_stack(float %arg0) #0 {
  %add = fadd float %arg0, 1.0
  ret float %add
}

define amdgpu_gfx float @simple_stack(float %arg0) #0 {
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, ptr addrspace(5) %stack
  %val = load volatile float, ptr addrspace(5) %stack
  %add = fadd float %arg0, %val
  ret float %add
}

define amdgpu_gfx float @multiple_stack(float %arg0) #0 {
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, ptr addrspace(5) %stack
  %val = load volatile float, ptr addrspace(5) %stack
  %add = fadd float %arg0, %val
  %stack2 = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, ptr addrspace(5) %stack2
  %val2 = load volatile float, ptr addrspace(5) %stack2
  %add2 = fadd float %add, %val2
  ret float %add2
}

define amdgpu_gfx float @dynamic_stack(float %arg0) #0 {
bb0:
  %cmp = fcmp ogt float %arg0, 0.0
  br i1 %cmp, label %bb1, label %bb2

bb1:
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, ptr addrspace(5) %stack
  %val = load volatile float, ptr addrspace(5) %stack
  %add = fadd float %arg0, %val
  br label %bb2

bb2:
  %res = phi float [ 0.0, %bb0 ], [ %add, %bb1 ]
  ret float %res
}

define amdgpu_gfx float @dynamic_stack_loop(float %arg0) #0 {
bb0:
  br label %bb1

bb1:
  %ctr = phi i32 [ 0, %bb0 ], [ %newctr, %bb1 ]
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, ptr addrspace(5) %stack
  %val = load volatile float, ptr addrspace(5) %stack
  %add = fadd float %arg0, %val
  %cmp = icmp sgt i32 %ctr, 0
  %newctr = sub i32 %ctr, 1
  br i1 %cmp, label %bb1, label %bb2

bb2:
  ret float %add
}

define amdgpu_gfx float @no_stack_call(float %arg0) #0 {
  %res = call amdgpu_gfx float @simple_stack(float %arg0)
  ret float %res
}

define amdgpu_gfx float @simple_stack_call(float %arg0) #0 {
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, ptr addrspace(5) %stack
  %val = load volatile float, ptr addrspace(5) %stack
  %res = call amdgpu_gfx float @simple_stack(float %arg0)
  %add = fadd float %res, %val
  ret float %add
}

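; extern_func and the indirect callee loaded from @funcptr are not visible to the
; compiler, so the *_extern_call and *_indirect_call functions below are expected
; to report conservatively high SGPR/VGPR counts in the metadata checks.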
define amdgpu_gfx float @no_stack_extern_call(float %arg0) #0 {
  %res = call amdgpu_gfx float @extern_func(float %arg0)
  ret float %res
}

define amdgpu_gfx float @simple_stack_extern_call(float %arg0) #0 {
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, ptr addrspace(5) %stack
  %val = load volatile float, ptr addrspace(5) %stack
  %res = call amdgpu_gfx float @extern_func(float %arg0)
  %add = fadd float %res, %val
  ret float %add
}

define amdgpu_gfx float @no_stack_extern_call_many_args(<64 x float> %arg0) #0 {
  %res = call amdgpu_gfx float @extern_func_many_args(<64 x float> %arg0)
  ret float %res
}

define amdgpu_gfx float @no_stack_indirect_call(float %arg0) #0 {
  %fptr = load ptr, ptr addrspace(4) @funcptr
  call amdgpu_gfx void %fptr()
  ret float %arg0
}

define amdgpu_gfx float @simple_stack_indirect_call(float %arg0) #0 {
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, ptr addrspace(5) %stack
  %val = load volatile float, ptr addrspace(5) %stack
  %fptr = load ptr, ptr addrspace(4) @funcptr
  call amdgpu_gfx void %fptr()
  %add = fadd float %arg0, %val
  ret float %add
}

define amdgpu_gfx float @simple_stack_recurse(float %arg0) #0 {
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, ptr addrspace(5) %stack
  %val = load volatile float, ptr addrspace(5) %stack
  %res = call amdgpu_gfx float @simple_stack_recurse(float %arg0)
  %add = fadd float %res, %val
  ret float %add
}

@lds = internal addrspace(3) global [64 x float] undef
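; The 64 x float LDS block above is 0x100 bytes, reported as .lds_size for the
; simple_lds* functions below.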

define amdgpu_gfx float @simple_lds(float %arg0) #0 {
  %val = load float, ptr addrspace(3) @lds
  ret float %val
}

define amdgpu_gfx float @simple_lds_recurse(float %arg0) #0 {
  %val = load float, ptr addrspace(3) @lds
  %res = call amdgpu_gfx float @simple_lds_recurse(float %val)
  ret float %res
}

attributes #0 = { nounwind }

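; Expected PAL metadata: the per-pipeline register values, followed by one
; .shader_functions entry per function in name order, with stack sizes in bytes.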
; GCN: amdpal.pipelines:
; GCN-NEXT:  - .registers:
; SDAG-NEXT:     '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}}
; GISEL-NEXT:    '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01cb{{$}}
; GCN-NEXT:      '0x2e13 (COMPUTE_PGM_RSRC2)': 0x8001{{$}}
; GCN-NEXT:    .shader_functions:
; GCN-NEXT:      dynamic_stack:
; GCN-NEXT:        .backend_stack_size: 0x10{{$}}
; GCN-NEXT:        .lds_size:       0{{$}}
; GCN-NEXT:        .sgpr_count:     0x28{{$}}
; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
; SDAG-NEXT:        .vgpr_count:     0x2{{$}}
; GISEL-NEXT:        .vgpr_count:     0x3{{$}}
; GCN-NEXT:      dynamic_stack_loop:
; GCN-NEXT:        .backend_stack_size: 0x10{{$}}
; GCN-NEXT:        .lds_size:       0{{$}}
; SDAG-NEXT:        .sgpr_count:     0x25{{$}}
; GISEL-NEXT:        .sgpr_count:     0x27{{$}}
; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
; SDAG-NEXT:        .vgpr_count:     0x3{{$}}
; GISEL-NEXT:        .vgpr_count:     0x5{{$}}
; GCN-NEXT:      multiple_stack:
; GCN-NEXT:        .backend_stack_size: 0x24{{$}}
; GCN-NEXT:        .lds_size:       0{{$}}
; GCN-NEXT:        .sgpr_count:     0x21{{$}}
; GCN-NEXT:        .stack_frame_size_in_bytes: 0x24{{$}}
; GCN-NEXT:        .vgpr_count:     0x3{{$}}
; GCN-NEXT:      no_stack:
; GCN-NEXT:        .backend_stack_size: 0{{$}}
; GCN-NEXT:        .lds_size:       0{{$}}
; GCN-NEXT:        .sgpr_count:     0x20{{$}}
; GCN-NEXT:        .stack_frame_size_in_bytes: 0{{$}}
; GCN-NEXT:        .vgpr_count:     0x1{{$}}
; GCN-NEXT:      no_stack_call:
; GCN-NEXT:        .backend_stack_size: 0x10{{$}}
; GCN-NEXT:        .lds_size:       0{{$}}
; GCN-NEXT:        .sgpr_count:     0x25{{$}}
; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
; GCN-NEXT:        .vgpr_count:     0x3{{$}}
; GCN-NEXT:      no_stack_extern_call:
; GCN-NEXT:        .backend_stack_size: 0x10{{$}}
; GCN-NEXT:        .lds_size:       0{{$}}
; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
; GCN-NEXT:        .vgpr_count:     0x2b{{$}}
; GCN-NEXT:      no_stack_extern_call_many_args:
; GCN-NEXT:        .backend_stack_size: 0x90{{$}}
; GCN-NEXT:        .lds_size:       0{{$}}
; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
; GCN-NEXT:        .stack_frame_size_in_bytes: 0x90{{$}}
; GCN-NEXT:        .vgpr_count:     0x2b{{$}}
; GCN-NEXT:      no_stack_indirect_call:
; GCN-NEXT:        .backend_stack_size: 0x10{{$}}
; GCN-NEXT:        .lds_size:       0{{$}}
; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
; GCN-NEXT:        .vgpr_count:     0x2b{{$}}
; GCN-NEXT:      simple_lds:
; GCN-NEXT:        .backend_stack_size: 0{{$}}
; GCN-NEXT:        .lds_size:       0x100{{$}}
; GCN-NEXT:        .sgpr_count:     0x20{{$}}
; GCN-NEXT:        .stack_frame_size_in_bytes: 0{{$}}
; GCN-NEXT:        .vgpr_count:     0x1{{$}}
; GCN-NEXT:      simple_lds_recurse:
; GCN-NEXT:        .backend_stack_size: 0x10{{$}}
; GCN-NEXT:        .lds_size:       0x100{{$}}
; GCN-NEXT:        .sgpr_count:     0x28{{$}}
; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
; GCN-NEXT:        .vgpr_count:     0x29{{$}}
; GCN-NEXT:      simple_stack:
; GCN-NEXT:        .backend_stack_size: 0x14{{$}}
; GCN-NEXT:        .lds_size:       0{{$}}
; GCN-NEXT:        .sgpr_count:     0x21{{$}}
; GCN-NEXT:        .stack_frame_size_in_bytes: 0x14{{$}}
; GCN-NEXT:        .vgpr_count:     0x2{{$}}
; GCN-NEXT:      simple_stack_call:
; GCN-NEXT:        .backend_stack_size: 0x20{{$}}
; GCN-NEXT:        .lds_size:       0{{$}}
; GCN-NEXT:        .sgpr_count:     0x25{{$}}
; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20{{$}}
; GCN-NEXT:        .vgpr_count:     0x4{{$}}
; GCN-NEXT:      simple_stack_extern_call:
; GCN-NEXT:        .backend_stack_size: 0x20{{$}}
; GCN-NEXT:        .lds_size:       0{{$}}
; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20{{$}}
; GCN-NEXT:        .vgpr_count:     0x2b{{$}}
; GCN-NEXT:      simple_stack_indirect_call:
; GCN-NEXT:        .backend_stack_size: 0x20{{$}}
; GCN-NEXT:        .lds_size:       0{{$}}
; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20{{$}}
; GCN-NEXT:        .vgpr_count:     0x2b{{$}}
; GCN-NEXT:      simple_stack_recurse:
; GCN-NEXT:        .backend_stack_size: 0x20{{$}}
; GCN-NEXT:        .lds_size:       0{{$}}
; GCN-NEXT:        .sgpr_count:     0x28{{$}}
; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20{{$}}
; GCN-NEXT:        .vgpr_count:     0x2a{{$}}
; GCN-NEXT: ...