llvm/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll

; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=VI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s

; Make sure the stack is never realigned for entry functions.

define amdgpu_kernel void @max_alignment_128() #0 {
; VI-LABEL: max_alignment_128:
; VI:       ; %bb.0:
; VI-NEXT:    s_add_u32 s0, s0, s17
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    v_mov_b32_e32 v0, 3
; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, 9
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:128
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
; VI-NEXT:    .section .rodata,"a"
; VI-NEXT:    .p2align 6
; VI-NEXT:    .amdhsa_kernel max_alignment_128
; VI-NEXT:     .amdhsa_group_segment_fixed_size 0
; VI-NEXT:     .amdhsa_private_segment_fixed_size 256
; VI-NEXT:     .amdhsa_kernarg_size 56
; VI-NEXT:     .amdhsa_user_sgpr_count 14
; VI-NEXT:     .amdhsa_user_sgpr_private_segment_buffer 1
; VI-NEXT:     .amdhsa_user_sgpr_dispatch_ptr 1
; VI-NEXT:     .amdhsa_user_sgpr_queue_ptr 1
; VI-NEXT:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
; VI-NEXT:     .amdhsa_user_sgpr_dispatch_id 1
; VI-NEXT:     .amdhsa_user_sgpr_flat_scratch_init 1
; VI-NEXT:     .amdhsa_user_sgpr_private_segment_size 0
; VI-NEXT:     .amdhsa_system_sgpr_private_segment_wavefront_offset 1
; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_x 1
; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_y 1
; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
; VI-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
; VI-NEXT:     .amdhsa_system_vgpr_workitem_id 2
; VI-NEXT:     .amdhsa_next_free_vgpr 1
; VI-NEXT:     .amdhsa_next_free_sgpr 18
; VI-NEXT:     .amdhsa_reserve_vcc 0
; VI-NEXT:     .amdhsa_reserve_flat_scratch 0
; VI-NEXT:     .amdhsa_float_round_mode_32 0
; VI-NEXT:     .amdhsa_float_round_mode_16_64 0
; VI-NEXT:     .amdhsa_float_denorm_mode_32 3
; VI-NEXT:     .amdhsa_float_denorm_mode_16_64 3
; VI-NEXT:     .amdhsa_dx10_clamp 1
; VI-NEXT:     .amdhsa_ieee_mode 1
; VI-NEXT:     .amdhsa_exception_fp_ieee_invalid_op 0
; VI-NEXT:     .amdhsa_exception_fp_denorm_src 0
; VI-NEXT:     .amdhsa_exception_fp_ieee_div_zero 0
; VI-NEXT:     .amdhsa_exception_fp_ieee_overflow 0
; VI-NEXT:     .amdhsa_exception_fp_ieee_underflow 0
; VI-NEXT:     .amdhsa_exception_fp_ieee_inexact 0
; VI-NEXT:     .amdhsa_exception_int_div_zero 0
; VI-NEXT:    .end_amdhsa_kernel
; VI-NEXT:    .text
;
; GFX9-LABEL: max_alignment_128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_add_u32 s0, s0, s17
; GFX9-NEXT:    s_addc_u32 s1, s1, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 3
; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 9
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:128
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
; GFX9-NEXT:    .section .rodata,"a"
; GFX9-NEXT:    .p2align 6
; GFX9-NEXT:    .amdhsa_kernel max_alignment_128
; GFX9-NEXT:     .amdhsa_group_segment_fixed_size 0
; GFX9-NEXT:     .amdhsa_private_segment_fixed_size 256
; GFX9-NEXT:     .amdhsa_kernarg_size 56
; GFX9-NEXT:     .amdhsa_user_sgpr_count 14
; GFX9-NEXT:     .amdhsa_user_sgpr_private_segment_buffer 1
; GFX9-NEXT:     .amdhsa_user_sgpr_dispatch_ptr 1
; GFX9-NEXT:     .amdhsa_user_sgpr_queue_ptr 1
; GFX9-NEXT:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
; GFX9-NEXT:     .amdhsa_user_sgpr_dispatch_id 1
; GFX9-NEXT:     .amdhsa_user_sgpr_flat_scratch_init 1
; GFX9-NEXT:     .amdhsa_user_sgpr_private_segment_size 0
; GFX9-NEXT:     .amdhsa_system_sgpr_private_segment_wavefront_offset 1
; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_x 1
; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_y 1
; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
; GFX9-NEXT:     .amdhsa_system_vgpr_workitem_id 2
; GFX9-NEXT:     .amdhsa_next_free_vgpr 1
; GFX9-NEXT:     .amdhsa_next_free_sgpr 18
; GFX9-NEXT:     .amdhsa_reserve_vcc 0
; GFX9-NEXT:     .amdhsa_reserve_flat_scratch 0
; GFX9-NEXT:     .amdhsa_reserve_xnack_mask 1
; GFX9-NEXT:     .amdhsa_float_round_mode_32 0
; GFX9-NEXT:     .amdhsa_float_round_mode_16_64 0
; GFX9-NEXT:     .amdhsa_float_denorm_mode_32 3
; GFX9-NEXT:     .amdhsa_float_denorm_mode_16_64 3
; GFX9-NEXT:     .amdhsa_dx10_clamp 1
; GFX9-NEXT:     .amdhsa_ieee_mode 1
; GFX9-NEXT:     .amdhsa_fp16_overflow 0
; GFX9-NEXT:     .amdhsa_exception_fp_ieee_invalid_op 0
; GFX9-NEXT:     .amdhsa_exception_fp_denorm_src 0
; GFX9-NEXT:     .amdhsa_exception_fp_ieee_div_zero 0
; GFX9-NEXT:     .amdhsa_exception_fp_ieee_overflow 0
; GFX9-NEXT:     .amdhsa_exception_fp_ieee_underflow 0
; GFX9-NEXT:     .amdhsa_exception_fp_ieee_inexact 0
; GFX9-NEXT:     .amdhsa_exception_int_div_zero 0
; GFX9-NEXT:    .end_amdhsa_kernel
; GFX9-NEXT:    .text
  %clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca
  store volatile i8 3, ptr addrspace(5) %clutter
  %alloca.align = alloca i32, align 128, addrspace(5)
  store volatile i32 9, ptr addrspace(5) %alloca.align, align 128
  ret void
}

define amdgpu_kernel void @stackrealign_attr() #1 {
; VI-LABEL: stackrealign_attr:
; VI:       ; %bb.0:
; VI-NEXT:    s_add_u32 s0, s0, s17
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    v_mov_b32_e32 v0, 3
; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, 9
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
; VI-NEXT:    .section .rodata,"a"
; VI-NEXT:    .p2align 6
; VI-NEXT:    .amdhsa_kernel stackrealign_attr
; VI-NEXT:     .amdhsa_group_segment_fixed_size 0
; VI-NEXT:     .amdhsa_private_segment_fixed_size 12
; VI-NEXT:     .amdhsa_kernarg_size 56
; VI-NEXT:     .amdhsa_user_sgpr_count 14
; VI-NEXT:     .amdhsa_user_sgpr_private_segment_buffer 1
; VI-NEXT:     .amdhsa_user_sgpr_dispatch_ptr 1
; VI-NEXT:     .amdhsa_user_sgpr_queue_ptr 1
; VI-NEXT:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
; VI-NEXT:     .amdhsa_user_sgpr_dispatch_id 1
; VI-NEXT:     .amdhsa_user_sgpr_flat_scratch_init 1
; VI-NEXT:     .amdhsa_user_sgpr_private_segment_size 0
; VI-NEXT:     .amdhsa_system_sgpr_private_segment_wavefront_offset 1
; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_x 1
; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_y 1
; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
; VI-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
; VI-NEXT:     .amdhsa_system_vgpr_workitem_id 2
; VI-NEXT:     .amdhsa_next_free_vgpr 1
; VI-NEXT:     .amdhsa_next_free_sgpr 18
; VI-NEXT:     .amdhsa_reserve_vcc 0
; VI-NEXT:     .amdhsa_reserve_flat_scratch 0
; VI-NEXT:     .amdhsa_float_round_mode_32 0
; VI-NEXT:     .amdhsa_float_round_mode_16_64 0
; VI-NEXT:     .amdhsa_float_denorm_mode_32 3
; VI-NEXT:     .amdhsa_float_denorm_mode_16_64 3
; VI-NEXT:     .amdhsa_dx10_clamp 1
; VI-NEXT:     .amdhsa_ieee_mode 1
; VI-NEXT:     .amdhsa_exception_fp_ieee_invalid_op 0
; VI-NEXT:     .amdhsa_exception_fp_denorm_src 0
; VI-NEXT:     .amdhsa_exception_fp_ieee_div_zero 0
; VI-NEXT:     .amdhsa_exception_fp_ieee_overflow 0
; VI-NEXT:     .amdhsa_exception_fp_ieee_underflow 0
; VI-NEXT:     .amdhsa_exception_fp_ieee_inexact 0
; VI-NEXT:     .amdhsa_exception_int_div_zero 0
; VI-NEXT:    .end_amdhsa_kernel
; VI-NEXT:    .text
;
; GFX9-LABEL: stackrealign_attr:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_add_u32 s0, s0, s17
; GFX9-NEXT:    s_addc_u32 s1, s1, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 3
; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 9
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
; GFX9-NEXT:    .section .rodata,"a"
; GFX9-NEXT:    .p2align 6
; GFX9-NEXT:    .amdhsa_kernel stackrealign_attr
; GFX9-NEXT:     .amdhsa_group_segment_fixed_size 0
; GFX9-NEXT:     .amdhsa_private_segment_fixed_size 12
; GFX9-NEXT:     .amdhsa_kernarg_size 56
; GFX9-NEXT:     .amdhsa_user_sgpr_count 14
; GFX9-NEXT:     .amdhsa_user_sgpr_private_segment_buffer 1
; GFX9-NEXT:     .amdhsa_user_sgpr_dispatch_ptr 1
; GFX9-NEXT:     .amdhsa_user_sgpr_queue_ptr 1
; GFX9-NEXT:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
; GFX9-NEXT:     .amdhsa_user_sgpr_dispatch_id 1
; GFX9-NEXT:     .amdhsa_user_sgpr_flat_scratch_init 1
; GFX9-NEXT:     .amdhsa_user_sgpr_private_segment_size 0
; GFX9-NEXT:     .amdhsa_system_sgpr_private_segment_wavefront_offset 1
; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_x 1
; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_y 1
; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
; GFX9-NEXT:     .amdhsa_system_vgpr_workitem_id 2
; GFX9-NEXT:     .amdhsa_next_free_vgpr 1
; GFX9-NEXT:     .amdhsa_next_free_sgpr 18
; GFX9-NEXT:     .amdhsa_reserve_vcc 0
; GFX9-NEXT:     .amdhsa_reserve_flat_scratch 0
; GFX9-NEXT:     .amdhsa_reserve_xnack_mask 1
; GFX9-NEXT:     .amdhsa_float_round_mode_32 0
; GFX9-NEXT:     .amdhsa_float_round_mode_16_64 0
; GFX9-NEXT:     .amdhsa_float_denorm_mode_32 3
; GFX9-NEXT:     .amdhsa_float_denorm_mode_16_64 3
; GFX9-NEXT:     .amdhsa_dx10_clamp 1
; GFX9-NEXT:     .amdhsa_ieee_mode 1
; GFX9-NEXT:     .amdhsa_fp16_overflow 0
; GFX9-NEXT:     .amdhsa_exception_fp_ieee_invalid_op 0
; GFX9-NEXT:     .amdhsa_exception_fp_denorm_src 0
; GFX9-NEXT:     .amdhsa_exception_fp_ieee_div_zero 0
; GFX9-NEXT:     .amdhsa_exception_fp_ieee_overflow 0
; GFX9-NEXT:     .amdhsa_exception_fp_ieee_underflow 0
; GFX9-NEXT:     .amdhsa_exception_fp_ieee_inexact 0
; GFX9-NEXT:     .amdhsa_exception_int_div_zero 0
; GFX9-NEXT:    .end_amdhsa_kernel
; GFX9-NEXT:    .text
  %clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca
  store volatile i8 3, ptr addrspace(5) %clutter
  %alloca.align = alloca i32, align 4, addrspace(5)
  store volatile i32 9, ptr addrspace(5) %alloca.align, align 4
  ret void
}

define amdgpu_kernel void @alignstack_attr() #2 {
; VI-LABEL: alignstack_attr:
; VI:       ; %bb.0:
; VI-NEXT:    s_add_u32 s0, s0, s17
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    v_mov_b32_e32 v0, 3
; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, 9
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
; VI-NEXT:    .section .rodata,"a"
; VI-NEXT:    .p2align 6
; VI-NEXT:    .amdhsa_kernel alignstack_attr
; VI-NEXT:     .amdhsa_group_segment_fixed_size 0
; VI-NEXT:     .amdhsa_private_segment_fixed_size 128
; VI-NEXT:     .amdhsa_kernarg_size 56
; VI-NEXT:     .amdhsa_user_sgpr_count 14
; VI-NEXT:     .amdhsa_user_sgpr_private_segment_buffer 1
; VI-NEXT:     .amdhsa_user_sgpr_dispatch_ptr 1
; VI-NEXT:     .amdhsa_user_sgpr_queue_ptr 1
; VI-NEXT:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
; VI-NEXT:     .amdhsa_user_sgpr_dispatch_id 1
; VI-NEXT:     .amdhsa_user_sgpr_flat_scratch_init 1
; VI-NEXT:     .amdhsa_user_sgpr_private_segment_size 0
; VI-NEXT:     .amdhsa_system_sgpr_private_segment_wavefront_offset 1
; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_x 1
; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_y 1
; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
; VI-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
; VI-NEXT:     .amdhsa_system_vgpr_workitem_id 2
; VI-NEXT:     .amdhsa_next_free_vgpr 1
; VI-NEXT:     .amdhsa_next_free_sgpr 18
; VI-NEXT:     .amdhsa_reserve_vcc 0
; VI-NEXT:     .amdhsa_reserve_flat_scratch 0
; VI-NEXT:     .amdhsa_float_round_mode_32 0
; VI-NEXT:     .amdhsa_float_round_mode_16_64 0
; VI-NEXT:     .amdhsa_float_denorm_mode_32 3
; VI-NEXT:     .amdhsa_float_denorm_mode_16_64 3
; VI-NEXT:     .amdhsa_dx10_clamp 1
; VI-NEXT:     .amdhsa_ieee_mode 1
; VI-NEXT:     .amdhsa_exception_fp_ieee_invalid_op 0
; VI-NEXT:     .amdhsa_exception_fp_denorm_src 0
; VI-NEXT:     .amdhsa_exception_fp_ieee_div_zero 0
; VI-NEXT:     .amdhsa_exception_fp_ieee_overflow 0
; VI-NEXT:     .amdhsa_exception_fp_ieee_underflow 0
; VI-NEXT:     .amdhsa_exception_fp_ieee_inexact 0
; VI-NEXT:     .amdhsa_exception_int_div_zero 0
; VI-NEXT:    .end_amdhsa_kernel
; VI-NEXT:    .text
;
; GFX9-LABEL: alignstack_attr:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_add_u32 s0, s0, s17
; GFX9-NEXT:    s_addc_u32 s1, s1, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 3
; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 9
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
; GFX9-NEXT:    .section .rodata,"a"
; GFX9-NEXT:    .p2align 6
; GFX9-NEXT:    .amdhsa_kernel alignstack_attr
; GFX9-NEXT:     .amdhsa_group_segment_fixed_size 0
; GFX9-NEXT:     .amdhsa_private_segment_fixed_size 128
; GFX9-NEXT:     .amdhsa_kernarg_size 56
; GFX9-NEXT:     .amdhsa_user_sgpr_count 14
; GFX9-NEXT:     .amdhsa_user_sgpr_private_segment_buffer 1
; GFX9-NEXT:     .amdhsa_user_sgpr_dispatch_ptr 1
; GFX9-NEXT:     .amdhsa_user_sgpr_queue_ptr 1
; GFX9-NEXT:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
; GFX9-NEXT:     .amdhsa_user_sgpr_dispatch_id 1
; GFX9-NEXT:     .amdhsa_user_sgpr_flat_scratch_init 1
; GFX9-NEXT:     .amdhsa_user_sgpr_private_segment_size 0
; GFX9-NEXT:     .amdhsa_system_sgpr_private_segment_wavefront_offset 1
; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_x 1
; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_y 1
; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
; GFX9-NEXT:     .amdhsa_system_vgpr_workitem_id 2
; GFX9-NEXT:     .amdhsa_next_free_vgpr 1
; GFX9-NEXT:     .amdhsa_next_free_sgpr 18
; GFX9-NEXT:     .amdhsa_reserve_vcc 0
; GFX9-NEXT:     .amdhsa_reserve_flat_scratch 0
; GFX9-NEXT:     .amdhsa_reserve_xnack_mask 1
; GFX9-NEXT:     .amdhsa_float_round_mode_32 0
; GFX9-NEXT:     .amdhsa_float_round_mode_16_64 0
; GFX9-NEXT:     .amdhsa_float_denorm_mode_32 3
; GFX9-NEXT:     .amdhsa_float_denorm_mode_16_64 3
; GFX9-NEXT:     .amdhsa_dx10_clamp 1
; GFX9-NEXT:     .amdhsa_ieee_mode 1
; GFX9-NEXT:     .amdhsa_fp16_overflow 0
; GFX9-NEXT:     .amdhsa_exception_fp_ieee_invalid_op 0
; GFX9-NEXT:     .amdhsa_exception_fp_denorm_src 0
; GFX9-NEXT:     .amdhsa_exception_fp_ieee_div_zero 0
; GFX9-NEXT:     .amdhsa_exception_fp_ieee_overflow 0
; GFX9-NEXT:     .amdhsa_exception_fp_ieee_underflow 0
; GFX9-NEXT:     .amdhsa_exception_fp_ieee_inexact 0
; GFX9-NEXT:     .amdhsa_exception_int_div_zero 0
; GFX9-NEXT:    .end_amdhsa_kernel
; GFX9-NEXT:    .text
  %clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca
  store volatile i8 3, ptr addrspace(5) %clutter
  %alloca.align = alloca i32, align 4, addrspace(5)
  store volatile i32 9, ptr addrspace(5) %alloca.align, align 4
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind "stackrealign" }
attributes #2 = { nounwind alignstack=128 }

!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 400}