llvm/llvm/test/CodeGen/AMDGPU/limit-soft-clause-reg-pressure.mir

# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+xnack -run-pass=si-form-memory-clauses -verify-machineinstrs -o - %s | FileCheck %s

# This previously would produce a bundle that could not be satisfied
# due to using nearly the entire register budget and not considering
# the alignment requirement of large SGPR tuples.

---
name: soft_clause_bundle_out_of_registers
tracksRegLiveness: true
machineFunctionInfo:
  isEntryFunction: true
  waveLimiter:     true
  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
  stackPtrOffsetReg: '$sgpr32'
  occupancy:       6
body:             |
  ; CHECK-LABEL: name: soft_clause_bundle_out_of_registers
  ; CHECK: bb.0:
  ; CHECK:   successors: %bb.1(0x80000000)
  ; CHECK:   [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 0, 0 :: (load (s512), align 4, addrspace 4)
  ; CHECK-NEXT:   [[S_LOAD_DWORDX16_IMM1:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 4096, 0 :: (load (s512), align 4, addrspace 4)
  ; CHECK-NEXT:   [[S_LOAD_DWORDX16_IMM2:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 8192, 0 :: (load (s512), align 4, addrspace 4)
  ; CHECK-NEXT:   [[S_LOAD_DWORDX16_IMM3:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 12288, 0 :: (load (s512), align 4, addrspace 4)
  ; CHECK-NEXT:   [[S_LOAD_DWORDX16_IMM4:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 64, 0 :: (load (s512), align 4, addrspace 4)
  ; CHECK-NEXT:   [[S_LOAD_DWORDX16_IMM5:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 4160, 0 :: (load (s512), align 4, addrspace 4)
  ; CHECK-NEXT:   [[S_LOAD_DWORDX16_IMM6:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 8256, 0 :: (load (s512), align 4, addrspace 4)
  ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL undef $sgpr4_sgpr5, 0, csr_amdgpu, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr70, implicit-def $sgpr80, implicit-def $sgpr90, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
  bb.0:
    liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6

    %0:sgpr_64 = COPY $sgpr4_sgpr5
    %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 0, 0 :: (dereferenceable invariant load (s64), addrspace 4)
    %2:vreg_64 = IMPLICIT_DEF

  bb.1:
    undef %3.sub0:sreg_64 = S_ADD_U32 %1.sub0, 0, implicit-def $scc
    %3.sub1:sreg_64 = S_ADDC_U32 %1.sub1, 0, implicit-def dead $scc, implicit killed $scc
    %4:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 0, 0 :: (load (s512), align 4, addrspace 4)
    %5:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 4096, 0 :: (load (s512), align 4, addrspace 4)
    %6:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 8192, 0 :: (load (s512), align 4, addrspace 4)
    %7:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 12288, 0 :: (load (s512), align 4, addrspace 4)
    %8:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 64, 0 :: (load (s512), align 4, addrspace 4)
    %9:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 4160, 0 :: (load (s512), align 4, addrspace 4)
    %10:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 8256, 0 :: (load (s512), align 4, addrspace 4)
    dead $sgpr30_sgpr31 = SI_CALL undef $sgpr4_sgpr5, 0, csr_amdgpu, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr70, implicit-def $sgpr80, implicit-def $sgpr90, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
    dead %11:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub1, 0, 0, implicit $mode, implicit $exec
    dead %12:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub2, 0, 0, implicit $mode, implicit $exec
    dead %13:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub3, 0, 0, implicit $mode, implicit $exec
    dead %14:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub4, 0, 0, implicit $mode, implicit $exec
    dead %15:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub5, 0, 0, implicit $mode, implicit $exec
    dead %16:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub6, 0, 0, implicit $mode, implicit $exec
    dead %17:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub7, 0, 0, implicit $mode, implicit $exec
    dead %18:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub8, 0, 0, implicit $mode, implicit $exec
    dead %19:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub9, 0, 0, implicit $mode, implicit $exec
    dead %20:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub10, 0, 0, implicit $mode, implicit $exec
    dead %21:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub11, 0, 0, implicit $mode, implicit $exec
    dead %22:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub12, 0, 0, implicit $mode, implicit $exec
    dead %23:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub13, 0, 0, implicit $mode, implicit $exec
    dead %24:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub14, 0, 0, implicit $mode, implicit $exec
    dead %25:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub15, 0, 0, implicit $mode, implicit $exec
    dead %26:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub0, 0, 0, implicit $mode, implicit $exec
    dead %27:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub1, 0, 0, implicit $mode, implicit $exec
    dead %28:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub2, 0, 0, implicit $mode, implicit $exec
    dead %29:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub3, 0, 0, implicit $mode, implicit $exec
    dead %30:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub4, 0, 0, implicit $mode, implicit $exec
    dead %31:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub5, 0, 0, implicit $mode, implicit $exec
    dead %32:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub6, 0, 0, implicit $mode, implicit $exec
    dead %33:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub7, 0, 0, implicit $mode, implicit $exec
    dead %34:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub8, 0, 0, implicit $mode, implicit $exec
    dead %35:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub9, 0, 0, implicit $mode, implicit $exec
    dead %36:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub10, 0, 0, implicit $mode, implicit $exec
    dead %37:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub11, 0, 0, implicit $mode, implicit $exec
    dead %38:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub12, 0, 0, implicit $mode, implicit $exec
    dead %39:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub13, 0, 0, implicit $mode, implicit $exec
    dead %40:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub14, 0, 0, implicit $mode, implicit $exec
    dead %41:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub15, 0, 0, implicit $mode, implicit $exec
    dead %42:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %8.sub0, 0, 0, implicit $mode, implicit $exec
    dead %43:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %8.sub1, 0, 0, implicit $mode, implicit $exec
    dead %44:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %8.sub2, 0, 0, implicit $mode, implicit $exec
    dead %45:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %8.sub3, 0, 0, implicit $mode, implicit $exec
    dead %46:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %8.sub4, 0, 0, implicit $mode, implicit $exec
    dead %47:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %8.sub5, 0, 0, implicit $mode, implicit $exec
    dead %48:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %8.sub8, 0, 0, implicit $mode, implicit $exec
    dead %49:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %8.sub9, 0, 0, implicit $mode, implicit $exec
    dead %50:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %8.sub10, 0, 0, implicit $mode, implicit $exec
    dead %51:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %8.sub11, 0, 0, implicit $mode, implicit $exec
    dead %52:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %8.sub12, 0, 0, implicit $mode, implicit $exec
    dead %53:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %8.sub14, 0, 0, implicit $mode, implicit $exec
    dead %54:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %8.sub15, 0, 0, implicit $mode, implicit $exec
    dead %55:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %9.sub10, 0, 0, implicit $mode, implicit $exec
    dead %56:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %6.sub0, 0, 0, implicit $mode, implicit $exec
    dead %57:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %7.sub10, 0, 0, implicit $mode, implicit $exec
    dead %58:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %10.sub0, 0, 0, implicit $mode, implicit $exec
    S_CMP_LG_U32 0, 0, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...

# Case with simple clause which exceeds the pressure limit, though
# didn't hit the register allocator error.

---
name: simple_huge_reg_tuple_clause
tracksRegLiveness: true
machineFunctionInfo:
  isEntryFunction: true
  waveLimiter:     false
  memoryBound: false
  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
  stackPtrOffsetReg: '$sgpr32'
  occupancy:       10

body:             |
  bb.0:
    liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6
    ; CHECK-LABEL: name: simple_huge_reg_tuple_clause
    ; CHECK: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %0, 0, 0 :: (load (s512), align 4, addrspace 4)
    ; CHECK-NEXT: [[S_LOAD_DWORDX16_IMM1:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %0, 64, 0 :: (load (s512), align 4, addrspace 4)
    ; CHECK-NEXT: [[S_LOAD_DWORDX16_IMM2:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %0, 4096, 0 :: (load (s512), align 4, addrspace 4)
    ; CHECK-NEXT: [[S_LOAD_DWORDX16_IMM3:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %0, 4160, 0 :: (load (s512), align 4, addrspace 4)
    ; CHECK-NEXT: S_NOP 0, implicit [[S_LOAD_DWORDX16_IMM]]
    %0:sreg_64 = COPY $sgpr4_sgpr5
    %1:sreg_64 = S_MOV_B64 0
    %2:sreg_64 = S_MOV_B64 1
    %3:sreg_64 = S_MOV_B64 2
    %4:sreg_64 = S_MOV_B64 3
    %5:sreg_64 = S_MOV_B64 4
    %6:sreg_64 = S_MOV_B64 5
    %7:sreg_64 = S_MOV_B64 6
    %8:sgpr_512 = S_LOAD_DWORDX16_IMM %0, 0, 0 :: (load (s512), align 4, addrspace 4)
    %9:sgpr_512 = S_LOAD_DWORDX16_IMM %0, 64, 0 :: (load (s512), align 4, addrspace 4)
    %10:sgpr_512 = S_LOAD_DWORDX16_IMM %0, 4096, 0 :: (load (s512), align 4, addrspace 4)
    %11:sgpr_512 = S_LOAD_DWORDX16_IMM %0, 4160, 0 :: (load (s512), align 4, addrspace 4)
    S_NOP 0, implicit %8
    S_NOP 0, implicit %9
    S_NOP 0, implicit %10
    S_NOP 0, implicit %11
    S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5
    S_NOP 0, implicit %6
    S_NOP 0, implicit %7
    S_ENDPGM 0

...