llvm/llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir

# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -run-pass=prologepilog,machine-cp -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s

# The COPY that moves the return value to VGPR0 should not be removed during machine-cp. The spill restore of the same register that follows
# is meant to reload only its inactive lanes. Marking the register itself as the tied-op in the spill reload prevents the undesired optimization.

---
# Scenario: the single WWM-reserved register ($vgpr0) is also the register
# that SI_RETURN uses. The input body writes an SGPR spill into $vgpr0 and
# then overwrites $vgpr0 with a COPY from $vgpr1 before returning it.
# Expected output: the COPY is still present after prologepilog + machine-cp,
# and the epilogue BUFFER_LOAD_DWORD_OFFSET that reloads $vgpr0 carries
# `implicit $vgpr0(tied-def 0)`.
name:            wwm_scratch_reg_spill_reload_of_outgoing_reg
tracksRegLiveness: true
machineFunctionInfo:
  # $vgpr0 is the only WWM-reserved register in this function.
  wwmReservedRegs: ['$vgpr0']
  isEntryFunction: false
  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
  stackPtrOffsetReg: '$sgpr32'
  frameOffsetReg: '$sgpr33'
body:             |
  bb.0:
    liveins: $sgpr20, $vgpr1
    ; GCN-LABEL: name: wwm_scratch_reg_spill_reload_of_outgoing_reg
    ; GCN: liveins: $sgpr20, $vgpr0, $vgpr1
    ; GCN-NEXT: {{  $}}
    ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
    ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
    ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
    ; GCN-NEXT: $vgpr0 = IMPLICIT_DEF
    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr0
    ; GCN-NEXT: $vgpr0 = COPY killed renamable $vgpr1, implicit $exec
    ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
    ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
    ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
    ; GCN-NEXT: SI_RETURN implicit $vgpr0
    $vgpr0 = IMPLICIT_DEF
    $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr0
    $vgpr0 = COPY killed renamable $vgpr1, implicit $exec
    SI_RETURN implicit $vgpr0
...

# The reload of vgpr0 requires the tied-op as it is a subreg of the outgoing tuple register vgpr0_vgpr1.
# The reload of vgpr2 doesn't need the tied-op as vgpr2 isn't holding any return value.
---
# Scenario: two WWM-reserved registers, $vgpr0 and $vgpr2. The function
# returns the tuple $vgpr0_vgpr1, so $vgpr0 is part of the returned register
# while $vgpr2 is not.
# Expected output: the reload of $vgpr0 carries `implicit $vgpr0(tied-def 0)`;
# the reload of $vgpr2 has no tied operand.
name:            wwm_scratch_reg_spill_reload_of_outgoing_tuple_subreg
tracksRegLiveness: true
machineFunctionInfo:
  # Both registers receive an SGPR spill in the body; only $vgpr0 overlaps
  # the register named by SI_RETURN.
  wwmReservedRegs: ['$vgpr0', '$vgpr2']
  isEntryFunction: false
  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
  stackPtrOffsetReg: '$sgpr32'
  frameOffsetReg: '$sgpr33'
body:             |
  bb.0:
    liveins: $sgpr20, $sgpr21, $vgpr1
    ; GCN-LABEL: name: wwm_scratch_reg_spill_reload_of_outgoing_tuple_subreg
    ; GCN: liveins: $sgpr20, $sgpr21, $vgpr0, $vgpr1, $vgpr2
    ; GCN-NEXT: {{  $}}
    ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
    ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
    ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
    ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
    ; GCN-NEXT: $vgpr0 = IMPLICIT_DEF
    ; GCN-NEXT: $vgpr2 = IMPLICIT_DEF
    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr0
    ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr21, 0, $vgpr2
    ; GCN-NEXT: $vgpr0 = COPY $vgpr1, implicit $exec
    ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
    ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
    ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
    ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
    ; GCN-NEXT: SI_RETURN implicit $vgpr0_vgpr1
    $vgpr0 = IMPLICIT_DEF
    $vgpr2 = IMPLICIT_DEF
    $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr0
    $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr21, 0, $vgpr2
    $vgpr0 = COPY $vgpr1, implicit $exec
    SI_RETURN implicit $vgpr0_vgpr1
...

# The tied-op is not required in the spill reload of vgpr2, as vgpr2 is not part of the outgoing register vgpr0_vgpr1.

---
# Scenario: the WWM-reserved register ($vgpr2) is disjoint from the returned
# tuple $vgpr0_vgpr1. The body spills $sgpr20 into $vgpr2 and separately
# copies $vgpr1 into $vgpr0 before returning.
# Expected output: the epilogue reload of $vgpr2 is a plain
# BUFFER_LOAD_DWORD_OFFSET with no tied operand.
name:            wwm_scratch_reg_spill_reload_different_outgoing_reg
tracksRegLiveness: true
machineFunctionInfo:
  # $vgpr2 does not overlap the register named by SI_RETURN.
  wwmReservedRegs: ['$vgpr2']
  isEntryFunction: false
  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
  stackPtrOffsetReg: '$sgpr32'
  frameOffsetReg: '$sgpr33'
body:             |
  bb.0:
    liveins: $sgpr20, $vgpr1
    ; GCN-LABEL: name: wwm_scratch_reg_spill_reload_different_outgoing_reg
    ; GCN: liveins: $sgpr20, $vgpr1, $vgpr2
    ; GCN-NEXT: {{  $}}
    ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
    ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
    ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
    ; GCN-NEXT: $vgpr2 = IMPLICIT_DEF
    ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr2
    ; GCN-NEXT: $vgpr0 = COPY $vgpr1, implicit $exec
    ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
    ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
    ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
    ; GCN-NEXT: SI_RETURN implicit $vgpr0_vgpr1
    $vgpr2 = IMPLICIT_DEF
    $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr2
    $vgpr0 = COPY $vgpr1, implicit $exec
    SI_RETURN implicit $vgpr0_vgpr1
...

# The tied-op is not required in the spill reload of vgpr40, which is in the CSR range.
---
# Scenario: the WWM-reserved register is $vgpr40, and the function returns
# $vgpr0. The body spills $sgpr20 into $vgpr40, restores it again, and copies
# $vgpr1 into $vgpr0 for the return.
# Expected output: the store/reload of $vgpr40 is wrapped in
# S_OR_SAVEEXEC_B64 -1 (the other functions in this file expect
# S_XOR_SAVEEXEC_B64 -1), and the reload has no tied operand.
# NOTE(review): the OR form presumably saves/restores all lanes because
# $vgpr40 is callee-saved - confirm against the prologue/epilogue lowering.
name:            wwm_csr_spill_reload
tracksRegLiveness: true
machineFunctionInfo:
  # $vgpr40 does not overlap the register named by SI_RETURN.
  wwmReservedRegs: ['$vgpr40']
  isEntryFunction: false
  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
  stackPtrOffsetReg: '$sgpr32'
  frameOffsetReg: '$sgpr33'
body:             |
  bb.0:
    liveins: $sgpr20, $vgpr1
    ; GCN-LABEL: name: wwm_csr_spill_reload
    ; GCN: liveins: $sgpr20, $vgpr1, $vgpr40
    ; GCN-NEXT: {{  $}}
    ; GCN-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
    ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
    ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
    ; GCN-NEXT: $vgpr40 = IMPLICIT_DEF
    ; GCN-NEXT: $vgpr40 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr40
    ; GCN-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr40, 0, implicit $exec
    ; GCN-NEXT: $vgpr0 = COPY killed $vgpr1, implicit $exec
    ; GCN-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
    ; GCN-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
    ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
    ; GCN-NEXT: SI_RETURN implicit $vgpr0
    $vgpr40 = IMPLICIT_DEF
    $vgpr40 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr40
    $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr40, 0, implicit $exec
    $vgpr0 = COPY killed $vgpr1, implicit $exec
    SI_RETURN implicit $vgpr0
...