# llvm/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir

# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX9 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX10 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX12 %s

---

# The loop contains a store and a use of a value loaded outside of the loop.
# We expect the waitcnt for the use to be hoisted on GFX9, but not on GFX10+
# because we have the vscnt counter.

# GFX9-LABEL: waitcnt_vm_loop
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop
# GFX12-LABEL: bb.0:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
# MIR input for waitcnt_vm_loop:
#   bb.0 issues a buffer load into $vgpr0 and branches to bb.1.
#   bb.1 is a self-loop: it issues a buffer store, reads $vgpr0 in a V_ADD_U32,
#        and conditionally branches back to itself (S_CBRANCH_SCC1).
#   bb.2 ends the program.
name:            waitcnt_vm_loop
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
---

# Same as before, but the loop preheader has no terminator.

# GFX9-LABEL: waitcnt_vm_loop_noterm
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop_noterm
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop_noterm
# GFX12-LABEL: bb.0:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
# MIR input for waitcnt_vm_loop_noterm:
#   bb.0 issues a buffer load into $vgpr0 and has no terminator instruction;
#        bb.1 is listed as its only successor.
#   bb.1 is a self-loop: it issues a buffer store, reads $vgpr0 in a V_ADD_U32,
#        and conditionally branches back to itself (S_CBRANCH_SCC1).
#   bb.2 ends the program.
name:            waitcnt_vm_loop_noterm
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec

  bb.1:
    successors: %bb.1, %bb.2

    BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
---

# Same as before but there is a preexisting waitcnt in the preheader.

# GFX9-LABEL: waitcnt_vm_loop_noterm_wait
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:
# MIR input for waitcnt_vm_loop_noterm_wait:
#   bb.0 issues a buffer load into $vgpr0 followed by an S_WAITCNT 3952 that is
#        already present in the input; the block has no terminator instruction.
#   bb.1 is a self-loop: it issues a buffer store, reads $vgpr0 in a V_ADD_U32,
#        and conditionally branches back to itself (S_CBRANCH_SCC1).
#   bb.2 ends the program.
# Only the gfx900 run line has checks for this function.
name:            waitcnt_vm_loop_noterm_wait
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_WAITCNT 3952

  bb.1:
    successors: %bb.1, %bb.2

    BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
---

# The loop contains a store, a load, and uses values loaded both inside and
# outside the loop.
# We do not expect the waitcnt to be hoisted out of the loop.

# GFX9-LABEL: waitcnt_vm_loop_load
# GFX9-LABEL: bb.0:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop_load
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop_load
# GFX12-LABEL: bb.0:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
# MIR input for waitcnt_vm_loop_load:
#   bb.0 issues a buffer load into $vgpr0 and branches to bb.1.
#   bb.1 is a self-loop: it issues a buffer store and a buffer load into
#        $vgpr7, then reads both $vgpr0 and $vgpr7 in a V_ADD_U32 before
#        conditionally branching back to itself (S_CBRANCH_SCC1).
#   bb.2 ends the program.
name:            waitcnt_vm_loop_load
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    $vgpr7 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr7, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
---

# The loop contains a use of a value loaded outside of the loop, and no store
# nor load.
# We do not expect the waitcnt to be hoisted out of the loop.

# GFX9-LABEL: waitcnt_vm_loop_no_store
# GFX9-LABEL: bb.0:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop_no_store
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop_no_store
# GFX12-LABEL: bb.0:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
# MIR input for waitcnt_vm_loop_no_store:
#   bb.0 issues a buffer load into $vgpr0 and branches to bb.1.
#   bb.1 is a self-loop containing no memory instruction: it only reads $vgpr0
#        in a V_ADD_U32 and conditionally branches back to itself
#        (S_CBRANCH_SCC1).
#   bb.2 ends the program.
name:            waitcnt_vm_loop_no_store
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
---

# The loop contains a store, no load, and doesn't use any value loaded inside
# or outside of the loop. There is only one use of the loaded value in the
# exit block.
# We don't expect any s_waitcnt vmcnt in the loop body or preheader, but expect
# one in the exit block.


# GFX9-LABEL: waitcnt_vm_loop_no_use
# GFX9-LABEL: bb.0:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop_no_use
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop_no_use
# GFX12-LABEL: bb.0:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
# MIR input for waitcnt_vm_loop_no_use:
#   bb.0 issues a buffer load into $vgpr0 and branches to bb.1.
#   bb.1 is a self-loop: it issues a buffer store and a V_ADD_U32 that reads
#        only $vgpr2, so $vgpr0 is not read inside the loop.
#   bb.2 (the exit block) reads $vgpr0 in a V_ADD_U32 and ends the program.
name:            waitcnt_vm_loop_no_use
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    $vgpr1 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    S_ENDPGM 0

...
---

# The loop loads a value that is not used in the loop, and uses a value loaded
# outside of the loop.
# We expect the waitcnt to be hoisted out of the loop to wait a single time before
# the loop is executed and avoid waiting for the load to complete on each
# iteration.

# GFX9-LABEL: waitcnt_vm_loop2
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop2
# GFX10-LABEL: bb.0:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop2
# GFX12-LABEL: bb.0:
# GFX12: BUFFER_LOAD_FORMAT_X_IDXEN
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
# MIR input for waitcnt_vm_loop2:
#   bb.0 issues a buffer load into $vgpr0 and branches to bb.1.
#   bb.1 is a self-loop: it reads $vgpr0 in a V_ADD_U32, then issues a buffer
#        load into $vgpr1; $vgpr1 is not read anywhere in the loop.
#   bb.2 ends the program.
name:            waitcnt_vm_loop2
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
---

# Same as before with an additional store in the loop. We still expect the
# waitcnt instructions to be hoisted.

# GFX9-LABEL: waitcnt_vm_loop2_store
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop2_store
# GFX10-LABEL: bb.0:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop2_store
# GFX12-LABEL: bb.0:
# GFX12: BUFFER_LOAD_FORMAT_X_IDXEN
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
# MIR input for waitcnt_vm_loop2_store:
#   bb.0 issues a buffer load into $vgpr0 and branches to bb.1.
#   bb.1 is a self-loop: it reads $vgpr0 in a V_ADD_U32, issues a buffer load
#        into $vgpr1 (not read in the loop), and then issues a buffer store.
#   bb.2 ends the program.
name:            waitcnt_vm_loop2_store
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
---

# Same as loop2 but the value loaded inside the loop is also used in the loop.
# We do not expect the waitcnt to be hoisted out of the loop.

# GFX9-LABEL: waitcnt_vm_loop2_use_in_loop
# GFX9-LABEL: bb.0:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop2_use_in_loop
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop2_use_in_loop
# GFX12-LABEL: bb.0:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
# MIR input for waitcnt_vm_loop2_use_in_loop:
#   bb.0 issues a buffer load into $vgpr0 and branches to bb.1.
#   bb.1 is a self-loop: it reads $vgpr0 in a V_ADD_U32, issues a buffer load
#        into $vgpr1, and then reads $vgpr1 in a second V_ADD_U32.
#   bb.2 ends the program.
name:            waitcnt_vm_loop2_use_in_loop
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    $vgpr4 = V_ADD_U32_e32 $vgpr5, $vgpr1, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
---

# The loop contains a use of a value loaded outside of the loop, but we already
# waited for that load to complete. The loop also loads a value that is not used
# in the loop. We do not expect any waitcnt in the loop on GFX9 and GFX10; the
# GFX12 checks below do expect an S_WAIT_LOADCNT in the loop block (bb.2).

# GFX9-LABEL: waitcnt_vm_loop2_nowait
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.3:

# GFX10-LABEL: waitcnt_vm_loop2_nowait
# GFX10-LABEL: bb.0:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.2:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.3:

# GFX12-LABEL: waitcnt_vm_loop2_nowait
# GFX12-LABEL: bb.0:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.3:
# MIR input for waitcnt_vm_loop2_nowait:
#   bb.0 issues a buffer load into $vgpr0 and reads $vgpr0 right away in a
#        V_ADD_U32, then branches to bb.1.
#   bb.1 is straight-line code: three V_ADD_U32 instructions that read $vgpr4
#        and $vgpr5 and write $vgpr3, followed by a branch to bb.2.
#   bb.2 is the self-loop: it reads $vgpr0 in a V_ADD_U32 and issues a buffer
#        load into $vgpr1, which is not read in the loop.
#   bb.3 ends the program.
name:            waitcnt_vm_loop2_nowait
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.2

    $vgpr3 = V_ADD_U32_e32 $vgpr4, $vgpr5, implicit $exec
    $vgpr3 = V_ADD_U32_e32 $vgpr4, $vgpr5, implicit $exec
    $vgpr3 = V_ADD_U32_e32 $vgpr4, $vgpr5, implicit $exec

    S_BRANCH %bb.2

  bb.2:
    successors: %bb.2, %bb.3

    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.2, implicit killed $scc
    S_BRANCH %bb.3

  bb.3:
    S_ENDPGM 0

...
---

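# Similar to waitcnt_vm_loop2, but for register intervals: the loads define
# multi-register tuples, and the loop reads only one register ($vgpr0) of the
# tuple loaded outside of the loop. The tuple loaded inside the loop is not
# read in the loop.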

# GFX9-LABEL: waitcnt_vm_loop2_reginterval
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop2_reginterval
# GFX10-LABEL: bb.0:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop2_reginterval
# GFX12-LABEL: bb.0:
# GFX12: GLOBAL_LOAD_DWORDX4
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
# MIR input for waitcnt_vm_loop2_reginterval:
#   bb.0 issues a GLOBAL_LOAD_DWORDX4 into $vgpr0_vgpr1_vgpr2_vgpr3 and
#        branches to bb.1.
#   bb.1 is a self-loop: it copies $vgpr0 into $vgpr10 and issues an
#        IMAGE_SAMPLE into $vgpr4_vgpr5_vgpr6_vgpr7; none of $vgpr4..$vgpr7 is
#        read in the loop.
#   bb.2 ends the program.
name:            waitcnt_vm_loop2_reginterval
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 $vgpr10_vgpr11, 0, 0, implicit $exec

    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    $vgpr10 = COPY $vgpr0

    $vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
---

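# Similar test case but for register intervals, where one register ($vgpr7) of
# the tuple loaded inside the loop is also read in the loop.
# We do not expect the waitcnt to be hoisted out of the loop.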

# GFX9-LABEL: waitcnt_vm_loop2_reginterval2
# GFX9-LABEL: bb.0:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop2_reginterval2
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop2_reginterval2
# GFX12-LABEL: bb.0:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
# MIR input for waitcnt_vm_loop2_reginterval2:
#   bb.0 issues a GLOBAL_LOAD_DWORDX4 into $vgpr0_vgpr1_vgpr2_vgpr3 and
#        branches to bb.1.
#   bb.1 is a self-loop: it copies $vgpr0 into $vgpr10, issues an IMAGE_SAMPLE
#        into $vgpr4_vgpr5_vgpr6_vgpr7, and then copies $vgpr7 (part of the
#        sampled tuple) into $vgpr11.
#   bb.2 ends the program.
name:            waitcnt_vm_loop2_reginterval2
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 $vgpr10_vgpr11, 0, 0, implicit $exec

    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    $vgpr10 = COPY $vgpr0

    $vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
    $vgpr11 = COPY $vgpr7
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
---

# The loop loads a value that is not used in the loop, but uses a value loaded
# outside of it. We expect the s_waitcnt instruction to be hoisted.
# A s_waitcnt vmcnt(0) is generated to flush in the preheader, but for this
# specific test case, it would be better to use vmcnt(1) instead. This is
# currently not implemented.

# GFX9-LABEL: waitcnt_vm_zero
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 3952
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_zero
# GFX10-LABEL: bb.0:
# GFX10: S_WAITCNT 16240
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT 16240
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_zero
# GFX12-LABEL: bb.0:
# GFX12: BUFFER_LOAD_FORMAT_X_IDXEN
# GFX12: BUFFER_LOAD_FORMAT_X_IDXEN
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:

# MIR input for waitcnt_vm_zero:
#   bb.0 issues two buffer loads, into $vgpr0 and then into $vgpr1, and
#        branches to bb.1.
#   bb.1 is a self-loop: a V_ADD_U32 reads $vgpr0 (and $vgpr3) and writes
#        $vgpr1, then a buffer load is issued into $vgpr2, which is not read in
#        the loop.
#   bb.2 ends the program.
name:            waitcnt_vm_zero
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr1, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr3, implicit $exec
    $vgpr2 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr3, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
---

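# This test case checks that we flush the vmcnt counter only if necessary
# (i.e. if a waitcnt is needed for the vgpr use we find in the loop)
# NOTE(review): the negative checks below all use the pattern S_WAITCNT, which
# is not a substring of the S_WAIT_LOADCNT spelling used by the gfx1200 run, so
# for that run they presumably cannot catch an unexpected load-counter wait.
# Confirm whether this is intended before tightening the pattern.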

# GFX10-LABEL: waitcnt_vm_necessary
# GFX10-LABEL: bb.0:
# GFX10: S_WAITCNT 16240
# GFX10: $vgpr4
# GFX10-NOT: S_WAITCNT
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT

# GFX12-LABEL: waitcnt_vm_necessary
# GFX12-LABEL: bb.0:
# GFX12: S_WAIT_LOADCNT 0
# GFX12: $vgpr4
# GFX12-NOT: S_WAITCNT
# GFX12-LABEL: bb.1:
# GFX12-NOT: S_WAITCNT

# GFX9-LABEL: waitcnt_vm_necessary
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 3952
# GFX9: $vgpr4
# GFX9-NOT: S_WAITCNT
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT

# MIR input for waitcnt_vm_necessary:
#   bb.0 issues a GLOBAL_LOAD_DWORDX4 into $vgpr0_vgpr1_vgpr2_vgpr3, then a
#        buffer load that reads $vgpr0 as an operand and writes $vgpr4. The
#        block has no terminator instruction; bb.1 is its only successor.
#   bb.1 is a self-loop: it issues a buffer load that reads $vgpr0 and writes
#        $vgpr5, conditionally branches back to itself (S_CBRANCH_SCC1), and
#        otherwise ends the program.
name:            waitcnt_vm_necessary
body:             |
  bb.0:
    successors: %bb.1(0x80000000)

    $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 killed $vgpr0_vgpr1, 0, 0, implicit $exec
    $vgpr4 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec

  bb.1:
    successors: %bb.1(0x40000000)

    $vgpr5 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_ENDPGM 0

...
---

# The loop contains a global store, and uses a value that was loaded from
# global memory outside of the loop.

# GFX9-LABEL: waitcnt_vm_loop_global_mem
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop_global_mem
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop_global_mem
# GFX12-LABEL: bb.0:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:

# MIR input for waitcnt_vm_loop_global_mem:
#   bb.0 issues a GLOBAL_LOAD_DWORD into $vgpr0 and branches to bb.1.
#   bb.1 is a self-loop: it reads $vgpr0 in a V_ADD_U32, issues a
#        GLOBAL_STORE_DWORD, and conditionally branches back to itself; there
#        is no explicit branch to bb.2, which is its other listed successor.
#   bb.2 only branches to bb.3.
#   bb.3 ends the program.
name:            waitcnt_vm_loop_global_mem
body:             |
  bb.0:
    successors: %bb.1
    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    GLOBAL_STORE_DWORD $vgpr4_vgpr5, $vgpr6, 0, 0, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc

  bb.2:
    successors: %bb.3
    S_BRANCH %bb.3

  bb.3:
    S_ENDPGM 0

...
---

# Same as above case, but use scratch memory instructions instead

# GFX9-LABEL: waitcnt_vm_loop_scratch_mem
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop_scratch_mem
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop_scratch_mem
# GFX12-LABEL: bb.0:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:

# MIR input for waitcnt_vm_loop_scratch_mem:
#   bb.0 issues a SCRATCH_LOAD_DWORD into $vgpr0 and branches to bb.1.
#   bb.1 is a self-loop: it reads $vgpr0 in a V_ADD_U32, issues a
#        SCRATCH_STORE_DWORD, and conditionally branches back to itself; there
#        is no explicit branch to bb.2, which is its other listed successor.
#   bb.2 only branches to bb.3.
#   bb.3 ends the program.
name:            waitcnt_vm_loop_scratch_mem
body:             |
  bb.0:
    successors: %bb.1
    $vgpr0 = SCRATCH_LOAD_DWORD $vgpr1, 0, 0, implicit $exec, implicit $flat_scr
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    SCRATCH_STORE_DWORD $vgpr4, $vgpr6, 0, 0, implicit $exec, implicit $flat_scr
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc

  bb.2:
    successors: %bb.3
    S_BRANCH %bb.3

  bb.3:
    S_ENDPGM 0

...
---

# Same as above case, but use flat memory instructions instead

# GFX9-LABEL: waitcnt_vm_loop_flat_mem
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop_flat_mem
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 11
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 11
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop_flat_mem
# GFX12-LABEL: bb.0:
# GFX12: FLAT_LOAD_DWORD
# GFX12-NOT: S_WAIT_LOADCNT_DSCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT_DSCNT 0
# GFX12-LABEL: bb.2:
# MIR input for waitcnt_vm_loop_flat_mem:
#   bb.0 issues a FLAT_LOAD_DWORD into $vgpr0 and branches to bb.1.
#   bb.1 is a self-loop: it reads $vgpr0 in a V_ADD_U32, issues a
#        FLAT_STORE_DWORD, and conditionally branches back to itself; there is
#        no explicit branch to bb.2, which is its other listed successor.
#   bb.2 only branches to bb.3.
#   bb.3 ends the program.
name:            waitcnt_vm_loop_flat_mem
body:             |
  bb.0:
    successors: %bb.1
    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    FLAT_STORE_DWORD $vgpr4_vgpr5, $vgpr6, 0, 0, implicit $exec, implicit $flat_scr
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc

  bb.2:
    successors: %bb.3
    S_BRANCH %bb.3

  bb.3:
    S_ENDPGM 0

...
---

# The loop contains a store, a load, and uses values loaded both inside and
# outside the loop. All memory accesses in this test are GLOBAL_LOAD_DWORD /
# GLOBAL_STORE_DWORD instructions.
# We do not expect the waitcnt to be hoisted out of the loop.

# GFX9-LABEL: waitcnt_vm_loop_flat_load
# GFX9-LABEL: bb.0:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop_flat_load
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop_flat_load
# GFX12-LABEL: bb.0:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
# MIR input for waitcnt_vm_loop_flat_load:
#   bb.0 issues a GLOBAL_LOAD_DWORD into $vgpr0 and branches to bb.1.
#   bb.1 is a self-loop: it issues a GLOBAL_STORE_DWORD and a GLOBAL_LOAD_DWORD
#        into $vgpr7, then reads both $vgpr0 and $vgpr7 in a V_ADD_U32 before
#        conditionally branching back to itself (S_CBRANCH_SCC1).
#   bb.2 ends the program.
name:            waitcnt_vm_loop_flat_load
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    GLOBAL_STORE_DWORD $vgpr4_vgpr5, $vgpr6, 0, 0, implicit $exec
    $vgpr7 = GLOBAL_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec
    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr7, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...