llvm/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -o - %s | FileCheck -check-prefix=CHECK %s

; Straight-line case. The function carries "amdgpu-num-vgpr"="6" and keeps five
; loaded values live across an empty inline asm whose clobber list names v[0:4].
; The assertions below expect the first loaded value to stay in v5 across the
; asm, while the other four are stored to scratch (offsets 0, 4, 8, 12) before
; it and reloaded exactly once each afterwards, every reload carrying
; th:TH_LOAD_LU.
; NOTE(review): TH_LOAD_LU is presumably the "last use" hint that the test file
; name refers to -- confirm against the gfx12 ISA documentation.
define amdgpu_cs void @max_6_vgprs(ptr addrspace(1) %p) "amdgpu-num-vgpr"="6" {
; CHECK-LABEL: max_6_vgprs:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    global_load_b32 v2, v[0:1], off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; CHECK-NEXT:    v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; CHECK-NEXT:    global_load_b32 v5, v[0:1], off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    global_load_b32 v2, v[0:1], off offset:16 scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    scratch_store_b32 off, v2, off ; 4-byte Folded Spill
; CHECK-NEXT:    global_load_b32 v2, v[0:1], off offset:48 scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    scratch_store_b32 off, v2, off offset:4 ; 4-byte Folded Spill
; CHECK-NEXT:    global_load_b32 v2, v[0:1], off offset:96 scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    scratch_store_b32 off, v2, off offset:8 ; 4-byte Folded Spill
; CHECK-NEXT:    global_load_b32 v0, v[0:1], off offset:160 scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    scratch_store_b32 off, v0, off offset:12 ; 4-byte Folded Spill
; CHECK-NEXT:    ;;#ASMSTART
; CHECK-NEXT:    ;;#ASMEND
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:    global_store_b32 v[0:1], v5, off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:    scratch_load_b32 v0, off, off th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; CHECK-NEXT:    s_endpgm
  ; Runtime index: a volatile load through an undef pointer, so the address
  ; arithmetic below cannot be folded to a constant.
  %tid = load volatile i32, ptr addrspace(1) undef
  ; Chained GEPs. Relative to %p1 the cumulative i32 indices are 4, 12, 24 and
  ; 40, i.e. the byte offsets 16, 48, 96 and 160 that appear on the expected
  ; global loads above.
  %p1 = getelementptr inbounds i32, ptr addrspace(1) %p, i32 %tid
  %p2 = getelementptr inbounds i32, ptr addrspace(1) %p1, i32 4
  %p3 = getelementptr inbounds i32, ptr addrspace(1) %p2, i32 8
  %p4 = getelementptr inbounds i32, ptr addrspace(1) %p3, i32 12
  %p5 = getelementptr inbounds i32, ptr addrspace(1) %p4, i32 16
  ; Five volatile loads whose results must all survive the inline asm below.
  %v1 = load volatile i32, ptr addrspace(1) %p1
  %v2 = load volatile i32, ptr addrspace(1) %p2
  %v3 = load volatile i32, ptr addrspace(1) %p3
  %v4 = load volatile i32, ptr addrspace(1) %p4
  %v5 = load volatile i32, ptr addrspace(1) %p5
  ; Empty asm body; only the clobber of v[0:4] matters. It forces the values
  ; above out of those registers for the duration of the asm.
  call void asm sideeffect "", "~{v[0:4]}" ()
  ; Each value is consumed exactly once after the asm, so every expected
  ; scratch reload above is the only reload of its slot.
  store volatile i32 %v1, ptr addrspace(1) undef
  store volatile i32 %v2, ptr addrspace(1) undef
  store volatile i32 %v3, ptr addrspace(1) undef
  store volatile i32 %v4, ptr addrspace(1) undef
  store volatile i32 %v5, ptr addrspace(1) undef
  ret void
}

; Branching case. The function carries "amdgpu-num-vgpr"="11"; each branch
; contains an empty inline asm whose clobber list names v[0:9].
;   * %v7..%v10 are loaded in .entry. %v7 and %v8 are stored in both .true and
;     .false; %v9 and %v10 are stored only in .exit.
;   * %v1..%v6 are loaded and stored separately inside each branch.
; The assertions below expect the four .entry values to be stored to scratch
; offsets 8, 12, 0 and 4 (in load order), each branch to keep its first loaded
; value in v10 and store the other five to offsets 16..32, and every scratch
; reload -- in both branches and in .exit -- to carry th:TH_LOAD_LU.
; NOTE(review): TH_LOAD_LU is presumably the "last use" hint that the test file
; name refers to -- confirm against the gfx12 ISA documentation.
define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgpu-num-vgpr"="11" {
; CHECK-LABEL: max_11_vgprs_branch:
; CHECK:       ; %bb.0: ; %.entry
; CHECK-NEXT:    global_load_b32 v3, v[0:1], off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    s_mov_b32 s0, exec_lo
; CHECK-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; CHECK-NEXT:    v_lshlrev_b64_e32 v[3:4], 2, v[3:4]
; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
; CHECK-NEXT:    global_load_b32 v3, v[0:1], off offset:336 scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    scratch_store_b32 off, v3, off offset:8 ; 4-byte Folded Spill
; CHECK-NEXT:    global_load_b32 v3, v[0:1], off offset:448 scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    scratch_store_b32 off, v3, off offset:12 ; 4-byte Folded Spill
; CHECK-NEXT:    global_load_b32 v3, v[0:1], off offset:576 scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    scratch_store_b32 off, v3, off ; 4-byte Folded Spill
; CHECK-NEXT:    global_load_b32 v3, v[0:1], off offset:720 scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    scratch_store_b32 off, v3, off offset:4 ; 4-byte Folded Spill
; CHECK-NEXT:    v_cmpx_eq_u32_e32 0, v2
; CHECK-NEXT:    s_xor_b32 s0, exec_lo, s0
; CHECK-NEXT:    s_cbranch_execz .LBB1_2
; CHECK-NEXT:  ; %bb.1: ; %.false
; CHECK-NEXT:    global_load_b32 v10, v[0:1], off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    global_load_b32 v2, v[0:1], off offset:16 scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    scratch_store_b32 off, v2, off offset:16 ; 4-byte Folded Spill
; CHECK-NEXT:    global_load_b32 v2, v[0:1], off offset:48 scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    scratch_store_b32 off, v2, off offset:20 ; 4-byte Folded Spill
; CHECK-NEXT:    global_load_b32 v2, v[0:1], off offset:96 scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    scratch_store_b32 off, v2, off offset:24 ; 4-byte Folded Spill
; CHECK-NEXT:    global_load_b32 v2, v[0:1], off offset:160 scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    scratch_store_b32 off, v2, off offset:28 ; 4-byte Folded Spill
; CHECK-NEXT:    global_load_b32 v0, v[0:1], off offset:240 scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    scratch_store_b32 off, v0, off offset:32 ; 4-byte Folded Spill
; CHECK-NEXT:    ;;#ASMSTART
; CHECK-NEXT:    ;;#ASMEND
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:    global_store_b32 v[0:1], v10, off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:20 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:24 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:28 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:32 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:    ; implicit-def: $vgpr0
; CHECK-NEXT:    ; kill: killed $vgpr0
; CHECK-NEXT:    ; implicit-def: $vgpr0
; CHECK-NEXT:    ; kill: killed $vgpr0
; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT:  .LBB1_2: ; %Flow
; CHECK-NEXT:    s_and_not1_saveexec_b32 s0, s0
; CHECK-NEXT:    s_cbranch_execz .LBB1_4
; CHECK-NEXT:  ; %bb.3: ; %.true
; CHECK-NEXT:    global_load_b32 v10, v[0:1], off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    global_load_b32 v2, v[0:1], off offset:16 scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    scratch_store_b32 off, v2, off offset:16 ; 4-byte Folded Spill
; CHECK-NEXT:    global_load_b32 v2, v[0:1], off offset:48 scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    scratch_store_b32 off, v2, off offset:20 ; 4-byte Folded Spill
; CHECK-NEXT:    global_load_b32 v2, v[0:1], off offset:96 scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    scratch_store_b32 off, v2, off offset:24 ; 4-byte Folded Spill
; CHECK-NEXT:    global_load_b32 v2, v[0:1], off offset:160 scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    scratch_store_b32 off, v2, off offset:28 ; 4-byte Folded Spill
; CHECK-NEXT:    global_load_b32 v0, v[0:1], off offset:240 scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    scratch_store_b32 off, v0, off offset:32 ; 4-byte Folded Spill
; CHECK-NEXT:    ;;#ASMSTART
; CHECK-NEXT:    ;;#ASMEND
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:    global_store_b32 v[0:1], v10, off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:20 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:24 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:28 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:32 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:  .LBB1_4: ; %.exit
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; CHECK-NEXT:    scratch_load_b32 v0, off, off th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT:    s_wait_loadcnt 0x0
; CHECK-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT:    s_wait_storecnt 0x0
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; CHECK-NEXT:    s_endpgm
.entry:
  ; Runtime index: a volatile load through an undef pointer.
  %tid = load volatile i32, ptr addrspace(1) undef
  ; Chained GEPs. Relative to %p1 the cumulative i32 indices are 4, 12, 24, 40,
  ; 60, 84, 112, 144 and 180, i.e. byte offsets 16, 48, 96, 160, 240, 336, 448,
  ; 576 and 720 -- the offsets that appear on the expected global loads above.
  %p1 = getelementptr inbounds i32, ptr addrspace(1) %p, i32 %tid
  %p2 = getelementptr inbounds i32, ptr addrspace(1) %p1, i32 4
  %p3 = getelementptr inbounds i32, ptr addrspace(1) %p2, i32 8
  %p4 = getelementptr inbounds i32, ptr addrspace(1) %p3, i32 12
  %p5 = getelementptr inbounds i32, ptr addrspace(1) %p4, i32 16
  %p6 = getelementptr inbounds i32, ptr addrspace(1) %p5, i32 20
  %p7 = getelementptr inbounds i32, ptr addrspace(1) %p6, i32 24
  %p8 = getelementptr inbounds i32, ptr addrspace(1) %p7, i32 28
  %p9 = getelementptr inbounds i32, ptr addrspace(1) %p8, i32 32
  %p10 = getelementptr inbounds i32, ptr addrspace(1) %p9, i32 36
  ; Values loaded before the branch: %v7/%v8 are used in both successors,
  ; %v9/%v10 only after the branches rejoin in .exit.
  %v7 = load volatile i32, ptr addrspace(1) %p7
  %v8 = load volatile i32, ptr addrspace(1) %p8
  %v9 = load volatile i32, ptr addrspace(1) %p9
  %v10 = load volatile i32, ptr addrspace(1) %p10
  ; Branch on the second argument: non-zero goes to .true, zero to .false.
  %cmp = icmp ne i32 %tmp, 0
  br i1 %cmp, label %.true, label %.false

.true:
  ; Six branch-local volatile loads that must survive the asm below.
  %v1_t = load volatile i32, ptr addrspace(1) %p1
  %v2_t = load volatile i32, ptr addrspace(1) %p2
  %v3_t = load volatile i32, ptr addrspace(1) %p3
  %v4_t = load volatile i32, ptr addrspace(1) %p4
  %v5_t = load volatile i32, ptr addrspace(1) %p5
  %v6_t = load volatile i32, ptr addrspace(1) %p6
  ; Empty asm body; only the clobber of v[0:9] matters.
  call void asm sideeffect "", "~{v[0:9]}" ()
  ; Single use of each branch-local value, then of %v7 and %v8 from .entry.
  store volatile i32 %v1_t, ptr addrspace(1) undef
  store volatile i32 %v2_t, ptr addrspace(1) undef
  store volatile i32 %v3_t, ptr addrspace(1) undef
  store volatile i32 %v4_t, ptr addrspace(1) undef
  store volatile i32 %v5_t, ptr addrspace(1) undef
  store volatile i32 %v6_t, ptr addrspace(1) undef
  store volatile i32 %v7, ptr addrspace(1) undef
  store volatile i32 %v8, ptr addrspace(1) undef

  br label %.exit

.false:
  ; Mirror of .true: same loads, same clobbering asm, same stores.
  %v1_f = load volatile i32, ptr addrspace(1) %p1
  %v2_f = load volatile i32, ptr addrspace(1) %p2
  %v3_f = load volatile i32, ptr addrspace(1) %p3
  %v4_f = load volatile i32, ptr addrspace(1) %p4
  %v5_f = load volatile i32, ptr addrspace(1) %p5
  %v6_f = load volatile i32, ptr addrspace(1) %p6
  call void asm sideeffect "", "~{v[0:9]}" ()
  store volatile i32 %v1_f, ptr addrspace(1) undef
  store volatile i32 %v2_f, ptr addrspace(1) undef
  store volatile i32 %v3_f, ptr addrspace(1) undef
  store volatile i32 %v4_f, ptr addrspace(1) undef
  store volatile i32 %v5_f, ptr addrspace(1) undef
  store volatile i32 %v6_f, ptr addrspace(1) undef
  store volatile i32 %v7, ptr addrspace(1) undef
  store volatile i32 %v8, ptr addrspace(1) undef

  br label %.exit

.exit:
  ; The only uses of %v9 and %v10; the expected output reloads them from
  ; scratch offsets 0 and 4 here.
  store volatile i32 %v9, ptr addrspace(1) undef
  store volatile i32 %v10, ptr addrspace(1) undef
  ret void
}