llvm/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-remove-redundant-endcf -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s

; Disabled endcf collapse at -O0.
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -O0 -amdgpu-remove-redundant-endcf -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN-O0 %s

; Note: Breaking large PHIs is disabled to prevent branches from being eliminated (in scc_liveness)

; Nested if where the inner if's join point is also the outer if's join point:
; both %bb.outer.then (when %tmp == 2) and %bb.inner.then branch straight to
; %bb.outer.end. In the optimized checks below the inner branch only narrows
; exec (s_and_b64 exec, exec, vcc) and a single s_or_b64 at .LBB0_3 restores
; it; the -O0 checks keep a separate exec restore for each nesting level.
define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: simple_nested_if:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, 1, v0
; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT:    s_cbranch_execz .LBB0_3
; GCN-NEXT:  ; %bb.1: ; %bb.outer.then
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, 0
; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 2, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    buffer_store_dword v2, v[1:2], s[0:3], 0 addr64
; GCN-NEXT:    s_and_b64 exec, exec, vcc
; GCN-NEXT:    s_cbranch_execz .LBB0_3
; GCN-NEXT:  ; %bb.2: ; %bb.inner.then
; GCN-NEXT:    s_waitcnt expcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v2, s1
; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v1
; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
; GCN-NEXT:    s_mov_b32 s0, s2
; GCN-NEXT:    s_mov_b32 s1, s2
; GCN-NEXT:    v_mov_b32_e32 v2, 1
; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:4
; GCN-NEXT:  .LBB0_3: ; %bb.outer.end
; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v0, 3
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    s_mov_b32 m0, -1
; GCN-NEXT:    ds_write_b32 v1, v0
; GCN-NEXT:    s_endpgm
;
; GCN-O0-LABEL: simple_nested_if:
; GCN-O0:       ; %bb.0: ; %bb
; GCN-O0-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-O0-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT:    s_mov_b32 s14, -1
; GCN-O0-NEXT:    s_mov_b32 s15, 0xe8f000
; GCN-O0-NEXT:    s_add_u32 s12, s12, s9
; GCN-O0-NEXT:    s_addc_u32 s13, s13, 0
; GCN-O0-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN-O0-NEXT:    ; implicit-def: $vgpr4 : SGPR spill to VGPR lane
; GCN-O0-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT:    v_writelane_b32 v4, s0, 0
; GCN-O0-NEXT:    v_writelane_b32 v4, s1, 1
; GCN-O0-NEXT:    v_mov_b32_e32 v1, v0
; GCN-O0-NEXT:    buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b32 s0, 1
; GCN-O0-NEXT:    v_cmp_gt_u32_e64 s[2:3], v0, s0
; GCN-O0-NEXT:    s_mov_b64 s[0:1], exec
; GCN-O0-NEXT:    v_writelane_b32 v4, s0, 2
; GCN-O0-NEXT:    v_writelane_b32 v4, s1, 3
; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT:    buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT:    s_cbranch_execz .LBB0_4
; GCN-O0-NEXT:  ; %bb.1: ; %bb.outer.then
; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_readlane_b32 s4, v4, 0
; GCN-O0-NEXT:    v_readlane_b32 s5, v4, 1
; GCN-O0-NEXT:    s_mov_b32 s2, 0xf000
; GCN-O0-NEXT:    s_mov_b32 s0, 0
; GCN-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT:    s_mov_b32 s1, s2
; GCN-O0-NEXT:    ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-O0-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GCN-O0-NEXT:    v_ashrrev_i32_e64 v3, 31, v0
; GCN-O0-NEXT:    v_mov_b32_e32 v1, v0
; GCN-O0-NEXT:    v_mov_b32_e32 v2, v3
; GCN-O0-NEXT:    s_mov_b32 s0, 2
; GCN-O0-NEXT:    v_lshl_b64 v[2:3], v[1:2], s0
; GCN-O0-NEXT:    v_mov_b32_e32 v1, 0
; GCN-O0-NEXT:    buffer_store_dword v1, v[2:3], s[4:7], 0 addr64
; GCN-O0-NEXT:    v_cmp_ne_u32_e64 s[2:3], v0, s0
; GCN-O0-NEXT:    s_mov_b64 s[0:1], exec
; GCN-O0-NEXT:    v_writelane_b32 v4, s0, 4
; GCN-O0-NEXT:    v_writelane_b32 v4, s1, 5
; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT:    buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT:    s_cbranch_execz .LBB0_3
; GCN-O0-NEXT:  ; %bb.2: ; %bb.inner.then
; GCN-O0-NEXT:    s_waitcnt expcnt(1)
; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_readlane_b32 s0, v4, 0
; GCN-O0-NEXT:    v_readlane_b32 s1, v4, 1
; GCN-O0-NEXT:    v_mov_b32_e32 v0, 1
; GCN-O0-NEXT:    v_add_i32_e64 v1, s[2:3], v1, v0
; GCN-O0-NEXT:    v_ashrrev_i32_e64 v3, 31, v1
; GCN-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GCN-O0-NEXT:    v_mov_b32_e32 v2, v3
; GCN-O0-NEXT:    s_mov_b32 s2, 2
; GCN-O0-NEXT:    v_lshl_b64 v[1:2], v[1:2], s2
; GCN-O0-NEXT:    s_mov_b32 s2, 0xf000
; GCN-O0-NEXT:    s_mov_b32 s4, 0
; GCN-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT:    s_mov_b32 s5, s2
; GCN-O0-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; GCN-O0-NEXT:  .LBB0_3: ; %Flow
; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_readlane_b32 s0, v4, 4
; GCN-O0-NEXT:    v_readlane_b32 s1, v4, 5
; GCN-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT:  .LBB0_4: ; %bb.outer.end
; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_readlane_b32 s0, v4, 2
; GCN-O0-NEXT:    v_readlane_b32 s1, v4, 3
; GCN-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT:    v_mov_b32_e32 v1, 3
; GCN-O0-NEXT:    v_mov_b32_e32 v0, 0
; GCN-O0-NEXT:    s_mov_b32 m0, -1
; GCN-O0-NEXT:    ds_write_b32 v0, v1
; GCN-O0-NEXT:    s_endpgm
bb:
  ; Outer condition: only work items whose id.x (%tmp) is > 1 enter the outer
  ; region; the rest go directly to the common exit block.
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = icmp ugt i32 %tmp, 1
  br i1 %tmp1, label %bb.outer.then, label %bb.outer.end

bb.outer.then:                                    ; preds = %bb
  ; Store 0 to %arg[%tmp]. Inner condition: work items with %tmp == 2 skip
  ; %bb.inner.then and go straight to %bb.outer.end.
  %tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
  store i32 0, ptr addrspace(1) %tmp4, align 4
  %tmp5 = icmp eq i32 %tmp, 2
  br i1 %tmp5, label %bb.outer.end, label %bb.inner.then

bb.inner.then:                                    ; preds = %bb.outer.then
  ; Store 1 to %arg[%tmp + 1], then fall into the shared exit block.
  %tmp7 = add i32 %tmp, 1
  %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp7
  store i32 1, ptr addrspace(1) %tmp9, align 4
  br label %bb.outer.end

bb.outer.end:                                     ; preds = %bb.outer.then, %bb.inner.then, %bb
  ; Shared join point of both nesting levels: store 3 through a null
  ; addrspace(3) pointer (the ds_write_b32 in the checks above).
  store i32 3, ptr addrspace(3) null
  ret void
}

; Nested if where the inner if has its own join block (%bb.inner.end) that
; performs a store before reaching %bb.outer.end, so the two join points are
; distinct. The optimized checks below restore exec separately for each
; nesting level (s_or_b64 at .LBB1_3 and again at .LBB1_4).
define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: uncollapsable_nested_if:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, 1, v0
; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT:    s_cbranch_execz .LBB1_4
; GCN-NEXT:  ; %bb.1: ; %bb.outer.then
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
; GCN-NEXT:    v_mov_b32_e32 v4, 0
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v2, s1
; GCN-NEXT:    v_add_i32_e32 v1, vcc, s0, v3
; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 2, v0
; GCN-NEXT:    buffer_store_dword v4, v[3:4], s[0:3], 0 addr64
; GCN-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; GCN-NEXT:    s_cbranch_execz .LBB1_3
; GCN-NEXT:  ; %bb.2: ; %bb.inner.then
; GCN-NEXT:    s_mov_b32 s0, s2
; GCN-NEXT:    s_mov_b32 s1, s2
; GCN-NEXT:    v_mov_b32_e32 v0, 1
; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:4
; GCN-NEXT:  .LBB1_3: ; %bb.inner.end
; GCN-NEXT:    s_or_b64 exec, exec, s[6:7]
; GCN-NEXT:    s_mov_b32 s0, s2
; GCN-NEXT:    s_mov_b32 s1, s2
; GCN-NEXT:    s_waitcnt expcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, 2
; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8
; GCN-NEXT:  .LBB1_4: ; %Flow
; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
; GCN-NEXT:    s_waitcnt expcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, 3
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    s_mov_b32 m0, -1
; GCN-NEXT:    ds_write_b32 v1, v0
; GCN-NEXT:    s_endpgm
;
; GCN-O0-LABEL: uncollapsable_nested_if:
; GCN-O0:       ; %bb.0: ; %bb
; GCN-O0-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-O0-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT:    s_mov_b32 s14, -1
; GCN-O0-NEXT:    s_mov_b32 s15, 0xe8f000
; GCN-O0-NEXT:    s_add_u32 s12, s12, s9
; GCN-O0-NEXT:    s_addc_u32 s13, s13, 0
; GCN-O0-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN-O0-NEXT:    ; implicit-def: $vgpr4 : SGPR spill to VGPR lane
; GCN-O0-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT:    v_writelane_b32 v4, s0, 0
; GCN-O0-NEXT:    v_writelane_b32 v4, s1, 1
; GCN-O0-NEXT:    v_mov_b32_e32 v1, v0
; GCN-O0-NEXT:    buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b32 s0, 1
; GCN-O0-NEXT:    v_cmp_gt_u32_e64 s[2:3], v0, s0
; GCN-O0-NEXT:    s_mov_b64 s[0:1], exec
; GCN-O0-NEXT:    v_writelane_b32 v4, s0, 2
; GCN-O0-NEXT:    v_writelane_b32 v4, s1, 3
; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT:    buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT:    s_cbranch_execz .LBB1_3
; GCN-O0-NEXT:  ; %bb.1: ; %bb.outer.then
; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_readlane_b32 s4, v4, 0
; GCN-O0-NEXT:    v_readlane_b32 s5, v4, 1
; GCN-O0-NEXT:    s_mov_b32 s2, 0xf000
; GCN-O0-NEXT:    s_mov_b32 s0, 0
; GCN-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT:    s_mov_b32 s1, s2
; GCN-O0-NEXT:    ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-O0-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GCN-O0-NEXT:    v_ashrrev_i32_e64 v3, 31, v0
; GCN-O0-NEXT:    v_mov_b32_e32 v1, v0
; GCN-O0-NEXT:    v_mov_b32_e32 v2, v3
; GCN-O0-NEXT:    s_mov_b32 s0, 2
; GCN-O0-NEXT:    v_lshl_b64 v[2:3], v[1:2], s0
; GCN-O0-NEXT:    v_mov_b32_e32 v1, 0
; GCN-O0-NEXT:    buffer_store_dword v1, v[2:3], s[4:7], 0 addr64
; GCN-O0-NEXT:    v_cmp_ne_u32_e64 s[2:3], v0, s0
; GCN-O0-NEXT:    s_mov_b64 s[0:1], exec
; GCN-O0-NEXT:    v_writelane_b32 v4, s0, 4
; GCN-O0-NEXT:    v_writelane_b32 v4, s1, 5
; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT:    buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT:    s_cbranch_execz .LBB1_4
; GCN-O0-NEXT:  ; %bb.2: ; %bb.inner.then
; GCN-O0-NEXT:    s_waitcnt expcnt(1)
; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_readlane_b32 s0, v4, 0
; GCN-O0-NEXT:    v_readlane_b32 s1, v4, 1
; GCN-O0-NEXT:    v_mov_b32_e32 v0, 1
; GCN-O0-NEXT:    v_add_i32_e64 v1, s[2:3], v1, v0
; GCN-O0-NEXT:    v_ashrrev_i32_e64 v3, 31, v1
; GCN-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GCN-O0-NEXT:    v_mov_b32_e32 v2, v3
; GCN-O0-NEXT:    s_mov_b32 s2, 2
; GCN-O0-NEXT:    v_lshl_b64 v[1:2], v[1:2], s2
; GCN-O0-NEXT:    s_mov_b32 s2, 0xf000
; GCN-O0-NEXT:    s_mov_b32 s4, 0
; GCN-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT:    s_mov_b32 s5, s2
; GCN-O0-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; GCN-O0-NEXT:    s_branch .LBB1_4
; GCN-O0-NEXT:  .LBB1_3: ; %Flow
; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_readlane_b32 s0, v4, 2
; GCN-O0-NEXT:    v_readlane_b32 s1, v4, 3
; GCN-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT:    s_branch .LBB1_5
; GCN-O0-NEXT:  .LBB1_4: ; %bb.inner.end
; GCN-O0-NEXT:    s_waitcnt expcnt(1)
; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_readlane_b32 s2, v4, 4
; GCN-O0-NEXT:    v_readlane_b32 s3, v4, 5
; GCN-O0-NEXT:    s_or_b64 exec, exec, s[2:3]
; GCN-O0-NEXT:    v_readlane_b32 s0, v4, 0
; GCN-O0-NEXT:    v_readlane_b32 s1, v4, 1
; GCN-O0-NEXT:    v_mov_b32_e32 v0, 2
; GCN-O0-NEXT:    v_add_i32_e64 v1, s[2:3], v1, v0
; GCN-O0-NEXT:    v_ashrrev_i32_e64 v3, 31, v1
; GCN-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GCN-O0-NEXT:    v_mov_b32_e32 v2, v3
; GCN-O0-NEXT:    v_lshl_b64 v[1:2], v[1:2], v0
; GCN-O0-NEXT:    s_mov_b32 s2, 0xf000
; GCN-O0-NEXT:    s_mov_b32 s4, 0
; GCN-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT:    s_mov_b32 s5, s2
; GCN-O0-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; GCN-O0-NEXT:    s_branch .LBB1_3
; GCN-O0-NEXT:  .LBB1_5: ; %bb.outer.end
; GCN-O0-NEXT:    v_mov_b32_e32 v1, 3
; GCN-O0-NEXT:    v_mov_b32_e32 v0, 0
; GCN-O0-NEXT:    s_mov_b32 m0, -1
; GCN-O0-NEXT:    ds_write_b32 v0, v1
; GCN-O0-NEXT:    s_endpgm
bb:
  ; Outer condition: only work items whose id.x (%tmp) is > 1 enter the outer
  ; region; the rest go directly to %bb.outer.end.
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = icmp ugt i32 %tmp, 1
  br i1 %tmp1, label %bb.outer.then, label %bb.outer.end

bb.outer.then:                                    ; preds = %bb
  ; Store 0 to %arg[%tmp]. Inner condition: work items with %tmp == 2 skip
  ; %bb.inner.then but still pass through %bb.inner.end.
  %tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
  store i32 0, ptr addrspace(1) %tmp4, align 4
  %tmp5 = icmp eq i32 %tmp, 2
  br i1 %tmp5, label %bb.inner.end, label %bb.inner.then

bb.inner.then:                                    ; preds = %bb.outer.then
  ; Store 1 to %arg[%tmp + 1].
  %tmp7 = add i32 %tmp, 1
  %tmp8 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp7
  store i32 1, ptr addrspace(1) %tmp8, align 4
  br label %bb.inner.end

bb.inner.end:                                     ; preds = %bb.inner.then, %bb.outer.then
  ; Inner join point with real work: store 2 to %arg[%tmp + 2]. This store
  ; sits between the inner join and the outer join.
  %tmp9 = add i32 %tmp, 2
  %tmp10 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp9
  store i32 2, ptr addrspace(1) %tmp10, align 4
  br label %bb.outer.end

bb.outer.end:                                     ; preds = %bb.inner.end, %bb
  ; Outer join point: store 3 through a null addrspace(3) pointer (the
  ; ds_write_b32 in the checks above).
  store i32 3, ptr addrspace(3) null
  ret void
}

; Outer if containing an inner if/else whose two arms (%bb.then and %bb.else)
; both branch directly to %bb.outer.end. In the optimized checks below, exec
; is restored once, from the outer saved mask s[2:3], at .LBB2_5; the -O0
; checks restore exec at both %Flow1 and %bb.outer.end.
define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: nested_if_if_else:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, 0
; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, 1, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    buffer_store_dword v2, v[1:2], s[0:3], 0 addr64
; GCN-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GCN-NEXT:    s_cbranch_execz .LBB2_5
; GCN-NEXT:  ; %bb.1: ; %bb.outer.then
; GCN-NEXT:    v_mov_b32_e32 v4, s1
; GCN-NEXT:    v_add_i32_e32 v3, vcc, s0, v1
; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 2, v0
; GCN-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GCN-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
; GCN-NEXT:    s_cbranch_execz .LBB2_3
; GCN-NEXT:  ; %bb.2: ; %bb.else
; GCN-NEXT:    s_mov_b32 s6, 0
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s4, s6
; GCN-NEXT:    s_mov_b32 s5, s6
; GCN-NEXT:    v_mov_b32_e32 v0, 2
; GCN-NEXT:    buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 offset:8
; GCN-NEXT:    ; implicit-def: $vgpr3_vgpr4
; GCN-NEXT:  .LBB2_3: ; %Flow
; GCN-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
; GCN-NEXT:    s_cbranch_execz .LBB2_5
; GCN-NEXT:  ; %bb.4: ; %bb.then
; GCN-NEXT:    s_mov_b32 s6, 0
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s4, s6
; GCN-NEXT:    s_mov_b32 s5, s6
; GCN-NEXT:    s_waitcnt expcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, 1
; GCN-NEXT:    buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 offset:4
; GCN-NEXT:  .LBB2_5: ; %bb.outer.end
; GCN-NEXT:    s_or_b64 exec, exec, s[2:3]
; GCN-NEXT:    s_waitcnt expcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, 3
; GCN-NEXT:    s_mov_b32 m0, -1
; GCN-NEXT:    ds_write_b32 v2, v0
; GCN-NEXT:    s_endpgm
;
; GCN-O0-LABEL: nested_if_if_else:
; GCN-O0:       ; %bb.0: ; %bb
; GCN-O0-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-O0-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT:    s_mov_b32 s14, -1
; GCN-O0-NEXT:    s_mov_b32 s15, 0xe8f000
; GCN-O0-NEXT:    s_add_u32 s12, s12, s9
; GCN-O0-NEXT:    s_addc_u32 s13, s13, 0
; GCN-O0-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN-O0-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[0:1]
; GCN-O0-NEXT:    ; implicit-def: $vgpr4 : SGPR spill to VGPR lane
; GCN-O0-NEXT:    v_writelane_b32 v4, s2, 0
; GCN-O0-NEXT:    v_writelane_b32 v4, s3, 1
; GCN-O0-NEXT:    v_mov_b32_e32 v1, v0
; GCN-O0-NEXT:    buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b32 s2, 0xf000
; GCN-O0-NEXT:    s_mov_b32 s4, 0
; GCN-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT:    s_mov_b32 s5, s2
; GCN-O0-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT:    s_mov_b32 s4, 2
; GCN-O0-NEXT:    v_lshlrev_b32_e64 v2, s4, v0
; GCN-O0-NEXT:    s_mov_b32 s4, 0
; GCN-O0-NEXT:    ; implicit-def: $sgpr4
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    v_mov_b32_e32 v1, 0
; GCN-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GCN-O0-NEXT:    v_mov_b32_e32 v3, v1
; GCN-O0-NEXT:    v_mov_b32_e32 v1, 0
; GCN-O0-NEXT:    buffer_store_dword v1, v[2:3], s[0:3], 0 addr64
; GCN-O0-NEXT:    s_mov_b32 s0, 1
; GCN-O0-NEXT:    v_cmp_gt_u32_e64 s[2:3], v0, s0
; GCN-O0-NEXT:    s_mov_b64 s[0:1], exec
; GCN-O0-NEXT:    v_writelane_b32 v4, s0, 2
; GCN-O0-NEXT:    v_writelane_b32 v4, s1, 3
; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT:    buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT:    s_cbranch_execz .LBB2_6
; GCN-O0-NEXT:  ; %bb.1: ; %bb.outer.then
; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT:    s_mov_b32 s0, 2
; GCN-O0-NEXT:    s_waitcnt vmcnt(1)
; GCN-O0-NEXT:    v_cmp_ne_u32_e64 s[0:1], v0, s0
; GCN-O0-NEXT:    s_mov_b64 s[2:3], exec
; GCN-O0-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
; GCN-O0-NEXT:    s_xor_b64 s[2:3], s[0:1], s[2:3]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_writelane_b32 v4, s2, 4
; GCN-O0-NEXT:    v_writelane_b32 v4, s3, 5
; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT:    buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT:    s_cbranch_execz .LBB2_2
; GCN-O0-NEXT:    s_branch .LBB2_4
; GCN-O0-NEXT:  .LBB2_2: ; %Flow
; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_readlane_b32 s0, v4, 4
; GCN-O0-NEXT:    v_readlane_b32 s1, v4, 5
; GCN-O0-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
; GCN-O0-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
; GCN-O0-NEXT:    v_writelane_b32 v4, s0, 6
; GCN-O0-NEXT:    v_writelane_b32 v4, s1, 7
; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT:    buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT:    s_xor_b64 exec, exec, s[0:1]
; GCN-O0-NEXT:    s_cbranch_execz .LBB2_5
; GCN-O0-NEXT:  ; %bb.3: ; %bb.then
; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_readlane_b32 s0, v4, 0
; GCN-O0-NEXT:    v_readlane_b32 s1, v4, 1
; GCN-O0-NEXT:    v_mov_b32_e32 v0, 1
; GCN-O0-NEXT:    v_add_i32_e64 v1, s[2:3], v1, v0
; GCN-O0-NEXT:    v_ashrrev_i32_e64 v3, 31, v1
; GCN-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GCN-O0-NEXT:    v_mov_b32_e32 v2, v3
; GCN-O0-NEXT:    s_mov_b32 s2, 2
; GCN-O0-NEXT:    v_lshl_b64 v[1:2], v[1:2], s2
; GCN-O0-NEXT:    s_mov_b32 s2, 0xf000
; GCN-O0-NEXT:    s_mov_b32 s4, 0
; GCN-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT:    s_mov_b32 s5, s2
; GCN-O0-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; GCN-O0-NEXT:    s_branch .LBB2_5
; GCN-O0-NEXT:  .LBB2_4: ; %bb.else
; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_readlane_b32 s0, v4, 0
; GCN-O0-NEXT:    v_readlane_b32 s1, v4, 1
; GCN-O0-NEXT:    v_mov_b32_e32 v0, 2
; GCN-O0-NEXT:    v_add_i32_e64 v1, s[2:3], v1, v0
; GCN-O0-NEXT:    v_ashrrev_i32_e64 v3, 31, v1
; GCN-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GCN-O0-NEXT:    v_mov_b32_e32 v2, v3
; GCN-O0-NEXT:    v_lshl_b64 v[1:2], v[1:2], v0
; GCN-O0-NEXT:    s_mov_b32 s2, 0xf000
; GCN-O0-NEXT:    s_mov_b32 s4, 0
; GCN-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT:    s_mov_b32 s5, s2
; GCN-O0-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; GCN-O0-NEXT:    s_branch .LBB2_2
; GCN-O0-NEXT:  .LBB2_5: ; %Flow1
; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_readlane_b32 s0, v4, 6
; GCN-O0-NEXT:    v_readlane_b32 s1, v4, 7
; GCN-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT:  .LBB2_6: ; %bb.outer.end
; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_readlane_b32 s0, v4, 2
; GCN-O0-NEXT:    v_readlane_b32 s1, v4, 3
; GCN-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT:    v_mov_b32_e32 v1, 3
; GCN-O0-NEXT:    v_mov_b32_e32 v0, 0
; GCN-O0-NEXT:    s_mov_b32 m0, -1
; GCN-O0-NEXT:    ds_write_b32 v0, v1
; GCN-O0-NEXT:    s_endpgm
bb:
  ; Unconditionally store 0 to %arg[%tmp], then apply the outer condition:
  ; only work items whose id.x (%tmp) is > 1 enter %bb.outer.then.
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
  store i32 0, ptr addrspace(1) %tmp1, align 4
  %tmp2 = icmp ugt i32 %tmp, 1
  br i1 %tmp2, label %bb.outer.then, label %bb.outer.end

bb.outer.then:                                       ; preds = %bb
  ; Inner if/else: %tmp == 2 selects %bb.then, everything else %bb.else.
  %tmp5 = icmp eq i32 %tmp, 2
  br i1 %tmp5, label %bb.then, label %bb.else

bb.then:                                             ; preds = %bb.outer.then
  ; Store 1 to %arg[%tmp + 1].
  %tmp3 = add i32 %tmp, 1
  %tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp3
  store i32 1, ptr addrspace(1) %tmp4, align 4
  br label %bb.outer.end

bb.else:                                             ; preds = %bb.outer.then
  ; Store 2 to %arg[%tmp + 2].
  %tmp7 = add i32 %tmp, 2
  %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp7
  store i32 2, ptr addrspace(1) %tmp9, align 4
  br label %bb.outer.end

bb.outer.end:                                        ; preds = %bb, %bb.then, %bb.else
  ; Shared join point for the outer if and both inner arms: store 3 through a
  ; null addrspace(3) pointer (the ds_write_b32 in the checks above).
  store i32 3, ptr addrspace(3) null
  ret void
}

define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: nested_if_else_if:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
; GCN-NEXT:    v_mov_b32_e32 v4, 0
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v2, s1
; GCN-NEXT:    v_add_i32_e32 v1, vcc, s0, v3
; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 2, v0
; GCN-NEXT:    buffer_store_dword v4, v[3:4], s[0:3], 0 addr64
; GCN-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[0:1]
; GCN-NEXT:    s_cbranch_execz .LBB3_4
; GCN-NEXT:  ; %bb.1: ; %bb.outer.else
; GCN-NEXT:    s_mov_b32 s0, s2
; GCN-NEXT:    s_mov_b32 s1, s2
; GCN-NEXT:    v_mov_b32_e32 v3, 3
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT:    buffer_store_dword v3, v[1:2], s[0:3], 0 addr64 offset:12
; GCN-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GCN-NEXT:    s_cbranch_execz .LBB3_3
; GCN-NEXT:  ; %bb.2: ; %bb.inner.then2
; GCN-NEXT:    s_mov_b32 s10, 0
; GCN-NEXT:    s_mov_b32 s11, 0xf000
; GCN-NEXT:    s_mov_b32 s8, s10
; GCN-NEXT:    s_mov_b32 s9, s10
; GCN-NEXT:    v_mov_b32_e32 v0, 4
; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[8:11], 0 addr64 offset:16
; GCN-NEXT:  .LBB3_3: ; %Flow
; GCN-NEXT:    s_or_b64 exec, exec, s[0:1]
; GCN-NEXT:    ; implicit-def: $vgpr1_vgpr2
; GCN-NEXT:    ; implicit-def: $vgpr0
; GCN-NEXT:  .LBB3_4: ; %Flow2
; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN-NEXT:    s_cbranch_execz .LBB3_8
; GCN-NEXT:  ; %bb.5: ; %bb.outer.then
; GCN-NEXT:    s_mov_b32 s0, s2
; GCN-NEXT:    s_mov_b32 s1, s2
; GCN-NEXT:    s_waitcnt expcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v3, 1
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
; GCN-NEXT:    buffer_store_dword v3, v[1:2], s[0:3], 0 addr64 offset:4
; GCN-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; GCN-NEXT:    s_cbranch_execz .LBB3_7
; GCN-NEXT:  ; %bb.6: ; %bb.inner.then
; GCN-NEXT:    v_mov_b32_e32 v0, 2
; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8
; GCN-NEXT:  .LBB3_7: ; %Flow1
; GCN-NEXT:    s_or_b64 exec, exec, s[6:7]
; GCN-NEXT:  .LBB3_8: ; %bb.outer.end
; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
; GCN-NEXT:    s_waitcnt expcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, 3
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    s_mov_b32 m0, -1
; GCN-NEXT:    ds_write_b32 v1, v0
; GCN-NEXT:    s_endpgm
;
; GCN-O0-LABEL: nested_if_else_if:
; GCN-O0:       ; %bb.0: ; %bb
; GCN-O0-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-O0-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT:    s_mov_b32 s14, -1
; GCN-O0-NEXT:    s_mov_b32 s15, 0xe8f000
; GCN-O0-NEXT:    s_add_u32 s12, s12, s9
; GCN-O0-NEXT:    s_addc_u32 s13, s13, 0
; GCN-O0-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN-O0-NEXT:    v_mov_b32_e32 v1, v0
; GCN-O0-NEXT:    buffer_store_dword v1, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b32 s0, 2
; GCN-O0-NEXT:    v_lshlrev_b32_e64 v2, s0, v0
; GCN-O0-NEXT:    s_mov_b32 s1, 0
; GCN-O0-NEXT:    ; implicit-def: $sgpr1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    v_mov_b32_e32 v1, 0
; GCN-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GCN-O0-NEXT:    v_mov_b32_e32 v3, v1
; GCN-O0-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT:    s_mov_b32 s2, s4
; GCN-O0-NEXT:    v_mov_b32_e32 v1, v2
; GCN-O0-NEXT:    s_mov_b32 s1, s5
; GCN-O0-NEXT:    v_mov_b32_e32 v5, v3
; GCN-O0-NEXT:    v_add_i32_e64 v4, s[2:3], s2, v1
; GCN-O0-NEXT:    v_mov_b32_e32 v1, s1
; GCN-O0-NEXT:    v_addc_u32_e64 v1, s[2:3], v1, v5, s[2:3]
; GCN-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT:    v_mov_b32_e32 v5, v1
; GCN-O0-NEXT:    buffer_store_dword v4, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT:    buffer_store_dword v5, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b32 s1, 0xf000
; GCN-O0-NEXT:    s_mov_b32 s2, 0
; GCN-O0-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT:    s_mov_b32 s3, s1
; GCN-O0-NEXT:    ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-O0-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GCN-O0-NEXT:    v_mov_b32_e32 v1, 0
; GCN-O0-NEXT:    buffer_store_dword v1, v[2:3], s[4:7], 0 addr64
; GCN-O0-NEXT:    v_cmp_lt_u32_e64 s[0:1], v0, s0
; GCN-O0-NEXT:    s_mov_b64 s[2:3], exec
; GCN-O0-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
; GCN-O0-NEXT:    s_xor_b64 s[2:3], s[0:1], s[2:3]
; GCN-O0-NEXT:    ; implicit-def: $vgpr6 : SGPR spill to VGPR lane
; GCN-O0-NEXT:    v_writelane_b32 v6, s2, 0
; GCN-O0-NEXT:    v_writelane_b32 v6, s3, 1
; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT:    buffer_store_dword v6, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT:    s_cbranch_execz .LBB3_1
; GCN-O0-NEXT:    s_branch .LBB3_4
; GCN-O0-NEXT:  .LBB3_1: ; %Flow2
; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_readlane_b32 s0, v6, 0
; GCN-O0-NEXT:    v_readlane_b32 s1, v6, 1
; GCN-O0-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
; GCN-O0-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
; GCN-O0-NEXT:    v_writelane_b32 v6, s0, 2
; GCN-O0-NEXT:    v_writelane_b32 v6, s1, 3
; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT:    buffer_store_dword v6, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT:    s_xor_b64 exec, exec, s[0:1]
; GCN-O0-NEXT:    s_cbranch_execz .LBB3_8
; GCN-O0-NEXT:  ; %bb.2: ; %bb.outer.then
; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; GCN-O0-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT:    s_mov_b32 s0, 0xf000
; GCN-O0-NEXT:    s_mov_b32 s2, 0
; GCN-O0-NEXT:    s_mov_b32 s4, s2
; GCN-O0-NEXT:    s_mov_b32 s5, s0
; GCN-O0-NEXT:    s_mov_b32 s0, s2
; GCN-O0-NEXT:    s_mov_b32 s1, s2
; GCN-O0-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT:    v_mov_b32_e32 v1, 1
; GCN-O0-NEXT:    s_waitcnt vmcnt(1)
; GCN-O0-NEXT:    buffer_store_dword v1, v[2:3], s[0:3], 0 addr64 offset:4
; GCN-O0-NEXT:    s_mov_b32 s0, 2
; GCN-O0-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s0
; GCN-O0-NEXT:    s_mov_b64 s[0:1], exec
; GCN-O0-NEXT:    s_waitcnt vmcnt(1)
; GCN-O0-NEXT:    v_writelane_b32 v6, s0, 4
; GCN-O0-NEXT:    v_writelane_b32 v6, s1, 5
; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT:    buffer_store_dword v6, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT:    s_cbranch_execz .LBB3_7
; GCN-O0-NEXT:  ; %bb.3: ; %bb.inner.then
; GCN-O0-NEXT:    s_waitcnt expcnt(1)
; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b32 s0, 0xf000
; GCN-O0-NEXT:    s_mov_b32 s2, 0
; GCN-O0-NEXT:    s_mov_b32 s4, s2
; GCN-O0-NEXT:    s_mov_b32 s5, s0
; GCN-O0-NEXT:    s_mov_b32 s0, s2
; GCN-O0-NEXT:    s_mov_b32 s1, s2
; GCN-O0-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT:    v_mov_b32_e32 v0, 2
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8
; GCN-O0-NEXT:    s_branch .LBB3_7
; GCN-O0-NEXT:  .LBB3_4: ; %bb.outer.else
; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; GCN-O0-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT:    s_mov_b32 s1, 0xf000
; GCN-O0-NEXT:    s_mov_b32 s0, 0
; GCN-O0-NEXT:    s_mov_b32 s2, s0
; GCN-O0-NEXT:    s_mov_b32 s3, s1
; GCN-O0-NEXT:    s_mov_b32 s4, s0
; GCN-O0-NEXT:    s_mov_b32 s5, s0
; GCN-O0-NEXT:    ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-O0-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GCN-O0-NEXT:    v_mov_b32_e32 v1, 3
; GCN-O0-NEXT:    s_waitcnt vmcnt(1)
; GCN-O0-NEXT:    buffer_store_dword v1, v[2:3], s[4:7], 0 addr64 offset:12
; GCN-O0-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s0
; GCN-O0-NEXT:    s_mov_b64 s[0:1], exec
; GCN-O0-NEXT:    s_waitcnt vmcnt(1)
; GCN-O0-NEXT:    v_writelane_b32 v6, s0, 6
; GCN-O0-NEXT:    v_writelane_b32 v6, s1, 7
; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT:    buffer_store_dword v6, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT:    s_cbranch_execz .LBB3_6
; GCN-O0-NEXT:  ; %bb.5: ; %bb.inner.then2
; GCN-O0-NEXT:    s_waitcnt expcnt(1)
; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b32 s0, 0xf000
; GCN-O0-NEXT:    s_mov_b32 s2, 0
; GCN-O0-NEXT:    s_mov_b32 s4, s2
; GCN-O0-NEXT:    s_mov_b32 s5, s0
; GCN-O0-NEXT:    s_mov_b32 s0, s2
; GCN-O0-NEXT:    s_mov_b32 s1, s2
; GCN-O0-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT:    v_mov_b32_e32 v0, 4
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:16
; GCN-O0-NEXT:  .LBB3_6: ; %Flow
; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_readlane_b32 s0, v6, 6
; GCN-O0-NEXT:    v_readlane_b32 s1, v6, 7
; GCN-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT:    s_branch .LBB3_1
; GCN-O0-NEXT:  .LBB3_7: ; %Flow1
; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_readlane_b32 s0, v6, 4
; GCN-O0-NEXT:    v_readlane_b32 s1, v6, 5
; GCN-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT:  .LBB3_8: ; %bb.outer.end
; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_readlane_b32 s0, v6, 2
; GCN-O0-NEXT:    v_readlane_b32 s1, v6, 3
; GCN-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT:    v_mov_b32_e32 v1, 3
; GCN-O0-NEXT:    v_mov_b32_e32 v0, 0
; GCN-O0-NEXT:    s_mov_b32 m0, -1
; GCN-O0-NEXT:    ds_write_b32 v0, v1
; GCN-O0-NEXT:    s_endpgm
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
  store i32 0, ptr addrspace(1) %tmp1, align 4
  %cc1 = icmp ugt i32 %tmp, 1
  br i1 %cc1, label %bb.outer.then, label %bb.outer.else

bb.outer.then:
  %tmp2 = getelementptr inbounds i32, ptr addrspace(1) %tmp1, i32 1
  store i32 1, ptr addrspace(1) %tmp2, align 4
  %cc2 = icmp eq i32 %tmp, 2
  br i1 %cc2, label %bb.inner.then, label %bb.outer.end

bb.inner.then:
  %tmp3 = getelementptr inbounds i32, ptr addrspace(1) %tmp1, i32 2
  store i32 2, ptr addrspace(1) %tmp3, align 4
  br label %bb.outer.end

bb.outer.else:
  %tmp4 = getelementptr inbounds i32, ptr addrspace(1) %tmp1, i32 3
  store i32 3, ptr addrspace(1) %tmp4, align 4
  %cc3 = icmp eq i32 %tmp, 0   ; avoid being optimized away through the domination
  br i1 %cc3, label %bb.inner.then2, label %bb.outer.end

bb.inner.then2:
  %tmp5 = getelementptr inbounds i32, ptr addrspace(1) %tmp1, i32 4
  store i32 4, ptr addrspace(1) %tmp5, align 4
  br label %bb.outer.end

bb.outer.end:
  store i32 3, ptr addrspace(3) null
  ret void
}

; Single per-lane 'if' (workitem id > 1) guarding one global store, followed
; by an s_barrier in the join block that immediately precedes the return.
; The assertions for both llc invocations keep the exec-mask restore
; (s_or_b64 exec, exec, ...) at %bb.end ahead of s_barrier.
; NOTE(review): the name suggests the end-of-control-flow restore must not be
; dropped in front of s_endpgm here because the barrier has to run with the
; restored exec mask -- confirm against the pass that removes redundant end-cf.
define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: s_endpgm_unsafe_barrier:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, 1, v0
; GCN-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GCN-NEXT:    s_cbranch_execz .LBB4_2
; GCN-NEXT:  ; %bb.1: ; %bb.then
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, 0
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    buffer_store_dword v1, v[0:1], s[4:7], 0 addr64
; GCN-NEXT:  .LBB4_2: ; %bb.end
; GCN-NEXT:    s_or_b64 exec, exec, s[0:1]
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT:    s_barrier
; GCN-NEXT:    s_endpgm
;
; GCN-O0-LABEL: s_endpgm_unsafe_barrier:
; GCN-O0:       ; %bb.0: ; %bb
; GCN-O0-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-O0-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT:    s_mov_b32 s14, -1
; GCN-O0-NEXT:    s_mov_b32 s15, 0xe8f000
; GCN-O0-NEXT:    s_add_u32 s12, s12, s9
; GCN-O0-NEXT:    s_addc_u32 s13, s13, 0
; GCN-O0-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN-O0-NEXT:    ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
; GCN-O0-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT:    v_writelane_b32 v3, s0, 0
; GCN-O0-NEXT:    v_writelane_b32 v3, s1, 1
; GCN-O0-NEXT:    v_mov_b32_e32 v1, v0
; GCN-O0-NEXT:    buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b32 s0, 1
; GCN-O0-NEXT:    v_cmp_gt_u32_e64 s[2:3], v0, s0
; GCN-O0-NEXT:    s_mov_b64 s[0:1], exec
; GCN-O0-NEXT:    v_writelane_b32 v3, s0, 2
; GCN-O0-NEXT:    v_writelane_b32 v3, s1, 3
; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT:    buffer_store_dword v3, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT:    s_cbranch_execz .LBB4_2
; GCN-O0-NEXT:  ; %bb.1: ; %bb.then
; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v3, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_readlane_b32 s0, v3, 0
; GCN-O0-NEXT:    v_readlane_b32 s1, v3, 1
; GCN-O0-NEXT:    s_mov_b32 s2, 0xf000
; GCN-O0-NEXT:    s_mov_b32 s4, 0
; GCN-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT:    s_mov_b32 s5, s2
; GCN-O0-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT:    v_ashrrev_i32_e64 v2, 31, v0
; GCN-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GCN-O0-NEXT:    v_mov_b32_e32 v1, v2
; GCN-O0-NEXT:    s_mov_b32 s4, 2
; GCN-O0-NEXT:    v_lshl_b64 v[1:2], v[0:1], s4
; GCN-O0-NEXT:    v_mov_b32_e32 v0, 0
; GCN-O0-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; GCN-O0-NEXT:  .LBB4_2: ; %bb.end
; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v3, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_readlane_b32 s0, v3, 2
; GCN-O0-NEXT:    v_readlane_b32 s1, v3, 3
; GCN-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT:    s_barrier
; GCN-O0-NEXT:    s_endpgm
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  ; Per-lane condition: only workitems with id > 1 enter %bb.then.
  %tmp1 = icmp ugt i32 %tmp, 1
  br i1 %tmp1, label %bb.then, label %bb.end

bb.then:                                          ; preds = %bb
  ; Store 0 to %arg[workitem id].
  %tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
  store i32 0, ptr addrspace(1) %tmp4, align 4
  br label %bb.end

bb.end:                                           ; preds = %bb.then, %bb
  ; The barrier sits in the join block, directly before the kernel returns.
  call void @llvm.amdgcn.s.barrier()
  ret void
}

; Non-kernel function: a self-looping header (%bb1) followed by two nested
; conditionals whose <4 x float> results flow through phis in %Flow and %bb10
; before the volatile store in %bb12; %bb10 either exits or loops back to %bb1.
; The run lines at the top of the file pass
; -amdgpu-codegenprepare-break-large-phis=0 so these vector phis are left
; intact; the file-level note says this keeps the branches in this function
; from being eliminated.
; NOTE(review): presumably a regression test for SCC liveness around the exec
; mask updates at the nested end-of-control-flow points -- the function name is
; the only evidence visible here; confirm against the original commit.
define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
; GCN-LABEL: scc_liveness:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    s_movk_i32 s4, 0x207
; GCN-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
; GCN-NEXT:    s_mov_b32 s8, 0
; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
; GCN-NEXT:    s_mov_b64 s[12:13], 0
; GCN-NEXT:    s_mov_b64 s[6:7], 0
; GCN-NEXT:    s_branch .LBB5_3
; GCN-NEXT:  .LBB5_1: ; %Flow
; GCN-NEXT:    ; in Loop: Header=BB5_3 Depth=1
; GCN-NEXT:    s_or_b64 exec, exec, s[10:11]
; GCN-NEXT:  .LBB5_2: ; %bb10
; GCN-NEXT:    ; in Loop: Header=BB5_3 Depth=1
; GCN-NEXT:    s_or_b64 exec, exec, s[14:15]
; GCN-NEXT:    s_and_b64 s[6:7], exec, s[4:5]
; GCN-NEXT:    s_or_b64 s[12:13], s[6:7], s[12:13]
; GCN-NEXT:    s_mov_b64 s[6:7], 0
; GCN-NEXT:    s_andn2_b64 exec, exec, s[12:13]
; GCN-NEXT:    s_cbranch_execz .LBB5_7
; GCN-NEXT:  .LBB5_3: ; %bb1
; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN-NEXT:    s_and_b64 s[10:11], exec, vcc
; GCN-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
; GCN-NEXT:    s_andn2_b64 exec, exec, s[6:7]
; GCN-NEXT:    s_cbranch_execnz .LBB5_3
; GCN-NEXT:  ; %bb.4: ; %bb2
; GCN-NEXT:    ; in Loop: Header=BB5_3 Depth=1
; GCN-NEXT:    s_or_b64 exec, exec, s[6:7]
; GCN-NEXT:    s_mov_b32 s9, s8
; GCN-NEXT:    s_mov_b32 s10, s8
; GCN-NEXT:    s_mov_b32 s11, s8
; GCN-NEXT:    v_mov_b32_e32 v0, s8
; GCN-NEXT:    v_mov_b32_e32 v1, s9
; GCN-NEXT:    v_mov_b32_e32 v2, s10
; GCN-NEXT:    v_mov_b32_e32 v3, s11
; GCN-NEXT:    s_and_saveexec_b64 s[14:15], s[4:5]
; GCN-NEXT:    s_cbranch_execz .LBB5_2
; GCN-NEXT:  ; %bb.5: ; %bb4
; GCN-NEXT:    ; in Loop: Header=BB5_3 Depth=1
; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_cmp_gt_f32_e64 s[6:7], 0, v0
; GCN-NEXT:    v_mov_b32_e32 v0, s8
; GCN-NEXT:    v_mov_b32_e32 v1, s9
; GCN-NEXT:    v_mov_b32_e32 v2, s10
; GCN-NEXT:    v_mov_b32_e32 v3, s11
; GCN-NEXT:    s_and_saveexec_b64 s[10:11], s[6:7]
; GCN-NEXT:    s_cbranch_execz .LBB5_1
; GCN-NEXT:  ; %bb.6: ; %bb8
; GCN-NEXT:    ; in Loop: Header=BB5_3 Depth=1
; GCN-NEXT:    s_mov_b32 s9, s8
; GCN-NEXT:    v_mov_b32_e32 v0, s8
; GCN-NEXT:    v_mov_b32_e32 v1, s9
; GCN-NEXT:    v_mov_b32_e32 v2, s10
; GCN-NEXT:    v_mov_b32_e32 v3, s11
; GCN-NEXT:    s_branch .LBB5_1
; GCN-NEXT:  .LBB5_7: ; %bb12
; GCN-NEXT:    s_or_b64 exec, exec, s[12:13]
; GCN-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_store_dword v0, v0, s[0:3], 0 offen
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GCN-O0-LABEL: scc_liveness:
; GCN-O0:       ; %bb.0: ; %bb
; GCN-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b64 s[4:5], 0
; GCN-O0-NEXT:    s_mov_b64 s[6:7], s[4:5]
; GCN-O0-NEXT:    ; implicit-def: $vgpr6 : SGPR spill to VGPR lane
; GCN-O0-NEXT:    s_waitcnt expcnt(1)
; GCN-O0-NEXT:    v_writelane_b32 v6, s6, 0
; GCN-O0-NEXT:    v_writelane_b32 v6, s7, 1
; GCN-O0-NEXT:    v_writelane_b32 v6, s4, 2
; GCN-O0-NEXT:    v_writelane_b32 v6, s5, 3
; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT:  .LBB5_1: ; %bb1
; GCN-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN-O0-NEXT:    s_waitcnt expcnt(1)
; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_readlane_b32 s8, v6, 2
; GCN-O0-NEXT:    v_readlane_b32 s9, v6, 3
; GCN-O0-NEXT:    v_readlane_b32 s6, v6, 0
; GCN-O0-NEXT:    v_readlane_b32 s7, v6, 1
; GCN-O0-NEXT:    v_writelane_b32 v6, s6, 4
; GCN-O0-NEXT:    v_writelane_b32 v6, s7, 5
; GCN-O0-NEXT:    s_mov_b32 s4, 0x207
; GCN-O0-NEXT:    v_cmp_lt_i32_e64 s[4:5], v0, s4
; GCN-O0-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
; GCN-O0-NEXT:    v_writelane_b32 v6, s4, 6
; GCN-O0-NEXT:    v_writelane_b32 v6, s5, 7
; GCN-O0-NEXT:    v_writelane_b32 v6, s6, 0
; GCN-O0-NEXT:    v_writelane_b32 v6, s7, 1
; GCN-O0-NEXT:    s_mov_b64 s[6:7], s[4:5]
; GCN-O0-NEXT:    v_writelane_b32 v6, s6, 2
; GCN-O0-NEXT:    v_writelane_b32 v6, s7, 3
; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GCN-O0-NEXT:    s_cbranch_execnz .LBB5_1
; GCN-O0-NEXT:  ; %bb.2: ; %bb2
; GCN-O0-NEXT:    ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_readlane_b32 s4, v6, 6
; GCN-O0-NEXT:    v_readlane_b32 s5, v6, 7
; GCN-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
; GCN-O0-NEXT:    s_mov_b32 s6, 0
; GCN-O0-NEXT:    v_cmp_ne_u32_e64 s[4:5], v0, s6
; GCN-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
; GCN-O0-NEXT:    v_writelane_b32 v6, s4, 8
; GCN-O0-NEXT:    v_writelane_b32 v6, s5, 9
; GCN-O0-NEXT:    s_mov_b32 s4, 0
; GCN-O0-NEXT:    s_mov_b32 s8, s4
; GCN-O0-NEXT:    s_mov_b32 s9, s4
; GCN-O0-NEXT:    s_mov_b32 s10, s4
; GCN-O0-NEXT:    s_mov_b32 s11, s4
; GCN-O0-NEXT:    v_mov_b32_e32 v0, s8
; GCN-O0-NEXT:    v_mov_b32_e32 v1, s9
; GCN-O0-NEXT:    v_mov_b32_e32 v2, s10
; GCN-O0-NEXT:    v_mov_b32_e32 v3, s11
; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b64 s[4:5], exec
; GCN-O0-NEXT:    v_writelane_b32 v6, s4, 10
; GCN-O0-NEXT:    v_writelane_b32 v6, s5, 11
; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
; GCN-O0-NEXT:    s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT:    s_cbranch_execz .LBB5_5
; GCN-O0-NEXT:  ; %bb.3: ; %bb4
; GCN-O0-NEXT:    ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT:    ; implicit-def: $sgpr4
; GCN-O0-NEXT:    v_mov_b32_e32 v0, s4
; GCN-O0-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
; GCN-O0-NEXT:    s_mov_b32 s4, 0
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_cmp_lt_f32_e64 s[6:7], v0, s4
; GCN-O0-NEXT:    s_mov_b32 s8, s4
; GCN-O0-NEXT:    s_mov_b32 s9, s4
; GCN-O0-NEXT:    s_mov_b32 s10, s4
; GCN-O0-NEXT:    s_mov_b32 s11, s4
; GCN-O0-NEXT:    v_mov_b32_e32 v0, s8
; GCN-O0-NEXT:    v_mov_b32_e32 v1, s9
; GCN-O0-NEXT:    v_mov_b32_e32 v2, s10
; GCN-O0-NEXT:    v_mov_b32_e32 v3, s11
; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b64 s[4:5], exec
; GCN-O0-NEXT:    v_writelane_b32 v6, s4, 12
; GCN-O0-NEXT:    v_writelane_b32 v6, s5, 13
; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
; GCN-O0-NEXT:    s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT:    s_cbranch_execz .LBB5_6
; GCN-O0-NEXT:  ; %bb.4: ; %bb8
; GCN-O0-NEXT:    ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT:    s_mov_b32 s10, 0
; GCN-O0-NEXT:    ; implicit-def: $sgpr4
; GCN-O0-NEXT:    ; implicit-def: $sgpr5
; GCN-O0-NEXT:    ; implicit-def: $sgpr9
; GCN-O0-NEXT:    ; implicit-def: $sgpr5
; GCN-O0-NEXT:    ; implicit-def: $sgpr8
; GCN-O0-NEXT:    ; implicit-def: $sgpr5
; GCN-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-O0-NEXT:    s_mov_b32 s5, s10
; GCN-O0-NEXT:    s_mov_b32 s6, s9
; GCN-O0-NEXT:    s_mov_b32 s7, s8
; GCN-O0-NEXT:    s_waitcnt expcnt(1)
; GCN-O0-NEXT:    v_mov_b32_e32 v0, s4
; GCN-O0-NEXT:    v_mov_b32_e32 v1, s5
; GCN-O0-NEXT:    v_mov_b32_e32 v2, s6
; GCN-O0-NEXT:    v_mov_b32_e32 v3, s7
; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_branch .LBB5_6
; GCN-O0-NEXT:  .LBB5_5: ; %Flow2
; GCN-O0-NEXT:    ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT:    s_waitcnt expcnt(3)
; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_waitcnt expcnt(2)
; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_waitcnt expcnt(1)
; GCN-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_readlane_b32 s4, v6, 10
; GCN-O0-NEXT:    v_readlane_b32 s5, v6, 11
; GCN-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GCN-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GCN-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GCN-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_branch .LBB5_7
; GCN-O0-NEXT:  .LBB5_6: ; %Flow
; GCN-O0-NEXT:    ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT:    s_waitcnt expcnt(3)
; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_waitcnt expcnt(2)
; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_waitcnt expcnt(1)
; GCN-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_readlane_b32 s4, v6, 12
; GCN-O0-NEXT:    v_readlane_b32 s5, v6, 13
; GCN-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_branch .LBB5_5
; GCN-O0-NEXT:  .LBB5_7: ; %bb10
; GCN-O0-NEXT:    ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_readlane_b32 s6, v6, 8
; GCN-O0-NEXT:    v_readlane_b32 s7, v6, 9
; GCN-O0-NEXT:    s_mov_b64 s[4:5], -1
; GCN-O0-NEXT:    v_writelane_b32 v6, s4, 14
; GCN-O0-NEXT:    v_writelane_b32 v6, s5, 15
; GCN-O0-NEXT:    s_mov_b64 s[4:5], exec
; GCN-O0-NEXT:    v_writelane_b32 v6, s4, 16
; GCN-O0-NEXT:    v_writelane_b32 v6, s5, 17
; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
; GCN-O0-NEXT:    s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT:    s_cbranch_execz .LBB5_9
; GCN-O0-NEXT:  ; %bb.8: ; %Flow1
; GCN-O0-NEXT:    ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT:    s_mov_b64 s[4:5], 0
; GCN-O0-NEXT:    s_xor_b64 s[4:5], exec, -1
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_writelane_b32 v6, s4, 14
; GCN-O0-NEXT:    v_writelane_b32 v6, s5, 15
; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT:  .LBB5_9: ; %Flow3
; GCN-O0-NEXT:    ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT:    s_waitcnt expcnt(4)
; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_waitcnt expcnt(3)
; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_waitcnt expcnt(2)
; GCN-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_waitcnt expcnt(1)
; GCN-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_readlane_b32 s8, v6, 16
; GCN-O0-NEXT:    v_readlane_b32 s9, v6, 17
; GCN-O0-NEXT:    s_or_b64 exec, exec, s[8:9]
; GCN-O0-NEXT:    v_readlane_b32 s6, v6, 4
; GCN-O0-NEXT:    v_readlane_b32 s7, v6, 5
; GCN-O0-NEXT:    v_readlane_b32 s4, v6, 14
; GCN-O0-NEXT:    v_readlane_b32 s5, v6, 15
; GCN-O0-NEXT:    s_and_b64 s[4:5], exec, s[4:5]
; GCN-O0-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
; GCN-O0-NEXT:    s_mov_b64 s[6:7], 0
; GCN-O0-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GCN-O0-NEXT:    v_writelane_b32 v6, s8, 0
; GCN-O0-NEXT:    v_writelane_b32 v6, s9, 1
; GCN-O0-NEXT:    v_writelane_b32 v6, s6, 2
; GCN-O0-NEXT:    v_writelane_b32 v6, s7, 3
; GCN-O0-NEXT:    s_mov_b64 s[6:7], s[4:5]
; GCN-O0-NEXT:    v_writelane_b32 v6, s6, 18
; GCN-O0-NEXT:    v_writelane_b32 v6, s7, 19
; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GCN-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GCN-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GCN-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GCN-O0-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GCN-O0-NEXT:    s_cbranch_execnz .LBB5_1
; GCN-O0-NEXT:  ; %bb.10: ; %bb12
; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT:    s_waitcnt expcnt(4)
; GCN-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_readlane_b32 s4, v6, 18
; GCN-O0-NEXT:    v_readlane_b32 s5, v6, 19
; GCN-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
; GCN-O0-NEXT:  ; %bb.11: ; %bb12
; GCN-O0-NEXT:    s_waitcnt expcnt(3)
; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_waitcnt expcnt(2)
; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_waitcnt expcnt(1)
; GCN-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_waitcnt expcnt(0)
; GCN-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    v_mov_b32_e32 v4, v3
; GCN-O0-NEXT:    ; implicit-def: $sgpr4
; GCN-O0-NEXT:    v_mov_b32_e32 v5, s4
; GCN-O0-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
; GCN-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; GCN-O0-NEXT:    v_mov_b32_e32 v4, v2
; GCN-O0-NEXT:    ; implicit-def: $sgpr4
; GCN-O0-NEXT:    v_mov_b32_e32 v5, s4
; GCN-O0-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
; GCN-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; GCN-O0-NEXT:    v_mov_b32_e32 v4, v1
; GCN-O0-NEXT:    ; implicit-def: $sgpr4
; GCN-O0-NEXT:    v_mov_b32_e32 v5, s4
; GCN-O0-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; GCN-O0-NEXT:    ; implicit-def: $sgpr4
; GCN-O0-NEXT:    v_mov_b32_e32 v1, s4
; GCN-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
; GCN-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GCN-O0-NEXT:    s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; GCN-O0-NEXT:    s_setpc_b64 s[30:31]
bb:
  br label %bb1

; Loop header: branches back to itself unless %arg < 519. %arg is the function
; argument and is never redefined, so the condition is loop-invariant.
bb1:                                              ; preds = %Flow1, %bb1, %bb
  %tmp = icmp slt i32 %arg, 519
  br i1 %tmp, label %bb2, label %bb1

; Outer conditional. %tmp3 is reused in %bb10 as the loop-exit condition.
bb2:                                              ; preds = %bb1
  %tmp3 = icmp eq i32 %arg, 0
  br i1 %tmp3, label %bb4, label %bb10

; Inner conditional, driven by a float loaded through an undef addrspace(5)
; pointer.
bb4:                                              ; preds = %bb2
  %tmp6 = load float, ptr addrspace(5) undef
  %tmp7 = fcmp olt float %tmp6, 0.0
  br i1 %tmp7, label %bb8, label %Flow

; Only element 1 of the vector is defined here; the other lanes stay undef.
bb8:                                              ; preds = %bb4
  %tmp9 = insertelement <4 x float> undef, float 0.0, i32 1
  br label %Flow

; Join of the inner conditional.
Flow:                                             ; preds = %bb8, %bb4
  %tmp8 = phi <4 x float> [ %tmp9, %bb8 ], [ zeroinitializer, %bb4 ]
  br label %bb10

; Join of the outer conditional: leaves the loop to %bb12 when %arg == 0,
; otherwise goes back to %bb1 through %Flow1.
bb10:                                             ; preds = %Flow, %bb2
  %tmp11 = phi <4 x float> [ zeroinitializer, %bb2 ], [ %tmp8, %Flow ]
  br i1 %tmp3, label %bb12, label %Flow1

Flow1:                                            ; preds = %bb10
  br label %bb1

; Volatile store of the merged vector keeps %tmp11 (and both phis) live.
bb12:                                             ; preds = %bb10
  store volatile <4 x float> %tmp11, ptr addrspace(5) undef, align 16
  ret void
}

; Intrinsics called by the test functions in this file.
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare void @llvm.amdgcn.s.barrier() #1

; Attribute groups: #0 is attached to the workitem-id intrinsic and to
; @scc_liveness, #1 to the barrier intrinsic.
; NOTE(review): no use of #2 appears in the last three functions of the file;
; presumably an earlier test refers to it -- verify before removing.
attributes #0 = { nounwind readnone speculatable }
attributes #1 = { nounwind convergent }
attributes #2 = { nounwind }

; Module flag setting "amdhsa_code_object_version" to 500.
; NOTE(review): presumably selects AMDHSA code object v5 -- confirm against the
; AMDGPU backend documentation.
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}