llvm/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GENERIC %s
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -O0 < %s | FileCheck -check-prefix=NOOPT %s
; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI-MOVREL %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI,VI-MOVREL %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-vgpr-index-mode < %s | FileCheck -check-prefixes=VI,VI-IDXMODE %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9-IDXMODE %s

; Tests for indirect addressing on SI, which is implemented using dynamic
; indexing of vectors.
; extract_w_offset: dynamically indexed extractelement from an all-constant
; <16 x float> vector (1.0 through 16.0). The index is the kernel argument %in
; plus a constant offset of 1, and the selected element is stored to %out.
;
; What the expected output below shows for each llc invocation:
;   * no -mcpu:                 a chain of scalar compares and v_cndmask selects.
;   * tahiti (also at -O0):     index written to m0, then v_movrels_b32.
;   * tonga:                    index written to m0, then v_movrels_b32.
;   * tonga + vgpr-index-mode:  s_set_gpr_idx_on / v_mov_b32 / s_set_gpr_idx_off.
;   * gfx900:                   s_set_gpr_idx_on / v_mov_b32 / s_set_gpr_idx_off.
; In the expected output 1.0, 2.0 and 4.0 appear as inline float constants,
; while the remaining vector elements appear as 32-bit hex literals.
define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) {
; GENERIC-LABEL: extract_w_offset:
; GENERIC:       ; %bb.0: ; %entry
; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GENERIC-NEXT:    s_load_dword s4, s[2:3], 0xb
; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
; GENERIC-NEXT:    s_mov_b32 s2, -1
; GENERIC-NEXT:    v_mov_b32_e32 v0, 0x40400000
; GENERIC-NEXT:    v_mov_b32_e32 v1, 0x40a00000
; GENERIC-NEXT:    v_mov_b32_e32 v2, 0x40c00000
; GENERIC-NEXT:    v_mov_b32_e32 v3, 0x40e00000
; GENERIC-NEXT:    v_mov_b32_e32 v4, 0x41000000
; GENERIC-NEXT:    v_mov_b32_e32 v5, 0x41100000
; GENERIC-NEXT:    v_mov_b32_e32 v6, 0x41200000
; GENERIC-NEXT:    v_mov_b32_e32 v7, 0x41300000
; GENERIC-NEXT:    v_mov_b32_e32 v8, 0x41400000
; GENERIC-NEXT:    v_mov_b32_e32 v9, 0x41500000
; GENERIC-NEXT:    v_mov_b32_e32 v10, 0x41600000
; GENERIC-NEXT:    v_mov_b32_e32 v11, 0x41700000
; GENERIC-NEXT:    v_mov_b32_e32 v12, 0x41800000
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    s_add_i32 s6, s4, 1
; GENERIC-NEXT:    s_cmp_eq_u32 s6, 1
; GENERIC-NEXT:    s_cselect_b64 s[4:5], -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e64 v13, 1.0, 2.0, s[4:5]
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 2
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 3
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, 4.0, v0, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 4
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 5
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 6
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 7
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 8
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 9
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 10
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 11
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 12
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 13
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 14
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 15
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
; GENERIC-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GENERIC-NEXT:    s_endpgm
;
; NOOPT-LABEL: extract_w_offset:
; NOOPT:       ; %bb.0: ; %entry
; NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; NOOPT-NEXT:    s_load_dword s4, s[2:3], 0xb
; NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; NOOPT-NEXT:    s_mov_b32 s7, s1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s5, 0xf000
; NOOPT-NEXT:    s_mov_b32 s6, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s7
; NOOPT-NEXT:    s_mov_b32 s2, s6
; NOOPT-NEXT:    s_mov_b32 s3, s5
; NOOPT-NEXT:    s_mov_b32 s5, 1
; NOOPT-NEXT:    s_add_i32 s4, s4, s5
; NOOPT-NEXT:    s_mov_b32 s5, 0x41800000
; NOOPT-NEXT:    s_mov_b32 s6, 0x41700000
; NOOPT-NEXT:    s_mov_b32 s7, 0x41600000
; NOOPT-NEXT:    s_mov_b32 s8, 0x41500000
; NOOPT-NEXT:    s_mov_b32 s9, 0x41400000
; NOOPT-NEXT:    s_mov_b32 s10, 0x41300000
; NOOPT-NEXT:    s_mov_b32 s11, 0x41200000
; NOOPT-NEXT:    s_mov_b32 s12, 0x41100000
; NOOPT-NEXT:    s_mov_b32 s13, 0x41000000
; NOOPT-NEXT:    s_mov_b32 s14, 0x40e00000
; NOOPT-NEXT:    s_mov_b32 s15, 0x40c00000
; NOOPT-NEXT:    s_mov_b32 s16, 0x40a00000
; NOOPT-NEXT:    s_mov_b32 s17, 4.0
; NOOPT-NEXT:    s_mov_b32 s18, 0x40400000
; NOOPT-NEXT:    s_mov_b32 s19, 2.0
; NOOPT-NEXT:    s_mov_b32 s20, 1.0
; NOOPT-NEXT:    v_mov_b32_e32 v0, s20
; NOOPT-NEXT:    v_mov_b32_e32 v30, s19
; NOOPT-NEXT:    v_mov_b32_e32 v29, s18
; NOOPT-NEXT:    v_mov_b32_e32 v28, s17
; NOOPT-NEXT:    v_mov_b32_e32 v27, s16
; NOOPT-NEXT:    v_mov_b32_e32 v26, s15
; NOOPT-NEXT:    v_mov_b32_e32 v25, s14
; NOOPT-NEXT:    v_mov_b32_e32 v24, s13
; NOOPT-NEXT:    v_mov_b32_e32 v23, s12
; NOOPT-NEXT:    v_mov_b32_e32 v22, s11
; NOOPT-NEXT:    v_mov_b32_e32 v21, s10
; NOOPT-NEXT:    v_mov_b32_e32 v20, s9
; NOOPT-NEXT:    v_mov_b32_e32 v19, s8
; NOOPT-NEXT:    v_mov_b32_e32 v18, s7
; NOOPT-NEXT:    v_mov_b32_e32 v17, s6
; NOOPT-NEXT:    v_mov_b32_e32 v16, s5
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v30
; NOOPT-NEXT:    v_mov_b32_e32 v2, v29
; NOOPT-NEXT:    v_mov_b32_e32 v3, v28
; NOOPT-NEXT:    v_mov_b32_e32 v4, v27
; NOOPT-NEXT:    v_mov_b32_e32 v5, v26
; NOOPT-NEXT:    v_mov_b32_e32 v6, v25
; NOOPT-NEXT:    v_mov_b32_e32 v7, v24
; NOOPT-NEXT:    v_mov_b32_e32 v8, v23
; NOOPT-NEXT:    v_mov_b32_e32 v9, v22
; NOOPT-NEXT:    v_mov_b32_e32 v10, v21
; NOOPT-NEXT:    v_mov_b32_e32 v11, v20
; NOOPT-NEXT:    v_mov_b32_e32 v12, v19
; NOOPT-NEXT:    v_mov_b32_e32 v13, v18
; NOOPT-NEXT:    v_mov_b32_e32 v14, v17
; NOOPT-NEXT:    v_mov_b32_e32 v15, v16
; NOOPT-NEXT:    s_mov_b32 m0, s4
; NOOPT-NEXT:    v_movrels_b32_e32 v0, v0
; NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: extract_w_offset:
; SI-MOVREL:       ; %bb.0: ; %entry
; SI-MOVREL-NEXT:    s_load_dword s4, s[2:3], 0xb
; SI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-MOVREL-NEXT:    v_mov_b32_e32 v0, 1.0
; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    s_add_i32 s4, s4, 1
; SI-MOVREL-NEXT:    s_mov_b32 m0, s4
; SI-MOVREL-NEXT:    v_mov_b32_e32 v1, 2.0
; SI-MOVREL-NEXT:    v_mov_b32_e32 v2, 0x40400000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v3, 4.0
; SI-MOVREL-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v7, 0x41000000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v8, 0x41100000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v9, 0x41200000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v10, 0x41300000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v11, 0x41400000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v12, 0x41500000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v13, 0x41600000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v14, 0x41700000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v15, 0x41800000
; SI-MOVREL-NEXT:    v_movrels_b32_e32 v0, v0
; SI-MOVREL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-MOVREL-NEXT:    s_endpgm
;
; VI-MOVREL-LABEL: extract_w_offset:
; VI-MOVREL:       ; %bb.0: ; %entry
; VI-MOVREL-NEXT:    s_load_dword s4, s[2:3], 0x2c
; VI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, 1.0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v1, 2.0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v2, 0x40400000
; VI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; VI-MOVREL-NEXT:    s_add_i32 s4, s4, 1
; VI-MOVREL-NEXT:    s_mov_b32 m0, s4
; VI-MOVREL-NEXT:    v_mov_b32_e32 v3, 4.0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v7, 0x41000000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v8, 0x41100000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v9, 0x41200000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v10, 0x41300000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v11, 0x41400000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v12, 0x41500000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v13, 0x41600000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v14, 0x41700000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v15, 0x41800000
; VI-MOVREL-NEXT:    v_movrels_b32_e32 v2, v0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, s0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v1, s1
; VI-MOVREL-NEXT:    flat_store_dword v[0:1], v2
; VI-MOVREL-NEXT:    s_endpgm
;
; VI-IDXMODE-LABEL: extract_w_offset:
; VI-IDXMODE:       ; %bb.0: ; %entry
; VI-IDXMODE-NEXT:    s_load_dword s4, s[2:3], 0x2c
; VI-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, 1.0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, 2.0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v2, 0x40400000
; VI-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; VI-IDXMODE-NEXT:    s_add_i32 s4, s4, 1
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v3, 4.0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v7, 0x41000000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v8, 0x41100000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v9, 0x41200000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v10, 0x41300000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v11, 0x41400000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v12, 0x41500000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v13, 0x41600000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v14, 0x41700000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v15, 0x41800000
; VI-IDXMODE-NEXT:    s_set_gpr_idx_on s4, gpr_idx(SRC0)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v2, v0
; VI-IDXMODE-NEXT:    s_set_gpr_idx_off
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, s0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, s1
; VI-IDXMODE-NEXT:    flat_store_dword v[0:1], v2
; VI-IDXMODE-NEXT:    s_endpgm
;
; GFX9-IDXMODE-LABEL: extract_w_offset:
; GFX9-IDXMODE:       ; %bb.0: ; %entry
; GFX9-IDXMODE-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GFX9-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v16, 0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, 1.0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    s_add_i32 s4, s4, 1
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v2, 0x40400000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v3, 4.0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v7, 0x41000000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v8, 0x41100000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v9, 0x41200000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v10, 0x41300000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v11, 0x41400000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v12, 0x41500000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v13, 0x41600000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v14, 0x41700000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v15, 0x41800000
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_on s4, gpr_idx(SRC0)
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, v0
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_off
; GFX9-IDXMODE-NEXT:    global_store_dword v16, v0, s[0:1]
; GFX9-IDXMODE-NEXT:    s_endpgm
entry:
  ; Index = %in + 1. Every expected-output variant above performs this add as
  ; an explicit s_add_i32 before the element is selected.
  %idx = add i32 %in, 1
  ; Dynamic extract from a vector whose 16 elements are all compile-time constants.
  %elt = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %idx
  ; Store the selected element through the global (addrspace 1) pointer %out.
  store float %elt, ptr addrspace(1) %out
  ret void
}

; XXX: Could do v_or_b32 directly
; extract_w_offset_salu_use_vector: dynamic extractelement at index %in + 1
; from a <16 x i32> vector that is computed at run time by OR-ing the kernel
; argument %or.val with the constants 1 through 16. The selected element is
; stored to %out.
;
; What the expected output below shows:
;   * The sixteen ORs are emitted as scalar s_or_b32 instructions in every
;     llc invocation.
;   * With no -mcpu the element is then chosen by a chain of s_cmp_eq_u32 /
;     s_cselect_b32, and only the final result is moved to a VGPR.
;   * In the movrel and gpr-index variants each OR result is first copied to a
;     VGPR with v_mov_b32 so that the indexed read can address them.
define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %out, i32 %in, <16 x i32> %or.val) {
; GENERIC-LABEL: extract_w_offset_salu_use_vector:
; GENERIC:       ; %bb.0: ; %entry
; GENERIC-NEXT:    s_load_dword s20, s[2:3], 0xb
; GENERIC-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x19
; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    s_add_i32 s20, s20, 1
; GENERIC-NEXT:    s_or_b32 s2, s19, 16
; GENERIC-NEXT:    s_or_b32 s18, s18, 15
; GENERIC-NEXT:    s_or_b32 s17, s17, 14
; GENERIC-NEXT:    s_or_b32 s16, s16, 13
; GENERIC-NEXT:    s_or_b32 s15, s15, 12
; GENERIC-NEXT:    s_or_b32 s14, s14, 11
; GENERIC-NEXT:    s_or_b32 s13, s13, 10
; GENERIC-NEXT:    s_or_b32 s12, s12, 9
; GENERIC-NEXT:    s_or_b32 s11, s11, 8
; GENERIC-NEXT:    s_or_b32 s10, s10, 7
; GENERIC-NEXT:    s_or_b32 s9, s9, 6
; GENERIC-NEXT:    s_or_b32 s8, s8, 5
; GENERIC-NEXT:    s_or_b32 s7, s7, 4
; GENERIC-NEXT:    s_or_b32 s6, s6, 3
; GENERIC-NEXT:    s_or_b32 s4, s4, 1
; GENERIC-NEXT:    s_or_b32 s5, s5, 2
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 1
; GENERIC-NEXT:    s_cselect_b32 s4, s5, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 2
; GENERIC-NEXT:    s_cselect_b32 s4, s6, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 3
; GENERIC-NEXT:    s_cselect_b32 s4, s7, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 4
; GENERIC-NEXT:    s_cselect_b32 s4, s8, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 5
; GENERIC-NEXT:    s_cselect_b32 s4, s9, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 6
; GENERIC-NEXT:    s_cselect_b32 s4, s10, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 7
; GENERIC-NEXT:    s_cselect_b32 s4, s11, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 8
; GENERIC-NEXT:    s_cselect_b32 s4, s12, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 9
; GENERIC-NEXT:    s_cselect_b32 s4, s13, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 10
; GENERIC-NEXT:    s_cselect_b32 s4, s14, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 11
; GENERIC-NEXT:    s_cselect_b32 s4, s15, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 12
; GENERIC-NEXT:    s_cselect_b32 s4, s16, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 13
; GENERIC-NEXT:    s_cselect_b32 s4, s17, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 14
; GENERIC-NEXT:    s_cselect_b32 s4, s18, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 15
; GENERIC-NEXT:    s_cselect_b32 s4, s2, s4
; GENERIC-NEXT:    s_mov_b32 s2, -1
; GENERIC-NEXT:    v_mov_b32_e32 v0, s4
; GENERIC-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GENERIC-NEXT:    s_endpgm
;
; NOOPT-LABEL: extract_w_offset_salu_use_vector:
; NOOPT:       ; %bb.0: ; %entry
; NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; NOOPT-NEXT:    s_load_dword s4, s[2:3], 0xb
; NOOPT-NEXT:    s_load_dwordx16 s[36:51], s[2:3], 0x19
; NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; NOOPT-NEXT:    s_mov_b32 s7, s1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s5, 0xf000
; NOOPT-NEXT:    s_mov_b32 s6, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s7
; NOOPT-NEXT:    s_mov_b32 s2, s6
; NOOPT-NEXT:    s_mov_b32 s3, s5
; NOOPT-NEXT:    s_mov_b32 s21, 1
; NOOPT-NEXT:    s_add_i32 s4, s4, s21
; NOOPT-NEXT:    s_mov_b32 s5, s51
; NOOPT-NEXT:    s_mov_b32 s6, 16
; NOOPT-NEXT:    s_or_b32 s5, s5, s6
; NOOPT-NEXT:    s_mov_b32 s6, s50
; NOOPT-NEXT:    s_mov_b32 s7, 15
; NOOPT-NEXT:    s_or_b32 s6, s6, s7
; NOOPT-NEXT:    s_mov_b32 s7, s49
; NOOPT-NEXT:    s_mov_b32 s8, 14
; NOOPT-NEXT:    s_or_b32 s7, s7, s8
; NOOPT-NEXT:    s_mov_b32 s8, s48
; NOOPT-NEXT:    s_mov_b32 s9, 13
; NOOPT-NEXT:    s_or_b32 s8, s8, s9
; NOOPT-NEXT:    s_mov_b32 s9, s47
; NOOPT-NEXT:    s_mov_b32 s10, 12
; NOOPT-NEXT:    s_or_b32 s9, s9, s10
; NOOPT-NEXT:    s_mov_b32 s10, s46
; NOOPT-NEXT:    s_mov_b32 s11, 11
; NOOPT-NEXT:    s_or_b32 s10, s10, s11
; NOOPT-NEXT:    s_mov_b32 s11, s45
; NOOPT-NEXT:    s_mov_b32 s12, 10
; NOOPT-NEXT:    s_or_b32 s11, s11, s12
; NOOPT-NEXT:    s_mov_b32 s12, s44
; NOOPT-NEXT:    s_mov_b32 s13, 9
; NOOPT-NEXT:    s_or_b32 s12, s12, s13
; NOOPT-NEXT:    s_mov_b32 s13, s43
; NOOPT-NEXT:    s_mov_b32 s14, 8
; NOOPT-NEXT:    s_or_b32 s13, s13, s14
; NOOPT-NEXT:    s_mov_b32 s14, s42
; NOOPT-NEXT:    s_mov_b32 s15, 7
; NOOPT-NEXT:    s_or_b32 s14, s14, s15
; NOOPT-NEXT:    s_mov_b32 s15, s41
; NOOPT-NEXT:    s_mov_b32 s16, 6
; NOOPT-NEXT:    s_or_b32 s15, s15, s16
; NOOPT-NEXT:    s_mov_b32 s16, s40
; NOOPT-NEXT:    s_mov_b32 s17, 5
; NOOPT-NEXT:    s_or_b32 s16, s16, s17
; NOOPT-NEXT:    s_mov_b32 s17, s39
; NOOPT-NEXT:    s_mov_b32 s18, 4
; NOOPT-NEXT:    s_or_b32 s17, s17, s18
; NOOPT-NEXT:    s_mov_b32 s18, s38
; NOOPT-NEXT:    s_mov_b32 s19, 3
; NOOPT-NEXT:    s_or_b32 s18, s18, s19
; NOOPT-NEXT:    s_mov_b32 s19, s37
; NOOPT-NEXT:    s_mov_b32 s20, 2
; NOOPT-NEXT:    s_or_b32 s19, s19, s20
; NOOPT-NEXT:    s_mov_b32 s20, s36
; NOOPT-NEXT:    s_or_b32 s20, s20, s21
; NOOPT-NEXT:    v_mov_b32_e32 v0, s20
; NOOPT-NEXT:    v_mov_b32_e32 v30, s19
; NOOPT-NEXT:    v_mov_b32_e32 v29, s18
; NOOPT-NEXT:    v_mov_b32_e32 v28, s17
; NOOPT-NEXT:    v_mov_b32_e32 v27, s16
; NOOPT-NEXT:    v_mov_b32_e32 v26, s15
; NOOPT-NEXT:    v_mov_b32_e32 v25, s14
; NOOPT-NEXT:    v_mov_b32_e32 v24, s13
; NOOPT-NEXT:    v_mov_b32_e32 v23, s12
; NOOPT-NEXT:    v_mov_b32_e32 v22, s11
; NOOPT-NEXT:    v_mov_b32_e32 v21, s10
; NOOPT-NEXT:    v_mov_b32_e32 v20, s9
; NOOPT-NEXT:    v_mov_b32_e32 v19, s8
; NOOPT-NEXT:    v_mov_b32_e32 v18, s7
; NOOPT-NEXT:    v_mov_b32_e32 v17, s6
; NOOPT-NEXT:    v_mov_b32_e32 v16, s5
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v30
; NOOPT-NEXT:    v_mov_b32_e32 v2, v29
; NOOPT-NEXT:    v_mov_b32_e32 v3, v28
; NOOPT-NEXT:    v_mov_b32_e32 v4, v27
; NOOPT-NEXT:    v_mov_b32_e32 v5, v26
; NOOPT-NEXT:    v_mov_b32_e32 v6, v25
; NOOPT-NEXT:    v_mov_b32_e32 v7, v24
; NOOPT-NEXT:    v_mov_b32_e32 v8, v23
; NOOPT-NEXT:    v_mov_b32_e32 v9, v22
; NOOPT-NEXT:    v_mov_b32_e32 v10, v21
; NOOPT-NEXT:    v_mov_b32_e32 v11, v20
; NOOPT-NEXT:    v_mov_b32_e32 v12, v19
; NOOPT-NEXT:    v_mov_b32_e32 v13, v18
; NOOPT-NEXT:    v_mov_b32_e32 v14, v17
; NOOPT-NEXT:    v_mov_b32_e32 v15, v16
; NOOPT-NEXT:    s_mov_b32 m0, s4
; NOOPT-NEXT:    v_movrels_b32_e32 v0, v0
; NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: extract_w_offset_salu_use_vector:
; SI-MOVREL:       ; %bb.0: ; %entry
; SI-MOVREL-NEXT:    s_load_dword s20, s[2:3], 0xb
; SI-MOVREL-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x19
; SI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    s_add_i32 s20, s20, 1
; SI-MOVREL-NEXT:    s_or_b32 s4, s4, 1
; SI-MOVREL-NEXT:    s_or_b32 s19, s19, 16
; SI-MOVREL-NEXT:    s_or_b32 s18, s18, 15
; SI-MOVREL-NEXT:    s_or_b32 s17, s17, 14
; SI-MOVREL-NEXT:    s_or_b32 s16, s16, 13
; SI-MOVREL-NEXT:    s_or_b32 s15, s15, 12
; SI-MOVREL-NEXT:    s_or_b32 s14, s14, 11
; SI-MOVREL-NEXT:    s_or_b32 s13, s13, 10
; SI-MOVREL-NEXT:    s_or_b32 s12, s12, 9
; SI-MOVREL-NEXT:    s_or_b32 s11, s11, 8
; SI-MOVREL-NEXT:    s_or_b32 s10, s10, 7
; SI-MOVREL-NEXT:    s_or_b32 s9, s9, 6
; SI-MOVREL-NEXT:    s_or_b32 s8, s8, 5
; SI-MOVREL-NEXT:    s_or_b32 s7, s7, 4
; SI-MOVREL-NEXT:    s_or_b32 s6, s6, 3
; SI-MOVREL-NEXT:    s_or_b32 s5, s5, 2
; SI-MOVREL-NEXT:    v_mov_b32_e32 v0, s4
; SI-MOVREL-NEXT:    s_mov_b32 m0, s20
; SI-MOVREL-NEXT:    v_mov_b32_e32 v1, s5
; SI-MOVREL-NEXT:    v_mov_b32_e32 v2, s6
; SI-MOVREL-NEXT:    v_mov_b32_e32 v3, s7
; SI-MOVREL-NEXT:    v_mov_b32_e32 v4, s8
; SI-MOVREL-NEXT:    v_mov_b32_e32 v5, s9
; SI-MOVREL-NEXT:    v_mov_b32_e32 v6, s10
; SI-MOVREL-NEXT:    v_mov_b32_e32 v7, s11
; SI-MOVREL-NEXT:    v_mov_b32_e32 v8, s12
; SI-MOVREL-NEXT:    v_mov_b32_e32 v9, s13
; SI-MOVREL-NEXT:    v_mov_b32_e32 v10, s14
; SI-MOVREL-NEXT:    v_mov_b32_e32 v11, s15
; SI-MOVREL-NEXT:    v_mov_b32_e32 v12, s16
; SI-MOVREL-NEXT:    v_mov_b32_e32 v13, s17
; SI-MOVREL-NEXT:    v_mov_b32_e32 v14, s18
; SI-MOVREL-NEXT:    v_mov_b32_e32 v15, s19
; SI-MOVREL-NEXT:    v_movrels_b32_e32 v0, v0
; SI-MOVREL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-MOVREL-NEXT:    s_endpgm
;
; VI-MOVREL-LABEL: extract_w_offset_salu_use_vector:
; VI-MOVREL:       ; %bb.0: ; %entry
; VI-MOVREL-NEXT:    s_load_dword s20, s[2:3], 0x2c
; VI-MOVREL-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x64
; VI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; VI-MOVREL-NEXT:    s_add_i32 s20, s20, 1
; VI-MOVREL-NEXT:    s_or_b32 s6, s6, 3
; VI-MOVREL-NEXT:    s_or_b32 s5, s5, 2
; VI-MOVREL-NEXT:    s_or_b32 s4, s4, 1
; VI-MOVREL-NEXT:    s_or_b32 s2, s19, 16
; VI-MOVREL-NEXT:    s_or_b32 s3, s18, 15
; VI-MOVREL-NEXT:    s_or_b32 s17, s17, 14
; VI-MOVREL-NEXT:    s_or_b32 s16, s16, 13
; VI-MOVREL-NEXT:    s_or_b32 s15, s15, 12
; VI-MOVREL-NEXT:    s_or_b32 s14, s14, 11
; VI-MOVREL-NEXT:    s_or_b32 s13, s13, 10
; VI-MOVREL-NEXT:    s_or_b32 s12, s12, 9
; VI-MOVREL-NEXT:    s_or_b32 s11, s11, 8
; VI-MOVREL-NEXT:    s_or_b32 s10, s10, 7
; VI-MOVREL-NEXT:    s_or_b32 s9, s9, 6
; VI-MOVREL-NEXT:    s_or_b32 s8, s8, 5
; VI-MOVREL-NEXT:    s_or_b32 s7, s7, 4
; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, s4
; VI-MOVREL-NEXT:    v_mov_b32_e32 v1, s5
; VI-MOVREL-NEXT:    v_mov_b32_e32 v2, s6
; VI-MOVREL-NEXT:    s_mov_b32 m0, s20
; VI-MOVREL-NEXT:    v_mov_b32_e32 v3, s7
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, s8
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, s9
; VI-MOVREL-NEXT:    v_mov_b32_e32 v6, s10
; VI-MOVREL-NEXT:    v_mov_b32_e32 v7, s11
; VI-MOVREL-NEXT:    v_mov_b32_e32 v8, s12
; VI-MOVREL-NEXT:    v_mov_b32_e32 v9, s13
; VI-MOVREL-NEXT:    v_mov_b32_e32 v10, s14
; VI-MOVREL-NEXT:    v_mov_b32_e32 v11, s15
; VI-MOVREL-NEXT:    v_mov_b32_e32 v12, s16
; VI-MOVREL-NEXT:    v_mov_b32_e32 v13, s17
; VI-MOVREL-NEXT:    v_mov_b32_e32 v14, s3
; VI-MOVREL-NEXT:    v_mov_b32_e32 v15, s2
; VI-MOVREL-NEXT:    v_movrels_b32_e32 v2, v0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, s0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v1, s1
; VI-MOVREL-NEXT:    flat_store_dword v[0:1], v2
; VI-MOVREL-NEXT:    s_endpgm
;
; VI-IDXMODE-LABEL: extract_w_offset_salu_use_vector:
; VI-IDXMODE:       ; %bb.0: ; %entry
; VI-IDXMODE-NEXT:    s_load_dword s20, s[2:3], 0x2c
; VI-IDXMODE-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x64
; VI-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; VI-IDXMODE-NEXT:    s_add_i32 s20, s20, 1
; VI-IDXMODE-NEXT:    s_or_b32 s6, s6, 3
; VI-IDXMODE-NEXT:    s_or_b32 s5, s5, 2
; VI-IDXMODE-NEXT:    s_or_b32 s4, s4, 1
; VI-IDXMODE-NEXT:    s_or_b32 s2, s19, 16
; VI-IDXMODE-NEXT:    s_or_b32 s3, s18, 15
; VI-IDXMODE-NEXT:    s_or_b32 s17, s17, 14
; VI-IDXMODE-NEXT:    s_or_b32 s16, s16, 13
; VI-IDXMODE-NEXT:    s_or_b32 s15, s15, 12
; VI-IDXMODE-NEXT:    s_or_b32 s14, s14, 11
; VI-IDXMODE-NEXT:    s_or_b32 s13, s13, 10
; VI-IDXMODE-NEXT:    s_or_b32 s12, s12, 9
; VI-IDXMODE-NEXT:    s_or_b32 s11, s11, 8
; VI-IDXMODE-NEXT:    s_or_b32 s10, s10, 7
; VI-IDXMODE-NEXT:    s_or_b32 s9, s9, 6
; VI-IDXMODE-NEXT:    s_or_b32 s8, s8, 5
; VI-IDXMODE-NEXT:    s_or_b32 s7, s7, 4
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, s4
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, s5
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v2, s6
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v3, s7
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, s8
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, s9
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v6, s10
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v7, s11
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v8, s12
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v9, s13
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v10, s14
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v11, s15
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v12, s16
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v13, s17
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v14, s3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v15, s2
; VI-IDXMODE-NEXT:    s_set_gpr_idx_on s20, gpr_idx(SRC0)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v2, v0
; VI-IDXMODE-NEXT:    s_set_gpr_idx_off
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, s0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, s1
; VI-IDXMODE-NEXT:    flat_store_dword v[0:1], v2
; VI-IDXMODE-NEXT:    s_endpgm
;
; GFX9-IDXMODE-LABEL: extract_w_offset_salu_use_vector:
; GFX9-IDXMODE:       ; %bb.0: ; %entry
; GFX9-IDXMODE-NEXT:    s_load_dword s20, s[2:3], 0x2c
; GFX9-IDXMODE-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x64
; GFX9-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v16, 0
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    s_add_i32 s20, s20, 1
; GFX9-IDXMODE-NEXT:    s_or_b32 s4, s4, 1
; GFX9-IDXMODE-NEXT:    s_or_b32 s2, s19, 16
; GFX9-IDXMODE-NEXT:    s_or_b32 s3, s18, 15
; GFX9-IDXMODE-NEXT:    s_or_b32 s17, s17, 14
; GFX9-IDXMODE-NEXT:    s_or_b32 s16, s16, 13
; GFX9-IDXMODE-NEXT:    s_or_b32 s15, s15, 12
; GFX9-IDXMODE-NEXT:    s_or_b32 s14, s14, 11
; GFX9-IDXMODE-NEXT:    s_or_b32 s13, s13, 10
; GFX9-IDXMODE-NEXT:    s_or_b32 s12, s12, 9
; GFX9-IDXMODE-NEXT:    s_or_b32 s11, s11, 8
; GFX9-IDXMODE-NEXT:    s_or_b32 s10, s10, 7
; GFX9-IDXMODE-NEXT:    s_or_b32 s9, s9, 6
; GFX9-IDXMODE-NEXT:    s_or_b32 s8, s8, 5
; GFX9-IDXMODE-NEXT:    s_or_b32 s7, s7, 4
; GFX9-IDXMODE-NEXT:    s_or_b32 s6, s6, 3
; GFX9-IDXMODE-NEXT:    s_or_b32 s5, s5, 2
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v2, s6
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v3, s7
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v4, s8
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v5, s9
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v6, s10
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v7, s11
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v8, s12
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v9, s13
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v10, s14
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v11, s15
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v12, s16
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v13, s17
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v14, s3
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v15, s2
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_on s20, gpr_idx(SRC0)
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, v0
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_off
; GFX9-IDXMODE-NEXT:    global_store_dword v16, v0, s[0:1]
; GFX9-IDXMODE-NEXT:    s_endpgm
entry:
  ; Index = %in + 1 (an explicit s_add_i32 in every expected-output variant).
  %idx = add i32 %in, 1
  ; Build the vector at run time: element i is %or.val[i] | (i + 1).
  %vec = or <16 x i32> %or.val, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
  ; Dynamic extract from the computed vector.
  %elt = extractelement <16 x i32> %vec, i32 %idx
  ; Store the selected element through the global (addrspace 1) pointer %out.
  store i32 %elt, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @extract_wo_offset(ptr addrspace(1) %out, i32 %in) {
; GENERIC-LABEL: extract_wo_offset:
; GENERIC:       ; %bb.0: ; %entry
; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GENERIC-NEXT:    s_load_dword s6, s[2:3], 0xb
; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
; GENERIC-NEXT:    s_mov_b32 s2, -1
; GENERIC-NEXT:    v_mov_b32_e32 v0, 0x40400000
; GENERIC-NEXT:    v_mov_b32_e32 v1, 0x40a00000
; GENERIC-NEXT:    v_mov_b32_e32 v2, 0x40c00000
; GENERIC-NEXT:    v_mov_b32_e32 v3, 0x40e00000
; GENERIC-NEXT:    v_mov_b32_e32 v4, 0x41000000
; GENERIC-NEXT:    v_mov_b32_e32 v5, 0x41100000
; GENERIC-NEXT:    v_mov_b32_e32 v6, 0x41200000
; GENERIC-NEXT:    v_mov_b32_e32 v7, 0x41300000
; GENERIC-NEXT:    v_mov_b32_e32 v8, 0x41400000
; GENERIC-NEXT:    v_mov_b32_e32 v9, 0x41500000
; GENERIC-NEXT:    v_mov_b32_e32 v10, 0x41600000
; GENERIC-NEXT:    v_mov_b32_e32 v11, 0x41700000
; GENERIC-NEXT:    v_mov_b32_e32 v12, 0x41800000
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    s_cmp_eq_u32 s6, 1
; GENERIC-NEXT:    s_cselect_b64 s[4:5], -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e64 v13, 1.0, 2.0, s[4:5]
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 2
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 3
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, 4.0, v0, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 4
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 5
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 6
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 7
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 8
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 9
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 10
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 11
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 12
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 13
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 14
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s6, 15
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
; GENERIC-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GENERIC-NEXT:    s_endpgm
;
; NOOPT-LABEL: extract_wo_offset:
; NOOPT:       ; %bb.0: ; %entry
; NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; NOOPT-NEXT:    s_load_dword s4, s[2:3], 0xb
; NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; NOOPT-NEXT:    s_mov_b32 s7, s1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s5, 0xf000
; NOOPT-NEXT:    s_mov_b32 s6, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s7
; NOOPT-NEXT:    s_mov_b32 s2, s6
; NOOPT-NEXT:    s_mov_b32 s3, s5
; NOOPT-NEXT:    s_mov_b32 s5, 0x41800000
; NOOPT-NEXT:    s_mov_b32 s6, 0x41700000
; NOOPT-NEXT:    s_mov_b32 s7, 0x41600000
; NOOPT-NEXT:    s_mov_b32 s8, 0x41500000
; NOOPT-NEXT:    s_mov_b32 s9, 0x41400000
; NOOPT-NEXT:    s_mov_b32 s10, 0x41300000
; NOOPT-NEXT:    s_mov_b32 s11, 0x41200000
; NOOPT-NEXT:    s_mov_b32 s12, 0x41100000
; NOOPT-NEXT:    s_mov_b32 s13, 0x41000000
; NOOPT-NEXT:    s_mov_b32 s14, 0x40e00000
; NOOPT-NEXT:    s_mov_b32 s15, 0x40c00000
; NOOPT-NEXT:    s_mov_b32 s16, 0x40a00000
; NOOPT-NEXT:    s_mov_b32 s17, 4.0
; NOOPT-NEXT:    s_mov_b32 s18, 0x40400000
; NOOPT-NEXT:    s_mov_b32 s19, 2.0
; NOOPT-NEXT:    s_mov_b32 s20, 1.0
; NOOPT-NEXT:    v_mov_b32_e32 v0, s20
; NOOPT-NEXT:    v_mov_b32_e32 v30, s19
; NOOPT-NEXT:    v_mov_b32_e32 v29, s18
; NOOPT-NEXT:    v_mov_b32_e32 v28, s17
; NOOPT-NEXT:    v_mov_b32_e32 v27, s16
; NOOPT-NEXT:    v_mov_b32_e32 v26, s15
; NOOPT-NEXT:    v_mov_b32_e32 v25, s14
; NOOPT-NEXT:    v_mov_b32_e32 v24, s13
; NOOPT-NEXT:    v_mov_b32_e32 v23, s12
; NOOPT-NEXT:    v_mov_b32_e32 v22, s11
; NOOPT-NEXT:    v_mov_b32_e32 v21, s10
; NOOPT-NEXT:    v_mov_b32_e32 v20, s9
; NOOPT-NEXT:    v_mov_b32_e32 v19, s8
; NOOPT-NEXT:    v_mov_b32_e32 v18, s7
; NOOPT-NEXT:    v_mov_b32_e32 v17, s6
; NOOPT-NEXT:    v_mov_b32_e32 v16, s5
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v30
; NOOPT-NEXT:    v_mov_b32_e32 v2, v29
; NOOPT-NEXT:    v_mov_b32_e32 v3, v28
; NOOPT-NEXT:    v_mov_b32_e32 v4, v27
; NOOPT-NEXT:    v_mov_b32_e32 v5, v26
; NOOPT-NEXT:    v_mov_b32_e32 v6, v25
; NOOPT-NEXT:    v_mov_b32_e32 v7, v24
; NOOPT-NEXT:    v_mov_b32_e32 v8, v23
; NOOPT-NEXT:    v_mov_b32_e32 v9, v22
; NOOPT-NEXT:    v_mov_b32_e32 v10, v21
; NOOPT-NEXT:    v_mov_b32_e32 v11, v20
; NOOPT-NEXT:    v_mov_b32_e32 v12, v19
; NOOPT-NEXT:    v_mov_b32_e32 v13, v18
; NOOPT-NEXT:    v_mov_b32_e32 v14, v17
; NOOPT-NEXT:    v_mov_b32_e32 v15, v16
; NOOPT-NEXT:    s_mov_b32 m0, s4
; NOOPT-NEXT:    v_movrels_b32_e32 v0, v0
; NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: extract_wo_offset:
; SI-MOVREL:       ; %bb.0: ; %entry
; SI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-MOVREL-NEXT:    s_load_dword s4, s[2:3], 0xb
; SI-MOVREL-NEXT:    v_mov_b32_e32 v0, 1.0
; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
; SI-MOVREL-NEXT:    v_mov_b32_e32 v1, 2.0
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    s_mov_b32 m0, s4
; SI-MOVREL-NEXT:    v_mov_b32_e32 v2, 0x40400000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v3, 4.0
; SI-MOVREL-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v7, 0x41000000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v8, 0x41100000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v9, 0x41200000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v10, 0x41300000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v11, 0x41400000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v12, 0x41500000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v13, 0x41600000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v14, 0x41700000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v15, 0x41800000
; SI-MOVREL-NEXT:    v_movrels_b32_e32 v0, v0
; SI-MOVREL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-MOVREL-NEXT:    s_endpgm
;
; VI-MOVREL-LABEL: extract_wo_offset:
; VI-MOVREL:       ; %bb.0: ; %entry
; VI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-MOVREL-NEXT:    s_load_dword s2, s[2:3], 0x2c
; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, 1.0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v1, 2.0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v2, 0x40400000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v3, 4.0
; VI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; VI-MOVREL-NEXT:    s_mov_b32 m0, s2
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v7, 0x41000000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v8, 0x41100000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v9, 0x41200000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v10, 0x41300000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v11, 0x41400000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v12, 0x41500000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v13, 0x41600000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v14, 0x41700000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v15, 0x41800000
; VI-MOVREL-NEXT:    v_movrels_b32_e32 v2, v0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, s0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v1, s1
; VI-MOVREL-NEXT:    flat_store_dword v[0:1], v2
; VI-MOVREL-NEXT:    s_endpgm
;
; VI-IDXMODE-LABEL: extract_wo_offset:
; VI-IDXMODE:       ; %bb.0: ; %entry
; VI-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-IDXMODE-NEXT:    s_load_dword s2, s[2:3], 0x2c
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, 1.0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, 2.0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v2, 0x40400000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v3, 4.0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v7, 0x41000000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v8, 0x41100000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v9, 0x41200000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v10, 0x41300000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v11, 0x41400000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v12, 0x41500000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v13, 0x41600000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v14, 0x41700000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v15, 0x41800000
; VI-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; VI-IDXMODE-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v2, v0
; VI-IDXMODE-NEXT:    s_set_gpr_idx_off
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, s0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, s1
; VI-IDXMODE-NEXT:    flat_store_dword v[0:1], v2
; VI-IDXMODE-NEXT:    s_endpgm
;
; GFX9-IDXMODE-LABEL: extract_wo_offset:
; GFX9-IDXMODE:       ; %bb.0: ; %entry
; GFX9-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-IDXMODE-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v16, 0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, 1.0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v2, 0x40400000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v3, 4.0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v7, 0x41000000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v8, 0x41100000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v9, 0x41200000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v10, 0x41300000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v11, 0x41400000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v12, 0x41500000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v13, 0x41600000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v14, 0x41700000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v15, 0x41800000
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_on s4, gpr_idx(SRC0)
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, v0
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_off
; GFX9-IDXMODE-NEXT:    global_store_dword v16, v0, s[0:1]
; GFX9-IDXMODE-NEXT:    s_endpgm
entry:
  %elt = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %in
  store float %elt, ptr addrspace(1) %out
  ret void
}

; Dynamic extractelement from a constant <16 x i32> vector whose index is the
; kernel argument %offset plus a constant -512 (0xfffffe00 as a 32-bit value).
; In the checks below, the NOOPT and MOVREL runs write the sum straight into m0
; with s_add_i32, the IDXMODE runs add with s_addk_i32 and then use
; s_set_gpr_idx_on, and the GENERIC run uses a compare/select chain instead.
; Note that the constant vector has no element with value 4 (it goes 3, 5).
define amdgpu_kernel void @extract_neg_offset_sgpr(ptr addrspace(1) %out, i32 %offset) {
; GENERIC-LABEL: extract_neg_offset_sgpr:
; GENERIC:       ; %bb.0: ; %entry
; GENERIC-NEXT:    s_load_dword s4, s[2:3], 0xb
; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    s_add_i32 s2, s4, 0xfffffe00
; GENERIC-NEXT:    s_cmp_eq_u32 s2, 1
; GENERIC-NEXT:    s_cselect_b64 s[4:5], -1, 0
; GENERIC-NEXT:    s_cmp_lg_u32 s2, 2
; GENERIC-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GENERIC-NEXT:    v_readfirstlane_b32 s4, v0
; GENERIC-NEXT:    s_cselect_b32 s4, s4, 2
; GENERIC-NEXT:    s_cmp_lg_u32 s2, 3
; GENERIC-NEXT:    s_cselect_b32 s4, s4, 3
; GENERIC-NEXT:    s_cmp_lg_u32 s2, 4
; GENERIC-NEXT:    s_cselect_b32 s4, s4, 5
; GENERIC-NEXT:    s_cmp_lg_u32 s2, 5
; GENERIC-NEXT:    s_cselect_b32 s4, s4, 6
; GENERIC-NEXT:    s_cmp_lg_u32 s2, 6
; GENERIC-NEXT:    s_cselect_b32 s4, s4, 7
; GENERIC-NEXT:    s_cmp_lg_u32 s2, 7
; GENERIC-NEXT:    s_cselect_b32 s4, s4, 8
; GENERIC-NEXT:    s_cmp_lg_u32 s2, 8
; GENERIC-NEXT:    s_cselect_b32 s4, s4, 9
; GENERIC-NEXT:    s_cmp_lg_u32 s2, 9
; GENERIC-NEXT:    s_cselect_b32 s4, s4, 10
; GENERIC-NEXT:    s_cmp_lg_u32 s2, 10
; GENERIC-NEXT:    s_cselect_b32 s4, s4, 11
; GENERIC-NEXT:    s_cmp_lg_u32 s2, 11
; GENERIC-NEXT:    s_cselect_b32 s4, s4, 12
; GENERIC-NEXT:    s_cmp_lg_u32 s2, 12
; GENERIC-NEXT:    s_cselect_b32 s4, s4, 13
; GENERIC-NEXT:    s_cmp_lg_u32 s2, 13
; GENERIC-NEXT:    s_cselect_b32 s4, s4, 14
; GENERIC-NEXT:    s_cmp_lg_u32 s2, 14
; GENERIC-NEXT:    s_cselect_b32 s4, s4, 15
; GENERIC-NEXT:    s_cmp_lg_u32 s2, 15
; GENERIC-NEXT:    s_cselect_b32 s4, s4, 16
; GENERIC-NEXT:    s_mov_b32 s2, -1
; GENERIC-NEXT:    v_mov_b32_e32 v0, s4
; GENERIC-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GENERIC-NEXT:    s_endpgm
;
; NOOPT-LABEL: extract_neg_offset_sgpr:
; NOOPT:       ; %bb.0: ; %entry
; NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; NOOPT-NEXT:    s_load_dword s4, s[2:3], 0xb
; NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; NOOPT-NEXT:    s_mov_b32 s7, s1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s5, 0xf000
; NOOPT-NEXT:    s_mov_b32 s6, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s7
; NOOPT-NEXT:    s_mov_b32 s2, s6
; NOOPT-NEXT:    s_mov_b32 s3, s5
; NOOPT-NEXT:    s_mov_b32 s5, 16
; NOOPT-NEXT:    s_mov_b32 s6, 15
; NOOPT-NEXT:    s_mov_b32 s7, 14
; NOOPT-NEXT:    s_mov_b32 s8, 13
; NOOPT-NEXT:    s_mov_b32 s9, 12
; NOOPT-NEXT:    s_mov_b32 s10, 11
; NOOPT-NEXT:    s_mov_b32 s11, 10
; NOOPT-NEXT:    s_mov_b32 s12, 9
; NOOPT-NEXT:    s_mov_b32 s13, 8
; NOOPT-NEXT:    s_mov_b32 s14, 7
; NOOPT-NEXT:    s_mov_b32 s15, 6
; NOOPT-NEXT:    s_mov_b32 s16, 5
; NOOPT-NEXT:    s_mov_b32 s17, 3
; NOOPT-NEXT:    s_mov_b32 s18, 2
; NOOPT-NEXT:    s_mov_b32 s19, 1
; NOOPT-NEXT:    s_mov_b32 s20, 0
; NOOPT-NEXT:    v_mov_b32_e32 v0, s20
; NOOPT-NEXT:    v_mov_b32_e32 v30, s19
; NOOPT-NEXT:    v_mov_b32_e32 v29, s18
; NOOPT-NEXT:    v_mov_b32_e32 v28, s17
; NOOPT-NEXT:    v_mov_b32_e32 v27, s16
; NOOPT-NEXT:    v_mov_b32_e32 v26, s15
; NOOPT-NEXT:    v_mov_b32_e32 v25, s14
; NOOPT-NEXT:    v_mov_b32_e32 v24, s13
; NOOPT-NEXT:    v_mov_b32_e32 v23, s12
; NOOPT-NEXT:    v_mov_b32_e32 v22, s11
; NOOPT-NEXT:    v_mov_b32_e32 v21, s10
; NOOPT-NEXT:    v_mov_b32_e32 v20, s9
; NOOPT-NEXT:    v_mov_b32_e32 v19, s8
; NOOPT-NEXT:    v_mov_b32_e32 v18, s7
; NOOPT-NEXT:    v_mov_b32_e32 v17, s6
; NOOPT-NEXT:    v_mov_b32_e32 v16, s5
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v30
; NOOPT-NEXT:    v_mov_b32_e32 v2, v29
; NOOPT-NEXT:    v_mov_b32_e32 v3, v28
; NOOPT-NEXT:    v_mov_b32_e32 v4, v27
; NOOPT-NEXT:    v_mov_b32_e32 v5, v26
; NOOPT-NEXT:    v_mov_b32_e32 v6, v25
; NOOPT-NEXT:    v_mov_b32_e32 v7, v24
; NOOPT-NEXT:    v_mov_b32_e32 v8, v23
; NOOPT-NEXT:    v_mov_b32_e32 v9, v22
; NOOPT-NEXT:    v_mov_b32_e32 v10, v21
; NOOPT-NEXT:    v_mov_b32_e32 v11, v20
; NOOPT-NEXT:    v_mov_b32_e32 v12, v19
; NOOPT-NEXT:    v_mov_b32_e32 v13, v18
; NOOPT-NEXT:    v_mov_b32_e32 v14, v17
; NOOPT-NEXT:    v_mov_b32_e32 v15, v16
; NOOPT-NEXT:    s_add_i32 m0, s4, 0xfffffe00
; NOOPT-NEXT:    v_movrels_b32_e32 v0, v0
; NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: extract_neg_offset_sgpr:
; SI-MOVREL:       ; %bb.0: ; %entry
; SI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-MOVREL-NEXT:    s_load_dword s4, s[2:3], 0xb
; SI-MOVREL-NEXT:    v_mov_b32_e32 v0, 0
; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
; SI-MOVREL-NEXT:    v_mov_b32_e32 v1, 1
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    s_add_i32 m0, s4, 0xfffffe00
; SI-MOVREL-NEXT:    v_mov_b32_e32 v2, 2
; SI-MOVREL-NEXT:    v_mov_b32_e32 v3, 3
; SI-MOVREL-NEXT:    v_mov_b32_e32 v4, 5
; SI-MOVREL-NEXT:    v_mov_b32_e32 v5, 6
; SI-MOVREL-NEXT:    v_mov_b32_e32 v6, 7
; SI-MOVREL-NEXT:    v_mov_b32_e32 v7, 8
; SI-MOVREL-NEXT:    v_mov_b32_e32 v8, 9
; SI-MOVREL-NEXT:    v_mov_b32_e32 v9, 10
; SI-MOVREL-NEXT:    v_mov_b32_e32 v10, 11
; SI-MOVREL-NEXT:    v_mov_b32_e32 v11, 12
; SI-MOVREL-NEXT:    v_mov_b32_e32 v12, 13
; SI-MOVREL-NEXT:    v_mov_b32_e32 v13, 14
; SI-MOVREL-NEXT:    v_mov_b32_e32 v14, 15
; SI-MOVREL-NEXT:    v_mov_b32_e32 v15, 16
; SI-MOVREL-NEXT:    v_movrels_b32_e32 v0, v0
; SI-MOVREL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-MOVREL-NEXT:    s_endpgm
;
; VI-MOVREL-LABEL: extract_neg_offset_sgpr:
; VI-MOVREL:       ; %bb.0: ; %entry
; VI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-MOVREL-NEXT:    s_load_dword s2, s[2:3], 0x2c
; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v1, 1
; VI-MOVREL-NEXT:    v_mov_b32_e32 v2, 2
; VI-MOVREL-NEXT:    v_mov_b32_e32 v3, 3
; VI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; VI-MOVREL-NEXT:    s_add_i32 m0, s2, 0xfffffe00
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, 5
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, 6
; VI-MOVREL-NEXT:    v_mov_b32_e32 v6, 7
; VI-MOVREL-NEXT:    v_mov_b32_e32 v7, 8
; VI-MOVREL-NEXT:    v_mov_b32_e32 v8, 9
; VI-MOVREL-NEXT:    v_mov_b32_e32 v9, 10
; VI-MOVREL-NEXT:    v_mov_b32_e32 v10, 11
; VI-MOVREL-NEXT:    v_mov_b32_e32 v11, 12
; VI-MOVREL-NEXT:    v_mov_b32_e32 v12, 13
; VI-MOVREL-NEXT:    v_mov_b32_e32 v13, 14
; VI-MOVREL-NEXT:    v_mov_b32_e32 v14, 15
; VI-MOVREL-NEXT:    v_mov_b32_e32 v15, 16
; VI-MOVREL-NEXT:    v_movrels_b32_e32 v2, v0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, s0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v1, s1
; VI-MOVREL-NEXT:    flat_store_dword v[0:1], v2
; VI-MOVREL-NEXT:    s_endpgm
;
; VI-IDXMODE-LABEL: extract_neg_offset_sgpr:
; VI-IDXMODE:       ; %bb.0: ; %entry
; VI-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-IDXMODE-NEXT:    s_load_dword s2, s[2:3], 0x2c
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, 1
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v2, 2
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v3, 3
; VI-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; VI-IDXMODE-NEXT:    s_addk_i32 s2, 0xfe00
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, 5
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, 6
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v6, 7
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v7, 8
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v8, 9
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v9, 10
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v10, 11
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v11, 12
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v12, 13
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v13, 14
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v14, 15
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v15, 16
; VI-IDXMODE-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v2, v0
; VI-IDXMODE-NEXT:    s_set_gpr_idx_off
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, s0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, s1
; VI-IDXMODE-NEXT:    flat_store_dword v[0:1], v2
; VI-IDXMODE-NEXT:    s_endpgm
;
; GFX9-IDXMODE-LABEL: extract_neg_offset_sgpr:
; GFX9-IDXMODE:       ; %bb.0: ; %entry
; GFX9-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-IDXMODE-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    s_addk_i32 s4, 0xfe00
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v4, 5
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v5, 6
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v6, 7
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v7, 8
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v8, 9
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v9, 10
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v10, 11
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v11, 12
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v12, 13
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v13, 14
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v14, 15
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v15, 16
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_on s4, gpr_idx(SRC0)
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_off
; GFX9-IDXMODE-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-IDXMODE-NEXT:    s_endpgm
entry:
  ; -512 shows up in the checked add instructions as the immediate 0xfffffe00
  ; (32-bit form) or 0xfe00 (16-bit form used by s_addk_i32).
  %index = add i32 %offset, -512
  %value = extractelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %index
  store i32 %value, ptr addrspace(1) %out
  ret void
}

; Variant of the negative-offset extract where the vector is not a constant:
; it is the element-wise or of the two <16 x i32> kernel arguments %vec0 and
; %vec1. The index is again the kernel argument %offset plus -512.
; The checks below show sixteen scalar s_or_b32 ops forming the vector before
; the indexed read (or, in the GENERIC run, before the compare/select chain).
define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, <16 x i32> %vec0, <16 x i32> %vec1, i32 %offset) {
; GENERIC-LABEL: extract_neg_offset_sgpr_loaded:
; GENERIC:       ; %bb.0: ; %entry
; GENERIC-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x19
; GENERIC-NEXT:    s_load_dword s20, s[2:3], 0x39
; GENERIC-NEXT:    s_load_dwordx16 s[36:51], s[2:3], 0x29
; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    s_addk_i32 s20, 0xfe00
; GENERIC-NEXT:    s_or_b32 s2, s19, s51
; GENERIC-NEXT:    s_or_b32 s18, s18, s50
; GENERIC-NEXT:    s_or_b32 s17, s17, s49
; GENERIC-NEXT:    s_or_b32 s16, s16, s48
; GENERIC-NEXT:    s_or_b32 s15, s15, s47
; GENERIC-NEXT:    s_or_b32 s14, s14, s46
; GENERIC-NEXT:    s_or_b32 s13, s13, s45
; GENERIC-NEXT:    s_or_b32 s12, s12, s44
; GENERIC-NEXT:    s_or_b32 s11, s11, s43
; GENERIC-NEXT:    s_or_b32 s10, s10, s42
; GENERIC-NEXT:    s_or_b32 s9, s9, s41
; GENERIC-NEXT:    s_or_b32 s8, s8, s40
; GENERIC-NEXT:    s_or_b32 s7, s7, s39
; GENERIC-NEXT:    s_or_b32 s6, s6, s38
; GENERIC-NEXT:    s_or_b32 s4, s4, s36
; GENERIC-NEXT:    s_or_b32 s5, s5, s37
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 1
; GENERIC-NEXT:    s_cselect_b32 s4, s5, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 2
; GENERIC-NEXT:    s_cselect_b32 s4, s6, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 3
; GENERIC-NEXT:    s_cselect_b32 s4, s7, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 4
; GENERIC-NEXT:    s_cselect_b32 s4, s8, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 5
; GENERIC-NEXT:    s_cselect_b32 s4, s9, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 6
; GENERIC-NEXT:    s_cselect_b32 s4, s10, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 7
; GENERIC-NEXT:    s_cselect_b32 s4, s11, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 8
; GENERIC-NEXT:    s_cselect_b32 s4, s12, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 9
; GENERIC-NEXT:    s_cselect_b32 s4, s13, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 10
; GENERIC-NEXT:    s_cselect_b32 s4, s14, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 11
; GENERIC-NEXT:    s_cselect_b32 s4, s15, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 12
; GENERIC-NEXT:    s_cselect_b32 s4, s16, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 13
; GENERIC-NEXT:    s_cselect_b32 s4, s17, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 14
; GENERIC-NEXT:    s_cselect_b32 s4, s18, s4
; GENERIC-NEXT:    s_cmp_eq_u32 s20, 15
; GENERIC-NEXT:    s_cselect_b32 s4, s2, s4
; GENERIC-NEXT:    s_mov_b32 s2, -1
; GENERIC-NEXT:    v_mov_b32_e32 v0, s4
; GENERIC-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GENERIC-NEXT:    s_endpgm
;
; NOOPT-LABEL: extract_neg_offset_sgpr_loaded:
; NOOPT:       ; %bb.0: ; %entry
; NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; NOOPT-NEXT:    s_load_dwordx16 s[36:51], s[2:3], 0x19
; NOOPT-NEXT:    s_load_dwordx16 s[52:67], s[2:3], 0x29
; NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x39
; NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; NOOPT-NEXT:    s_mov_b32 s7, s1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s5, 0xf000
; NOOPT-NEXT:    s_mov_b32 s6, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s7
; NOOPT-NEXT:    s_mov_b32 s2, s6
; NOOPT-NEXT:    s_mov_b32 s3, s5
; NOOPT-NEXT:    s_mov_b32 s6, s67
; NOOPT-NEXT:    s_mov_b32 s5, s51
; NOOPT-NEXT:    s_or_b32 s5, s5, s6
; NOOPT-NEXT:    s_mov_b32 s7, s66
; NOOPT-NEXT:    s_mov_b32 s6, s50
; NOOPT-NEXT:    s_or_b32 s6, s6, s7
; NOOPT-NEXT:    s_mov_b32 s8, s65
; NOOPT-NEXT:    s_mov_b32 s7, s49
; NOOPT-NEXT:    s_or_b32 s7, s7, s8
; NOOPT-NEXT:    s_mov_b32 s9, s64
; NOOPT-NEXT:    s_mov_b32 s8, s48
; NOOPT-NEXT:    s_or_b32 s8, s8, s9
; NOOPT-NEXT:    s_mov_b32 s10, s63
; NOOPT-NEXT:    s_mov_b32 s9, s47
; NOOPT-NEXT:    s_or_b32 s9, s9, s10
; NOOPT-NEXT:    s_mov_b32 s11, s62
; NOOPT-NEXT:    s_mov_b32 s10, s46
; NOOPT-NEXT:    s_or_b32 s10, s10, s11
; NOOPT-NEXT:    s_mov_b32 s12, s61
; NOOPT-NEXT:    s_mov_b32 s11, s45
; NOOPT-NEXT:    s_or_b32 s11, s11, s12
; NOOPT-NEXT:    s_mov_b32 s13, s60
; NOOPT-NEXT:    s_mov_b32 s12, s44
; NOOPT-NEXT:    s_or_b32 s12, s12, s13
; NOOPT-NEXT:    s_mov_b32 s14, s59
; NOOPT-NEXT:    s_mov_b32 s13, s43
; NOOPT-NEXT:    s_or_b32 s13, s13, s14
; NOOPT-NEXT:    s_mov_b32 s15, s58
; NOOPT-NEXT:    s_mov_b32 s14, s42
; NOOPT-NEXT:    s_or_b32 s14, s14, s15
; NOOPT-NEXT:    s_mov_b32 s16, s57
; NOOPT-NEXT:    s_mov_b32 s15, s41
; NOOPT-NEXT:    s_or_b32 s15, s15, s16
; NOOPT-NEXT:    s_mov_b32 s17, s56
; NOOPT-NEXT:    s_mov_b32 s16, s40
; NOOPT-NEXT:    s_or_b32 s16, s16, s17
; NOOPT-NEXT:    s_mov_b32 s18, s55
; NOOPT-NEXT:    s_mov_b32 s17, s39
; NOOPT-NEXT:    s_or_b32 s17, s17, s18
; NOOPT-NEXT:    s_mov_b32 s19, s54
; NOOPT-NEXT:    s_mov_b32 s18, s38
; NOOPT-NEXT:    s_or_b32 s18, s18, s19
; NOOPT-NEXT:    s_mov_b32 s20, s53
; NOOPT-NEXT:    s_mov_b32 s19, s37
; NOOPT-NEXT:    s_or_b32 s19, s19, s20
; NOOPT-NEXT:    s_mov_b32 s21, s52
; NOOPT-NEXT:    s_mov_b32 s20, s36
; NOOPT-NEXT:    s_or_b32 s20, s20, s21
; NOOPT-NEXT:    v_mov_b32_e32 v0, s20
; NOOPT-NEXT:    v_mov_b32_e32 v30, s19
; NOOPT-NEXT:    v_mov_b32_e32 v29, s18
; NOOPT-NEXT:    v_mov_b32_e32 v28, s17
; NOOPT-NEXT:    v_mov_b32_e32 v27, s16
; NOOPT-NEXT:    v_mov_b32_e32 v26, s15
; NOOPT-NEXT:    v_mov_b32_e32 v25, s14
; NOOPT-NEXT:    v_mov_b32_e32 v24, s13
; NOOPT-NEXT:    v_mov_b32_e32 v23, s12
; NOOPT-NEXT:    v_mov_b32_e32 v22, s11
; NOOPT-NEXT:    v_mov_b32_e32 v21, s10
; NOOPT-NEXT:    v_mov_b32_e32 v20, s9
; NOOPT-NEXT:    v_mov_b32_e32 v19, s8
; NOOPT-NEXT:    v_mov_b32_e32 v18, s7
; NOOPT-NEXT:    v_mov_b32_e32 v17, s6
; NOOPT-NEXT:    v_mov_b32_e32 v16, s5
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v30
; NOOPT-NEXT:    v_mov_b32_e32 v2, v29
; NOOPT-NEXT:    v_mov_b32_e32 v3, v28
; NOOPT-NEXT:    v_mov_b32_e32 v4, v27
; NOOPT-NEXT:    v_mov_b32_e32 v5, v26
; NOOPT-NEXT:    v_mov_b32_e32 v6, v25
; NOOPT-NEXT:    v_mov_b32_e32 v7, v24
; NOOPT-NEXT:    v_mov_b32_e32 v8, v23
; NOOPT-NEXT:    v_mov_b32_e32 v9, v22
; NOOPT-NEXT:    v_mov_b32_e32 v10, v21
; NOOPT-NEXT:    v_mov_b32_e32 v11, v20
; NOOPT-NEXT:    v_mov_b32_e32 v12, v19
; NOOPT-NEXT:    v_mov_b32_e32 v13, v18
; NOOPT-NEXT:    v_mov_b32_e32 v14, v17
; NOOPT-NEXT:    v_mov_b32_e32 v15, v16
; NOOPT-NEXT:    s_add_i32 m0, s4, 0xfffffe00
; NOOPT-NEXT:    v_movrels_b32_e32 v0, v0
; NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: extract_neg_offset_sgpr_loaded:
; SI-MOVREL:       ; %bb.0: ; %entry
; SI-MOVREL-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x19
; SI-MOVREL-NEXT:    s_load_dwordx16 s[36:51], s[2:3], 0x29
; SI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-MOVREL-NEXT:    s_load_dword s20, s[2:3], 0x39
; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    s_or_b32 s4, s4, s36
; SI-MOVREL-NEXT:    s_or_b32 s19, s19, s51
; SI-MOVREL-NEXT:    s_or_b32 s18, s18, s50
; SI-MOVREL-NEXT:    s_or_b32 s17, s17, s49
; SI-MOVREL-NEXT:    s_or_b32 s16, s16, s48
; SI-MOVREL-NEXT:    s_or_b32 s15, s15, s47
; SI-MOVREL-NEXT:    s_or_b32 s14, s14, s46
; SI-MOVREL-NEXT:    s_or_b32 s13, s13, s45
; SI-MOVREL-NEXT:    s_or_b32 s12, s12, s44
; SI-MOVREL-NEXT:    s_or_b32 s11, s11, s43
; SI-MOVREL-NEXT:    s_or_b32 s10, s10, s42
; SI-MOVREL-NEXT:    s_or_b32 s9, s9, s41
; SI-MOVREL-NEXT:    s_or_b32 s8, s8, s40
; SI-MOVREL-NEXT:    s_or_b32 s7, s7, s39
; SI-MOVREL-NEXT:    s_or_b32 s6, s6, s38
; SI-MOVREL-NEXT:    s_or_b32 s5, s5, s37
; SI-MOVREL-NEXT:    v_mov_b32_e32 v0, s4
; SI-MOVREL-NEXT:    s_add_i32 m0, s20, 0xfffffe00
; SI-MOVREL-NEXT:    v_mov_b32_e32 v1, s5
; SI-MOVREL-NEXT:    v_mov_b32_e32 v2, s6
; SI-MOVREL-NEXT:    v_mov_b32_e32 v3, s7
; SI-MOVREL-NEXT:    v_mov_b32_e32 v4, s8
; SI-MOVREL-NEXT:    v_mov_b32_e32 v5, s9
; SI-MOVREL-NEXT:    v_mov_b32_e32 v6, s10
; SI-MOVREL-NEXT:    v_mov_b32_e32 v7, s11
; SI-MOVREL-NEXT:    v_mov_b32_e32 v8, s12
; SI-MOVREL-NEXT:    v_mov_b32_e32 v9, s13
; SI-MOVREL-NEXT:    v_mov_b32_e32 v10, s14
; SI-MOVREL-NEXT:    v_mov_b32_e32 v11, s15
; SI-MOVREL-NEXT:    v_mov_b32_e32 v12, s16
; SI-MOVREL-NEXT:    v_mov_b32_e32 v13, s17
; SI-MOVREL-NEXT:    v_mov_b32_e32 v14, s18
; SI-MOVREL-NEXT:    v_mov_b32_e32 v15, s19
; SI-MOVREL-NEXT:    v_movrels_b32_e32 v0, v0
; SI-MOVREL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-MOVREL-NEXT:    s_endpgm
;
; VI-MOVREL-LABEL: extract_neg_offset_sgpr_loaded:
; VI-MOVREL:       ; %bb.0: ; %entry
; VI-MOVREL-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x64
; VI-MOVREL-NEXT:    s_load_dwordx16 s[36:51], s[2:3], 0xa4
; VI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-MOVREL-NEXT:    s_load_dword s2, s[2:3], 0xe4
; VI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; VI-MOVREL-NEXT:    s_or_b32 s6, s6, s38
; VI-MOVREL-NEXT:    s_or_b32 s5, s5, s37
; VI-MOVREL-NEXT:    s_or_b32 s4, s4, s36
; VI-MOVREL-NEXT:    s_or_b32 s3, s19, s51
; VI-MOVREL-NEXT:    s_or_b32 s18, s18, s50
; VI-MOVREL-NEXT:    s_or_b32 s17, s17, s49
; VI-MOVREL-NEXT:    s_or_b32 s16, s16, s48
; VI-MOVREL-NEXT:    s_or_b32 s15, s15, s47
; VI-MOVREL-NEXT:    s_or_b32 s14, s14, s46
; VI-MOVREL-NEXT:    s_or_b32 s13, s13, s45
; VI-MOVREL-NEXT:    s_or_b32 s12, s12, s44
; VI-MOVREL-NEXT:    s_or_b32 s11, s11, s43
; VI-MOVREL-NEXT:    s_or_b32 s10, s10, s42
; VI-MOVREL-NEXT:    s_or_b32 s9, s9, s41
; VI-MOVREL-NEXT:    s_or_b32 s8, s8, s40
; VI-MOVREL-NEXT:    s_or_b32 s7, s7, s39
; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, s4
; VI-MOVREL-NEXT:    v_mov_b32_e32 v1, s5
; VI-MOVREL-NEXT:    v_mov_b32_e32 v2, s6
; VI-MOVREL-NEXT:    s_add_i32 m0, s2, 0xfffffe00
; VI-MOVREL-NEXT:    v_mov_b32_e32 v3, s7
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, s8
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, s9
; VI-MOVREL-NEXT:    v_mov_b32_e32 v6, s10
; VI-MOVREL-NEXT:    v_mov_b32_e32 v7, s11
; VI-MOVREL-NEXT:    v_mov_b32_e32 v8, s12
; VI-MOVREL-NEXT:    v_mov_b32_e32 v9, s13
; VI-MOVREL-NEXT:    v_mov_b32_e32 v10, s14
; VI-MOVREL-NEXT:    v_mov_b32_e32 v11, s15
; VI-MOVREL-NEXT:    v_mov_b32_e32 v12, s16
; VI-MOVREL-NEXT:    v_mov_b32_e32 v13, s17
; VI-MOVREL-NEXT:    v_mov_b32_e32 v14, s18
; VI-MOVREL-NEXT:    v_mov_b32_e32 v15, s3
; VI-MOVREL-NEXT:    v_movrels_b32_e32 v2, v0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, s0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v1, s1
; VI-MOVREL-NEXT:    flat_store_dword v[0:1], v2
; VI-MOVREL-NEXT:    s_endpgm
;
; VI-IDXMODE-LABEL: extract_neg_offset_sgpr_loaded:
; VI-IDXMODE:       ; %bb.0: ; %entry
; VI-IDXMODE-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x64
; VI-IDXMODE-NEXT:    s_load_dwordx16 s[36:51], s[2:3], 0xa4
; VI-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-IDXMODE-NEXT:    s_load_dword s2, s[2:3], 0xe4
; VI-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; VI-IDXMODE-NEXT:    s_or_b32 s6, s6, s38
; VI-IDXMODE-NEXT:    s_or_b32 s5, s5, s37
; VI-IDXMODE-NEXT:    s_or_b32 s4, s4, s36
; VI-IDXMODE-NEXT:    s_or_b32 s3, s19, s51
; VI-IDXMODE-NEXT:    s_or_b32 s18, s18, s50
; VI-IDXMODE-NEXT:    s_or_b32 s17, s17, s49
; VI-IDXMODE-NEXT:    s_or_b32 s16, s16, s48
; VI-IDXMODE-NEXT:    s_or_b32 s15, s15, s47
; VI-IDXMODE-NEXT:    s_or_b32 s14, s14, s46
; VI-IDXMODE-NEXT:    s_or_b32 s13, s13, s45
; VI-IDXMODE-NEXT:    s_or_b32 s12, s12, s44
; VI-IDXMODE-NEXT:    s_or_b32 s11, s11, s43
; VI-IDXMODE-NEXT:    s_or_b32 s10, s10, s42
; VI-IDXMODE-NEXT:    s_or_b32 s9, s9, s41
; VI-IDXMODE-NEXT:    s_or_b32 s8, s8, s40
; VI-IDXMODE-NEXT:    s_or_b32 s7, s7, s39
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, s4
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, s5
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v2, s6
; VI-IDXMODE-NEXT:    s_addk_i32 s2, 0xfe00
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v3, s7
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, s8
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, s9
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v6, s10
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v7, s11
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v8, s12
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v9, s13
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v10, s14
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v11, s15
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v12, s16
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v13, s17
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v14, s18
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v15, s3
; VI-IDXMODE-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v2, v0
; VI-IDXMODE-NEXT:    s_set_gpr_idx_off
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, s0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, s1
; VI-IDXMODE-NEXT:    flat_store_dword v[0:1], v2
; VI-IDXMODE-NEXT:    s_endpgm
;
; GFX9-IDXMODE-LABEL: extract_neg_offset_sgpr_loaded:
; GFX9-IDXMODE:       ; %bb.0: ; %entry
; GFX9-IDXMODE-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x64
; GFX9-IDXMODE-NEXT:    s_load_dwordx16 s[36:51], s[2:3], 0xa4
; GFX9-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-IDXMODE-NEXT:    s_load_dword s20, s[2:3], 0xe4
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v16, 0
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    s_or_b32 s4, s4, s36
; GFX9-IDXMODE-NEXT:    s_or_b32 s2, s19, s51
; GFX9-IDXMODE-NEXT:    s_or_b32 s3, s18, s50
; GFX9-IDXMODE-NEXT:    s_or_b32 s17, s17, s49
; GFX9-IDXMODE-NEXT:    s_or_b32 s16, s16, s48
; GFX9-IDXMODE-NEXT:    s_or_b32 s15, s15, s47
; GFX9-IDXMODE-NEXT:    s_or_b32 s14, s14, s46
; GFX9-IDXMODE-NEXT:    s_or_b32 s13, s13, s45
; GFX9-IDXMODE-NEXT:    s_or_b32 s12, s12, s44
; GFX9-IDXMODE-NEXT:    s_or_b32 s11, s11, s43
; GFX9-IDXMODE-NEXT:    s_or_b32 s10, s10, s42
; GFX9-IDXMODE-NEXT:    s_or_b32 s9, s9, s41
; GFX9-IDXMODE-NEXT:    s_or_b32 s8, s8, s40
; GFX9-IDXMODE-NEXT:    s_or_b32 s7, s7, s39
; GFX9-IDXMODE-NEXT:    s_or_b32 s6, s6, s38
; GFX9-IDXMODE-NEXT:    s_or_b32 s5, s5, s37
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-IDXMODE-NEXT:    s_addk_i32 s20, 0xfe00
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v2, s6
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v3, s7
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v4, s8
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v5, s9
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v6, s10
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v7, s11
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v8, s12
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v9, s13
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v10, s14
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v11, s15
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v12, s16
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v13, s17
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v14, s3
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v15, s2
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_on s20, gpr_idx(SRC0)
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, v0
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_off
; GFX9-IDXMODE-NEXT:    global_store_dword v16, v0, s[0:1]
; GFX9-IDXMODE-NEXT:    s_endpgm
entry:
  ; -512 shows up in the checked add instructions as the immediate 0xfffffe00
  ; (32-bit form) or 0xfe00 (16-bit form used by s_addk_i32).
  %index = add i32 %offset, -512
  ; The or makes the extracted vector depend on both loaded arguments.
  %or = or <16 x i32> %vec0, %vec1
  %value = extractelement <16 x i32> %or, i32 %index
  store i32 %value, ptr addrspace(1) %out
  ret void
}

; Extracts one element from a constant <16 x i32> vector using a per-workitem
; index: llvm.amdgcn.workitem.id.x() plus -512. The index is held in a VGPR
; (v0 in the expected output), so it is not known to be uniform across lanes.
;
; Expected code, as recorded by the autogenerated assertions below:
;  - The optimized runs lower the extract to a v_cmp / v_cndmask select chain
;    over the constant elements.
;  - The -O0 run instead loops: it takes one lane's index with
;    v_readfirstlane, restricts exec to the lanes that share that index
;    (v_cmp_eq + s_and_saveexec), sets m0 to the index minus 512, reads the
;    element with v_movrels, and repeats until no lanes remain
;    (s_xor exec / s_cbranch_execnz).
define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) {
; GENERIC-LABEL: extract_neg_offset_vgpr:
; GENERIC:       ; %bb.0: ; %entry
; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
; GENERIC-NEXT:    s_mov_b32 s2, -1
; GENERIC-NEXT:    v_add_i32_e32 v0, vcc, 0xfffffe00, v0
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
; GENERIC-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 2, v0
; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 2, v1, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 3, v0
; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 3, v1, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 4, v0
; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 5, v0
; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 6, v1, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 6, v0
; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 7, v1, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 7, v0
; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 8, v1, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 8, v0
; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 9, v1, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 9, v0
; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 10, v1, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 10, v0
; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 11, v1, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 11, v0
; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 12, v1, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 12, v0
; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 13, v1, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 13, v0
; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 14, v1, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 14, v0
; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 15, v1, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 15, v0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, 16, v1, vcc
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GENERIC-NEXT:    s_endpgm
;
; NOOPT-LABEL: extract_neg_offset_vgpr:
; NOOPT:       ; %bb.0: ; %entry
; NOOPT-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; NOOPT-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; NOOPT-NEXT:    s_mov_b32 s22, -1
; NOOPT-NEXT:    s_mov_b32 s23, 0xe8f000
; NOOPT-NEXT:    s_add_u32 s20, s20, s9
; NOOPT-NEXT:    s_addc_u32 s21, s21, 0
; NOOPT-NEXT:    buffer_store_dword v0, off, s[20:23], 0 offset:68 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; NOOPT-NEXT:    s_mov_b32 s6, s1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s4, 0xf000
; NOOPT-NEXT:    s_mov_b32 s5, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s6
; NOOPT-NEXT:    s_mov_b32 s2, s5
; NOOPT-NEXT:    s_mov_b32 s3, s4
; NOOPT-NEXT:    ; implicit-def: $vgpr31 : SGPR spill to VGPR lane
; NOOPT-NEXT:    v_writelane_b32 v31, s0, 0
; NOOPT-NEXT:    v_writelane_b32 v31, s1, 1
; NOOPT-NEXT:    v_writelane_b32 v31, s2, 2
; NOOPT-NEXT:    v_writelane_b32 v31, s3, 3
; NOOPT-NEXT:    s_mov_b32 s0, 16
; NOOPT-NEXT:    s_mov_b32 s1, 15
; NOOPT-NEXT:    s_mov_b32 s2, 14
; NOOPT-NEXT:    s_mov_b32 s3, 13
; NOOPT-NEXT:    s_mov_b32 s4, 12
; NOOPT-NEXT:    s_mov_b32 s5, 11
; NOOPT-NEXT:    s_mov_b32 s6, 10
; NOOPT-NEXT:    s_mov_b32 s7, 9
; NOOPT-NEXT:    s_mov_b32 s8, 8
; NOOPT-NEXT:    s_mov_b32 s9, 7
; NOOPT-NEXT:    s_mov_b32 s10, 6
; NOOPT-NEXT:    s_mov_b32 s11, 5
; NOOPT-NEXT:    s_mov_b32 s12, 3
; NOOPT-NEXT:    s_mov_b32 s13, 2
; NOOPT-NEXT:    s_mov_b32 s14, 1
; NOOPT-NEXT:    s_mov_b32 s15, 0
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, s15
; NOOPT-NEXT:    v_mov_b32_e32 v30, s14
; NOOPT-NEXT:    v_mov_b32_e32 v29, s13
; NOOPT-NEXT:    v_mov_b32_e32 v28, s12
; NOOPT-NEXT:    v_mov_b32_e32 v27, s11
; NOOPT-NEXT:    v_mov_b32_e32 v26, s10
; NOOPT-NEXT:    v_mov_b32_e32 v25, s9
; NOOPT-NEXT:    v_mov_b32_e32 v24, s8
; NOOPT-NEXT:    v_mov_b32_e32 v23, s7
; NOOPT-NEXT:    v_mov_b32_e32 v22, s6
; NOOPT-NEXT:    v_mov_b32_e32 v21, s5
; NOOPT-NEXT:    v_mov_b32_e32 v20, s4
; NOOPT-NEXT:    v_mov_b32_e32 v19, s3
; NOOPT-NEXT:    v_mov_b32_e32 v18, s2
; NOOPT-NEXT:    v_mov_b32_e32 v17, s1
; NOOPT-NEXT:    v_mov_b32_e32 v16, s0
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v30
; NOOPT-NEXT:    v_mov_b32_e32 v2, v29
; NOOPT-NEXT:    v_mov_b32_e32 v3, v28
; NOOPT-NEXT:    v_mov_b32_e32 v4, v27
; NOOPT-NEXT:    v_mov_b32_e32 v5, v26
; NOOPT-NEXT:    v_mov_b32_e32 v6, v25
; NOOPT-NEXT:    v_mov_b32_e32 v7, v24
; NOOPT-NEXT:    v_mov_b32_e32 v8, v23
; NOOPT-NEXT:    v_mov_b32_e32 v9, v22
; NOOPT-NEXT:    v_mov_b32_e32 v10, v21
; NOOPT-NEXT:    v_mov_b32_e32 v11, v20
; NOOPT-NEXT:    v_mov_b32_e32 v12, v19
; NOOPT-NEXT:    v_mov_b32_e32 v13, v18
; NOOPT-NEXT:    v_mov_b32_e32 v14, v17
; NOOPT-NEXT:    v_mov_b32_e32 v15, v16
; NOOPT-NEXT:    buffer_store_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v3, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v4, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v5, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v6, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v7, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v8, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v9, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v10, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v11, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v12, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v13, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v14, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 s[0:1], exec
; NOOPT-NEXT:    v_writelane_b32 v31, s0, 4
; NOOPT-NEXT:    v_writelane_b32 v31, s1, 5
; NOOPT-NEXT:    s_or_saveexec_b64 s[16:17], -1
; NOOPT-NEXT:    buffer_store_dword v31, off, s[20:23], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[16:17]
; NOOPT-NEXT:    ; implicit-def: $vgpr0
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
; NOOPT-NEXT:    s_or_saveexec_b64 s[16:17], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[16:17]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v31, 6
; NOOPT-NEXT:    v_readlane_b32 s1, v31, 7
; NOOPT-NEXT:    buffer_load_dword v17, off, s[20:23], 0 offset:72 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v3, off, s[20:23], 0 offset:16 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v4, off, s[20:23], 0 offset:20 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v5, off, s[20:23], 0 offset:24 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v6, off, s[20:23], 0 offset:28 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v7, off, s[20:23], 0 offset:32 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v8, off, s[20:23], 0 offset:36 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v9, off, s[20:23], 0 offset:40 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v10, off, s[20:23], 0 offset:44 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v11, off, s[20:23], 0 offset:48 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v12, off, s[20:23], 0 offset:52 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v13, off, s[20:23], 0 offset:56 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v14, off, s[20:23], 0 offset:60 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v16, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readfirstlane_b32 s2, v16
; NOOPT-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v16
; NOOPT-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
; NOOPT-NEXT:    s_add_i32 m0, s2, 0xfffffe00
; NOOPT-NEXT:    v_movrels_b32_e32 v0, v0
; NOOPT-NEXT:    buffer_store_dword v0, off, s[20:23], 0 offset:76 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v0, off, s[20:23], 0 offset:72 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 s[2:3], s[0:1]
; NOOPT-NEXT:    v_writelane_b32 v31, s2, 6
; NOOPT-NEXT:    v_writelane_b32 v31, s3, 7
; NOOPT-NEXT:    s_or_saveexec_b64 s[16:17], -1
; NOOPT-NEXT:    buffer_store_dword v31, off, s[20:23], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[16:17]
; NOOPT-NEXT:    s_xor_b64 exec, exec, s[0:1]
; NOOPT-NEXT:    s_cbranch_execnz .LBB5_1
; NOOPT-NEXT:  ; %bb.2:
; NOOPT-NEXT:    s_or_saveexec_b64 s[16:17], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[16:17]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v31, 4
; NOOPT-NEXT:    v_readlane_b32 s1, v31, 5
; NOOPT-NEXT:    s_mov_b64 exec, s[0:1]
; NOOPT-NEXT:  ; %bb.3:
; NOOPT-NEXT:    s_or_saveexec_b64 s[16:17], -1
; NOOPT-NEXT:    buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[16:17]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v31, 0
; NOOPT-NEXT:    v_readlane_b32 s1, v31, 1
; NOOPT-NEXT:    v_readlane_b32 s2, v31, 2
; NOOPT-NEXT:    v_readlane_b32 s3, v31, 3
; NOOPT-NEXT:    buffer_load_dword v0, off, s[20:23], 0 offset:76 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: extract_neg_offset_vgpr:
; SI-MOVREL:       ; %bb.0: ; %entry
; SI-MOVREL-NEXT:    v_add_i32_e32 v0, vcc, 0xfffffe00, v0
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 2, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v1, 2, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 3, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v1, 3, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 4, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 5, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v1, 6, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 6, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v1, 7, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 7, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v1, 8, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 8, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v1, 9, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 9, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v1, 10, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 10, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v1, 11, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 11, v0
; SI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v1, 12, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 12, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v1, 13, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 13, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v1, 14, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 14, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v1, 15, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 15, v0
; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v0, 16, v1, vcc
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-MOVREL-NEXT:    s_endpgm
;
; VI-LABEL: extract_neg_offset_vgpr:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    v_add_u32_e32 v0, vcc, 0xfffffe00, v0
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
; VI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 2, v0
; VI-NEXT:    v_cndmask_b32_e32 v1, 2, v1, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 3, v0
; VI-NEXT:    v_cndmask_b32_e32 v1, 3, v1, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 4, v0
; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 5, v0
; VI-NEXT:    v_cndmask_b32_e32 v1, 6, v1, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 6, v0
; VI-NEXT:    v_cndmask_b32_e32 v1, 7, v1, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 7, v0
; VI-NEXT:    v_cndmask_b32_e32 v1, 8, v1, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 8, v0
; VI-NEXT:    v_cndmask_b32_e32 v1, 9, v1, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 9, v0
; VI-NEXT:    v_cndmask_b32_e32 v1, 10, v1, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 10, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT:    v_cndmask_b32_e32 v1, 11, v1, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 11, v0
; VI-NEXT:    v_cndmask_b32_e32 v1, 12, v1, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 12, v0
; VI-NEXT:    v_cndmask_b32_e32 v1, 13, v1, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 13, v0
; VI-NEXT:    v_cndmask_b32_e32 v1, 14, v1, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 14, v0
; VI-NEXT:    v_cndmask_b32_e32 v1, 15, v1, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 15, v0
; VI-NEXT:    v_cndmask_b32_e32 v2, 16, v1, vcc
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-IDXMODE-LABEL: extract_neg_offset_vgpr:
; GFX9-IDXMODE:       ; %bb.0: ; %entry
; GFX9-IDXMODE-NEXT:    v_add_u32_e32 v0, 0xfffffe00, v0
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 2, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v2, 2, v2, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 3, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v2, 3, v2, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 4, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 5, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v2, 6, v2, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 6, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v2, 7, v2, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 7, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v2, 8, v2, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 8, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v2, 9, v2, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 9, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v2, 10, v2, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 10, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v2, 11, v2, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 11, v0
; GFX9-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v2, 12, v2, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 12, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v2, 13, v2, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 13, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v2, 14, v2, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 14, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v2, 15, v2, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 15, v0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v0, 16, v2, vcc
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX9-IDXMODE-NEXT:    s_endpgm
entry:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  ; Bias the workitem id by -512 to form the element index.
  %index = add i32 %id, -512
  ; The constant vector holds 0..3 followed by 5..16 (the value 4 is skipped);
  ; the immediates in the expected v_cndmask chains follow the same sequence.
  %value = extractelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %index
  store i32 %value, ptr addrspace(1) %out
  ret void
}

; Extracting with an undef index is undefined behavior, but it must not crash
; the compiler. The vector comes from a volatile load, and the expected output
; for every run line contains that load (marked glc) followed directly by
; s_endpgm; no store of the extracted value is expected.
define amdgpu_kernel void @extract_undef_offset_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GENERIC-LABEL: extract_undef_offset_sgpr:
; GENERIC:       ; %bb.0: ; %entry
; GENERIC-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x9
; GENERIC-NEXT:    s_mov_b32 s7, 0xf000
; GENERIC-NEXT:    s_mov_b32 s6, -1
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    s_mov_b32 s4, s2
; GENERIC-NEXT:    s_mov_b32 s5, s3
; GENERIC-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0 glc
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    s_endpgm
;
; NOOPT-LABEL: extract_undef_offset_sgpr:
; NOOPT:       ; %bb.0: ; %entry
; NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
; NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; NOOPT-NEXT:    s_mov_b32 s6, s1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s4, 0xf000
; NOOPT-NEXT:    s_mov_b32 s5, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s6
; NOOPT-NEXT:    s_mov_b32 s2, s5
; NOOPT-NEXT:    s_mov_b32 s3, s4
; NOOPT-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: extract_undef_offset_sgpr:
; SI-MOVREL:       ; %bb.0: ; %entry
; SI-MOVREL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-MOVREL-NEXT:    s_mov_b32 s7, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s6, -1
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    s_mov_b32 s4, s2
; SI-MOVREL-NEXT:    s_mov_b32 s5, s3
; SI-MOVREL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0 glc
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    s_endpgm
;
; VI-LABEL: extract_undef_offset_sgpr:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
;
; GFX9-IDXMODE-LABEL: extract_undef_offset_sgpr:
; GFX9-IDXMODE:       ; %bb.0: ; %entry
; GFX9-IDXMODE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7] glc
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    s_endpgm
entry:
  ; Volatile, so the load itself still appears in the expected output.
  %ld = load volatile <4 x i32>, ptr addrspace(1) %in
  %value = extractelement <4 x i32> %ld, i32 undef
  store i32 %value, ptr addrspace(1) %out
  ret void
}

; Inserting at an undef index is undefined behavior, but it must not crash the
; compiler. Here the source vector comes from an ordinary (non-volatile) load,
; and every run line expects the kernel body to be just s_endpgm, i.e. neither
; the load nor the store appears in the expected output.
define amdgpu_kernel void @insert_undef_offset_sgpr_vector_src(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GENERIC-LABEL: insert_undef_offset_sgpr_vector_src:
; GENERIC:       ; %bb.0: ; %entry
; GENERIC-NEXT:    s_endpgm
;
; NOOPT-LABEL: insert_undef_offset_sgpr_vector_src:
; NOOPT:       ; %bb.0: ; %entry
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: insert_undef_offset_sgpr_vector_src:
; SI-MOVREL:       ; %bb.0: ; %entry
; SI-MOVREL-NEXT:    s_endpgm
;
; VI-LABEL: insert_undef_offset_sgpr_vector_src:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_endpgm
;
; GFX9-IDXMODE-LABEL: insert_undef_offset_sgpr_vector_src:
; GFX9-IDXMODE:       ; %bb.0: ; %entry
; GFX9-IDXMODE-NEXT:    s_endpgm
entry:
  %ld = load <4 x i32>, ptr addrspace(1) %in
  ; Insert the constant 5 at an undef element index.
  %value = insertelement <4 x i32> %ld, i32 5, i32 undef
  store <4 x i32> %value, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) {
; GENERIC-LABEL: insert_w_offset:
; GENERIC:       ; %bb.0: ; %entry
; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GENERIC-NEXT:    s_load_dword s4, s[2:3], 0xb
; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
; GENERIC-NEXT:    s_mov_b32 s2, -1
; GENERIC-NEXT:    v_mov_b32_e32 v10, 0x41880000
; GENERIC-NEXT:    v_mov_b32_e32 v0, 0x40400000
; GENERIC-NEXT:    v_mov_b32_e32 v4, 0x41000000
; GENERIC-NEXT:    v_mov_b32_e32 v5, 0x40e00000
; GENERIC-NEXT:    v_mov_b32_e32 v8, 0x40c00000
; GENERIC-NEXT:    v_mov_b32_e32 v9, 0x40a00000
; GENERIC-NEXT:    v_mov_b32_e32 v11, 0x41400000
; GENERIC-NEXT:    v_mov_b32_e32 v12, 0x41300000
; GENERIC-NEXT:    v_mov_b32_e32 v13, 0x41200000
; GENERIC-NEXT:    v_mov_b32_e32 v14, 0x41100000
; GENERIC-NEXT:    v_mov_b32_e32 v15, 0x41800000
; GENERIC-NEXT:    v_mov_b32_e32 v16, 0x41700000
; GENERIC-NEXT:    v_mov_b32_e32 v17, 0x41600000
; GENERIC-NEXT:    v_mov_b32_e32 v18, 0x41500000
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    s_add_i32 s4, s4, 1
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 3
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 4.0, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 2
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v2, v0, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 1
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 2.0, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 0
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, 1.0, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 7
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v4, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 6
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v5, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 5
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 4
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v4, v9, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 11
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v8, v11, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 10
; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_waitcnt expcnt(0)
; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v12, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 9
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v13, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 8
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v5, v14, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 15
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v9, v15, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 14
; GENERIC-NEXT:    buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_waitcnt expcnt(0)
; GENERIC-NEXT:    v_cndmask_b32_e32 v8, v16, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 13
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v17, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 12
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v18, v10, vcc
; GENERIC-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
; GENERIC-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GENERIC-NEXT:    s_endpgm
;
; NOOPT-LABEL: insert_w_offset:
; NOOPT:       ; %bb.0: ; %entry
; NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; NOOPT-NEXT:    s_load_dword s4, s[2:3], 0xb
; NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; NOOPT-NEXT:    s_mov_b32 s7, s1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s5, 0xf000
; NOOPT-NEXT:    s_mov_b32 s6, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s7
; NOOPT-NEXT:    s_mov_b32 s2, s6
; NOOPT-NEXT:    s_mov_b32 s3, s5
; NOOPT-NEXT:    s_mov_b32 s5, 1
; NOOPT-NEXT:    s_add_i32 s4, s4, s5
; NOOPT-NEXT:    s_mov_b32 s5, 0x41800000
; NOOPT-NEXT:    s_mov_b32 s6, 0x41700000
; NOOPT-NEXT:    s_mov_b32 s7, 0x41600000
; NOOPT-NEXT:    s_mov_b32 s8, 0x41500000
; NOOPT-NEXT:    s_mov_b32 s9, 0x41400000
; NOOPT-NEXT:    s_mov_b32 s10, 0x41300000
; NOOPT-NEXT:    s_mov_b32 s11, 0x41200000
; NOOPT-NEXT:    s_mov_b32 s12, 0x41100000
; NOOPT-NEXT:    s_mov_b32 s13, 0x41000000
; NOOPT-NEXT:    s_mov_b32 s14, 0x40e00000
; NOOPT-NEXT:    s_mov_b32 s15, 0x40c00000
; NOOPT-NEXT:    s_mov_b32 s16, 0x40a00000
; NOOPT-NEXT:    s_mov_b32 s17, 4.0
; NOOPT-NEXT:    s_mov_b32 s18, 0x40400000
; NOOPT-NEXT:    s_mov_b32 s19, 2.0
; NOOPT-NEXT:    s_mov_b32 s20, 1.0
; NOOPT-NEXT:    v_mov_b32_e32 v7, s20
; NOOPT-NEXT:    v_mov_b32_e32 v30, s19
; NOOPT-NEXT:    v_mov_b32_e32 v29, s18
; NOOPT-NEXT:    v_mov_b32_e32 v28, s17
; NOOPT-NEXT:    v_mov_b32_e32 v27, s16
; NOOPT-NEXT:    v_mov_b32_e32 v26, s15
; NOOPT-NEXT:    v_mov_b32_e32 v25, s14
; NOOPT-NEXT:    v_mov_b32_e32 v24, s13
; NOOPT-NEXT:    v_mov_b32_e32 v23, s12
; NOOPT-NEXT:    v_mov_b32_e32 v6, s11
; NOOPT-NEXT:    v_mov_b32_e32 v5, s10
; NOOPT-NEXT:    v_mov_b32_e32 v4, s9
; NOOPT-NEXT:    v_mov_b32_e32 v3, s8
; NOOPT-NEXT:    v_mov_b32_e32 v2, s7
; NOOPT-NEXT:    v_mov_b32_e32 v1, s6
; NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; NOOPT-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v8, v30
; NOOPT-NEXT:    v_mov_b32_e32 v9, v29
; NOOPT-NEXT:    v_mov_b32_e32 v10, v28
; NOOPT-NEXT:    v_mov_b32_e32 v11, v27
; NOOPT-NEXT:    v_mov_b32_e32 v12, v26
; NOOPT-NEXT:    v_mov_b32_e32 v13, v25
; NOOPT-NEXT:    v_mov_b32_e32 v14, v24
; NOOPT-NEXT:    v_mov_b32_e32 v15, v23
; NOOPT-NEXT:    v_mov_b32_e32 v16, v6
; NOOPT-NEXT:    v_mov_b32_e32 v17, v5
; NOOPT-NEXT:    v_mov_b32_e32 v18, v4
; NOOPT-NEXT:    v_mov_b32_e32 v19, v3
; NOOPT-NEXT:    v_mov_b32_e32 v20, v2
; NOOPT-NEXT:    v_mov_b32_e32 v21, v1
; NOOPT-NEXT:    v_mov_b32_e32 v22, v0
; NOOPT-NEXT:    v_mov_b32_e32 v0, 0x41880000
; NOOPT-NEXT:    s_mov_b32 m0, s4
; NOOPT-NEXT:    v_movreld_b32_e32 v7, v0
; NOOPT-NEXT:    v_mov_b32_e32 v4, v22
; NOOPT-NEXT:    v_mov_b32_e32 v5, v21
; NOOPT-NEXT:    v_mov_b32_e32 v6, v20
; NOOPT-NEXT:    v_mov_b32_e32 v0, v19
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; NOOPT-NEXT:    v_mov_b32_e32 v4, v18
; NOOPT-NEXT:    v_mov_b32_e32 v5, v17
; NOOPT-NEXT:    v_mov_b32_e32 v6, v16
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, v15
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; NOOPT-NEXT:    v_mov_b32_e32 v4, v14
; NOOPT-NEXT:    v_mov_b32_e32 v5, v13
; NOOPT-NEXT:    v_mov_b32_e32 v6, v12
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, v11
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; NOOPT-NEXT:    v_mov_b32_e32 v4, v10
; NOOPT-NEXT:    v_mov_b32_e32 v5, v9
; NOOPT-NEXT:    v_mov_b32_e32 v6, v8
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, v7
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: insert_w_offset:
; SI-MOVREL:       ; %bb.0: ; %entry
; SI-MOVREL-NEXT:    s_load_dword s4, s[2:3], 0xb
; SI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-MOVREL-NEXT:    v_mov_b32_e32 v0, 1.0
; SI-MOVREL-NEXT:    v_mov_b32_e32 v1, 2.0
; SI-MOVREL-NEXT:    v_mov_b32_e32 v2, 0x40400000
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    s_add_i32 s4, s4, 1
; SI-MOVREL-NEXT:    v_mov_b32_e32 v3, 4.0
; SI-MOVREL-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v7, 0x41000000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v8, 0x41100000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v9, 0x41200000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v10, 0x41300000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v11, 0x41400000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v12, 0x41500000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v13, 0x41600000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v14, 0x41700000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v15, 0x41800000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v16, 0x41880000
; SI-MOVREL-NEXT:    s_mov_b32 m0, s4
; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
; SI-MOVREL-NEXT:    v_movreld_b32_e32 v0, v16
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-MOVREL-NEXT:    s_endpgm
;
; VI-MOVREL-LABEL: insert_w_offset:
; VI-MOVREL:       ; %bb.0: ; %entry
; VI-MOVREL-NEXT:    s_load_dword s4, s[2:3], 0x2c
; VI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, 1.0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v1, 2.0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v2, 0x40400000
; VI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; VI-MOVREL-NEXT:    s_add_i32 s4, s4, 1
; VI-MOVREL-NEXT:    s_add_u32 s2, s0, 48
; VI-MOVREL-NEXT:    v_mov_b32_e32 v3, 4.0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v7, 0x41000000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v8, 0x41100000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v9, 0x41200000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v10, 0x41300000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v11, 0x41400000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v12, 0x41500000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v13, 0x41600000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v14, 0x41700000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v15, 0x41800000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v16, 0x41880000
; VI-MOVREL-NEXT:    s_mov_b32 m0, s4
; VI-MOVREL-NEXT:    s_addc_u32 s3, s1, 0
; VI-MOVREL-NEXT:    v_movreld_b32_e32 v0, v16
; VI-MOVREL-NEXT:    v_mov_b32_e32 v17, s3
; VI-MOVREL-NEXT:    v_mov_b32_e32 v16, s2
; VI-MOVREL-NEXT:    s_add_u32 s2, s0, 32
; VI-MOVREL-NEXT:    s_addc_u32 s3, s1, 0
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
; VI-MOVREL-NEXT:    s_nop 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v13, s3
; VI-MOVREL-NEXT:    v_mov_b32_e32 v12, s2
; VI-MOVREL-NEXT:    s_add_u32 s2, s0, 16
; VI-MOVREL-NEXT:    s_addc_u32 s3, s1, 0
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
; VI-MOVREL-NEXT:    s_nop 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v9, s3
; VI-MOVREL-NEXT:    v_mov_b32_e32 v8, s2
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; VI-MOVREL-NEXT:    s_nop 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, s1
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, s0
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-MOVREL-NEXT:    s_endpgm
;
; VI-IDXMODE-LABEL: insert_w_offset:
; VI-IDXMODE:       ; %bb.0: ; %entry
; VI-IDXMODE-NEXT:    s_load_dword s4, s[2:3], 0x2c
; VI-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, 1.0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, 2.0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v2, 0x40400000
; VI-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; VI-IDXMODE-NEXT:    s_add_i32 s4, s4, 1
; VI-IDXMODE-NEXT:    s_add_u32 s2, s0, 48
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v3, 4.0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v7, 0x41000000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v8, 0x41100000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v9, 0x41200000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v10, 0x41300000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v11, 0x41400000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v12, 0x41500000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v13, 0x41600000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v14, 0x41700000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v15, 0x41800000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v16, 0x41880000
; VI-IDXMODE-NEXT:    s_addc_u32 s3, s1, 0
; VI-IDXMODE-NEXT:    s_set_gpr_idx_on s4, gpr_idx(DST)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, v16
; VI-IDXMODE-NEXT:    s_set_gpr_idx_off
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v17, s3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v16, s2
; VI-IDXMODE-NEXT:    s_add_u32 s2, s0, 32
; VI-IDXMODE-NEXT:    s_addc_u32 s3, s1, 0
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
; VI-IDXMODE-NEXT:    s_nop 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v13, s3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v12, s2
; VI-IDXMODE-NEXT:    s_add_u32 s2, s0, 16
; VI-IDXMODE-NEXT:    s_addc_u32 s3, s1, 0
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
; VI-IDXMODE-NEXT:    s_nop 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v9, s3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v8, s2
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; VI-IDXMODE-NEXT:    s_nop 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, s1
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, s0
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-IDXMODE-NEXT:    s_endpgm
;
; GFX9-IDXMODE-LABEL: insert_w_offset:
; GFX9-IDXMODE:       ; %bb.0: ; %entry
; GFX9-IDXMODE-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GFX9-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v16, 0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, 1.0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    s_add_i32 s4, s4, 1
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v2, 0x40400000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v3, 4.0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v7, 0x41000000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v8, 0x41100000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v9, 0x41200000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v10, 0x41300000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v11, 0x41400000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v12, 0x41500000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v13, 0x41600000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v14, 0x41700000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v15, 0x41800000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v17, 0x41880000
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_on s4, gpr_idx(DST)
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, v17
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_off
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
; GFX9-IDXMODE-NEXT:    s_endpgm
entry:
  %add = add i32 %in, 1
  %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
  store <16 x float> %ins, ptr addrspace(1) %out
  ret void
}

; Dynamic insertelement whose index is (zext i16 %in) + 1, i.e. an unsigned
; 16-bit base plus a constant offset of 1. The value 17.0 is inserted into the
; constant vector <1.0, ..., 16.0> and the result is stored to %out.
;
; In the assertions below, the default-CPU run keeps the +1 as an s_add_i32 and
; picks each lane with an s_cmp_eq_u32 / v_cndmask_b32 chain. The tahiti, tonga
; and gfx900 runs contain no add: only the 0xffff mask feeds m0 (or
; s_set_gpr_idx_on), and the indexed write targets the register one past the
; start of the vector (v1, or v8 in the -O0 run where the vector starts at v7).
; NOTE(review): presumably the +1 can be folded into the register operand
; because the zext makes the base known non-negative; the signed variant
; @insert_signed_base_plus_offset keeps its add. Confirm against the backend's
; indirect-index lowering.
define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %out, i16 %in) {
; GENERIC-LABEL: insert_unsigned_base_plus_offset:
; GENERIC:       ; %bb.0: ; %entry
; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GENERIC-NEXT:    s_load_dword s4, s[2:3], 0xb
; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
; GENERIC-NEXT:    s_mov_b32 s2, -1
; GENERIC-NEXT:    v_mov_b32_e32 v10, 0x41880000
; GENERIC-NEXT:    v_mov_b32_e32 v0, 0x40400000
; GENERIC-NEXT:    v_mov_b32_e32 v4, 0x41000000
; GENERIC-NEXT:    v_mov_b32_e32 v5, 0x40e00000
; GENERIC-NEXT:    v_mov_b32_e32 v8, 0x40c00000
; GENERIC-NEXT:    v_mov_b32_e32 v9, 0x40a00000
; GENERIC-NEXT:    v_mov_b32_e32 v11, 0x41400000
; GENERIC-NEXT:    v_mov_b32_e32 v12, 0x41300000
; GENERIC-NEXT:    v_mov_b32_e32 v13, 0x41200000
; GENERIC-NEXT:    v_mov_b32_e32 v14, 0x41100000
; GENERIC-NEXT:    v_mov_b32_e32 v15, 0x41800000
; GENERIC-NEXT:    v_mov_b32_e32 v16, 0x41700000
; GENERIC-NEXT:    v_mov_b32_e32 v17, 0x41600000
; GENERIC-NEXT:    v_mov_b32_e32 v18, 0x41500000
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    s_and_b32 s4, s4, 0xffff
; GENERIC-NEXT:    s_add_i32 s4, s4, 1
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 3
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 4.0, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 2
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v2, v0, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 1
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 2.0, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 0
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, 1.0, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 7
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v4, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 6
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v5, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 5
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 4
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v4, v9, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 11
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v8, v11, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 10
; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_waitcnt expcnt(0)
; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v12, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 9
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v13, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 8
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v5, v14, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 15
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v9, v15, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 14
; GENERIC-NEXT:    buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_waitcnt expcnt(0)
; GENERIC-NEXT:    v_cndmask_b32_e32 v8, v16, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 13
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v17, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 12
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v18, v10, vcc
; GENERIC-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
; GENERIC-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GENERIC-NEXT:    s_endpgm
;
; NOOPT-LABEL: insert_unsigned_base_plus_offset:
; NOOPT:       ; %bb.0: ; %entry
; NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; NOOPT-NEXT:    s_load_dword s4, s[2:3], 0xb
; NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; NOOPT-NEXT:    s_mov_b32 s7, s1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s5, 0xf000
; NOOPT-NEXT:    s_mov_b32 s6, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s7
; NOOPT-NEXT:    s_mov_b32 s2, s6
; NOOPT-NEXT:    s_mov_b32 s3, s5
; NOOPT-NEXT:    s_mov_b32 s5, 0xffff
; NOOPT-NEXT:    s_and_b32 s4, s4, s5
; NOOPT-NEXT:    s_mov_b32 s5, 0x41800000
; NOOPT-NEXT:    s_mov_b32 s6, 0x41700000
; NOOPT-NEXT:    s_mov_b32 s7, 0x41600000
; NOOPT-NEXT:    s_mov_b32 s8, 0x41500000
; NOOPT-NEXT:    s_mov_b32 s9, 0x41400000
; NOOPT-NEXT:    s_mov_b32 s10, 0x41300000
; NOOPT-NEXT:    s_mov_b32 s11, 0x41200000
; NOOPT-NEXT:    s_mov_b32 s12, 0x41100000
; NOOPT-NEXT:    s_mov_b32 s13, 0x41000000
; NOOPT-NEXT:    s_mov_b32 s14, 0x40e00000
; NOOPT-NEXT:    s_mov_b32 s15, 0x40c00000
; NOOPT-NEXT:    s_mov_b32 s16, 0x40a00000
; NOOPT-NEXT:    s_mov_b32 s17, 4.0
; NOOPT-NEXT:    s_mov_b32 s18, 0x40400000
; NOOPT-NEXT:    s_mov_b32 s19, 2.0
; NOOPT-NEXT:    s_mov_b32 s20, 1.0
; NOOPT-NEXT:    v_mov_b32_e32 v7, s20
; NOOPT-NEXT:    v_mov_b32_e32 v30, s19
; NOOPT-NEXT:    v_mov_b32_e32 v29, s18
; NOOPT-NEXT:    v_mov_b32_e32 v28, s17
; NOOPT-NEXT:    v_mov_b32_e32 v27, s16
; NOOPT-NEXT:    v_mov_b32_e32 v26, s15
; NOOPT-NEXT:    v_mov_b32_e32 v25, s14
; NOOPT-NEXT:    v_mov_b32_e32 v24, s13
; NOOPT-NEXT:    v_mov_b32_e32 v23, s12
; NOOPT-NEXT:    v_mov_b32_e32 v6, s11
; NOOPT-NEXT:    v_mov_b32_e32 v5, s10
; NOOPT-NEXT:    v_mov_b32_e32 v4, s9
; NOOPT-NEXT:    v_mov_b32_e32 v3, s8
; NOOPT-NEXT:    v_mov_b32_e32 v2, s7
; NOOPT-NEXT:    v_mov_b32_e32 v1, s6
; NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; NOOPT-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v8, v30
; NOOPT-NEXT:    v_mov_b32_e32 v9, v29
; NOOPT-NEXT:    v_mov_b32_e32 v10, v28
; NOOPT-NEXT:    v_mov_b32_e32 v11, v27
; NOOPT-NEXT:    v_mov_b32_e32 v12, v26
; NOOPT-NEXT:    v_mov_b32_e32 v13, v25
; NOOPT-NEXT:    v_mov_b32_e32 v14, v24
; NOOPT-NEXT:    v_mov_b32_e32 v15, v23
; NOOPT-NEXT:    v_mov_b32_e32 v16, v6
; NOOPT-NEXT:    v_mov_b32_e32 v17, v5
; NOOPT-NEXT:    v_mov_b32_e32 v18, v4
; NOOPT-NEXT:    v_mov_b32_e32 v19, v3
; NOOPT-NEXT:    v_mov_b32_e32 v20, v2
; NOOPT-NEXT:    v_mov_b32_e32 v21, v1
; NOOPT-NEXT:    v_mov_b32_e32 v22, v0
; NOOPT-NEXT:    v_mov_b32_e32 v0, 0x41880000
; NOOPT-NEXT:    s_mov_b32 m0, s4
; NOOPT-NEXT:    v_movreld_b32_e32 v8, v0
; NOOPT-NEXT:    v_mov_b32_e32 v4, v22
; NOOPT-NEXT:    v_mov_b32_e32 v5, v21
; NOOPT-NEXT:    v_mov_b32_e32 v6, v20
; NOOPT-NEXT:    v_mov_b32_e32 v0, v19
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; NOOPT-NEXT:    v_mov_b32_e32 v4, v18
; NOOPT-NEXT:    v_mov_b32_e32 v5, v17
; NOOPT-NEXT:    v_mov_b32_e32 v6, v16
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, v15
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; NOOPT-NEXT:    v_mov_b32_e32 v4, v14
; NOOPT-NEXT:    v_mov_b32_e32 v5, v13
; NOOPT-NEXT:    v_mov_b32_e32 v6, v12
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, v11
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; NOOPT-NEXT:    v_mov_b32_e32 v4, v10
; NOOPT-NEXT:    v_mov_b32_e32 v5, v9
; NOOPT-NEXT:    v_mov_b32_e32 v6, v8
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, v7
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: insert_unsigned_base_plus_offset:
; SI-MOVREL:       ; %bb.0: ; %entry
; SI-MOVREL-NEXT:    s_load_dword s4, s[2:3], 0xb
; SI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-MOVREL-NEXT:    v_mov_b32_e32 v0, 1.0
; SI-MOVREL-NEXT:    v_mov_b32_e32 v1, 2.0
; SI-MOVREL-NEXT:    v_mov_b32_e32 v2, 0x40400000
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    s_and_b32 s4, s4, 0xffff
; SI-MOVREL-NEXT:    v_mov_b32_e32 v3, 4.0
; SI-MOVREL-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v7, 0x41000000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v8, 0x41100000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v9, 0x41200000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v10, 0x41300000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v11, 0x41400000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v12, 0x41500000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v13, 0x41600000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v14, 0x41700000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v15, 0x41800000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v16, 0x41880000
; SI-MOVREL-NEXT:    s_mov_b32 m0, s4
; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
; SI-MOVREL-NEXT:    v_movreld_b32_e32 v1, v16
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-MOVREL-NEXT:    s_endpgm
;
; VI-MOVREL-LABEL: insert_unsigned_base_plus_offset:
; VI-MOVREL:       ; %bb.0: ; %entry
; VI-MOVREL-NEXT:    s_load_dword s4, s[2:3], 0x2c
; VI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, 1.0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v1, 2.0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v2, 0x40400000
; VI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; VI-MOVREL-NEXT:    s_and_b32 s2, s4, 0xffff
; VI-MOVREL-NEXT:    s_mov_b32 m0, s2
; VI-MOVREL-NEXT:    s_add_u32 s2, s0, 48
; VI-MOVREL-NEXT:    v_mov_b32_e32 v3, 4.0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v7, 0x41000000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v8, 0x41100000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v9, 0x41200000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v10, 0x41300000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v11, 0x41400000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v12, 0x41500000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v13, 0x41600000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v14, 0x41700000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v15, 0x41800000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v16, 0x41880000
; VI-MOVREL-NEXT:    s_addc_u32 s3, s1, 0
; VI-MOVREL-NEXT:    v_movreld_b32_e32 v1, v16
; VI-MOVREL-NEXT:    v_mov_b32_e32 v17, s3
; VI-MOVREL-NEXT:    v_mov_b32_e32 v16, s2
; VI-MOVREL-NEXT:    s_add_u32 s2, s0, 32
; VI-MOVREL-NEXT:    s_addc_u32 s3, s1, 0
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
; VI-MOVREL-NEXT:    s_nop 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v13, s3
; VI-MOVREL-NEXT:    v_mov_b32_e32 v12, s2
; VI-MOVREL-NEXT:    s_add_u32 s2, s0, 16
; VI-MOVREL-NEXT:    s_addc_u32 s3, s1, 0
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
; VI-MOVREL-NEXT:    s_nop 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v9, s3
; VI-MOVREL-NEXT:    v_mov_b32_e32 v8, s2
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; VI-MOVREL-NEXT:    s_nop 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, s1
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, s0
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-MOVREL-NEXT:    s_endpgm
;
; VI-IDXMODE-LABEL: insert_unsigned_base_plus_offset:
; VI-IDXMODE:       ; %bb.0: ; %entry
; VI-IDXMODE-NEXT:    s_load_dword s4, s[2:3], 0x2c
; VI-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, 1.0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, 2.0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v2, 0x40400000
; VI-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; VI-IDXMODE-NEXT:    s_and_b32 s2, s4, 0xffff
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v3, 4.0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v7, 0x41000000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v8, 0x41100000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v9, 0x41200000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v10, 0x41300000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v11, 0x41400000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v12, 0x41500000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v13, 0x41600000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v14, 0x41700000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v15, 0x41800000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v16, 0x41880000
; VI-IDXMODE-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, v16
; VI-IDXMODE-NEXT:    s_set_gpr_idx_off
; VI-IDXMODE-NEXT:    s_add_u32 s2, s0, 48
; VI-IDXMODE-NEXT:    s_addc_u32 s3, s1, 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v17, s3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v16, s2
; VI-IDXMODE-NEXT:    s_add_u32 s2, s0, 32
; VI-IDXMODE-NEXT:    s_addc_u32 s3, s1, 0
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
; VI-IDXMODE-NEXT:    s_nop 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v13, s3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v12, s2
; VI-IDXMODE-NEXT:    s_add_u32 s2, s0, 16
; VI-IDXMODE-NEXT:    s_addc_u32 s3, s1, 0
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
; VI-IDXMODE-NEXT:    s_nop 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v9, s3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v8, s2
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; VI-IDXMODE-NEXT:    s_nop 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, s1
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, s0
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-IDXMODE-NEXT:    s_endpgm
;
; GFX9-IDXMODE-LABEL: insert_unsigned_base_plus_offset:
; GFX9-IDXMODE:       ; %bb.0: ; %entry
; GFX9-IDXMODE-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GFX9-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v16, 0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, 1.0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    s_and_b32 s2, s4, 0xffff
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v2, 0x40400000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v3, 4.0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v7, 0x41000000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v8, 0x41100000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v9, 0x41200000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v10, 0x41300000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v11, 0x41400000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v12, 0x41500000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v13, 0x41600000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v14, 0x41700000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v15, 0x41800000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v17, 0x41880000
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v1, v17
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_off
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
; GFX9-IDXMODE-NEXT:    s_endpgm
entry:
  ; Zero-extend the 16-bit kernel argument to form the base index.
  %base = zext i16 %in to i32
  ; Add the constant offset; %add is the dynamic element index used below.
  %add = add i32 %base, 1
  %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
  store <16 x float> %ins, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, i16 %in) {
; GENERIC-LABEL: insert_signed_base_plus_offset:
; GENERIC:       ; %bb.0: ; %entry
; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GENERIC-NEXT:    s_load_dword s4, s[2:3], 0xb
; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
; GENERIC-NEXT:    s_mov_b32 s2, -1
; GENERIC-NEXT:    v_mov_b32_e32 v10, 0x41880000
; GENERIC-NEXT:    v_mov_b32_e32 v0, 0x40400000
; GENERIC-NEXT:    v_mov_b32_e32 v4, 0x41000000
; GENERIC-NEXT:    v_mov_b32_e32 v5, 0x40e00000
; GENERIC-NEXT:    v_mov_b32_e32 v8, 0x40c00000
; GENERIC-NEXT:    v_mov_b32_e32 v9, 0x40a00000
; GENERIC-NEXT:    v_mov_b32_e32 v11, 0x41400000
; GENERIC-NEXT:    v_mov_b32_e32 v12, 0x41300000
; GENERIC-NEXT:    v_mov_b32_e32 v13, 0x41200000
; GENERIC-NEXT:    v_mov_b32_e32 v14, 0x41100000
; GENERIC-NEXT:    v_mov_b32_e32 v15, 0x41800000
; GENERIC-NEXT:    v_mov_b32_e32 v16, 0x41700000
; GENERIC-NEXT:    v_mov_b32_e32 v17, 0x41600000
; GENERIC-NEXT:    v_mov_b32_e32 v18, 0x41500000
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    s_sext_i32_i16 s4, s4
; GENERIC-NEXT:    s_add_i32 s4, s4, 1
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 3
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 4.0, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 2
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v2, v0, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 1
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 2.0, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 0
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, 1.0, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 7
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v4, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 6
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v5, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 5
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 4
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v4, v9, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 11
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v8, v11, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 10
; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_waitcnt expcnt(0)
; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v12, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 9
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v13, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 8
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v5, v14, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 15
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v9, v15, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 14
; GENERIC-NEXT:    buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_waitcnt expcnt(0)
; GENERIC-NEXT:    v_cndmask_b32_e32 v8, v16, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 13
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v17, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 12
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v18, v10, vcc
; GENERIC-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
; GENERIC-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GENERIC-NEXT:    s_endpgm
;
; NOOPT-LABEL: insert_signed_base_plus_offset:
; NOOPT:       ; %bb.0: ; %entry
; NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; NOOPT-NEXT:    s_load_dword s4, s[2:3], 0xb
; NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; NOOPT-NEXT:    s_mov_b32 s7, s1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s5, 0xf000
; NOOPT-NEXT:    s_mov_b32 s6, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s7
; NOOPT-NEXT:    s_mov_b32 s2, s6
; NOOPT-NEXT:    s_mov_b32 s3, s5
; NOOPT-NEXT:    s_sext_i32_i16 s4, s4
; NOOPT-NEXT:    s_mov_b32 s5, 1
; NOOPT-NEXT:    s_add_i32 s4, s4, s5
; NOOPT-NEXT:    s_mov_b32 s5, 0x41800000
; NOOPT-NEXT:    s_mov_b32 s6, 0x41700000
; NOOPT-NEXT:    s_mov_b32 s7, 0x41600000
; NOOPT-NEXT:    s_mov_b32 s8, 0x41500000
; NOOPT-NEXT:    s_mov_b32 s9, 0x41400000
; NOOPT-NEXT:    s_mov_b32 s10, 0x41300000
; NOOPT-NEXT:    s_mov_b32 s11, 0x41200000
; NOOPT-NEXT:    s_mov_b32 s12, 0x41100000
; NOOPT-NEXT:    s_mov_b32 s13, 0x41000000
; NOOPT-NEXT:    s_mov_b32 s14, 0x40e00000
; NOOPT-NEXT:    s_mov_b32 s15, 0x40c00000
; NOOPT-NEXT:    s_mov_b32 s16, 0x40a00000
; NOOPT-NEXT:    s_mov_b32 s17, 4.0
; NOOPT-NEXT:    s_mov_b32 s18, 0x40400000
; NOOPT-NEXT:    s_mov_b32 s19, 2.0
; NOOPT-NEXT:    s_mov_b32 s20, 1.0
; NOOPT-NEXT:    v_mov_b32_e32 v7, s20
; NOOPT-NEXT:    v_mov_b32_e32 v30, s19
; NOOPT-NEXT:    v_mov_b32_e32 v29, s18
; NOOPT-NEXT:    v_mov_b32_e32 v28, s17
; NOOPT-NEXT:    v_mov_b32_e32 v27, s16
; NOOPT-NEXT:    v_mov_b32_e32 v26, s15
; NOOPT-NEXT:    v_mov_b32_e32 v25, s14
; NOOPT-NEXT:    v_mov_b32_e32 v24, s13
; NOOPT-NEXT:    v_mov_b32_e32 v23, s12
; NOOPT-NEXT:    v_mov_b32_e32 v6, s11
; NOOPT-NEXT:    v_mov_b32_e32 v5, s10
; NOOPT-NEXT:    v_mov_b32_e32 v4, s9
; NOOPT-NEXT:    v_mov_b32_e32 v3, s8
; NOOPT-NEXT:    v_mov_b32_e32 v2, s7
; NOOPT-NEXT:    v_mov_b32_e32 v1, s6
; NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; NOOPT-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v8, v30
; NOOPT-NEXT:    v_mov_b32_e32 v9, v29
; NOOPT-NEXT:    v_mov_b32_e32 v10, v28
; NOOPT-NEXT:    v_mov_b32_e32 v11, v27
; NOOPT-NEXT:    v_mov_b32_e32 v12, v26
; NOOPT-NEXT:    v_mov_b32_e32 v13, v25
; NOOPT-NEXT:    v_mov_b32_e32 v14, v24
; NOOPT-NEXT:    v_mov_b32_e32 v15, v23
; NOOPT-NEXT:    v_mov_b32_e32 v16, v6
; NOOPT-NEXT:    v_mov_b32_e32 v17, v5
; NOOPT-NEXT:    v_mov_b32_e32 v18, v4
; NOOPT-NEXT:    v_mov_b32_e32 v19, v3
; NOOPT-NEXT:    v_mov_b32_e32 v20, v2
; NOOPT-NEXT:    v_mov_b32_e32 v21, v1
; NOOPT-NEXT:    v_mov_b32_e32 v22, v0
; NOOPT-NEXT:    v_mov_b32_e32 v0, 0x41880000
; NOOPT-NEXT:    s_mov_b32 m0, s4
; NOOPT-NEXT:    v_movreld_b32_e32 v7, v0
; NOOPT-NEXT:    v_mov_b32_e32 v4, v22
; NOOPT-NEXT:    v_mov_b32_e32 v5, v21
; NOOPT-NEXT:    v_mov_b32_e32 v6, v20
; NOOPT-NEXT:    v_mov_b32_e32 v0, v19
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; NOOPT-NEXT:    v_mov_b32_e32 v4, v18
; NOOPT-NEXT:    v_mov_b32_e32 v5, v17
; NOOPT-NEXT:    v_mov_b32_e32 v6, v16
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, v15
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; NOOPT-NEXT:    v_mov_b32_e32 v4, v14
; NOOPT-NEXT:    v_mov_b32_e32 v5, v13
; NOOPT-NEXT:    v_mov_b32_e32 v6, v12
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, v11
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; NOOPT-NEXT:    v_mov_b32_e32 v4, v10
; NOOPT-NEXT:    v_mov_b32_e32 v5, v9
; NOOPT-NEXT:    v_mov_b32_e32 v6, v8
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, v7
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: insert_signed_base_plus_offset:
; SI-MOVREL:       ; %bb.0: ; %entry
; SI-MOVREL-NEXT:    s_load_dword s4, s[2:3], 0xb
; SI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-MOVREL-NEXT:    v_mov_b32_e32 v0, 1.0
; SI-MOVREL-NEXT:    v_mov_b32_e32 v1, 2.0
; SI-MOVREL-NEXT:    v_mov_b32_e32 v2, 0x40400000
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    s_sext_i32_i16 s4, s4
; SI-MOVREL-NEXT:    s_add_i32 s4, s4, 1
; SI-MOVREL-NEXT:    v_mov_b32_e32 v3, 4.0
; SI-MOVREL-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v7, 0x41000000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v8, 0x41100000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v9, 0x41200000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v10, 0x41300000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v11, 0x41400000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v12, 0x41500000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v13, 0x41600000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v14, 0x41700000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v15, 0x41800000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v16, 0x41880000
; SI-MOVREL-NEXT:    s_mov_b32 m0, s4
; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
; SI-MOVREL-NEXT:    v_movreld_b32_e32 v0, v16
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-MOVREL-NEXT:    s_endpgm
;
; VI-MOVREL-LABEL: insert_signed_base_plus_offset:
; VI-MOVREL:       ; %bb.0: ; %entry
; VI-MOVREL-NEXT:    s_load_dword s4, s[2:3], 0x2c
; VI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, 1.0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v1, 2.0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v2, 0x40400000
; VI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; VI-MOVREL-NEXT:    s_sext_i32_i16 s2, s4
; VI-MOVREL-NEXT:    s_add_i32 s2, s2, 1
; VI-MOVREL-NEXT:    s_mov_b32 m0, s2
; VI-MOVREL-NEXT:    s_add_u32 s2, s0, 48
; VI-MOVREL-NEXT:    v_mov_b32_e32 v3, 4.0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v7, 0x41000000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v8, 0x41100000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v9, 0x41200000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v10, 0x41300000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v11, 0x41400000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v12, 0x41500000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v13, 0x41600000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v14, 0x41700000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v15, 0x41800000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v16, 0x41880000
; VI-MOVREL-NEXT:    s_addc_u32 s3, s1, 0
; VI-MOVREL-NEXT:    v_movreld_b32_e32 v0, v16
; VI-MOVREL-NEXT:    v_mov_b32_e32 v17, s3
; VI-MOVREL-NEXT:    v_mov_b32_e32 v16, s2
; VI-MOVREL-NEXT:    s_add_u32 s2, s0, 32
; VI-MOVREL-NEXT:    s_addc_u32 s3, s1, 0
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
; VI-MOVREL-NEXT:    s_nop 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v13, s3
; VI-MOVREL-NEXT:    v_mov_b32_e32 v12, s2
; VI-MOVREL-NEXT:    s_add_u32 s2, s0, 16
; VI-MOVREL-NEXT:    s_addc_u32 s3, s1, 0
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
; VI-MOVREL-NEXT:    s_nop 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v9, s3
; VI-MOVREL-NEXT:    v_mov_b32_e32 v8, s2
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; VI-MOVREL-NEXT:    s_nop 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, s1
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, s0
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-MOVREL-NEXT:    s_endpgm
;
; VI-IDXMODE-LABEL: insert_signed_base_plus_offset:
; VI-IDXMODE:       ; %bb.0: ; %entry
; VI-IDXMODE-NEXT:    s_load_dword s4, s[2:3], 0x2c
; VI-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, 1.0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, 2.0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v2, 0x40400000
; VI-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; VI-IDXMODE-NEXT:    s_sext_i32_i16 s2, s4
; VI-IDXMODE-NEXT:    s_add_i32 s2, s2, 1
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v3, 4.0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v7, 0x41000000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v8, 0x41100000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v9, 0x41200000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v10, 0x41300000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v11, 0x41400000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v12, 0x41500000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v13, 0x41600000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v14, 0x41700000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v15, 0x41800000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v16, 0x41880000
; VI-IDXMODE-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, v16
; VI-IDXMODE-NEXT:    s_set_gpr_idx_off
; VI-IDXMODE-NEXT:    s_add_u32 s2, s0, 48
; VI-IDXMODE-NEXT:    s_addc_u32 s3, s1, 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v17, s3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v16, s2
; VI-IDXMODE-NEXT:    s_add_u32 s2, s0, 32
; VI-IDXMODE-NEXT:    s_addc_u32 s3, s1, 0
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
; VI-IDXMODE-NEXT:    s_nop 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v13, s3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v12, s2
; VI-IDXMODE-NEXT:    s_add_u32 s2, s0, 16
; VI-IDXMODE-NEXT:    s_addc_u32 s3, s1, 0
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
; VI-IDXMODE-NEXT:    s_nop 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v9, s3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v8, s2
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; VI-IDXMODE-NEXT:    s_nop 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, s1
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, s0
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-IDXMODE-NEXT:    s_endpgm
;
; GFX9-IDXMODE-LABEL: insert_signed_base_plus_offset:
; GFX9-IDXMODE:       ; %bb.0: ; %entry
; GFX9-IDXMODE-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GFX9-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v16, 0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, 1.0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    s_sext_i32_i16 s2, s4
; GFX9-IDXMODE-NEXT:    s_add_i32 s2, s2, 1
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v2, 0x40400000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v3, 4.0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v7, 0x41000000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v8, 0x41100000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v9, 0x41200000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v10, 0x41300000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v11, 0x41400000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v12, 0x41500000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v13, 0x41600000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v14, 0x41700000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v15, 0x41800000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v17, 0x41880000
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, v17
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_off
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
; GFX9-IDXMODE-NEXT:    s_endpgm
entry:
  %base = sext i16 %in to i32
  %add = add i32 %base, 1
  %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
  store <16 x float> %ins, ptr addrspace(1) %out
  ret void
}

; Make sure that TwoAddressInstructions keeps src0 as subregister sub0
; of the tied implicit use and def of the super register.
;
; The kernel below inserts float 17.0 (bit pattern 0x41880000) into a constant
; <16 x float> vector at the run-time index %in and stores the whole vector to
; %out. The index is used as-is; no constant offset is added to it.
;
; What the assertions below expect from each RUN configuration:
;  * the default-subtarget run selects every lane separately with an
;    s_cmp_eq_u32 / s_cselect_b64 / v_cndmask_b32 sequence;
;  * the tahiti runs (-O0 and optimized) and the tonga run without
;    -amdgpu-vgpr-index-mode copy the index into m0 and write the element
;    with a single v_movreld_b32;
;  * the tonga run with -amdgpu-vgpr-index-mode and the gfx900 run bracket a
;    plain v_mov_b32 with s_set_gpr_idx_on ..., gpr_idx(DST) and
;    s_set_gpr_idx_off.
;
; The assertions are autogenerated (see the NOTE at the top of the file), so
; regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) {
; GENERIC-LABEL: insert_wo_offset:
; GENERIC:       ; %bb.0: ; %entry
; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GENERIC-NEXT:    s_load_dword s4, s[2:3], 0xb
; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
; GENERIC-NEXT:    s_mov_b32 s2, -1
; GENERIC-NEXT:    v_mov_b32_e32 v10, 0x41880000
; GENERIC-NEXT:    v_mov_b32_e32 v0, 0x40400000
; GENERIC-NEXT:    v_mov_b32_e32 v4, 0x41000000
; GENERIC-NEXT:    v_mov_b32_e32 v5, 0x40e00000
; GENERIC-NEXT:    v_mov_b32_e32 v8, 0x40c00000
; GENERIC-NEXT:    v_mov_b32_e32 v9, 0x40a00000
; GENERIC-NEXT:    v_mov_b32_e32 v11, 0x41400000
; GENERIC-NEXT:    v_mov_b32_e32 v12, 0x41300000
; GENERIC-NEXT:    v_mov_b32_e32 v13, 0x41200000
; GENERIC-NEXT:    v_mov_b32_e32 v14, 0x41100000
; GENERIC-NEXT:    v_mov_b32_e32 v15, 0x41800000
; GENERIC-NEXT:    v_mov_b32_e32 v16, 0x41700000
; GENERIC-NEXT:    v_mov_b32_e32 v17, 0x41600000
; GENERIC-NEXT:    v_mov_b32_e32 v18, 0x41500000
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 3
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 4.0, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 2
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v2, v0, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 1
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 2.0, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 0
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, 1.0, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 7
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v4, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 6
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v5, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 5
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 4
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v4, v9, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 11
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v8, v11, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 10
; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_waitcnt expcnt(0)
; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v12, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 9
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v13, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 8
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v5, v14, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 15
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v9, v15, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 14
; GENERIC-NEXT:    buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_waitcnt expcnt(0)
; GENERIC-NEXT:    v_cndmask_b32_e32 v8, v16, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 13
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v17, v10, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 12
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v18, v10, vcc
; GENERIC-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
; GENERIC-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GENERIC-NEXT:    s_endpgm
;
; NOOPT-LABEL: insert_wo_offset:
; NOOPT:       ; %bb.0: ; %entry
; NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; NOOPT-NEXT:    s_load_dword s4, s[2:3], 0xb
; NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; NOOPT-NEXT:    s_mov_b32 s7, s1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s5, 0xf000
; NOOPT-NEXT:    s_mov_b32 s6, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s7
; NOOPT-NEXT:    s_mov_b32 s2, s6
; NOOPT-NEXT:    s_mov_b32 s3, s5
; NOOPT-NEXT:    s_mov_b32 s5, 0x41800000
; NOOPT-NEXT:    s_mov_b32 s6, 0x41700000
; NOOPT-NEXT:    s_mov_b32 s7, 0x41600000
; NOOPT-NEXT:    s_mov_b32 s8, 0x41500000
; NOOPT-NEXT:    s_mov_b32 s9, 0x41400000
; NOOPT-NEXT:    s_mov_b32 s10, 0x41300000
; NOOPT-NEXT:    s_mov_b32 s11, 0x41200000
; NOOPT-NEXT:    s_mov_b32 s12, 0x41100000
; NOOPT-NEXT:    s_mov_b32 s13, 0x41000000
; NOOPT-NEXT:    s_mov_b32 s14, 0x40e00000
; NOOPT-NEXT:    s_mov_b32 s15, 0x40c00000
; NOOPT-NEXT:    s_mov_b32 s16, 0x40a00000
; NOOPT-NEXT:    s_mov_b32 s17, 4.0
; NOOPT-NEXT:    s_mov_b32 s18, 0x40400000
; NOOPT-NEXT:    s_mov_b32 s19, 2.0
; NOOPT-NEXT:    s_mov_b32 s20, 1.0
; NOOPT-NEXT:    v_mov_b32_e32 v7, s20
; NOOPT-NEXT:    v_mov_b32_e32 v30, s19
; NOOPT-NEXT:    v_mov_b32_e32 v29, s18
; NOOPT-NEXT:    v_mov_b32_e32 v28, s17
; NOOPT-NEXT:    v_mov_b32_e32 v27, s16
; NOOPT-NEXT:    v_mov_b32_e32 v26, s15
; NOOPT-NEXT:    v_mov_b32_e32 v25, s14
; NOOPT-NEXT:    v_mov_b32_e32 v24, s13
; NOOPT-NEXT:    v_mov_b32_e32 v23, s12
; NOOPT-NEXT:    v_mov_b32_e32 v6, s11
; NOOPT-NEXT:    v_mov_b32_e32 v5, s10
; NOOPT-NEXT:    v_mov_b32_e32 v4, s9
; NOOPT-NEXT:    v_mov_b32_e32 v3, s8
; NOOPT-NEXT:    v_mov_b32_e32 v2, s7
; NOOPT-NEXT:    v_mov_b32_e32 v1, s6
; NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; NOOPT-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v8, v30
; NOOPT-NEXT:    v_mov_b32_e32 v9, v29
; NOOPT-NEXT:    v_mov_b32_e32 v10, v28
; NOOPT-NEXT:    v_mov_b32_e32 v11, v27
; NOOPT-NEXT:    v_mov_b32_e32 v12, v26
; NOOPT-NEXT:    v_mov_b32_e32 v13, v25
; NOOPT-NEXT:    v_mov_b32_e32 v14, v24
; NOOPT-NEXT:    v_mov_b32_e32 v15, v23
; NOOPT-NEXT:    v_mov_b32_e32 v16, v6
; NOOPT-NEXT:    v_mov_b32_e32 v17, v5
; NOOPT-NEXT:    v_mov_b32_e32 v18, v4
; NOOPT-NEXT:    v_mov_b32_e32 v19, v3
; NOOPT-NEXT:    v_mov_b32_e32 v20, v2
; NOOPT-NEXT:    v_mov_b32_e32 v21, v1
; NOOPT-NEXT:    v_mov_b32_e32 v22, v0
; NOOPT-NEXT:    v_mov_b32_e32 v0, 0x41880000
; NOOPT-NEXT:    s_mov_b32 m0, s4
; NOOPT-NEXT:    v_movreld_b32_e32 v7, v0
; NOOPT-NEXT:    v_mov_b32_e32 v4, v22
; NOOPT-NEXT:    v_mov_b32_e32 v5, v21
; NOOPT-NEXT:    v_mov_b32_e32 v6, v20
; NOOPT-NEXT:    v_mov_b32_e32 v0, v19
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; NOOPT-NEXT:    v_mov_b32_e32 v4, v18
; NOOPT-NEXT:    v_mov_b32_e32 v5, v17
; NOOPT-NEXT:    v_mov_b32_e32 v6, v16
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, v15
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; NOOPT-NEXT:    v_mov_b32_e32 v4, v14
; NOOPT-NEXT:    v_mov_b32_e32 v5, v13
; NOOPT-NEXT:    v_mov_b32_e32 v6, v12
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, v11
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; NOOPT-NEXT:    v_mov_b32_e32 v4, v10
; NOOPT-NEXT:    v_mov_b32_e32 v5, v9
; NOOPT-NEXT:    v_mov_b32_e32 v6, v8
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, v7
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: insert_wo_offset:
; SI-MOVREL:       ; %bb.0: ; %entry
; SI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-MOVREL-NEXT:    s_load_dword s4, s[2:3], 0xb
; SI-MOVREL-NEXT:    v_mov_b32_e32 v0, 1.0
; SI-MOVREL-NEXT:    v_mov_b32_e32 v1, 2.0
; SI-MOVREL-NEXT:    v_mov_b32_e32 v2, 0x40400000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v3, 4.0
; SI-MOVREL-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v7, 0x41000000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v8, 0x41100000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v9, 0x41200000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v10, 0x41300000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v11, 0x41400000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v12, 0x41500000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v13, 0x41600000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v14, 0x41700000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v15, 0x41800000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v16, 0x41880000
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    s_mov_b32 m0, s4
; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
; SI-MOVREL-NEXT:    v_movreld_b32_e32 v0, v16
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-MOVREL-NEXT:    s_endpgm
;
; VI-MOVREL-LABEL: insert_wo_offset:
; VI-MOVREL:       ; %bb.0: ; %entry
; VI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-MOVREL-NEXT:    s_load_dword s2, s[2:3], 0x2c
; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, 1.0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v1, 2.0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v2, 0x40400000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v3, 4.0
; VI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; VI-MOVREL-NEXT:    s_mov_b32 m0, s2
; VI-MOVREL-NEXT:    s_add_u32 s2, s0, 48
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v7, 0x41000000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v8, 0x41100000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v9, 0x41200000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v10, 0x41300000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v11, 0x41400000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v12, 0x41500000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v13, 0x41600000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v14, 0x41700000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v15, 0x41800000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v16, 0x41880000
; VI-MOVREL-NEXT:    s_addc_u32 s3, s1, 0
; VI-MOVREL-NEXT:    v_movreld_b32_e32 v0, v16
; VI-MOVREL-NEXT:    v_mov_b32_e32 v17, s3
; VI-MOVREL-NEXT:    v_mov_b32_e32 v16, s2
; VI-MOVREL-NEXT:    s_add_u32 s2, s0, 32
; VI-MOVREL-NEXT:    s_addc_u32 s3, s1, 0
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
; VI-MOVREL-NEXT:    s_nop 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v13, s3
; VI-MOVREL-NEXT:    v_mov_b32_e32 v12, s2
; VI-MOVREL-NEXT:    s_add_u32 s2, s0, 16
; VI-MOVREL-NEXT:    s_addc_u32 s3, s1, 0
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
; VI-MOVREL-NEXT:    s_nop 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v9, s3
; VI-MOVREL-NEXT:    v_mov_b32_e32 v8, s2
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; VI-MOVREL-NEXT:    s_nop 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, s1
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, s0
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-MOVREL-NEXT:    s_endpgm
;
; VI-IDXMODE-LABEL: insert_wo_offset:
; VI-IDXMODE:       ; %bb.0: ; %entry
; VI-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-IDXMODE-NEXT:    s_load_dword s2, s[2:3], 0x2c
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, 1.0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, 2.0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v2, 0x40400000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v3, 4.0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v7, 0x41000000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v8, 0x41100000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v9, 0x41200000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v10, 0x41300000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v11, 0x41400000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v12, 0x41500000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v13, 0x41600000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v14, 0x41700000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v15, 0x41800000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v16, 0x41880000
; VI-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; VI-IDXMODE-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, v16
; VI-IDXMODE-NEXT:    s_set_gpr_idx_off
; VI-IDXMODE-NEXT:    s_add_u32 s2, s0, 48
; VI-IDXMODE-NEXT:    s_addc_u32 s3, s1, 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v17, s3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v16, s2
; VI-IDXMODE-NEXT:    s_add_u32 s2, s0, 32
; VI-IDXMODE-NEXT:    s_addc_u32 s3, s1, 0
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
; VI-IDXMODE-NEXT:    s_nop 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v13, s3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v12, s2
; VI-IDXMODE-NEXT:    s_add_u32 s2, s0, 16
; VI-IDXMODE-NEXT:    s_addc_u32 s3, s1, 0
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
; VI-IDXMODE-NEXT:    s_nop 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v9, s3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v8, s2
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; VI-IDXMODE-NEXT:    s_nop 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, s1
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, s0
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-IDXMODE-NEXT:    s_endpgm
;
; GFX9-IDXMODE-LABEL: insert_wo_offset:
; GFX9-IDXMODE:       ; %bb.0: ; %entry
; GFX9-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-IDXMODE-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v16, 0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, 1.0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v2, 0x40400000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v3, 4.0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v7, 0x41000000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v8, 0x41100000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v9, 0x41200000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v10, 0x41300000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v11, 0x41400000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v12, 0x41500000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v13, 0x41600000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v14, 0x41700000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v15, 0x41800000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v17, 0x41880000
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_on s4, gpr_idx(DST)
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, v17
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_off
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
; GFX9-IDXMODE-NEXT:    s_endpgm
entry:
  ; Overwrite the lane selected by the kernel argument %in with 17.0; the
  ; other fifteen lanes keep their constant values 1.0 through 16.0.
  %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %in
  ; Write all 16 lanes back to the output buffer.
  store <16 x float> %ins, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addrspace(1) %out, i32 %offset) {
; GENERIC-LABEL: insert_neg_offset_sgpr:
; GENERIC:       ; %bb.0: ; %entry
; GENERIC-NEXT:    s_load_dword s4, s[2:3], 0xd
; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
; GENERIC-NEXT:    s_mov_b32 s2, -1
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    s_add_i32 s6, s4, 0xfffffe00
; GENERIC-NEXT:    s_cmp_eq_u32 s6, 0
; GENERIC-NEXT:    s_cselect_b64 s[4:5], -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s6, 3
; GENERIC-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GENERIC-NEXT:    s_cselect_b32 s4, 16, 3
; GENERIC-NEXT:    s_cmp_eq_u32 s6, 2
; GENERIC-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GENERIC-NEXT:    s_cselect_b32 s5, 16, 2
; GENERIC-NEXT:    s_cmp_eq_u32 s6, 1
; GENERIC-NEXT:    v_mov_b32_e32 v3, s4
; GENERIC-NEXT:    s_cselect_b32 s4, 16, 1
; GENERIC-NEXT:    s_cmp_eq_u32 s6, 7
; GENERIC-NEXT:    v_mov_b32_e32 v2, s5
; GENERIC-NEXT:    s_cselect_b32 s5, 16, 7
; GENERIC-NEXT:    s_cmp_eq_u32 s6, 6
; GENERIC-NEXT:    v_mov_b32_e32 v1, s4
; GENERIC-NEXT:    s_cselect_b32 s4, 16, 6
; GENERIC-NEXT:    s_cmp_eq_u32 s6, 5
; GENERIC-NEXT:    v_mov_b32_e32 v7, s5
; GENERIC-NEXT:    s_cselect_b32 s5, 16, 5
; GENERIC-NEXT:    s_cmp_eq_u32 s6, 4
; GENERIC-NEXT:    v_mov_b32_e32 v6, s4
; GENERIC-NEXT:    s_cselect_b32 s4, 16, 4
; GENERIC-NEXT:    s_cmp_eq_u32 s6, 11
; GENERIC-NEXT:    v_mov_b32_e32 v5, s5
; GENERIC-NEXT:    s_cselect_b32 s5, 16, 11
; GENERIC-NEXT:    s_cmp_eq_u32 s6, 10
; GENERIC-NEXT:    v_mov_b32_e32 v4, s4
; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GENERIC-NEXT:    s_cselect_b32 s4, 16, 10
; GENERIC-NEXT:    s_cmp_eq_u32 s6, 9
; GENERIC-NEXT:    s_waitcnt expcnt(0)
; GENERIC-NEXT:    v_mov_b32_e32 v7, s5
; GENERIC-NEXT:    s_cselect_b32 s5, 16, 9
; GENERIC-NEXT:    s_cmp_eq_u32 s6, 8
; GENERIC-NEXT:    v_mov_b32_e32 v6, s4
; GENERIC-NEXT:    s_cselect_b32 s4, 16, 8
; GENERIC-NEXT:    s_cmp_eq_u32 s6, 15
; GENERIC-NEXT:    v_mov_b32_e32 v5, s5
; GENERIC-NEXT:    s_cselect_b32 s5, 16, 15
; GENERIC-NEXT:    s_cmp_eq_u32 s6, 14
; GENERIC-NEXT:    v_mov_b32_e32 v4, s4
; GENERIC-NEXT:    s_cselect_b32 s4, 16, 14
; GENERIC-NEXT:    s_cmp_eq_u32 s6, 13
; GENERIC-NEXT:    s_cselect_b32 s7, 16, 13
; GENERIC-NEXT:    s_cmp_eq_u32 s6, 12
; GENERIC-NEXT:    s_cselect_b32 s6, 16, 12
; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
; GENERIC-NEXT:    s_waitcnt expcnt(0)
; GENERIC-NEXT:    v_mov_b32_e32 v7, s5
; GENERIC-NEXT:    v_mov_b32_e32 v6, s4
; GENERIC-NEXT:    v_mov_b32_e32 v5, s7
; GENERIC-NEXT:    v_mov_b32_e32 v4, s6
; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
; GENERIC-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GENERIC-NEXT:    s_endpgm
;
; NOOPT-LABEL: insert_neg_offset_sgpr:
; NOOPT:       ; %bb.0: ; %entry
; NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
; NOOPT-NEXT:    s_load_dword s4, s[2:3], 0xd
; NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; NOOPT-NEXT:    s_mov_b32 s7, s1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s5, 0xf000
; NOOPT-NEXT:    s_mov_b32 s6, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s7
; NOOPT-NEXT:    s_mov_b32 s2, s6
; NOOPT-NEXT:    s_mov_b32 s3, s5
; NOOPT-NEXT:    s_mov_b32 s5, 15
; NOOPT-NEXT:    s_mov_b32 s6, 14
; NOOPT-NEXT:    s_mov_b32 s7, 13
; NOOPT-NEXT:    s_mov_b32 s8, 12
; NOOPT-NEXT:    s_mov_b32 s9, 11
; NOOPT-NEXT:    s_mov_b32 s10, 10
; NOOPT-NEXT:    s_mov_b32 s11, 9
; NOOPT-NEXT:    s_mov_b32 s12, 8
; NOOPT-NEXT:    s_mov_b32 s13, 7
; NOOPT-NEXT:    s_mov_b32 s14, 6
; NOOPT-NEXT:    s_mov_b32 s15, 5
; NOOPT-NEXT:    s_mov_b32 s16, 4
; NOOPT-NEXT:    s_mov_b32 s17, 3
; NOOPT-NEXT:    s_mov_b32 s18, 2
; NOOPT-NEXT:    s_mov_b32 s19, 1
; NOOPT-NEXT:    s_mov_b32 s20, 0
; NOOPT-NEXT:    v_mov_b32_e32 v15, s20
; NOOPT-NEXT:    v_mov_b32_e32 v14, s19
; NOOPT-NEXT:    v_mov_b32_e32 v13, s18
; NOOPT-NEXT:    v_mov_b32_e32 v12, s17
; NOOPT-NEXT:    v_mov_b32_e32 v11, s16
; NOOPT-NEXT:    v_mov_b32_e32 v10, s15
; NOOPT-NEXT:    v_mov_b32_e32 v9, s14
; NOOPT-NEXT:    v_mov_b32_e32 v8, s13
; NOOPT-NEXT:    v_mov_b32_e32 v7, s12
; NOOPT-NEXT:    v_mov_b32_e32 v6, s11
; NOOPT-NEXT:    v_mov_b32_e32 v5, s10
; NOOPT-NEXT:    v_mov_b32_e32 v4, s9
; NOOPT-NEXT:    v_mov_b32_e32 v3, s8
; NOOPT-NEXT:    v_mov_b32_e32 v2, s7
; NOOPT-NEXT:    v_mov_b32_e32 v1, s6
; NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; NOOPT-NEXT:    ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v16, v14
; NOOPT-NEXT:    v_mov_b32_e32 v17, v13
; NOOPT-NEXT:    v_mov_b32_e32 v18, v12
; NOOPT-NEXT:    v_mov_b32_e32 v19, v11
; NOOPT-NEXT:    v_mov_b32_e32 v20, v10
; NOOPT-NEXT:    v_mov_b32_e32 v21, v9
; NOOPT-NEXT:    v_mov_b32_e32 v22, v8
; NOOPT-NEXT:    v_mov_b32_e32 v23, v7
; NOOPT-NEXT:    v_mov_b32_e32 v24, v6
; NOOPT-NEXT:    v_mov_b32_e32 v25, v5
; NOOPT-NEXT:    v_mov_b32_e32 v26, v4
; NOOPT-NEXT:    v_mov_b32_e32 v27, v3
; NOOPT-NEXT:    v_mov_b32_e32 v28, v2
; NOOPT-NEXT:    v_mov_b32_e32 v29, v1
; NOOPT-NEXT:    v_mov_b32_e32 v30, v0
; NOOPT-NEXT:    v_mov_b32_e32 v0, 16
; NOOPT-NEXT:    s_add_i32 m0, s4, 0xfffffe00
; NOOPT-NEXT:    v_movreld_b32_e32 v15, v0
; NOOPT-NEXT:    v_mov_b32_e32 v4, v18
; NOOPT-NEXT:    v_mov_b32_e32 v5, v17
; NOOPT-NEXT:    v_mov_b32_e32 v6, v16
; NOOPT-NEXT:    v_mov_b32_e32 v0, v15
; NOOPT-NEXT:    v_mov_b32_e32 v1, v22
; NOOPT-NEXT:    v_mov_b32_e32 v2, v21
; NOOPT-NEXT:    v_mov_b32_e32 v3, v20
; NOOPT-NEXT:    v_mov_b32_e32 v7, v19
; NOOPT-NEXT:    v_mov_b32_e32 v12, v26
; NOOPT-NEXT:    v_mov_b32_e32 v13, v25
; NOOPT-NEXT:    v_mov_b32_e32 v14, v24
; NOOPT-NEXT:    v_mov_b32_e32 v8, v23
; NOOPT-NEXT:    v_mov_b32_e32 v9, v30
; NOOPT-NEXT:    v_mov_b32_e32 v10, v29
; NOOPT-NEXT:    v_mov_b32_e32 v11, v28
; NOOPT-NEXT:    v_mov_b32_e32 v15, v27
; NOOPT-NEXT:    ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16_vgpr17_vgpr18 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v16, v11
; NOOPT-NEXT:    v_mov_b32_e32 v17, v10
; NOOPT-NEXT:    v_mov_b32_e32 v18, v9
; NOOPT-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48
; NOOPT-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v9, v14
; NOOPT-NEXT:    v_mov_b32_e32 v10, v13
; NOOPT-NEXT:    v_mov_b32_e32 v11, v12
; NOOPT-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; NOOPT-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10 killed $exec
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v8, v3
; NOOPT-NEXT:    v_mov_b32_e32 v9, v2
; NOOPT-NEXT:    v_mov_b32_e32 v10, v1
; NOOPT-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:16
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: insert_neg_offset_sgpr:
; SI-MOVREL:       ; %bb.0: ; %entry
; SI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
; SI-MOVREL-NEXT:    s_load_dword s4, s[2:3], 0xd
; SI-MOVREL-NEXT:    v_mov_b32_e32 v0, 0
; SI-MOVREL-NEXT:    v_mov_b32_e32 v1, 1
; SI-MOVREL-NEXT:    v_mov_b32_e32 v2, 2
; SI-MOVREL-NEXT:    v_mov_b32_e32 v3, 3
; SI-MOVREL-NEXT:    v_mov_b32_e32 v4, 4
; SI-MOVREL-NEXT:    v_mov_b32_e32 v5, 5
; SI-MOVREL-NEXT:    v_mov_b32_e32 v6, 6
; SI-MOVREL-NEXT:    v_mov_b32_e32 v7, 7
; SI-MOVREL-NEXT:    v_mov_b32_e32 v8, 8
; SI-MOVREL-NEXT:    v_mov_b32_e32 v9, 9
; SI-MOVREL-NEXT:    v_mov_b32_e32 v10, 10
; SI-MOVREL-NEXT:    v_mov_b32_e32 v11, 11
; SI-MOVREL-NEXT:    v_mov_b32_e32 v12, 12
; SI-MOVREL-NEXT:    v_mov_b32_e32 v13, 13
; SI-MOVREL-NEXT:    v_mov_b32_e32 v14, 14
; SI-MOVREL-NEXT:    v_mov_b32_e32 v15, 15
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    s_add_i32 m0, s4, 0xfffffe00
; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
; SI-MOVREL-NEXT:    v_movreld_b32_e32 v0, 16
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-MOVREL-NEXT:    s_endpgm
;
; VI-MOVREL-LABEL: insert_neg_offset_sgpr:
; VI-MOVREL:       ; %bb.0: ; %entry
; VI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; VI-MOVREL-NEXT:    s_load_dword s2, s[2:3], 0x34
; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v1, 1
; VI-MOVREL-NEXT:    v_mov_b32_e32 v2, 2
; VI-MOVREL-NEXT:    v_mov_b32_e32 v3, 3
; VI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; VI-MOVREL-NEXT:    s_add_i32 m0, s2, 0xfffffe00
; VI-MOVREL-NEXT:    s_add_u32 s2, s0, 48
; VI-MOVREL-NEXT:    s_addc_u32 s3, s1, 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v17, s3
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, 4
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, 5
; VI-MOVREL-NEXT:    v_mov_b32_e32 v6, 6
; VI-MOVREL-NEXT:    v_mov_b32_e32 v7, 7
; VI-MOVREL-NEXT:    v_mov_b32_e32 v8, 8
; VI-MOVREL-NEXT:    v_mov_b32_e32 v9, 9
; VI-MOVREL-NEXT:    v_mov_b32_e32 v10, 10
; VI-MOVREL-NEXT:    v_mov_b32_e32 v11, 11
; VI-MOVREL-NEXT:    v_mov_b32_e32 v12, 12
; VI-MOVREL-NEXT:    v_mov_b32_e32 v13, 13
; VI-MOVREL-NEXT:    v_mov_b32_e32 v14, 14
; VI-MOVREL-NEXT:    v_mov_b32_e32 v15, 15
; VI-MOVREL-NEXT:    v_mov_b32_e32 v16, s2
; VI-MOVREL-NEXT:    s_add_u32 s2, s0, 32
; VI-MOVREL-NEXT:    v_movreld_b32_e32 v0, 16
; VI-MOVREL-NEXT:    s_addc_u32 s3, s1, 0
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
; VI-MOVREL-NEXT:    s_nop 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v13, s3
; VI-MOVREL-NEXT:    v_mov_b32_e32 v12, s2
; VI-MOVREL-NEXT:    s_add_u32 s2, s0, 16
; VI-MOVREL-NEXT:    s_addc_u32 s3, s1, 0
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
; VI-MOVREL-NEXT:    s_nop 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v9, s3
; VI-MOVREL-NEXT:    v_mov_b32_e32 v8, s2
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; VI-MOVREL-NEXT:    s_nop 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, s1
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, s0
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-MOVREL-NEXT:    s_endpgm
;
; VI-IDXMODE-LABEL: insert_neg_offset_sgpr:
; VI-IDXMODE:       ; %bb.0: ; %entry
; VI-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; VI-IDXMODE-NEXT:    s_load_dword s2, s[2:3], 0x34
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, 1
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v2, 2
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v3, 3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, 4
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, 5
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v6, 6
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v7, 7
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v8, 8
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v9, 9
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v10, 10
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v11, 11
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v12, 12
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v13, 13
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v14, 14
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v15, 15
; VI-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; VI-IDXMODE-NEXT:    s_addk_i32 s2, 0xfe00
; VI-IDXMODE-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, 16
; VI-IDXMODE-NEXT:    s_set_gpr_idx_off
; VI-IDXMODE-NEXT:    s_add_u32 s2, s0, 48
; VI-IDXMODE-NEXT:    s_addc_u32 s3, s1, 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v17, s3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v16, s2
; VI-IDXMODE-NEXT:    s_add_u32 s2, s0, 32
; VI-IDXMODE-NEXT:    s_addc_u32 s3, s1, 0
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
; VI-IDXMODE-NEXT:    s_nop 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v13, s3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v12, s2
; VI-IDXMODE-NEXT:    s_add_u32 s2, s0, 16
; VI-IDXMODE-NEXT:    s_addc_u32 s3, s1, 0
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
; VI-IDXMODE-NEXT:    s_nop 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v9, s3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v8, s2
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; VI-IDXMODE-NEXT:    s_nop 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, s1
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, s0
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-IDXMODE-NEXT:    s_endpgm
;
; GFX9-IDXMODE-LABEL: insert_neg_offset_sgpr:
; GFX9-IDXMODE:       ; %bb.0: ; %entry
; GFX9-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GFX9-IDXMODE-NEXT:    s_load_dword s4, s[2:3], 0x34
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v4, 4
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v5, 5
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v6, 6
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v7, 7
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v8, 8
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v9, 9
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v10, 10
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v11, 11
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v12, 12
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v13, 13
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v14, 14
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v15, 15
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v16, v15
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    s_addk_i32 s4, 0xfe00
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v15, v14
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v14, v13
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v13, v12
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v12, v11
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v11, v10
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v10, v9
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v9, v8
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v8, v7
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v7, v6
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v6, v5
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v5, v4
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v4, v3
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v3, v2
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v2, v1
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_on s4, gpr_idx(DST)
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v1, 16
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_off
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v0, v[13:16], s[0:1] offset:48
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v0, v[9:12], s[0:1] offset:32
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v0, v[5:8], s[0:1] offset:16
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v0, v[1:4], s[0:1]
; GFX9-IDXMODE-NEXT:    s_endpgm
entry:
  %index = add i32 %offset, -512
  %value = insertelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, i32 16, i32 %index
  store <16 x i32> %value, ptr addrspace(1) %out
  ret void
}

; The vector being indexed is originally loaded into SGPRs rather
; than being built with a reg_sequence.
define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(ptr addrspace(1) %in, ptr addrspace(1) %out, <16 x i32> %vec, i32 %offset) {
; GENERIC-LABEL: insert_neg_offset_sgpr_loadreg:
; GENERIC:       ; %bb.0: ; %entry
; GENERIC-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0xb
; GENERIC-NEXT:    s_load_dword s20, s[2:3], 0x29
; GENERIC-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x19
; GENERIC-NEXT:    s_mov_b32 s19, 0xf000
; GENERIC-NEXT:    s_mov_b32 s18, -1
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    s_addk_i32 s20, 0xfe00
; GENERIC-NEXT:    s_cmp_lg_u32 s20, 3
; GENERIC-NEXT:    s_cselect_b32 s3, s3, 5
; GENERIC-NEXT:    s_cmp_lg_u32 s20, 2
; GENERIC-NEXT:    s_cselect_b32 s2, s2, 5
; GENERIC-NEXT:    s_cmp_lg_u32 s20, 1
; GENERIC-NEXT:    v_mov_b32_e32 v3, s3
; GENERIC-NEXT:    s_cselect_b32 s1, s1, 5
; GENERIC-NEXT:    s_cmp_lg_u32 s20, 0
; GENERIC-NEXT:    v_mov_b32_e32 v2, s2
; GENERIC-NEXT:    s_cselect_b32 s0, s0, 5
; GENERIC-NEXT:    s_cmp_lg_u32 s20, 7
; GENERIC-NEXT:    v_mov_b32_e32 v1, s1
; GENERIC-NEXT:    s_cselect_b32 s1, s7, 5
; GENERIC-NEXT:    s_cmp_lg_u32 s20, 6
; GENERIC-NEXT:    v_mov_b32_e32 v0, s0
; GENERIC-NEXT:    s_cselect_b32 s0, s6, 5
; GENERIC-NEXT:    s_cmp_lg_u32 s20, 5
; GENERIC-NEXT:    v_mov_b32_e32 v7, s1
; GENERIC-NEXT:    s_cselect_b32 s1, s5, 5
; GENERIC-NEXT:    s_cmp_lg_u32 s20, 4
; GENERIC-NEXT:    v_mov_b32_e32 v6, s0
; GENERIC-NEXT:    s_cselect_b32 s0, s4, 5
; GENERIC-NEXT:    s_cmp_lg_u32 s20, 11
; GENERIC-NEXT:    v_mov_b32_e32 v5, s1
; GENERIC-NEXT:    s_cselect_b32 s1, s11, 5
; GENERIC-NEXT:    s_cmp_lg_u32 s20, 10
; GENERIC-NEXT:    v_mov_b32_e32 v4, s0
; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
; GENERIC-NEXT:    s_cselect_b32 s0, s10, 5
; GENERIC-NEXT:    s_cmp_lg_u32 s20, 9
; GENERIC-NEXT:    s_waitcnt expcnt(0)
; GENERIC-NEXT:    v_mov_b32_e32 v7, s1
; GENERIC-NEXT:    s_cselect_b32 s1, s9, 5
; GENERIC-NEXT:    s_cmp_lg_u32 s20, 8
; GENERIC-NEXT:    v_mov_b32_e32 v6, s0
; GENERIC-NEXT:    s_cselect_b32 s0, s8, 5
; GENERIC-NEXT:    s_cmp_lg_u32 s20, 15
; GENERIC-NEXT:    v_mov_b32_e32 v5, s1
; GENERIC-NEXT:    s_cselect_b32 s1, s15, 5
; GENERIC-NEXT:    s_cmp_lg_u32 s20, 14
; GENERIC-NEXT:    v_mov_b32_e32 v4, s0
; GENERIC-NEXT:    s_cselect_b32 s0, s14, 5
; GENERIC-NEXT:    s_cmp_lg_u32 s20, 13
; GENERIC-NEXT:    s_cselect_b32 s2, s13, 5
; GENERIC-NEXT:    s_cmp_lg_u32 s20, 12
; GENERIC-NEXT:    s_cselect_b32 s3, s12, 5
; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:32
; GENERIC-NEXT:    s_waitcnt expcnt(0)
; GENERIC-NEXT:    v_mov_b32_e32 v7, s1
; GENERIC-NEXT:    v_mov_b32_e32 v6, s0
; GENERIC-NEXT:    v_mov_b32_e32 v5, s2
; GENERIC-NEXT:    v_mov_b32_e32 v4, s3
; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:48
; GENERIC-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
; GENERIC-NEXT:    s_endpgm
;
; NOOPT-LABEL: insert_neg_offset_sgpr_loadreg:
; NOOPT:       ; %bb.0: ; %entry
; NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
; NOOPT-NEXT:    s_load_dwordx16 s[8:23], s[2:3], 0x19
; NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x29
; NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; NOOPT-NEXT:    s_mov_b32 s7, s1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s5, 0xf000
; NOOPT-NEXT:    s_mov_b32 s6, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s7
; NOOPT-NEXT:    s_mov_b32 s2, s6
; NOOPT-NEXT:    s_mov_b32 s3, s5
; NOOPT-NEXT:    v_mov_b32_e32 v0, 5
; NOOPT-NEXT:    v_mov_b32_e32 v30, s23
; NOOPT-NEXT:    v_mov_b32_e32 v29, s22
; NOOPT-NEXT:    v_mov_b32_e32 v28, s21
; NOOPT-NEXT:    v_mov_b32_e32 v27, s20
; NOOPT-NEXT:    v_mov_b32_e32 v26, s19
; NOOPT-NEXT:    v_mov_b32_e32 v25, s18
; NOOPT-NEXT:    v_mov_b32_e32 v24, s17
; NOOPT-NEXT:    v_mov_b32_e32 v23, s16
; NOOPT-NEXT:    v_mov_b32_e32 v22, s15
; NOOPT-NEXT:    v_mov_b32_e32 v21, s14
; NOOPT-NEXT:    v_mov_b32_e32 v20, s13
; NOOPT-NEXT:    v_mov_b32_e32 v19, s12
; NOOPT-NEXT:    v_mov_b32_e32 v18, s11
; NOOPT-NEXT:    v_mov_b32_e32 v17, s10
; NOOPT-NEXT:    v_mov_b32_e32 v16, s9
; NOOPT-NEXT:    v_mov_b32_e32 v15, s8
; NOOPT-NEXT:    s_add_i32 m0, s4, 0xfffffe00
; NOOPT-NEXT:    v_movreld_b32_e32 v15, v0
; NOOPT-NEXT:    v_mov_b32_e32 v4, v18
; NOOPT-NEXT:    v_mov_b32_e32 v5, v17
; NOOPT-NEXT:    v_mov_b32_e32 v6, v16
; NOOPT-NEXT:    v_mov_b32_e32 v0, v15
; NOOPT-NEXT:    v_mov_b32_e32 v1, v22
; NOOPT-NEXT:    v_mov_b32_e32 v2, v21
; NOOPT-NEXT:    v_mov_b32_e32 v3, v20
; NOOPT-NEXT:    v_mov_b32_e32 v7, v19
; NOOPT-NEXT:    v_mov_b32_e32 v12, v26
; NOOPT-NEXT:    v_mov_b32_e32 v13, v25
; NOOPT-NEXT:    v_mov_b32_e32 v14, v24
; NOOPT-NEXT:    v_mov_b32_e32 v8, v23
; NOOPT-NEXT:    v_mov_b32_e32 v9, v30
; NOOPT-NEXT:    v_mov_b32_e32 v10, v29
; NOOPT-NEXT:    v_mov_b32_e32 v11, v28
; NOOPT-NEXT:    v_mov_b32_e32 v15, v27
; NOOPT-NEXT:    ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16_vgpr17_vgpr18 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v16, v11
; NOOPT-NEXT:    v_mov_b32_e32 v17, v10
; NOOPT-NEXT:    v_mov_b32_e32 v18, v9
; NOOPT-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48
; NOOPT-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v9, v14
; NOOPT-NEXT:    v_mov_b32_e32 v10, v13
; NOOPT-NEXT:    v_mov_b32_e32 v11, v12
; NOOPT-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; NOOPT-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10 killed $exec
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v8, v3
; NOOPT-NEXT:    v_mov_b32_e32 v9, v2
; NOOPT-NEXT:    v_mov_b32_e32 v10, v1
; NOOPT-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:16
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: insert_neg_offset_sgpr_loadreg:
; SI-MOVREL:       ; %bb.0: ; %entry
; SI-MOVREL-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x19
; SI-MOVREL-NEXT:    s_load_dword s0, s[2:3], 0x29
; SI-MOVREL-NEXT:    s_load_dwordx2 s[20:21], s[2:3], 0xb
; SI-MOVREL-NEXT:    s_mov_b32 s23, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s22, -1
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    v_mov_b32_e32 v0, s4
; SI-MOVREL-NEXT:    v_mov_b32_e32 v1, s5
; SI-MOVREL-NEXT:    v_mov_b32_e32 v2, s6
; SI-MOVREL-NEXT:    v_mov_b32_e32 v3, s7
; SI-MOVREL-NEXT:    v_mov_b32_e32 v4, s8
; SI-MOVREL-NEXT:    v_mov_b32_e32 v5, s9
; SI-MOVREL-NEXT:    v_mov_b32_e32 v6, s10
; SI-MOVREL-NEXT:    v_mov_b32_e32 v7, s11
; SI-MOVREL-NEXT:    v_mov_b32_e32 v8, s12
; SI-MOVREL-NEXT:    v_mov_b32_e32 v9, s13
; SI-MOVREL-NEXT:    v_mov_b32_e32 v10, s14
; SI-MOVREL-NEXT:    v_mov_b32_e32 v11, s15
; SI-MOVREL-NEXT:    v_mov_b32_e32 v12, s16
; SI-MOVREL-NEXT:    v_mov_b32_e32 v13, s17
; SI-MOVREL-NEXT:    v_mov_b32_e32 v14, s18
; SI-MOVREL-NEXT:    v_mov_b32_e32 v15, s19
; SI-MOVREL-NEXT:    s_add_i32 m0, s0, 0xfffffe00
; SI-MOVREL-NEXT:    v_movreld_b32_e32 v0, 5
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[12:15], off, s[20:23], 0 offset:48
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[8:11], off, s[20:23], 0 offset:32
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[20:23], 0
; SI-MOVREL-NEXT:    s_endpgm
;
; VI-MOVREL-LABEL: insert_neg_offset_sgpr_loadreg:
; VI-MOVREL:       ; %bb.0: ; %entry
; VI-MOVREL-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x64
; VI-MOVREL-NEXT:    s_load_dword s20, s[2:3], 0xa4
; VI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; VI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, s4
; VI-MOVREL-NEXT:    s_add_i32 m0, s20, 0xfffffe00
; VI-MOVREL-NEXT:    s_add_u32 s2, s0, 48
; VI-MOVREL-NEXT:    s_addc_u32 s3, s1, 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v17, s3
; VI-MOVREL-NEXT:    v_mov_b32_e32 v1, s5
; VI-MOVREL-NEXT:    v_mov_b32_e32 v2, s6
; VI-MOVREL-NEXT:    v_mov_b32_e32 v3, s7
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, s8
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, s9
; VI-MOVREL-NEXT:    v_mov_b32_e32 v6, s10
; VI-MOVREL-NEXT:    v_mov_b32_e32 v7, s11
; VI-MOVREL-NEXT:    v_mov_b32_e32 v8, s12
; VI-MOVREL-NEXT:    v_mov_b32_e32 v9, s13
; VI-MOVREL-NEXT:    v_mov_b32_e32 v10, s14
; VI-MOVREL-NEXT:    v_mov_b32_e32 v11, s15
; VI-MOVREL-NEXT:    v_mov_b32_e32 v12, s16
; VI-MOVREL-NEXT:    v_mov_b32_e32 v13, s17
; VI-MOVREL-NEXT:    v_mov_b32_e32 v14, s18
; VI-MOVREL-NEXT:    v_mov_b32_e32 v15, s19
; VI-MOVREL-NEXT:    v_mov_b32_e32 v16, s2
; VI-MOVREL-NEXT:    s_add_u32 s2, s0, 32
; VI-MOVREL-NEXT:    v_movreld_b32_e32 v0, 5
; VI-MOVREL-NEXT:    s_addc_u32 s3, s1, 0
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
; VI-MOVREL-NEXT:    s_nop 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v13, s3
; VI-MOVREL-NEXT:    v_mov_b32_e32 v12, s2
; VI-MOVREL-NEXT:    s_add_u32 s2, s0, 16
; VI-MOVREL-NEXT:    s_addc_u32 s3, s1, 0
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
; VI-MOVREL-NEXT:    s_nop 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v9, s3
; VI-MOVREL-NEXT:    v_mov_b32_e32 v8, s2
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; VI-MOVREL-NEXT:    s_nop 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, s1
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, s0
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-MOVREL-NEXT:    s_endpgm
;
; VI-IDXMODE-LABEL: insert_neg_offset_sgpr_loadreg:
; VI-IDXMODE:       ; %bb.0: ; %entry
; VI-IDXMODE-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x64
; VI-IDXMODE-NEXT:    s_load_dword s20, s[2:3], 0xa4
; VI-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; VI-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, s4
; VI-IDXMODE-NEXT:    s_addk_i32 s20, 0xfe00
; VI-IDXMODE-NEXT:    s_add_u32 s2, s0, 48
; VI-IDXMODE-NEXT:    s_addc_u32 s3, s1, 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v17, s3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, s5
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v2, s6
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v3, s7
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, s8
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, s9
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v6, s10
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v7, s11
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v8, s12
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v9, s13
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v10, s14
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v11, s15
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v12, s16
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v13, s17
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v14, s18
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v15, s19
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v16, s2
; VI-IDXMODE-NEXT:    s_add_u32 s2, s0, 32
; VI-IDXMODE-NEXT:    s_set_gpr_idx_on s20, gpr_idx(DST)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, 5
; VI-IDXMODE-NEXT:    s_set_gpr_idx_off
; VI-IDXMODE-NEXT:    s_addc_u32 s3, s1, 0
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
; VI-IDXMODE-NEXT:    s_nop 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v13, s3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v12, s2
; VI-IDXMODE-NEXT:    s_add_u32 s2, s0, 16
; VI-IDXMODE-NEXT:    s_addc_u32 s3, s1, 0
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
; VI-IDXMODE-NEXT:    s_nop 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v9, s3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v8, s2
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; VI-IDXMODE-NEXT:    s_nop 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, s1
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, s0
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-IDXMODE-NEXT:    s_endpgm
;
; GFX9-IDXMODE-LABEL: insert_neg_offset_sgpr_loadreg:
; GFX9-IDXMODE:       ; %bb.0: ; %entry
; GFX9-IDXMODE-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x64
; GFX9-IDXMODE-NEXT:    s_load_dword s20, s[2:3], 0xa4
; GFX9-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v16, 0
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v2, s6
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v3, s7
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v4, s8
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v5, s9
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v6, s10
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v7, s11
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v8, s12
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v9, s13
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v10, s14
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v11, s15
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v12, s16
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v13, s17
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v14, s18
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v15, s19
; GFX9-IDXMODE-NEXT:    s_addk_i32 s20, 0xfe00
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_on s20, gpr_idx(DST)
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, 5
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_off
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
; GFX9-IDXMODE-NEXT:    s_endpgm
entry:
  ; The dynamic index is the %offset kernel argument minus 512. In the expected
  ; output above this constant shows up as an add of 0xfe00 / 0xfffffe00.
  %index = add i32 %offset, -512
  ; Replace the element of the %vec argument selected by %index with the
  ; constant 5.
  %value = insertelement <16 x i32> %vec, i32 5, i32 %index
  ; Write the whole updated 16-element vector to %out. The %in argument is not
  ; referenced anywhere in this body.
  store <16 x i32> %value, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GENERIC-LABEL: insert_neg_offset_vgpr:
; GENERIC:       ; %bb.0: ; %entry
; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
; GENERIC-NEXT:    s_mov_b32 s2, -1
; GENERIC-NEXT:    v_add_i32_e32 v12, vcc, 0xfffffe00, v0
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v12
; GENERIC-NEXT:    v_cndmask_b32_e64 v3, 4, 33, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v12
; GENERIC-NEXT:    v_cndmask_b32_e64 v2, 3, 33, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
; GENERIC-NEXT:    v_cndmask_b32_e64 v1, 2, 33, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v12
; GENERIC-NEXT:    v_cndmask_b32_e64 v0, 1, 33, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v12
; GENERIC-NEXT:    v_cndmask_b32_e64 v7, 8, 33, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v12
; GENERIC-NEXT:    v_cndmask_b32_e64 v6, 7, 33, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v12
; GENERIC-NEXT:    v_cndmask_b32_e64 v5, 6, 33, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v12
; GENERIC-NEXT:    v_cndmask_b32_e64 v4, 5, 33, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 11, v12
; GENERIC-NEXT:    v_cndmask_b32_e64 v11, 12, 33, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 10, v12
; GENERIC-NEXT:    v_cndmask_b32_e64 v10, 11, 33, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v12
; GENERIC-NEXT:    v_cndmask_b32_e64 v9, 10, 33, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v12
; GENERIC-NEXT:    v_cndmask_b32_e64 v8, 9, 33, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 15, v12
; GENERIC-NEXT:    v_cndmask_b32_e64 v15, 16, 33, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 14, v12
; GENERIC-NEXT:    v_cndmask_b32_e64 v14, 15, 33, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 13, v12
; GENERIC-NEXT:    v_cndmask_b32_e64 v13, 14, 33, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 12, v12
; GENERIC-NEXT:    v_cndmask_b32_e64 v12, 13, 33, vcc
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; GENERIC-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GENERIC-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GENERIC-NEXT:    s_endpgm
;
; NOOPT-LABEL: insert_neg_offset_vgpr:
; NOOPT:       ; %bb.0: ; %entry
; NOOPT-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; NOOPT-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; NOOPT-NEXT:    s_mov_b32 s22, -1
; NOOPT-NEXT:    s_mov_b32 s23, 0xe8f000
; NOOPT-NEXT:    s_add_u32 s20, s20, s9
; NOOPT-NEXT:    s_addc_u32 s21, s21, 0
; NOOPT-NEXT:    buffer_store_dword v0, off, s[20:23], 0 offset:136 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
; NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; NOOPT-NEXT:    s_mov_b32 s6, s1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s4, 0xf000
; NOOPT-NEXT:    s_mov_b32 s5, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s6
; NOOPT-NEXT:    s_mov_b32 s2, s5
; NOOPT-NEXT:    s_mov_b32 s3, s4
; NOOPT-NEXT:    ; implicit-def: $vgpr31 : SGPR spill to VGPR lane
; NOOPT-NEXT:    v_writelane_b32 v31, s0, 0
; NOOPT-NEXT:    v_writelane_b32 v31, s1, 1
; NOOPT-NEXT:    v_writelane_b32 v31, s2, 2
; NOOPT-NEXT:    v_writelane_b32 v31, s3, 3
; NOOPT-NEXT:    s_mov_b32 s0, 16
; NOOPT-NEXT:    s_mov_b32 s1, 15
; NOOPT-NEXT:    s_mov_b32 s2, 14
; NOOPT-NEXT:    s_mov_b32 s3, 13
; NOOPT-NEXT:    s_mov_b32 s4, 12
; NOOPT-NEXT:    s_mov_b32 s5, 11
; NOOPT-NEXT:    s_mov_b32 s6, 10
; NOOPT-NEXT:    s_mov_b32 s7, 9
; NOOPT-NEXT:    s_mov_b32 s8, 8
; NOOPT-NEXT:    s_mov_b32 s9, 7
; NOOPT-NEXT:    s_mov_b32 s10, 6
; NOOPT-NEXT:    s_mov_b32 s11, 5
; NOOPT-NEXT:    s_mov_b32 s12, 4
; NOOPT-NEXT:    s_mov_b32 s13, 3
; NOOPT-NEXT:    s_mov_b32 s14, 2
; NOOPT-NEXT:    s_mov_b32 s15, 1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, s15
; NOOPT-NEXT:    v_mov_b32_e32 v30, s14
; NOOPT-NEXT:    v_mov_b32_e32 v29, s13
; NOOPT-NEXT:    v_mov_b32_e32 v28, s12
; NOOPT-NEXT:    v_mov_b32_e32 v27, s11
; NOOPT-NEXT:    v_mov_b32_e32 v26, s10
; NOOPT-NEXT:    v_mov_b32_e32 v25, s9
; NOOPT-NEXT:    v_mov_b32_e32 v24, s8
; NOOPT-NEXT:    v_mov_b32_e32 v23, s7
; NOOPT-NEXT:    v_mov_b32_e32 v22, s6
; NOOPT-NEXT:    v_mov_b32_e32 v21, s5
; NOOPT-NEXT:    v_mov_b32_e32 v20, s4
; NOOPT-NEXT:    v_mov_b32_e32 v19, s3
; NOOPT-NEXT:    v_mov_b32_e32 v18, s2
; NOOPT-NEXT:    v_mov_b32_e32 v17, s1
; NOOPT-NEXT:    v_mov_b32_e32 v16, s0
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v30
; NOOPT-NEXT:    v_mov_b32_e32 v2, v29
; NOOPT-NEXT:    v_mov_b32_e32 v3, v28
; NOOPT-NEXT:    v_mov_b32_e32 v4, v27
; NOOPT-NEXT:    v_mov_b32_e32 v5, v26
; NOOPT-NEXT:    v_mov_b32_e32 v6, v25
; NOOPT-NEXT:    v_mov_b32_e32 v7, v24
; NOOPT-NEXT:    v_mov_b32_e32 v8, v23
; NOOPT-NEXT:    v_mov_b32_e32 v9, v22
; NOOPT-NEXT:    v_mov_b32_e32 v10, v21
; NOOPT-NEXT:    v_mov_b32_e32 v11, v20
; NOOPT-NEXT:    v_mov_b32_e32 v12, v19
; NOOPT-NEXT:    v_mov_b32_e32 v13, v18
; NOOPT-NEXT:    v_mov_b32_e32 v14, v17
; NOOPT-NEXT:    v_mov_b32_e32 v15, v16
; NOOPT-NEXT:    buffer_store_dword v0, off, s[20:23], 0 offset:72 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v1, off, s[20:23], 0 offset:76 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v2, off, s[20:23], 0 offset:80 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v3, off, s[20:23], 0 offset:84 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v4, off, s[20:23], 0 offset:88 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v5, off, s[20:23], 0 offset:92 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v6, off, s[20:23], 0 offset:96 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v7, off, s[20:23], 0 offset:100 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v8, off, s[20:23], 0 offset:104 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v9, off, s[20:23], 0 offset:108 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v10, off, s[20:23], 0 offset:112 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v11, off, s[20:23], 0 offset:116 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v12, off, s[20:23], 0 offset:120 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v13, off, s[20:23], 0 offset:124 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v14, off, s[20:23], 0 offset:128 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v15, off, s[20:23], 0 offset:132 ; 4-byte Folded Spill
; NOOPT-NEXT:    v_mov_b32_e32 v16, 33
; NOOPT-NEXT:    buffer_store_dword v16, off, s[20:23], 0 offset:68 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 s[0:1], exec
; NOOPT-NEXT:    v_writelane_b32 v31, s0, 4
; NOOPT-NEXT:    v_writelane_b32 v31, s1, 5
; NOOPT-NEXT:    s_or_saveexec_b64 s[16:17], -1
; NOOPT-NEXT:    buffer_store_dword v31, off, s[20:23], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[16:17]
; NOOPT-NEXT:    buffer_store_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v3, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v4, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v5, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v6, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v7, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v8, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v9, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v10, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v11, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v12, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v13, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v14, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
; NOOPT-NEXT:    s_or_saveexec_b64 s[16:17], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[16:17]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v31, 6
; NOOPT-NEXT:    v_readlane_b32 s1, v31, 7
; NOOPT-NEXT:    buffer_load_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v3, off, s[20:23], 0 offset:16 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v4, off, s[20:23], 0 offset:20 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v5, off, s[20:23], 0 offset:24 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v6, off, s[20:23], 0 offset:28 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v7, off, s[20:23], 0 offset:32 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v8, off, s[20:23], 0 offset:36 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(6)
; NOOPT-NEXT:    buffer_load_dword v9, off, s[20:23], 0 offset:40 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(5)
; NOOPT-NEXT:    buffer_load_dword v10, off, s[20:23], 0 offset:44 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(4)
; NOOPT-NEXT:    buffer_load_dword v11, off, s[20:23], 0 offset:48 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(3)
; NOOPT-NEXT:    buffer_load_dword v12, off, s[20:23], 0 offset:52 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(2)
; NOOPT-NEXT:    buffer_load_dword v13, off, s[20:23], 0 offset:56 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(1)
; NOOPT-NEXT:    buffer_load_dword v14, off, s[20:23], 0 offset:60 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v16, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v17, off, s[20:23], 0 offset:136 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readfirstlane_b32 s2, v17
; NOOPT-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v17
; NOOPT-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
; NOOPT-NEXT:    s_add_i32 m0, s2, 0xfffffe00
; NOOPT-NEXT:    v_movreld_b32_e32 v0, v16
; NOOPT-NEXT:    buffer_store_dword v0, off, s[20:23], 0 offset:140 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v1, off, s[20:23], 0 offset:144 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v2, off, s[20:23], 0 offset:148 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v3, off, s[20:23], 0 offset:152 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v4, off, s[20:23], 0 offset:156 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v5, off, s[20:23], 0 offset:160 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v6, off, s[20:23], 0 offset:164 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v7, off, s[20:23], 0 offset:168 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v8, off, s[20:23], 0 offset:172 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v9, off, s[20:23], 0 offset:176 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v10, off, s[20:23], 0 offset:180 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v11, off, s[20:23], 0 offset:184 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v12, off, s[20:23], 0 offset:188 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v13, off, s[20:23], 0 offset:192 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v14, off, s[20:23], 0 offset:196 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v15, off, s[20:23], 0 offset:200 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v3, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v4, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v5, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v6, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v7, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v8, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v9, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v10, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v11, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v12, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v13, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v14, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 s[2:3], s[0:1]
; NOOPT-NEXT:    v_writelane_b32 v31, s2, 6
; NOOPT-NEXT:    v_writelane_b32 v31, s3, 7
; NOOPT-NEXT:    s_or_saveexec_b64 s[16:17], -1
; NOOPT-NEXT:    buffer_store_dword v31, off, s[20:23], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[16:17]
; NOOPT-NEXT:    s_xor_b64 exec, exec, s[0:1]
; NOOPT-NEXT:    s_cbranch_execnz .LBB14_1
; NOOPT-NEXT:  ; %bb.2:
; NOOPT-NEXT:    s_or_saveexec_b64 s[16:17], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[16:17]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v31, 4
; NOOPT-NEXT:    v_readlane_b32 s1, v31, 5
; NOOPT-NEXT:    s_mov_b64 exec, s[0:1]
; NOOPT-NEXT:  ; %bb.3:
; NOOPT-NEXT:    s_or_saveexec_b64 s[16:17], -1
; NOOPT-NEXT:    buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[16:17]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v31, 0
; NOOPT-NEXT:    v_readlane_b32 s1, v31, 1
; NOOPT-NEXT:    v_readlane_b32 s2, v31, 2
; NOOPT-NEXT:    v_readlane_b32 s3, v31, 3
; NOOPT-NEXT:    buffer_load_dword v15, off, s[20:23], 0 offset:140 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v16, off, s[20:23], 0 offset:144 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v17, off, s[20:23], 0 offset:148 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v18, off, s[20:23], 0 offset:152 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v19, off, s[20:23], 0 offset:156 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v20, off, s[20:23], 0 offset:160 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v21, off, s[20:23], 0 offset:164 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v22, off, s[20:23], 0 offset:168 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v23, off, s[20:23], 0 offset:172 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v24, off, s[20:23], 0 offset:176 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v25, off, s[20:23], 0 offset:180 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v26, off, s[20:23], 0 offset:184 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v27, off, s[20:23], 0 offset:188 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v28, off, s[20:23], 0 offset:192 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v29, off, s[20:23], 0 offset:196 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v30, off, s[20:23], 0 offset:200 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt vmcnt(12)
; NOOPT-NEXT:    v_mov_b32_e32 v4, v18
; NOOPT-NEXT:    v_mov_b32_e32 v5, v17
; NOOPT-NEXT:    v_mov_b32_e32 v6, v16
; NOOPT-NEXT:    v_mov_b32_e32 v0, v15
; NOOPT-NEXT:    s_waitcnt vmcnt(8)
; NOOPT-NEXT:    v_mov_b32_e32 v1, v22
; NOOPT-NEXT:    v_mov_b32_e32 v2, v21
; NOOPT-NEXT:    v_mov_b32_e32 v3, v20
; NOOPT-NEXT:    v_mov_b32_e32 v7, v19
; NOOPT-NEXT:    s_waitcnt vmcnt(4)
; NOOPT-NEXT:    v_mov_b32_e32 v12, v26
; NOOPT-NEXT:    v_mov_b32_e32 v13, v25
; NOOPT-NEXT:    v_mov_b32_e32 v14, v24
; NOOPT-NEXT:    v_mov_b32_e32 v8, v23
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v9, v30
; NOOPT-NEXT:    v_mov_b32_e32 v10, v29
; NOOPT-NEXT:    v_mov_b32_e32 v11, v28
; NOOPT-NEXT:    v_mov_b32_e32 v15, v27
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16_vgpr17_vgpr18 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v16, v11
; NOOPT-NEXT:    v_mov_b32_e32 v17, v10
; NOOPT-NEXT:    v_mov_b32_e32 v18, v9
; NOOPT-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v9, v14
; NOOPT-NEXT:    v_mov_b32_e32 v10, v13
; NOOPT-NEXT:    v_mov_b32_e32 v11, v12
; NOOPT-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10 killed $exec
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v8, v3
; NOOPT-NEXT:    v_mov_b32_e32 v9, v2
; NOOPT-NEXT:    v_mov_b32_e32 v10, v1
; NOOPT-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:16
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: insert_neg_offset_vgpr:
; SI-MOVREL:       ; %bb.0: ; %entry
; SI-MOVREL-NEXT:    v_add_i32_e32 v12, vcc, 0xfffffe00, v0
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v3, 4, 33, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v2, 3, 33, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v1, 2, 33, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v0, 1, 33, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v7, 8, 33, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v6, 7, 33, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v5, 6, 33, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v4, 5, 33, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 11, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v11, 12, 33, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 10, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v10, 11, 33, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v9, 10, 33, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v12
; SI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v8, 9, 33, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 15, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v15, 16, 33, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 14, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v14, 15, 33, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 13, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v13, 14, 33, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 12, v12
; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v12, 13, 33, vcc
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-MOVREL-NEXT:    s_endpgm
;
; VI-LABEL: insert_neg_offset_vgpr:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    v_add_u32_e32 v12, vcc, 0xfffffe00, v0
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v12
; VI-NEXT:    v_cndmask_b32_e64 v3, 4, 33, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v12
; VI-NEXT:    v_cndmask_b32_e64 v2, 3, 33, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
; VI-NEXT:    v_cndmask_b32_e64 v1, 2, 33, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v12
; VI-NEXT:    v_cndmask_b32_e64 v0, 1, 33, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v12
; VI-NEXT:    v_cndmask_b32_e64 v7, 8, 33, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v12
; VI-NEXT:    v_cndmask_b32_e64 v6, 7, 33, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v12
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; VI-NEXT:    v_cndmask_b32_e64 v5, 6, 33, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v12
; VI-NEXT:    v_cndmask_b32_e64 v4, 5, 33, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 11, v12
; VI-NEXT:    v_cndmask_b32_e64 v11, 12, 33, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 10, v12
; VI-NEXT:    v_cndmask_b32_e64 v10, 11, 33, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v12
; VI-NEXT:    v_cndmask_b32_e64 v9, 10, 33, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v12
; VI-NEXT:    v_cndmask_b32_e64 v8, 9, 33, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 15, v12
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_add_u32 s2, s0, 48
; VI-NEXT:    v_cndmask_b32_e64 v15, 16, 33, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 14, v12
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    v_cndmask_b32_e64 v14, 15, 33, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 13, v12
; VI-NEXT:    v_mov_b32_e32 v17, s3
; VI-NEXT:    v_cndmask_b32_e64 v13, 14, 33, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 12, v12
; VI-NEXT:    v_mov_b32_e32 v16, s2
; VI-NEXT:    s_add_u32 s2, s0, 32
; VI-NEXT:    v_cndmask_b32_e64 v12, 13, 33, vcc
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
; VI-NEXT:    s_nop 0
; VI-NEXT:    v_mov_b32_e32 v13, s3
; VI-NEXT:    v_mov_b32_e32 v12, s2
; VI-NEXT:    s_add_u32 s2, s0, 16
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
; VI-NEXT:    s_nop 0
; VI-NEXT:    v_mov_b32_e32 v9, s3
; VI-NEXT:    v_mov_b32_e32 v8, s2
; VI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; VI-NEXT:    s_nop 0
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX9-IDXMODE-LABEL: insert_neg_offset_vgpr:
; GFX9-IDXMODE:       ; %bb.0: ; %entry
; GFX9-IDXMODE-NEXT:    v_add_u32_e32 v12, 0xfffffe00, v0
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v3, 4, 33, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v2, 3, 33, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v1, 2, 33, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v0, 1, 33, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v7, 8, 33, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v6, 7, 33, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v5, 6, 33, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v4, 5, 33, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 11, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v11, 12, 33, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 10, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v10, 11, 33, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v9, 10, 33, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v12
; GFX9-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v8, 9, 33, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 15, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v15, 16, 33, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 14, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v14, 15, 33, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 13, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v13, 14, 33, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 12, v12
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v16, 0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v12, 13, 33, vcc
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
; GFX9-IDXMODE-NEXT:    s_endpgm
entry:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %index = add i32 %id, -512
  %value = insertelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 33, i32 %index
  store <16 x i32> %value, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GENERIC-LABEL: insert_neg_inline_offset_vgpr:
; GENERIC:       ; %bb.0: ; %entry
; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
; GENERIC-NEXT:    s_mov_b32 s2, -1
; GENERIC-NEXT:    v_add_i32_e32 v12, vcc, -16, v0
; GENERIC-NEXT:    v_mov_b32_e32 v16, 0x1f4
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v12
; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 4, v16, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v12
; GENERIC-NEXT:    v_cndmask_b32_e32 v2, 3, v16, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 2, v16, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v12
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, 1, v16, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v12
; GENERIC-NEXT:    v_cndmask_b32_e32 v7, 8, v16, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v12
; GENERIC-NEXT:    v_cndmask_b32_e32 v6, 7, v16, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v12
; GENERIC-NEXT:    v_cndmask_b32_e32 v5, 6, v16, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v12
; GENERIC-NEXT:    v_cndmask_b32_e32 v4, 5, v16, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 11, v12
; GENERIC-NEXT:    v_cndmask_b32_e32 v11, 12, v16, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 10, v12
; GENERIC-NEXT:    v_cndmask_b32_e32 v10, 11, v16, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v12
; GENERIC-NEXT:    v_cndmask_b32_e32 v9, 10, v16, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v12
; GENERIC-NEXT:    v_cndmask_b32_e32 v8, 9, v16, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 15, v12
; GENERIC-NEXT:    v_cndmask_b32_e32 v15, 16, v16, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 14, v12
; GENERIC-NEXT:    v_cndmask_b32_e32 v14, 15, v16, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 13, v12
; GENERIC-NEXT:    v_cndmask_b32_e32 v13, 14, v16, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 12, v12
; GENERIC-NEXT:    v_cndmask_b32_e32 v12, 13, v16, vcc
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; GENERIC-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GENERIC-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GENERIC-NEXT:    s_endpgm
;
; NOOPT-LABEL: insert_neg_inline_offset_vgpr:
; NOOPT:       ; %bb.0: ; %entry
; NOOPT-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; NOOPT-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; NOOPT-NEXT:    s_mov_b32 s22, -1
; NOOPT-NEXT:    s_mov_b32 s23, 0xe8f000
; NOOPT-NEXT:    s_add_u32 s20, s20, s9
; NOOPT-NEXT:    s_addc_u32 s21, s21, 0
; NOOPT-NEXT:    buffer_store_dword v0, off, s[20:23], 0 offset:136 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
; NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; NOOPT-NEXT:    s_mov_b32 s6, s1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s4, 0xf000
; NOOPT-NEXT:    s_mov_b32 s5, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s6
; NOOPT-NEXT:    s_mov_b32 s2, s5
; NOOPT-NEXT:    s_mov_b32 s3, s4
; NOOPT-NEXT:    ; implicit-def: $vgpr31 : SGPR spill to VGPR lane
; NOOPT-NEXT:    v_writelane_b32 v31, s0, 0
; NOOPT-NEXT:    v_writelane_b32 v31, s1, 1
; NOOPT-NEXT:    v_writelane_b32 v31, s2, 2
; NOOPT-NEXT:    v_writelane_b32 v31, s3, 3
; NOOPT-NEXT:    s_mov_b32 s0, 16
; NOOPT-NEXT:    s_mov_b32 s1, 15
; NOOPT-NEXT:    s_mov_b32 s2, 14
; NOOPT-NEXT:    s_mov_b32 s3, 13
; NOOPT-NEXT:    s_mov_b32 s4, 12
; NOOPT-NEXT:    s_mov_b32 s5, 11
; NOOPT-NEXT:    s_mov_b32 s6, 10
; NOOPT-NEXT:    s_mov_b32 s7, 9
; NOOPT-NEXT:    s_mov_b32 s8, 8
; NOOPT-NEXT:    s_mov_b32 s9, 7
; NOOPT-NEXT:    s_mov_b32 s10, 6
; NOOPT-NEXT:    s_mov_b32 s11, 5
; NOOPT-NEXT:    s_mov_b32 s12, 4
; NOOPT-NEXT:    s_mov_b32 s13, 3
; NOOPT-NEXT:    s_mov_b32 s14, 2
; NOOPT-NEXT:    s_mov_b32 s15, 1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, s15
; NOOPT-NEXT:    v_mov_b32_e32 v30, s14
; NOOPT-NEXT:    v_mov_b32_e32 v29, s13
; NOOPT-NEXT:    v_mov_b32_e32 v28, s12
; NOOPT-NEXT:    v_mov_b32_e32 v27, s11
; NOOPT-NEXT:    v_mov_b32_e32 v26, s10
; NOOPT-NEXT:    v_mov_b32_e32 v25, s9
; NOOPT-NEXT:    v_mov_b32_e32 v24, s8
; NOOPT-NEXT:    v_mov_b32_e32 v23, s7
; NOOPT-NEXT:    v_mov_b32_e32 v22, s6
; NOOPT-NEXT:    v_mov_b32_e32 v21, s5
; NOOPT-NEXT:    v_mov_b32_e32 v20, s4
; NOOPT-NEXT:    v_mov_b32_e32 v19, s3
; NOOPT-NEXT:    v_mov_b32_e32 v18, s2
; NOOPT-NEXT:    v_mov_b32_e32 v17, s1
; NOOPT-NEXT:    v_mov_b32_e32 v16, s0
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v30
; NOOPT-NEXT:    v_mov_b32_e32 v2, v29
; NOOPT-NEXT:    v_mov_b32_e32 v3, v28
; NOOPT-NEXT:    v_mov_b32_e32 v4, v27
; NOOPT-NEXT:    v_mov_b32_e32 v5, v26
; NOOPT-NEXT:    v_mov_b32_e32 v6, v25
; NOOPT-NEXT:    v_mov_b32_e32 v7, v24
; NOOPT-NEXT:    v_mov_b32_e32 v8, v23
; NOOPT-NEXT:    v_mov_b32_e32 v9, v22
; NOOPT-NEXT:    v_mov_b32_e32 v10, v21
; NOOPT-NEXT:    v_mov_b32_e32 v11, v20
; NOOPT-NEXT:    v_mov_b32_e32 v12, v19
; NOOPT-NEXT:    v_mov_b32_e32 v13, v18
; NOOPT-NEXT:    v_mov_b32_e32 v14, v17
; NOOPT-NEXT:    v_mov_b32_e32 v15, v16
; NOOPT-NEXT:    buffer_store_dword v0, off, s[20:23], 0 offset:72 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v1, off, s[20:23], 0 offset:76 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v2, off, s[20:23], 0 offset:80 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v3, off, s[20:23], 0 offset:84 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v4, off, s[20:23], 0 offset:88 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v5, off, s[20:23], 0 offset:92 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v6, off, s[20:23], 0 offset:96 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v7, off, s[20:23], 0 offset:100 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v8, off, s[20:23], 0 offset:104 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v9, off, s[20:23], 0 offset:108 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v10, off, s[20:23], 0 offset:112 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v11, off, s[20:23], 0 offset:116 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v12, off, s[20:23], 0 offset:120 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v13, off, s[20:23], 0 offset:124 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v14, off, s[20:23], 0 offset:128 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v15, off, s[20:23], 0 offset:132 ; 4-byte Folded Spill
; NOOPT-NEXT:    v_mov_b32_e32 v16, 0x1f4
; NOOPT-NEXT:    buffer_store_dword v16, off, s[20:23], 0 offset:68 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 s[0:1], exec
; NOOPT-NEXT:    v_writelane_b32 v31, s0, 4
; NOOPT-NEXT:    v_writelane_b32 v31, s1, 5
; NOOPT-NEXT:    s_or_saveexec_b64 s[16:17], -1
; NOOPT-NEXT:    buffer_store_dword v31, off, s[20:23], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[16:17]
; NOOPT-NEXT:    buffer_store_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v3, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v4, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v5, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v6, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v7, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v8, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v9, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v10, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v11, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v12, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v13, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v14, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
; NOOPT-NEXT:    s_or_saveexec_b64 s[16:17], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[16:17]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v31, 6
; NOOPT-NEXT:    v_readlane_b32 s1, v31, 7
; NOOPT-NEXT:    buffer_load_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v3, off, s[20:23], 0 offset:16 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v4, off, s[20:23], 0 offset:20 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v5, off, s[20:23], 0 offset:24 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v6, off, s[20:23], 0 offset:28 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v7, off, s[20:23], 0 offset:32 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v8, off, s[20:23], 0 offset:36 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(6)
; NOOPT-NEXT:    buffer_load_dword v9, off, s[20:23], 0 offset:40 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(5)
; NOOPT-NEXT:    buffer_load_dword v10, off, s[20:23], 0 offset:44 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(4)
; NOOPT-NEXT:    buffer_load_dword v11, off, s[20:23], 0 offset:48 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(3)
; NOOPT-NEXT:    buffer_load_dword v12, off, s[20:23], 0 offset:52 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(2)
; NOOPT-NEXT:    buffer_load_dword v13, off, s[20:23], 0 offset:56 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(1)
; NOOPT-NEXT:    buffer_load_dword v14, off, s[20:23], 0 offset:60 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v16, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v17, off, s[20:23], 0 offset:136 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readfirstlane_b32 s2, v17
; NOOPT-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v17
; NOOPT-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
; NOOPT-NEXT:    s_add_i32 m0, s2, -16
; NOOPT-NEXT:    v_movreld_b32_e32 v0, v16
; NOOPT-NEXT:    buffer_store_dword v0, off, s[20:23], 0 offset:140 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v1, off, s[20:23], 0 offset:144 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v2, off, s[20:23], 0 offset:148 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v3, off, s[20:23], 0 offset:152 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v4, off, s[20:23], 0 offset:156 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v5, off, s[20:23], 0 offset:160 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v6, off, s[20:23], 0 offset:164 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v7, off, s[20:23], 0 offset:168 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v8, off, s[20:23], 0 offset:172 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v9, off, s[20:23], 0 offset:176 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v10, off, s[20:23], 0 offset:180 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v11, off, s[20:23], 0 offset:184 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v12, off, s[20:23], 0 offset:188 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v13, off, s[20:23], 0 offset:192 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v14, off, s[20:23], 0 offset:196 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v15, off, s[20:23], 0 offset:200 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v3, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v4, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v5, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v6, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v7, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v8, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v9, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v10, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v11, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v12, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v13, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v14, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 s[2:3], s[0:1]
; NOOPT-NEXT:    v_writelane_b32 v31, s2, 6
; NOOPT-NEXT:    v_writelane_b32 v31, s3, 7
; NOOPT-NEXT:    s_or_saveexec_b64 s[16:17], -1
; NOOPT-NEXT:    buffer_store_dword v31, off, s[20:23], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[16:17]
; NOOPT-NEXT:    s_xor_b64 exec, exec, s[0:1]
; NOOPT-NEXT:    s_cbranch_execnz .LBB15_1
; NOOPT-NEXT:  ; %bb.2:
; NOOPT-NEXT:    s_or_saveexec_b64 s[16:17], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[16:17]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v31, 4
; NOOPT-NEXT:    v_readlane_b32 s1, v31, 5
; NOOPT-NEXT:    s_mov_b64 exec, s[0:1]
; NOOPT-NEXT:  ; %bb.3:
; NOOPT-NEXT:    s_or_saveexec_b64 s[16:17], -1
; NOOPT-NEXT:    buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[16:17]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v31, 0
; NOOPT-NEXT:    v_readlane_b32 s1, v31, 1
; NOOPT-NEXT:    v_readlane_b32 s2, v31, 2
; NOOPT-NEXT:    v_readlane_b32 s3, v31, 3
; NOOPT-NEXT:    buffer_load_dword v15, off, s[20:23], 0 offset:140 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v16, off, s[20:23], 0 offset:144 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v17, off, s[20:23], 0 offset:148 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v18, off, s[20:23], 0 offset:152 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v19, off, s[20:23], 0 offset:156 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v20, off, s[20:23], 0 offset:160 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v21, off, s[20:23], 0 offset:164 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v22, off, s[20:23], 0 offset:168 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v23, off, s[20:23], 0 offset:172 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v24, off, s[20:23], 0 offset:176 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v25, off, s[20:23], 0 offset:180 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v26, off, s[20:23], 0 offset:184 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v27, off, s[20:23], 0 offset:188 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v28, off, s[20:23], 0 offset:192 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v29, off, s[20:23], 0 offset:196 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v30, off, s[20:23], 0 offset:200 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt vmcnt(12)
; NOOPT-NEXT:    v_mov_b32_e32 v4, v18
; NOOPT-NEXT:    v_mov_b32_e32 v5, v17
; NOOPT-NEXT:    v_mov_b32_e32 v6, v16
; NOOPT-NEXT:    v_mov_b32_e32 v0, v15
; NOOPT-NEXT:    s_waitcnt vmcnt(8)
; NOOPT-NEXT:    v_mov_b32_e32 v1, v22
; NOOPT-NEXT:    v_mov_b32_e32 v2, v21
; NOOPT-NEXT:    v_mov_b32_e32 v3, v20
; NOOPT-NEXT:    v_mov_b32_e32 v7, v19
; NOOPT-NEXT:    s_waitcnt vmcnt(4)
; NOOPT-NEXT:    v_mov_b32_e32 v12, v26
; NOOPT-NEXT:    v_mov_b32_e32 v13, v25
; NOOPT-NEXT:    v_mov_b32_e32 v14, v24
; NOOPT-NEXT:    v_mov_b32_e32 v8, v23
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v9, v30
; NOOPT-NEXT:    v_mov_b32_e32 v10, v29
; NOOPT-NEXT:    v_mov_b32_e32 v11, v28
; NOOPT-NEXT:    v_mov_b32_e32 v15, v27
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16_vgpr17_vgpr18 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v16, v11
; NOOPT-NEXT:    v_mov_b32_e32 v17, v10
; NOOPT-NEXT:    v_mov_b32_e32 v18, v9
; NOOPT-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v9, v14
; NOOPT-NEXT:    v_mov_b32_e32 v10, v13
; NOOPT-NEXT:    v_mov_b32_e32 v11, v12
; NOOPT-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10 killed $exec
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v8, v3
; NOOPT-NEXT:    v_mov_b32_e32 v9, v2
; NOOPT-NEXT:    v_mov_b32_e32 v10, v1
; NOOPT-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:16
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: insert_neg_inline_offset_vgpr:
; SI-MOVREL:       ; %bb.0: ; %entry
; SI-MOVREL-NEXT:    v_add_i32_e32 v12, vcc, -16, v0
; SI-MOVREL-NEXT:    v_mov_b32_e32 v16, 0x1f4
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v3, 4, v16, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v2, 3, v16, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v1, 2, v16, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v0, 1, v16, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v7, 8, v16, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v6, 7, v16, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v5, 6, v16, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v4, 5, v16, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 11, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v11, 12, v16, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 10, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v10, 11, v16, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v9, 10, v16, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v12
; SI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v8, 9, v16, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 15, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v15, 16, v16, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 14, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v14, 15, v16, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 13, v12
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v13, 14, v16, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 12, v12
; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v12, 13, v16, vcc
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-MOVREL-NEXT:    s_endpgm
;
; VI-LABEL: insert_neg_inline_offset_vgpr:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    v_add_u32_e32 v12, vcc, -16, v0
; VI-NEXT:    v_mov_b32_e32 v16, 0x1f4
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v12
; VI-NEXT:    v_cndmask_b32_e32 v3, 4, v16, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v12
; VI-NEXT:    v_cndmask_b32_e32 v2, 3, v16, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
; VI-NEXT:    v_cndmask_b32_e32 v1, 2, v16, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v12
; VI-NEXT:    v_cndmask_b32_e32 v0, 1, v16, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v12
; VI-NEXT:    v_cndmask_b32_e32 v7, 8, v16, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v12
; VI-NEXT:    v_cndmask_b32_e32 v6, 7, v16, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v12
; VI-NEXT:    v_cndmask_b32_e32 v5, 6, v16, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v12
; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v16, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 11, v12
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; VI-NEXT:    v_cndmask_b32_e32 v11, 12, v16, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 10, v12
; VI-NEXT:    v_cndmask_b32_e32 v10, 11, v16, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v12
; VI-NEXT:    v_cndmask_b32_e32 v9, 10, v16, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v12
; VI-NEXT:    v_cndmask_b32_e32 v8, 9, v16, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 15, v12
; VI-NEXT:    v_cndmask_b32_e32 v15, 16, v16, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 14, v12
; VI-NEXT:    v_cndmask_b32_e32 v14, 15, v16, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 13, v12
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_add_u32 s2, s0, 48
; VI-NEXT:    v_cndmask_b32_e32 v13, 14, v16, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 12, v12
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    v_cndmask_b32_e32 v12, 13, v16, vcc
; VI-NEXT:    v_mov_b32_e32 v17, s3
; VI-NEXT:    v_mov_b32_e32 v16, s2
; VI-NEXT:    s_add_u32 s2, s0, 32
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
; VI-NEXT:    s_nop 0
; VI-NEXT:    v_mov_b32_e32 v13, s3
; VI-NEXT:    v_mov_b32_e32 v12, s2
; VI-NEXT:    s_add_u32 s2, s0, 16
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
; VI-NEXT:    s_nop 0
; VI-NEXT:    v_mov_b32_e32 v9, s3
; VI-NEXT:    v_mov_b32_e32 v8, s2
; VI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; VI-NEXT:    s_nop 0
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX9-IDXMODE-LABEL: insert_neg_inline_offset_vgpr:
; GFX9-IDXMODE:       ; %bb.0: ; %entry
; GFX9-IDXMODE-NEXT:    v_add_u32_e32 v12, -16, v0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v17, 0x1f4
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v3, 4, v17, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v2, 3, v17, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v1, 2, v17, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v0, 1, v17, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v7, 8, v17, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v6, 7, v17, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v5, 6, v17, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v4, 5, v17, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 11, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v11, 12, v17, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 10, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v10, 11, v17, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v9, 10, v17, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v12
; GFX9-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v8, 9, v17, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 15, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v15, 16, v17, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 14, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v14, 15, v17, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 13, v12
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v13, 14, v17, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 12, v12
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v16, 0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v12, 13, v17, vcc
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
; GFX9-IDXMODE-NEXT:    s_endpgm
entry:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %index = add i32 %id, -16
  %value = insertelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 500, i32 %index
  store <16 x i32> %value, ptr addrspace(1) %out
  ret void
}

; When the block is split to insert the loop, make sure any other
; places that need to be expanded in the same block are also handled.
define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) {
; GENERIC-LABEL: extract_vgpr_offset_multiple_in_block:
; GENERIC:       ; %bb.0: ; %entry
; GENERIC-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0xd
; GENERIC-NEXT:    s_mov_b32 s11, 0xf000
; GENERIC-NEXT:    s_mov_b32 s6, 0
; GENERIC-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GENERIC-NEXT:    v_mov_b32_e32 v2, 0
; GENERIC-NEXT:    s_mov_b32 s7, s11
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 glc
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x9
; GENERIC-NEXT:    s_mov_b32 s10, -1
; GENERIC-NEXT:    ;;#ASMSTART
; GENERIC-NEXT:    s_mov_b32 s4, 17
; GENERIC-NEXT:    ;;#ASMEND
; GENERIC-NEXT:    v_add_i32_e32 v2, vcc, 1, v1
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
; GENERIC-NEXT:    v_cndmask_b32_e64 v3, 7, 9, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 2, v1
; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 11, v3, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
; GENERIC-NEXT:    v_cndmask_b32_e64 v4, 7, 9, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 3, v1
; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 13, v3, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 2, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v4, 11, v4, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 4, v1
; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 5, v3, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 3, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v4, 13, v4, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 5, v1
; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 6, v3, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 4, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 6, v1
; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 7, v3, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 5, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v4, 6, v4, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 7, v1
; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 8, v3, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 6, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v4, 7, v4, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 8, v1
; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 9, v3, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 7, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v4, 8, v4, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 9, v1
; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 10, v3, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 8, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v4, 9, v4, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 10, v1
; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 11, v3, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 9, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v4, 10, v4, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 11, v1
; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 12, v3, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 10, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v4, 11, v4, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 12, v1
; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 13, v3, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 11, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v4, 12, v4, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 13, v1
; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 14, v3, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 12, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v4, 13, v4, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 14, v1
; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 15, v3, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 13, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v4, 14, v4, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 15, v1
; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 16, v3, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 14, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 15, v4, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 15, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v2, 16, v3, vcc
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    buffer_store_dword v1, off, s[8:11], 0
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    buffer_store_dword v2, off, s[8:11], 0
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GENERIC-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GENERIC-NEXT:    s_cbranch_execz .LBB16_2
; GENERIC-NEXT:  ; %bb.1: ; %bb1
; GENERIC-NEXT:    v_mov_b32_e32 v0, s4
; GENERIC-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:  .LBB16_2: ; %bb2
; GENERIC-NEXT:    s_endpgm
;
; NOOPT-LABEL: extract_vgpr_offset_multiple_in_block:
; NOOPT:       ; %bb.0: ; %entry
; NOOPT-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; NOOPT-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; NOOPT-NEXT:    s_mov_b32 s38, -1
; NOOPT-NEXT:    s_mov_b32 s39, 0xe8f000
; NOOPT-NEXT:    s_add_u32 s36, s36, s9
; NOOPT-NEXT:    s_addc_u32 s37, s37, 0
; NOOPT-NEXT:    s_mov_b64 s[0:1], s[2:3]
; NOOPT-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:76 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; NOOPT-NEXT:    s_mov_b32 s8, s3
; NOOPT-NEXT:    s_mov_b32 s4, s2
; NOOPT-NEXT:    s_mov_b32 s2, 0xf000
; NOOPT-NEXT:    s_mov_b32 s3, -1
; NOOPT-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; NOOPT-NEXT:    s_mov_b32 s5, s8
; NOOPT-NEXT:    s_mov_b32 s6, s3
; NOOPT-NEXT:    s_mov_b32 s7, s2
; NOOPT-NEXT:    ; implicit-def: $vgpr18 : SGPR spill to VGPR lane
; NOOPT-NEXT:    v_writelane_b32 v18, s4, 0
; NOOPT-NEXT:    v_writelane_b32 v18, s5, 1
; NOOPT-NEXT:    v_writelane_b32 v18, s6, 2
; NOOPT-NEXT:    v_writelane_b32 v18, s7, 3
; NOOPT-NEXT:    s_mov_b32 s4, 0
; NOOPT-NEXT:    v_writelane_b32 v18, s4, 4
; NOOPT-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; NOOPT-NEXT:    s_mov_b32 s5, s2
; NOOPT-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; NOOPT-NEXT:    s_mov_b32 s4, 2
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_lshlrev_b32_e64 v0, s4, v0
; NOOPT-NEXT:    s_mov_b32 s4, 0
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    v_mov_b32_e32 v2, 0
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v2
; NOOPT-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:72 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b32 s0, 1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_add_i32_e64 v0, s[0:1], v0, s0
; NOOPT-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:68 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b32 s16, 16
; NOOPT-NEXT:    s_mov_b32 s17, 15
; NOOPT-NEXT:    s_mov_b32 s18, 14
; NOOPT-NEXT:    s_mov_b32 s20, 12
; NOOPT-NEXT:    s_mov_b32 s22, 10
; NOOPT-NEXT:    s_mov_b32 s24, 8
; NOOPT-NEXT:    s_mov_b32 s26, 6
; NOOPT-NEXT:    s_mov_b32 s27, 5
; NOOPT-NEXT:    s_mov_b32 s19, 13
; NOOPT-NEXT:    s_mov_b32 s21, 11
; NOOPT-NEXT:    s_mov_b32 s23, 9
; NOOPT-NEXT:    s_mov_b32 s25, 7
; NOOPT-NEXT:    s_mov_b32 s0, s25
; NOOPT-NEXT:    s_mov_b32 s1, s23
; NOOPT-NEXT:    s_mov_b32 s2, s21
; NOOPT-NEXT:    s_mov_b32 s3, s19
; NOOPT-NEXT:    s_mov_b32 s4, s27
; NOOPT-NEXT:    s_mov_b32 s5, s26
; NOOPT-NEXT:    s_mov_b32 s6, s25
; NOOPT-NEXT:    s_mov_b32 s7, s24
; NOOPT-NEXT:    s_mov_b32 s8, s23
; NOOPT-NEXT:    s_mov_b32 s9, s22
; NOOPT-NEXT:    s_mov_b32 s10, s21
; NOOPT-NEXT:    s_mov_b32 s11, s20
; NOOPT-NEXT:    s_mov_b32 s12, s19
; NOOPT-NEXT:    s_mov_b32 s13, s18
; NOOPT-NEXT:    s_mov_b32 s14, s17
; NOOPT-NEXT:    s_mov_b32 s15, s16
; NOOPT-NEXT:    v_writelane_b32 v18, s0, 5
; NOOPT-NEXT:    v_writelane_b32 v18, s1, 6
; NOOPT-NEXT:    v_writelane_b32 v18, s2, 7
; NOOPT-NEXT:    v_writelane_b32 v18, s3, 8
; NOOPT-NEXT:    v_writelane_b32 v18, s4, 9
; NOOPT-NEXT:    v_writelane_b32 v18, s5, 10
; NOOPT-NEXT:    v_writelane_b32 v18, s6, 11
; NOOPT-NEXT:    v_writelane_b32 v18, s7, 12
; NOOPT-NEXT:    v_writelane_b32 v18, s8, 13
; NOOPT-NEXT:    v_writelane_b32 v18, s9, 14
; NOOPT-NEXT:    v_writelane_b32 v18, s10, 15
; NOOPT-NEXT:    v_writelane_b32 v18, s11, 16
; NOOPT-NEXT:    v_writelane_b32 v18, s12, 17
; NOOPT-NEXT:    v_writelane_b32 v18, s13, 18
; NOOPT-NEXT:    v_writelane_b32 v18, s14, 19
; NOOPT-NEXT:    v_writelane_b32 v18, s15, 20
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, s0
; NOOPT-NEXT:    v_mov_b32_e32 v1, s1
; NOOPT-NEXT:    v_mov_b32_e32 v2, s2
; NOOPT-NEXT:    v_mov_b32_e32 v3, s3
; NOOPT-NEXT:    v_mov_b32_e32 v4, s4
; NOOPT-NEXT:    v_mov_b32_e32 v5, s5
; NOOPT-NEXT:    v_mov_b32_e32 v6, s6
; NOOPT-NEXT:    v_mov_b32_e32 v7, s7
; NOOPT-NEXT:    v_mov_b32_e32 v8, s8
; NOOPT-NEXT:    v_mov_b32_e32 v9, s9
; NOOPT-NEXT:    v_mov_b32_e32 v10, s10
; NOOPT-NEXT:    v_mov_b32_e32 v11, s11
; NOOPT-NEXT:    v_mov_b32_e32 v12, s12
; NOOPT-NEXT:    v_mov_b32_e32 v13, s13
; NOOPT-NEXT:    v_mov_b32_e32 v14, s14
; NOOPT-NEXT:    v_mov_b32_e32 v15, s15
; NOOPT-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:4 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v1, off, s[36:39], 0 offset:8 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v2, off, s[36:39], 0 offset:12 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v3, off, s[36:39], 0 offset:16 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v4, off, s[36:39], 0 offset:20 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v5, off, s[36:39], 0 offset:24 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v6, off, s[36:39], 0 offset:28 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v7, off, s[36:39], 0 offset:32 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v8, off, s[36:39], 0 offset:36 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v9, off, s[36:39], 0 offset:40 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v10, off, s[36:39], 0 offset:44 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v11, off, s[36:39], 0 offset:48 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v12, off, s[36:39], 0 offset:52 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v13, off, s[36:39], 0 offset:56 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v14, off, s[36:39], 0 offset:60 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v15, off, s[36:39], 0 offset:64 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 s[0:1], exec
; NOOPT-NEXT:    v_writelane_b32 v18, s0, 21
; NOOPT-NEXT:    v_writelane_b32 v18, s1, 22
; NOOPT-NEXT:    s_or_saveexec_b64 s[28:29], -1
; NOOPT-NEXT:    buffer_store_dword v18, off, s[36:39], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[28:29]
; NOOPT-NEXT:    ; implicit-def: $vgpr0
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT:  .LBB16_1: ; =>This Inner Loop Header: Depth=1
; NOOPT-NEXT:    s_or_saveexec_b64 s[28:29], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[28:29]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v18, 23
; NOOPT-NEXT:    v_readlane_b32 s1, v18, 24
; NOOPT-NEXT:    buffer_load_dword v17, off, s[36:39], 0 offset:80 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:4 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:8 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v2, off, s[36:39], 0 offset:12 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v3, off, s[36:39], 0 offset:16 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v4, off, s[36:39], 0 offset:20 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v5, off, s[36:39], 0 offset:24 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v6, off, s[36:39], 0 offset:28 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v7, off, s[36:39], 0 offset:32 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v8, off, s[36:39], 0 offset:36 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v9, off, s[36:39], 0 offset:40 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v10, off, s[36:39], 0 offset:44 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v11, off, s[36:39], 0 offset:48 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v12, off, s[36:39], 0 offset:52 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v13, off, s[36:39], 0 offset:56 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v14, off, s[36:39], 0 offset:60 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v15, off, s[36:39], 0 offset:64 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v16, off, s[36:39], 0 offset:72 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readfirstlane_b32 s2, v16
; NOOPT-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v16
; NOOPT-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
; NOOPT-NEXT:    s_mov_b32 m0, s2
; NOOPT-NEXT:    v_movrels_b32_e32 v0, v0
; NOOPT-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:84 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:80 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 s[2:3], s[0:1]
; NOOPT-NEXT:    v_writelane_b32 v18, s2, 23
; NOOPT-NEXT:    v_writelane_b32 v18, s3, 24
; NOOPT-NEXT:    s_or_saveexec_b64 s[28:29], -1
; NOOPT-NEXT:    buffer_store_dword v18, off, s[36:39], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[28:29]
; NOOPT-NEXT:    s_xor_b64 exec, exec, s[0:1]
; NOOPT-NEXT:    s_cbranch_execnz .LBB16_1
; NOOPT-NEXT:  ; %bb.2:
; NOOPT-NEXT:    s_or_saveexec_b64 s[28:29], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[28:29]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v18, 21
; NOOPT-NEXT:    v_readlane_b32 s1, v18, 22
; NOOPT-NEXT:    s_mov_b64 exec, s[0:1]
; NOOPT-NEXT:  ; %bb.3:
; NOOPT-NEXT:    s_or_saveexec_b64 s[28:29], -1
; NOOPT-NEXT:    buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[28:29]
; NOOPT-NEXT:    ;;#ASMSTART
; NOOPT-NEXT:    s_mov_b32 s4, 17
; NOOPT-NEXT:    ;;#ASMEND
; NOOPT-NEXT:    s_mov_b32 s16, s4
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v18, 5
; NOOPT-NEXT:    v_readlane_b32 s1, v18, 6
; NOOPT-NEXT:    v_readlane_b32 s2, v18, 7
; NOOPT-NEXT:    v_readlane_b32 s3, v18, 8
; NOOPT-NEXT:    v_readlane_b32 s4, v18, 9
; NOOPT-NEXT:    v_readlane_b32 s5, v18, 10
; NOOPT-NEXT:    v_readlane_b32 s6, v18, 11
; NOOPT-NEXT:    v_readlane_b32 s7, v18, 12
; NOOPT-NEXT:    v_readlane_b32 s8, v18, 13
; NOOPT-NEXT:    v_readlane_b32 s9, v18, 14
; NOOPT-NEXT:    v_readlane_b32 s10, v18, 15
; NOOPT-NEXT:    v_readlane_b32 s11, v18, 16
; NOOPT-NEXT:    v_readlane_b32 s12, v18, 17
; NOOPT-NEXT:    v_readlane_b32 s13, v18, 18
; NOOPT-NEXT:    v_readlane_b32 s14, v18, 19
; NOOPT-NEXT:    v_readlane_b32 s15, v18, 20
; NOOPT-NEXT:    v_writelane_b32 v18, s16, 25
; NOOPT-NEXT:    v_mov_b32_e32 v0, s0
; NOOPT-NEXT:    v_mov_b32_e32 v1, s1
; NOOPT-NEXT:    v_mov_b32_e32 v2, s2
; NOOPT-NEXT:    v_mov_b32_e32 v3, s3
; NOOPT-NEXT:    v_mov_b32_e32 v4, s4
; NOOPT-NEXT:    v_mov_b32_e32 v5, s5
; NOOPT-NEXT:    v_mov_b32_e32 v6, s6
; NOOPT-NEXT:    v_mov_b32_e32 v7, s7
; NOOPT-NEXT:    v_mov_b32_e32 v8, s8
; NOOPT-NEXT:    v_mov_b32_e32 v9, s9
; NOOPT-NEXT:    v_mov_b32_e32 v10, s10
; NOOPT-NEXT:    v_mov_b32_e32 v11, s11
; NOOPT-NEXT:    v_mov_b32_e32 v12, s12
; NOOPT-NEXT:    v_mov_b32_e32 v13, s13
; NOOPT-NEXT:    v_mov_b32_e32 v14, s14
; NOOPT-NEXT:    v_mov_b32_e32 v15, s15
; NOOPT-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:88 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v1, off, s[36:39], 0 offset:92 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v2, off, s[36:39], 0 offset:96 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v3, off, s[36:39], 0 offset:100 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v4, off, s[36:39], 0 offset:104 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v5, off, s[36:39], 0 offset:108 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v6, off, s[36:39], 0 offset:112 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v7, off, s[36:39], 0 offset:116 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v8, off, s[36:39], 0 offset:120 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v9, off, s[36:39], 0 offset:124 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v10, off, s[36:39], 0 offset:128 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v11, off, s[36:39], 0 offset:132 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v12, off, s[36:39], 0 offset:136 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v13, off, s[36:39], 0 offset:140 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v14, off, s[36:39], 0 offset:144 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v15, off, s[36:39], 0 offset:148 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 s[0:1], exec
; NOOPT-NEXT:    v_writelane_b32 v18, s0, 26
; NOOPT-NEXT:    v_writelane_b32 v18, s1, 27
; NOOPT-NEXT:    s_or_saveexec_b64 s[28:29], -1
; NOOPT-NEXT:    buffer_store_dword v18, off, s[36:39], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[28:29]
; NOOPT-NEXT:    ; implicit-def: $vgpr0
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT:  .LBB16_4: ; =>This Inner Loop Header: Depth=1
; NOOPT-NEXT:    s_or_saveexec_b64 s[28:29], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[28:29]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v18, 28
; NOOPT-NEXT:    v_readlane_b32 s1, v18, 29
; NOOPT-NEXT:    buffer_load_dword v17, off, s[36:39], 0 offset:152 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:88 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:92 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v2, off, s[36:39], 0 offset:96 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v3, off, s[36:39], 0 offset:100 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v4, off, s[36:39], 0 offset:104 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v5, off, s[36:39], 0 offset:108 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v6, off, s[36:39], 0 offset:112 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v7, off, s[36:39], 0 offset:116 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v8, off, s[36:39], 0 offset:120 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v9, off, s[36:39], 0 offset:124 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v10, off, s[36:39], 0 offset:128 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v11, off, s[36:39], 0 offset:132 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v12, off, s[36:39], 0 offset:136 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v13, off, s[36:39], 0 offset:140 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v14, off, s[36:39], 0 offset:144 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v15, off, s[36:39], 0 offset:148 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v16, off, s[36:39], 0 offset:68 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readfirstlane_b32 s2, v16
; NOOPT-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v16
; NOOPT-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
; NOOPT-NEXT:    s_mov_b32 m0, s2
; NOOPT-NEXT:    v_movrels_b32_e32 v0, v0
; NOOPT-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:156 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:152 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 s[2:3], s[0:1]
; NOOPT-NEXT:    v_writelane_b32 v18, s2, 28
; NOOPT-NEXT:    v_writelane_b32 v18, s3, 29
; NOOPT-NEXT:    s_or_saveexec_b64 s[28:29], -1
; NOOPT-NEXT:    buffer_store_dword v18, off, s[36:39], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[28:29]
; NOOPT-NEXT:    s_xor_b64 exec, exec, s[0:1]
; NOOPT-NEXT:    s_cbranch_execnz .LBB16_4
; NOOPT-NEXT:  ; %bb.5:
; NOOPT-NEXT:    s_or_saveexec_b64 s[28:29], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[28:29]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v18, 26
; NOOPT-NEXT:    v_readlane_b32 s1, v18, 27
; NOOPT-NEXT:    s_mov_b64 exec, s[0:1]
; NOOPT-NEXT:  ; %bb.6:
; NOOPT-NEXT:    s_or_saveexec_b64 s[28:29], -1
; NOOPT-NEXT:    buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[28:29]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v18, 4
; NOOPT-NEXT:    v_readlane_b32 s4, v18, 0
; NOOPT-NEXT:    v_readlane_b32 s5, v18, 1
; NOOPT-NEXT:    v_readlane_b32 s6, v18, 2
; NOOPT-NEXT:    v_readlane_b32 s7, v18, 3
; NOOPT-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:76 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:156 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v2, off, s[36:39], 0 offset:84 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s0
; NOOPT-NEXT:    s_mov_b64 s[0:1], exec
; NOOPT-NEXT:    v_writelane_b32 v18, s0, 30
; NOOPT-NEXT:    v_writelane_b32 v18, s1, 31
; NOOPT-NEXT:    s_or_saveexec_b64 s[28:29], -1
; NOOPT-NEXT:    buffer_store_dword v18, off, s[36:39], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[28:29]
; NOOPT-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; NOOPT-NEXT:    s_mov_b64 exec, s[0:1]
; NOOPT-NEXT:    s_cbranch_execz .LBB16_8
; NOOPT-NEXT:  ; %bb.7: ; %bb1
; NOOPT-NEXT:    s_or_saveexec_b64 s[28:29], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[28:29]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s4, v18, 25
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s7, s1
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s5, 0xf000
; NOOPT-NEXT:    s_mov_b32 s6, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s7
; NOOPT-NEXT:    s_mov_b32 s2, s6
; NOOPT-NEXT:    s_mov_b32 s3, s5
; NOOPT-NEXT:    v_mov_b32_e32 v0, s4
; NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:  .LBB16_8: ; %bb2
; NOOPT-NEXT:    s_or_saveexec_b64 s[28:29], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[28:29]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v18, 30
; NOOPT-NEXT:    v_readlane_b32 s1, v18, 31
; NOOPT-NEXT:    s_or_b64 exec, exec, s[0:1]
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: extract_vgpr_offset_multiple_in_block:
; SI-MOVREL:       ; %bb.0: ; %entry
; SI-MOVREL-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0xd
; SI-MOVREL-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x9
; SI-MOVREL-NEXT:    s_mov_b32 s11, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s6, 0
; SI-MOVREL-NEXT:    s_mov_b32 s7, s11
; SI-MOVREL-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; SI-MOVREL-NEXT:    v_mov_b32_e32 v2, 0
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 glc
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; SI-MOVREL-NEXT:    s_mov_b32 s10, -1
; SI-MOVREL-NEXT:    ;;#ASMSTART
; SI-MOVREL-NEXT:    s_mov_b32 s4, 17
; SI-MOVREL-NEXT:    ;;#ASMEND
; SI-MOVREL-NEXT:    v_add_i32_e64 v0, s[0:1], 1, v1
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v1
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v2, 7, 9, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 2, v1
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v2, 11, v2, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v3, 7, 9, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 3, v1
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v2, 13, v2, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 2, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v3, 11, v3, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 4, v1
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v2, 5, v2, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 3, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v3, 13, v3, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 5, v1
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v2, 6, v2, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 4, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v3, 5, v3, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 6, v1
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v2, 7, v2, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 5, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v3, 6, v3, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 7, v1
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v2, 8, v2, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 6, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v3, 7, v3, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 8, v1
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v2, 9, v2, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 7, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v3, 8, v3, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 9, v1
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v2, 10, v2, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 8, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v3, 9, v3, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 10, v1
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v2, 11, v2, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 9, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v3, 10, v3, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 11, v1
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v2, 12, v2, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 10, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v3, 11, v3, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 12, v1
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v2, 13, v2, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 11, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v3, 12, v3, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 13, v1
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v2, 14, v2, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 12, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v3, 13, v3, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 14, v1
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v2, 15, v2, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 13, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v3, 14, v3, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 15, v1
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v1, 16, v2, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 14, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v2, 15, v3, s[0:1]
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e64 s[0:1], 15, v0
; SI-MOVREL-NEXT:    v_cndmask_b32_e64 v0, 16, v2, s[0:1]
; SI-MOVREL-NEXT:    buffer_store_dword v1, off, s[8:11], 0
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; SI-MOVREL-NEXT:    s_cbranch_execz .LBB16_2
; SI-MOVREL-NEXT:  ; %bb.1: ; %bb1
; SI-MOVREL-NEXT:    s_waitcnt expcnt(0)
; SI-MOVREL-NEXT:    v_mov_b32_e32 v0, s4
; SI-MOVREL-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:  .LBB16_2: ; %bb2
; SI-MOVREL-NEXT:    s_endpgm
;
; VI-LABEL: extract_vgpr_offset_multiple_in_block:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s1
; VI-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT:    flat_load_dword v2, v[1:2] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT:    ;;#ASMSTART
; VI-NEXT:    s_mov_b32 s4, 17
; VI-NEXT:    ;;#ASMEND
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e64 v3, s[0:1], 1, v2
; VI-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v2
; VI-NEXT:    v_cndmask_b32_e64 v4, 7, 9, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 2, v2
; VI-NEXT:    v_cndmask_b32_e64 v4, 11, v4, s[0:1]
; VI-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v3
; VI-NEXT:    v_cndmask_b32_e64 v5, 7, 9, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 3, v2
; VI-NEXT:    v_cndmask_b32_e64 v4, 13, v4, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 2, v3
; VI-NEXT:    v_cndmask_b32_e64 v5, 11, v5, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 4, v2
; VI-NEXT:    v_cndmask_b32_e64 v4, 5, v4, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 3, v3
; VI-NEXT:    v_cndmask_b32_e64 v5, 13, v5, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 5, v2
; VI-NEXT:    v_cndmask_b32_e64 v4, 6, v4, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 4, v3
; VI-NEXT:    v_cndmask_b32_e64 v5, 5, v5, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 6, v2
; VI-NEXT:    v_cndmask_b32_e64 v4, 7, v4, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 5, v3
; VI-NEXT:    v_cndmask_b32_e64 v5, 6, v5, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 7, v2
; VI-NEXT:    v_cndmask_b32_e64 v4, 8, v4, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 6, v3
; VI-NEXT:    v_cndmask_b32_e64 v5, 7, v5, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 8, v2
; VI-NEXT:    v_cndmask_b32_e64 v4, 9, v4, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 7, v3
; VI-NEXT:    v_cndmask_b32_e64 v5, 8, v5, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 9, v2
; VI-NEXT:    v_cndmask_b32_e64 v4, 10, v4, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 8, v3
; VI-NEXT:    v_cndmask_b32_e64 v5, 9, v5, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 10, v2
; VI-NEXT:    v_cndmask_b32_e64 v4, 11, v4, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 9, v3
; VI-NEXT:    v_cndmask_b32_e64 v5, 10, v5, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 11, v2
; VI-NEXT:    v_cndmask_b32_e64 v4, 12, v4, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 10, v3
; VI-NEXT:    v_cndmask_b32_e64 v5, 11, v5, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 12, v2
; VI-NEXT:    v_cndmask_b32_e64 v4, 13, v4, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 11, v3
; VI-NEXT:    v_cndmask_b32_e64 v5, 12, v5, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 13, v2
; VI-NEXT:    v_cndmask_b32_e64 v4, 14, v4, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 12, v3
; VI-NEXT:    v_cndmask_b32_e64 v5, 13, v5, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 14, v2
; VI-NEXT:    v_cndmask_b32_e64 v4, 15, v4, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 13, v3
; VI-NEXT:    v_cndmask_b32_e64 v5, 14, v5, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 15, v2
; VI-NEXT:    v_cndmask_b32_e64 v2, 16, v4, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 14, v3
; VI-NEXT:    v_cndmask_b32_e64 v4, 15, v5, s[0:1]
; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 15, v3
; VI-NEXT:    v_cndmask_b32_e64 v3, 16, v4, s[0:1]
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_store_dword v[0:1], v3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; VI-NEXT:    s_cbranch_execz .LBB16_2
; VI-NEXT:  ; %bb.1: ; %bb1
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:  .LBB16_2: ; %bb2
; VI-NEXT:    s_endpgm
;
; GFX9-IDXMODE-LABEL: extract_vgpr_offset_multiple_in_block:
; GFX9-IDXMODE:       ; %bb.0: ; %entry
; GFX9-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX9-IDXMODE-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x24
; GFX9-IDXMODE-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    global_load_dword v2, v1, s[0:1] glc
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-IDXMODE-NEXT:    ;;#ASMSTART
; GFX9-IDXMODE-NEXT:    s_mov_b32 s4, 17
; GFX9-IDXMODE-NEXT:    ;;#ASMEND
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v2
; GFX9-IDXMODE-NEXT:    v_add_u32_e32 v0, 1, v2
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v3, 7, 9, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 2, v2
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v3, 11, v3, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v4, 7, 9, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 3, v2
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v3, 13, v3, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 2, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v4, 11, v4, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 4, v2
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v3, 5, v3, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 3, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v4, 13, v4, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 5, v2
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v3, 6, v3, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 4, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v4, 5, v4, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 6, v2
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v3, 7, v3, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 5, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v4, 6, v4, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 7, v2
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v3, 8, v3, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 6, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v4, 7, v4, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 8, v2
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v3, 9, v3, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 7, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v4, 8, v4, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 9, v2
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v3, 10, v3, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 8, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v4, 9, v4, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 10, v2
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v3, 11, v3, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 9, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v4, 10, v4, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 11, v2
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v3, 12, v3, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 10, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v4, 11, v4, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 12, v2
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v3, 13, v3, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 11, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v4, 12, v4, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 13, v2
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v3, 14, v3, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 12, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v4, 13, v4, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 14, v2
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v3, 15, v3, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 13, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v4, 14, v4, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 15, v2
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v2, 16, v3, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 14, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v3, 15, v4, s[0:1]
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e64 s[0:1], 15, v0
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e64 v0, 16, v3, s[0:1]
; GFX9-IDXMODE-NEXT:    global_store_dword v1, v2, s[6:7]
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    global_store_dword v1, v0, s[6:7]
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX9-IDXMODE-NEXT:    s_cbranch_execz .LBB16_2
; GFX9-IDXMODE-NEXT:  ; %bb.1: ; %bb1
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-IDXMODE-NEXT:    global_store_dword v[0:1], v0, off
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:  .LBB16_2: ; %bb2
; GFX9-IDXMODE-NEXT:    s_endpgm
entry:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %id.ext = zext i32 %id to i64
  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %id.ext
  %idx0 = load volatile i32, ptr addrspace(1) %gep
  %idx1 = add i32 %idx0, 1
  %val0 = extractelement <16 x i32> <i32 7, i32 9, i32 11, i32 13, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %idx0
  %live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={s4}" ()
  %val1 = extractelement <16 x i32> <i32 7, i32 9, i32 11, i32 13, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %idx1
  store volatile i32 %val0, ptr addrspace(1) %out0
  store volatile i32 %val1, ptr addrspace(1) %out0
  %cmp = icmp eq i32 %id, 0
  br i1 %cmp, label %bb1, label %bb2

bb1:
  store volatile i32 %live.out.reg, ptr addrspace(1) undef
  br label %bb2

bb2:
  ret void
}

define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in, <16 x i32> %vec0) {
; GENERIC-LABEL: insert_vgpr_offset_multiple_in_block:
; GENERIC:       ; %bb.0: ; %entry
; GENERIC-NEXT:    s_load_dwordx2 s[24:25], s[2:3], 0xd
; GENERIC-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x19
; GENERIC-NEXT:    s_mov_b32 s23, 0xf000
; GENERIC-NEXT:    s_mov_b32 s26, 0
; GENERIC-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GENERIC-NEXT:    v_mov_b32_e32 v2, 0
; GENERIC-NEXT:    s_mov_b32 s27, s23
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    buffer_load_dword v2, v[1:2], s[24:27], 0 addr64 glc
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    s_load_dwordx2 s[20:21], s[2:3], 0x9
; GENERIC-NEXT:    s_mov_b32 s22, -1
; GENERIC-NEXT:    ;;#ASMSTART
; GENERIC-NEXT:    v_mov_b32 v1, 62
; GENERIC-NEXT:    ;;#ASMEND
; GENERIC-NEXT:    v_mov_b32_e32 v3, s16
; GENERIC-NEXT:    v_mov_b32_e32 v4, s17
; GENERIC-NEXT:    v_mov_b32_e32 v5, s18
; GENERIC-NEXT:    v_mov_b32_e32 v6, s19
; GENERIC-NEXT:    v_mov_b32_e32 v7, s12
; GENERIC-NEXT:    v_mov_b32_e32 v8, s13
; GENERIC-NEXT:    v_mov_b32_e32 v9, s14
; GENERIC-NEXT:    v_mov_b32_e32 v10, s15
; GENERIC-NEXT:    v_mov_b32_e32 v11, s8
; GENERIC-NEXT:    v_mov_b32_e32 v12, s9
; GENERIC-NEXT:    v_mov_b32_e32 v13, s10
; GENERIC-NEXT:    v_mov_b32_e32 v14, s11
; GENERIC-NEXT:    v_mov_b32_e32 v15, s4
; GENERIC-NEXT:    v_mov_b32_e32 v16, s5
; GENERIC-NEXT:    v_mov_b32_e32 v17, s6
; GENERIC-NEXT:    v_mov_b32_e32 v18, s7
; GENERIC-NEXT:    v_add_i32_e32 v19, vcc, 1, v2
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 12, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v20, v3, v1, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 13, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v21, v4, v1, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 14, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v22, v5, v1, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 15, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v23, v6, v1, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v24, v7, v1, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v25, v8, v1, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 10, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v26, v9, v1, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 11, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v10, v10, v1, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v11, v1, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v12, v1, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v8, v13, v1, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v9, v14, v1, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v11, v15, v1, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v3, v16, v1, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v4, v17, v1, vcc
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v2
; GENERIC-NEXT:    v_cndmask_b32_e32 v2, v18, v1, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 3, v19
; GENERIC-NEXT:    v_cndmask_b32_e32 v5, 63, v2, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 2, v19
; GENERIC-NEXT:    v_cndmask_b32_e32 v4, 63, v4, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v19
; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 63, v3, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v19
; GENERIC-NEXT:    v_cndmask_b32_e32 v2, 63, v11, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 7, v19
; GENERIC-NEXT:    v_cndmask_b32_e32 v9, 63, v9, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 6, v19
; GENERIC-NEXT:    v_cndmask_b32_e32 v8, 63, v8, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 5, v19
; GENERIC-NEXT:    v_cndmask_b32_e32 v7, 63, v7, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 4, v19
; GENERIC-NEXT:    v_cndmask_b32_e32 v6, 63, v6, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 11, v19
; GENERIC-NEXT:    v_cndmask_b32_e32 v13, 63, v10, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 10, v19
; GENERIC-NEXT:    v_cndmask_b32_e32 v12, 63, v26, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 9, v19
; GENERIC-NEXT:    v_cndmask_b32_e32 v11, 63, v25, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 8, v19
; GENERIC-NEXT:    v_cndmask_b32_e32 v10, 63, v24, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 15, v19
; GENERIC-NEXT:    v_cndmask_b32_e32 v17, 63, v23, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 14, v19
; GENERIC-NEXT:    v_cndmask_b32_e32 v16, 63, v22, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 13, v19
; GENERIC-NEXT:    v_cndmask_b32_e32 v15, 63, v21, vcc
; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 12, v19
; GENERIC-NEXT:    v_cndmask_b32_e32 v14, 63, v20, vcc
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    buffer_store_dwordx4 v[14:17], off, s[20:23], 0 offset:48
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    buffer_store_dwordx4 v[10:13], off, s[20:23], 0 offset:32
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    buffer_store_dwordx4 v[6:9], off, s[20:23], 0 offset:16
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    buffer_store_dwordx4 v[2:5], off, s[20:23], 0
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GENERIC-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GENERIC-NEXT:    s_cbranch_execz .LBB17_2
; GENERIC-NEXT:  ; %bb.1: ; %bb1
; GENERIC-NEXT:    buffer_store_dword v1, off, s[20:23], 0
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:  .LBB17_2: ; %bb2
; GENERIC-NEXT:    s_endpgm
;
; NOOPT-LABEL: insert_vgpr_offset_multiple_in_block:
; NOOPT:       ; %bb.0: ; %entry
; NOOPT-NEXT:    s_mov_b32 s28, SCRATCH_RSRC_DWORD0
; NOOPT-NEXT:    s_mov_b32 s29, SCRATCH_RSRC_DWORD1
; NOOPT-NEXT:    s_mov_b32 s30, -1
; NOOPT-NEXT:    s_mov_b32 s31, 0xe8f000
; NOOPT-NEXT:    s_add_u32 s28, s28, s9
; NOOPT-NEXT:    s_addc_u32 s29, s29, 0
; NOOPT-NEXT:    buffer_store_dword v0, off, s[28:31], 0 offset:84 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_load_dwordx2 s[18:19], s[2:3], 0x9
; NOOPT-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0xd
; NOOPT-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x19
; NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; NOOPT-NEXT:    s_mov_b32 s24, s19
; NOOPT-NEXT:    s_mov_b32 s20, s18
; NOOPT-NEXT:    s_mov_b32 s18, 0xf000
; NOOPT-NEXT:    s_mov_b32 s19, -1
; NOOPT-NEXT:    ; kill: def $sgpr20 killed $sgpr20 def $sgpr20_sgpr21_sgpr22_sgpr23
; NOOPT-NEXT:    s_mov_b32 s21, s24
; NOOPT-NEXT:    s_mov_b32 s22, s19
; NOOPT-NEXT:    s_mov_b32 s23, s18
; NOOPT-NEXT:    ; implicit-def: $vgpr32 : SGPR spill to VGPR lane
; NOOPT-NEXT:    v_writelane_b32 v32, s20, 0
; NOOPT-NEXT:    v_writelane_b32 v32, s21, 1
; NOOPT-NEXT:    v_writelane_b32 v32, s22, 2
; NOOPT-NEXT:    v_writelane_b32 v32, s23, 3
; NOOPT-NEXT:    s_mov_b32 s20, 0
; NOOPT-NEXT:    v_writelane_b32 v32, s20, 4
; NOOPT-NEXT:    ; kill: def $sgpr20 killed $sgpr20 def $sgpr20_sgpr21
; NOOPT-NEXT:    s_mov_b32 s21, s18
; NOOPT-NEXT:    ; kill: def $sgpr16_sgpr17 killed $sgpr16_sgpr17 def $sgpr16_sgpr17_sgpr18_sgpr19
; NOOPT-NEXT:    s_mov_b64 s[18:19], s[20:21]
; NOOPT-NEXT:    s_mov_b32 s20, 2
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_lshlrev_b32_e64 v0, s20, v0
; NOOPT-NEXT:    s_mov_b32 s20, 0
; NOOPT-NEXT:    ; implicit-def: $sgpr20
; NOOPT-NEXT:    v_mov_b32_e32 v2, 0
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v2
; NOOPT-NEXT:    buffer_load_dword v0, v[0:1], s[16:19], 0 addr64 glc
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    buffer_store_dword v0, off, s[28:31], 0 offset:80 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b32 s16, 1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_add_i32_e64 v0, s[16:17], v0, s16
; NOOPT-NEXT:    buffer_store_dword v0, off, s[28:31], 0 offset:76 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    ;;#ASMSTART
; NOOPT-NEXT:    v_mov_b32 v0, 62
; NOOPT-NEXT:    ;;#ASMEND
; NOOPT-NEXT:    buffer_store_dword v0, off, s[28:31], 0 offset:72 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v0, off, s[28:31], 0 offset:68 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, s0
; NOOPT-NEXT:    v_mov_b32_e32 v1, s1
; NOOPT-NEXT:    v_mov_b32_e32 v2, s2
; NOOPT-NEXT:    v_mov_b32_e32 v3, s3
; NOOPT-NEXT:    v_mov_b32_e32 v4, s4
; NOOPT-NEXT:    v_mov_b32_e32 v5, s5
; NOOPT-NEXT:    v_mov_b32_e32 v6, s6
; NOOPT-NEXT:    v_mov_b32_e32 v7, s7
; NOOPT-NEXT:    v_mov_b32_e32 v8, s8
; NOOPT-NEXT:    v_mov_b32_e32 v9, s9
; NOOPT-NEXT:    v_mov_b32_e32 v10, s10
; NOOPT-NEXT:    v_mov_b32_e32 v11, s11
; NOOPT-NEXT:    v_mov_b32_e32 v12, s12
; NOOPT-NEXT:    v_mov_b32_e32 v13, s13
; NOOPT-NEXT:    v_mov_b32_e32 v14, s14
; NOOPT-NEXT:    v_mov_b32_e32 v15, s15
; NOOPT-NEXT:    s_mov_b64 s[0:1], exec
; NOOPT-NEXT:    v_writelane_b32 v32, s0, 5
; NOOPT-NEXT:    v_writelane_b32 v32, s1, 6
; NOOPT-NEXT:    s_or_saveexec_b64 s[26:27], -1
; NOOPT-NEXT:    buffer_store_dword v32, off, s[28:31], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[26:27]
; NOOPT-NEXT:    buffer_store_dword v0, off, s[28:31], 0 offset:4 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v1, off, s[28:31], 0 offset:8 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v2, off, s[28:31], 0 offset:12 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v3, off, s[28:31], 0 offset:16 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v4, off, s[28:31], 0 offset:20 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v5, off, s[28:31], 0 offset:24 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v6, off, s[28:31], 0 offset:28 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v7, off, s[28:31], 0 offset:32 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v8, off, s[28:31], 0 offset:36 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v9, off, s[28:31], 0 offset:40 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v10, off, s[28:31], 0 offset:44 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v11, off, s[28:31], 0 offset:48 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v12, off, s[28:31], 0 offset:52 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v13, off, s[28:31], 0 offset:56 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v14, off, s[28:31], 0 offset:60 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v15, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
; NOOPT-NEXT:    s_or_saveexec_b64 s[26:27], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[26:27]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v32, 7
; NOOPT-NEXT:    v_readlane_b32 s1, v32, 8
; NOOPT-NEXT:    buffer_load_dword v0, off, s[28:31], 0 offset:4 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v1, off, s[28:31], 0 offset:8 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v2, off, s[28:31], 0 offset:12 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v3, off, s[28:31], 0 offset:16 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v4, off, s[28:31], 0 offset:20 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v5, off, s[28:31], 0 offset:24 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v6, off, s[28:31], 0 offset:28 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v7, off, s[28:31], 0 offset:32 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v8, off, s[28:31], 0 offset:36 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(6)
; NOOPT-NEXT:    buffer_load_dword v9, off, s[28:31], 0 offset:40 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(5)
; NOOPT-NEXT:    buffer_load_dword v10, off, s[28:31], 0 offset:44 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(4)
; NOOPT-NEXT:    buffer_load_dword v11, off, s[28:31], 0 offset:48 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(3)
; NOOPT-NEXT:    buffer_load_dword v12, off, s[28:31], 0 offset:52 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(2)
; NOOPT-NEXT:    buffer_load_dword v13, off, s[28:31], 0 offset:56 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(1)
; NOOPT-NEXT:    buffer_load_dword v14, off, s[28:31], 0 offset:60 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v15, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v16, off, s[28:31], 0 offset:72 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v17, off, s[28:31], 0 offset:80 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readfirstlane_b32 s2, v17
; NOOPT-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v17
; NOOPT-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
; NOOPT-NEXT:    s_mov_b32 m0, s2
; NOOPT-NEXT:    v_movreld_b32_e32 v0, v16
; NOOPT-NEXT:    buffer_store_dword v0, off, s[28:31], 0 offset:88 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v1, off, s[28:31], 0 offset:92 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v2, off, s[28:31], 0 offset:96 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v3, off, s[28:31], 0 offset:100 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v4, off, s[28:31], 0 offset:104 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v5, off, s[28:31], 0 offset:108 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v6, off, s[28:31], 0 offset:112 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v7, off, s[28:31], 0 offset:116 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v8, off, s[28:31], 0 offset:120 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v9, off, s[28:31], 0 offset:124 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v10, off, s[28:31], 0 offset:128 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v11, off, s[28:31], 0 offset:132 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v12, off, s[28:31], 0 offset:136 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v13, off, s[28:31], 0 offset:140 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v14, off, s[28:31], 0 offset:144 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v15, off, s[28:31], 0 offset:148 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v0, off, s[28:31], 0 offset:4 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v1, off, s[28:31], 0 offset:8 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v2, off, s[28:31], 0 offset:12 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v3, off, s[28:31], 0 offset:16 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v4, off, s[28:31], 0 offset:20 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v5, off, s[28:31], 0 offset:24 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v6, off, s[28:31], 0 offset:28 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v7, off, s[28:31], 0 offset:32 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v8, off, s[28:31], 0 offset:36 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v9, off, s[28:31], 0 offset:40 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v10, off, s[28:31], 0 offset:44 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v11, off, s[28:31], 0 offset:48 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v12, off, s[28:31], 0 offset:52 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v13, off, s[28:31], 0 offset:56 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v14, off, s[28:31], 0 offset:60 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v15, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 s[2:3], s[0:1]
; NOOPT-NEXT:    v_writelane_b32 v32, s2, 7
; NOOPT-NEXT:    v_writelane_b32 v32, s3, 8
; NOOPT-NEXT:    s_or_saveexec_b64 s[26:27], -1
; NOOPT-NEXT:    buffer_store_dword v32, off, s[28:31], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[26:27]
; NOOPT-NEXT:    s_xor_b64 exec, exec, s[0:1]
; NOOPT-NEXT:    s_cbranch_execnz .LBB17_1
; NOOPT-NEXT:  ; %bb.2:
; NOOPT-NEXT:    s_or_saveexec_b64 s[26:27], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[26:27]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v32, 5
; NOOPT-NEXT:    v_readlane_b32 s1, v32, 6
; NOOPT-NEXT:    s_mov_b64 exec, s[0:1]
; NOOPT-NEXT:  ; %bb.3:
; NOOPT-NEXT:    s_or_saveexec_b64 s[26:27], -1
; NOOPT-NEXT:    buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[26:27]
; NOOPT-NEXT:    buffer_load_dword v0, off, s[28:31], 0 offset:88 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v1, off, s[28:31], 0 offset:92 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v2, off, s[28:31], 0 offset:96 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v3, off, s[28:31], 0 offset:100 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v4, off, s[28:31], 0 offset:104 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v5, off, s[28:31], 0 offset:108 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v6, off, s[28:31], 0 offset:112 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v7, off, s[28:31], 0 offset:116 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v8, off, s[28:31], 0 offset:120 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v9, off, s[28:31], 0 offset:124 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v10, off, s[28:31], 0 offset:128 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v11, off, s[28:31], 0 offset:132 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v12, off, s[28:31], 0 offset:136 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v13, off, s[28:31], 0 offset:140 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v14, off, s[28:31], 0 offset:144 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v15, off, s[28:31], 0 offset:148 ; 4-byte Folded Reload
; NOOPT-NEXT:    v_mov_b32_e32 v16, 63
; NOOPT-NEXT:    buffer_store_dword v16, off, s[28:31], 0 offset:216 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 s[0:1], exec
; NOOPT-NEXT:    s_waitcnt vmcnt(14)
; NOOPT-NEXT:    v_writelane_b32 v32, s0, 9
; NOOPT-NEXT:    v_writelane_b32 v32, s1, 10
; NOOPT-NEXT:    s_or_saveexec_b64 s[26:27], -1
; NOOPT-NEXT:    buffer_store_dword v32, off, s[28:31], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[26:27]
; NOOPT-NEXT:    buffer_store_dword v0, off, s[28:31], 0 offset:152 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v1, off, s[28:31], 0 offset:156 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v2, off, s[28:31], 0 offset:160 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_waitcnt vmcnt(14)
; NOOPT-NEXT:    buffer_store_dword v3, off, s[28:31], 0 offset:164 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v4, off, s[28:31], 0 offset:168 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v5, off, s[28:31], 0 offset:172 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v6, off, s[28:31], 0 offset:176 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_waitcnt vmcnt(14)
; NOOPT-NEXT:    buffer_store_dword v7, off, s[28:31], 0 offset:180 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v8, off, s[28:31], 0 offset:184 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v9, off, s[28:31], 0 offset:188 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v10, off, s[28:31], 0 offset:192 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_waitcnt vmcnt(14)
; NOOPT-NEXT:    buffer_store_dword v11, off, s[28:31], 0 offset:196 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v12, off, s[28:31], 0 offset:200 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v13, off, s[28:31], 0 offset:204 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v14, off, s[28:31], 0 offset:208 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_waitcnt vmcnt(14)
; NOOPT-NEXT:    buffer_store_dword v15, off, s[28:31], 0 offset:212 ; 4-byte Folded Spill
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT:  .LBB17_4: ; =>This Inner Loop Header: Depth=1
; NOOPT-NEXT:    s_or_saveexec_b64 s[26:27], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[26:27]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v32, 11
; NOOPT-NEXT:    v_readlane_b32 s1, v32, 12
; NOOPT-NEXT:    buffer_load_dword v0, off, s[28:31], 0 offset:152 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v1, off, s[28:31], 0 offset:156 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v2, off, s[28:31], 0 offset:160 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v3, off, s[28:31], 0 offset:164 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v4, off, s[28:31], 0 offset:168 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v5, off, s[28:31], 0 offset:172 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v6, off, s[28:31], 0 offset:176 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v7, off, s[28:31], 0 offset:180 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v8, off, s[28:31], 0 offset:184 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(6)
; NOOPT-NEXT:    buffer_load_dword v9, off, s[28:31], 0 offset:188 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(5)
; NOOPT-NEXT:    buffer_load_dword v10, off, s[28:31], 0 offset:192 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(4)
; NOOPT-NEXT:    buffer_load_dword v11, off, s[28:31], 0 offset:196 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(3)
; NOOPT-NEXT:    buffer_load_dword v12, off, s[28:31], 0 offset:200 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(2)
; NOOPT-NEXT:    buffer_load_dword v13, off, s[28:31], 0 offset:204 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(1)
; NOOPT-NEXT:    buffer_load_dword v14, off, s[28:31], 0 offset:208 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v15, off, s[28:31], 0 offset:212 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v16, off, s[28:31], 0 offset:216 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v17, off, s[28:31], 0 offset:76 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readfirstlane_b32 s2, v17
; NOOPT-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v17
; NOOPT-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
; NOOPT-NEXT:    s_mov_b32 m0, s2
; NOOPT-NEXT:    v_movreld_b32_e32 v0, v16
; NOOPT-NEXT:    buffer_store_dword v0, off, s[28:31], 0 offset:220 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v1, off, s[28:31], 0 offset:224 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v2, off, s[28:31], 0 offset:228 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v3, off, s[28:31], 0 offset:232 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v4, off, s[28:31], 0 offset:236 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v5, off, s[28:31], 0 offset:240 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v6, off, s[28:31], 0 offset:244 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v7, off, s[28:31], 0 offset:248 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v8, off, s[28:31], 0 offset:252 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v9, off, s[28:31], 0 offset:256 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v10, off, s[28:31], 0 offset:260 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v11, off, s[28:31], 0 offset:264 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v12, off, s[28:31], 0 offset:268 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v13, off, s[28:31], 0 offset:272 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v14, off, s[28:31], 0 offset:276 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v15, off, s[28:31], 0 offset:280 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v0, off, s[28:31], 0 offset:152 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v1, off, s[28:31], 0 offset:156 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v2, off, s[28:31], 0 offset:160 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v3, off, s[28:31], 0 offset:164 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v4, off, s[28:31], 0 offset:168 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v5, off, s[28:31], 0 offset:172 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v6, off, s[28:31], 0 offset:176 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v7, off, s[28:31], 0 offset:180 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v8, off, s[28:31], 0 offset:184 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v9, off, s[28:31], 0 offset:188 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v10, off, s[28:31], 0 offset:192 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v11, off, s[28:31], 0 offset:196 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v12, off, s[28:31], 0 offset:200 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v13, off, s[28:31], 0 offset:204 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v14, off, s[28:31], 0 offset:208 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v15, off, s[28:31], 0 offset:212 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 s[2:3], s[0:1]
; NOOPT-NEXT:    v_writelane_b32 v32, s2, 11
; NOOPT-NEXT:    v_writelane_b32 v32, s3, 12
; NOOPT-NEXT:    s_or_saveexec_b64 s[26:27], -1
; NOOPT-NEXT:    buffer_store_dword v32, off, s[28:31], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[26:27]
; NOOPT-NEXT:    s_xor_b64 exec, exec, s[0:1]
; NOOPT-NEXT:    s_cbranch_execnz .LBB17_4
; NOOPT-NEXT:  ; %bb.5:
; NOOPT-NEXT:    s_or_saveexec_b64 s[26:27], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[26:27]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v32, 9
; NOOPT-NEXT:    v_readlane_b32 s1, v32, 10
; NOOPT-NEXT:    s_mov_b64 exec, s[0:1]
; NOOPT-NEXT:  ; %bb.6:
; NOOPT-NEXT:    s_or_saveexec_b64 s[26:27], -1
; NOOPT-NEXT:    buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[26:27]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v32, 4
; NOOPT-NEXT:    v_readlane_b32 s4, v32, 0
; NOOPT-NEXT:    v_readlane_b32 s5, v32, 1
; NOOPT-NEXT:    v_readlane_b32 s6, v32, 2
; NOOPT-NEXT:    v_readlane_b32 s7, v32, 3
; NOOPT-NEXT:    buffer_load_dword v0, off, s[28:31], 0 offset:84 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v16, off, s[28:31], 0 offset:220 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v17, off, s[28:31], 0 offset:224 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v18, off, s[28:31], 0 offset:228 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v19, off, s[28:31], 0 offset:232 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v20, off, s[28:31], 0 offset:236 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v21, off, s[28:31], 0 offset:240 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v22, off, s[28:31], 0 offset:244 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v23, off, s[28:31], 0 offset:248 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v24, off, s[28:31], 0 offset:252 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v25, off, s[28:31], 0 offset:256 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v26, off, s[28:31], 0 offset:260 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v27, off, s[28:31], 0 offset:264 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v28, off, s[28:31], 0 offset:268 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v29, off, s[28:31], 0 offset:272 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v30, off, s[28:31], 0 offset:276 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v31, off, s[28:31], 0 offset:280 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt vmcnt(12)
; NOOPT-NEXT:    v_mov_b32_e32 v5, v19
; NOOPT-NEXT:    v_mov_b32_e32 v6, v18
; NOOPT-NEXT:    v_mov_b32_e32 v7, v17
; NOOPT-NEXT:    v_mov_b32_e32 v1, v16
; NOOPT-NEXT:    s_waitcnt vmcnt(8)
; NOOPT-NEXT:    v_mov_b32_e32 v2, v23
; NOOPT-NEXT:    v_mov_b32_e32 v3, v22
; NOOPT-NEXT:    v_mov_b32_e32 v4, v21
; NOOPT-NEXT:    v_mov_b32_e32 v8, v20
; NOOPT-NEXT:    s_waitcnt vmcnt(4)
; NOOPT-NEXT:    v_mov_b32_e32 v13, v27
; NOOPT-NEXT:    v_mov_b32_e32 v14, v26
; NOOPT-NEXT:    v_mov_b32_e32 v15, v25
; NOOPT-NEXT:    v_mov_b32_e32 v9, v24
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v10, v31
; NOOPT-NEXT:    v_mov_b32_e32 v11, v30
; NOOPT-NEXT:    v_mov_b32_e32 v12, v29
; NOOPT-NEXT:    v_mov_b32_e32 v16, v28
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17_vgpr18_vgpr19 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v17, v12
; NOOPT-NEXT:    v_mov_b32_e32 v18, v11
; NOOPT-NEXT:    v_mov_b32_e32 v19, v10
; NOOPT-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:48
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10_vgpr11_vgpr12 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v10, v15
; NOOPT-NEXT:    v_mov_b32_e32 v11, v14
; NOOPT-NEXT:    v_mov_b32_e32 v12, v13
; NOOPT-NEXT:    buffer_store_dwordx4 v[9:12], off, s[4:7], 0 offset:32
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v9, v4
; NOOPT-NEXT:    v_mov_b32_e32 v10, v3
; NOOPT-NEXT:    v_mov_b32_e32 v11, v2
; NOOPT-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v2, v7
; NOOPT-NEXT:    v_mov_b32_e32 v3, v6
; NOOPT-NEXT:    v_mov_b32_e32 v4, v5
; NOOPT-NEXT:    buffer_store_dwordx4 v[1:4], off, s[4:7], 0
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s0
; NOOPT-NEXT:    s_mov_b64 s[0:1], exec
; NOOPT-NEXT:    v_writelane_b32 v32, s0, 13
; NOOPT-NEXT:    v_writelane_b32 v32, s1, 14
; NOOPT-NEXT:    s_or_saveexec_b64 s[26:27], -1
; NOOPT-NEXT:    buffer_store_dword v32, off, s[28:31], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[26:27]
; NOOPT-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; NOOPT-NEXT:    s_mov_b64 exec, s[0:1]
; NOOPT-NEXT:    s_cbranch_execz .LBB17_8
; NOOPT-NEXT:  ; %bb.7: ; %bb1
; NOOPT-NEXT:    buffer_load_dword v0, off, s[28:31], 0 offset:68 ; 4-byte Folded Reload
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s6, s1
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s4, 0xf000
; NOOPT-NEXT:    s_mov_b32 s5, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s6
; NOOPT-NEXT:    s_mov_b32 s2, s5
; NOOPT-NEXT:    s_mov_b32 s3, s4
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:  .LBB17_8: ; %bb2
; NOOPT-NEXT:    s_or_saveexec_b64 s[26:27], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[26:27]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v32, 13
; NOOPT-NEXT:    v_readlane_b32 s1, v32, 14
; NOOPT-NEXT:    s_or_b64 exec, exec, s[0:1]
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: insert_vgpr_offset_multiple_in_block:
; SI-MOVREL:       ; %bb.0: ; %entry
; SI-MOVREL-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0xd
; SI-MOVREL-NEXT:    s_mov_b32 s23, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s6, 0
; SI-MOVREL-NEXT:    s_mov_b32 s7, s23
; SI-MOVREL-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; SI-MOVREL-NEXT:    v_mov_b32_e32 v2, 0
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    buffer_load_dword v2, v[1:2], s[4:7], 0 addr64 glc
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x19
; SI-MOVREL-NEXT:    s_load_dwordx2 s[20:21], s[2:3], 0x9
; SI-MOVREL-NEXT:    ;;#ASMSTART
; SI-MOVREL-NEXT:    v_mov_b32 v1, 62
; SI-MOVREL-NEXT:    ;;#ASMEND
; SI-MOVREL-NEXT:    s_mov_b32 s22, -1
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    v_mov_b32_e32 v3, s16
; SI-MOVREL-NEXT:    v_mov_b32_e32 v4, s17
; SI-MOVREL-NEXT:    v_mov_b32_e32 v5, s18
; SI-MOVREL-NEXT:    v_mov_b32_e32 v6, s19
; SI-MOVREL-NEXT:    v_mov_b32_e32 v7, s12
; SI-MOVREL-NEXT:    v_mov_b32_e32 v8, s13
; SI-MOVREL-NEXT:    v_mov_b32_e32 v9, s14
; SI-MOVREL-NEXT:    v_mov_b32_e32 v10, s15
; SI-MOVREL-NEXT:    v_mov_b32_e32 v11, s8
; SI-MOVREL-NEXT:    v_mov_b32_e32 v12, s9
; SI-MOVREL-NEXT:    v_mov_b32_e32 v13, s10
; SI-MOVREL-NEXT:    v_mov_b32_e32 v14, s11
; SI-MOVREL-NEXT:    v_mov_b32_e32 v15, s4
; SI-MOVREL-NEXT:    v_mov_b32_e32 v16, s5
; SI-MOVREL-NEXT:    v_mov_b32_e32 v17, s6
; SI-MOVREL-NEXT:    v_mov_b32_e32 v18, s7
; SI-MOVREL-NEXT:    v_add_i32_e32 v19, vcc, 1, v2
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 12, v2
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v20, v3, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 13, v2
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v21, v4, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 14, v2
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v22, v5, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 15, v2
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v23, v6, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v2
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v24, v7, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v2
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v25, v8, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 10, v2
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v26, v9, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 11, v2
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v10, v10, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v2
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v6, v11, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v2
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v7, v12, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v2
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v8, v13, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v2
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v9, v14, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v11, v15, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v3, v16, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v2
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v4, v17, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v2
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v2, v18, v1, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 3, v19
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v5, 63, v2, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 2, v19
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v4, 63, v4, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v19
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v3, 63, v3, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v19
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v2, 63, v11, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 7, v19
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v9, 63, v9, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 6, v19
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v8, 63, v8, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 5, v19
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v7, 63, v7, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 4, v19
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v6, 63, v6, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 11, v19
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v13, 63, v10, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 10, v19
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v12, 63, v26, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 9, v19
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v11, 63, v25, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 8, v19
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v10, 63, v24, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 15, v19
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v17, 63, v23, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 14, v19
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v16, 63, v22, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 13, v19
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v15, 63, v21, vcc
; SI-MOVREL-NEXT:    v_cmp_ne_u32_e32 vcc, 12, v19
; SI-MOVREL-NEXT:    v_cndmask_b32_e32 v14, 63, v20, vcc
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[14:17], off, s[20:23], 0 offset:48
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[10:13], off, s[20:23], 0 offset:32
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[6:9], off, s[20:23], 0 offset:16
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[2:5], off, s[20:23], 0
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; SI-MOVREL-NEXT:    s_cbranch_execz .LBB17_2
; SI-MOVREL-NEXT:  ; %bb.1: ; %bb1
; SI-MOVREL-NEXT:    buffer_store_dword v1, off, s[20:23], 0
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:  .LBB17_2: ; %bb2
; SI-MOVREL-NEXT:    s_endpgm
;
; VI-LABEL: insert_vgpr_offset_multiple_in_block:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s1
; VI-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT:    flat_load_dword v2, v[1:2] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x64
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT:    ;;#ASMSTART
; VI-NEXT:    v_mov_b32 v1, 62
; VI-NEXT:    ;;#ASMEND
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, s16
; VI-NEXT:    v_mov_b32_e32 v4, s17
; VI-NEXT:    v_mov_b32_e32 v5, s18
; VI-NEXT:    v_mov_b32_e32 v6, s19
; VI-NEXT:    v_mov_b32_e32 v7, s12
; VI-NEXT:    v_mov_b32_e32 v8, s13
; VI-NEXT:    v_mov_b32_e32 v9, s14
; VI-NEXT:    v_mov_b32_e32 v10, s15
; VI-NEXT:    v_mov_b32_e32 v11, s8
; VI-NEXT:    v_mov_b32_e32 v12, s9
; VI-NEXT:    v_mov_b32_e32 v13, s10
; VI-NEXT:    v_mov_b32_e32 v14, s11
; VI-NEXT:    v_mov_b32_e32 v15, s4
; VI-NEXT:    v_mov_b32_e32 v16, s5
; VI-NEXT:    v_mov_b32_e32 v17, s6
; VI-NEXT:    v_mov_b32_e32 v18, s7
; VI-NEXT:    s_add_u32 s2, s0, 48
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    v_add_u32_e32 v19, vcc, 1, v2
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 12, v2
; VI-NEXT:    v_cndmask_b32_e32 v20, v3, v1, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 13, v2
; VI-NEXT:    v_cndmask_b32_e32 v21, v4, v1, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 14, v2
; VI-NEXT:    v_cndmask_b32_e32 v22, v5, v1, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 15, v2
; VI-NEXT:    v_cndmask_b32_e32 v23, v6, v1, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v2
; VI-NEXT:    v_cndmask_b32_e32 v24, v7, v1, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v2
; VI-NEXT:    v_cndmask_b32_e32 v25, v8, v1, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 10, v2
; VI-NEXT:    v_cndmask_b32_e32 v26, v9, v1, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 11, v2
; VI-NEXT:    v_cndmask_b32_e32 v10, v10, v1, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v2
; VI-NEXT:    v_cndmask_b32_e32 v6, v11, v1, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v2
; VI-NEXT:    v_cndmask_b32_e32 v7, v12, v1, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v2
; VI-NEXT:    v_cndmask_b32_e32 v8, v13, v1, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v2
; VI-NEXT:    v_cndmask_b32_e32 v9, v14, v1, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; VI-NEXT:    v_cndmask_b32_e32 v11, v15, v1, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
; VI-NEXT:    v_cndmask_b32_e32 v3, v16, v1, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v2
; VI-NEXT:    v_cndmask_b32_e32 v4, v17, v1, vcc
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v2
; VI-NEXT:    v_cndmask_b32_e32 v2, v18, v1, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 3, v19
; VI-NEXT:    v_cndmask_b32_e32 v5, 63, v2, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 2, v19
; VI-NEXT:    v_cndmask_b32_e32 v4, 63, v4, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v19
; VI-NEXT:    v_cndmask_b32_e32 v3, 63, v3, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v19
; VI-NEXT:    v_cndmask_b32_e32 v2, 63, v11, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 7, v19
; VI-NEXT:    v_cndmask_b32_e32 v9, 63, v9, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 6, v19
; VI-NEXT:    v_cndmask_b32_e32 v8, 63, v8, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 5, v19
; VI-NEXT:    v_cndmask_b32_e32 v7, 63, v7, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 4, v19
; VI-NEXT:    v_cndmask_b32_e32 v6, 63, v6, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 11, v19
; VI-NEXT:    v_cndmask_b32_e32 v13, 63, v10, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 10, v19
; VI-NEXT:    v_cndmask_b32_e32 v12, 63, v26, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 9, v19
; VI-NEXT:    v_cndmask_b32_e32 v11, 63, v25, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 8, v19
; VI-NEXT:    v_cndmask_b32_e32 v10, 63, v24, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 15, v19
; VI-NEXT:    v_cndmask_b32_e32 v17, 63, v23, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 14, v19
; VI-NEXT:    v_cndmask_b32_e32 v16, 63, v22, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 13, v19
; VI-NEXT:    v_cndmask_b32_e32 v15, 63, v21, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 12, v19
; VI-NEXT:    v_mov_b32_e32 v19, s3
; VI-NEXT:    v_mov_b32_e32 v18, s2
; VI-NEXT:    s_add_u32 s2, s0, 32
; VI-NEXT:    v_cndmask_b32_e32 v14, 63, v20, vcc
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    flat_store_dwordx4 v[18:19], v[14:17]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT:    v_mov_b32_e32 v15, s3
; VI-NEXT:    v_mov_b32_e32 v14, s2
; VI-NEXT:    s_add_u32 s2, s0, 16
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v11, s3
; VI-NEXT:    v_mov_b32_e32 v10, s2
; VI-NEXT:    flat_store_dwordx4 v[10:11], v[6:9]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v7, s1
; VI-NEXT:    v_mov_b32_e32 v6, s0
; VI-NEXT:    flat_store_dwordx4 v[6:7], v[2:5]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; VI-NEXT:    s_cbranch_execz .LBB17_2
; VI-NEXT:  ; %bb.1: ; %bb1
; VI-NEXT:    flat_store_dword v[0:1], v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:  .LBB17_2: ; %bb2
; VI-NEXT:    s_endpgm
;
; GFX9-IDXMODE-LABEL: insert_vgpr_offset_multiple_in_block:
; GFX9-IDXMODE:       ; %bb.0: ; %entry
; GFX9-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX9-IDXMODE-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    global_load_dword v3, v1, s[0:1] glc
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x64
; GFX9-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-IDXMODE-NEXT:    ;;#ASMSTART
; GFX9-IDXMODE-NEXT:    v_mov_b32 v1, 62
; GFX9-IDXMODE-NEXT:    ;;#ASMEND
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v4, s16
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v5, s17
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v6, s18
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v7, s19
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v8, s12
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v9, s13
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v10, s14
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v11, s15
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v12, s8
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v13, s9
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v14, s10
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v15, s11
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v16, s4
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v17, s5
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v18, s6
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v19, s7
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 12, v3
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v21, v4, v1, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 13, v3
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v22, v5, v1, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 14, v3
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v23, v6, v1, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 15, v3
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v24, v7, v1, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v3
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v25, v8, v1, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v3
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v26, v9, v1, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 10, v3
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v27, v10, v1, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 11, v3
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v11, v11, v1, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v3
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v7, v12, v1, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v3
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v8, v13, v1, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v3
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v9, v14, v1, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v3
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v10, v15, v1, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v12, v16, v1, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v4, v17, v1, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v3
; GFX9-IDXMODE-NEXT:    v_add_u32_e32 v20, 1, v3
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v5, v18, v1, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v3
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v3, v19, v1, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 3, v20
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v6, 63, v3, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 2, v20
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v5, 63, v5, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v20
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v4, 63, v4, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v20
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v3, 63, v12, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 7, v20
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v10, 63, v10, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 6, v20
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v9, 63, v9, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 5, v20
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v8, 63, v8, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 4, v20
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v7, 63, v7, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 11, v20
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v14, 63, v11, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 10, v20
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v13, 63, v27, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 9, v20
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v12, 63, v26, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 8, v20
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v11, 63, v25, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 15, v20
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v18, 63, v24, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 14, v20
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v17, 63, v23, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 13, v20
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v16, 63, v22, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_ne_u32_e32 vcc, 12, v20
; GFX9-IDXMODE-NEXT:    v_cndmask_b32_e32 v15, 63, v21, vcc
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v2, v[15:18], s[0:1] offset:48
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v2, v[11:14], s[0:1] offset:32
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v2, v[7:10], s[0:1] offset:16
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v2, v[3:6], s[0:1]
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX9-IDXMODE-NEXT:    s_cbranch_execz .LBB17_2
; GFX9-IDXMODE-NEXT:  ; %bb.1: ; %bb1
; GFX9-IDXMODE-NEXT:    global_store_dword v[0:1], v1, off
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:  .LBB17_2: ; %bb2
; GFX9-IDXMODE-NEXT:    s_endpgm
entry:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %id.ext = zext i32 %id to i64
  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %id.ext
  %idx0 = load volatile i32, ptr addrspace(1) %gep
  %idx1 = add i32 %idx0, 1
  %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"()
  %vec1 = insertelement <16 x i32> %vec0, i32 %live.out.val, i32 %idx0
  %vec2 = insertelement <16 x i32> %vec1, i32 63, i32 %idx1
  store volatile <16 x i32> %vec2, ptr addrspace(1) %out0
  %cmp = icmp eq i32 %id, 0
  br i1 %cmp, label %bb1, label %bb2

bb1:
  store volatile i32 %live.out.val, ptr addrspace(1) undef
  br label %bb2

bb2:
  ret void
}

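; Check that no extra v_mov copies are inserted inside the VGPR indexing
; sequence (e.g. between s_set_gpr_idx_on and s_set_gpr_idx_off) when a block
; contains more than one dynamic insert. The gpr_idx mode switching sequence is
; expanded late for this reason.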
define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %out1, i32 %in) {
; GENERIC-LABEL: insert_w_offset_multiple_in_block:
; GENERIC:       ; %bb.0: ; %entry
; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GENERIC-NEXT:    s_load_dword s4, s[2:3], 0xb
; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
; GENERIC-NEXT:    s_mov_b32 s2, -1
; GENERIC-NEXT:    v_mov_b32_e32 v0, 0x41500000
; GENERIC-NEXT:    v_mov_b32_e32 v8, 0x41880000
; GENERIC-NEXT:    v_mov_b32_e32 v1, 0x41600000
; GENERIC-NEXT:    v_mov_b32_e32 v2, 0x41700000
; GENERIC-NEXT:    v_mov_b32_e32 v3, 0x41800000
; GENERIC-NEXT:    v_mov_b32_e32 v4, 0x41100000
; GENERIC-NEXT:    v_mov_b32_e32 v5, 0x41200000
; GENERIC-NEXT:    v_mov_b32_e32 v6, 0x41300000
; GENERIC-NEXT:    v_mov_b32_e32 v7, 0x41400000
; GENERIC-NEXT:    v_mov_b32_e32 v9, 0x40a00000
; GENERIC-NEXT:    v_mov_b32_e32 v10, 0x40c00000
; GENERIC-NEXT:    v_mov_b32_e32 v11, 0x40e00000
; GENERIC-NEXT:    v_mov_b32_e32 v12, 0x41000000
; GENERIC-NEXT:    v_mov_b32_e32 v15, 0x40400000
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    s_add_i32 s5, s4, 1
; GENERIC-NEXT:    s_cmp_eq_u32 s5, 12
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s5, 13
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s5, 14
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s5, 15
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s5, 8
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s5, 9
; GENERIC-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s5, 10
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s5, 11
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s5, 4
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s5, 5
; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v10, v10, v8, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s5, 6
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v11, v11, v8, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s5, 7
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v12, v12, v8, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s5, 0
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v13, 1.0, v8, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s5, 1
; GENERIC-NEXT:    buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:16
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v14, 2.0, v8, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s5, 2
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v15, v15, v8, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s5, 3
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v16, 4.0, v8, vcc
; GENERIC-NEXT:    s_add_i32 s4, s4, 2
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 3
; GENERIC-NEXT:    buffer_store_dwordx4 v[13:16], off, s[0:3], 0
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_waitcnt expcnt(0)
; GENERIC-NEXT:    v_cndmask_b32_e32 v16, v8, v16, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 2
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v15, v8, v15, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 1
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v14, v8, v14, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 0
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v13, v8, v13, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 7
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v12, v8, v12, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 6
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v11, v8, v11, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 5
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v10, v8, v10, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 4
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v9, v8, v9, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 11
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 10
; GENERIC-NEXT:    buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:80
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 9
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 8
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 15
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 14
; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 13
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 12
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
; GENERIC-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
; GENERIC-NEXT:    buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:64
; GENERIC-NEXT:    s_endpgm
;
; NOOPT-LABEL: insert_w_offset_multiple_in_block:
; NOOPT:       ; %bb.0: ; %entry
; NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; NOOPT-NEXT:    s_load_dword s4, s[2:3], 0xb
; NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; NOOPT-NEXT:    s_mov_b32 s7, s1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s5, 0xf000
; NOOPT-NEXT:    s_mov_b32 s6, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s7
; NOOPT-NEXT:    s_mov_b32 s2, s6
; NOOPT-NEXT:    s_mov_b32 s3, s5
; NOOPT-NEXT:    s_mov_b32 s5, 1
; NOOPT-NEXT:    s_add_i32 s5, s4, s5
; NOOPT-NEXT:    s_mov_b32 s6, 0x41800000
; NOOPT-NEXT:    s_mov_b32 s7, 0x41700000
; NOOPT-NEXT:    s_mov_b32 s8, 0x41600000
; NOOPT-NEXT:    s_mov_b32 s9, 0x41500000
; NOOPT-NEXT:    s_mov_b32 s10, 0x41400000
; NOOPT-NEXT:    s_mov_b32 s11, 0x41300000
; NOOPT-NEXT:    s_mov_b32 s12, 0x41200000
; NOOPT-NEXT:    s_mov_b32 s13, 0x41100000
; NOOPT-NEXT:    s_mov_b32 s14, 0x41000000
; NOOPT-NEXT:    s_mov_b32 s15, 0x40e00000
; NOOPT-NEXT:    s_mov_b32 s16, 0x40c00000
; NOOPT-NEXT:    s_mov_b32 s17, 0x40a00000
; NOOPT-NEXT:    s_mov_b32 s18, 4.0
; NOOPT-NEXT:    s_mov_b32 s19, 0x40400000
; NOOPT-NEXT:    s_mov_b32 s20, 2.0
; NOOPT-NEXT:    s_mov_b32 s21, 1.0
; NOOPT-NEXT:    v_mov_b32_e32 v23, s21
; NOOPT-NEXT:    v_mov_b32_e32 v14, s20
; NOOPT-NEXT:    v_mov_b32_e32 v13, s19
; NOOPT-NEXT:    v_mov_b32_e32 v12, s18
; NOOPT-NEXT:    v_mov_b32_e32 v11, s17
; NOOPT-NEXT:    v_mov_b32_e32 v10, s16
; NOOPT-NEXT:    v_mov_b32_e32 v9, s15
; NOOPT-NEXT:    v_mov_b32_e32 v8, s14
; NOOPT-NEXT:    v_mov_b32_e32 v7, s13
; NOOPT-NEXT:    v_mov_b32_e32 v6, s12
; NOOPT-NEXT:    v_mov_b32_e32 v5, s11
; NOOPT-NEXT:    v_mov_b32_e32 v4, s10
; NOOPT-NEXT:    v_mov_b32_e32 v3, s9
; NOOPT-NEXT:    v_mov_b32_e32 v2, s8
; NOOPT-NEXT:    v_mov_b32_e32 v1, s7
; NOOPT-NEXT:    v_mov_b32_e32 v0, s6
; NOOPT-NEXT:    ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v24, v14
; NOOPT-NEXT:    v_mov_b32_e32 v25, v13
; NOOPT-NEXT:    v_mov_b32_e32 v26, v12
; NOOPT-NEXT:    v_mov_b32_e32 v27, v11
; NOOPT-NEXT:    v_mov_b32_e32 v28, v10
; NOOPT-NEXT:    v_mov_b32_e32 v29, v9
; NOOPT-NEXT:    v_mov_b32_e32 v30, v8
; NOOPT-NEXT:    v_mov_b32_e32 v31, v7
; NOOPT-NEXT:    v_mov_b32_e32 v32, v6
; NOOPT-NEXT:    v_mov_b32_e32 v33, v5
; NOOPT-NEXT:    v_mov_b32_e32 v34, v4
; NOOPT-NEXT:    v_mov_b32_e32 v35, v3
; NOOPT-NEXT:    v_mov_b32_e32 v36, v2
; NOOPT-NEXT:    v_mov_b32_e32 v37, v1
; NOOPT-NEXT:    v_mov_b32_e32 v38, v0
; NOOPT-NEXT:    v_mov_b32_e32 v0, 0x41880000
; NOOPT-NEXT:    s_mov_b32 m0, s5
; NOOPT-NEXT:    v_movreld_b32_e32 v23, v0
; NOOPT-NEXT:    s_mov_b32 s5, 2
; NOOPT-NEXT:    s_add_i32 s4, s4, s5
; NOOPT-NEXT:    s_mov_b32 m0, s4
; NOOPT-NEXT:    v_mov_b32_e32 v7, v23
; NOOPT-NEXT:    v_mov_b32_e32 v8, v24
; NOOPT-NEXT:    v_mov_b32_e32 v9, v25
; NOOPT-NEXT:    v_mov_b32_e32 v10, v26
; NOOPT-NEXT:    v_mov_b32_e32 v11, v27
; NOOPT-NEXT:    v_mov_b32_e32 v12, v28
; NOOPT-NEXT:    v_mov_b32_e32 v13, v29
; NOOPT-NEXT:    v_mov_b32_e32 v14, v30
; NOOPT-NEXT:    v_mov_b32_e32 v15, v31
; NOOPT-NEXT:    v_mov_b32_e32 v16, v32
; NOOPT-NEXT:    v_mov_b32_e32 v17, v33
; NOOPT-NEXT:    v_mov_b32_e32 v18, v34
; NOOPT-NEXT:    v_mov_b32_e32 v19, v35
; NOOPT-NEXT:    v_mov_b32_e32 v20, v36
; NOOPT-NEXT:    v_mov_b32_e32 v21, v37
; NOOPT-NEXT:    v_mov_b32_e32 v22, v38
; NOOPT-NEXT:    v_movreld_b32_e32 v7, v0
; NOOPT-NEXT:    v_mov_b32_e32 v4, v38
; NOOPT-NEXT:    v_mov_b32_e32 v5, v37
; NOOPT-NEXT:    v_mov_b32_e32 v6, v36
; NOOPT-NEXT:    v_mov_b32_e32 v0, v35
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; NOOPT-NEXT:    v_mov_b32_e32 v4, v34
; NOOPT-NEXT:    v_mov_b32_e32 v5, v33
; NOOPT-NEXT:    v_mov_b32_e32 v6, v32
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, v31
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; NOOPT-NEXT:    v_mov_b32_e32 v4, v30
; NOOPT-NEXT:    v_mov_b32_e32 v5, v29
; NOOPT-NEXT:    v_mov_b32_e32 v6, v28
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, v27
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; NOOPT-NEXT:    v_mov_b32_e32 v4, v26
; NOOPT-NEXT:    v_mov_b32_e32 v5, v25
; NOOPT-NEXT:    v_mov_b32_e32 v6, v24
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, v23
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; NOOPT-NEXT:    v_mov_b32_e32 v4, v22
; NOOPT-NEXT:    v_mov_b32_e32 v5, v21
; NOOPT-NEXT:    v_mov_b32_e32 v6, v20
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, v19
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
; NOOPT-NEXT:    v_mov_b32_e32 v4, v18
; NOOPT-NEXT:    v_mov_b32_e32 v5, v17
; NOOPT-NEXT:    v_mov_b32_e32 v6, v16
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, v15
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
; NOOPT-NEXT:    v_mov_b32_e32 v4, v14
; NOOPT-NEXT:    v_mov_b32_e32 v5, v13
; NOOPT-NEXT:    v_mov_b32_e32 v6, v12
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, v11
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
; NOOPT-NEXT:    v_mov_b32_e32 v4, v10
; NOOPT-NEXT:    v_mov_b32_e32 v5, v9
; NOOPT-NEXT:    v_mov_b32_e32 v6, v8
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, v7
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: insert_w_offset_multiple_in_block:
; SI-MOVREL:       ; %bb.0: ; %entry
; SI-MOVREL-NEXT:    s_load_dword s4, s[2:3], 0xb
; SI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-MOVREL-NEXT:    v_mov_b32_e32 v0, 1.0
; SI-MOVREL-NEXT:    v_mov_b32_e32 v1, 2.0
; SI-MOVREL-NEXT:    v_mov_b32_e32 v2, 0x40400000
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    s_add_i32 s2, s4, 1
; SI-MOVREL-NEXT:    v_mov_b32_e32 v3, 4.0
; SI-MOVREL-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v7, 0x41000000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v8, 0x41100000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v9, 0x41200000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v10, 0x41300000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v11, 0x41400000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v12, 0x41500000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v13, 0x41600000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v14, 0x41700000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v15, 0x41800000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v32, 0x41880000
; SI-MOVREL-NEXT:    s_mov_b32 m0, s2
; SI-MOVREL-NEXT:    v_movreld_b32_e32 v0, v32
; SI-MOVREL-NEXT:    s_add_i32 s4, s4, 2
; SI-MOVREL-NEXT:    v_mov_b32_e32 v31, v15
; SI-MOVREL-NEXT:    v_mov_b32_e32 v30, v14
; SI-MOVREL-NEXT:    v_mov_b32_e32 v29, v13
; SI-MOVREL-NEXT:    v_mov_b32_e32 v28, v12
; SI-MOVREL-NEXT:    v_mov_b32_e32 v27, v11
; SI-MOVREL-NEXT:    v_mov_b32_e32 v26, v10
; SI-MOVREL-NEXT:    v_mov_b32_e32 v25, v9
; SI-MOVREL-NEXT:    v_mov_b32_e32 v24, v8
; SI-MOVREL-NEXT:    v_mov_b32_e32 v23, v7
; SI-MOVREL-NEXT:    v_mov_b32_e32 v22, v6
; SI-MOVREL-NEXT:    v_mov_b32_e32 v21, v5
; SI-MOVREL-NEXT:    v_mov_b32_e32 v20, v4
; SI-MOVREL-NEXT:    v_mov_b32_e32 v19, v3
; SI-MOVREL-NEXT:    v_mov_b32_e32 v18, v2
; SI-MOVREL-NEXT:    v_mov_b32_e32 v17, v1
; SI-MOVREL-NEXT:    v_mov_b32_e32 v16, v0
; SI-MOVREL-NEXT:    s_mov_b32 m0, s4
; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
; SI-MOVREL-NEXT:    v_movreld_b32_e32 v16, v32
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64
; SI-MOVREL-NEXT:    s_endpgm
;
; VI-MOVREL-LABEL: insert_w_offset_multiple_in_block:
; VI-MOVREL:       ; %bb.0: ; %entry
; VI-MOVREL-NEXT:    s_load_dword s4, s[2:3], 0x2c
; VI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, 1.0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v1, 2.0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v2, 0x40400000
; VI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; VI-MOVREL-NEXT:    s_add_i32 s2, s4, 1
; VI-MOVREL-NEXT:    v_mov_b32_e32 v3, 4.0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v7, 0x41000000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v8, 0x41100000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v9, 0x41200000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v10, 0x41300000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v11, 0x41400000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v12, 0x41500000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v13, 0x41600000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v14, 0x41700000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v15, 0x41800000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v32, 0x41880000
; VI-MOVREL-NEXT:    s_mov_b32 m0, s2
; VI-MOVREL-NEXT:    s_add_i32 s4, s4, 2
; VI-MOVREL-NEXT:    v_movreld_b32_e32 v0, v32
; VI-MOVREL-NEXT:    v_mov_b32_e32 v31, v15
; VI-MOVREL-NEXT:    s_add_u32 s2, s0, 48
; VI-MOVREL-NEXT:    v_mov_b32_e32 v30, v14
; VI-MOVREL-NEXT:    v_mov_b32_e32 v29, v13
; VI-MOVREL-NEXT:    v_mov_b32_e32 v28, v12
; VI-MOVREL-NEXT:    v_mov_b32_e32 v27, v11
; VI-MOVREL-NEXT:    v_mov_b32_e32 v26, v10
; VI-MOVREL-NEXT:    v_mov_b32_e32 v25, v9
; VI-MOVREL-NEXT:    v_mov_b32_e32 v24, v8
; VI-MOVREL-NEXT:    v_mov_b32_e32 v23, v7
; VI-MOVREL-NEXT:    v_mov_b32_e32 v22, v6
; VI-MOVREL-NEXT:    v_mov_b32_e32 v21, v5
; VI-MOVREL-NEXT:    v_mov_b32_e32 v20, v4
; VI-MOVREL-NEXT:    v_mov_b32_e32 v19, v3
; VI-MOVREL-NEXT:    v_mov_b32_e32 v18, v2
; VI-MOVREL-NEXT:    v_mov_b32_e32 v17, v1
; VI-MOVREL-NEXT:    v_mov_b32_e32 v16, v0
; VI-MOVREL-NEXT:    s_mov_b32 m0, s4
; VI-MOVREL-NEXT:    s_addc_u32 s3, s1, 0
; VI-MOVREL-NEXT:    v_movreld_b32_e32 v16, v32
; VI-MOVREL-NEXT:    v_mov_b32_e32 v33, s3
; VI-MOVREL-NEXT:    v_mov_b32_e32 v32, s2
; VI-MOVREL-NEXT:    s_add_u32 s2, s0, 32
; VI-MOVREL-NEXT:    s_addc_u32 s3, s1, 0
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[32:33], v[12:15]
; VI-MOVREL-NEXT:    s_nop 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v13, s3
; VI-MOVREL-NEXT:    v_mov_b32_e32 v12, s2
; VI-MOVREL-NEXT:    s_add_u32 s2, s0, 16
; VI-MOVREL-NEXT:    s_addc_u32 s3, s1, 0
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
; VI-MOVREL-NEXT:    s_nop 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v9, s3
; VI-MOVREL-NEXT:    v_mov_b32_e32 v8, s2
; VI-MOVREL-NEXT:    s_add_u32 s2, s0, 64
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; VI-MOVREL-NEXT:    s_addc_u32 s3, s1, 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, s1
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, s0
; VI-MOVREL-NEXT:    s_add_u32 s4, s0, 0x70
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-MOVREL-NEXT:    s_addc_u32 s5, s1, 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, s4
; VI-MOVREL-NEXT:    v_mov_b32_e32 v1, s5
; VI-MOVREL-NEXT:    s_add_u32 s4, s0, 0x60
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[0:1], v[28:31]
; VI-MOVREL-NEXT:    s_addc_u32 s5, s1, 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, s4
; VI-MOVREL-NEXT:    v_mov_b32_e32 v1, s5
; VI-MOVREL-NEXT:    s_add_u32 s0, s0, 0x50
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[0:1], v[24:27]
; VI-MOVREL-NEXT:    s_addc_u32 s1, s1, 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, s0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v1, s1
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[0:1], v[20:23]
; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, s2
; VI-MOVREL-NEXT:    v_mov_b32_e32 v1, s3
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[0:1], v[16:19]
; VI-MOVREL-NEXT:    s_endpgm
;
; VI-IDXMODE-LABEL: insert_w_offset_multiple_in_block:
; VI-IDXMODE:       ; %bb.0: ; %entry
; VI-IDXMODE-NEXT:    s_load_dword s4, s[2:3], 0x2c
; VI-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, 1.0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, 2.0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v2, 0x40400000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v3, 4.0
; VI-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; VI-IDXMODE-NEXT:    s_add_i32 s2, s4, 1
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v7, 0x41000000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v8, 0x41100000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v9, 0x41200000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v10, 0x41300000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v11, 0x41400000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v12, 0x41500000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v13, 0x41600000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v14, 0x41700000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v15, 0x41800000
; VI-IDXMODE-NEXT:    s_add_i32 s4, s4, 2
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v32, 0x41880000
; VI-IDXMODE-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, v32
; VI-IDXMODE-NEXT:    s_set_gpr_idx_off
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v31, v15
; VI-IDXMODE-NEXT:    s_add_u32 s2, s0, 48
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v30, v14
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v29, v13
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v28, v12
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v27, v11
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v26, v10
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v25, v9
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v24, v8
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v23, v7
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v22, v6
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v21, v5
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v20, v4
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v19, v3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v18, v2
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v17, v1
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v16, v0
; VI-IDXMODE-NEXT:    s_addc_u32 s3, s1, 0
; VI-IDXMODE-NEXT:    s_set_gpr_idx_on s4, gpr_idx(DST)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v16, v32
; VI-IDXMODE-NEXT:    s_set_gpr_idx_off
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v33, s3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v32, s2
; VI-IDXMODE-NEXT:    s_add_u32 s2, s0, 32
; VI-IDXMODE-NEXT:    s_addc_u32 s3, s1, 0
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[32:33], v[12:15]
; VI-IDXMODE-NEXT:    s_nop 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v13, s3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v12, s2
; VI-IDXMODE-NEXT:    s_add_u32 s2, s0, 16
; VI-IDXMODE-NEXT:    s_addc_u32 s3, s1, 0
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
; VI-IDXMODE-NEXT:    s_nop 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v9, s3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v8, s2
; VI-IDXMODE-NEXT:    s_add_u32 s2, s0, 64
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; VI-IDXMODE-NEXT:    s_addc_u32 s3, s1, 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, s1
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, s0
; VI-IDXMODE-NEXT:    s_add_u32 s4, s0, 0x70
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-IDXMODE-NEXT:    s_addc_u32 s5, s1, 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, s4
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, s5
; VI-IDXMODE-NEXT:    s_add_u32 s4, s0, 0x60
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[0:1], v[28:31]
; VI-IDXMODE-NEXT:    s_addc_u32 s5, s1, 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, s4
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, s5
; VI-IDXMODE-NEXT:    s_add_u32 s0, s0, 0x50
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[0:1], v[24:27]
; VI-IDXMODE-NEXT:    s_addc_u32 s1, s1, 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, s0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, s1
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[0:1], v[20:23]
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, s2
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, s3
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[0:1], v[16:19]
; VI-IDXMODE-NEXT:    s_endpgm
;
; GFX9-IDXMODE-LABEL: insert_w_offset_multiple_in_block:
; GFX9-IDXMODE:       ; %bb.0: ; %entry
; GFX9-IDXMODE-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GFX9-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, 1.0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v2, 0x40400000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v3, 4.0
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    s_add_i32 s2, s4, 1
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v5, 0x40c00000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v6, 0x40e00000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v7, 0x41000000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v8, 0x41100000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v9, 0x41200000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v10, 0x41300000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v11, 0x41400000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v12, 0x41500000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v13, 0x41600000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v14, 0x41700000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v15, 0x41800000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v32, 0x41880000
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, v32
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_off
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v31, v15
; GFX9-IDXMODE-NEXT:    s_add_i32 s4, s4, 2
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v30, v14
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v29, v13
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v28, v12
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v27, v11
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v26, v10
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v25, v9
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v24, v8
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v23, v7
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v22, v6
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v21, v5
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v20, v4
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v19, v3
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v18, v2
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v17, v1
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v16, v0
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_on s4, gpr_idx(DST)
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v16, v32
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_off
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v32, 0
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
; GFX9-IDXMODE-NEXT:    s_endpgm
entry:
  ; First dynamic insert: write 17.0 into the constant vector <1.0 .. 16.0> at
  ; the runtime index %in + 1.
  %add1 = add i32 %in, 1
  %ins1 = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add1
  ; Second dynamic insert in the same block: write 17.0 at the runtime index
  ; %in + 2, on top of the result of the first insert.
  %add2 = add i32 %in, 2
  %ins2 = insertelement <16 x float> %ins1, float 17.0, i32 %add2
  ; Both vectors are stored, so %ins1 is still used after %ins2 is formed:
  ; %ins1 goes to %out1 and %ins2 to the next <16 x float> slot after it.
  store <16 x float> %ins1, ptr addrspace(1) %out1
  %out2 = getelementptr <16 x float>, ptr addrspace(1) %out1, i32 1
  store <16 x float> %ins2, ptr addrspace(1) %out2

  ret void
}

; Make sure we don't hit use of undefined register errors when expanding an
; extract with undef index.
define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) {
; GENERIC-LABEL: extract_adjacent_blocks:
; GENERIC:       ; %bb.0: ; %bb
; GENERIC-NEXT:    s_load_dword s0, s[2:3], 0x9
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    s_cmp_lg_u32 s0, 0
; GENERIC-NEXT:    s_cbranch_scc0 .LBB19_4
; GENERIC-NEXT:  ; %bb.1: ; %bb4
; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
; GENERIC-NEXT:    s_mov_b32 s2, -1
; GENERIC-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    ;;#ASMSTART
; GENERIC-NEXT:    ; reg use v[0:3]
; GENERIC-NEXT:    ;;#ASMEND
; GENERIC-NEXT:    s_mov_b64 vcc, exec
; GENERIC-NEXT:    s_cbranch_execnz .LBB19_3
; GENERIC-NEXT:  .LBB19_2: ; %bb1
; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
; GENERIC-NEXT:    s_mov_b32 s2, -1
; GENERIC-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    ;;#ASMSTART
; GENERIC-NEXT:    ; reg use v[0:3]
; GENERIC-NEXT:    ;;#ASMEND
; GENERIC-NEXT:  .LBB19_3: ; %bb7
; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
; GENERIC-NEXT:    s_mov_b32 s2, -1
; GENERIC-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    s_endpgm
; GENERIC-NEXT:  .LBB19_4:
; GENERIC-NEXT:    s_mov_b64 vcc, 0
; GENERIC-NEXT:    s_branch .LBB19_2
;
; NOOPT-LABEL: extract_adjacent_blocks:
; NOOPT:       ; %bb.0: ; %bb
; NOOPT-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; NOOPT-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; NOOPT-NEXT:    s_mov_b32 s14, -1
; NOOPT-NEXT:    s_mov_b32 s15, 0xe8f000
; NOOPT-NEXT:    s_add_u32 s12, s12, s9
; NOOPT-NEXT:    s_addc_u32 s13, s13, 0
; NOOPT-NEXT:    s_load_dword s2, s[2:3], 0x9
; NOOPT-NEXT:    s_mov_b64 s[0:1], -1
; NOOPT-NEXT:    ; implicit-def: $sgpr3
; NOOPT-NEXT:    s_mov_b32 s3, 0
; NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; NOOPT-NEXT:    s_cmp_lg_u32 s2, s3
; NOOPT-NEXT:    ; implicit-def: $vgpr4 : SGPR spill to VGPR lane
; NOOPT-NEXT:    v_writelane_b32 v4, s0, 0
; NOOPT-NEXT:    v_writelane_b32 v4, s1, 1
; NOOPT-NEXT:    s_mov_b64 s[8:9], exec
; NOOPT-NEXT:    s_mov_b64 exec, -1
; NOOPT-NEXT:    buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[8:9]
; NOOPT-NEXT:    s_cbranch_scc1 .LBB19_3
; NOOPT-NEXT:  .LBB19_1: ; %Flow
; NOOPT-NEXT:    s_or_saveexec_b64 s[8:9], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[8:9]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v4, 0
; NOOPT-NEXT:    v_readlane_b32 s1, v4, 1
; NOOPT-NEXT:    ; implicit-def: $sgpr2
; NOOPT-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; NOOPT-NEXT:    s_mov_b32 s0, 1
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    v_cmp_ne_u32_e64 s[0:1], v0, s0
; NOOPT-NEXT:    s_and_b64 vcc, exec, s[0:1]
; NOOPT-NEXT:    s_cbranch_vccnz .LBB19_4
; NOOPT-NEXT:  ; %bb.2: ; %bb1
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s6, s1
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s4, 0xf000
; NOOPT-NEXT:    s_mov_b32 s5, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s6
; NOOPT-NEXT:    s_mov_b32 s2, s5
; NOOPT-NEXT:    s_mov_b32 s3, s4
; NOOPT-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    ; implicit-def: $sgpr0
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    ;;#ASMSTART
; NOOPT-NEXT:    ; reg use v[0:3]
; NOOPT-NEXT:    ;;#ASMEND
; NOOPT-NEXT:    s_branch .LBB19_4
; NOOPT-NEXT:  .LBB19_3: ; %bb4
; NOOPT-NEXT:    s_or_saveexec_b64 s[8:9], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[8:9]
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s6, s1
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s4, 0xf000
; NOOPT-NEXT:    s_mov_b32 s5, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s6
; NOOPT-NEXT:    s_mov_b32 s2, s5
; NOOPT-NEXT:    s_mov_b32 s3, s4
; NOOPT-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    ; implicit-def: $sgpr0
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    ;;#ASMSTART
; NOOPT-NEXT:    ; reg use v[0:3]
; NOOPT-NEXT:    ;;#ASMEND
; NOOPT-NEXT:    s_mov_b64 s[0:1], 0
; NOOPT-NEXT:    v_writelane_b32 v4, s0, 0
; NOOPT-NEXT:    v_writelane_b32 v4, s1, 1
; NOOPT-NEXT:    s_or_saveexec_b64 s[8:9], -1
; NOOPT-NEXT:    buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[8:9]
; NOOPT-NEXT:    s_branch .LBB19_1
; NOOPT-NEXT:  .LBB19_4: ; %bb7
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s7, s1
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s5, 0xf000
; NOOPT-NEXT:    s_mov_b32 s6, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s7
; NOOPT-NEXT:    s_mov_b32 s2, s6
; NOOPT-NEXT:    s_mov_b32 s3, s5
; NOOPT-NEXT:    v_mov_b32_e32 v0, s4
; NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: extract_adjacent_blocks:
; SI-MOVREL:       ; %bb.0: ; %bb
; SI-MOVREL-NEXT:    s_load_dword s0, s[2:3], 0x9
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    s_cmp_lg_u32 s0, 0
; SI-MOVREL-NEXT:    s_cbranch_scc0 .LBB19_4
; SI-MOVREL-NEXT:  ; %bb.1: ; %bb4
; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
; SI-MOVREL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    ;;#ASMSTART
; SI-MOVREL-NEXT:    ; reg use v[0:3]
; SI-MOVREL-NEXT:    ;;#ASMEND
; SI-MOVREL-NEXT:    s_cbranch_execnz .LBB19_3
; SI-MOVREL-NEXT:  .LBB19_2: ; %bb1
; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
; SI-MOVREL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    ;;#ASMSTART
; SI-MOVREL-NEXT:    ; reg use v[0:3]
; SI-MOVREL-NEXT:    ;;#ASMEND
; SI-MOVREL-NEXT:  .LBB19_3: ; %bb7
; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
; SI-MOVREL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    s_endpgm
; SI-MOVREL-NEXT:  .LBB19_4:
; SI-MOVREL-NEXT:    s_branch .LBB19_2
;
; VI-LABEL: extract_adjacent_blocks:
; VI:       ; %bb.0: ; %bb
; VI-NEXT:    s_load_dword s0, s[2:3], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_lg_u32 s0, 0
; VI-NEXT:    s_cbranch_scc0 .LBB19_4
; VI-NEXT:  ; %bb.1: ; %bb4
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    ;;#ASMSTART
; VI-NEXT:    ; reg use v[0:3]
; VI-NEXT:    ;;#ASMEND
; VI-NEXT:    s_cbranch_execnz .LBB19_3
; VI-NEXT:  .LBB19_2: ; %bb1
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    ;;#ASMSTART
; VI-NEXT:    ; reg use v[0:3]
; VI-NEXT:    ;;#ASMEND
; VI-NEXT:  .LBB19_3: ; %bb7
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
; VI-NEXT:  .LBB19_4:
; VI-NEXT:    s_branch .LBB19_2
;
; GFX9-IDXMODE-LABEL: extract_adjacent_blocks:
; GFX9-IDXMODE:       ; %bb.0: ; %bb
; GFX9-IDXMODE-NEXT:    s_load_dword s0, s[2:3], 0x24
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    s_cmp_lg_u32 s0, 0
; GFX9-IDXMODE-NEXT:    s_cbranch_scc0 .LBB19_4
; GFX9-IDXMODE-NEXT:  ; %bb.1: ; %bb4
; GFX9-IDXMODE-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off glc
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    ;;#ASMSTART
; GFX9-IDXMODE-NEXT:    ; reg use v[0:3]
; GFX9-IDXMODE-NEXT:    ;;#ASMEND
; GFX9-IDXMODE-NEXT:    s_cbranch_execnz .LBB19_3
; GFX9-IDXMODE-NEXT:  .LBB19_2: ; %bb1
; GFX9-IDXMODE-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off glc
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    ;;#ASMSTART
; GFX9-IDXMODE-NEXT:    ; reg use v[0:3]
; GFX9-IDXMODE-NEXT:    ;;#ASMEND
; GFX9-IDXMODE-NEXT:  .LBB19_3: ; %bb7
; GFX9-IDXMODE-NEXT:    global_store_dword v[0:1], v0, off
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    s_endpgm
; GFX9-IDXMODE-NEXT:  .LBB19_4:
; GFX9-IDXMODE-NEXT:    s_branch .LBB19_2
bb:
  %tmp = icmp eq i32 %arg, 0
  br i1 %tmp, label %bb1, label %bb4

bb1:
  %tmp2 = load volatile <4 x float>, ptr addrspace(1) undef
  %tmp3 = extractelement <4 x float> %tmp2, i32 undef
  call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp2) ; Prevent block optimize out
  br label %bb7

bb4:
  %tmp5 = load volatile <4 x float>, ptr addrspace(1) undef
  %tmp6 = extractelement <4 x float> %tmp5, i32 undef
  call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp5) ; Prevent block optimize out
  br label %bb7

bb7:
  %tmp8 = phi float [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
  store volatile float %tmp8, ptr addrspace(1) undef
  ret void
}

; insert_adjacent_blocks: insertelement counterpart of extract_adjacent_blocks.
; Two sibling blocks (%bb1 and %bb4) each insert %val0 into a freshly loaded
; <4 x float> at an undef index, and the two results meet in a phi in %bb7.
; The inline asm in each arm consumes the inserted vector so the arm is not
; optimized away. None of the expected sequences below contains a movrel or
; gpr-index instruction for this function.
define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) {
; GENERIC-LABEL: insert_adjacent_blocks:
; GENERIC:       ; %bb.0: ; %bb
; GENERIC-NEXT:    s_load_dword s0, s[2:3], 0x9
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    s_cmp_lg_u32 s0, 0
; GENERIC-NEXT:    s_cbranch_scc0 .LBB20_4
; GENERIC-NEXT:  ; %bb.1: ; %bb4
; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
; GENERIC-NEXT:    s_mov_b32 s2, -1
; GENERIC-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    ;;#ASMSTART
; GENERIC-NEXT:    ; reg use v[0:3]
; GENERIC-NEXT:    ;;#ASMEND
; GENERIC-NEXT:    s_mov_b64 vcc, exec
; GENERIC-NEXT:    s_cbranch_execnz .LBB20_3
; GENERIC-NEXT:  .LBB20_2: ; %bb1
; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
; GENERIC-NEXT:    s_mov_b32 s2, -1
; GENERIC-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    ;;#ASMSTART
; GENERIC-NEXT:    ; reg use v[0:3]
; GENERIC-NEXT:    ;;#ASMEND
; GENERIC-NEXT:  .LBB20_3: ; %bb7
; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
; GENERIC-NEXT:    s_mov_b32 s2, -1
; GENERIC-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    s_endpgm
; GENERIC-NEXT:  .LBB20_4:
; GENERIC-NEXT:    s_mov_b64 vcc, 0
; GENERIC-NEXT:    s_branch .LBB20_2
;
; NOOPT-LABEL: insert_adjacent_blocks:
; NOOPT:       ; %bb.0: ; %bb
; NOOPT-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
; NOOPT-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
; NOOPT-NEXT:    s_mov_b32 s18, -1
; NOOPT-NEXT:    s_mov_b32 s19, 0xe8f000
; NOOPT-NEXT:    s_add_u32 s16, s16, s9
; NOOPT-NEXT:    s_addc_u32 s17, s17, 0
; NOOPT-NEXT:    s_mov_b64 s[0:1], s[2:3]
; NOOPT-NEXT:    s_load_dword s2, s[0:1], 0x9
; NOOPT-NEXT:    s_load_dword s0, s[0:1], 0xa
; NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; NOOPT-NEXT:    s_mov_b64 s[0:1], -1
; NOOPT-NEXT:    ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7
; NOOPT-NEXT:    s_mov_b32 s3, 0
; NOOPT-NEXT:    s_cmp_lg_u32 s2, s3
; NOOPT-NEXT:    ; implicit-def: $vgpr4 : SGPR spill to VGPR lane
; NOOPT-NEXT:    v_writelane_b32 v4, s0, 0
; NOOPT-NEXT:    v_writelane_b32 v4, s1, 1
; NOOPT-NEXT:    s_mov_b64 s[12:13], exec
; NOOPT-NEXT:    s_mov_b64 exec, -1
; NOOPT-NEXT:    buffer_store_dword v4, off, s[16:19], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[12:13]
; NOOPT-NEXT:    s_cbranch_scc1 .LBB20_3
; NOOPT-NEXT:  .LBB20_1: ; %Flow
; NOOPT-NEXT:    s_or_saveexec_b64 s[12:13], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v4, off, s[16:19], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[12:13]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v4, 0
; NOOPT-NEXT:    v_readlane_b32 s1, v4, 1
; NOOPT-NEXT:    ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7
; NOOPT-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; NOOPT-NEXT:    s_mov_b32 s0, 1
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    v_cmp_ne_u32_e64 s[0:1], v0, s0
; NOOPT-NEXT:    s_and_b64 vcc, exec, s[0:1]
; NOOPT-NEXT:    s_cbranch_vccnz .LBB20_4
; NOOPT-NEXT:  ; %bb.2: ; %bb1
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s6, s1
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s4, 0xf000
; NOOPT-NEXT:    s_mov_b32 s5, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s6
; NOOPT-NEXT:    s_mov_b32 s2, s5
; NOOPT-NEXT:    s_mov_b32 s3, s4
; NOOPT-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    ;;#ASMSTART
; NOOPT-NEXT:    ; reg use v[0:3]
; NOOPT-NEXT:    ;;#ASMEND
; NOOPT-NEXT:    s_branch .LBB20_4
; NOOPT-NEXT:  .LBB20_3: ; %bb4
; NOOPT-NEXT:    s_or_saveexec_b64 s[12:13], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v4, off, s[16:19], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[12:13]
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s6, s1
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s4, 0xf000
; NOOPT-NEXT:    s_mov_b32 s5, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s6
; NOOPT-NEXT:    s_mov_b32 s2, s5
; NOOPT-NEXT:    s_mov_b32 s3, s4
; NOOPT-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    ;;#ASMSTART
; NOOPT-NEXT:    ; reg use v[0:3]
; NOOPT-NEXT:    ;;#ASMEND
; NOOPT-NEXT:    s_mov_b64 s[0:1], 0
; NOOPT-NEXT:    v_writelane_b32 v4, s0, 0
; NOOPT-NEXT:    v_writelane_b32 v4, s1, 1
; NOOPT-NEXT:    s_or_saveexec_b64 s[12:13], -1
; NOOPT-NEXT:    buffer_store_dword v4, off, s[16:19], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[12:13]
; NOOPT-NEXT:    s_branch .LBB20_1
; NOOPT-NEXT:  .LBB20_4: ; %bb7
; NOOPT-NEXT:    ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s10, s1
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s8, 0xf000
; NOOPT-NEXT:    s_mov_b32 s9, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s10
; NOOPT-NEXT:    s_mov_b32 s2, s9
; NOOPT-NEXT:    s_mov_b32 s3, s8
; NOOPT-NEXT:    v_mov_b32_e32 v0, s4
; NOOPT-NEXT:    v_mov_b32_e32 v1, s5
; NOOPT-NEXT:    v_mov_b32_e32 v2, s6
; NOOPT-NEXT:    v_mov_b32_e32 v3, s7
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: insert_adjacent_blocks:
; SI-MOVREL:       ; %bb.0: ; %bb
; SI-MOVREL-NEXT:    s_load_dword s0, s[2:3], 0x9
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    s_cmp_lg_u32 s0, 0
; SI-MOVREL-NEXT:    s_cbranch_scc0 .LBB20_4
; SI-MOVREL-NEXT:  ; %bb.1: ; %bb4
; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
; SI-MOVREL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    ;;#ASMSTART
; SI-MOVREL-NEXT:    ; reg use v[0:3]
; SI-MOVREL-NEXT:    ;;#ASMEND
; SI-MOVREL-NEXT:    s_cbranch_execnz .LBB20_3
; SI-MOVREL-NEXT:  .LBB20_2: ; %bb1
; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
; SI-MOVREL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    ;;#ASMSTART
; SI-MOVREL-NEXT:    ; reg use v[0:3]
; SI-MOVREL-NEXT:    ;;#ASMEND
; SI-MOVREL-NEXT:  .LBB20_3: ; %bb7
; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    s_endpgm
; SI-MOVREL-NEXT:  .LBB20_4:
; SI-MOVREL-NEXT:    s_branch .LBB20_2
;
; VI-LABEL: insert_adjacent_blocks:
; VI:       ; %bb.0: ; %bb
; VI-NEXT:    s_load_dword s0, s[2:3], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_lg_u32 s0, 0
; VI-NEXT:    s_cbranch_scc0 .LBB20_4
; VI-NEXT:  ; %bb.1: ; %bb4
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    ;;#ASMSTART
; VI-NEXT:    ; reg use v[0:3]
; VI-NEXT:    ;;#ASMEND
; VI-NEXT:    s_cbranch_execnz .LBB20_3
; VI-NEXT:  .LBB20_2: ; %bb1
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    ;;#ASMSTART
; VI-NEXT:    ; reg use v[0:3]
; VI-NEXT:    ;;#ASMEND
; VI-NEXT:  .LBB20_3: ; %bb7
; VI-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
; VI-NEXT:  .LBB20_4:
; VI-NEXT:    s_branch .LBB20_2
;
; GFX9-IDXMODE-LABEL: insert_adjacent_blocks:
; GFX9-IDXMODE:       ; %bb.0: ; %bb
; GFX9-IDXMODE-NEXT:    s_load_dword s0, s[2:3], 0x24
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    s_cmp_lg_u32 s0, 0
; GFX9-IDXMODE-NEXT:    s_cbranch_scc0 .LBB20_4
; GFX9-IDXMODE-NEXT:  ; %bb.1: ; %bb4
; GFX9-IDXMODE-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off glc
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    ;;#ASMSTART
; GFX9-IDXMODE-NEXT:    ; reg use v[0:3]
; GFX9-IDXMODE-NEXT:    ;;#ASMEND
; GFX9-IDXMODE-NEXT:    s_cbranch_execnz .LBB20_3
; GFX9-IDXMODE-NEXT:  .LBB20_2: ; %bb1
; GFX9-IDXMODE-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off glc
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    ;;#ASMSTART
; GFX9-IDXMODE-NEXT:    ; reg use v[0:3]
; GFX9-IDXMODE-NEXT:    ;;#ASMEND
; GFX9-IDXMODE-NEXT:  .LBB20_3: ; %bb7
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    s_endpgm
; GFX9-IDXMODE-NEXT:  .LBB20_4:
; GFX9-IDXMODE-NEXT:    s_branch .LBB20_2
bb:
  ; %arg == 0 takes the %bb1 arm; any other value takes the %bb4 arm.
  %tmp = icmp eq i32 %arg, 0
  br i1 %tmp, label %bb1, label %bb4

bb1:
  ; First arm: volatile vector load from an undef global pointer, then an
  ; insertelement of %val0 at an undef index.
  %tmp2 = load volatile <4 x float>, ptr addrspace(1) undef
  %tmp3 = insertelement <4 x float> %tmp2, float %val0, i32 undef
  call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp3) ; Prevent block optimize out
  br label %bb7

bb4:
  ; Second arm: same sequence as %bb1, operating on its own loaded vector.
  %tmp5 = load volatile <4 x float>, ptr addrspace(1) undef
  %tmp6 = insertelement <4 x float> %tmp5, float %val0, i32 undef
  call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp6) ; Prevent block optimize out
  br label %bb7

bb7:
  ; Join point: the phi picks whichever arm's inserted vector reached here and
  ; the volatile store consumes it.
  %tmp8 = phi <4 x float> [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
  store volatile <4 x float> %tmp8, ptr addrspace(1) undef
  ret void
}

; FIXME: Should be able to fold zero input to movreld to inline imm?
; multi_same_block: two dynamic insertelements in a single basic block that use
; the same index expression (%arg - 16), each into its own constant
; <9 x float>. In the expected output below, the optimized movrel runs set m0
; once ahead of both v_movreld instructions, and the gpr-index-mode runs wrap
; both v_mov writes in one s_set_gpr_idx_on/off pair; the -O0 run recomputes m0
; before each v_movreld, and the run without -mcpu uses compare/select instead.
define amdgpu_kernel void @multi_same_block(i32 %arg) {
; GENERIC-LABEL: multi_same_block:
; GENERIC:       ; %bb.0: ; %bb
; GENERIC-NEXT:    s_load_dword s0, s[2:3], 0x9
; GENERIC-NEXT:    v_mov_b32_e32 v0, 0x41900000
; GENERIC-NEXT:    v_mov_b32_e32 v1, 0x41b0cccd
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    s_add_i32 s2, s0, -16
; GENERIC-NEXT:    s_cmp_eq_u32 s2, 1
; GENERIC-NEXT:    s_cselect_b64 s[0:1], -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e64 v0, v0, 4.0, s[0:1]
; GENERIC-NEXT:    s_cmp_eq_u32 s2, 5
; GENERIC-NEXT:    s_cselect_b64 s[0:1], -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e64 v1, v1, -4.0, s[0:1]
; GENERIC-NEXT:    s_mov_b32 m0, -1
; GENERIC-NEXT:    ds_write_b32 v0, v0
; GENERIC-NEXT:    ds_write_b32 v0, v1
; GENERIC-NEXT:    s_endpgm
;
; NOOPT-LABEL: multi_same_block:
; NOOPT:       ; %bb.0: ; %bb
; NOOPT-NEXT:    s_load_dword s0, s[2:3], 0x9
; NOOPT-NEXT:    s_mov_b32 s8, 0x41900000
; NOOPT-NEXT:    ; implicit-def: $sgpr9
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr7
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr6
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr5
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr3
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr2
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr10
; NOOPT-NEXT:    v_mov_b32_e32 v12, s9
; NOOPT-NEXT:    v_mov_b32_e32 v7, s8
; NOOPT-NEXT:    v_mov_b32_e32 v6, s7
; NOOPT-NEXT:    v_mov_b32_e32 v5, s6
; NOOPT-NEXT:    v_mov_b32_e32 v4, s5
; NOOPT-NEXT:    v_mov_b32_e32 v3, s4
; NOOPT-NEXT:    v_mov_b32_e32 v2, s3
; NOOPT-NEXT:    v_mov_b32_e32 v1, s2
; NOOPT-NEXT:    v_mov_b32_e32 v0, s1
; NOOPT-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v13, v7
; NOOPT-NEXT:    v_mov_b32_e32 v14, v6
; NOOPT-NEXT:    v_mov_b32_e32 v15, v5
; NOOPT-NEXT:    v_mov_b32_e32 v16, v4
; NOOPT-NEXT:    v_mov_b32_e32 v17, v3
; NOOPT-NEXT:    v_mov_b32_e32 v18, v2
; NOOPT-NEXT:    v_mov_b32_e32 v19, v1
; NOOPT-NEXT:    v_mov_b32_e32 v20, v0
; NOOPT-NEXT:    v_mov_b32_e32 v0, 4.0
; NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; NOOPT-NEXT:    s_add_i32 m0, s0, -16
; NOOPT-NEXT:    v_movreld_b32_e32 v12, v0
; NOOPT-NEXT:    s_mov_b32 s4, 0x41b0cccd
; NOOPT-NEXT:    ; implicit-def: $sgpr9
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr8
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr7
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr6
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr5
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr3
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr2
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    ; implicit-def: $sgpr10
; NOOPT-NEXT:    v_mov_b32_e32 v3, s9
; NOOPT-NEXT:    v_mov_b32_e32 v25, s8
; NOOPT-NEXT:    v_mov_b32_e32 v24, s7
; NOOPT-NEXT:    v_mov_b32_e32 v23, s6
; NOOPT-NEXT:    v_mov_b32_e32 v22, s5
; NOOPT-NEXT:    v_mov_b32_e32 v21, s4
; NOOPT-NEXT:    v_mov_b32_e32 v2, s3
; NOOPT-NEXT:    v_mov_b32_e32 v1, s2
; NOOPT-NEXT:    v_mov_b32_e32 v0, s1
; NOOPT-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v4, v25
; NOOPT-NEXT:    v_mov_b32_e32 v5, v24
; NOOPT-NEXT:    v_mov_b32_e32 v6, v23
; NOOPT-NEXT:    v_mov_b32_e32 v7, v22
; NOOPT-NEXT:    v_mov_b32_e32 v8, v21
; NOOPT-NEXT:    v_mov_b32_e32 v9, v2
; NOOPT-NEXT:    v_mov_b32_e32 v10, v1
; NOOPT-NEXT:    v_mov_b32_e32 v11, v0
; NOOPT-NEXT:    v_mov_b32_e32 v0, -4.0
; NOOPT-NEXT:    s_add_i32 m0, s0, -16
; NOOPT-NEXT:    v_movreld_b32_e32 v3, v0
; NOOPT-NEXT:    v_mov_b32_e32 v2, v13
; NOOPT-NEXT:    v_mov_b32_e32 v1, v8
; NOOPT-NEXT:    s_mov_b32 m0, -1
; NOOPT-NEXT:    ; implicit-def: $sgpr0
; NOOPT-NEXT:    v_mov_b32_e32 v0, s0
; NOOPT-NEXT:    ds_write_b32 v0, v2
; NOOPT-NEXT:    s_mov_b32 m0, -1
; NOOPT-NEXT:    ; implicit-def: $sgpr0
; NOOPT-NEXT:    v_mov_b32_e32 v0, s0
; NOOPT-NEXT:    ds_write_b32 v0, v1
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: multi_same_block:
; SI-MOVREL:       ; %bb.0: ; %bb
; SI-MOVREL-NEXT:    s_load_dword s0, s[2:3], 0x9
; SI-MOVREL-NEXT:    v_mov_b32_e32 v1, 0x41900000
; SI-MOVREL-NEXT:    v_mov_b32_e32 v9, 0x41b0cccd
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    s_add_i32 m0, s0, -16
; SI-MOVREL-NEXT:    v_movreld_b32_e32 v0, 4.0
; SI-MOVREL-NEXT:    v_movreld_b32_e32 v4, -4.0
; SI-MOVREL-NEXT:    s_mov_b32 m0, -1
; SI-MOVREL-NEXT:    ds_write_b32 v0, v1
; SI-MOVREL-NEXT:    ds_write_b32 v0, v9
; SI-MOVREL-NEXT:    s_endpgm
;
; VI-MOVREL-LABEL: multi_same_block:
; VI-MOVREL:       ; %bb.0: ; %bb
; VI-MOVREL-NEXT:    s_load_dword s0, s[2:3], 0x24
; VI-MOVREL-NEXT:    v_mov_b32_e32 v1, 0x41900000
; VI-MOVREL-NEXT:    v_mov_b32_e32 v9, 0x41b0cccd
; VI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; VI-MOVREL-NEXT:    s_add_i32 m0, s0, -16
; VI-MOVREL-NEXT:    v_movreld_b32_e32 v0, 4.0
; VI-MOVREL-NEXT:    v_movreld_b32_e32 v4, -4.0
; VI-MOVREL-NEXT:    s_mov_b32 m0, -1
; VI-MOVREL-NEXT:    ds_write_b32 v0, v1
; VI-MOVREL-NEXT:    ds_write_b32 v0, v9
; VI-MOVREL-NEXT:    s_endpgm
;
; VI-IDXMODE-LABEL: multi_same_block:
; VI-IDXMODE:       ; %bb.0: ; %bb
; VI-IDXMODE-NEXT:    s_load_dword s0, s[2:3], 0x24
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, 0x41900000
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v9, 0x41b0cccd
; VI-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; VI-IDXMODE-NEXT:    s_add_i32 s0, s0, -16
; VI-IDXMODE-NEXT:    s_set_gpr_idx_on s0, gpr_idx(DST)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, 4.0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, -4.0
; VI-IDXMODE-NEXT:    s_set_gpr_idx_off
; VI-IDXMODE-NEXT:    s_mov_b32 m0, -1
; VI-IDXMODE-NEXT:    ds_write_b32 v0, v1
; VI-IDXMODE-NEXT:    ds_write_b32 v0, v9
; VI-IDXMODE-NEXT:    s_endpgm
;
; GFX9-IDXMODE-LABEL: multi_same_block:
; GFX9-IDXMODE:       ; %bb.0: ; %bb
; GFX9-IDXMODE-NEXT:    s_load_dword s0, s[2:3], 0x24
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v1, 0x41900000
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v9, 0x41b0cccd
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    s_add_i32 s0, s0, -16
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_on s0, gpr_idx(DST)
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, 4.0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v4, -4.0
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_off
; GFX9-IDXMODE-NEXT:    ds_write_b32 v0, v1
; GFX9-IDXMODE-NEXT:    ds_write_b32 v0, v9
; GFX9-IDXMODE-NEXT:    s_endpgm
bb:
  ; First insert: write 4.0 into the constant vector <17.0 .. 25.0> at the
  ; dynamic index %arg - 16.
  %tmp1 = add i32 %arg, -16
  %tmp2 = insertelement <9 x float> <float 1.700000e+01, float 1.800000e+01, float 1.900000e+01, float 2.000000e+01, float 2.100000e+01, float 2.200000e+01, float 2.300000e+01, float 2.400000e+01, float 2.500000e+01>, float 4.000000e+00, i32 %tmp1
  ; Second insert: write -4.0 into a different constant vector. The index is
  ; the same expression, recomputed as a separate IR value.
  %tmp3 = add i32 %arg, -16
  %tmp4 = insertelement <9 x float> <float 0x40311999A0000000, float 0x40321999A0000000, float 0x40331999A0000000, float 0x40341999A0000000, float 0x40351999A0000000, float 0x40361999A0000000, float 0x40371999A0000000, float 0x40381999A0000000, float 0x40391999A0000000>, float -4.0, i32 %tmp3
  ; Read back one fixed lane from each result as i32: lane 1 of the first
  ; vector and lane 5 of the second.
  %tmp5 = bitcast <9 x float> %tmp2 to <9 x i32>
  %tmp6 = extractelement <9 x i32> %tmp5, i32 1
  %tmp7 = bitcast <9 x float> %tmp4 to <9 x i32>
  %tmp8 = extractelement <9 x i32> %tmp7, i32 5
  ; Volatile stores through an undef addrspace(3) pointer consume both lanes;
  ; the expected output has one ds_write_b32 per store.
  store volatile i32 %tmp6, ptr addrspace(3) undef, align 4
  store volatile i32 %tmp8, ptr addrspace(3) undef, align 4
  ret void
}

; Offset puts the index outside of superregister boundaries, so clamp to the 1st element.
define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) {
; GENERIC-LABEL: extract_largest_inbounds_offset:
; GENERIC:       ; %bb.0: ; %entry
; GENERIC-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x9
; GENERIC-NEXT:    s_mov_b32 s7, 0xf000
; GENERIC-NEXT:    s_mov_b32 s6, -1
; GENERIC-NEXT:    s_load_dword s12, s[2:3], 0xd
; GENERIC-NEXT:    s_mov_b32 s2, s6
; GENERIC-NEXT:    s_mov_b32 s3, s7
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    s_mov_b32 s4, s8
; GENERIC-NEXT:    s_mov_b32 s5, s9
; GENERIC-NEXT:    s_mov_b32 s0, s10
; GENERIC-NEXT:    s_mov_b32 s1, s11
; GENERIC-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 glc
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    s_add_i32 s12, s12, 15
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 1
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 2
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 3
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 4
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 5
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 6
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 7
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 8
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 9
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 10
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 11
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 12
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v11, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 13
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 14
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 15
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v15, vcc
; GENERIC-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GENERIC-NEXT:    s_endpgm
;
; NOOPT-LABEL: extract_largest_inbounds_offset:
; NOOPT:       ; %bb.0: ; %entry
; NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; NOOPT-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0xb
; NOOPT-NEXT:    s_load_dword s4, s[2:3], 0xd
; NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; NOOPT-NEXT:    s_mov_b32 s7, s1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s5, 0xf000
; NOOPT-NEXT:    s_mov_b32 s6, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s7
; NOOPT-NEXT:    s_mov_b32 s2, s6
; NOOPT-NEXT:    s_mov_b32 s3, s5
; NOOPT-NEXT:    s_mov_b32 s7, s9
; NOOPT-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
; NOOPT-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
; NOOPT-NEXT:    s_mov_b32 s9, s7
; NOOPT-NEXT:    s_mov_b32 s10, s6
; NOOPT-NEXT:    s_mov_b32 s11, s5
; NOOPT-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 glc
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 glc
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v16, v15
; NOOPT-NEXT:    v_mov_b32_e32 v17, v14
; NOOPT-NEXT:    v_mov_b32_e32 v18, v13
; NOOPT-NEXT:    v_mov_b32_e32 v19, v12
; NOOPT-NEXT:    v_mov_b32_e32 v20, v11
; NOOPT-NEXT:    v_mov_b32_e32 v21, v10
; NOOPT-NEXT:    v_mov_b32_e32 v22, v9
; NOOPT-NEXT:    v_mov_b32_e32 v23, v8
; NOOPT-NEXT:    v_mov_b32_e32 v24, v7
; NOOPT-NEXT:    v_mov_b32_e32 v25, v6
; NOOPT-NEXT:    v_mov_b32_e32 v26, v5
; NOOPT-NEXT:    v_mov_b32_e32 v27, v4
; NOOPT-NEXT:    v_mov_b32_e32 v28, v3
; NOOPT-NEXT:    v_mov_b32_e32 v29, v2
; NOOPT-NEXT:    v_mov_b32_e32 v30, v1
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v30
; NOOPT-NEXT:    v_mov_b32_e32 v2, v29
; NOOPT-NEXT:    v_mov_b32_e32 v3, v28
; NOOPT-NEXT:    v_mov_b32_e32 v4, v27
; NOOPT-NEXT:    v_mov_b32_e32 v5, v26
; NOOPT-NEXT:    v_mov_b32_e32 v6, v25
; NOOPT-NEXT:    v_mov_b32_e32 v7, v24
; NOOPT-NEXT:    v_mov_b32_e32 v8, v23
; NOOPT-NEXT:    v_mov_b32_e32 v9, v22
; NOOPT-NEXT:    v_mov_b32_e32 v10, v21
; NOOPT-NEXT:    v_mov_b32_e32 v11, v20
; NOOPT-NEXT:    v_mov_b32_e32 v12, v19
; NOOPT-NEXT:    v_mov_b32_e32 v13, v18
; NOOPT-NEXT:    v_mov_b32_e32 v14, v17
; NOOPT-NEXT:    v_mov_b32_e32 v15, v16
; NOOPT-NEXT:    s_mov_b32 s5, 15
; NOOPT-NEXT:    s_add_i32 s4, s4, s5
; NOOPT-NEXT:    s_mov_b32 m0, s4
; NOOPT-NEXT:    v_movrels_b32_e32 v0, v0
; NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: extract_largest_inbounds_offset:
; SI-MOVREL:       ; %bb.0: ; %entry
; SI-MOVREL-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x9
; SI-MOVREL-NEXT:    s_load_dword s12, s[2:3], 0xd
; SI-MOVREL-NEXT:    s_mov_b32 s7, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s6, -1
; SI-MOVREL-NEXT:    s_mov_b32 s2, s6
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    s_mov_b32 s0, s10
; SI-MOVREL-NEXT:    s_mov_b32 s1, s11
; SI-MOVREL-NEXT:    s_mov_b32 s3, s7
; SI-MOVREL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 glc
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    s_add_i32 s12, s12, 15
; SI-MOVREL-NEXT:    s_mov_b32 m0, s12
; SI-MOVREL-NEXT:    s_mov_b32 s4, s8
; SI-MOVREL-NEXT:    s_mov_b32 s5, s9
; SI-MOVREL-NEXT:    v_movrels_b32_e32 v0, v0
; SI-MOVREL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-MOVREL-NEXT:    s_endpgm
;
; VI-MOVREL-LABEL: extract_largest_inbounds_offset:
; VI-MOVREL:       ; %bb.0: ; %entry
; VI-MOVREL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-MOVREL-NEXT:    s_load_dword s2, s[2:3], 0x34
; VI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, s6
; VI-MOVREL-NEXT:    v_mov_b32_e32 v1, s7
; VI-MOVREL-NEXT:    flat_load_dwordx4 v[0:3], v[0:1] glc
; VI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT:    s_add_u32 s0, s6, 48
; VI-MOVREL-NEXT:    s_addc_u32 s1, s7, 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, s1
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, s0
; VI-MOVREL-NEXT:    s_add_u32 s0, s6, 32
; VI-MOVREL-NEXT:    s_addc_u32 s1, s7, 0
; VI-MOVREL-NEXT:    flat_load_dwordx4 v[12:15], v[4:5] glc
; VI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, s1
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, s0
; VI-MOVREL-NEXT:    s_add_u32 s0, s6, 16
; VI-MOVREL-NEXT:    s_addc_u32 s1, s7, 0
; VI-MOVREL-NEXT:    flat_load_dwordx4 v[8:11], v[4:5] glc
; VI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, s1
; VI-MOVREL-NEXT:    s_add_i32 s2, s2, 15
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, s0
; VI-MOVREL-NEXT:    s_mov_b32 m0, s2
; VI-MOVREL-NEXT:    flat_load_dwordx4 v[4:7], v[4:5] glc
; VI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT:    v_mov_b32_e32 v16, s4
; VI-MOVREL-NEXT:    v_mov_b32_e32 v17, s5
; VI-MOVREL-NEXT:    v_movrels_b32_e32 v0, v0
; VI-MOVREL-NEXT:    flat_store_dword v[16:17], v0
; VI-MOVREL-NEXT:    s_endpgm
;
; VI-IDXMODE-LABEL: extract_largest_inbounds_offset:
; VI-IDXMODE:       ; %bb.0: ; %entry
; VI-IDXMODE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-IDXMODE-NEXT:    s_load_dword s2, s[2:3], 0x34
; VI-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; VI-IDXMODE-NEXT:    s_add_u32 s0, s6, 48
; VI-IDXMODE-NEXT:    s_addc_u32 s1, s7, 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, s1
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, s6
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, s0
; VI-IDXMODE-NEXT:    s_add_u32 s0, s6, 32
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, s7
; VI-IDXMODE-NEXT:    s_addc_u32 s1, s7, 0
; VI-IDXMODE-NEXT:    flat_load_dwordx4 v[0:3], v[0:1] glc
; VI-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; VI-IDXMODE-NEXT:    flat_load_dwordx4 v[12:15], v[4:5] glc
; VI-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, s1
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, s0
; VI-IDXMODE-NEXT:    s_add_u32 s0, s6, 16
; VI-IDXMODE-NEXT:    s_addc_u32 s1, s7, 0
; VI-IDXMODE-NEXT:    flat_load_dwordx4 v[8:11], v[4:5] glc
; VI-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, s1
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, s0
; VI-IDXMODE-NEXT:    flat_load_dwordx4 v[4:7], v[4:5] glc
; VI-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v16, s4
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v17, s5
; VI-IDXMODE-NEXT:    s_add_i32 s2, s2, 15
; VI-IDXMODE-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, v0
; VI-IDXMODE-NEXT:    s_set_gpr_idx_off
; VI-IDXMODE-NEXT:    flat_store_dword v[16:17], v0
; VI-IDXMODE-NEXT:    s_endpgm
;
; GFX9-IDXMODE-LABEL: extract_largest_inbounds_offset:
; GFX9-IDXMODE:       ; %bb.0: ; %entry
; GFX9-IDXMODE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-IDXMODE-NEXT:    s_load_dword s0, s[2:3], 0x34
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v16, 0
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 glc
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 glc
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 glc
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    global_load_dwordx4 v[0:3], v16, s[6:7] glc
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    s_add_i32 s0, s0, 15
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_on s0, gpr_idx(SRC0)
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, v0
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_off
; GFX9-IDXMODE-NEXT:    global_store_dword v16, v0, s[4:5]
; GFX9-IDXMODE-NEXT:    s_endpgm
entry:
  %ld = load volatile <16 x i32>, ptr addrspace(1) %in
  %offset = add i32 %idx, 15
  %value = extractelement <16 x i32> %ld, i32 %offset
  store i32 %value, ptr addrspace(1) %out
  ret void
}

; Dynamic extractelement from a volatile-loaded <16 x i32> with the index
; biased by 16. 16 is the element count of the vector, so for %idx == 0 the
; index is already one past the last element. In every run below the constant
; still appears as an explicit s_add_i32 whose result feeds m0, feeds
; s_set_gpr_idx_on, or (default-CPU run) drives a compare-and-select chain.
define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) {
; GENERIC-LABEL: extract_out_of_bounds_offset:
; GENERIC:       ; %bb.0: ; %entry
; GENERIC-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x9
; GENERIC-NEXT:    s_mov_b32 s7, 0xf000
; GENERIC-NEXT:    s_mov_b32 s6, -1
; GENERIC-NEXT:    s_load_dword s12, s[2:3], 0xd
; GENERIC-NEXT:    s_mov_b32 s2, s6
; GENERIC-NEXT:    s_mov_b32 s3, s7
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    s_mov_b32 s4, s8
; GENERIC-NEXT:    s_mov_b32 s5, s9
; GENERIC-NEXT:    s_mov_b32 s0, s10
; GENERIC-NEXT:    s_mov_b32 s1, s11
; GENERIC-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 glc
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    s_add_i32 s12, s12, 16
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 1
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 2
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 3
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 4
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 5
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 6
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 7
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 8
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 9
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 10
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 11
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 12
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v11, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 13
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 14
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s12, 15
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v15, vcc
; GENERIC-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GENERIC-NEXT:    s_endpgm
;
; NOOPT-LABEL: extract_out_of_bounds_offset:
; NOOPT:       ; %bb.0: ; %entry
; NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; NOOPT-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0xb
; NOOPT-NEXT:    s_load_dword s4, s[2:3], 0xd
; NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; NOOPT-NEXT:    s_mov_b32 s7, s1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s5, 0xf000
; NOOPT-NEXT:    s_mov_b32 s6, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s7
; NOOPT-NEXT:    s_mov_b32 s2, s6
; NOOPT-NEXT:    s_mov_b32 s3, s5
; NOOPT-NEXT:    s_mov_b32 s7, s9
; NOOPT-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
; NOOPT-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
; NOOPT-NEXT:    s_mov_b32 s9, s7
; NOOPT-NEXT:    s_mov_b32 s10, s6
; NOOPT-NEXT:    s_mov_b32 s11, s5
; NOOPT-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 glc
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 glc
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v16, v15
; NOOPT-NEXT:    v_mov_b32_e32 v17, v14
; NOOPT-NEXT:    v_mov_b32_e32 v18, v13
; NOOPT-NEXT:    v_mov_b32_e32 v19, v12
; NOOPT-NEXT:    v_mov_b32_e32 v20, v11
; NOOPT-NEXT:    v_mov_b32_e32 v21, v10
; NOOPT-NEXT:    v_mov_b32_e32 v22, v9
; NOOPT-NEXT:    v_mov_b32_e32 v23, v8
; NOOPT-NEXT:    v_mov_b32_e32 v24, v7
; NOOPT-NEXT:    v_mov_b32_e32 v25, v6
; NOOPT-NEXT:    v_mov_b32_e32 v26, v5
; NOOPT-NEXT:    v_mov_b32_e32 v27, v4
; NOOPT-NEXT:    v_mov_b32_e32 v28, v3
; NOOPT-NEXT:    v_mov_b32_e32 v29, v2
; NOOPT-NEXT:    v_mov_b32_e32 v30, v1
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v30
; NOOPT-NEXT:    v_mov_b32_e32 v2, v29
; NOOPT-NEXT:    v_mov_b32_e32 v3, v28
; NOOPT-NEXT:    v_mov_b32_e32 v4, v27
; NOOPT-NEXT:    v_mov_b32_e32 v5, v26
; NOOPT-NEXT:    v_mov_b32_e32 v6, v25
; NOOPT-NEXT:    v_mov_b32_e32 v7, v24
; NOOPT-NEXT:    v_mov_b32_e32 v8, v23
; NOOPT-NEXT:    v_mov_b32_e32 v9, v22
; NOOPT-NEXT:    v_mov_b32_e32 v10, v21
; NOOPT-NEXT:    v_mov_b32_e32 v11, v20
; NOOPT-NEXT:    v_mov_b32_e32 v12, v19
; NOOPT-NEXT:    v_mov_b32_e32 v13, v18
; NOOPT-NEXT:    v_mov_b32_e32 v14, v17
; NOOPT-NEXT:    v_mov_b32_e32 v15, v16
; NOOPT-NEXT:    s_mov_b32 s5, 16
; NOOPT-NEXT:    s_add_i32 s4, s4, s5
; NOOPT-NEXT:    s_mov_b32 m0, s4
; NOOPT-NEXT:    v_movrels_b32_e32 v0, v0
; NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: extract_out_of_bounds_offset:
; SI-MOVREL:       ; %bb.0: ; %entry
; SI-MOVREL-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x9
; SI-MOVREL-NEXT:    s_load_dword s12, s[2:3], 0xd
; SI-MOVREL-NEXT:    s_mov_b32 s7, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s6, -1
; SI-MOVREL-NEXT:    s_mov_b32 s2, s6
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    s_mov_b32 s0, s10
; SI-MOVREL-NEXT:    s_mov_b32 s1, s11
; SI-MOVREL-NEXT:    s_mov_b32 s3, s7
; SI-MOVREL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 glc
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    s_add_i32 s12, s12, 16
; SI-MOVREL-NEXT:    s_mov_b32 m0, s12
; SI-MOVREL-NEXT:    s_mov_b32 s4, s8
; SI-MOVREL-NEXT:    s_mov_b32 s5, s9
; SI-MOVREL-NEXT:    v_movrels_b32_e32 v0, v0
; SI-MOVREL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-MOVREL-NEXT:    s_endpgm
;
; VI-MOVREL-LABEL: extract_out_of_bounds_offset:
; VI-MOVREL:       ; %bb.0: ; %entry
; VI-MOVREL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-MOVREL-NEXT:    s_load_dword s2, s[2:3], 0x34
; VI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, s6
; VI-MOVREL-NEXT:    v_mov_b32_e32 v1, s7
; VI-MOVREL-NEXT:    flat_load_dwordx4 v[0:3], v[0:1] glc
; VI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT:    s_add_u32 s0, s6, 48
; VI-MOVREL-NEXT:    s_addc_u32 s1, s7, 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, s1
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, s0
; VI-MOVREL-NEXT:    s_add_u32 s0, s6, 32
; VI-MOVREL-NEXT:    s_addc_u32 s1, s7, 0
; VI-MOVREL-NEXT:    flat_load_dwordx4 v[12:15], v[4:5] glc
; VI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, s1
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, s0
; VI-MOVREL-NEXT:    s_add_u32 s0, s6, 16
; VI-MOVREL-NEXT:    s_addc_u32 s1, s7, 0
; VI-MOVREL-NEXT:    flat_load_dwordx4 v[8:11], v[4:5] glc
; VI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, s1
; VI-MOVREL-NEXT:    s_add_i32 s2, s2, 16
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, s0
; VI-MOVREL-NEXT:    s_mov_b32 m0, s2
; VI-MOVREL-NEXT:    flat_load_dwordx4 v[4:7], v[4:5] glc
; VI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT:    v_mov_b32_e32 v16, s4
; VI-MOVREL-NEXT:    v_mov_b32_e32 v17, s5
; VI-MOVREL-NEXT:    v_movrels_b32_e32 v0, v0
; VI-MOVREL-NEXT:    flat_store_dword v[16:17], v0
; VI-MOVREL-NEXT:    s_endpgm
;
; VI-IDXMODE-LABEL: extract_out_of_bounds_offset:
; VI-IDXMODE:       ; %bb.0: ; %entry
; VI-IDXMODE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-IDXMODE-NEXT:    s_load_dword s2, s[2:3], 0x34
; VI-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; VI-IDXMODE-NEXT:    s_add_u32 s0, s6, 48
; VI-IDXMODE-NEXT:    s_addc_u32 s1, s7, 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, s1
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, s6
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, s0
; VI-IDXMODE-NEXT:    s_add_u32 s0, s6, 32
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, s7
; VI-IDXMODE-NEXT:    s_addc_u32 s1, s7, 0
; VI-IDXMODE-NEXT:    flat_load_dwordx4 v[0:3], v[0:1] glc
; VI-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; VI-IDXMODE-NEXT:    flat_load_dwordx4 v[12:15], v[4:5] glc
; VI-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, s1
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, s0
; VI-IDXMODE-NEXT:    s_add_u32 s0, s6, 16
; VI-IDXMODE-NEXT:    s_addc_u32 s1, s7, 0
; VI-IDXMODE-NEXT:    flat_load_dwordx4 v[8:11], v[4:5] glc
; VI-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, s1
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, s0
; VI-IDXMODE-NEXT:    flat_load_dwordx4 v[4:7], v[4:5] glc
; VI-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v16, s4
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v17, s5
; VI-IDXMODE-NEXT:    s_add_i32 s2, s2, 16
; VI-IDXMODE-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, v0
; VI-IDXMODE-NEXT:    s_set_gpr_idx_off
; VI-IDXMODE-NEXT:    flat_store_dword v[16:17], v0
; VI-IDXMODE-NEXT:    s_endpgm
;
; GFX9-IDXMODE-LABEL: extract_out_of_bounds_offset:
; GFX9-IDXMODE:       ; %bb.0: ; %entry
; GFX9-IDXMODE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-IDXMODE-NEXT:    s_load_dword s0, s[2:3], 0x34
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v16, 0
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 glc
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 glc
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 glc
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    global_load_dwordx4 v[0:3], v16, s[6:7] glc
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    s_add_i32 s0, s0, 16
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_on s0, gpr_idx(SRC0)
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, v0
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_off
; GFX9-IDXMODE-NEXT:    global_store_dword v16, v0, s[4:5]
; GFX9-IDXMODE-NEXT:    s_endpgm
entry:
  ; The volatile vector load appears as four dwordx4 loads in every run above.
  %ld = load volatile <16 x i32>, ptr addrspace(1) %in
  ; 16 equals the element count of %ld, i.e. the first index past the end.
  %offset = add i32 %idx, 16
  %value = extractelement <16 x i32> %ld, i32 %offset
  store i32 %value, ptr addrspace(1) %out
  ret void
}

; Dynamic extractelement whose index is (%idx.in << 2) | 1. The shift leaves
; the low two bits clear, so the `or` computes the same value as adding 1.
; In the movrel and gpr-index-mode expectations below no s_or_b32 appears:
; only the shifted value feeds m0 or s_set_gpr_idx_on, and the source operand
; is v1 rather than v0. The default-CPU run keeps the s_or_b32 and selects
; the element with a compare-and-select chain.
define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx.in) {
; GENERIC-LABEL: extractelement_v16i32_or_index:
; GENERIC:       ; %bb.0: ; %entry
; GENERIC-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x9
; GENERIC-NEXT:    s_mov_b32 s7, 0xf000
; GENERIC-NEXT:    s_mov_b32 s6, -1
; GENERIC-NEXT:    s_load_dword s12, s[2:3], 0xd
; GENERIC-NEXT:    s_mov_b32 s2, s6
; GENERIC-NEXT:    s_mov_b32 s3, s7
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    s_mov_b32 s4, s8
; GENERIC-NEXT:    s_mov_b32 s5, s9
; GENERIC-NEXT:    s_mov_b32 s0, s10
; GENERIC-NEXT:    s_mov_b32 s1, s11
; GENERIC-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 glc
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    s_lshl_b32 s0, s12, 2
; GENERIC-NEXT:    s_or_b32 s0, s0, 1
; GENERIC-NEXT:    s_cmp_eq_u32 s0, 1
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s0, 2
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s0, 3
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s0, 4
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s0, 5
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s0, 6
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s0, 7
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s0, 8
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s0, 9
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s0, 10
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s0, 11
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s0, 12
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v11, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s0, 13
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s0, 14
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_cmp_eq_u32 s0, 15
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v15, vcc
; GENERIC-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GENERIC-NEXT:    s_endpgm
;
; NOOPT-LABEL: extractelement_v16i32_or_index:
; NOOPT:       ; %bb.0: ; %entry
; NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; NOOPT-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0xb
; NOOPT-NEXT:    s_load_dword s4, s[2:3], 0xd
; NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; NOOPT-NEXT:    s_mov_b32 s7, s1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s5, 0xf000
; NOOPT-NEXT:    s_mov_b32 s6, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s7
; NOOPT-NEXT:    s_mov_b32 s2, s6
; NOOPT-NEXT:    s_mov_b32 s3, s5
; NOOPT-NEXT:    s_mov_b32 s7, s9
; NOOPT-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
; NOOPT-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
; NOOPT-NEXT:    s_mov_b32 s9, s7
; NOOPT-NEXT:    s_mov_b32 s10, s6
; NOOPT-NEXT:    s_mov_b32 s11, s5
; NOOPT-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 glc
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 glc
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v16, v15
; NOOPT-NEXT:    v_mov_b32_e32 v17, v14
; NOOPT-NEXT:    v_mov_b32_e32 v18, v13
; NOOPT-NEXT:    v_mov_b32_e32 v19, v12
; NOOPT-NEXT:    v_mov_b32_e32 v20, v11
; NOOPT-NEXT:    v_mov_b32_e32 v21, v10
; NOOPT-NEXT:    v_mov_b32_e32 v22, v9
; NOOPT-NEXT:    v_mov_b32_e32 v23, v8
; NOOPT-NEXT:    v_mov_b32_e32 v24, v7
; NOOPT-NEXT:    v_mov_b32_e32 v25, v6
; NOOPT-NEXT:    v_mov_b32_e32 v26, v5
; NOOPT-NEXT:    v_mov_b32_e32 v27, v4
; NOOPT-NEXT:    v_mov_b32_e32 v28, v3
; NOOPT-NEXT:    v_mov_b32_e32 v29, v2
; NOOPT-NEXT:    v_mov_b32_e32 v30, v1
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v30
; NOOPT-NEXT:    v_mov_b32_e32 v2, v29
; NOOPT-NEXT:    v_mov_b32_e32 v3, v28
; NOOPT-NEXT:    v_mov_b32_e32 v4, v27
; NOOPT-NEXT:    v_mov_b32_e32 v5, v26
; NOOPT-NEXT:    v_mov_b32_e32 v6, v25
; NOOPT-NEXT:    v_mov_b32_e32 v7, v24
; NOOPT-NEXT:    v_mov_b32_e32 v8, v23
; NOOPT-NEXT:    v_mov_b32_e32 v9, v22
; NOOPT-NEXT:    v_mov_b32_e32 v10, v21
; NOOPT-NEXT:    v_mov_b32_e32 v11, v20
; NOOPT-NEXT:    v_mov_b32_e32 v12, v19
; NOOPT-NEXT:    v_mov_b32_e32 v13, v18
; NOOPT-NEXT:    v_mov_b32_e32 v14, v17
; NOOPT-NEXT:    v_mov_b32_e32 v15, v16
; NOOPT-NEXT:    s_mov_b32 s5, 2
; NOOPT-NEXT:    s_lshl_b32 s4, s4, s5
; NOOPT-NEXT:    s_mov_b32 m0, s4
; NOOPT-NEXT:    v_movrels_b32_e32 v0, v1
; NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: extractelement_v16i32_or_index:
; SI-MOVREL:       ; %bb.0: ; %entry
; SI-MOVREL-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x9
; SI-MOVREL-NEXT:    s_load_dword s12, s[2:3], 0xd
; SI-MOVREL-NEXT:    s_mov_b32 s7, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s6, -1
; SI-MOVREL-NEXT:    s_mov_b32 s2, s6
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    s_mov_b32 s0, s10
; SI-MOVREL-NEXT:    s_mov_b32 s1, s11
; SI-MOVREL-NEXT:    s_mov_b32 s3, s7
; SI-MOVREL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 glc
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    s_lshl_b32 s0, s12, 2
; SI-MOVREL-NEXT:    s_mov_b32 m0, s0
; SI-MOVREL-NEXT:    s_mov_b32 s4, s8
; SI-MOVREL-NEXT:    s_mov_b32 s5, s9
; SI-MOVREL-NEXT:    v_movrels_b32_e32 v0, v1
; SI-MOVREL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-MOVREL-NEXT:    s_endpgm
;
; VI-MOVREL-LABEL: extractelement_v16i32_or_index:
; VI-MOVREL:       ; %bb.0: ; %entry
; VI-MOVREL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-MOVREL-NEXT:    s_load_dword s2, s[2:3], 0x34
; VI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, s6
; VI-MOVREL-NEXT:    v_mov_b32_e32 v1, s7
; VI-MOVREL-NEXT:    flat_load_dwordx4 v[0:3], v[0:1] glc
; VI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT:    s_add_u32 s0, s6, 48
; VI-MOVREL-NEXT:    s_addc_u32 s1, s7, 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, s1
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, s0
; VI-MOVREL-NEXT:    s_add_u32 s0, s6, 32
; VI-MOVREL-NEXT:    s_addc_u32 s1, s7, 0
; VI-MOVREL-NEXT:    flat_load_dwordx4 v[12:15], v[4:5] glc
; VI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, s1
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, s0
; VI-MOVREL-NEXT:    s_add_u32 s0, s6, 16
; VI-MOVREL-NEXT:    s_addc_u32 s1, s7, 0
; VI-MOVREL-NEXT:    flat_load_dwordx4 v[8:11], v[4:5] glc
; VI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, s1
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, s0
; VI-MOVREL-NEXT:    s_lshl_b32 s0, s2, 2
; VI-MOVREL-NEXT:    s_mov_b32 m0, s0
; VI-MOVREL-NEXT:    flat_load_dwordx4 v[4:7], v[4:5] glc
; VI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT:    v_mov_b32_e32 v16, s4
; VI-MOVREL-NEXT:    v_mov_b32_e32 v17, s5
; VI-MOVREL-NEXT:    v_movrels_b32_e32 v0, v1
; VI-MOVREL-NEXT:    flat_store_dword v[16:17], v0
; VI-MOVREL-NEXT:    s_endpgm
;
; VI-IDXMODE-LABEL: extractelement_v16i32_or_index:
; VI-IDXMODE:       ; %bb.0: ; %entry
; VI-IDXMODE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-IDXMODE-NEXT:    s_load_dword s2, s[2:3], 0x34
; VI-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; VI-IDXMODE-NEXT:    s_add_u32 s0, s6, 48
; VI-IDXMODE-NEXT:    s_addc_u32 s1, s7, 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, s1
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, s6
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, s0
; VI-IDXMODE-NEXT:    s_add_u32 s0, s6, 32
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, s7
; VI-IDXMODE-NEXT:    s_addc_u32 s1, s7, 0
; VI-IDXMODE-NEXT:    flat_load_dwordx4 v[0:3], v[0:1] glc
; VI-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; VI-IDXMODE-NEXT:    flat_load_dwordx4 v[12:15], v[4:5] glc
; VI-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, s1
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, s0
; VI-IDXMODE-NEXT:    s_add_u32 s0, s6, 16
; VI-IDXMODE-NEXT:    s_addc_u32 s1, s7, 0
; VI-IDXMODE-NEXT:    flat_load_dwordx4 v[8:11], v[4:5] glc
; VI-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, s1
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, s0
; VI-IDXMODE-NEXT:    flat_load_dwordx4 v[4:7], v[4:5] glc
; VI-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v16, s4
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v17, s5
; VI-IDXMODE-NEXT:    s_lshl_b32 s0, s2, 2
; VI-IDXMODE-NEXT:    s_set_gpr_idx_on s0, gpr_idx(SRC0)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, v1
; VI-IDXMODE-NEXT:    s_set_gpr_idx_off
; VI-IDXMODE-NEXT:    flat_store_dword v[16:17], v0
; VI-IDXMODE-NEXT:    s_endpgm
;
; GFX9-IDXMODE-LABEL: extractelement_v16i32_or_index:
; GFX9-IDXMODE:       ; %bb.0: ; %entry
; GFX9-IDXMODE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-IDXMODE-NEXT:    s_load_dword s0, s[2:3], 0x34
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v16, 0
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 glc
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 glc
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 glc
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    global_load_dwordx4 v[0:3], v16, s[6:7] glc
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_on s0, gpr_idx(SRC0)
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_off
; GFX9-IDXMODE-NEXT:    global_store_dword v16, v0, s[4:5]
; GFX9-IDXMODE-NEXT:    s_endpgm
entry:
  ; The volatile vector load appears as four dwordx4 loads in every run above.
  %ld = load volatile <16 x i32>, ptr addrspace(1) %in
  ; Low two bits of %idx.shl are zero, so the `or` below acts as +1.
  %idx.shl = shl i32 %idx.in, 2
  %idx = or i32 %idx.shl, 1
  %value = extractelement <16 x i32> %ld, i32 %idx
  store i32 %value, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, <16 x float> %a, i32 %idx.in) nounwind {
; GENERIC-LABEL: insertelement_v16f32_or_index:
; GENERIC:       ; %bb.0:
; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GENERIC-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x19
; GENERIC-NEXT:    s_load_dword s20, s[2:3], 0x29
; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
; GENERIC-NEXT:    s_mov_b32 s2, -1
; GENERIC-NEXT:    v_mov_b32_e32 v10, 0x40a00000
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    s_lshl_b32 s20, s20, 2
; GENERIC-NEXT:    v_mov_b32_e32 v0, s7
; GENERIC-NEXT:    v_mov_b32_e32 v1, s6
; GENERIC-NEXT:    v_mov_b32_e32 v4, s5
; GENERIC-NEXT:    v_mov_b32_e32 v5, s4
; GENERIC-NEXT:    v_mov_b32_e32 v6, s11
; GENERIC-NEXT:    v_mov_b32_e32 v8, s10
; GENERIC-NEXT:    v_mov_b32_e32 v9, s9
; GENERIC-NEXT:    v_mov_b32_e32 v11, s8
; GENERIC-NEXT:    v_mov_b32_e32 v12, s15
; GENERIC-NEXT:    v_mov_b32_e32 v13, s14
; GENERIC-NEXT:    v_mov_b32_e32 v14, s13
; GENERIC-NEXT:    v_mov_b32_e32 v15, s12
; GENERIC-NEXT:    v_mov_b32_e32 v16, s19
; GENERIC-NEXT:    v_mov_b32_e32 v17, s18
; GENERIC-NEXT:    v_mov_b32_e32 v18, s17
; GENERIC-NEXT:    v_mov_b32_e32 v19, s16
; GENERIC-NEXT:    s_or_b32 s4, s20, 1
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 3
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v3, v10, v0, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 2
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v2, v10, v1, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 1
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v1, v10, v4, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 0
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v10, v5, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 7
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v10, v6, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 6
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v10, v8, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 5
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v5, v10, v9, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 4
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v4, v10, v11, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 11
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v8, v10, v12, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 10
; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_waitcnt expcnt(0)
; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v10, v13, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 9
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v10, v14, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 8
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v5, v10, v15, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 15
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v9, v10, v16, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 14
; GENERIC-NEXT:    buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_waitcnt expcnt(0)
; GENERIC-NEXT:    v_cndmask_b32_e32 v8, v10, v17, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 13
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v10, v18, vcc
; GENERIC-NEXT:    s_cmp_lg_u32 s4, 12
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v10, v19, vcc
; GENERIC-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
; GENERIC-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GENERIC-NEXT:    s_endpgm
;
; NOOPT-LABEL: insertelement_v16f32_or_index:
; NOOPT:       ; %bb.0:
; NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; NOOPT-NEXT:    s_load_dwordx16 s[8:23], s[2:3], 0x19
; NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x29
; NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; NOOPT-NEXT:    s_mov_b32 s7, s1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; NOOPT-NEXT:    s_mov_b32 s5, 0xf000
; NOOPT-NEXT:    s_mov_b32 s6, -1
; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT:    s_mov_b32 s1, s7
; NOOPT-NEXT:    s_mov_b32 s2, s6
; NOOPT-NEXT:    s_mov_b32 s3, s5
; NOOPT-NEXT:    s_mov_b32 s5, 2
; NOOPT-NEXT:    s_lshl_b32 s4, s4, s5
; NOOPT-NEXT:    v_mov_b32_e32 v0, 0x40a00000
; NOOPT-NEXT:    v_mov_b32_e32 v7, s8
; NOOPT-NEXT:    v_mov_b32_e32 v8, s9
; NOOPT-NEXT:    v_mov_b32_e32 v9, s10
; NOOPT-NEXT:    v_mov_b32_e32 v10, s11
; NOOPT-NEXT:    v_mov_b32_e32 v11, s12
; NOOPT-NEXT:    v_mov_b32_e32 v12, s13
; NOOPT-NEXT:    v_mov_b32_e32 v13, s14
; NOOPT-NEXT:    v_mov_b32_e32 v14, s15
; NOOPT-NEXT:    v_mov_b32_e32 v15, s16
; NOOPT-NEXT:    v_mov_b32_e32 v16, s17
; NOOPT-NEXT:    v_mov_b32_e32 v17, s18
; NOOPT-NEXT:    v_mov_b32_e32 v18, s19
; NOOPT-NEXT:    v_mov_b32_e32 v19, s20
; NOOPT-NEXT:    v_mov_b32_e32 v20, s21
; NOOPT-NEXT:    v_mov_b32_e32 v21, s22
; NOOPT-NEXT:    v_mov_b32_e32 v22, s23
; NOOPT-NEXT:    s_mov_b32 m0, s4
; NOOPT-NEXT:    v_movreld_b32_e32 v8, v0
; NOOPT-NEXT:    v_mov_b32_e32 v4, v22
; NOOPT-NEXT:    v_mov_b32_e32 v5, v21
; NOOPT-NEXT:    v_mov_b32_e32 v6, v20
; NOOPT-NEXT:    v_mov_b32_e32 v0, v19
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; NOOPT-NEXT:    v_mov_b32_e32 v4, v18
; NOOPT-NEXT:    v_mov_b32_e32 v5, v17
; NOOPT-NEXT:    v_mov_b32_e32 v6, v16
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, v15
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; NOOPT-NEXT:    v_mov_b32_e32 v4, v14
; NOOPT-NEXT:    v_mov_b32_e32 v5, v13
; NOOPT-NEXT:    v_mov_b32_e32 v6, v12
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, v11
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; NOOPT-NEXT:    v_mov_b32_e32 v4, v10
; NOOPT-NEXT:    v_mov_b32_e32 v5, v9
; NOOPT-NEXT:    v_mov_b32_e32 v6, v8
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, v7
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v6
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: insertelement_v16f32_or_index:
; SI-MOVREL:       ; %bb.0:
; SI-MOVREL-NEXT:    s_load_dword s0, s[2:3], 0x29
; SI-MOVREL-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x19
; SI-MOVREL-NEXT:    s_load_dwordx2 s[20:21], s[2:3], 0x9
; SI-MOVREL-NEXT:    v_mov_b32_e32 v16, 0x40a00000
; SI-MOVREL-NEXT:    s_mov_b32 s23, 0xf000
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    s_lshl_b32 s0, s0, 2
; SI-MOVREL-NEXT:    v_mov_b32_e32 v0, s4
; SI-MOVREL-NEXT:    v_mov_b32_e32 v1, s5
; SI-MOVREL-NEXT:    v_mov_b32_e32 v2, s6
; SI-MOVREL-NEXT:    v_mov_b32_e32 v3, s7
; SI-MOVREL-NEXT:    v_mov_b32_e32 v4, s8
; SI-MOVREL-NEXT:    v_mov_b32_e32 v5, s9
; SI-MOVREL-NEXT:    v_mov_b32_e32 v6, s10
; SI-MOVREL-NEXT:    v_mov_b32_e32 v7, s11
; SI-MOVREL-NEXT:    v_mov_b32_e32 v8, s12
; SI-MOVREL-NEXT:    v_mov_b32_e32 v9, s13
; SI-MOVREL-NEXT:    v_mov_b32_e32 v10, s14
; SI-MOVREL-NEXT:    v_mov_b32_e32 v11, s15
; SI-MOVREL-NEXT:    v_mov_b32_e32 v12, s16
; SI-MOVREL-NEXT:    v_mov_b32_e32 v13, s17
; SI-MOVREL-NEXT:    v_mov_b32_e32 v14, s18
; SI-MOVREL-NEXT:    v_mov_b32_e32 v15, s19
; SI-MOVREL-NEXT:    s_mov_b32 m0, s0
; SI-MOVREL-NEXT:    s_mov_b32 s22, -1
; SI-MOVREL-NEXT:    v_movreld_b32_e32 v1, v16
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[12:15], off, s[20:23], 0 offset:48
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[8:11], off, s[20:23], 0 offset:32
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[20:23], 0
; SI-MOVREL-NEXT:    s_endpgm
;
; VI-MOVREL-LABEL: insertelement_v16f32_or_index:
; VI-MOVREL:       ; %bb.0:
; VI-MOVREL-NEXT:    s_load_dword s20, s[2:3], 0xa4
; VI-MOVREL-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x64
; VI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-MOVREL-NEXT:    v_mov_b32_e32 v16, 0x40a00000
; VI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; VI-MOVREL-NEXT:    s_lshl_b32 s2, s20, 2
; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, s4
; VI-MOVREL-NEXT:    s_mov_b32 m0, s2
; VI-MOVREL-NEXT:    s_add_u32 s2, s0, 48
; VI-MOVREL-NEXT:    v_mov_b32_e32 v1, s5
; VI-MOVREL-NEXT:    v_mov_b32_e32 v2, s6
; VI-MOVREL-NEXT:    v_mov_b32_e32 v3, s7
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, s8
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, s9
; VI-MOVREL-NEXT:    v_mov_b32_e32 v6, s10
; VI-MOVREL-NEXT:    v_mov_b32_e32 v7, s11
; VI-MOVREL-NEXT:    v_mov_b32_e32 v8, s12
; VI-MOVREL-NEXT:    v_mov_b32_e32 v9, s13
; VI-MOVREL-NEXT:    v_mov_b32_e32 v10, s14
; VI-MOVREL-NEXT:    v_mov_b32_e32 v11, s15
; VI-MOVREL-NEXT:    v_mov_b32_e32 v12, s16
; VI-MOVREL-NEXT:    v_mov_b32_e32 v13, s17
; VI-MOVREL-NEXT:    v_mov_b32_e32 v14, s18
; VI-MOVREL-NEXT:    v_mov_b32_e32 v15, s19
; VI-MOVREL-NEXT:    s_addc_u32 s3, s1, 0
; VI-MOVREL-NEXT:    v_movreld_b32_e32 v1, v16
; VI-MOVREL-NEXT:    v_mov_b32_e32 v17, s3
; VI-MOVREL-NEXT:    v_mov_b32_e32 v16, s2
; VI-MOVREL-NEXT:    s_add_u32 s2, s0, 32
; VI-MOVREL-NEXT:    s_addc_u32 s3, s1, 0
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
; VI-MOVREL-NEXT:    s_nop 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v13, s3
; VI-MOVREL-NEXT:    v_mov_b32_e32 v12, s2
; VI-MOVREL-NEXT:    s_add_u32 s2, s0, 16
; VI-MOVREL-NEXT:    s_addc_u32 s3, s1, 0
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
; VI-MOVREL-NEXT:    s_nop 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v9, s3
; VI-MOVREL-NEXT:    v_mov_b32_e32 v8, s2
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; VI-MOVREL-NEXT:    s_nop 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, s1
; VI-MOVREL-NEXT:    v_mov_b32_e32 v4, s0
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-MOVREL-NEXT:    s_endpgm
;
; VI-IDXMODE-LABEL: insertelement_v16f32_or_index:
; VI-IDXMODE:       ; %bb.0:
; VI-IDXMODE-NEXT:    s_load_dword s20, s[2:3], 0xa4
; VI-IDXMODE-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x64
; VI-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v16, 0x40a00000
; VI-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; VI-IDXMODE-NEXT:    s_lshl_b32 s3, s20, 2
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, s4
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, s5
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v2, s6
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v3, s7
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, s8
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, s9
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v6, s10
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v7, s11
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v8, s12
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v9, s13
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v10, s14
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v11, s15
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v12, s16
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v13, s17
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v14, s18
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v15, s19
; VI-IDXMODE-NEXT:    s_add_u32 s2, s0, 48
; VI-IDXMODE-NEXT:    s_set_gpr_idx_on s3, gpr_idx(DST)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v1, v16
; VI-IDXMODE-NEXT:    s_set_gpr_idx_off
; VI-IDXMODE-NEXT:    s_addc_u32 s3, s1, 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v17, s3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v16, s2
; VI-IDXMODE-NEXT:    s_add_u32 s2, s0, 32
; VI-IDXMODE-NEXT:    s_addc_u32 s3, s1, 0
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
; VI-IDXMODE-NEXT:    s_nop 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v13, s3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v12, s2
; VI-IDXMODE-NEXT:    s_add_u32 s2, s0, 16
; VI-IDXMODE-NEXT:    s_addc_u32 s3, s1, 0
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
; VI-IDXMODE-NEXT:    s_nop 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v9, s3
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v8, s2
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; VI-IDXMODE-NEXT:    s_nop 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, s1
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v4, s0
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-IDXMODE-NEXT:    s_endpgm
;
; GFX9-IDXMODE-LABEL: insertelement_v16f32_or_index:
; GFX9-IDXMODE:       ; %bb.0:
; GFX9-IDXMODE-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x64
; GFX9-IDXMODE-NEXT:    s_load_dword s20, s[2:3], 0xa4
; GFX9-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v16, 0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v17, 0x40a00000
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v2, s6
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v3, s7
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v4, s8
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v5, s9
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v6, s10
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v7, s11
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v8, s12
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v9, s13
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v10, s14
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v11, s15
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v12, s16
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v13, s17
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v14, s18
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v15, s19
; GFX9-IDXMODE-NEXT:    s_lshl_b32 s2, s20, 2
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v1, v17
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_off
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
; GFX9-IDXMODE-NEXT:    s_endpgm
  %idx.shl = shl i32 %idx.in, 2
  %idx = or i32 %idx.shl, 1
  %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %idx
  store <16 x float> %vecins, ptr addrspace(1) %out, align 64
  ret void
}

define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; GENERIC-LABEL: broken_phi_bb:
; GENERIC:       ; %bb.0: ; %bb
; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GENERIC-NEXT:    s_mov_b32 s6, 8
; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
; GENERIC-NEXT:    s_mov_b32 s2, -1
; GENERIC-NEXT:    s_branch .LBB26_2
; GENERIC-NEXT:  .LBB26_1: ; %Flow
; GENERIC-NEXT:    ; in Loop: Header=BB26_2 Depth=1
; GENERIC-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GENERIC-NEXT:    s_cbranch_vccz .LBB26_4
; GENERIC-NEXT:  .LBB26_2: ; %bb2
; GENERIC-NEXT:    ; =>This Inner Loop Header: Depth=1
; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
; GENERIC-NEXT:    s_cmp_ge_i32 s6, s0
; GENERIC-NEXT:    s_mov_b64 s[4:5], -1
; GENERIC-NEXT:    ; implicit-def: $sgpr6
; GENERIC-NEXT:    s_cbranch_scc1 .LBB26_1
; GENERIC-NEXT:  ; %bb.3: ; %bb4
; GENERIC-NEXT:    ; in Loop: Header=BB26_2 Depth=1
; GENERIC-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    s_mov_b64 s[4:5], 0
; GENERIC-NEXT:    s_mov_b32 s6, s1
; GENERIC-NEXT:    s_branch .LBB26_1
; GENERIC-NEXT:  .LBB26_4: ; %bb8
; GENERIC-NEXT:    s_endpgm
;
; NOOPT-LABEL: broken_phi_bb:
; NOOPT:       ; %bb.0: ; %bb
; NOOPT-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
; NOOPT-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
; NOOPT-NEXT:    s_mov_b32 s26, -1
; NOOPT-NEXT:    s_mov_b32 s27, 0xe8f000
; NOOPT-NEXT:    s_add_u32 s24, s24, s9
; NOOPT-NEXT:    s_addc_u32 s25, s25, 0
; NOOPT-NEXT:    s_load_dword s1, s[2:3], 0x9
; NOOPT-NEXT:    s_load_dword s0, s[2:3], 0xa
; NOOPT-NEXT:    ; implicit-def: $vgpr18 : SGPR spill to VGPR lane
; NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; NOOPT-NEXT:    v_writelane_b32 v18, s1, 0
; NOOPT-NEXT:    s_mov_b32 s1, 8
; NOOPT-NEXT:    v_writelane_b32 v18, s0, 1
; NOOPT-NEXT:    s_or_saveexec_b64 s[20:21], -1
; NOOPT-NEXT:    buffer_store_dword v18, off, s[24:27], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[20:21]
; NOOPT-NEXT:    v_mov_b32_e32 v0, 8
; NOOPT-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
; NOOPT-NEXT:  .LBB26_1: ; %bb2
; NOOPT-NEXT:    ; =>This Loop Header: Depth=1
; NOOPT-NEXT:    ; Child Loop BB26_3 Depth 2
; NOOPT-NEXT:    s_or_saveexec_b64 s[20:21], -1
; NOOPT-NEXT:    s_waitcnt expcnt(1)
; NOOPT-NEXT:    buffer_load_dword v18, off, s[24:27], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[20:21]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s2, v18, 0
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 s[0:1], -1
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_cmp_ge_i32_e64 s[2:3], v0, s2
; NOOPT-NEXT:    v_mov_b32_e32 v0, s4
; NOOPT-NEXT:    s_and_b64 vcc, exec, s[2:3]
; NOOPT-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:8 ; 4-byte Folded Spill
; NOOPT-NEXT:    v_writelane_b32 v18, s0, 2
; NOOPT-NEXT:    v_writelane_b32 v18, s1, 3
; NOOPT-NEXT:    s_or_saveexec_b64 s[20:21], -1
; NOOPT-NEXT:    buffer_store_dword v18, off, s[24:27], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[20:21]
; NOOPT-NEXT:    s_cbranch_vccnz .LBB26_6
; NOOPT-NEXT:  ; %bb.2: ; %bb4
; NOOPT-NEXT:    ; in Loop: Header=BB26_1 Depth=1
; NOOPT-NEXT:    s_or_saveexec_b64 s[20:21], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v18, off, s[24:27], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[20:21]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v18, 1
; NOOPT-NEXT:    ; implicit-def: $sgpr2_sgpr3
; NOOPT-NEXT:    ; kill: def $sgpr3 killed $sgpr3 killed $sgpr2_sgpr3
; NOOPT-NEXT:    ; implicit-def: $sgpr4_sgpr5
; NOOPT-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; NOOPT-NEXT:    s_mov_b32 s1, 0xf000
; NOOPT-NEXT:    s_mov_b32 s2, -1
; NOOPT-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; NOOPT-NEXT:    s_mov_b32 s5, s3
; NOOPT-NEXT:    s_mov_b32 s6, s2
; NOOPT-NEXT:    s_mov_b32 s7, s1
; NOOPT-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:80 ; 4-byte Folded Spill
; NOOPT-NEXT:    ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, s4
; NOOPT-NEXT:    v_mov_b32_e32 v1, s5
; NOOPT-NEXT:    v_mov_b32_e32 v2, s6
; NOOPT-NEXT:    v_mov_b32_e32 v3, s7
; NOOPT-NEXT:    v_mov_b32_e32 v4, s8
; NOOPT-NEXT:    v_mov_b32_e32 v5, s9
; NOOPT-NEXT:    v_mov_b32_e32 v6, s10
; NOOPT-NEXT:    v_mov_b32_e32 v7, s11
; NOOPT-NEXT:    v_mov_b32_e32 v8, s12
; NOOPT-NEXT:    v_mov_b32_e32 v9, s13
; NOOPT-NEXT:    v_mov_b32_e32 v10, s14
; NOOPT-NEXT:    v_mov_b32_e32 v11, s15
; NOOPT-NEXT:    v_mov_b32_e32 v12, s16
; NOOPT-NEXT:    v_mov_b32_e32 v13, s17
; NOOPT-NEXT:    v_mov_b32_e32 v14, s18
; NOOPT-NEXT:    v_mov_b32_e32 v15, s19
; NOOPT-NEXT:    v_mov_b32_e32 v16, s0
; NOOPT-NEXT:    buffer_store_dword v16, off, s[24:27], 0 offset:76 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 s[0:1], exec
; NOOPT-NEXT:    v_writelane_b32 v18, s0, 4
; NOOPT-NEXT:    v_writelane_b32 v18, s1, 5
; NOOPT-NEXT:    s_or_saveexec_b64 s[20:21], -1
; NOOPT-NEXT:    buffer_store_dword v18, off, s[24:27], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[20:21]
; NOOPT-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:12 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v1, off, s[24:27], 0 offset:16 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v2, off, s[24:27], 0 offset:20 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v3, off, s[24:27], 0 offset:24 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v4, off, s[24:27], 0 offset:28 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v5, off, s[24:27], 0 offset:32 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v6, off, s[24:27], 0 offset:36 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v7, off, s[24:27], 0 offset:40 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v8, off, s[24:27], 0 offset:44 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v9, off, s[24:27], 0 offset:48 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v10, off, s[24:27], 0 offset:52 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v11, off, s[24:27], 0 offset:56 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v12, off, s[24:27], 0 offset:60 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v13, off, s[24:27], 0 offset:64 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v14, off, s[24:27], 0 offset:68 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v15, off, s[24:27], 0 offset:72 ; 4-byte Folded Spill
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT:  .LBB26_3: ; Parent Loop BB26_1 Depth=1
; NOOPT-NEXT:    ; => This Inner Loop Header: Depth=2
; NOOPT-NEXT:    s_or_saveexec_b64 s[20:21], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v18, off, s[24:27], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[20:21]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v18, 6
; NOOPT-NEXT:    v_readlane_b32 s1, v18, 7
; NOOPT-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:12 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v1, off, s[24:27], 0 offset:16 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v2, off, s[24:27], 0 offset:20 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v3, off, s[24:27], 0 offset:24 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v4, off, s[24:27], 0 offset:28 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v5, off, s[24:27], 0 offset:32 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v6, off, s[24:27], 0 offset:36 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v7, off, s[24:27], 0 offset:40 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v8, off, s[24:27], 0 offset:44 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(6)
; NOOPT-NEXT:    buffer_load_dword v9, off, s[24:27], 0 offset:48 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(5)
; NOOPT-NEXT:    buffer_load_dword v10, off, s[24:27], 0 offset:52 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(4)
; NOOPT-NEXT:    buffer_load_dword v11, off, s[24:27], 0 offset:56 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(3)
; NOOPT-NEXT:    buffer_load_dword v12, off, s[24:27], 0 offset:60 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(2)
; NOOPT-NEXT:    buffer_load_dword v13, off, s[24:27], 0 offset:64 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(1)
; NOOPT-NEXT:    buffer_load_dword v14, off, s[24:27], 0 offset:68 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v15, off, s[24:27], 0 offset:72 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v16, off, s[24:27], 0 offset:76 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v17, off, s[24:27], 0 offset:80 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readfirstlane_b32 s2, v17
; NOOPT-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v17
; NOOPT-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
; NOOPT-NEXT:    s_mov_b32 m0, s2
; NOOPT-NEXT:    v_movreld_b32_e32 v0, v16
; NOOPT-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:84 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v1, off, s[24:27], 0 offset:88 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v2, off, s[24:27], 0 offset:92 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v3, off, s[24:27], 0 offset:96 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v4, off, s[24:27], 0 offset:100 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v5, off, s[24:27], 0 offset:104 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v6, off, s[24:27], 0 offset:108 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v7, off, s[24:27], 0 offset:112 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v8, off, s[24:27], 0 offset:116 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v9, off, s[24:27], 0 offset:120 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v10, off, s[24:27], 0 offset:124 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v11, off, s[24:27], 0 offset:128 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v12, off, s[24:27], 0 offset:132 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v13, off, s[24:27], 0 offset:136 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v14, off, s[24:27], 0 offset:140 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v15, off, s[24:27], 0 offset:144 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:12 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v1, off, s[24:27], 0 offset:16 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v2, off, s[24:27], 0 offset:20 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v3, off, s[24:27], 0 offset:24 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v4, off, s[24:27], 0 offset:28 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v5, off, s[24:27], 0 offset:32 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v6, off, s[24:27], 0 offset:36 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v7, off, s[24:27], 0 offset:40 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v8, off, s[24:27], 0 offset:44 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v9, off, s[24:27], 0 offset:48 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v10, off, s[24:27], 0 offset:52 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v11, off, s[24:27], 0 offset:56 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v12, off, s[24:27], 0 offset:60 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v13, off, s[24:27], 0 offset:64 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v14, off, s[24:27], 0 offset:68 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v15, off, s[24:27], 0 offset:72 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 s[2:3], s[0:1]
; NOOPT-NEXT:    v_writelane_b32 v18, s2, 6
; NOOPT-NEXT:    v_writelane_b32 v18, s3, 7
; NOOPT-NEXT:    s_or_saveexec_b64 s[20:21], -1
; NOOPT-NEXT:    buffer_store_dword v18, off, s[24:27], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[20:21]
; NOOPT-NEXT:    s_xor_b64 exec, exec, s[0:1]
; NOOPT-NEXT:    s_cbranch_execnz .LBB26_3
; NOOPT-NEXT:  ; %bb.4: ; in Loop: Header=BB26_1 Depth=1
; NOOPT-NEXT:    s_or_saveexec_b64 s[20:21], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v18, off, s[24:27], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[20:21]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v18, 4
; NOOPT-NEXT:    v_readlane_b32 s1, v18, 5
; NOOPT-NEXT:    s_mov_b64 exec, s[0:1]
; NOOPT-NEXT:  ; %bb.5: ; in Loop: Header=BB26_1 Depth=1
; NOOPT-NEXT:    s_or_saveexec_b64 s[20:21], -1
; NOOPT-NEXT:    buffer_load_dword v18, off, s[24:27], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[20:21]
; NOOPT-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:84 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v1, off, s[24:27], 0 offset:88 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v2, off, s[24:27], 0 offset:92 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v3, off, s[24:27], 0 offset:96 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v4, off, s[24:27], 0 offset:100 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v5, off, s[24:27], 0 offset:104 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v6, off, s[24:27], 0 offset:108 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v7, off, s[24:27], 0 offset:112 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v8, off, s[24:27], 0 offset:116 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v9, off, s[24:27], 0 offset:120 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v10, off, s[24:27], 0 offset:124 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v11, off, s[24:27], 0 offset:128 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v12, off, s[24:27], 0 offset:132 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v13, off, s[24:27], 0 offset:136 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v14, off, s[24:27], 0 offset:140 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v15, off, s[24:27], 0 offset:144 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 s[0:1], 0
; NOOPT-NEXT:    s_waitcnt vmcnt(14)
; NOOPT-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:8 ; 4-byte Folded Spill
; NOOPT-NEXT:    v_writelane_b32 v18, s0, 2
; NOOPT-NEXT:    v_writelane_b32 v18, s1, 3
; NOOPT-NEXT:    s_or_saveexec_b64 s[20:21], -1
; NOOPT-NEXT:    buffer_store_dword v18, off, s[24:27], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[20:21]
; NOOPT-NEXT:  .LBB26_6: ; %Flow
; NOOPT-NEXT:    ; in Loop: Header=BB26_1 Depth=1
; NOOPT-NEXT:    s_or_saveexec_b64 s[20:21], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v18, off, s[24:27], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[20:21]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v18, 2
; NOOPT-NEXT:    v_readlane_b32 s1, v18, 3
; NOOPT-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:8 ; 4-byte Folded Reload
; NOOPT-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; NOOPT-NEXT:    s_mov_b32 s0, 1
; NOOPT-NEXT:    ; implicit-def: $sgpr1
; NOOPT-NEXT:    v_cmp_ne_u32_e64 s[0:1], v1, s0
; NOOPT-NEXT:    s_and_b64 vcc, exec, s[0:1]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_cbranch_vccnz .LBB26_1
; NOOPT-NEXT:  ; %bb.7: ; %bb8
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: broken_phi_bb:
; SI-MOVREL:       ; %bb.0: ; %bb
; SI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-MOVREL-NEXT:    v_mov_b32_e32 v0, 8
; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
; SI-MOVREL-NEXT:    s_branch .LBB26_2
; SI-MOVREL-NEXT:  .LBB26_1:
; SI-MOVREL-NEXT:    ; implicit-def: $vgpr0
; SI-MOVREL-NEXT:    s_branch .LBB26_6
; SI-MOVREL-NEXT:  .LBB26_2: ; %bb2
; SI-MOVREL-NEXT:    ; =>This Loop Header: Depth=1
; SI-MOVREL-NEXT:    ; Child Loop BB26_4 Depth 2
; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT:    v_cmp_le_i32_e32 vcc, s0, v0
; SI-MOVREL-NEXT:    s_cbranch_vccnz .LBB26_1
; SI-MOVREL-NEXT:  ; %bb.3: ; %bb4
; SI-MOVREL-NEXT:    ; in Loop: Header=BB26_2 Depth=1
; SI-MOVREL-NEXT:    buffer_load_dword v16, off, s[0:3], 0 glc
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    v_mov_b32_e32 v17, s1
; SI-MOVREL-NEXT:    s_mov_b64 s[4:5], exec
; SI-MOVREL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-MOVREL-NEXT:  .LBB26_4: ; Parent Loop BB26_2 Depth=1
; SI-MOVREL-NEXT:    ; => This Inner Loop Header: Depth=2
; SI-MOVREL-NEXT:    v_readfirstlane_b32 s6, v16
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v16
; SI-MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
; SI-MOVREL-NEXT:    s_mov_b32 m0, s6
; SI-MOVREL-NEXT:    v_movreld_b32_e32 v0, v17
; SI-MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
; SI-MOVREL-NEXT:    s_cbranch_execnz .LBB26_4
; SI-MOVREL-NEXT:  ; %bb.5: ; in Loop: Header=BB26_2 Depth=1
; SI-MOVREL-NEXT:    s_mov_b64 exec, s[4:5]
; SI-MOVREL-NEXT:    s_cbranch_execnz .LBB26_2
; SI-MOVREL-NEXT:  .LBB26_6: ; %bb8
; SI-MOVREL-NEXT:    s_endpgm
;
; VI-MOVREL-LABEL: broken_phi_bb:
; VI-MOVREL:       ; %bb.0: ; %bb
; VI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, 8
; VI-MOVREL-NEXT:    s_branch .LBB26_2
; VI-MOVREL-NEXT:  .LBB26_1:
; VI-MOVREL-NEXT:    ; implicit-def: $vgpr0
; VI-MOVREL-NEXT:    s_branch .LBB26_6
; VI-MOVREL-NEXT:  .LBB26_2: ; %bb2
; VI-MOVREL-NEXT:    ; =>This Loop Header: Depth=1
; VI-MOVREL-NEXT:    ; Child Loop BB26_4 Depth 2
; VI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
; VI-MOVREL-NEXT:    v_cmp_le_i32_e32 vcc, s0, v0
; VI-MOVREL-NEXT:    s_cbranch_vccnz .LBB26_1
; VI-MOVREL-NEXT:  ; %bb.3: ; %bb4
; VI-MOVREL-NEXT:    ; in Loop: Header=BB26_2 Depth=1
; VI-MOVREL-NEXT:    flat_load_dword v16, v[0:1] glc
; VI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT:    v_mov_b32_e32 v17, s1
; VI-MOVREL-NEXT:    s_mov_b64 s[2:3], exec
; VI-MOVREL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-MOVREL-NEXT:  .LBB26_4: ; Parent Loop BB26_2 Depth=1
; VI-MOVREL-NEXT:    ; => This Inner Loop Header: Depth=2
; VI-MOVREL-NEXT:    v_readfirstlane_b32 s4, v16
; VI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v16
; VI-MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
; VI-MOVREL-NEXT:    s_mov_b32 m0, s4
; VI-MOVREL-NEXT:    v_movreld_b32_e32 v0, v17
; VI-MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
; VI-MOVREL-NEXT:    s_cbranch_execnz .LBB26_4
; VI-MOVREL-NEXT:  ; %bb.5: ; in Loop: Header=BB26_2 Depth=1
; VI-MOVREL-NEXT:    s_mov_b64 exec, s[2:3]
; VI-MOVREL-NEXT:    s_cbranch_execnz .LBB26_2
; VI-MOVREL-NEXT:  .LBB26_6: ; %bb8
; VI-MOVREL-NEXT:    s_endpgm
;
; VI-IDXMODE-LABEL: broken_phi_bb:
; VI-IDXMODE:       ; %bb.0: ; %bb
; VI-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, 8
; VI-IDXMODE-NEXT:    s_branch .LBB26_2
; VI-IDXMODE-NEXT:  .LBB26_1:
; VI-IDXMODE-NEXT:    ; implicit-def: $vgpr0
; VI-IDXMODE-NEXT:    s_branch .LBB26_6
; VI-IDXMODE-NEXT:  .LBB26_2: ; %bb2
; VI-IDXMODE-NEXT:    ; =>This Loop Header: Depth=1
; VI-IDXMODE-NEXT:    ; Child Loop BB26_4 Depth 2
; VI-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; VI-IDXMODE-NEXT:    v_cmp_le_i32_e32 vcc, s0, v0
; VI-IDXMODE-NEXT:    s_cbranch_vccnz .LBB26_1
; VI-IDXMODE-NEXT:  ; %bb.3: ; %bb4
; VI-IDXMODE-NEXT:    ; in Loop: Header=BB26_2 Depth=1
; VI-IDXMODE-NEXT:    flat_load_dword v16, v[0:1] glc
; VI-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v17, s1
; VI-IDXMODE-NEXT:    s_mov_b64 s[2:3], exec
; VI-IDXMODE-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-IDXMODE-NEXT:  .LBB26_4: ; Parent Loop BB26_2 Depth=1
; VI-IDXMODE-NEXT:    ; => This Inner Loop Header: Depth=2
; VI-IDXMODE-NEXT:    v_readfirstlane_b32 s4, v16
; VI-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v16
; VI-IDXMODE-NEXT:    s_and_saveexec_b64 vcc, vcc
; VI-IDXMODE-NEXT:    s_set_gpr_idx_on s4, gpr_idx(DST)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, v17
; VI-IDXMODE-NEXT:    s_set_gpr_idx_off
; VI-IDXMODE-NEXT:    s_xor_b64 exec, exec, vcc
; VI-IDXMODE-NEXT:    s_cbranch_execnz .LBB26_4
; VI-IDXMODE-NEXT:  ; %bb.5: ; in Loop: Header=BB26_2 Depth=1
; VI-IDXMODE-NEXT:    s_mov_b64 exec, s[2:3]
; VI-IDXMODE-NEXT:    s_cbranch_execnz .LBB26_2
; VI-IDXMODE-NEXT:  .LBB26_6: ; %bb8
; VI-IDXMODE-NEXT:    s_endpgm
;
; GFX9-IDXMODE-LABEL: broken_phi_bb:
; GFX9-IDXMODE:       ; %bb.0: ; %bb
; GFX9-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, 8
; GFX9-IDXMODE-NEXT:    s_branch .LBB26_2
; GFX9-IDXMODE-NEXT:  .LBB26_1:
; GFX9-IDXMODE-NEXT:    ; implicit-def: $vgpr0
; GFX9-IDXMODE-NEXT:    s_branch .LBB26_6
; GFX9-IDXMODE-NEXT:  .LBB26_2: ; %bb2
; GFX9-IDXMODE-NEXT:    ; =>This Loop Header: Depth=1
; GFX9-IDXMODE-NEXT:    ; Child Loop BB26_4 Depth 2
; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT:    v_cmp_le_i32_e32 vcc, s0, v0
; GFX9-IDXMODE-NEXT:    s_cbranch_vccnz .LBB26_1
; GFX9-IDXMODE-NEXT:  ; %bb.3: ; %bb4
; GFX9-IDXMODE-NEXT:    ; in Loop: Header=BB26_2 Depth=1
; GFX9-IDXMODE-NEXT:    global_load_dword v16, v[0:1], off glc
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v17, s1
; GFX9-IDXMODE-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-IDXMODE-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-IDXMODE-NEXT:  .LBB26_4: ; Parent Loop BB26_2 Depth=1
; GFX9-IDXMODE-NEXT:    ; => This Inner Loop Header: Depth=2
; GFX9-IDXMODE-NEXT:    v_readfirstlane_b32 s4, v16
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v16
; GFX9-IDXMODE-NEXT:    s_and_saveexec_b64 vcc, vcc
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_on s4, gpr_idx(DST)
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, v17
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_off
; GFX9-IDXMODE-NEXT:    s_xor_b64 exec, exec, vcc
; GFX9-IDXMODE-NEXT:    s_cbranch_execnz .LBB26_4
; GFX9-IDXMODE-NEXT:  ; %bb.5: ; in Loop: Header=BB26_2 Depth=1
; GFX9-IDXMODE-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-IDXMODE-NEXT:    s_cbranch_execnz .LBB26_2
; GFX9-IDXMODE-NEXT:  .LBB26_6: ; %bb8
; GFX9-IDXMODE-NEXT:    s_endpgm
bb:
  br label %bb2

bb2:
  %tmp = phi i32 [ 8, %bb ], [ %tmp7, %bb4 ]
  %tmp3 = icmp slt i32 %tmp, %arg
  br i1 %tmp3, label %bb4, label %bb8

bb4:
  %vgpr = load volatile i32, ptr addrspace(1) undef
  %tmp5 = insertelement <16 x i32> undef, i32 undef, i32 %vgpr
  %tmp6 = insertelement <16 x i32> %tmp5, i32 %arg1, i32 %vgpr
  %tmp7 = extractelement <16 x i32> %tmp6, i32 0
  br label %bb2

bb8:
  ret void
}

; Dynamic insertelement whose index is formed with `or disjoint %idx, 1`.
; %idx comes from a raw buffer load; %val is inserted into an all-zero
; <16 x i32> at that index and the whole vector is stored to %out. %in is not
; referenced by the body.
; In the expected output below, the movrel and gpr-index-mode runs write the
; element through base register v6 (v1 at -O0) while the vector itself starts
; one register lower (v5 / v0), and the loop index is the raw loaded value, so
; the constant 1 ends up folded into the base register instead of being OR'ed
; into the index. The default-target run keeps the s_or_b32 and expands the
; insert into one compare/select per element.
define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace(4) %in, i32 %val, <4 x i32> inreg %desc, i32 inreg %A) {
; GENERIC-LABEL: insert_or_disj_index:
; GENERIC:       ; %bb.0: ; %entry
; GENERIC-NEXT:    v_mov_b32_e32 v2, s4
; GENERIC-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; GENERIC-NEXT:    s_mov_b32 s2, 0
; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
; GENERIC-NEXT:    s_mov_b32 s0, s2
; GENERIC-NEXT:    s_mov_b32 s1, s2
; GENERIC-NEXT:    s_waitcnt vmcnt(0)
; GENERIC-NEXT:    v_readfirstlane_b32 s4, v2
; GENERIC-NEXT:    s_or_b32 s4, s4, 1
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 3
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v8, 0, v4, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 2
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v7, 0, v4, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 1
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v6, 0, v4, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 0
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 7
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v12, 0, v4, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 6
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v11, 0, v4, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 5
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v10, 0, v4, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 4
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v9, 0, v4, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 11
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v13, 0, v4, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 10
; GENERIC-NEXT:    buffer_store_dwordx4 v[9:12], v[0:1], s[0:3], 0 addr64 offset:16
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_waitcnt expcnt(0)
; GENERIC-NEXT:    v_cndmask_b32_e32 v12, 0, v4, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 9
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v11, 0, v4, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 8
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v10, 0, v4, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 15
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v14, 0, v4, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 14
; GENERIC-NEXT:    buffer_store_dwordx4 v[10:13], v[0:1], s[0:3], 0 addr64 offset:32
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    s_waitcnt expcnt(0)
; GENERIC-NEXT:    v_cndmask_b32_e32 v13, 0, v4, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 13
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v12, 0, v4, vcc
; GENERIC-NEXT:    s_cmp_eq_u32 s4, 12
; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
; GENERIC-NEXT:    v_cndmask_b32_e32 v11, 0, v4, vcc
; GENERIC-NEXT:    buffer_store_dwordx4 v[11:14], v[0:1], s[0:3], 0 addr64 offset:48
; GENERIC-NEXT:    buffer_store_dwordx4 v[5:8], v[0:1], s[0:3], 0 addr64
; GENERIC-NEXT:    s_endpgm
;
; NOOPT-LABEL: insert_or_disj_index:
; NOOPT:       ; %bb.0: ; %entry
; NOOPT-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
; NOOPT-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
; NOOPT-NEXT:    s_mov_b32 s18, -1
; NOOPT-NEXT:    s_mov_b32 s19, 0xe8f000
; NOOPT-NEXT:    s_add_u32 s16, s16, s5
; NOOPT-NEXT:    s_addc_u32 s17, s17, 0
; NOOPT-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
; NOOPT-NEXT:    v_writelane_b32 v33, s4, 0
; NOOPT-NEXT:    s_mov_b32 s4, s1
; NOOPT-NEXT:    v_readlane_b32 s1, v33, 0
; NOOPT-NEXT:    v_writelane_b32 v33, s4, 1
; NOOPT-NEXT:    s_mov_b32 s4, s0
; NOOPT-NEXT:    v_readlane_b32 s0, v33, 1
; NOOPT-NEXT:    buffer_store_dword v4, off, s[16:19], 0 offset:144 ; 4-byte Folded Spill
; NOOPT-NEXT:    v_mov_b32_e32 v2, v1
; NOOPT-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; NOOPT-NEXT:    s_mov_b32 s5, s0
; NOOPT-NEXT:    s_mov_b32 s6, s2
; NOOPT-NEXT:    s_mov_b32 s7, s3
; NOOPT-NEXT:    ; implicit-def: $sgpr0
; NOOPT-NEXT:    ; implicit-def: $sgpr0
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v2
; NOOPT-NEXT:    buffer_store_dword v0, off, s[16:19], 0 offset:136 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v1, off, s[16:19], 0 offset:140 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b32 s8, 0xf000
; NOOPT-NEXT:    s_mov_b32 s0, 0
; NOOPT-NEXT:    v_writelane_b32 v33, s0, 2
; NOOPT-NEXT:    s_mov_b32 s2, s0
; NOOPT-NEXT:    s_mov_b32 s3, s8
; NOOPT-NEXT:    s_mov_b32 s8, s0
; NOOPT-NEXT:    s_mov_b32 s9, s0
; NOOPT-NEXT:    ; kill: def $sgpr8_sgpr9 killed $sgpr8_sgpr9 def $sgpr8_sgpr9_sgpr10_sgpr11
; NOOPT-NEXT:    s_mov_b64 s[10:11], s[2:3]
; NOOPT-NEXT:    v_writelane_b32 v33, s8, 3
; NOOPT-NEXT:    v_writelane_b32 v33, s9, 4
; NOOPT-NEXT:    v_writelane_b32 v33, s10, 5
; NOOPT-NEXT:    v_writelane_b32 v33, s11, 6
; NOOPT-NEXT:    ; kill: def $sgpr8_sgpr9_sgpr10_sgpr11 killed $sgpr4_sgpr5_sgpr6_sgpr7
; NOOPT-NEXT:    ; implicit-def: $sgpr2_sgpr3
; NOOPT-NEXT:    s_waitcnt expcnt(1)
; NOOPT-NEXT:    v_mov_b32_e32 v0, s1
; NOOPT-NEXT:    buffer_load_dword v0, v0, s[4:7], s0 offen
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    buffer_store_dword v0, off, s[16:19], 0 offset:132 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v0, s0
; NOOPT-NEXT:    v_mov_b32_e32 v30, s0
; NOOPT-NEXT:    v_mov_b32_e32 v29, s0
; NOOPT-NEXT:    v_mov_b32_e32 v28, s0
; NOOPT-NEXT:    v_mov_b32_e32 v27, s0
; NOOPT-NEXT:    v_mov_b32_e32 v26, s0
; NOOPT-NEXT:    v_mov_b32_e32 v25, s0
; NOOPT-NEXT:    v_mov_b32_e32 v24, s0
; NOOPT-NEXT:    v_mov_b32_e32 v23, s0
; NOOPT-NEXT:    v_mov_b32_e32 v22, s0
; NOOPT-NEXT:    v_mov_b32_e32 v21, s0
; NOOPT-NEXT:    v_mov_b32_e32 v20, s0
; NOOPT-NEXT:    v_mov_b32_e32 v19, s0
; NOOPT-NEXT:    v_mov_b32_e32 v18, s0
; NOOPT-NEXT:    v_mov_b32_e32 v17, s0
; NOOPT-NEXT:    v_mov_b32_e32 v16, s0
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v30
; NOOPT-NEXT:    v_mov_b32_e32 v2, v29
; NOOPT-NEXT:    v_mov_b32_e32 v3, v28
; NOOPT-NEXT:    v_mov_b32_e32 v4, v27
; NOOPT-NEXT:    v_mov_b32_e32 v5, v26
; NOOPT-NEXT:    v_mov_b32_e32 v6, v25
; NOOPT-NEXT:    v_mov_b32_e32 v7, v24
; NOOPT-NEXT:    v_mov_b32_e32 v8, v23
; NOOPT-NEXT:    v_mov_b32_e32 v9, v22
; NOOPT-NEXT:    v_mov_b32_e32 v10, v21
; NOOPT-NEXT:    v_mov_b32_e32 v11, v20
; NOOPT-NEXT:    v_mov_b32_e32 v12, v19
; NOOPT-NEXT:    v_mov_b32_e32 v13, v18
; NOOPT-NEXT:    v_mov_b32_e32 v14, v17
; NOOPT-NEXT:    v_mov_b32_e32 v15, v16
; NOOPT-NEXT:    buffer_store_dword v0, off, s[16:19], 0 offset:68 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v1, off, s[16:19], 0 offset:72 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v2, off, s[16:19], 0 offset:76 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v3, off, s[16:19], 0 offset:80 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v4, off, s[16:19], 0 offset:84 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v5, off, s[16:19], 0 offset:88 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v6, off, s[16:19], 0 offset:92 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v7, off, s[16:19], 0 offset:96 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v8, off, s[16:19], 0 offset:100 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v9, off, s[16:19], 0 offset:104 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v10, off, s[16:19], 0 offset:108 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v11, off, s[16:19], 0 offset:112 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v12, off, s[16:19], 0 offset:116 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v13, off, s[16:19], 0 offset:120 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v14, off, s[16:19], 0 offset:124 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v15, off, s[16:19], 0 offset:128 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 s[0:1], exec
; NOOPT-NEXT:    v_writelane_b32 v33, s0, 7
; NOOPT-NEXT:    v_writelane_b32 v33, s1, 8
; NOOPT-NEXT:    s_or_saveexec_b64 s[12:13], -1
; NOOPT-NEXT:    buffer_store_dword v33, off, s[16:19], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[12:13]
; NOOPT-NEXT:    buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v1, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v2, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v3, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v4, off, s[16:19], 0 offset:20 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v5, off, s[16:19], 0 offset:24 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v6, off, s[16:19], 0 offset:28 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v7, off, s[16:19], 0 offset:32 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v8, off, s[16:19], 0 offset:36 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v9, off, s[16:19], 0 offset:40 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v10, off, s[16:19], 0 offset:44 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v11, off, s[16:19], 0 offset:48 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v12, off, s[16:19], 0 offset:52 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v13, off, s[16:19], 0 offset:56 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v14, off, s[16:19], 0 offset:60 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v15, off, s[16:19], 0 offset:64 ; 4-byte Folded Spill
; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT:  .LBB27_1: ; =>This Inner Loop Header: Depth=1
; NOOPT-NEXT:    s_or_saveexec_b64 s[12:13], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v33, off, s[16:19], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[12:13]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v33, 9
; NOOPT-NEXT:    v_readlane_b32 s1, v33, 10
; NOOPT-NEXT:    buffer_load_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v1, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v2, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v3, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v4, off, s[16:19], 0 offset:20 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v5, off, s[16:19], 0 offset:24 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v6, off, s[16:19], 0 offset:28 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v7, off, s[16:19], 0 offset:32 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v8, off, s[16:19], 0 offset:36 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(6)
; NOOPT-NEXT:    buffer_load_dword v9, off, s[16:19], 0 offset:40 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(5)
; NOOPT-NEXT:    buffer_load_dword v10, off, s[16:19], 0 offset:44 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(4)
; NOOPT-NEXT:    buffer_load_dword v11, off, s[16:19], 0 offset:48 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(3)
; NOOPT-NEXT:    buffer_load_dword v12, off, s[16:19], 0 offset:52 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(2)
; NOOPT-NEXT:    buffer_load_dword v13, off, s[16:19], 0 offset:56 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(1)
; NOOPT-NEXT:    buffer_load_dword v14, off, s[16:19], 0 offset:60 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v15, off, s[16:19], 0 offset:64 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v16, off, s[16:19], 0 offset:144 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v17, off, s[16:19], 0 offset:132 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readfirstlane_b32 s2, v17
; NOOPT-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v17
; NOOPT-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
; NOOPT-NEXT:    s_mov_b32 m0, s2
; NOOPT-NEXT:    v_movreld_b32_e32 v1, v16
; NOOPT-NEXT:    buffer_store_dword v0, off, s[16:19], 0 offset:148 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v1, off, s[16:19], 0 offset:152 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v2, off, s[16:19], 0 offset:156 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v3, off, s[16:19], 0 offset:160 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v4, off, s[16:19], 0 offset:164 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v5, off, s[16:19], 0 offset:168 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v6, off, s[16:19], 0 offset:172 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v7, off, s[16:19], 0 offset:176 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v8, off, s[16:19], 0 offset:180 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v9, off, s[16:19], 0 offset:184 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v10, off, s[16:19], 0 offset:188 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v11, off, s[16:19], 0 offset:192 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v12, off, s[16:19], 0 offset:196 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v13, off, s[16:19], 0 offset:200 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v14, off, s[16:19], 0 offset:204 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v15, off, s[16:19], 0 offset:208 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v1, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v2, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v3, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v4, off, s[16:19], 0 offset:20 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v5, off, s[16:19], 0 offset:24 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v6, off, s[16:19], 0 offset:28 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v7, off, s[16:19], 0 offset:32 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v8, off, s[16:19], 0 offset:36 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v9, off, s[16:19], 0 offset:40 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v10, off, s[16:19], 0 offset:44 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v11, off, s[16:19], 0 offset:48 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v12, off, s[16:19], 0 offset:52 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v13, off, s[16:19], 0 offset:56 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v14, off, s[16:19], 0 offset:60 ; 4-byte Folded Spill
; NOOPT-NEXT:    buffer_store_dword v15, off, s[16:19], 0 offset:64 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 s[2:3], s[0:1]
; NOOPT-NEXT:    v_writelane_b32 v33, s2, 9
; NOOPT-NEXT:    v_writelane_b32 v33, s3, 10
; NOOPT-NEXT:    s_or_saveexec_b64 s[12:13], -1
; NOOPT-NEXT:    buffer_store_dword v33, off, s[16:19], 0 ; 4-byte Folded Spill
; NOOPT-NEXT:    s_mov_b64 exec, s[12:13]
; NOOPT-NEXT:    s_xor_b64 exec, exec, s[0:1]
; NOOPT-NEXT:    s_cbranch_execnz .LBB27_1
; NOOPT-NEXT:  ; %bb.2:
; NOOPT-NEXT:    s_or_saveexec_b64 s[12:13], -1
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    buffer_load_dword v33, off, s[16:19], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[12:13]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v33, 7
; NOOPT-NEXT:    v_readlane_b32 s1, v33, 8
; NOOPT-NEXT:    s_mov_b64 exec, s[0:1]
; NOOPT-NEXT:  ; %bb.3:
; NOOPT-NEXT:    s_or_saveexec_b64 s[12:13], -1
; NOOPT-NEXT:    buffer_load_dword v33, off, s[16:19], 0 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_mov_b64 exec, s[12:13]
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_readlane_b32 s0, v33, 3
; NOOPT-NEXT:    v_readlane_b32 s1, v33, 4
; NOOPT-NEXT:    v_readlane_b32 s2, v33, 5
; NOOPT-NEXT:    v_readlane_b32 s3, v33, 6
; NOOPT-NEXT:    buffer_load_dword v4, off, s[16:19], 0 offset:136 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v5, off, s[16:19], 0 offset:140 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v17, off, s[16:19], 0 offset:148 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v18, off, s[16:19], 0 offset:152 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v19, off, s[16:19], 0 offset:156 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v20, off, s[16:19], 0 offset:160 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v21, off, s[16:19], 0 offset:164 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v22, off, s[16:19], 0 offset:168 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v23, off, s[16:19], 0 offset:172 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v24, off, s[16:19], 0 offset:176 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v25, off, s[16:19], 0 offset:180 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v26, off, s[16:19], 0 offset:184 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v27, off, s[16:19], 0 offset:188 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v28, off, s[16:19], 0 offset:192 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v29, off, s[16:19], 0 offset:196 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v30, off, s[16:19], 0 offset:200 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v31, off, s[16:19], 0 offset:204 ; 4-byte Folded Reload
; NOOPT-NEXT:    buffer_load_dword v32, off, s[16:19], 0 offset:208 ; 4-byte Folded Reload
; NOOPT-NEXT:    s_waitcnt vmcnt(12)
; NOOPT-NEXT:    v_mov_b32_e32 v6, v20
; NOOPT-NEXT:    v_mov_b32_e32 v7, v19
; NOOPT-NEXT:    v_mov_b32_e32 v8, v18
; NOOPT-NEXT:    v_mov_b32_e32 v0, v17
; NOOPT-NEXT:    s_waitcnt vmcnt(8)
; NOOPT-NEXT:    v_mov_b32_e32 v1, v24
; NOOPT-NEXT:    v_mov_b32_e32 v2, v23
; NOOPT-NEXT:    v_mov_b32_e32 v3, v22
; NOOPT-NEXT:    v_mov_b32_e32 v9, v21
; NOOPT-NEXT:    s_waitcnt vmcnt(4)
; NOOPT-NEXT:    v_mov_b32_e32 v14, v28
; NOOPT-NEXT:    v_mov_b32_e32 v15, v27
; NOOPT-NEXT:    v_mov_b32_e32 v16, v26
; NOOPT-NEXT:    v_mov_b32_e32 v10, v25
; NOOPT-NEXT:    s_waitcnt vmcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v11, v32
; NOOPT-NEXT:    v_mov_b32_e32 v12, v31
; NOOPT-NEXT:    v_mov_b32_e32 v13, v30
; NOOPT-NEXT:    v_mov_b32_e32 v17, v29
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18_vgpr19_vgpr20 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v18, v13
; NOOPT-NEXT:    v_mov_b32_e32 v19, v12
; NOOPT-NEXT:    v_mov_b32_e32 v20, v11
; NOOPT-NEXT:    v_mov_b32_e32 v12, v5
; NOOPT-NEXT:    v_mov_b32_e32 v11, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[17:20], v[11:12], s[0:3], 0 addr64 offset:48
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11_vgpr12_vgpr13 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v11, v16
; NOOPT-NEXT:    v_mov_b32_e32 v12, v15
; NOOPT-NEXT:    v_mov_b32_e32 v13, v14
; NOOPT-NEXT:    v_mov_b32_e32 v15, v5
; NOOPT-NEXT:    v_mov_b32_e32 v14, v4
; NOOPT-NEXT:    buffer_store_dwordx4 v[10:13], v[14:15], s[0:3], 0 addr64 offset:32
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10_vgpr11_vgpr12 killed $exec
; NOOPT-NEXT:    s_waitcnt expcnt(0)
; NOOPT-NEXT:    v_mov_b32_e32 v10, v3
; NOOPT-NEXT:    v_mov_b32_e32 v11, v2
; NOOPT-NEXT:    v_mov_b32_e32 v12, v1
; NOOPT-NEXT:    v_mov_b32_e32 v1, v4
; NOOPT-NEXT:    v_mov_b32_e32 v2, v5
; NOOPT-NEXT:    buffer_store_dwordx4 v[9:12], v[1:2], s[0:3], 0 addr64 offset:16
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; implicit-def: $sgpr4
; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT:    v_mov_b32_e32 v1, v8
; NOOPT-NEXT:    v_mov_b32_e32 v2, v7
; NOOPT-NEXT:    v_mov_b32_e32 v3, v6
; NOOPT-NEXT:    buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64
; NOOPT-NEXT:    s_endpgm
;
; SI-MOVREL-LABEL: insert_or_disj_index:
; SI-MOVREL:       ; %bb.0: ; %entry
; SI-MOVREL-NEXT:    v_mov_b32_e32 v2, s4
; SI-MOVREL-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; SI-MOVREL-NEXT:    s_mov_b32 s2, 0
; SI-MOVREL-NEXT:    v_mov_b32_e32 v5, 0
; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT:    s_mov_b32 s0, s2
; SI-MOVREL-NEXT:    s_mov_b32 s1, s2
; SI-MOVREL-NEXT:    v_mov_b32_e32 v6, v5
; SI-MOVREL-NEXT:    v_mov_b32_e32 v7, v5
; SI-MOVREL-NEXT:    v_mov_b32_e32 v8, v5
; SI-MOVREL-NEXT:    v_mov_b32_e32 v9, v5
; SI-MOVREL-NEXT:    v_mov_b32_e32 v10, v5
; SI-MOVREL-NEXT:    v_mov_b32_e32 v11, v5
; SI-MOVREL-NEXT:    v_mov_b32_e32 v12, v5
; SI-MOVREL-NEXT:    v_mov_b32_e32 v13, v5
; SI-MOVREL-NEXT:    v_mov_b32_e32 v14, v5
; SI-MOVREL-NEXT:    v_mov_b32_e32 v15, v5
; SI-MOVREL-NEXT:    v_mov_b32_e32 v16, v5
; SI-MOVREL-NEXT:    v_mov_b32_e32 v17, v5
; SI-MOVREL-NEXT:    v_mov_b32_e32 v18, v5
; SI-MOVREL-NEXT:    v_mov_b32_e32 v19, v5
; SI-MOVREL-NEXT:    v_mov_b32_e32 v20, v5
; SI-MOVREL-NEXT:    s_mov_b64 s[4:5], exec
; SI-MOVREL-NEXT:  .LBB27_1: ; =>This Inner Loop Header: Depth=1
; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT:    v_readfirstlane_b32 s6, v2
; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v2
; SI-MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
; SI-MOVREL-NEXT:    s_mov_b32 m0, s6
; SI-MOVREL-NEXT:    v_movreld_b32_e32 v6, v4
; SI-MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
; SI-MOVREL-NEXT:    s_cbranch_execnz .LBB27_1
; SI-MOVREL-NEXT:  ; %bb.2:
; SI-MOVREL-NEXT:    s_mov_b64 exec, s[4:5]
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[17:20], v[0:1], s[0:3], 0 addr64 offset:48
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[13:16], v[0:1], s[0:3], 0 addr64 offset:32
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[9:12], v[0:1], s[0:3], 0 addr64 offset:16
; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[5:8], v[0:1], s[0:3], 0 addr64
; SI-MOVREL-NEXT:    s_endpgm
;
; VI-MOVREL-LABEL: insert_or_disj_index:
; VI-MOVREL:       ; %bb.0: ; %entry
; VI-MOVREL-NEXT:    v_mov_b32_e32 v2, s4
; VI-MOVREL-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, 0
; VI-MOVREL-NEXT:    v_mov_b32_e32 v6, v5
; VI-MOVREL-NEXT:    v_mov_b32_e32 v7, v5
; VI-MOVREL-NEXT:    v_mov_b32_e32 v8, v5
; VI-MOVREL-NEXT:    v_mov_b32_e32 v9, v5
; VI-MOVREL-NEXT:    v_mov_b32_e32 v10, v5
; VI-MOVREL-NEXT:    v_mov_b32_e32 v11, v5
; VI-MOVREL-NEXT:    v_mov_b32_e32 v12, v5
; VI-MOVREL-NEXT:    v_mov_b32_e32 v13, v5
; VI-MOVREL-NEXT:    v_mov_b32_e32 v14, v5
; VI-MOVREL-NEXT:    v_mov_b32_e32 v15, v5
; VI-MOVREL-NEXT:    v_mov_b32_e32 v16, v5
; VI-MOVREL-NEXT:    v_mov_b32_e32 v17, v5
; VI-MOVREL-NEXT:    v_mov_b32_e32 v18, v5
; VI-MOVREL-NEXT:    v_mov_b32_e32 v19, v5
; VI-MOVREL-NEXT:    v_mov_b32_e32 v20, v5
; VI-MOVREL-NEXT:    s_mov_b64 s[0:1], exec
; VI-MOVREL-NEXT:  .LBB27_1: ; =>This Inner Loop Header: Depth=1
; VI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT:    v_readfirstlane_b32 s2, v2
; VI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v2
; VI-MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
; VI-MOVREL-NEXT:    s_mov_b32 m0, s2
; VI-MOVREL-NEXT:    v_movreld_b32_e32 v6, v4
; VI-MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
; VI-MOVREL-NEXT:    s_cbranch_execnz .LBB27_1
; VI-MOVREL-NEXT:  ; %bb.2:
; VI-MOVREL-NEXT:    s_mov_b64 exec, s[0:1]
; VI-MOVREL-NEXT:    v_add_u32_e32 v2, vcc, 48, v0
; VI-MOVREL-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[2:3], v[17:20]
; VI-MOVREL-NEXT:    v_add_u32_e32 v2, vcc, 32, v0
; VI-MOVREL-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[2:3], v[13:16]
; VI-MOVREL-NEXT:    v_add_u32_e32 v2, vcc, 16, v0
; VI-MOVREL-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[2:3], v[9:12]
; VI-MOVREL-NEXT:    flat_store_dwordx4 v[0:1], v[5:8]
; VI-MOVREL-NEXT:    s_endpgm
;
; VI-IDXMODE-LABEL: insert_or_disj_index:
; VI-IDXMODE:       ; %bb.0: ; %entry
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v2, s4
; VI-IDXMODE-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, 0
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v6, v5
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v7, v5
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v8, v5
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v9, v5
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v10, v5
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v11, v5
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v12, v5
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v13, v5
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v14, v5
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v15, v5
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v16, v5
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v17, v5
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v18, v5
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v19, v5
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v20, v5
; VI-IDXMODE-NEXT:    s_mov_b64 s[0:1], exec
; VI-IDXMODE-NEXT:  .LBB27_1: ; =>This Inner Loop Header: Depth=1
; VI-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; VI-IDXMODE-NEXT:    v_readfirstlane_b32 s2, v2
; VI-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v2
; VI-IDXMODE-NEXT:    s_and_saveexec_b64 vcc, vcc
; VI-IDXMODE-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
; VI-IDXMODE-NEXT:    v_mov_b32_e32 v6, v4
; VI-IDXMODE-NEXT:    s_set_gpr_idx_off
; VI-IDXMODE-NEXT:    s_xor_b64 exec, exec, vcc
; VI-IDXMODE-NEXT:    s_cbranch_execnz .LBB27_1
; VI-IDXMODE-NEXT:  ; %bb.2:
; VI-IDXMODE-NEXT:    s_mov_b64 exec, s[0:1]
; VI-IDXMODE-NEXT:    v_add_u32_e32 v2, vcc, 48, v0
; VI-IDXMODE-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[2:3], v[17:20]
; VI-IDXMODE-NEXT:    v_add_u32_e32 v2, vcc, 32, v0
; VI-IDXMODE-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[2:3], v[13:16]
; VI-IDXMODE-NEXT:    v_add_u32_e32 v2, vcc, 16, v0
; VI-IDXMODE-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[2:3], v[9:12]
; VI-IDXMODE-NEXT:    flat_store_dwordx4 v[0:1], v[5:8]
; VI-IDXMODE-NEXT:    s_endpgm
;
; GFX9-IDXMODE-LABEL: insert_or_disj_index:
; GFX9-IDXMODE:       ; %bb.0: ; %entry
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v2, s4
; GFX9-IDXMODE-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v5, 0
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v6, v5
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v7, v5
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v8, v5
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v9, v5
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v10, v5
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v11, v5
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v12, v5
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v13, v5
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v14, v5
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v15, v5
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v16, v5
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v17, v5
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v18, v5
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v19, v5
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v20, v5
; GFX9-IDXMODE-NEXT:    s_mov_b64 s[0:1], exec
; GFX9-IDXMODE-NEXT:  .LBB27_1: ; =>This Inner Loop Header: Depth=1
; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT:    v_readfirstlane_b32 s2, v2
; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v2
; GFX9-IDXMODE-NEXT:    s_and_saveexec_b64 vcc, vcc
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v6, v4
; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_off
; GFX9-IDXMODE-NEXT:    s_xor_b64 exec, exec, vcc
; GFX9-IDXMODE-NEXT:    s_cbranch_execnz .LBB27_1
; GFX9-IDXMODE-NEXT:  ; %bb.2:
; GFX9-IDXMODE-NEXT:    s_mov_b64 exec, s[0:1]
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v[0:1], v[17:20], off offset:48
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v[0:1], v[13:16], off offset:32
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v[0:1], v[9:12], off offset:16
; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v[0:1], v[5:8], off
; GFX9-IDXMODE-NEXT:    s_endpgm
entry:
  ; Index source: one dword read through buffer descriptor %desc at offset %A.
  %idx = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %A, i32 0, i32 0)
  ; The `disjoint` flag promises the two operands have no set bits in common;
  ; this is the pattern under test as an insertelement index.
  %off = or disjoint i32 %idx, 1
  ; Dynamic-index insert into an all-zero 16-element vector.
  %v = insertelement <16 x i32> zeroinitializer, i32 %val, i32 %off
  ; Store the whole vector so every element is observable in the output.
  store <16 x i32> %v, ptr addrspace(1) %out
  ret void
}