; llvm/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX900 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX90A %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s

; External callee of the kernels in this test. The Itanium-mangled name
; demangles to get_global_id(unsigned int); it takes an i32 and returns an i64.
declare i64 @_Z13get_global_idj(i32) #0

; Straight-line kernel: loads eight i64 values from one base address (%addr1)
; at constant element offsets 0, 256, ..., 1792 (byte offsets 0 through 0x3800
; in 0x800 steps), sums them, and stores the sum at %add.ptr12.
; The expected output below records how each target forms the eight addresses:
; the gfx803 output uses a separate 64-bit add plus flat_load for every
; non-zero offset, while the gfx900/gfx90a, gfx1010 and gfx1100 outputs share
; a few computed bases and place the remainder of many constants in the
; immediate offset field of global_load.
define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1)  %buffer) {
; GFX8-LABEL: clmem_read_simplified:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_mov_b32 s38, -1
; GFX8-NEXT:    s_mov_b32 s39, 0xe80000
; GFX8-NEXT:    s_add_u32 s36, s36, s9
; GFX8-NEXT:    s_addc_u32 s37, s37, 0
; GFX8-NEXT:    s_getpc_b64 s[0:1]
; GFX8-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX8-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX8-NEXT:    s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX8-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT:    v_mov_b32_e32 v31, v0
; GFX8-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    s_mov_b32 s32, 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
; GFX8-NEXT:    v_mov_b32_e32 v2, s35
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s34, v1
; GFX8-NEXT:    v_mov_b32_e32 v3, 3
; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v1, v0
; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x800
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v3
; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v4, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x1000
; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s0, v3
; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x1800
; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v3
; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[11:12], v[3:4]
; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[5:6]
; GFX8-NEXT:    flat_load_dwordx2 v[7:8], v[7:8]
; GFX8-NEXT:    flat_load_dwordx2 v[9:10], v[9:10]
; GFX8-NEXT:    s_movk_i32 s0, 0x2000
; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s0, v3
; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, 0, v4, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x2800
; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s0, v3
; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, 0, v4, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[13:14], v[13:14]
; GFX8-NEXT:    flat_load_dwordx2 v[15:16], v[15:16]
; GFX8-NEXT:    s_movk_i32 s0, 0x3000
; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s0, v3
; GFX8-NEXT:    v_addc_u32_e32 v18, vcc, 0, v4, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[17:18], v[17:18]
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x3800, v3
; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[3:4], v[3:4]
; GFX8-NEXT:    s_waitcnt vmcnt(6)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v11
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v6, v12, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(5)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v7, v0
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v8, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(4)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v9, v0
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v10, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(3)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v13, v0
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v14, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v15, v0
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v16, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v17, v0
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v18, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0
; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
; GFX8-NEXT:    flat_store_dwordx2 v[1:2], v[3:4]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: clmem_read_simplified:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s9
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_getpc_b64 s[0:1]
; GFX9-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX9-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    v_mov_b32_e32 v31, v0
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff8000, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, s35
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v18
; GFX9-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    s_movk_i32 s1, 0x2000
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2048
; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, s1, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[6:7], off offset:-4096
; GFX9-NEXT:    s_movk_i32 s0, 0x1000
; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, s0, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dwordx2 v[12:13], v[10:11], off offset:2048
; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[6:7], off
; GFX9-NEXT:    global_load_dwordx2 v[16:17], v[6:7], off offset:2048
; GFX9-NEXT:    s_movk_i32 s0, 0x3000
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[0:1], off offset:2048
; GFX9-NEXT:    s_waitcnt vmcnt(6)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v3, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(5)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(4)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v12, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v13, v1, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(3)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v14, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v15, v1, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v16, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v17, v1, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v1, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
; GFX9-NEXT:    global_store_dwordx2 v18, v[0:1], s[34:35]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: clmem_read_simplified:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX10-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX10-NEXT:    s_mov_b32 s38, -1
; GFX10-NEXT:    s_mov_b32 s39, 0x31c16000
; GFX10-NEXT:    s_add_u32 s36, s36, s9
; GFX10-NEXT:    s_addc_u32 s37, s37, 0
; GFX10-NEXT:    s_getpc_b64 s[0:1]
; GFX10-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT:    v_mov_b32_e32 v31, v0
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX10-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX10-NEXT:    s_mov_b32 s32, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 3
; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff8000, v1
; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v20
; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0x1000
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v0, 0x2000
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off offset:-2048
; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[2:3], off
; GFX10-NEXT:    global_load_dwordx2 v[12:13], v[8:9], off offset:-2048
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0x3000
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[14:15], v[8:9], off
; GFX10-NEXT:    global_load_dwordx2 v[16:17], v[2:3], off offset:-2048
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
; GFX10-NEXT:    global_load_dwordx2 v[18:19], v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(6)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(5)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v10, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v11, v1, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(4)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v12, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(3)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v14, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v15, v1, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(2)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v16, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v17, v1, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v9, v1, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v18, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v19, v1, vcc_lo
; GFX10-NEXT:    global_store_dwordx2 v20, v[0:1], s[34:35]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: clmem_read_simplified:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_getpc_b64 s[0:1]
; GFX11-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT:    s_load_b64 s[34:35], s[2:3], 0x24
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff8000, v1
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v16
; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, s35, 0, s0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b64 v[2:3], v[0:1], off
; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2048
; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v0, 0x2000
; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, 0x1000, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b64 v[10:11], v[6:7], off offset:-4096
; GFX11-NEXT:    global_load_b64 v[8:9], v[8:9], off offset:2048
; GFX11-NEXT:    v_add_co_u32 v12, vcc_lo, 0x2000, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    global_load_b64 v[6:7], v[6:7], off
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3000, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    global_load_b64 v[12:13], v[12:13], off offset:2048
; GFX11-NEXT:    global_load_b64 v[14:15], v[0:1], off
; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:2048
; GFX11-NEXT:    s_waitcnt vmcnt(6)
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v4, v2
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(5)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v10, v2
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v11, v3, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(4)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v2
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v9, v3, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(3)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v6, v2
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v7, v3, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(2)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v12, v2
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v13, v3, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v14, v2
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v15, v3, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX11-NEXT:    global_store_b64 v16, v[0:1], s[34:35]
; GFX11-NEXT:    s_endpgm
entry:
  ; Split the value returned by the call into an i64 element index (low 8 bits,
  ; %conv) and a byte offset into %buffer ((id << 7) & 0xffff8000, %idx.ext11).
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11

  ; %addr1 is the common base of all eight loads: %add.ptr12 plus %conv elements.
  %addr1 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 %conv
  %load1 = load i64, ptr addrspace(1) %addr1, align 8
  ; Every following load is another 256 i64 elements (2048 bytes) past %addr1,
  ; and each loaded value is added to the running sum.
  %addr2 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 256
  %load2 = load i64, ptr addrspace(1) %addr2, align 8
  %add.1 = add i64 %load2, %load1

  %add.ptr8.2 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 512
  %load3 = load i64, ptr addrspace(1) %add.ptr8.2, align 8
  %add.2 = add i64 %load3, %add.1
  %add.ptr8.3 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 768
  %load4 = load i64, ptr addrspace(1) %add.ptr8.3, align 8
  %add.3 = add i64 %load4, %add.2

  %add.ptr8.4 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1024
  %load5 = load i64, ptr addrspace(1) %add.ptr8.4, align 8
  %add.4 = add i64 %load5, %add.3
  %add.ptr8.5 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1280
  %load6 = load i64, ptr addrspace(1) %add.ptr8.5, align 8
  %add.5 = add i64 %load6, %add.4

  %add.ptr8.6 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1536
  %load7 = load i64, ptr addrspace(1) %add.ptr8.6, align 8
  %add.6 = add i64 %load7, %add.5
  %add.ptr8.7 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1792
  %load8 = load i64, ptr addrspace(1) %add.ptr8.7, align 8
  %add.7 = add i64 %load8, %add.6

  ; The sum is written to %add.ptr12, i.e. the address without the %conv index.
  store i64 %add.7, ptr addrspace(1) %add.ptr12, align 8
  ret void
}

define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
; GFX8-LABEL: clmem_read:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_mov_b32 s38, -1
; GFX8-NEXT:    s_mov_b32 s39, 0xe80000
; GFX8-NEXT:    s_add_u32 s36, s36, s9
; GFX8-NEXT:    s_addc_u32 s37, s37, 0
; GFX8-NEXT:    s_getpc_b64 s[0:1]
; GFX8-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX8-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX8-NEXT:    s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX8-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT:    v_mov_b32_e32 v31, v0
; GFX8-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    s_mov_b32 s32, 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 17, v0
; GFX8-NEXT:    v_and_b32_e32 v6, 0xfe000000, v1
; GFX8-NEXT:    v_mov_b32_e32 v1, 3
; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT:    v_or_b32_e32 v0, v6, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, s35
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s34, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x5000
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    v_mov_b32_e32 v3, 0
; GFX8-NEXT:    s_movk_i32 s0, 0x7f
; GFX8-NEXT:  .LBB1_1: ; %for.cond.preheader
; GFX8-NEXT:    ; =>This Loop Header: Depth=1
; GFX8-NEXT:    ; Child Loop BB1_2 Depth 2
; GFX8-NEXT:    v_mov_b32_e32 v5, v1
; GFX8-NEXT:    v_mov_b32_e32 v4, v0
; GFX8-NEXT:    s_mov_b32 s1, 0
; GFX8-NEXT:  .LBB1_2: ; %for.body
; GFX8-NEXT:    ; Parent Loop BB1_1 Depth=1
; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0xffffb000, v4
; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, -1, v5, vcc
; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0xffffb800, v4
; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, -1, v5, vcc
; GFX8-NEXT:    v_add_u32_e32 v11, vcc, 0xffffc000, v4
; GFX8-NEXT:    flat_load_dwordx2 v[7:8], v[7:8]
; GFX8-NEXT:    flat_load_dwordx2 v[9:10], v[9:10]
; GFX8-NEXT:    v_addc_u32_e32 v12, vcc, -1, v5, vcc
; GFX8-NEXT:    v_add_u32_e32 v13, vcc, 0xffffc800, v4
; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, -1, v5, vcc
; GFX8-NEXT:    v_add_u32_e32 v15, vcc, 0xffffd000, v4
; GFX8-NEXT:    flat_load_dwordx2 v[11:12], v[11:12]
; GFX8-NEXT:    flat_load_dwordx2 v[13:14], v[13:14]
; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, -1, v5, vcc
; GFX8-NEXT:    v_add_u32_e32 v17, vcc, 0xffffd800, v4
; GFX8-NEXT:    v_addc_u32_e32 v18, vcc, -1, v5, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[15:16], v[15:16]
; GFX8-NEXT:    flat_load_dwordx2 v[17:18], v[17:18]
; GFX8-NEXT:    v_add_u32_e32 v19, vcc, 0xffffe000, v4
; GFX8-NEXT:    v_addc_u32_e32 v20, vcc, -1, v5, vcc
; GFX8-NEXT:    v_add_u32_e32 v21, vcc, 0xffffe800, v4
; GFX8-NEXT:    flat_load_dwordx2 v[19:20], v[19:20]
; GFX8-NEXT:    v_addc_u32_e32 v22, vcc, -1, v5, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[21:22], v[21:22]
; GFX8-NEXT:    v_add_u32_e32 v23, vcc, 0xfffff000, v4
; GFX8-NEXT:    v_addc_u32_e32 v24, vcc, -1, v5, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[23:24], v[23:24]
; GFX8-NEXT:    v_add_u32_e32 v25, vcc, 0xfffff800, v4
; GFX8-NEXT:    v_addc_u32_e32 v26, vcc, -1, v5, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[25:26], v[25:26]
; GFX8-NEXT:    flat_load_dwordx2 v[27:28], v[4:5]
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x10000, v4
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; GFX8-NEXT:    s_addk_i32 s1, 0x2000
; GFX8-NEXT:    s_cmp_gt_u32 s1, 0x3fffff
; GFX8-NEXT:    s_waitcnt vmcnt(10)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v7, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v8, v3, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(9)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v9, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v10, v3, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(8)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v11, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v12, v3, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(7)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v13, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v14, v3, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(6)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v15, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v16, v3, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(5)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v17, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v18, v3, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(4)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v19, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v20, v3, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(3)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v21, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v22, v3, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v23, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v24, v3, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v25, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v26, v3, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v27, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v28, v3, vcc
; GFX8-NEXT:    s_cbranch_scc0 .LBB1_2
; GFX8-NEXT:  ; %bb.3: ; %while.cond.loopexit
; GFX8-NEXT:    ; in Loop: Header=BB1_1 Depth=1
; GFX8-NEXT:    s_add_i32 s1, s0, -1
; GFX8-NEXT:    s_cmp_eq_u32 s0, 0
; GFX8-NEXT:    s_cbranch_scc1 .LBB1_5
; GFX8-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
; GFX8-NEXT:    s_mov_b32 s0, s1
; GFX8-NEXT:    s_branch .LBB1_1
; GFX8-NEXT:  .LBB1_5: ; %while.end
; GFX8-NEXT:    v_mov_b32_e32 v1, s35
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s34, v6
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT:    s_endpgm
;
; GFX900-LABEL: clmem_read:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX900-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX900-NEXT:    s_mov_b32 s38, -1
; GFX900-NEXT:    s_mov_b32 s39, 0xe00000
; GFX900-NEXT:    s_add_u32 s36, s36, s9
; GFX900-NEXT:    s_addc_u32 s37, s37, 0
; GFX900-NEXT:    s_getpc_b64 s[0:1]
; GFX900-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX900-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX900-NEXT:    s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX900-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX900-NEXT:    v_mov_b32_e32 v31, v0
; GFX900-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    s_mov_b32 s32, 0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX900-NEXT:    v_and_b32_e32 v1, 0xff, v0
; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 17, v0
; GFX900-NEXT:    v_and_b32_e32 v6, 0xfe000000, v0
; GFX900-NEXT:    v_lshl_or_b32 v0, v1, 3, v6
; GFX900-NEXT:    v_mov_b32_e32 v1, s35
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, s34, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX900-NEXT:    s_movk_i32 s0, 0x5000
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
; GFX900-NEXT:    v_mov_b32_e32 v2, 0
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX900-NEXT:    s_movk_i32 s2, 0x7f
; GFX900-NEXT:    v_mov_b32_e32 v3, 0
; GFX900-NEXT:    s_movk_i32 s0, 0xd000
; GFX900-NEXT:    s_movk_i32 s1, 0xe000
; GFX900-NEXT:    s_movk_i32 s3, 0xf000
; GFX900-NEXT:  .LBB1_1: ; %for.cond.preheader
; GFX900-NEXT:    ; =>This Loop Header: Depth=1
; GFX900-NEXT:    ; Child Loop BB1_2 Depth 2
; GFX900-NEXT:    v_mov_b32_e32 v5, v1
; GFX900-NEXT:    v_mov_b32_e32 v4, v0
; GFX900-NEXT:    s_mov_b32 s4, 0
; GFX900-NEXT:  .LBB1_2: ; %for.body
; GFX900-NEXT:    ; Parent Loop BB1_1 Depth=1
; GFX900-NEXT:    ; => This Inner Loop Header: Depth=2
; GFX900-NEXT:    v_add_co_u32_e32 v7, vcc, 0xffffb000, v4
; GFX900-NEXT:    v_addc_co_u32_e32 v8, vcc, -1, v5, vcc
; GFX900-NEXT:    global_load_dwordx2 v[9:10], v[4:5], off offset:-4096
; GFX900-NEXT:    global_load_dwordx2 v[11:12], v[4:5], off offset:-2048
; GFX900-NEXT:    v_add_co_u32_e32 v13, vcc, 0xffffc000, v4
; GFX900-NEXT:    global_load_dwordx2 v[7:8], v[7:8], off
; GFX900-NEXT:    v_addc_co_u32_e32 v14, vcc, -1, v5, vcc
; GFX900-NEXT:    global_load_dwordx2 v[17:18], v[13:14], off offset:-2048
; GFX900-NEXT:    v_add_co_u32_e32 v15, vcc, s0, v4
; GFX900-NEXT:    v_addc_co_u32_e32 v16, vcc, -1, v5, vcc
; GFX900-NEXT:    global_load_dwordx2 v[15:16], v[15:16], off offset:-2048
; GFX900-NEXT:    v_add_co_u32_e32 v19, vcc, s1, v4
; GFX900-NEXT:    global_load_dwordx2 v[13:14], v[13:14], off
; GFX900-NEXT:    v_addc_co_u32_e32 v20, vcc, -1, v5, vcc
; GFX900-NEXT:    global_load_dwordx2 v[23:24], v[19:20], off offset:-4096
; GFX900-NEXT:    global_load_dwordx2 v[25:26], v[19:20], off offset:-2048
; GFX900-NEXT:    global_load_dwordx2 v[27:28], v[19:20], off
; GFX900-NEXT:    v_add_co_u32_e32 v21, vcc, s3, v4
; GFX900-NEXT:    v_addc_co_u32_e32 v22, vcc, -1, v5, vcc
; GFX900-NEXT:    global_load_dwordx2 v[19:20], v[21:22], off offset:-2048
; GFX900-NEXT:    global_load_dwordx2 v[29:30], v[4:5], off
; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, 0x10000, v4
; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX900-NEXT:    s_addk_i32 s4, 0x2000
; GFX900-NEXT:    s_cmp_gt_u32 s4, 0x3fffff
; GFX900-NEXT:    s_waitcnt vmcnt(8)
; GFX900-NEXT:    v_add_co_u32_e32 v2, vcc, v7, v2
; GFX900-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(7)
; GFX900-NEXT:    v_add_co_u32_e32 v2, vcc, v17, v2
; GFX900-NEXT:    v_addc_co_u32_e32 v3, vcc, v18, v3, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(5)
; GFX900-NEXT:    v_add_co_u32_e32 v2, vcc, v13, v2
; GFX900-NEXT:    v_addc_co_u32_e32 v3, vcc, v14, v3, vcc
; GFX900-NEXT:    v_add_co_u32_e32 v2, vcc, v15, v2
; GFX900-NEXT:    v_addc_co_u32_e32 v3, vcc, v16, v3, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(4)
; GFX900-NEXT:    v_add_co_u32_e32 v2, vcc, v23, v2
; GFX900-NEXT:    v_addc_co_u32_e32 v3, vcc, v24, v3, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(3)
; GFX900-NEXT:    v_add_co_u32_e32 v2, vcc, v25, v2
; GFX900-NEXT:    v_addc_co_u32_e32 v3, vcc, v26, v3, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(2)
; GFX900-NEXT:    v_add_co_u32_e32 v2, vcc, v27, v2
; GFX900-NEXT:    v_addc_co_u32_e32 v3, vcc, v28, v3, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(1)
; GFX900-NEXT:    v_add_co_u32_e32 v2, vcc, v19, v2
; GFX900-NEXT:    v_addc_co_u32_e32 v3, vcc, v20, v3, vcc
; GFX900-NEXT:    v_add_co_u32_e32 v2, vcc, v9, v2
; GFX900-NEXT:    v_addc_co_u32_e32 v3, vcc, v10, v3, vcc
; GFX900-NEXT:    v_add_co_u32_e32 v2, vcc, v11, v2
; GFX900-NEXT:    v_addc_co_u32_e32 v3, vcc, v12, v3, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    v_add_co_u32_e32 v2, vcc, v29, v2
; GFX900-NEXT:    v_addc_co_u32_e32 v3, vcc, v30, v3, vcc
; GFX900-NEXT:    s_cbranch_scc0 .LBB1_2
; GFX900-NEXT:  ; %bb.3: ; %while.cond.loopexit
; GFX900-NEXT:    ; in Loop: Header=BB1_1 Depth=1
; GFX900-NEXT:    s_add_i32 s4, s2, -1
; GFX900-NEXT:    s_cmp_eq_u32 s2, 0
; GFX900-NEXT:    s_cbranch_scc1 .LBB1_5
; GFX900-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
; GFX900-NEXT:    s_mov_b32 s2, s4
; GFX900-NEXT:    s_branch .LBB1_1
; GFX900-NEXT:  .LBB1_5: ; %while.end
; GFX900-NEXT:    v_mov_b32_e32 v1, s35
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, s34, v6
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX900-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
; GFX900-NEXT:    s_endpgm
;
; GFX10-LABEL: clmem_read:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX10-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX10-NEXT:    s_mov_b32 s38, -1
; GFX10-NEXT:    s_mov_b32 s39, 0x31c16000
; GFX10-NEXT:    s_add_u32 s36, s36, s9
; GFX10-NEXT:    s_addc_u32 s37, s37, 0
; GFX10-NEXT:    s_getpc_b64 s[0:1]
; GFX10-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT:    v_mov_b32_e32 v31, v0
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX10-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX10-NEXT:    s_mov_b32 s32, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 17, v0
; GFX10-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0
; GFX10-NEXT:    s_movk_i32 s1, 0x7f
; GFX10-NEXT:    v_and_b32_e32 v6, 0xfe000000, v1
; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 3, v6
; GFX10-NEXT:    v_add_co_u32 v0, s0, v0, s34
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 0, s35, s0
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x5000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:  .LBB1_1: ; %for.cond.preheader
; GFX10-NEXT:    ; =>This Loop Header: Depth=1
; GFX10-NEXT:    ; Child Loop BB1_2 Depth 2
; GFX10-NEXT:    v_mov_b32_e32 v5, v1
; GFX10-NEXT:    v_mov_b32_e32 v4, v0
; GFX10-NEXT:    s_mov_b32 s2, 0
; GFX10-NEXT:  .LBB1_2: ; %for.body
; GFX10-NEXT:    ; Parent Loop BB1_1 Depth=1
; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v4, 0xffffb800
; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, -1, v5, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v4, 0xffffc800
; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, -1, v5, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v13, vcc_lo, v4, 0xffffd800
; GFX10-NEXT:    v_add_co_ci_u32_e32 v14, vcc_lo, -1, v5, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v17, vcc_lo, v4, 0xffffe800
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    global_load_dwordx2 v[11:12], v[7:8], off offset:-2048
; GFX10-NEXT:    global_load_dwordx2 v[15:16], v[9:10], off offset:-2048
; GFX10-NEXT:    global_load_dwordx2 v[19:20], v[13:14], off offset:-2048
; GFX10-NEXT:    v_add_co_ci_u32_e32 v18, vcc_lo, -1, v5, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v21, vcc_lo, 0xfffff000, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v22, vcc_lo, -1, v5, vcc_lo
; GFX10-NEXT:    s_clause 0x7
; GFX10-NEXT:    global_load_dwordx2 v[23:24], v[17:18], off offset:-2048
; GFX10-NEXT:    global_load_dwordx2 v[7:8], v[7:8], off
; GFX10-NEXT:    global_load_dwordx2 v[9:10], v[9:10], off
; GFX10-NEXT:    global_load_dwordx2 v[13:14], v[13:14], off
; GFX10-NEXT:    global_load_dwordx2 v[25:26], v[17:18], off
; GFX10-NEXT:    global_load_dwordx2 v[27:28], v[21:22], off
; GFX10-NEXT:    global_load_dwordx2 v[29:30], v[4:5], off offset:-2048
; GFX10-NEXT:    global_load_dwordx2 v[31:32], v[4:5], off
; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x10000, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
; GFX10-NEXT:    s_addk_i32 s2, 0x2000
; GFX10-NEXT:    s_cmp_gt_u32 s2, 0x3fffff
; GFX10-NEXT:    s_waitcnt vmcnt(10)
; GFX10-NEXT:    v_add_co_u32 v2, s0, v11, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v12, v3, s0
; GFX10-NEXT:    s_waitcnt vmcnt(6)
; GFX10-NEXT:    v_add_co_u32 v2, s0, v7, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v8, v3, s0
; GFX10-NEXT:    v_add_co_u32 v2, s0, v15, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v16, v3, s0
; GFX10-NEXT:    s_waitcnt vmcnt(5)
; GFX10-NEXT:    v_add_co_u32 v2, s0, v9, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v10, v3, s0
; GFX10-NEXT:    v_add_co_u32 v2, s0, v19, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v20, v3, s0
; GFX10-NEXT:    s_waitcnt vmcnt(4)
; GFX10-NEXT:    v_add_co_u32 v2, s0, v13, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v14, v3, s0
; GFX10-NEXT:    v_add_co_u32 v2, s0, v23, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v24, v3, s0
; GFX10-NEXT:    s_waitcnt vmcnt(3)
; GFX10-NEXT:    v_add_co_u32 v2, s0, v25, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v26, v3, s0
; GFX10-NEXT:    s_waitcnt vmcnt(2)
; GFX10-NEXT:    v_add_co_u32 v2, s0, v27, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v28, v3, s0
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_add_co_u32 v2, s0, v29, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v30, v3, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v31, v2
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v32, v3, vcc_lo
; GFX10-NEXT:    s_cbranch_scc0 .LBB1_2
; GFX10-NEXT:  ; %bb.3: ; %while.cond.loopexit
; GFX10-NEXT:    ; in Loop: Header=BB1_1 Depth=1
; GFX10-NEXT:    s_add_i32 s0, s1, -1
; GFX10-NEXT:    s_cmp_eq_u32 s1, 0
; GFX10-NEXT:    s_cbranch_scc1 .LBB1_5
; GFX10-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_branch .LBB1_1
; GFX10-NEXT:  .LBB1_5: ; %while.end
; GFX10-NEXT:    v_add_co_u32 v0, s0, s34, v6
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, s35, 0, s0
; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
; GFX10-NEXT:    s_endpgm
;
; GFX90A-LABEL: clmem_read:
; GFX90A:       ; %bb.0: ; %entry
; GFX90A-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX90A-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX90A-NEXT:    s_mov_b32 s38, -1
; GFX90A-NEXT:    s_mov_b32 s39, 0xe00000
; GFX90A-NEXT:    s_add_u32 s36, s36, s9
; GFX90A-NEXT:    s_addc_u32 s37, s37, 0
; GFX90A-NEXT:    s_getpc_b64 s[0:1]
; GFX90A-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX90A-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX90A-NEXT:    s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX90A-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX90A-NEXT:    v_mov_b32_e32 v31, v0
; GFX90A-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NEXT:    s_mov_b32 s32, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX90A-NEXT:    v_and_b32_e32 v1, 0xff, v0
; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 17, v0
; GFX90A-NEXT:    v_and_b32_e32 v0, 0xfe000000, v0
; GFX90A-NEXT:    v_lshl_or_b32 v1, v1, 3, v0
; GFX90A-NEXT:    v_mov_b32_e32 v2, s35
; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, s34, v1
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v2, vcc
; GFX90A-NEXT:    s_movk_i32 s0, 0x5000
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v1
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX90A-NEXT:    s_movk_i32 s2, 0x7f
; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], 0, 0
; GFX90A-NEXT:    s_movk_i32 s0, 0xd000
; GFX90A-NEXT:    s_movk_i32 s1, 0xe000
; GFX90A-NEXT:    s_movk_i32 s3, 0xf000
; GFX90A-NEXT:  .LBB1_1: ; %for.cond.preheader
; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
; GFX90A-NEXT:    ; Child Loop BB1_2 Depth 2
; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT:    s_mov_b32 s4, 0
; GFX90A-NEXT:  .LBB1_2: ; %for.body
; GFX90A-NEXT:    ; Parent Loop BB1_1 Depth=1
; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT:    v_add_co_u32_e32 v12, vcc, 0xffffb000, v6
; GFX90A-NEXT:    v_addc_co_u32_e32 v13, vcc, -1, v7, vcc
; GFX90A-NEXT:    global_load_dwordx2 v[8:9], v[6:7], off offset:-4096
; GFX90A-NEXT:    global_load_dwordx2 v[10:11], v[6:7], off offset:-2048
; GFX90A-NEXT:    v_add_co_u32_e32 v14, vcc, 0xffffc000, v6
; GFX90A-NEXT:    global_load_dwordx2 v[12:13], v[12:13], off
; GFX90A-NEXT:    v_addc_co_u32_e32 v15, vcc, -1, v7, vcc
; GFX90A-NEXT:    global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
; GFX90A-NEXT:    v_add_co_u32_e32 v16, vcc, s0, v6
; GFX90A-NEXT:    v_addc_co_u32_e32 v17, vcc, -1, v7, vcc
; GFX90A-NEXT:    global_load_dwordx2 v[16:17], v[16:17], off offset:-2048
; GFX90A-NEXT:    v_add_co_u32_e32 v20, vcc, s1, v6
; GFX90A-NEXT:    global_load_dwordx2 v[14:15], v[14:15], off
; GFX90A-NEXT:    v_addc_co_u32_e32 v21, vcc, -1, v7, vcc
; GFX90A-NEXT:    global_load_dwordx2 v[24:25], v[20:21], off offset:-4096
; GFX90A-NEXT:    global_load_dwordx2 v[26:27], v[20:21], off offset:-2048
; GFX90A-NEXT:    global_load_dwordx2 v[28:29], v[20:21], off
; GFX90A-NEXT:    v_add_co_u32_e32 v22, vcc, s3, v6
; GFX90A-NEXT:    v_addc_co_u32_e32 v23, vcc, -1, v7, vcc
; GFX90A-NEXT:    global_load_dwordx2 v[20:21], v[22:23], off offset:-2048
; GFX90A-NEXT:    global_load_dwordx2 v[30:31], v[6:7], off
; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, 0x10000, v6
; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
; GFX90A-NEXT:    s_addk_i32 s4, 0x2000
; GFX90A-NEXT:    s_cmp_gt_u32 s4, 0x3fffff
; GFX90A-NEXT:    s_waitcnt vmcnt(8)
; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v12, v4
; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v13, v5, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(7)
; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v18, v1
; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v19, v4, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(5)
; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v14, v1
; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v15, v4, vcc
; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v16, v1
; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v17, v4, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(4)
; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v24, v1
; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v25, v4, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(3)
; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v26, v1
; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v27, v4, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(2)
; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v28, v1
; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v29, v4, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(1)
; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v20, v1
; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v21, v4, vcc
; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v8, v1
; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v4, vcc
; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v10, v1
; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v11, v4, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v30, v1
; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v31, v5, vcc
; GFX90A-NEXT:    s_cbranch_scc0 .LBB1_2
; GFX90A-NEXT:  ; %bb.3: ; %while.cond.loopexit
; GFX90A-NEXT:    ; in Loop: Header=BB1_1 Depth=1
; GFX90A-NEXT:    s_add_i32 s4, s2, -1
; GFX90A-NEXT:    s_cmp_eq_u32 s2, 0
; GFX90A-NEXT:    s_cbranch_scc1 .LBB1_5
; GFX90A-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
; GFX90A-NEXT:    s_mov_b32 s2, s4
; GFX90A-NEXT:    s_branch .LBB1_1
; GFX90A-NEXT:  .LBB1_5: ; %while.end
; GFX90A-NEXT:    v_mov_b32_e32 v1, s35
; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, s34, v0
; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT:    global_store_dwordx2 v[0:1], v[4:5], off
; GFX90A-NEXT:    s_endpgm
;
; GFX11-LABEL: clmem_read:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_getpc_b64 s[0:1]
; GFX11-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT:    s_load_b64 s[34:35], s[2:3], 0x24
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 17, v0
; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0xff, v0
; GFX11-NEXT:    s_movk_i32 s1, 0x7f
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_and_b32_e32 v6, 0xfe000000, v1
; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 3, v6
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v0, s0, v0, s34
; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, s35, s0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x5000, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:  .LBB1_1: ; %for.cond.preheader
; GFX11-NEXT:    ; =>This Loop Header: Depth=1
; GFX11-NEXT:    ; Child Loop BB1_2 Depth 2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX11-NEXT:    s_mov_b32 s2, 0
; GFX11-NEXT:  .LBB1_2: ; %for.body
; GFX11-NEXT:    ; Parent Loop BB1_1 Depth=1
; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v7, vcc_lo, v4, 0xffffc000
; GFX11-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, -1, v5, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v9, vcc_lo, 0xffffc000, v4
; GFX11-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, -1, v5, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v11, vcc_lo, 0xffffd000, v4
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b64 v[13:14], v[7:8], off offset:-4096
; GFX11-NEXT:    global_load_b64 v[9:10], v[9:10], off offset:-2048
; GFX11-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, -1, v5, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v15, vcc_lo, v4, 0xffffe000
; GFX11-NEXT:    v_add_co_ci_u32_e32 v16, vcc_lo, -1, v5, vcc_lo
; GFX11-NEXT:    global_load_b64 v[11:12], v[11:12], off offset:-2048
; GFX11-NEXT:    v_add_co_u32 v17, vcc_lo, 0xffffe000, v4
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b64 v[19:20], v[15:16], off offset:-4096
; GFX11-NEXT:    global_load_b64 v[7:8], v[7:8], off
; GFX11-NEXT:    v_add_co_ci_u32_e32 v18, vcc_lo, -1, v5, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v21, vcc_lo, 0xfffff000, v4
; GFX11-NEXT:    v_add_co_ci_u32_e32 v22, vcc_lo, -1, v5, vcc_lo
; GFX11-NEXT:    s_clause 0x5
; GFX11-NEXT:    global_load_b64 v[17:18], v[17:18], off offset:-2048
; GFX11-NEXT:    global_load_b64 v[15:16], v[15:16], off
; GFX11-NEXT:    global_load_b64 v[21:22], v[21:22], off offset:-2048
; GFX11-NEXT:    global_load_b64 v[23:24], v[4:5], off offset:-4096
; GFX11-NEXT:    global_load_b64 v[25:26], v[4:5], off offset:-2048
; GFX11-NEXT:    global_load_b64 v[27:28], v[4:5], off
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x10000, v4
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
; GFX11-NEXT:    s_addk_i32 s2, 0x2000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT:    s_cmp_gt_u32 s2, 0x3fffff
; GFX11-NEXT:    s_waitcnt vmcnt(10)
; GFX11-NEXT:    v_add_co_u32 v2, s0, v13, v2
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, s0, v14, v3, s0
; GFX11-NEXT:    s_waitcnt vmcnt(9)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v2, s0, v9, v2
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, s0, v10, v3, s0
; GFX11-NEXT:    s_waitcnt vmcnt(6)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v2, s0, v7, v2
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, s0, v8, v3, s0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v2, s0, v11, v2
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, s0, v12, v3, s0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v2, s0, v19, v2
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, s0, v20, v3, s0
; GFX11-NEXT:    s_waitcnt vmcnt(5)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v2, s0, v17, v2
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, s0, v18, v3, s0
; GFX11-NEXT:    s_waitcnt vmcnt(4)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v2, s0, v15, v2
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, s0, v16, v3, s0
; GFX11-NEXT:    s_waitcnt vmcnt(3)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v2, s0, v21, v2
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, s0, v22, v3, s0
; GFX11-NEXT:    s_waitcnt vmcnt(2)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v2, s0, v23, v2
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, s0, v24, v3, s0
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v2, s0, v25, v2
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, s0, v26, v3, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v27, v2
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v28, v3, vcc_lo
; GFX11-NEXT:    s_cbranch_scc0 .LBB1_2
; GFX11-NEXT:  ; %bb.3: ; %while.cond.loopexit
; GFX11-NEXT:    ; in Loop: Header=BB1_1 Depth=1
; GFX11-NEXT:    s_add_i32 s0, s1, -1
; GFX11-NEXT:    s_cmp_eq_u32 s1, 0
; GFX11-NEXT:    s_cbranch_scc1 .LBB1_5
; GFX11-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
; GFX11-NEXT:    s_mov_b32 s1, s0
; GFX11-NEXT:    s_branch .LBB1_1
; GFX11-NEXT:  .LBB1_5: ; %while.end
; GFX11-NEXT:    v_add_co_u32 v0, s0, s34, v6
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s35, 0, s0
; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT:    s_endpgm
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 17
  %idx.ext11 = and i64 %a0, 4261412864
  %add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11
  %add.ptr6 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 %conv
  br label %for.cond.preheader

while.cond.loopexit:                              ; preds = %for.body
  %dec = add nsw i32 %dec31, -1
  %tobool = icmp eq i32 %dec31, 0
  br i1 %tobool, label %while.end, label %for.cond.preheader

for.cond.preheader:                               ; preds = %entry, %while.cond.loopexit
  %dec31 = phi i32 [ 127, %entry ], [ %dec, %while.cond.loopexit ]
  %sum.030 = phi i64 [ 0, %entry ], [ %add.10, %while.cond.loopexit ]
  br label %for.body

for.body:                                         ; preds = %for.body, %for.cond.preheader
  %block.029 = phi i32 [ 0, %for.cond.preheader ], [ %add9.31, %for.body ]
  %sum.128 = phi i64 [ %sum.030, %for.cond.preheader ], [ %add.10, %for.body ]
  %conv3 = zext i32 %block.029 to i64
  %add.ptr8 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3
  %load1 = load i64, ptr addrspace(1) %add.ptr8, align 8
  %add = add i64 %load1, %sum.128

  %add9 = or disjoint i32 %block.029, 256
  %conv3.1 = zext i32 %add9 to i64
  %add.ptr8.1 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.1
  %load2 = load i64, ptr addrspace(1) %add.ptr8.1, align 8
  %add.1 = add i64 %load2, %add

  %add9.1 = or disjoint i32 %block.029, 512
  %conv3.2 = zext i32 %add9.1 to i64
  %add.ptr8.2 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.2
  %l3 = load i64, ptr addrspace(1) %add.ptr8.2, align 8
  %add.2 = add i64 %l3, %add.1

  %add9.2 = or disjoint i32 %block.029, 768
  %conv3.3 = zext i32 %add9.2 to i64
  %add.ptr8.3 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.3
  %l4 = load i64, ptr addrspace(1) %add.ptr8.3, align 8
  %add.3 = add i64 %l4, %add.2

  %add9.3 = or disjoint i32 %block.029, 1024
  %conv3.4 = zext i32 %add9.3 to i64
  %add.ptr8.4 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.4
  %l5 = load i64, ptr addrspace(1) %add.ptr8.4, align 8
  %add.4 = add i64 %l5, %add.3

  %add9.4 = or disjoint i32 %block.029, 1280
  %conv3.5 = zext i32 %add9.4 to i64
  %add.ptr8.5 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.5
  %l6 = load i64, ptr addrspace(1) %add.ptr8.5, align 8
  %add.5 = add i64 %l6, %add.4

  %add9.5 = or disjoint i32 %block.029, 1536
  %conv3.6 = zext i32 %add9.5 to i64
  %add.ptr8.6 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.6
  %load7 = load i64, ptr addrspace(1) %add.ptr8.6, align 8
  %add.6 = add i64 %load7, %add.5

  %add9.6 = or disjoint i32 %block.029, 1792
  %conv3.7 = zext i32 %add9.6 to i64
  %add.ptr8.7 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.7
  %load8 = load i64, ptr addrspace(1) %add.ptr8.7, align 8
  %add.7 = add i64 %load8, %add.6

  %add9.7 = or disjoint i32 %block.029, 2048
  %conv3.8 = zext i32 %add9.7 to i64
  %add.ptr8.8 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.8
  %load9 = load i64, ptr addrspace(1) %add.ptr8.8, align 8
  %add.8 = add i64 %load9, %add.7

  %add9.8 = or disjoint i32 %block.029, 2304
  %conv3.9 = zext i32 %add9.8 to i64
  %add.ptr8.9 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.9
  %load10 = load i64, ptr addrspace(1) %add.ptr8.9, align 8
  %add.9 = add i64 %load10, %add.8

  %add9.9 = or disjoint i32 %block.029, 2560
  %conv3.10 = zext i32 %add9.9 to i64
  %add.ptr8.10 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.10
  %load11 = load i64, ptr addrspace(1) %add.ptr8.10, align 8
  %add.10 = add i64 %load11, %add.9

  %add9.31 = add nuw nsw i32 %block.029, 8192
  %cmp.31 = icmp ult i32 %add9.31, 4194304
  br i1 %cmp.31, label %for.body, label %while.cond.loopexit

while.end:                                        ; preds = %while.cond.loopexit
  store i64 %add.10, ptr addrspace(1) %add.ptr12, align 8
  ret void
}

; Using a 32-bit address offset.
;
; The byte offset added to %buffer is (id << 7) & 0xFFFF8000, so it always
; fits in 32 bits. Starting from that base (%add.ptr12) plus (id & 255) i32
; elements, the kernel loads ten i32 values spaced 256 elements (1024 bytes)
; apart, sums them, and stores the sum back to the base pointer.
; The per-target assertions below are autogenerated (see the NOTE at the top
; of the file); regenerate them rather than editing them by hand.
define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
; GFX8-LABEL: Address32:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_mov_b32 s38, -1
; GFX8-NEXT:    s_mov_b32 s39, 0xe80000
; GFX8-NEXT:    s_add_u32 s36, s36, s9
; GFX8-NEXT:    s_addc_u32 s37, s37, 0
; GFX8-NEXT:    s_getpc_b64 s[0:1]
; GFX8-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX8-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX8-NEXT:    s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX8-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT:    v_mov_b32_e32 v31, v0
; GFX8-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    s_mov_b32 s32, 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
; GFX8-NEXT:    v_mov_b32_e32 v2, s35
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s34, v1
; GFX8-NEXT:    v_mov_b32_e32 v3, 2
; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v1, v0
; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x400
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v3
; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v4, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x800
; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s0, v3
; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0xc00
; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v3
; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x1000
; GFX8-NEXT:    v_add_u32_e32 v11, vcc, s0, v3
; GFX8-NEXT:    v_addc_u32_e32 v12, vcc, 0, v4, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x1400
; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s0, v3
; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, 0, v4, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x1800
; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s0, v3
; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, 0, v4, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x1c00
; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s0, v3
; GFX8-NEXT:    v_addc_u32_e32 v18, vcc, 0, v4, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x2000
; GFX8-NEXT:    flat_load_dword v0, v[3:4]
; GFX8-NEXT:    flat_load_dword v19, v[5:6]
; GFX8-NEXT:    flat_load_dword v7, v[7:8]
; GFX8-NEXT:    flat_load_dword v8, v[9:10]
; GFX8-NEXT:    flat_load_dword v9, v[11:12]
; GFX8-NEXT:    flat_load_dword v10, v[13:14]
; GFX8-NEXT:    flat_load_dword v11, v[15:16]
; GFX8-NEXT:    flat_load_dword v12, v[17:18]
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v3
; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v4, vcc
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x2400, v3
; GFX8-NEXT:    flat_load_dword v5, v[5:6]
; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
; GFX8-NEXT:    flat_load_dword v3, v[3:4]
; GFX8-NEXT:    s_waitcnt vmcnt(8)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v19, v0
; GFX8-NEXT:    s_waitcnt vmcnt(7)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v7, v0
; GFX8-NEXT:    s_waitcnt vmcnt(6)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v8, v0
; GFX8-NEXT:    s_waitcnt vmcnt(5)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v9, v0
; GFX8-NEXT:    s_waitcnt vmcnt(4)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v10, v0
; GFX8-NEXT:    s_waitcnt vmcnt(3)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v11, v0
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v12, v0
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v0
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
; GFX8-NEXT:    flat_store_dword v[1:2], v0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: Address32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s9
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_getpc_b64 s[0:1]
; GFX9-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX9-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    v_mov_b32_e32 v31, v0
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff8000, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, s35
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v4
; GFX9-NEXT:    v_mov_b32_e32 v3, 2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    s_movk_i32 s0, 0x1000
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dword v5, v[0:1], off
; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:1024
; GFX9-NEXT:    global_load_dword v7, v[0:1], off offset:2048
; GFX9-NEXT:    global_load_dword v8, v[0:1], off offset:3072
; GFX9-NEXT:    global_load_dword v9, v[2:3], off
; GFX9-NEXT:    global_load_dword v10, v[2:3], off offset:1024
; GFX9-NEXT:    global_load_dword v11, v[2:3], off offset:2048
; GFX9-NEXT:    global_load_dword v12, v[2:3], off offset:3072
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dword v2, v[0:1], off
; GFX9-NEXT:    global_load_dword v3, v[0:1], off offset:1024
; GFX9-NEXT:    s_waitcnt vmcnt(8)
; GFX9-NEXT:    v_add_u32_e32 v0, v6, v5
; GFX9-NEXT:    s_waitcnt vmcnt(6)
; GFX9-NEXT:    v_add3_u32 v0, v7, v0, v8
; GFX9-NEXT:    s_waitcnt vmcnt(4)
; GFX9-NEXT:    v_add3_u32 v0, v9, v0, v10
; GFX9-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NEXT:    v_add3_u32 v0, v11, v0, v12
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add3_u32 v0, v2, v0, v3
; GFX9-NEXT:    global_store_dword v4, v0, s[34:35]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: Address32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX10-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX10-NEXT:    s_mov_b32 s38, -1
; GFX10-NEXT:    s_mov_b32 s39, 0x31c16000
; GFX10-NEXT:    s_add_u32 s36, s36, s9
; GFX10-NEXT:    s_addc_u32 s37, s37, 0
; GFX10-NEXT:    s_getpc_b64 s[0:1]
; GFX10-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT:    v_mov_b32_e32 v31, v0
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX10-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX10-NEXT:    s_mov_b32 s32, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff8000, v1
; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v8
; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 0x1000
; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, 0x1000, v0
; GFX10-NEXT:    s_clause 0x4
; GFX10-NEXT:    global_load_dword v9, v[0:1], off
; GFX10-NEXT:    global_load_dword v10, v[0:1], off offset:1024
; GFX10-NEXT:    global_load_dword v11, v[2:3], off offset:1024
; GFX10-NEXT:    global_load_dword v12, v[4:5], off offset:-2048
; GFX10-NEXT:    global_load_dword v13, v[4:5], off
; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x1800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 0x2000
; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dword v14, v[6:7], off offset:1024
; GFX10-NEXT:    global_load_dword v15, v[2:3], off offset:1024
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    global_load_dword v2, v[4:5], off offset:-2048
; GFX10-NEXT:    global_load_dword v3, v[4:5], off
; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:1024
; GFX10-NEXT:    s_waitcnt vmcnt(8)
; GFX10-NEXT:    v_add_nc_u32_e32 v0, v10, v9
; GFX10-NEXT:    s_waitcnt vmcnt(6)
; GFX10-NEXT:    v_add3_u32 v0, v12, v0, v11
; GFX10-NEXT:    s_waitcnt vmcnt(4)
; GFX10-NEXT:    v_add3_u32 v0, v13, v0, v14
; GFX10-NEXT:    s_waitcnt vmcnt(2)
; GFX10-NEXT:    v_add3_u32 v0, v2, v0, v15
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add3_u32 v0, v3, v0, v6
; GFX10-NEXT:    global_store_dword v8, v0, s[34:35]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: Address32:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_getpc_b64 s[0:1]
; GFX11-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT:    s_load_b64 s[34:35], s[2:3], 0x24
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff8000, v1
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v6
; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, s35, 0, s0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b32 v7, v[0:1], off
; GFX11-NEXT:    global_load_b32 v8, v[0:1], off offset:1024
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0x1000, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 0x2000
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    s_clause 0x5
; GFX11-NEXT:    global_load_b32 v9, v[0:1], off offset:2048
; GFX11-NEXT:    global_load_b32 v10, v[0:1], off offset:3072
; GFX11-NEXT:    global_load_b32 v11, v[4:5], off offset:-4096
; GFX11-NEXT:    global_load_b32 v12, v[2:3], off offset:1024
; GFX11-NEXT:    global_load_b32 v13, v[2:3], off offset:2048
; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:3072
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b32 v3, v[4:5], off
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:1024
; GFX11-NEXT:    s_waitcnt vmcnt(8)
; GFX11-NEXT:    v_add_nc_u32_e32 v1, v8, v7
; GFX11-NEXT:    s_waitcnt vmcnt(6)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add3_u32 v1, v9, v1, v10
; GFX11-NEXT:    s_waitcnt vmcnt(4)
; GFX11-NEXT:    v_add3_u32 v1, v11, v1, v12
; GFX11-NEXT:    s_waitcnt vmcnt(2)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add3_u32 v1, v13, v1, v2
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_add3_u32 v0, v3, v1, v0
; GFX11-NEXT:    global_store_b32 v6, v0, s[34:35]
; GFX11-NEXT:    s_endpgm
entry:
   ; id = result of the external call, invoked with argument 0.
   %call = tail call i64 @_Z13get_global_idj(i32 0)
   ; Low 8 bits of id: the i32 element index within the selected region.
   %conv = and i64 %call, 255
   ; Byte offset of the region: (id << 7) & 0xFFFF8000 (4294934528 == 0xFFFF8000).
   %id = shl i64 %call, 7
   %idx.ext11 = and i64 %id, 4294934528
   ; Region base pointer; also the destination of the final store.
   %add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11

   ; First load address: base + (id & 255) * 4 bytes.
   %add.ptr6 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr12, i64 %conv
   %load1 = load i32, ptr addrspace(1) %add.ptr6, align 4

   ; The remaining nine loads use constant element offsets from %add.ptr6 in
   ; steps of 256 i32 elements (0x400 bytes); each value is added to the
   ; running sum.
   ; +256 elements = byte offset 0x400.
   %add.ptr8.1 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 256
   %load2 = load i32, ptr addrspace(1) %add.ptr8.1, align 4
   %add.1 = add i32 %load2, %load1

   ; +512 elements = byte offset 0x800.
   %add.ptr8.2 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 512
   %load3 = load i32, ptr addrspace(1) %add.ptr8.2, align 4
   %add.2 = add i32 %load3, %add.1

   ; +768 elements = byte offset 0xc00.
   %add.ptr8.3 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 768
   %load4 = load i32, ptr addrspace(1) %add.ptr8.3, align 4
   %add.3 = add i32 %load4, %add.2

   ; +1024 elements = byte offset 0x1000.
   %add.ptr8.4 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 1024
   %load5 = load i32, ptr addrspace(1) %add.ptr8.4, align 4
   %add.4 = add i32 %load5, %add.3

   ; +1280 elements = byte offset 0x1400.
   %add.ptr8.5 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 1280
   %load6 = load i32, ptr addrspace(1) %add.ptr8.5, align 4
   %add.5 = add i32 %load6, %add.4

   ; +1536 elements = byte offset 0x1800.
   %add.ptr8.6 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 1536
   %load7 = load i32, ptr addrspace(1) %add.ptr8.6, align 4
   %add.6 = add i32 %load7, %add.5

   ; +1792 elements = byte offset 0x1c00.
   %add.ptr8.7 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 1792
   %load8 = load i32, ptr addrspace(1) %add.ptr8.7, align 4
   %add.7 = add i32 %load8, %add.6

   ; +2048 elements = byte offset 0x2000.
   %add.ptr8.8 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 2048
   %load9 = load i32, ptr addrspace(1) %add.ptr8.8, align 4
   %add.8 = add i32 %load9, %add.7

   ; +2304 elements = byte offset 0x2400.
   %add.ptr8.9 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 2304
   %load10 = load i32, ptr addrspace(1) %add.ptr8.9, align 4
   %add.9 = add i32 %load10, %add.8

   ; Write the sum of all ten loads to the region base (not to %add.ptr6).
   store i32 %add.9, ptr addrspace(1) %add.ptr12, align 4
   ret void
}

; Offset64: four i64 loads from a single computed address (%addr1) whose
; constant byte offsets are 0, 2^32-4096, 2^32-2048 and 2^32. The loaded
; values are summed and the result is stored to %add.ptr12.
; The per-target check lines below are autogenerated and record how each
; subtarget splits those constants between address adds and the load's
; offset field (where the load instruction has one).
define amdgpu_kernel void @Offset64(ptr addrspace(1)  %buffer) {
; GFX8-LABEL: Offset64:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_mov_b32 s38, -1
; GFX8-NEXT:    s_mov_b32 s39, 0xe80000
; GFX8-NEXT:    s_add_u32 s36, s36, s9
; GFX8-NEXT:    s_addc_u32 s37, s37, 0
; GFX8-NEXT:    s_getpc_b64 s[0:1]
; GFX8-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX8-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX8-NEXT:    s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX8-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT:    v_mov_b32_e32 v31, v0
; GFX8-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    s_mov_b32 s32, 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
; GFX8-NEXT:    v_mov_b32_e32 v2, s35
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s34, v1
; GFX8-NEXT:    v_mov_b32_e32 v3, 3
; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v1, v0
; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0xf000
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v3
; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v4, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0xf800
; GFX8-NEXT:    flat_load_dwordx2 v[7:8], v[3:4]
; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[5:6]
; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v3
; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[9:10], v[9:10]
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0, v3
; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 1, v4, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[3:4], v[3:4]
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v7
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v6, v8, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v9, v0
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v10, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0
; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
; GFX8-NEXT:    flat_store_dwordx2 v[1:2], v[3:4]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: Offset64:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s9
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_getpc_b64 s[0:1]
; GFX9-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX9-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    v_mov_b32_e32 v31, v0
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff8000, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, s35
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v12
; GFX9-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, 0, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 1, v1, vcc
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[4:5], off offset:-4096
; GFX9-NEXT:    s_movk_i32 s0, 0xf000
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[4:5], off
; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[0:1], off offset:2048
; GFX9-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v3, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
; GFX9-NEXT:    global_store_dwordx2 v12, v[0:1], s[34:35]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: Offset64:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX10-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX10-NEXT:    s_mov_b32 s38, -1
; GFX10-NEXT:    s_mov_b32 s39, 0x31c16000
; GFX10-NEXT:    s_add_u32 s36, s36, s9
; GFX10-NEXT:    s_addc_u32 s37, s37, 0
; GFX10-NEXT:    s_getpc_b64 s[0:1]
; GFX10-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT:    v_mov_b32_e32 v31, v0
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX10-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX10-NEXT:    s_mov_b32 s32, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 3
; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff8000, v1
; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v12
; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0xfffff800
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off offset:-2048
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 1, v1, vcc_lo
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(2)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v9, v1, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v10, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v11, v1, vcc_lo
; GFX10-NEXT:    global_store_dwordx2 v12, v[0:1], s[34:35]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: Offset64:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_getpc_b64 s[0:1]
; GFX11-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT:    s_load_b64 s[34:35], s[2:3], 0x24
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff8000, v1
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v8
; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, s35, 0, s0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 1, v1, vcc_lo
; GFX11-NEXT:    global_load_b64 v[2:3], v[0:1], off
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    global_load_b64 v[6:7], v[4:5], off offset:-4096
; GFX11-NEXT:    global_load_b64 v[4:5], v[4:5], off
; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:2048
; GFX11-NEXT:    s_waitcnt vmcnt(2)
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v6, v2
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v7, v3, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v4, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo
; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[34:35]
; GFX11-NEXT:    s_endpgm
entry:
  ; %conv keeps the low 8 bits of the call result and is used as an i64 element
  ; index. %idx.ext11 is (call << 7) masked with 0xffff8000 and is used as a
  ; byte offset into %buffer.
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11

  ; Base address shared by all four loads (byte offset 0).
  %addr1 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 %conv
  %load1 = load i64, ptr addrspace(1) %addr1, align 8

  ; Element index 2^29 - 512, i.e. byte offset 2^32 - 4096.
  %addr2 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 536870400
  %load2 = load i64, ptr addrspace(1) %addr2, align 8

  %add1 = add i64 %load2, %load1

  ; Element index 2^29 - 256, i.e. byte offset 2^32 - 2048.
  %addr3 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 536870656
  %load3 = load i64, ptr addrspace(1) %addr3, align 8

  %add2 = add i64 %load3, %add1

  ; Element index 2^29, i.e. byte offset exactly 2^32.
  %addr4 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 536870912
  %load4 = load i64, ptr addrspace(1) %addr4, align 8
  %add4 = add i64 %load4, %add2

  ; The sum goes to the masked base pointer, not to %addr1.
  store i64 %add4, ptr addrspace(1) %add.ptr12, align 8
  ret void
}

; p32Offset64: four i32 loads from a single computed address (%addr1) whose
; constant byte offsets are 0, 2^31-2048 (0x7ffff800), 2^31-1024 (0x7ffffc00)
; and 2^31 (0x80000000). The loaded values are summed and the result is
; stored to %add.ptr12.
; TODO: Support load4 as anchor instruction.
; NOTE(review): %load4 is the load at byte offset 2^31; presumably "anchor"
; means the load whose address the other offsets are expressed against -
; confirm against the pass that performs this promotion.
define amdgpu_kernel void @p32Offset64(ptr addrspace(1)  %buffer) {
; GFX8-LABEL: p32Offset64:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_mov_b32 s38, -1
; GFX8-NEXT:    s_mov_b32 s39, 0xe80000
; GFX8-NEXT:    s_add_u32 s36, s36, s9
; GFX8-NEXT:    s_addc_u32 s37, s37, 0
; GFX8-NEXT:    s_getpc_b64 s[0:1]
; GFX8-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX8-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX8-NEXT:    s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX8-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT:    v_mov_b32_e32 v31, v0
; GFX8-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    s_mov_b32 s32, 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
; GFX8-NEXT:    v_mov_b32_e32 v2, s35
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s34, v1
; GFX8-NEXT:    v_mov_b32_e32 v3, 2
; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v1, v0
; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
; GFX8-NEXT:    s_mov_b32 s0, 0x7ffff800
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v3
; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v4, vcc
; GFX8-NEXT:    s_mov_b32 s0, 0x7ffffc00
; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s0, v3
; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc
; GFX8-NEXT:    flat_load_dword v0, v[3:4]
; GFX8-NEXT:    flat_load_dword v5, v[5:6]
; GFX8-NEXT:    flat_load_dword v6, v[7:8]
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x80000000, v3
; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
; GFX8-NEXT:    flat_load_dword v3, v[3:4]
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v0
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v6, v0
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
; GFX8-NEXT:    flat_store_dword v[1:2], v0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: p32Offset64:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s9
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_getpc_b64 s[0:1]
; GFX9-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX9-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    v_mov_b32_e32 v31, v0
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff8000, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, s35
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v6
; GFX9-NEXT:    v_mov_b32_e32 v3, 2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    s_mov_b32 s0, 0x7ffff000
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, 0x80000000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dword v7, v[0:1], off
; GFX9-NEXT:    global_load_dword v8, v[2:3], off offset:2048
; GFX9-NEXT:    global_load_dword v9, v[2:3], off offset:3072
; GFX9-NEXT:    global_load_dword v10, v[4:5], off
; GFX9-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NEXT:    v_add_u32_e32 v0, v8, v7
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add3_u32 v0, v9, v0, v10
; GFX9-NEXT:    global_store_dword v6, v0, s[34:35]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: p32Offset64:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX10-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX10-NEXT:    s_mov_b32 s38, -1
; GFX10-NEXT:    s_mov_b32 s39, 0x31c16000
; GFX10-NEXT:    s_add_u32 s36, s36, s9
; GFX10-NEXT:    s_addc_u32 s37, s37, 0
; GFX10-NEXT:    s_getpc_b64 s[0:1]
; GFX10-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT:    v_mov_b32_e32 v31, v0
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX10-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX10-NEXT:    s_mov_b32 s32, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff8000, v1
; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v4
; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0x80000000
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    global_load_dword v5, v[0:1], off
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ffff800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    global_load_dword v6, v[2:3], off offset:-2048
; GFX10-NEXT:    global_load_dword v7, v[2:3], off
; GFX10-NEXT:    global_load_dword v8, v[0:1], off offset:1024
; GFX10-NEXT:    s_waitcnt vmcnt(2)
; GFX10-NEXT:    v_add_nc_u32_e32 v0, v6, v5
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add3_u32 v0, v8, v0, v7
; GFX10-NEXT:    global_store_dword v4, v0, s[34:35]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: p32Offset64:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_getpc_b64 s[0:1]
; GFX11-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT:    s_load_b64 s[34:35], s[2:3], 0x24
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff8000, v1
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v6
; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, s35, 0, s0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0x7ffff000, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x80000000, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:2048
; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:3072
; GFX11-NEXT:    global_load_b32 v3, v[4:5], off
; GFX11-NEXT:    s_waitcnt vmcnt(2)
; GFX11-NEXT:    v_add_nc_u32_e32 v0, v1, v0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add3_u32 v0, v2, v0, v3
; GFX11-NEXT:    global_store_b32 v6, v0, s[34:35]
; GFX11-NEXT:    s_endpgm
entry:
  ; %conv keeps the low 8 bits of the call result and is used as an i32 element
  ; index. %idx.ext11 is (call << 7) masked with 0xffff8000 and is used as a
  ; byte offset into %buffer.
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11

  ; Base address shared by all four loads (byte offset 0).
  %addr1 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr12, i64 %conv
  %load1 = load i32, ptr addrspace(1) %addr1, align 8

  ; Element index 2^29 - 512, i.e. byte offset 2^31 - 2048 (0x7ffff800).
  %addr2 = getelementptr inbounds i32, ptr addrspace(1) %addr1, i64 536870400
  %load2 = load i32, ptr addrspace(1) %addr2, align 8

  %add1 = add i32 %load2, %load1

  ; Element index 2^29 - 256, i.e. byte offset 2^31 - 1024 (0x7ffffc00).
  %addr3 = getelementptr inbounds i32, ptr addrspace(1) %addr1, i64 536870656
  %load3 = load i32, ptr addrspace(1) %addr3, align 8

  %add2 = add i32 %load3, %add1

  ; Element index 2^29, i.e. byte offset exactly 2^31 (0x80000000).
  %addr4 = getelementptr inbounds i32, ptr addrspace(1) %addr1, i64 536870912
  %load4 = load i32, ptr addrspace(1) %addr4, align 8
  %add4 = add i32 %load4, %add2

  ; The sum goes to the masked base pointer, not to %addr1.
  store i32 %add4, ptr addrspace(1) %add.ptr12, align 8
  ret void
}

; DiffBase: six i64 loads split across two different base pointers. Three
; loads use %buffer1 + %idx.ext11 at byte offsets 0x1000, 0x1800 and 0x2000;
; three use %buffer2 + %idx.ext11 at byte offsets 0x2800, 0x3000 and 0x3800.
; Each group is summed separately, the two partial sums are added, and the
; result is stored to the %buffer1-derived pointer (%add.ptr12).
; The parameter list of the define is split around the autogenerated check
; lines: %buffer2 is declared on the line just before "entry:".
define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1,
; GFX8-LABEL: DiffBase:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_mov_b32 s42, -1
; GFX8-NEXT:    s_mov_b32 s43, 0xe80000
; GFX8-NEXT:    s_add_u32 s40, s40, s9
; GFX8-NEXT:    s_addc_u32 s41, s41, 0
; GFX8-NEXT:    s_getpc_b64 s[0:1]
; GFX8-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX8-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX8-NEXT:    s_load_dwordx4 s[36:39], s[2:3], 0x24
; GFX8-NEXT:    s_mov_b64 s[0:1], s[40:41]
; GFX8-NEXT:    v_mov_b32_e32 v31, v0
; GFX8-NEXT:    s_mov_b64 s[2:3], s[42:43]
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    s_mov_b32 s32, 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff8000, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, s37
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s36, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    v_mov_b32_e32 v3, s39
; GFX8-NEXT:    v_add_u32_e32 v12, vcc, s38, v2
; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, 0, v3, vcc
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x1000, v0
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x1800, v0
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x2000, v0
; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x2800, v12
; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
; GFX8-NEXT:    flat_load_dwordx2 v[6:7], v[6:7]
; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v13, vcc
; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 0x3000, v12
; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v13, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[8:9], v[8:9]
; GFX8-NEXT:    flat_load_dwordx2 v[10:11], v[10:11]
; GFX8-NEXT:    v_add_u32_e32 v12, vcc, 0x3800, v12
; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, 0, v13, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[12:13], v[12:13]
; GFX8-NEXT:    s_waitcnt vmcnt(4)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v4, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(3)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v7, v3, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v10, v8
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v11, v9, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v12, v4
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v13, v5, vcc
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: DiffBase:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s42, -1
; GFX9-NEXT:    s_mov_b32 s43, 0xe00000
; GFX9-NEXT:    s_add_u32 s40, s40, s9
; GFX9-NEXT:    s_addc_u32 s41, s41, 0
; GFX9-NEXT:    s_getpc_b64 s[0:1]
; GFX9-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX9-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT:    s_load_dwordx4 s[36:39], s[2:3], 0x24
; GFX9-NEXT:    s_mov_b64 s[0:1], s[40:41]
; GFX9-NEXT:    v_mov_b32_e32 v31, v0
; GFX9-NEXT:    s_mov_b64 s[2:3], s[42:43]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff8000, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, s37
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s36, v16
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v0, vcc
; GFX9-NEXT:    v_mov_b32_e32 v0, s39
; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, s38, v16
; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v0, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0x2000, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off offset:2048
; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v10
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v11, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0x3000, v10
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v11, vcc
; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[0:1], off offset:2048
; GFX9-NEXT:    global_load_dwordx2 v[12:13], v[2:3], off
; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[2:3], off offset:2048
; GFX9-NEXT:    s_waitcnt vmcnt(4)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v5, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(3)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v12, v10
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v13, v11, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v14, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v15, v3, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX9-NEXT:    global_store_dwordx2 v16, v[0:1], s[36:37]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: DiffBase:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
; GFX10-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
; GFX10-NEXT:    s_mov_b32 s42, -1
; GFX10-NEXT:    s_mov_b32 s43, 0x31c16000
; GFX10-NEXT:    s_add_u32 s40, s40, s9
; GFX10-NEXT:    s_addc_u32 s41, s41, 0
; GFX10-NEXT:    s_getpc_b64 s[0:1]
; GFX10-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT:    v_mov_b32_e32 v31, v0
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-NEXT:    s_load_dwordx4 s[36:39], s[2:3], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_mov_b64 s[0:1], s[40:41]
; GFX10-NEXT:    s_mov_b64 s[2:3], s[42:43]
; GFX10-NEXT:    s_mov_b32 s32, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff8000, v0
; GFX10-NEXT:    v_add_co_u32 v8, s0, s36, v16
; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s0, s37, 0, s0
; GFX10-NEXT:    v_add_co_u32 v12, s0, s38, v16
; GFX10-NEXT:    v_add_co_ci_u32_e64 v13, s0, s39, 0, s0
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v8, 0x1800
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v9, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v12, 0x3000
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v13, vcc_lo
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v8
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v9, vcc_lo
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:-2048
; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[2:3], off
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x3800, v12
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v13, vcc_lo
; GFX10-NEXT:    global_load_dwordx2 v[12:13], v[0:1], off
; GFX10-NEXT:    global_load_dwordx2 v[14:15], v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(4)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(2)
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v10, v8
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v11, v9, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v12, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v14, v2
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v15, v3, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT:    global_store_dwordx2 v16, v[0:1], s[36:37]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: DiffBase:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_getpc_b64 s[0:1]
; GFX11-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT:    s_load_b128 s[36:39], s[2:3], 0x24
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff8000, v0
; GFX11-NEXT:    v_add_co_u32 v2, s0, s36, v12
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s37, 0, s0
; GFX11-NEXT:    v_add_co_u32 v8, s0, s38, v12
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, null, s39, 0, s0
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, 0x2000
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x2000, v8
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v9, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, 0x3000, v8
; GFX11-NEXT:    global_load_b64 v[6:7], v[2:3], off offset:-4096
; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo
; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:2048
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b64 v[4:5], v[4:5], off offset:2048
; GFX11-NEXT:    global_load_b64 v[10:11], v[8:9], off
; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
; GFX11-NEXT:    global_load_b64 v[8:9], v[8:9], off offset:2048
; GFX11-NEXT:    s_waitcnt vmcnt(4)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v6
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v7, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(2)
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v10, v4
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v11, v5, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v4
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX11-NEXT:    global_store_b64 v12, v[0:1], s[36:37]
; GFX11-NEXT:    s_endpgm
                                    ptr addrspace(1) %buffer2) {
entry:
  ; %idx.ext11 is (call << 7) masked with 0xffff8000 and is used as a byte
  ; offset into both buffers. %conv is computed but not used by any later
  ; instruction in this function.
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  ; First base: derived from %buffer1.
  %add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer1, i64 %idx.ext11

  ; Second base: derived from %buffer2 with the same byte offset.
  %add.ptr2 = getelementptr inbounds i8, ptr addrspace(1) %buffer2, i64 %idx.ext11

  ; Group 1 (first base): i64 indices 512, 768, 1024 are byte offsets
  ; 0x1000, 0x1800, 0x2000.
  %addr1 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 512
  %load1 = load i64, ptr addrspace(1) %addr1, align 8
  %add.ptr8.3 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 768
  %load2 = load i64, ptr addrspace(1) %add.ptr8.3, align 8
  %add1 = add i64 %load2, %load1
  %add.ptr8.4 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 1024
  %load3 = load i64, ptr addrspace(1) %add.ptr8.4, align 8
  %add2 = add i64 %load3, %add1

  ; Group 2 (second base): i64 indices 1280, 1536, 1792 are byte offsets
  ; 0x2800, 0x3000, 0x3800.
  %add.ptr8.5 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr2, i64 1280
  %load4 = load i64, ptr addrspace(1) %add.ptr8.5, align 8

  %add.ptr8.6 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr2, i64 1536
  %load5 = load i64, ptr addrspace(1) %add.ptr8.6, align 8
  %add3 = add i64 %load5, %load4

  %add.ptr8.7 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr2, i64 1792
  %load6 = load i64, ptr addrspace(1) %add.ptr8.7, align 8
  %add4 = add i64 %load6, %add3

  ; Combine the two per-base partial sums.
  %add5 = add i64 %add2, %add4

  ; The result is written through the first base pointer.
  store i64 %add5, ptr addrspace(1) %add.ptr12, align 8
  ret void
}

; ReverseOrder: sums eight i64 loads that share the base pointer %addr1. The
; first load uses %addr1 itself; the other seven use the constant element
; offsets 1792, 1536, 1280, 1024, 768, 512 and 256 (byte offsets 0x3800 down to
; 0x800 in steps of 0x800), written from the largest offset to the smallest.
; The autogenerated checks below record how each target splits those constants
; between an explicitly added base address and the load's immediate offset
; field: the gfx8 and gfx10 checks add every constant (0x3800 ... 0x800)
; explicitly, while the gfx9 and gfx11 checks add only 0x3000, 0x2000 and
; 0x1000 and cover the remaining addresses with "offset:2048".
define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
; GFX8-LABEL: ReverseOrder:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_mov_b32 s38, -1
; GFX8-NEXT:    s_mov_b32 s39, 0xe80000
; GFX8-NEXT:    s_add_u32 s36, s36, s9
; GFX8-NEXT:    s_addc_u32 s37, s37, 0
; GFX8-NEXT:    s_getpc_b64 s[0:1]
; GFX8-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX8-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX8-NEXT:    s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX8-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT:    v_mov_b32_e32 v31, v0
; GFX8-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    s_mov_b32 s32, 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
; GFX8-NEXT:    v_mov_b32_e32 v2, s35
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s34, v1
; GFX8-NEXT:    v_mov_b32_e32 v3, 3
; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v1, v0
; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x3800
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v3
; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v4, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x3000
; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s0, v3
; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x2800
; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v3
; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[11:12], v[3:4]
; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[5:6]
; GFX8-NEXT:    flat_load_dwordx2 v[7:8], v[7:8]
; GFX8-NEXT:    flat_load_dwordx2 v[9:10], v[9:10]
; GFX8-NEXT:    s_movk_i32 s0, 0x2000
; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s0, v3
; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, 0, v4, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x1800
; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s0, v3
; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, 0, v4, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[13:14], v[13:14]
; GFX8-NEXT:    flat_load_dwordx2 v[15:16], v[15:16]
; GFX8-NEXT:    s_movk_i32 s0, 0x1000
; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s0, v3
; GFX8-NEXT:    v_addc_u32_e32 v18, vcc, 0, v4, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[17:18], v[17:18]
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x800, v3
; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[3:4], v[3:4]
; GFX8-NEXT:    s_waitcnt vmcnt(6)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v11
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v6, v12, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(5)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v7, v0
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v8, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(4)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v9, v0
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v10, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(3)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v13, v0
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v14, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v15, v0
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v16, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v17, v0
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v18, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0
; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
; GFX8-NEXT:    flat_store_dwordx2 v[1:2], v[3:4]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: ReverseOrder:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s9
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_getpc_b64 s[0:1]
; GFX9-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX9-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    v_mov_b32_e32 v31, v0
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff8000, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, s35
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v22
; GFX9-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    s_movk_i32 s0, 0x3000
; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s0, v0
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[4:5], off offset:2048
; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[4:5], off
; GFX9-NEXT:    s_movk_i32 s0, 0x2000
; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s0, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[4:5], off offset:2048
; GFX9-NEXT:    s_movk_i32 s0, 0x1000
; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, s0, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[12:13], off
; GFX9-NEXT:    global_load_dwordx2 v[16:17], v[4:5], off
; GFX9-NEXT:    global_load_dwordx2 v[18:19], v[12:13], off offset:2048
; GFX9-NEXT:    global_load_dwordx2 v[20:21], v[0:1], off offset:2048
; GFX9-NEXT:    s_waitcnt vmcnt(6)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v3, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(5)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(4)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v16, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v17, v1, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v18, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v19, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v14, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v15, v1, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v20, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v21, v1, vcc
; GFX9-NEXT:    global_store_dwordx2 v22, v[0:1], s[34:35]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: ReverseOrder:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX10-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX10-NEXT:    s_mov_b32 s38, -1
; GFX10-NEXT:    s_mov_b32 s39, 0x31c16000
; GFX10-NEXT:    s_add_u32 s36, s36, s9
; GFX10-NEXT:    s_addc_u32 s37, s37, 0
; GFX10-NEXT:    s_getpc_b64 s[0:1]
; GFX10-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT:    v_mov_b32_e32 v31, v0
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX10-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX10-NEXT:    s_mov_b32 s32, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 3
; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff8000, v1
; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v20
; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x3800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x3000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x2800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, 0x2000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v12, vcc_lo, 0x1800, v0
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[4:5], off
; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[10:11], off
; GFX10-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v14, vcc_lo, 0x1000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v15, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[12:13], v[12:13], off
; GFX10-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[16:17], v[14:15], off
; GFX10-NEXT:    global_load_dwordx2 v[18:19], v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(6)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v6
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v9, v7, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(5)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v4, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(2)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v10, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v11, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v12, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v16, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v17, v1, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v18, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v19, v1, vcc_lo
; GFX10-NEXT:    global_store_dwordx2 v20, v[0:1], s[34:35]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: ReverseOrder:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_getpc_b64 s[0:1]
; GFX11-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT:    s_load_b64 s[34:35], s[2:3], 0x24
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff8000, v1
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v16
; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, s35, 0, s0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0x3000, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, 0x2000, v0
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
; GFX11-NEXT:    global_load_b64 v[6:7], v[2:3], off offset:2048
; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, 0x1000, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    s_clause 0x4
; GFX11-NEXT:    global_load_b64 v[12:13], v[8:9], off offset:2048
; GFX11-NEXT:    global_load_b64 v[14:15], v[10:11], off
; GFX11-NEXT:    global_load_b64 v[8:9], v[8:9], off
; GFX11-NEXT:    global_load_b64 v[10:11], v[10:11], off offset:2048
; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:2048
; GFX11-NEXT:    s_waitcnt vmcnt(6)
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v6, v4
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v7, v5, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(5)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(4)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v12, v2
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v13, v3, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(2)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v2
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v9, v3, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v10, v2
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v11, v3, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v14, v2
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v15, v3, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX11-NEXT:    global_store_b64 v16, v[0:1], s[34:35]
; GFX11-NEXT:    s_endpgm
entry:
  ; %conv keeps the low 8 bits of the returned id; %idx.ext11 is (id << 7)
  ; masked with 4294934528 (0xffff8000).
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  ; %buffer advanced by %idx.ext11 bytes; this pointer is also the destination
  ; of the final store.
  %add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11

  ; Common base of all eight loads: %add.ptr12 plus %conv i64 elements.
  %addr1 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 %conv
  %load1 = load i64, ptr addrspace(1) %addr1, align 8

  ; Element offset 1792 (byte offset 0x3800), the largest constant, comes first.
  %add.ptr8.7 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1792
  %load8 = load i64, ptr addrspace(1) %add.ptr8.7, align 8
  %add7 = add i64 %load8, %load1

  ; Element offset 1536 (byte offset 0x3000).
  %add.ptr8.6 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1536
  %load7 = load i64, ptr addrspace(1) %add.ptr8.6, align 8
  %add6 = add i64 %load7, %add7

  ; Element offset 1280 (byte offset 0x2800).
  %add.ptr8.5 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1280
  %load6 = load i64, ptr addrspace(1) %add.ptr8.5, align 8
  %add5 = add i64 %load6, %add6

  ; Element offset 1024 (byte offset 0x2000).
  %add.ptr8.4 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1024
  %load5 = load i64, ptr addrspace(1) %add.ptr8.4, align 8
  %add4 = add i64 %load5, %add5

  ; Element offset 768 (byte offset 0x1800).
  %add.ptr8.3 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 768
  %load4 = load i64, ptr addrspace(1) %add.ptr8.3, align 8
  %add3 = add i64 %load4, %add4

  ; Element offset 512 (byte offset 0x1000).
  %add.ptr8.2 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 512
  %load3 = load i64, ptr addrspace(1) %add.ptr8.2, align 8
  %add2 = add i64 %load3, %add3

  ; Element offset 256 (byte offset 0x800), the smallest constant, comes last.
  %addr2 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 256
  %load2 = load i64, ptr addrspace(1) %addr2, align 8
  %add1 = add i64 %load2, %add2

  ; The accumulated sum of all eight loads is written to %add.ptr12.
  store i64 %add1, ptr addrspace(1) %add.ptr12, align 8
  ret void
}

; negativeoffset: adds two i64 loads whose constant element offsets from
; %buffer_wave are -536870656 and -536870912, i.e. byte offsets
; -(2^32 - 0x800) and -2^32. In the autogenerated checks below every target
; applies these by adding -1 to the high dword of the address, with a low-dword
; addend of 0x800 or 0x1000 for the first load and 0 for the second; the gfx9
; and gfx11 checks pair the 0x1000 addend with "offset:-2048" on the first load.
define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buffer) {
; GFX8-LABEL: negativeoffset:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_mov_b32 s38, -1
; GFX8-NEXT:    s_mov_b32 s39, 0xe80000
; GFX8-NEXT:    s_add_u32 s36, s36, s9
; GFX8-NEXT:    s_addc_u32 s37, s37, 0
; GFX8-NEXT:    s_getpc_b64 s[0:1]
; GFX8-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX8-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX8-NEXT:    s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX8-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT:    v_mov_b32_e32 v31, v0
; GFX8-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    s_mov_b32 s32, 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
; GFX8-NEXT:    v_mov_b32_e32 v2, s35
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s34, v1
; GFX8-NEXT:    v_mov_b32_e32 v3, 3
; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v2, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x800
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v6, vcc
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0, v0
; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, -1, v6, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[3:4], v[3:4]
; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[5:6]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v6, v4, vcc
; GFX8-NEXT:    flat_store_dwordx2 v[1:2], v[3:4]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: negativeoffset:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s9
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_getpc_b64 s[0:1]
; GFX9-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX9-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    v_mov_b32_e32 v31, v0
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff8000, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, s35
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v8
; GFX9-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v3, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v5, vcc
; GFX9-NEXT:    global_store_dwordx2 v8, v[0:1], s[34:35]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: negativeoffset:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX10-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX10-NEXT:    s_mov_b32 s38, -1
; GFX10-NEXT:    s_mov_b32 s39, 0x31c16000
; GFX10-NEXT:    s_add_u32 s36, s36, s9
; GFX10-NEXT:    s_addc_u32 s37, s37, 0
; GFX10-NEXT:    s_getpc_b64 s[0:1]
; GFX10-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT:    v_mov_b32_e32 v31, v0
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX10-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX10-NEXT:    s_mov_b32 s32, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 3
; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff8000, v1
; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v8
; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v1, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v2, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v3
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v4, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0, v3
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v4, vcc_lo
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo
; GFX10-NEXT:    global_store_dwordx2 v8, v[0:1], s[34:35]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: negativeoffset:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_getpc_b64 s[0:1]
; GFX11-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT:    s_load_b64 s[34:35], s[2:3], 0x24
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff8000, v1
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v4
; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, s35, 0, s0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, v1, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v2, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v3
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v5, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0, v3
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v5, vcc_lo
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:-2048
; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[34:35]
; GFX11-NEXT:    s_endpgm
entry:
  ; NOTE(review): the call references attribute group #2, but the only
  ; attribute group definition visible at the end of this file is #0 - confirm
  ; whether the reference is intentional.
  %call = tail call i64 @_Z13get_global_idj(i32 0) #2
  ; %conv keeps the low 8 bits of the returned id; %idx.ext11 is (id << 7)
  ; masked with 4294934528 (0xffff8000).
  %conv = and i64 %call, 255
  %0 = shl i64 %call, 7
  %idx.ext11 = and i64 %0, 4294934528
  ; %buffer advanced by %idx.ext11 bytes; this pointer is also the destination
  ; of the final store.
  %add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11

  ; Common base of both loads: %add.ptr12 plus %conv i64 elements.
  %buffer_wave = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 %conv

  ; Element offset -536870656 = byte offset -(2^32 - 0x800).
  %addr1 = getelementptr inbounds i64, ptr addrspace(1) %buffer_wave, i64 -536870656
  %load1 = load i64, ptr addrspace(1) %addr1, align 8

  ; Element offset -536870912 = byte offset -2^32.
  %addr2 = getelementptr inbounds i64, ptr addrspace(1) %buffer_wave, i64 -536870912
  %load2 = load i64, ptr addrspace(1) %addr2, align 8


  %add = add i64 %load2, %load1

  ; The sum of the two loads is written to %add.ptr12.
  store i64 %add, ptr addrspace(1) %add.ptr12, align 8
  ret void
}

; negativeoffsetnullptr: loads one byte at offset -1 from a flat pointer
; obtained by addrspacecasting the addrspace(5) null pointer. The select
; condition is the constant false, so %buffer is never the chosen operand. The
; loaded byte is compared with 0 once in the entry block, and that result
; controls the self-looping %branch block. In the autogenerated checks below
; every target forms the address by adding -1 to a zero low dword and -1 (plus
; carry) to the high dword, then issues a flat byte load.
define amdgpu_kernel void @negativeoffsetnullptr(ptr %buffer) {
; GFX8-LABEL: negativeoffsetnullptr:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dword s1, s[2:3], 0xec
; GFX8-NEXT:    s_add_u32 s0, 0, -1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_addc_u32 s1, s1, -1
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX8-NEXT:    s_mov_b64 s[0:1], 0
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
; GFX8-NEXT:  .LBB8_1: ; %branch
; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT:    s_and_b64 s[2:3], exec, vcc
; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT:    s_cbranch_execnz .LBB8_1
; GFX8-NEXT:  ; %bb.2: ; %end
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: negativeoffsetnullptr:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b64 s[0:1], src_private_base
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, -1, 0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
; GFX9-NEXT:  .LBB8_1: ; %branch
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_and_b64 s[2:3], exec, vcc
; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB8_1
; GFX9-NEXT:  ; %bb.2: ; %end
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: negativeoffsetnullptr:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_mov_b64 s[0:1], src_private_base
; GFX10-NEXT:    s_add_u32 s0, 0, -1
; GFX10-NEXT:    s_addc_u32 s1, s1, -1
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
; GFX10-NEXT:  .LBB8_1: ; %branch
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_and_b32 s1, exec_lo, vcc_lo
; GFX10-NEXT:    s_or_b32 s0, s1, s0
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
; GFX10-NEXT:  ; %bb.2: ; %end
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: negativeoffsetnullptr:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT:    v_add_co_u32 v0, s0, -1, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
; GFX11-NEXT:  .LBB8_1: ; %branch
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_and_b32 s1, exec_lo, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    s_or_b32 s0, s1, s0
; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
; GFX11-NEXT:  ; %bb.2: ; %end
; GFX11-NEXT:    s_endpgm
entry:
  ; The condition is the constant false, so the select yields the second
  ; operand: the addrspace(5) null pointer cast to a flat pointer.
  %null = select i1 false, ptr %buffer, ptr addrspacecast (ptr addrspace(5) null to ptr)
  ; One byte before that pointer (no inbounds flag on this GEP).
  %gep = getelementptr i8, ptr %null, i64 -1
  %ld = load i8, ptr %gep
  ; Computed once here; the loop below only re-tests this same value.
  %cmp = icmp eq i8 %ld, 0
  br label %branch

branch:
  ; Exits to %end when %cmp is true, otherwise branches back to itself.
  br i1 %cmp, label %end, label %branch

end:
  ret void
}


; Attribute group #0 is the group referenced by the declaration of
; @_Z13get_global_idj near the top of this file.
attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }